public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* [PATCH v1 1/7] x86: Improve svml_s_atanhf16_core_avx512.S
@ 2022-06-07 20:06 Noah Goldstein
  2022-06-07 20:06 ` [PATCH v1 2/7] x86: Improvement svml_s_atanhf8_core_avx2.S Noah Goldstein
                   ` (8 more replies)
  0 siblings, 9 replies; 48+ messages in thread
From: Noah Goldstein @ 2022-06-07 20:06 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, Andrey.Kolesov

Improvements are:
    1. Reduce code size (-64 bytes).
    2. Remove redundant move instructions.
    3. Slightly improve instruction selection/scheduling where
       possible.
    4. Reduce rodata size ([-128, -188] bytes).

The throughput improvement is not significant as the port 0 bottleneck
is unavoidable.

        Function, New Time, Old Time, New / Old
_ZGVeN16v_atanhf,     1.39,    1.408,     0.987
---
 .../multiarch/svml_s_atanhf16_core_avx512.S   | 467 +++++++++---------
 1 file changed, 237 insertions(+), 230 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
index a1cd920a0f..4f0f13726e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
@@ -31,53 +31,50 @@
  *
  */
 
-/* Offsets for data table __svml_satanh_data_internal_avx512
- */
-#define Log_tbl_H			0
-#define Log_tbl_L			128
-#define One				256
-#define AbsMask				320
-#define AddB5				384
-#define RcpBitMask			448
-#define poly_coeff3			512
-#define poly_coeff2			576
-#define poly_coeff1			640
-#define poly_coeff0			704
-#define Half				768
-#define L2H				832
-#define L2L				896
+/* Offsets for data table __svml_satanh_data_internal_avx512 and
+   __svml_satanh_data_internal_avx512_al64. Ordered by use in the
+   function. On cold-starts this might help the prefetcher. Possibly
+   a better idea is to interleave start/end so that the prefetcher is
+   less likely to detect a stream and pull irrelevant lines into
+   cache.  */
+
+/* Offset into __svml_satanh_data_internal_avx512. 4-byte aligned as
+   the memory is broadcast to {1to16}.  */
+#define AbsMask				0
+
+/* Offset into __svml_satanh_data_internal_avx512_al64. The full value
+   is used here.  */
+#define One				0
+#define AddB5				64
+#define RcpBitMask			128
+#define Log_tbl_L_lo			192
+#define Log_tbl_L_hi			256
+#define Log_tbl_H_lo			320
+#define Log_tbl_H_hi			384
+#define L2H				448
+#define L2L				512
+#define poly_coeff3			576
+#define poly_coeff2			640
+#define poly_coeff1			704
 
 #include <sysdep.h>
 
+#define ATANHF_DATA(x)			((x)+__svml_satanh_data_internal_avx512_al64)
+
 	.section .text.exex512, "ax", @progbits
 ENTRY(_ZGVeN16v_atanhf_skx)
-	pushq	%rbp
-	cfi_def_cfa_offset(16)
-	movq	%rsp, %rbp
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
-	andq	$-64, %rsp
-	subq	$192, %rsp
-	vmovups	One+__svml_satanh_data_internal_avx512(%rip), %zmm4
-
-	/* round reciprocals to 1+5b mantissas */
-	vmovups	AddB5+__svml_satanh_data_internal_avx512(%rip), %zmm14
-	vmovups	RcpBitMask+__svml_satanh_data_internal_avx512(%rip), %zmm1
-	vmovaps	%zmm0, %zmm11
-	vandps	AbsMask+__svml_satanh_data_internal_avx512(%rip), %zmm11, %zmm6
+	vandps	AbsMask+__svml_satanh_data_internal_avx512(%rip){1to16}, %zmm0, %zmm6
+	vmovups	ATANHF_DATA(One)(%rip), %zmm4
 
 	/* 1+y */
 	vaddps	{rn-sae}, %zmm4, %zmm6, %zmm9
 
 	/* 1-y */
 	vsubps	{rn-sae}, %zmm6, %zmm4, %zmm8
-	vxorps	%zmm6, %zmm11, %zmm10
-
-	/* Yp_high */
-	vsubps	{rn-sae}, %zmm4, %zmm9, %zmm2
 
-	/* -Ym_high */
-	vsubps	{rn-sae}, %zmm4, %zmm8, %zmm5
+	/* round reciprocals to 1+5b mantissas */
+	vmovups	ATANHF_DATA(AddB5)(%rip), %zmm14
+	vmovups	ATANHF_DATA(RcpBitMask)(%rip), %zmm1
 
 	/* RcpP ~ 1/Yp */
 	vrcp14ps %zmm9, %zmm12
@@ -85,15 +82,21 @@ ENTRY(_ZGVeN16v_atanhf_skx)
 	/* RcpM ~ 1/Ym */
 	vrcp14ps %zmm8, %zmm13
 
+	/* Yp_high */
+	vsubps	{rn-sae}, %zmm4, %zmm9, %zmm2
+
+	/* -Ym_high */
+	vsubps	{rn-sae}, %zmm4, %zmm8, %zmm5
+
+
 	/* input outside (-1, 1) ? */
-	vcmpps	$21, {sae}, %zmm4, %zmm6, %k0
 	vpaddd	%zmm14, %zmm12, %zmm15
-	vpaddd	%zmm14, %zmm13, %zmm0
+	vpaddd	%zmm14, %zmm13, %zmm12
 
 	/* Yp_low */
 	vsubps	{rn-sae}, %zmm2, %zmm6, %zmm3
 	vandps	%zmm1, %zmm15, %zmm7
-	vandps	%zmm1, %zmm0, %zmm12
+	vandps	%zmm1, %zmm12, %zmm12
 
 	/* Ym_low */
 	vaddps	{rn-sae}, %zmm5, %zmm6, %zmm5
@@ -102,225 +105,192 @@ ENTRY(_ZGVeN16v_atanhf_skx)
 	vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9
 
 	/* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low */
-	vfmsub231ps {rn-sae}, %zmm12, %zmm8, %zmm4
-	vmovups	Log_tbl_L+__svml_satanh_data_internal_avx512(%rip), %zmm8
-	vmovups	Log_tbl_L+64+__svml_satanh_data_internal_avx512(%rip), %zmm13
+	vfmsub213ps {rn-sae}, %zmm4, %zmm12, %zmm8
+
+	vmovups	ATANHF_DATA(Log_tbl_L_lo)(%rip), %zmm10
+	vmovups	ATANHF_DATA(Log_tbl_L_hi)(%rip), %zmm13
 
 	/* exponents */
-	vgetexpps {sae}, %zmm7, %zmm15
 	vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9
+	vgetexpps {sae}, %zmm7, %zmm15
+
 
 	/* Table lookups */
-	vmovups	__svml_satanh_data_internal_avx512(%rip), %zmm6
+	vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm8
 	vgetexpps {sae}, %zmm12, %zmm14
-	vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm4
+
 
 	/* Prepare table index */
 	vpsrld	$18, %zmm7, %zmm3
 	vpsrld	$18, %zmm12, %zmm2
-	vmovups	Log_tbl_H+64+__svml_satanh_data_internal_avx512(%rip), %zmm7
-	vmovups	poly_coeff1+__svml_satanh_data_internal_avx512(%rip), %zmm12
-
+	vmovups	ATANHF_DATA(Log_tbl_H_lo)(%rip), %zmm11
+	vmovups	ATANHF_DATA(Log_tbl_H_hi)(%rip), %zmm7
 	/* Km-Kp */
+
+	vmovaps	%zmm3, %zmm5
+	vpermi2ps %zmm13, %zmm10, %zmm3
+	vpermt2ps %zmm13, %zmm2, %zmm10
+	vpermi2ps %zmm7, %zmm11, %zmm5
+	vpermt2ps %zmm7, %zmm2, %zmm11
 	vsubps	{rn-sae}, %zmm15, %zmm14, %zmm1
-	kmovw	%k0, %edx
-	vmovaps	%zmm3, %zmm0
-	vpermi2ps %zmm13, %zmm8, %zmm3
-	vpermt2ps %zmm13, %zmm2, %zmm8
-	vpermi2ps %zmm7, %zmm6, %zmm0
-	vpermt2ps %zmm7, %zmm2, %zmm6
-	vsubps	{rn-sae}, %zmm3, %zmm8, %zmm5
+	vsubps	{rn-sae}, %zmm3, %zmm10, %zmm7
 
 	/* K*L2H + Th */
-	vmovups	L2H+__svml_satanh_data_internal_avx512(%rip), %zmm2
+	vmovups	ATANHF_DATA(L2H)(%rip), %zmm2
 
 	/* K*L2L + Tl */
-	vmovups	L2L+__svml_satanh_data_internal_avx512(%rip), %zmm3
-
-	/* polynomials */
-	vmovups	poly_coeff3+__svml_satanh_data_internal_avx512(%rip), %zmm7
-	vmovups	poly_coeff0+__svml_satanh_data_internal_avx512(%rip), %zmm13
+	vmovups	ATANHF_DATA(L2L)(%rip), %zmm3
 
 	/* table values */
-	vsubps	{rn-sae}, %zmm0, %zmm6, %zmm0
-	vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm0
-	vfmadd213ps {rn-sae}, %zmm5, %zmm3, %zmm1
-	vmovups	poly_coeff2+__svml_satanh_data_internal_avx512(%rip), %zmm3
-	vmovaps	%zmm3, %zmm2
-	vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm2
-	vfmadd231ps {rn-sae}, %zmm4, %zmm7, %zmm3
-	vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm2
-	vfmadd213ps {rn-sae}, %zmm12, %zmm4, %zmm3
-	vfmadd213ps {rn-sae}, %zmm13, %zmm9, %zmm2
-	vfmadd213ps {rn-sae}, %zmm13, %zmm4, %zmm3
+	vsubps	{rn-sae}, %zmm5, %zmm11, %zmm5
+	vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm5
+	vfmadd213ps {rn-sae}, %zmm7, %zmm3, %zmm1
+	/* polynomials */
+	vmovups	ATANHF_DATA(poly_coeff3)(%rip), %zmm7
+	vmovups	ATANHF_DATA(poly_coeff2)(%rip), %zmm10
+	vmovaps	%zmm10, %zmm14
+	vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm10
+	vfmadd231ps {rn-sae}, %zmm8, %zmm7, %zmm14
+	vmovups	ATANHF_DATA(poly_coeff1)(%rip), %zmm12
+	vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm10
+	vfmadd213ps {rn-sae}, %zmm12, %zmm8, %zmm14
+	vfmadd213ps {rn-sae}, %zmm4, %zmm9, %zmm10
+	vfmadd213ps {rn-sae}, %zmm4, %zmm8, %zmm14
 
 	/* (K*L2L + Tl) + Rp*PolyP */
-	vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm2
-	vorps	Half+__svml_satanh_data_internal_avx512(%rip), %zmm10, %zmm9
+	vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm10
+
+	/* zmm12 = zmm12 & (zmm4 | zmm0).  */
+	vpternlogq $0xe0, %zmm0, %zmm4, %zmm12
 
 	/* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM */
-	vfnmadd213ps {rn-sae}, %zmm2, %zmm4, %zmm3
-	vaddps	{rn-sae}, %zmm3, %zmm0, %zmm4
-	vmulps	{rn-sae}, %zmm9, %zmm4, %zmm0
+	vfnmadd213ps {rn-sae}, %zmm5, %zmm8, %zmm14
+	vaddps	{rn-sae}, %zmm14, %zmm10, %zmm8
+
+	vcmpps	$21, {sae}, %zmm4, %zmm6, %k0
+	kmovw	%k0, %edx
 	testl	%edx, %edx
 
 	/* Go to special inputs processing branch */
 	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm11
+	# LOE rbx r12 r13 r14 r15 zmm0 zmm8 zmm12
+	vmulps	{rn-sae}, %zmm12, %zmm8, %zmm0
 
-	/* Restore registers
-	 * and exit the function
-	 */
-
-L(EXIT):
-	movq	%rbp, %rsp
-	popq	%rbp
-	cfi_def_cfa(7, 8)
-	cfi_restore(6)
+	/* No register to restore on fast path.  */
 	ret
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
-
-	/* Branch to process
-	 * special inputs
-	 */
 
+	/* Cold case. edx has 1s where there was a special value that
+	   needs to be handled by a atanhf call. Optimize for code size
+	   moreso than speed here. */
 L(SPECIAL_VALUES_BRANCH):
-	vmovups	%zmm11, 64(%rsp)
-	vmovups	%zmm0, 128(%rsp)
-	# LOE rbx r12 r13 r14 r15 edx zmm0
-
-	xorl	%eax, %eax
-	# LOE rbx r12 r13 r14 r15 eax edx
-
-	vzeroupper
-	movq	%r12, 16(%rsp)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
-	movl	%eax, %r12d
-	movq	%r13, 8(%rsp)
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-	movl	%edx, %r13d
-	movq	%r14, (%rsp)
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r15 r12d r13d
-
-	/* Range mask
-	 * bits check
+	# LOE rbx rdx r12 r13 r14 r15 zmm0 zmm8 zmm12
+    /* Use r13 to save/restore the stack. This allows us to use rbp as
+       callee save register saving code size. */
+	pushq	%r13
+	cfi_def_cfa(rsp, 16)
+	/* Need to callee save registers to preserve state across atanhf calls.
+	 */
+	pushq	%rbx
+	cfi_def_cfa(rsp, 24)
+	pushq	%rbp
+	cfi_def_cfa(rsp, 32)
+	movq	%rsp, %r13
+	cfi_def_cfa(r13, 32)
 
-L(RANGEMASK_CHECK):
-	btl	%r12d, %r13d
-
-	/* Call scalar math function */
-	jc	L(SCALAR_MATH_CALL)
-	# LOE rbx r15 r12d r13d
-
-	/* Special inputs
-	 * processing loop
-	 */
+	/* Align stack and make room for 2x zmm vectors.  */
+	andq	$-64, %rsp
+	addq	$-128, %rsp
+	vmulps	{rn-sae}, %zmm12, %zmm8, %zmm1
+	vmovaps	%zmm1, (%rsp)
+	vmovaps	%zmm0, 64(%rsp)
+	vzeroupper
 
+	/* edx has 1s where there was a special value that needs to be handled
+	   by a atanhf call.  */
+	movl	%edx, %ebx
 L(SPECIAL_VALUES_LOOP):
-	incl	%r12d
-	cmpl	$16, %r12d
-
-	/* Check bits in range mask */
-	jl	L(RANGEMASK_CHECK)
-	# LOE rbx r15 r12d r13d
-
-	movq	16(%rsp), %r12
-	cfi_restore(12)
-	movq	8(%rsp), %r13
-	cfi_restore(13)
-	movq	(%rsp), %r14
-	cfi_restore(14)
-	vmovups	128(%rsp), %zmm0
-
-	/* Go to exit */
-	jmp	L(EXIT)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r12 r13 r14 r15 zmm0
-
-	/* Scalar math fucntion call
-	 * to process special input
-	 */
-
-L(SCALAR_MATH_CALL):
-	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	# LOE rbx rbp r12 r13 r14 r15
+	/* use rbp as index for special value that is saved across calls to
+	   atanhf. We technically don't need a callee save register here as offset
+	   to rsp is always [0, 56] so we can restore rsp by realigning to 64.
+	   Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
+	   in the loop. Realigning also costs more code size.  */
+	xorl	%ebp, %ebp
+	tzcntl	%ebx, %ebp
+
+	/* Scalar math function call to process special input.  */
+	movss	64(%rsp, %rbp, 4), %xmm0
 	call	atanhf@PLT
-	# LOE rbx r14 r15 r12d r13d xmm0
-
-	movss	%xmm0, 128(%rsp, %r14, 4)
 
-	/* Process special inputs in loop */
-	jmp	L(SPECIAL_VALUES_LOOP)
-	# LOE rbx r15 r12d r13d
+	/* No good way to avoid the store-forwarding fault this will cause on
+	   return. `lfence` avoids the SF fault but at greater cost as it
+	   serialized stack/callee save restoration.  */
+	movss	%xmm0, (%rsp, %rbp, 4)
+
+	blsrl   %ebx, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+	# LOE r12 r13 r14 r15
+
+	/* All results have been written to (%rsp).  */
+	vmovaps	(%rsp), %zmm0
+	/* Restore rsp.  */
+	movq	%r13, %rsp
+	cfi_def_cfa(rsp, 32)
+	/* Restore callee save registers.  */
+	popq	%rbp
+	cfi_def_cfa(rsp, 24)
+	popq	%rbx
+	cfi_def_cfa(rsp, 16)
+	popq	%r13
+	ret
 END(_ZGVeN16v_atanhf_skx)
 
 	.section .rodata, "a"
-	.align	64
-
+	.align	4
 #ifdef __svml_satanh_data_internal_avx512_typedef
 typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(64)) VUINT32 Log_tbl_H[32][1];
-	__declspec(align(64)) VUINT32 Log_tbl_L[32][1];
+typedef struct{
+	__declspec(align(4)) VUINT32 AbsMask[1][1];
 	__declspec(align(64)) VUINT32 One[16][1];
-	__declspec(align(64)) VUINT32 AbsMask[16][1];
 	__declspec(align(64)) VUINT32 AddB5[16][1];
 	__declspec(align(64)) VUINT32 RcpBitMask[16][1];
+	__declspec(align(64)) VUINT32 Log_tbl_L_lo[16][1];
+	__declspec(align(64)) VUINT32 Log_tbl_L_hi[16][1];
+	__declspec(align(64)) VUINT32 Log_tbl_H_lo[16][1];
+	__declspec(align(64)) VUINT32 Log_tbl_H_hi[16][1];
+	__declspec(align(64)) VUINT32 L2H[16][1];
+	__declspec(align(64)) VUINT32 L2L[16][1];
 	__declspec(align(64)) VUINT32 poly_coeff3[16][1];
 	__declspec(align(64)) VUINT32 poly_coeff2[16][1];
 	__declspec(align(64)) VUINT32 poly_coeff1[16][1];
-	__declspec(align(64)) VUINT32 poly_coeff0[16][1];
-	__declspec(align(64)) VUINT32 Half[16][1];
-	__declspec(align(64)) VUINT32 L2H[16][1];
-	__declspec(align(64)) VUINT32 L2L[16][1];
 } __svml_satanh_data_internal_avx512;
 #endif
 __svml_satanh_data_internal_avx512:
-	/* Log_tbl_H */
-	.long	0x00000000
-	.long	0x3cfc0000
-	.long	0x3d780000
-	.long	0x3db78000
-	.long	0x3df10000
-	.long	0x3e14c000
-	.long	0x3e300000
-	.long	0x3e4a8000
-	.long	0x3e648000
-	.long	0x3e7dc000
-	.long	0x3e8b4000
-	.long	0x3e974000
-	.long	0x3ea30000
-	.long	0x3eae8000
-	.long	0x3eb9c000
-	.long	0x3ec4e000
-	.long	0x3ecfa000
-	.long	0x3eda2000
-	.long	0x3ee48000
-	.long	0x3eeea000
-	.long	0x3ef8a000
-	.long	0x3f013000
-	.long	0x3f05f000
-	.long	0x3f0aa000
-	.long	0x3f0f4000
-	.long	0x3f13d000
-	.long	0x3f184000
-	.long	0x3f1ca000
-	.long	0x3f20f000
-	.long	0x3f252000
-	.long	0x3f295000
-	.long	0x3f2d7000
-	/* Log_tbl_L */
+	/* Leave this at front so we can potentially save space due to
+	   smaller alignment constraint.  */
+	.align	4
+    /* AbsMask */
+	.long	0x7fffffff
+	.align	64
+__svml_satanh_data_internal_avx512_al64:
+	/* One */
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	/* AddB5 */
+	.align	64
+	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
+	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
+	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
+	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
+	/* RcpBitMask */
+	.align	64
+	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
+	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
+	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
+	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
+	/* Log_tbl_L_lo */
 	.align	64
 	.long	0x00000000
 	.long	0x3726c39e
@@ -338,6 +308,8 @@ __svml_satanh_data_internal_avx512:
 	.long	0x38dedfac
 	.long	0x38ebfb5e
 	.long	0xb8e63c9f
+	/* Log_tbl_L_hi */
+	.align	64
 	.long	0xb85c1340
 	.long	0x38777bcd
 	.long	0xb6038656
@@ -354,39 +326,74 @@ __svml_satanh_data_internal_avx512:
 	.long	0x38f85db0
 	.long	0x37b4996f
 	.long	0xb8bfb3ca
-	/* One */
+	/* Log_tbl_H_lo */
 	.align	64
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	/* AbsMask */
+	.long	0x00000000
+	.long	0x3cfc0000
+	.long	0x3d780000
+	.long	0x3db78000
+	.long	0x3df10000
+	.long	0x3e14c000
+	.long	0x3e300000
+	.long	0x3e4a8000
+	.long	0x3e648000
+	.long	0x3e7dc000
+	.long	0x3e8b4000
+	.long	0x3e974000
+	.long	0x3ea30000
+	.long	0x3eae8000
+	.long	0x3eb9c000
+	.long	0x3ec4e000
+	/* Log_tbl_H_hi */
 	.align	64
-	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
-	/* AddB5 */
+	.long	0x3ecfa000
+	.long	0x3eda2000
+	.long	0x3ee48000
+	.long	0x3eeea000
+	.long	0x3ef8a000
+	.long	0x3f013000
+	.long	0x3f05f000
+	.long	0x3f0aa000
+	.long	0x3f0f4000
+	.long	0x3f13d000
+	.long	0x3f184000
+	.long	0x3f1ca000
+	.long	0x3f20f000
+	.long	0x3f252000
+	.long	0x3f295000
+	.long	0x3f2d7000
+	/* L2H = log(2)_high */
 	.align	64
-	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000
-	/* RcpBitMask */
+	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
+	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
+	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
+	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
+	/* L2L = log(2)_low */
 	.align	64
-	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
+	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
+	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
+	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
+	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
 	/* poly_coeff3 */
 	.align	64
-	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
+	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
+	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
+	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
+	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
 	/* poly_coeff2 */
 	.align	64
-	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
+	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
+	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
+	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
+	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
 	/* poly_coeff1 */
 	.align	64
-	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
-	/* poly_coeff0 */
-	.align	64
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	/* Half */
-	.align	64
-	.long	0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
-	/* L2H = log(2)_high */
-	.align	64
-	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
-	/* L2L = log(2)_low */
-	.align	64
-	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
 	.align	64
+	.type	__svml_satanh_data_internal_avx512_al64, @object
+	.size	__svml_satanh_data_internal_avx512_al64, .-__svml_satanh_data_internal_avx512_al64
 	.type	__svml_satanh_data_internal_avx512, @object
 	.size	__svml_satanh_data_internal_avx512, .-__svml_satanh_data_internal_avx512
-- 
2.34.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH v1 2/7] x86: Improvement svml_s_atanhf8_core_avx2.S
  2022-06-07 20:06 [PATCH v1 1/7] x86: Improve svml_s_atanhf16_core_avx512.S Noah Goldstein
@ 2022-06-07 20:06 ` Noah Goldstein
  2022-06-07 20:06 ` [PATCH v1 3/7] x86: Improve svml_s_atanhf4_core_sse4.S Noah Goldstein
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 48+ messages in thread
From: Noah Goldstein @ 2022-06-07 20:06 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, Andrey.Kolesov

Improvements are:
    1. Reduce code size (-60 bytes).
    2. Remove redundant move instructions.
    3. Slightly improve instruction selection/scheduling where
       possible.
    4. Prefer registers which get short instruction encoding.
    5. Shrink rodata usage (-32 bytes).

The throughput improvement is not that significant (3-5%) as the
port 0 bottleneck is unavoidable.

       Function, New Time, Old Time, New / Old
_ZGVdN8v_atanhf,    2.799,    2.923,     0.958
---
 .../fpu/multiarch/svml_s_atanhf8_core_avx2.S  | 398 +++++++++---------
 1 file changed, 195 insertions(+), 203 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
index c1ea1c3353..911d2300b0 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
@@ -30,305 +30,297 @@
  *
  */
 
-/* Offsets for data table __svml_satanh_data_internal
- */
+/* Offsets for data table __svml_satanh_data_internal. Ordered
+   by use in the function. On cold-starts this might help the
+   prefetcher. Possibly a better idea is to interleave start/end so
+   that the prefetcher is less likely to detect a stream and pull
+   irrelevant lines into cache.  */
 #define SgnMask				0
 #define sOne				32
-#define sPoly				64
-#define iBrkValue			320
-#define iOffExpoMask			352
-#define sHalf				384
-#define sSign				416
-#define sTopMask12			448
-#define TinyRange			480
-#define sLn2				512
+#define sTopMask12			64
+#define TinyRange			96
+#define iBrkValue			128
+#define iOffExpoMask			160
+#define sPoly				192
+#define sLn2				448
+#define sHalf				480
 
 #include <sysdep.h>
+#define ATANHF_DATA(x)			((x)+__svml_satanh_data_internal)
 
 	.section .text.avx2, "ax", @progbits
 ENTRY(_ZGVdN8v_atanhf_avx2)
-	pushq	%rbp
-	cfi_def_cfa_offset(16)
-	movq	%rsp, %rbp
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
-	andq	$-32, %rsp
-	subq	$96, %rsp
-
+	/* Strip off the sign, so treat X as positive until right at the end */
+	vmovaps	ATANHF_DATA(SgnMask)(%rip), %ymm2
+	vandps	%ymm2, %ymm0, %ymm3
 	/* Load constants including One = 1 */
-	vmovups	sOne+__svml_satanh_data_internal(%rip), %ymm5
-	vmovups	sTopMask12+__svml_satanh_data_internal(%rip), %ymm13
-	vmovaps	%ymm0, %ymm6
+	vmovups	ATANHF_DATA(sOne)(%rip), %ymm5
+	vsubps	%ymm3, %ymm5, %ymm1
+	vmovups	ATANHF_DATA(sTopMask12)(%rip), %ymm4
 
-	/* Strip off the sign, so treat X as positive until right at the end */
-	vandps	SgnMask+__svml_satanh_data_internal(%rip), %ymm6, %ymm10
-	vsubps	%ymm10, %ymm5, %ymm1
+	vrcpps	%ymm1, %ymm7
+	vsubps	%ymm1, %ymm5, %ymm9
+	vandps	%ymm4, %ymm7, %ymm6
+	vsubps	%ymm3, %ymm9, %ymm7
 
-	/*
-	 * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
-	 * the upper part UHi being <= 12 bits long. Then we have
-	 * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
-	 */
-	vaddps	%ymm10, %ymm10, %ymm14
+	/* No need to split sU when FMA is available */
+	vfnmadd213ps %ymm5, %ymm6, %ymm1
+	vmovaps	%ymm0, %ymm8
+	vfmadd213ps %ymm0, %ymm0, %ymm0
+	vfnmadd231ps %ymm6, %ymm7, %ymm1
 
 	/*
 	 * Check whether |X| < 1, in which case we use the main function.
 	 * Otherwise set the rangemask so that the callout will get used.
 	 * Note that this will also use the callout for NaNs since not(NaN < 1).
 	 */
-	vcmpnlt_uqps %ymm5, %ymm10, %ymm7
-	vsubps	%ymm1, %ymm5, %ymm9
-	vcmplt_oqps TinyRange+__svml_satanh_data_internal(%rip), %ymm10, %ymm4
-	vrcpps	%ymm1, %ymm11
-	vsubps	%ymm10, %ymm9, %ymm12
-	vandps	%ymm13, %ymm11, %ymm0
+	vcmpnlt_uqps %ymm5, %ymm3, %ymm14
+	vcmplt_oqps ATANHF_DATA(TinyRange)(%rip), %ymm3, %ymm15
 
-	/* No need to split sU when FMA is available */
-	vfnmadd213ps %ymm5, %ymm0, %ymm1
-	vmovaps	%ymm6, %ymm8
-	vfmadd213ps %ymm6, %ymm6, %ymm8
-	vfnmadd231ps %ymm0, %ymm12, %ymm1
+	/*
+	 * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
+	 * the upper part UHi being <= 12 bits long. Then we have
+	 * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
+	 */
+	vaddps	%ymm3, %ymm3, %ymm3
 
 	/*
 	 * Split V as well into upper 12 bits and lower part, so that we can get
 	 * a preliminary quotient estimate without rounding error.
 	 */
-	vandps	%ymm13, %ymm14, %ymm15
-	vmovmskps %ymm7, %edx
-	vsubps	%ymm15, %ymm14, %ymm7
+	vandps	%ymm4, %ymm3, %ymm4
+	vsubps	%ymm4, %ymm3, %ymm7
 
 	/* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
-	vmulps	%ymm15, %ymm0, %ymm10
+	vmulps	%ymm4, %ymm6, %ymm4
 
 	/* Compute D = E + E^2 */
 	vfmadd213ps %ymm1, %ymm1, %ymm1
 
-	/* Record the sign for eventual reincorporation. */
-	vandps	sSign+__svml_satanh_data_internal(%rip), %ymm6, %ymm3
+	/* Record the sign for eventual reincorporation.  */
+	vandnps	%ymm8, %ymm2, %ymm3
 
 	/* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
-	vorps	%ymm3, %ymm8, %ymm2
-	vmulps	%ymm7, %ymm0, %ymm8
+	vorps	%ymm3, %ymm0, %ymm13
+	vmulps	%ymm7, %ymm6, %ymm2
 
 	/*
 	 * Compute R * (VHi + VLo) * (1 + E + E^2)
 	 * = R *  (VHi + VLo) * (1 + D)
 	 * = QHi + (QHi * D + QLo + QLo * D)
 	 */
-	vmulps	%ymm1, %ymm10, %ymm9
-	vfmadd213ps %ymm8, %ymm8, %ymm1
-	vaddps	%ymm1, %ymm9, %ymm1
 
-	/* reduction: compute r, n */
-	vmovups	iBrkValue+__svml_satanh_data_internal(%rip), %ymm9
+	/*
+	 * If less precision is acceptable the `vmulps %ymm1, %ymm4, %ymm9;
+	 * vaddps %ymm1, %ymm9, %ymm1` can be replaced with
+	 * `vfmadd231ps %ymm1, %ymm4, %ymm4`.
+	 */
+	vmulps	%ymm1, %ymm4, %ymm6
+	vfmadd213ps %ymm2, %ymm2, %ymm1
+	vaddps	%ymm1, %ymm6, %ymm1
 
 	/*
 	 * Now finally accumulate the high and low parts of the
 	 * argument to log1p, H + L, with a final compensated summation.
 	 */
-	vaddps	%ymm1, %ymm10, %ymm12
-	vsubps	%ymm12, %ymm10, %ymm11
+	vaddps	%ymm1, %ymm4, %ymm2
+
+	/* reduction: compute r, n */
+	vmovups	ATANHF_DATA(iBrkValue)(%rip), %ymm9
 
 	/*
 	 * Now we feed into the log1p code, using H in place of _VARG1 and
 	 * later incorporating L into the reduced argument.
 	 * compute 1+x as high, low parts
 	 */
-	vmaxps	%ymm12, %ymm5, %ymm13
-	vminps	%ymm12, %ymm5, %ymm14
-	vaddps	%ymm11, %ymm1, %ymm0
-	vaddps	%ymm14, %ymm13, %ymm1
-	vpsubd	%ymm9, %ymm1, %ymm7
-	vsubps	%ymm1, %ymm13, %ymm15
-	vpsrad	$23, %ymm7, %ymm10
-	vpand	iOffExpoMask+__svml_satanh_data_internal(%rip), %ymm7, %ymm8
-	vaddps	%ymm15, %ymm14, %ymm13
-	vpslld	$23, %ymm10, %ymm11
-	vpaddd	%ymm9, %ymm8, %ymm15
-	vaddps	%ymm13, %ymm0, %ymm14
-	vcvtdq2ps %ymm10, %ymm0
-	vpsubd	%ymm11, %ymm5, %ymm12
+	vmaxps	%ymm2, %ymm5, %ymm0
+	vminps	%ymm2, %ymm5, %ymm6
+
+	/* This is needed for rounding (see `vaddps %ymm1, %ymm4, %ymm2`).  */
+	vsubps	%ymm2, %ymm4, %ymm2
+	vaddps	%ymm6, %ymm0, %ymm4
+	vpsubd	%ymm9, %ymm4, %ymm7
+	vsubps	%ymm4, %ymm0, %ymm4
+	vaddps	%ymm2, %ymm1, %ymm2
+	vmovaps	ATANHF_DATA(iOffExpoMask)(%rip), %ymm1
+
+	vandps	%ymm1, %ymm7, %ymm0
+	vaddps	%ymm4, %ymm6, %ymm4
+	vandnps	%ymm7, %ymm1, %ymm6
+	vmovups	ATANHF_DATA(sPoly+0)(%rip), %ymm1
+	vpaddd	%ymm9, %ymm0, %ymm0
+	vaddps	%ymm4, %ymm2, %ymm4
+	vpsubd	%ymm6, %ymm5, %ymm6
 
 	/* polynomial evaluation */
-	vsubps	%ymm5, %ymm15, %ymm5
-	vmulps	%ymm14, %ymm12, %ymm1
-	vaddps	%ymm5, %ymm1, %ymm5
-	vmovups	sPoly+224+__svml_satanh_data_internal(%rip), %ymm1
-	vfmadd213ps sPoly+192+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-	vfmadd213ps sPoly+160+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-	vfmadd213ps sPoly+128+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-	vfmadd213ps sPoly+96+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-	vfmadd213ps sPoly+64+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-	vfmadd213ps sPoly+32+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-	vfmadd213ps sPoly+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-	vmulps	%ymm1, %ymm5, %ymm7
-	vfmadd213ps %ymm5, %ymm5, %ymm7
+	vsubps	%ymm5, %ymm0, %ymm2
+	vfmadd231ps %ymm4, %ymm6, %ymm2
+	vfmadd213ps ATANHF_DATA(sPoly+32)(%rip), %ymm2, %ymm1
+	vfmadd213ps ATANHF_DATA(sPoly+64)(%rip), %ymm2, %ymm1
+	vfmadd213ps ATANHF_DATA(sPoly+96)(%rip), %ymm2, %ymm1
+	vfmadd213ps ATANHF_DATA(sPoly+128)(%rip), %ymm2, %ymm1
+	vfmadd213ps ATANHF_DATA(sPoly+160)(%rip), %ymm2, %ymm1
+	vfmadd213ps ATANHF_DATA(sPoly+192)(%rip), %ymm2, %ymm1
+	vfmadd213ps ATANHF_DATA(sPoly+224)(%rip), %ymm2, %ymm1
+
+	vmulps	%ymm1, %ymm2, %ymm1
+	vfmadd213ps %ymm2, %ymm2, %ymm1
 
 	/* final reconstruction */
-	vfmadd132ps sLn2+__svml_satanh_data_internal(%rip), %ymm7, %ymm0
+	vpsrad	$23, %ymm7, %ymm6
+	vcvtdq2ps %ymm6, %ymm2
+	vfmadd132ps ATANHF_DATA(sLn2)(%rip), %ymm1, %ymm2
 
 	/* Finally, halve the result and reincorporate the sign */
-	vxorps	sHalf+__svml_satanh_data_internal(%rip), %ymm3, %ymm3
-	vmulps	%ymm0, %ymm3, %ymm0
-	vblendvps %ymm4, %ymm2, %ymm0, %ymm0
+	vxorps	ATANHF_DATA(sHalf)(%rip), %ymm3, %ymm3
+	vmulps	%ymm2, %ymm3, %ymm2
+	vmovmskps %ymm14, %edx
 	testl	%edx, %edx
 
+	vblendvps %ymm15, %ymm13, %ymm2, %ymm0
 	/* Go to special inputs processing branch */
 	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx r12 r13 r14 r15 edx ymm0 ymm6
-
-	/* Restore registers
-	 * and exit the function
-	 */
-
-L(EXIT):
-	movq	%rbp, %rsp
-	popq	%rbp
-	cfi_def_cfa(7, 8)
-	cfi_restore(6)
+	# LOE rbx rdx r12 r13 r14 r15 ymm0
+	/* No registers to restore on fast path.  */
 	ret
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
 
-	/* Branch to process
-	 * special inputs
-	 */
 
+	/* Cold case. edx has 1s where there was a special value that
+	   needs to be handled by a atanhf call. Optimize for code size
+	   moreso than speed here. */
 L(SPECIAL_VALUES_BRANCH):
-	vmovups	%ymm6, 32(%rsp)
-	vmovups	%ymm0, 64(%rsp)
-	# LOE rbx r12 r13 r14 r15 edx ymm0
-
-	xorl	%eax, %eax
-	# LOE rbx r12 r13 r14 r15 eax edx
-
-	vzeroupper
-	movq	%r12, 16(%rsp)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
-	movl	%eax, %r12d
-	movq	%r13, 8(%rsp)
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
-	movl	%edx, %r13d
-	movq	%r14, (%rsp)
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r15 r12d r13d
-
-	/* Range mask
-	 * bits check
+	# LOE rbx rdx r12 r13 r14 r15 ymm0 ymm8
+    /* Use r13 to save/restore the stack. This allows us to use rbp as
+       a callee-saved register, which saves code size. */
+	pushq	%r13
+	cfi_def_cfa(rsp, 16)
+	/* Need callee-saved registers to preserve state across atanhf calls.
 	 */
+	pushq	%rbx
+	cfi_def_cfa(rsp, 24)
+	pushq	%rbp
+	cfi_def_cfa(rsp, 32)
+	movq	%rsp, %r13
+	cfi_def_cfa(r13, 32)
 
-L(RANGEMASK_CHECK):
-	btl	%r12d, %r13d
+	/* Align stack and make room for 2x ymm vectors.  */
+	andq	$-32, %rsp
+	addq	$-64, %rsp
 
-	/* Call scalar math function */
-	jc	L(SCALAR_MATH_CALL)
-	# LOE rbx r15 r12d r13d
+	/* Save all already computed inputs.  */
+	vmovups	%ymm0, (%rsp)
+	/* Save original input (ymm8 unchanged up to this point).  */
+	vmovups	%ymm8, 32(%rsp)
 
-	/* Special inputs
-	 * processing loop
-	 */
+	vzeroupper
 
+	/* edx has 1s where there was a special value that needs to be handled
+	   by an atanhf call.  */
+	movl	%edx, %ebx
 L(SPECIAL_VALUES_LOOP):
-	incl	%r12d
-	cmpl	$8, %r12d
-
-	/* Check bits in range mask */
-	jl	L(RANGEMASK_CHECK)
-	# LOE rbx r15 r12d r13d
-
-	movq	16(%rsp), %r12
-	cfi_restore(12)
-	movq	8(%rsp), %r13
-	cfi_restore(13)
-	movq	(%rsp), %r14
-	cfi_restore(14)
-	vmovups	64(%rsp), %ymm0
-
-	/* Go to exit */
-	jmp	L(EXIT)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r12 r13 r14 r15 ymm0
-
-	/* Scalar math fucntion call
-	 * to process special input
-	 */
-
-L(SCALAR_MATH_CALL):
-	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	# LOE rbx rbp r12 r13 r14 r15
+	/* use rbp as index for special value that is saved across calls to
+	   atanhf. We technically don't need a callee save register here as offset
+	   to rsp is always [0, 28] so we can restore rsp by realigning to 64.
+	   Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
+	   in the loop. Realigning also costs more code size.  */
+	xorl	%ebp, %ebp
+	tzcntl	%ebx, %ebp
+
+	/* Scalar math function call to process special input.  */
+	movss	32(%rsp, %rbp, 4), %xmm0
 	call	atanhf@PLT
-	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	/* No good way to avoid the store-forwarding fault this will cause on
+	   return. `lfence` avoids the SF fault but at greater cost as it
+	   serializes stack/callee save restoration.  */
+	movss	%xmm0, (%rsp, %rbp, 4)
+
+	blsrl   %ebx, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+	# LOE r12 r13 r14 r15
+
 
-	/* Process special inputs in loop */
-	jmp	L(SPECIAL_VALUES_LOOP)
-	# LOE rbx r15 r12d r13d
+	/* All results have been written to (%rsp).  */
+	vmovups	(%rsp), %ymm0
+	/* Restore rsp.  */
+	movq	%r13, %rsp
+	cfi_def_cfa(rsp, 32)
+	/* Restore callee save registers.  */
+	popq	%rbp
+	cfi_def_cfa(rsp, 24)
+	popq	%rbx
+	cfi_def_cfa(rsp, 16)
+	popq	%r13
+	ret
 END(_ZGVdN8v_atanhf_avx2)
 
 	.section .rodata, "a"
 	.align	32
-
 #ifdef __svml_satanh_data_internal_typedef
 typedef unsigned int VUINT32;
-typedef struct {
+typedef struct{
 	__declspec(align(32)) VUINT32 SgnMask[8][1];
 	__declspec(align(32)) VUINT32 sOne[8][1];
-	__declspec(align(32)) VUINT32 sPoly[8][8][1];
-	__declspec(align(32)) VUINT32 iBrkValue[8][1];
-	__declspec(align(32)) VUINT32 iOffExpoMask[8][1];
-	__declspec(align(32)) VUINT32 sHalf[8][1];
-	__declspec(align(32)) VUINT32 sSign[8][1];
 	__declspec(align(32)) VUINT32 sTopMask12[8][1];
 	__declspec(align(32)) VUINT32 TinyRange[8][1];
+	__declspec(align(32)) VUINT32 iBrkValue[8][1];
+	__declspec(align(32)) VUINT32 iOffExpoMask[8][1];
+	__declspec(align(32)) VUINT32 sPoly[8][8][1];
 	__declspec(align(32)) VUINT32 sLn2[8][1];
+	__declspec(align(32)) VUINT32 sHalf[8][1];
 } __svml_satanh_data_internal;
 #endif
 __svml_satanh_data_internal:
 	/* SgnMask */
-	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
 	/* sOne = SP 1.0 */
 	.align	32
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	/* sPoly[] = SP polynomial */
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	/* sTopMask12 */
+	.align	32
+	.long	0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
+	.long	0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
+	/* TinyRange */
 	.align	32
-	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
-	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
-	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
-	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
-	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
-	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
-	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
-	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
+	.long	0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
+	.long	0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
 	/* iBrkValue = SP 2/3 */
 	.align	32
-	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
+	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
+	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
 	/* iOffExpoMask = SP significand mask */
 	.align	32
-	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
-	/* sHalf */
-	.align	32
-	.long	0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
-	/* sSign */
-	.align	32
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
-	/* sTopMask12 */
-	.align	32
-	.long	0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
-	/* TinyRange */
+	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
+	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
+	/* sPoly[] = SP polynomial */
 	.align	32
-	.long	0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
+	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed
+	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
+	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3
+	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
+	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12
+	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
+	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37
+	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
+	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190
+	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
+	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e
+	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
+	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94
+	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
 	/* sLn2 = SP ln(2) */
 	.align	32
-	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
+	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
+	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
+	/* sHalf */
+	.align	32
+	.long	0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
+	.long	0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
 	.align	32
 	.type	__svml_satanh_data_internal, @object
 	.size	__svml_satanh_data_internal, .-__svml_satanh_data_internal
-- 
2.34.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH v1 3/7] x86: Improve svml_s_atanhf4_core_sse4.S
  2022-06-07 20:06 [PATCH v1 1/7] x86: Improve svml_s_atanhf16_core_avx512.S Noah Goldstein
  2022-06-07 20:06 ` [PATCH v1 2/7] x86: Improvement svml_s_atanhf8_core_avx2.S Noah Goldstein
@ 2022-06-07 20:06 ` Noah Goldstein
  2022-06-07 20:06 ` [PATCH v1 4/7] x86: Optimize svml_s_tanhf16_core_avx512.S Noah Goldstein
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 48+ messages in thread
From: Noah Goldstein @ 2022-06-07 20:06 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, Andrey.Kolesov

Improvements are:
    1. Reduce code size (-62 bytes).
    2. Remove redundant move instructions.
    3. Slightly improve instruction selection/scheduling where
       possible.
    4. Prefer registers which get short instruction encoding.
    5. Reduce rodata usage (-16 bytes).

The throughput improvement is not significant as the port 0 bottleneck
is unavoidable.

       Function, New Time, Old Time, New / Old
_ZGVbN4v_atanhf,    8.821,    8.903,     0.991
---
 .../fpu/multiarch/svml_s_atanhf4_core_sse4.S  | 377 ++++++++----------
 1 file changed, 168 insertions(+), 209 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
index 2d3ad2617f..baed047a1f 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
@@ -30,96 +30,80 @@
  *
  */
 
-/* Offsets for data table __svml_satanh_data_internal
- */
-#define SgnMask				0
-#define sOne				16
-#define sPoly				32
-#define iBrkValue			160
-#define iOffExpoMask			176
-#define sHalf				192
-#define sSign				208
-#define sTopMask12			224
-#define TinyRange			240
-#define sLn2				256
+/* Offsets for data table __svml_satanh_data_internal. Ordered
+   by use in the function. On cold-starts this might help the
+   prefetcher. Possibly a better idea is to interleave start/end so
+   that the prefetcher is less likely to detect a stream and pull
+   irrelevant lines into cache.  */
+#define sOne				0
+#define SgnMask				16
+#define sTopMask12			32
+#define iBrkValue			48
+#define iOffExpoMask			64
+#define sPoly				80
+#define sLn2				208
+#define TinyRange			224
 
 #include <sysdep.h>
+#define ATANHF_DATA(x)			((x)+__svml_satanh_data_internal)
 
 	.section .text.sse4, "ax", @progbits
 ENTRY(_ZGVbN4v_atanhf_sse4)
-	subq	$72, %rsp
-	cfi_def_cfa_offset(80)
 	movaps	%xmm0, %xmm5
 
 	/* Load constants including One = 1 */
-	movups	sOne+__svml_satanh_data_internal(%rip), %xmm4
+	movups	ATANHF_DATA(sOne)(%rip), %xmm4
 	movaps	%xmm5, %xmm3
 
 	/* Strip off the sign, so treat X as positive until right at the end */
-	movups	SgnMask+__svml_satanh_data_internal(%rip), %xmm7
-	movaps	%xmm4, %xmm8
-	andps	%xmm5, %xmm7
+	movups	ATANHF_DATA(SgnMask)(%rip), %xmm1
+	movaps	%xmm4, %xmm2
+	andps	%xmm1, %xmm0
 	movaps	%xmm4, %xmm10
-	movups	sTopMask12+__svml_satanh_data_internal(%rip), %xmm11
+	movups	ATANHF_DATA(sTopMask12)(%rip), %xmm11
 	movaps	%xmm4, %xmm14
 	movaps	%xmm11, %xmm9
 
+
 	/*
 	 * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
 	 * the upper part UHi being <= 12 bits long. Then we have
 	 * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
 	 */
-	movaps	%xmm7, %xmm12
+	movaps	%xmm0, %xmm6
+	mulps	%xmm5, %xmm3
+	subps	%xmm0, %xmm2
+	addps	%xmm0, %xmm6
+	subps	%xmm2, %xmm10
+	addps	%xmm5, %xmm3
+	subps	%xmm0, %xmm10
+	andps	%xmm2, %xmm9
+
 
 	/*
 	 * Check whether |X| < 1, in which case we use the main function.
 	 * Otherwise set the rangemask so that the callout will get used.
 	 * Note that this will also use the callout for NaNs since not(NaN < 1).
 	 */
-	movaps	%xmm7, %xmm6
-	movaps	%xmm7, %xmm2
-	cmpnltps %xmm4, %xmm6
-	cmpltps	TinyRange+__svml_satanh_data_internal(%rip), %xmm2
-	mulps	%xmm5, %xmm3
-	subps	%xmm7, %xmm8
-	addps	%xmm7, %xmm12
-	movmskps %xmm6, %edx
-	subps	%xmm8, %xmm10
-	addps	%xmm5, %xmm3
-	subps	%xmm7, %xmm10
-	andps	%xmm8, %xmm9
+	rcpps	%xmm9, %xmm7
+	subps	%xmm9, %xmm2
+	andps	%xmm11, %xmm7
 
-	/*
-	 * Now we feed into the log1p code, using H in place of _VARG1 and
-	 * later incorporating L into the reduced argument.
-	 * compute 1+x as high, low parts
-	 */
-	movaps	%xmm4, %xmm7
-
-	/*
-	 * Now compute R = 1/(UHi+ULo) * (1 - E) and the error term E
-	 * The first FMR is exact (we force R to 12 bits just in case it
-	 * isn't already, to make absolutely sure), and since E is ~ 2^-12,
-	 * the rounding error in the other one is acceptable.
-	 */
-	rcpps	%xmm9, %xmm15
-	subps	%xmm9, %xmm8
-	andps	%xmm11, %xmm15
 
 	/*
 	 * Split V as well into upper 12 bits and lower part, so that we can get
 	 * a preliminary quotient estimate without rounding error.
 	 */
-	andps	%xmm12, %xmm11
-	mulps	%xmm15, %xmm9
-	addps	%xmm8, %xmm10
-	subps	%xmm11, %xmm12
+	andps	%xmm6, %xmm11
+	mulps	%xmm7, %xmm9
+	addps	%xmm2, %xmm10
+	subps	%xmm11, %xmm6
 
 	/* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
-	mulps	%xmm15, %xmm11
-	mulps	%xmm15, %xmm10
+	mulps	%xmm7, %xmm11
+	mulps	%xmm7, %xmm10
 	subps	%xmm9, %xmm14
-	mulps	%xmm12, %xmm15
+	mulps	%xmm6, %xmm7
 	subps	%xmm10, %xmm14
 
 	/* Compute D = E + E^2 */
@@ -127,8 +111,8 @@ ENTRY(_ZGVbN4v_atanhf_sse4)
 	movaps	%xmm4, %xmm8
 	mulps	%xmm14, %xmm13
 
-	/* reduction: compute r, n */
-	movdqu	iBrkValue+__svml_satanh_data_internal(%rip), %xmm9
+	/* reduction: compute r,n */
+	movdqu	ATANHF_DATA(iBrkValue)(%rip), %xmm9
 	addps	%xmm13, %xmm14
 
 	/*
@@ -136,168 +120,148 @@ ENTRY(_ZGVbN4v_atanhf_sse4)
 	 * = R *  (VHi + VLo) * (1 + D)
 	 * = QHi + (QHi * D + QLo + QLo * D)
 	 */
-	movaps	%xmm14, %xmm0
-	mulps	%xmm15, %xmm14
-	mulps	%xmm11, %xmm0
-	addps	%xmm14, %xmm15
-	movdqu	iOffExpoMask+__svml_satanh_data_internal(%rip), %xmm12
+	movaps	%xmm14, %xmm2
+	mulps	%xmm7, %xmm14
+	mulps	%xmm11, %xmm2
+	addps	%xmm14, %xmm7
+	movdqu	ATANHF_DATA(iOffExpoMask)(%rip), %xmm12
 	movaps	%xmm4, %xmm14
 
 	/* Record the sign for eventual reincorporation. */
-	movups	sSign+__svml_satanh_data_internal(%rip), %xmm1
-	addps	%xmm15, %xmm0
+	addps	%xmm7, %xmm2
+
 
 	/*
 	 * Now finally accumulate the high and low parts of the
 	 * argument to log1p, H + L, with a final compensated summation.
 	 */
-	movaps	%xmm0, %xmm6
-	andps	%xmm5, %xmm1
-
+	movaps	%xmm2, %xmm6
+	andnps	%xmm5, %xmm1
+	movaps	%xmm4, %xmm7
 	/* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
-	orps	%xmm1, %xmm3
 	addps	%xmm11, %xmm6
 	maxps	%xmm6, %xmm7
 	minps	%xmm6, %xmm8
 	subps	%xmm6, %xmm11
 	movaps	%xmm7, %xmm10
-	andps	%xmm2, %xmm3
 	addps	%xmm8, %xmm10
-	addps	%xmm11, %xmm0
+	addps	%xmm11, %xmm2
 	subps	%xmm10, %xmm7
 	psubd	%xmm9, %xmm10
-	addps	%xmm7, %xmm8
+	addps	%xmm8, %xmm7
 	pand	%xmm10, %xmm12
 	psrad	$23, %xmm10
 	cvtdq2ps %xmm10, %xmm13
-	addps	%xmm8, %xmm0
+	addps	%xmm7, %xmm2
 
 	/* final reconstruction */
-	mulps	sLn2+__svml_satanh_data_internal(%rip), %xmm13
 	pslld	$23, %xmm10
 	paddd	%xmm9, %xmm12
 	psubd	%xmm10, %xmm14
 
 	/* polynomial evaluation */
 	subps	%xmm4, %xmm12
-	mulps	%xmm0, %xmm14
-	movups	sPoly+112+__svml_satanh_data_internal(%rip), %xmm0
-	addps	%xmm12, %xmm14
-	mulps	%xmm14, %xmm0
+	mulps	%xmm14, %xmm2
+	movups	ATANHF_DATA(sPoly+0)(%rip), %xmm7
+	addps	%xmm12, %xmm2
+	mulps	%xmm2, %xmm7
+
 
 	/* Finally, halve the result and reincorporate the sign */
-	movups	sHalf+__svml_satanh_data_internal(%rip), %xmm4
-	pxor	%xmm1, %xmm4
-	addps	sPoly+96+__svml_satanh_data_internal(%rip), %xmm0
-	mulps	%xmm14, %xmm0
-	addps	sPoly+80+__svml_satanh_data_internal(%rip), %xmm0
-	mulps	%xmm14, %xmm0
-	addps	sPoly+64+__svml_satanh_data_internal(%rip), %xmm0
-	mulps	%xmm14, %xmm0
-	addps	sPoly+48+__svml_satanh_data_internal(%rip), %xmm0
-	mulps	%xmm14, %xmm0
-	addps	sPoly+32+__svml_satanh_data_internal(%rip), %xmm0
-	mulps	%xmm14, %xmm0
-	addps	sPoly+16+__svml_satanh_data_internal(%rip), %xmm0
-	mulps	%xmm14, %xmm0
-	addps	sPoly+__svml_satanh_data_internal(%rip), %xmm0
-	mulps	%xmm14, %xmm0
-	mulps	%xmm14, %xmm0
-	addps	%xmm0, %xmm14
-	movaps	%xmm2, %xmm0
-	addps	%xmm13, %xmm14
-	mulps	%xmm14, %xmm4
-	andnps	%xmm4, %xmm0
-	orps	%xmm3, %xmm0
-	testl	%edx, %edx
+	addps	ATANHF_DATA(sPoly+16)(%rip), %xmm7
+	mulps	%xmm2, %xmm7
+	addps	ATANHF_DATA(sPoly+32)(%rip), %xmm7
+	mulps	%xmm2, %xmm7
+	addps	ATANHF_DATA(sPoly+48)(%rip), %xmm7
+	mulps	%xmm2, %xmm7
+	addps	ATANHF_DATA(sPoly+64)(%rip), %xmm7
+	mulps	%xmm2, %xmm7
+	addps	ATANHF_DATA(sPoly+80)(%rip), %xmm7
+	mulps	%xmm2, %xmm7
+	addps	ATANHF_DATA(sPoly+96)(%rip), %xmm7
+	mulps	%xmm2, %xmm7
+	movaps	ATANHF_DATA(sPoly+112)(%rip), %xmm6
+	addps	%xmm6, %xmm7
+	mulps	%xmm2, %xmm7
+	mulps	%xmm2, %xmm7
+	mulps	ATANHF_DATA(sLn2)(%rip), %xmm13
+	/* We can build `sHalf` (0.5) as `P0 & sOne` (-0.5 & 1.0 bitwise).  */
+	andps	%xmm4, %xmm6
+	orps	%xmm1, %xmm3
+	xorps	%xmm6, %xmm1
 
-	/* Go to special inputs processing branch */
-	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm5
+	addps	%xmm2, %xmm7
+	addps	%xmm13, %xmm7
+	mulps	%xmm7, %xmm1
 
-	/* Restore registers
-	 * and exit the function
-	 */
+	/* Finish check of NaNs.  */
+	cmpleps	%xmm0, %xmm4
+	movmskps %xmm4, %edx
+	cmpltps	ATANHF_DATA(TinyRange)(%rip), %xmm0
 
-L(EXIT):
-	addq	$72, %rsp
-	cfi_def_cfa_offset(8)
+	andps	%xmm0, %xmm3
+	andnps	%xmm1, %xmm0
+	orps	%xmm3, %xmm0
+
+	testl	%edx, %edx
+	/* Go to special inputs processing branch.  */
+	jne	L(SPECIAL_VALUES_BRANCH)
+	# LOE rbx rbp r12 r13 r14 r15 xmm0
+	/* No registers to restore on fast path.  */
 	ret
-	cfi_def_cfa_offset(80)
 
-	/* Branch to process
-	 * special inputs
-	 */
 
+	/* Cold case. edx has 1s where there was a special value that
+	   needs to be handled by an atanhf call. Optimize for code size
+	   more so than speed here.  */
 L(SPECIAL_VALUES_BRANCH):
-	movups	%xmm5, 32(%rsp)
-	movups	%xmm0, 48(%rsp)
-	# LOE rbx rbp r12 r13 r14 r15 edx
-
-	xorl	%eax, %eax
-	movq	%r12, 16(%rsp)
-	cfi_offset(12, -64)
-	movl	%eax, %r12d
-	movq	%r13, 8(%rsp)
-	cfi_offset(13, -72)
-	movl	%edx, %r13d
-	movq	%r14, (%rsp)
-	cfi_offset(14, -80)
-	# LOE rbx rbp r15 r12d r13d
-
-	/* Range mask
-	 * bits check
-	 */
-
-L(RANGEMASK_CHECK):
-	btl	%r12d, %r13d
-
-	/* Call scalar math function */
-	jc	L(SCALAR_MATH_CALL)
-	# LOE rbx rbp r15 r12d r13d
-
-	/* Special inputs
-	 * processing loop
-	 */
-
+	# LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm5
+	/* On entry rsp is 8 bytes off 16-byte alignment (return address
+	   pushed). Subtracting 56 makes rsp 16-byte aligned at the call.  */
+	subq	$56, %rsp
+
+	movups	%xmm0, 24(%rsp)
+	movups	%xmm5, 40(%rsp)
+
+	/* Use rbx/rbp for callee save registers as they get short
+       encoding for many instructions (as compared with r12/r13). */
+	movq	%rbx, (%rsp)
+	cfi_offset(rbx, -16)
+	movq	%rbp, 8(%rsp)
+	cfi_offset(rbp, -8)
+	/* edx has 1s where there was a special value that needs to be handled
+	   by an atanhf call.  */
+	movl	%edx, %ebx
 L(SPECIAL_VALUES_LOOP):
-	incl	%r12d
-	cmpl	$4, %r12d
-
-	/* Check bits in range mask */
-	jl	L(RANGEMASK_CHECK)
-	# LOE rbx rbp r15 r12d r13d
-
-	movq	16(%rsp), %r12
-	cfi_restore(12)
-	movq	8(%rsp), %r13
-	cfi_restore(13)
-	movq	(%rsp), %r14
-	cfi_restore(14)
-	movups	48(%rsp), %xmm0
-
-	/* Go to exit */
-	jmp	L(EXIT)
-	cfi_offset(12, -64)
-	cfi_offset(13, -72)
-	cfi_offset(14, -80)
-	# LOE rbx rbp r12 r13 r14 r15 xmm0
-
-	/* Scalar math fucntion call
-	 * to process special input
-	 */
-
-L(SCALAR_MATH_CALL):
-	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	# LOE rbx rbp r12 r13 r14 r15
+	/* Use rbp as index for special value that is saved across calls to
+	   atanhf. We technically don't need a callee save register here as offset
+	   to rsp is always [0, 12] so we can restore rsp by realigning to 64.
+	   Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
+	   in the loop.  */
+	xorl	%ebp, %ebp
+	bsfl	%ebx, %ebp
+
+	/* Scalar math function call to process special input.  */
+	movss	40(%rsp, %rbp, 4), %xmm0
 	call	atanhf@PLT
-	# LOE rbx rbp r14 r15 r12d r13d xmm0
-
-	movss	%xmm0, 48(%rsp, %r14, 4)
-
-	/* Process special inputs in loop */
-	jmp	L(SPECIAL_VALUES_LOOP)
-	# LOE rbx rbp r15 r12d r13d
+	/* No good way to avoid the store-forwarding fault this will cause on
+	   return. `lfence` avoids the SF fault but at greater cost as it
+	   serializes stack/callee save restoration.  */
+	movss	%xmm0, 24(%rsp, %rbp, 4)
+
+	leal	-1(%rbx), %eax
+	andl	%eax, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+	# LOE r12 r13 r14 r15
+	/* All results have been written to 24(%rsp).  */
+	movups	24(%rsp), %xmm0
+	movq	(%rsp), %rbx
+	cfi_restore(rbx)
+	movq	8(%rsp), %rbp
+	cfi_restore(rbp)
+	addq	$56, %rsp
+	ret
 END(_ZGVbN4v_atanhf_sse4)
 
 	.section .rodata, "a"
@@ -305,56 +269,51 @@ END(_ZGVbN4v_atanhf_sse4)
 
 #ifdef __svml_satanh_data_internal_typedef
 typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(16)) VUINT32 SgnMask[4][1];
+typedef struct{
 	__declspec(align(16)) VUINT32 sOne[4][1];
-	__declspec(align(16)) VUINT32 sPoly[8][4][1];
+	__declspec(align(16)) VUINT32 SgnMask[4][1];
+	__declspec(align(16)) VUINT32 sTopMask12[4][1];
 	__declspec(align(16)) VUINT32 iBrkValue[4][1];
 	__declspec(align(16)) VUINT32 iOffExpoMask[4][1];
-	__declspec(align(16)) VUINT32 sHalf[4][1];
-	__declspec(align(16)) VUINT32 sSign[4][1];
-	__declspec(align(16)) VUINT32 sTopMask12[4][1];
-	__declspec(align(16)) VUINT32 TinyRange[4][1];
+	__declspec(align(16)) VUINT32 sPoly[8][4][1];
 	__declspec(align(16)) VUINT32 sLn2[4][1];
+	__declspec(align(16)) VUINT32 TinyRange[4][1];
 } __svml_satanh_data_internal;
 #endif
+
 __svml_satanh_data_internal:
-	/* SgnMask */
-	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
 	/* sOne = SP 1.0 */
 	.align	16
 	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	/* sPoly[] = SP polynomial */
+	/* SgnMask */
+	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+	/* sTopMask12 */
 	.align	16
-	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
-	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
-	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
-	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
-	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
-	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
-	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
-	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
+	.long	0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
 	/* iBrkValue = SP 2/3 */
 	.align	16
 	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
-	/* iOffExpoMask = SP significand mask */
+	/* iOffExpoMask = SP significand mask */
 	.align	16
 	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
-	/* sHalf */
-	.align	16
-	.long	0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
-	/* sSign */
+
+	/* sPoly[] = SP polynomial */
 	.align	16
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000
-	/* sTopMask12 */
+	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
+	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
+	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
+	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
+	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
+	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
+	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
+
+	/* sLn2 = SP ln(2) */
 	.align	16
-	.long	0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
+	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
 	/* TinyRange */
 	.align	16
 	.long	0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
-	/* sLn2 = SP ln(2) */
-	.align	16
-	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
 	.align	16
 	.type	__svml_satanh_data_internal, @object
 	.size	__svml_satanh_data_internal, .-__svml_satanh_data_internal
-- 
2.34.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH v1 4/7] x86: Optimize svml_s_tanhf16_core_avx512.S
  2022-06-07 20:06 [PATCH v1 1/7] x86: Improve svml_s_atanhf16_core_avx512.S Noah Goldstein
  2022-06-07 20:06 ` [PATCH v1 2/7] x86: Improvement svml_s_atanhf8_core_avx2.S Noah Goldstein
  2022-06-07 20:06 ` [PATCH v1 3/7] x86: Improve svml_s_atanhf4_core_sse4.S Noah Goldstein
@ 2022-06-07 20:06 ` Noah Goldstein
  2022-06-07 20:06 ` [PATCH v1 5/7] x86: Add data file that can be shared by tanhf-avx2 and tanhf-sse4 Noah Goldstein
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 48+ messages in thread
From: Noah Goldstein @ 2022-06-07 20:06 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, Andrey.Kolesov

Optimizations are:
    1. Reduce code size (-67 bytes).
    2. Remove redundant move instructions.
    3. Slightly improve instruction selection/scheduling where
       possible.
    4. Reduce rodata usage (-448 bytes).

Result is roughly a 14% speedup:

       Function, New Time, Old Time, New / Old
_ZGVeN16v_tanhf,    0.649,    0.752,     0.863
---
 .../multiarch/svml_s_tanhf16_core_avx512.S    | 520 ++++++++++--------
 1 file changed, 280 insertions(+), 240 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
index 5b1f9f151c..620e894e9d 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
@@ -70,310 +70,350 @@
  *
  */
 
-/* Offsets for data table __svml_stanh_data_internal
+/* Offsets for data tables __svml_stanh_data_internal and
+   __svml_stanh_data_internal_al64. Ordered by use in the function.
+   On cold-starts this might help the prefetcher. Possibly a better
+   idea is to interleave start/end so that the prefetcher is less
+   likely to detect a stream and pull irrelevant lines into cache.  */
+
+/* Offsets for data table __svml_stanh_data_internal. 4 bytes each.
  */
-#define _sC				0
-#define _sP0				128
-#define _sP2				256
-#define _sP3				384
-#define _sP4				512
-#define _sP5				640
-#define _sP6				768
-#define _sP7				896
-#define _iExpMantMask_UISA		1024
-#define _iMinIdxOfsMask_UISA		1088
-#define _iMaxIdxMask_UISA		1152
-#define _sSignMask			1216
-#define _sAbsMask			1280
-#define _iExpMantMask			1344
-#define _iExpMask			1408
-#define _iMinIdxOfsMask			1472
-#define _iMaxIdxMask			1536
+#define _iExpMantMask_UISA		0
+#define _iMinIdxOfsMask_UISA		4
+#define _iMaxIdxMask_UISA		8
+#define _iExpMask			12
+
+/* Offsets for data table __svml_stanh_data_internal_al64. 64 bytes
+   each.  */
+#define _sC_lo				0
+#define _sC_hi				64
+#define _sP7_lo				128
+#define _sP7_hi				192
+#define _sSignMask			256
+#define _sP6_lo				320
+#define _sP6_hi				384
+#define _sP5_lo				448
+#define _sP5_hi				512
+#define _sP4_lo				576
+#define _sP4_hi				640
+#define _sP3_lo				704
+#define _sP3_hi				768
+#define _sP2_lo				832
+#define _sP2_hi				896
+#define _sP0_lo				960
+#define _sP0_hi				1024
 
 #include <sysdep.h>
+#define TANHF_DATA(x)			((x)+__svml_stanh_data_internal_al64)
+#define TANHF_DATA_UNALIGNED(x)		((x)+__svml_stanh_data_internal)
 
 	.section .text.exex512, "ax", @progbits
 ENTRY(_ZGVeN16v_tanhf_skx)
-	pushq	%rbp
-	cfi_def_cfa_offset(16)
-	movq	%rsp, %rbp
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
-	andq	$-64, %rsp
-	subq	$192, %rsp
-	vmovaps	%zmm0, %zmm1
-	vmovups	__svml_stanh_data_internal(%rip), %zmm9
-	vmovups	_sP6+__svml_stanh_data_internal(%rip), %zmm11
-	vmovups	_sP5+__svml_stanh_data_internal(%rip), %zmm12
-	vmovups	_sP4+__svml_stanh_data_internal(%rip), %zmm13
-	vmovups	_sP3+__svml_stanh_data_internal(%rip), %zmm14
-	vmovups	_sP2+__svml_stanh_data_internal(%rip), %zmm15
-	vpternlogd $255, %zmm2, %zmm2, %zmm2
-	vandps	_sAbsMask+__svml_stanh_data_internal(%rip), %zmm1, %zmm8
-	vandps	_sSignMask+__svml_stanh_data_internal(%rip), %zmm1, %zmm0
-
 	/* Here huge arguments, INF and NaNs are filtered out to callout. */
-	vpandd	_iExpMantMask_UISA+__svml_stanh_data_internal(%rip), %zmm1, %zmm3
-	vpsubd	_iMinIdxOfsMask_UISA+__svml_stanh_data_internal(%rip), %zmm3, %zmm4
-	vpcmpd	$2, _iExpMask+__svml_stanh_data_internal(%rip), %zmm3, %k1
+	vpandd	TANHF_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
+	vpsubd	TANHF_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
 
-	/*
-	 *  small table specific variables *
-	 *  Constant loading
-	 */
-	vpxord	%zmm5, %zmm5, %zmm5
-
-	/* if VMIN, VMAX is defined for I type */
-	vpmaxsd	%zmm5, %zmm4, %zmm6
-	vpminsd	_iMaxIdxMask_UISA+__svml_stanh_data_internal(%rip), %zmm6, %zmm7
-	vpsrld	$21, %zmm7, %zmm10
-	vmovups	_sP7+__svml_stanh_data_internal(%rip), %zmm4
-	vpermt2ps _sC+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm9
-	vpermt2ps _sP6+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm11
-	vpermt2ps _sP7+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm4
-	vpermt2ps _sP5+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm12
-	vpermt2ps _sP4+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm13
-	vpermt2ps _sP3+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm14
-	vpermt2ps _sP2+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm15
-	vpandnd	%zmm3, %zmm3, %zmm2{%k1}
-	vptestmd %zmm2, %zmm2, %k0
-	vmovups	_sP0+__svml_stanh_data_internal(%rip), %zmm3
-	vsubps	{rn-sae}, %zmm9, %zmm8, %zmm2
-	kmovw	%k0, %edx
-	vfmadd213ps {rn-sae}, %zmm11, %zmm2, %zmm4
-	vpermt2ps _sP0+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm3
-	vfmadd213ps {rn-sae}, %zmm12, %zmm2, %zmm4
-	vfmadd213ps {rn-sae}, %zmm13, %zmm2, %zmm4
-	vfmadd213ps {rn-sae}, %zmm14, %zmm2, %zmm4
-	vfmadd213ps {rn-sae}, %zmm15, %zmm2, %zmm4
-	vfmadd213ps {rn-sae}, %zmm3, %zmm2, %zmm4
-	vorps	%zmm0, %zmm4, %zmm0
-	testl	%edx, %edx
+	/* Clamp arguments to [0, 0x03e00000] in zmm3.  */
+	vpxord	%zmm3, %zmm3, %zmm3
+	vpmaxsd	%zmm3, %zmm2, %zmm3
+	vpminsd	TANHF_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
 
-	/* Go to special inputs processing branch */
-	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm1
+	/* Setup permute indices in zmm3.  */
+	vpsrld	$21, %zmm3, %zmm3
 
-	/* Restore registers
-	 * and exit the function
-	 */
+	/* Store if there are any special cases in k1.  */
+	vpcmpd	$6, TANHF_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
 
-L(EXIT):
-	movq	%rbp, %rsp
-	popq	%rbp
-	cfi_def_cfa(7, 8)
-	cfi_restore(6)
-	ret
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
+	vmovaps	TANHF_DATA(_sC_lo)(%rip), %zmm5
+	vpermt2ps TANHF_DATA(_sC_hi)(%rip), %zmm3, %zmm5
 
-	/* Branch to process
-	 * special inputs
-	 */
+	vmovaps	TANHF_DATA(_sP7_lo)(%rip), %zmm2
+	vpermt2ps TANHF_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
 
-L(SPECIAL_VALUES_BRANCH):
-	vmovups	%zmm1, 64(%rsp)
-	vmovups	%zmm0, 128(%rsp)
-	# LOE rbx r12 r13 r14 r15 edx zmm0
+	/* Store absolute values of inputs in zmm1.  */
+	vmovaps	TANHF_DATA(_sSignMask)(%rip), %zmm4
+	vandnps	%zmm0, %zmm4, %zmm1
+	vsubps	{rn-sae}, %zmm5, %zmm1, %zmm1
 
-	xorl	%eax, %eax
-	# LOE rbx r12 r13 r14 r15 eax edx
+	vmovaps	TANHF_DATA(_sP6_lo)(%rip), %zmm5
+	vpermt2ps TANHF_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
 
-	vzeroupper
-	movq	%r12, 16(%rsp)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
-	movl	%eax, %r12d
-	movq	%r13, 8(%rsp)
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-	movl	%edx, %r13d
-	movq	%r14, (%rsp)
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r15 r12d r13d
-
-	/* Range mask
-	 * bits check
-	 */
+	vmovaps	TANHF_DATA(_sP5_lo)(%rip), %zmm6
+	vpermt2ps TANHF_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
+
+	vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm2
+	vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm2
 
-L(RANGEMASK_CHECK):
-	btl	%r12d, %r13d
+	vmovaps	TANHF_DATA(_sP4_lo)(%rip), %zmm7
+	vpermt2ps TANHF_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
 
-	/* Call scalar math function */
-	jc	L(SCALAR_MATH_CALL)
-	# LOE rbx r15 r12d r13d
+	vmovaps	TANHF_DATA(_sP3_lo)(%rip), %zmm8
+	vpermt2ps TANHF_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
 
-	/* Special inputs
-	 * processing loop
+	vfmadd213ps {rn-sae}, %zmm7, %zmm1, %zmm2
+	vfmadd213ps {rn-sae}, %zmm8, %zmm1, %zmm2
+
+	vmovaps	TANHF_DATA(_sP2_lo)(%rip), %zmm9
+	vpermt2ps TANHF_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
+
+	vmovaps	TANHF_DATA(_sP0_lo)(%rip), %zmm10
+	vpermt2ps TANHF_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
+
+	vfmadd213ps {rn-sae}, %zmm9, %zmm1, %zmm2
+	vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm2
+
+	kmovw	%k1, %edx
+	testl	%edx, %edx
+
+	/* Go to special inputs processing branch.  */
+	jne	L(SPECIAL_VALUES_BRANCH)
+	# LOE rbx r12 r13 r14 r15 zmm0 zmm2 zmm4
+	/* Wait until after branch to write over zmm0.  */
+	vpternlogd $0xec, %zmm4, %zmm2, %zmm0
+
+	/* No stack restoration on the fastpath.  */
+	ret
+
+	/* Cold case. edx has 1s where there was a special value that
+	   needs to be handled by a tanhf call. Optimize for code size
+	   more so than speed here.  */
+L(SPECIAL_VALUES_BRANCH):
+	# LOE rbx rdx r12 r13 r14 r15 zmm0 zmm2 zmm4
+	/* Use r13 to save/restore the stack pointer. This frees rbp for
+	   use as a callee-saved register, which saves code size.  */
+	pushq	%r13
+	cfi_def_cfa(rsp, 16)
+	/* Need callee-saved registers to preserve state across tanhf calls.
 	 */
+	pushq	%rbx
+	cfi_def_cfa(rsp, 24)
+	pushq	%rbp
+	cfi_def_cfa(rsp, 32)
+	movq	%rsp, %r13
+	cfi_def_cfa(r13, 32)
+
+	/* Align stack and make room for 2x zmm vectors.  */
+	andq	$-64, %rsp
+	addq	$-128, %rsp
+
+	/* Save original input (zmm0 unchanged up to this point).  */
+	vmovaps	%zmm0, 64(%rsp)
+	/* Save all already computed results.  */
+	vpternlogd $0xec, %zmm4, %zmm2, %zmm0
+	vmovaps	%zmm0, (%rsp)
 
+	vzeroupper
+
+	/* edx has 1s where there was a special value that needs to be handled
+	   by a tanhf call.  */
+	movl	%edx, %ebx
 L(SPECIAL_VALUES_LOOP):
-	incl	%r12d
-	cmpl	$16, %r12d
-
-	/* Check bits in range mask */
-	jl	L(RANGEMASK_CHECK)
-	# LOE rbx r15 r12d r13d
-
-	movq	16(%rsp), %r12
-	cfi_restore(12)
-	movq	8(%rsp), %r13
-	cfi_restore(13)
-	movq	(%rsp), %r14
-	cfi_restore(14)
-	vmovups	128(%rsp), %zmm0
-
-	/* Go to exit */
-	jmp	L(EXIT)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r12 r13 r14 r15 zmm0
-
-	/* Scalar math fucntion call
-	 * to process special input
-	 */
+	# LOE rbx rbp r12 r13 r14 r15
+	/* use rbp as index for special value that is saved across calls to
+	   tanhf. We technically don't need a callee save register here as offset
+	   to rsp is always [0, 56] so we can restore rsp by realigning to 64.
+	   Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
+	   in the loop. Realigning also costs more code size.  */
+	xorl	%ebp, %ebp
+	tzcntl	%ebx, %ebp
 
-L(SCALAR_MATH_CALL):
-	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	/* Scalar math function call to process special input.  */
+	movss	64(%rsp, %rbp, 4), %xmm0
 	call	tanhf@PLT
-	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	/* No good way to avoid the store-forwarding fault this will cause on
+	   return. `lfence` avoids the SF fault but at greater cost as it
+	   serializes stack/callee save restoration.  */
+	movss	%xmm0, (%rsp, %rbp, 4)
 
-	/* Process special inputs in loop */
-	jmp	L(SPECIAL_VALUES_LOOP)
-	# LOE rbx r15 r12d r13d
+	blsrl   %ebx, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+	# LOE r12 r13 r14 r15
+
+	/* All results have been written to (%rsp).  */
+	vmovaps	(%rsp), %zmm0
+	/* Restore rsp.  */
+	movq	%r13, %rsp
+	cfi_def_cfa(rsp, 32)
+	/* Restore callee save registers.  */
+	popq	%rbp
+	cfi_def_cfa(rsp, 24)
+	popq	%rbx
+	cfi_def_cfa(rsp, 16)
+	popq	%r13
+	ret
 END(_ZGVeN16v_tanhf_skx)
 
 	.section .rodata, "a"
-	.align	64
-
+	.align	16
 #ifdef __svml_stanh_data_internal_typedef
 typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(64)) VUINT32 _sC[32][1];
-	__declspec(align(64)) VUINT32 _sP0[32][1];
-	__declspec(align(64)) VUINT32 _sP2[32][1];
-	__declspec(align(64)) VUINT32 _sP3[32][1];
-	__declspec(align(64)) VUINT32 _sP4[32][1];
-	__declspec(align(64)) VUINT32 _sP5[32][1];
-	__declspec(align(64)) VUINT32 _sP6[32][1];
-	__declspec(align(64)) VUINT32 _sP7[32][1];
-	__declspec(align(64)) VUINT32 _iExpMantMask_UISA[16][1];
-	__declspec(align(64)) VUINT32 _iMinIdxOfsMask_UISA[16][1];
-	__declspec(align(64)) VUINT32 _iMaxIdxMask_UISA[16][1];
+typedef struct
+	{
+	__declspec(align(4)) VUINT32 _iExpMantMask_UISA[1][1];
+	__declspec(align(4)) VUINT32 _iMinIdxOfsMask_UISA[1][1];
+	__declspec(align(4)) VUINT32 _iMaxIdxMask_UISA[1][1];
+	__declspec(align(4)) VUINT32 _iExpMask[1][1];
+	__declspec(align(64)) VUINT32 _sC_lo[16][1];
+	__declspec(align(64)) VUINT32 _sC_hi[16][1];
+	__declspec(align(64)) VUINT32 _sP7_lo[16][1];
+	__declspec(align(64)) VUINT32 _sP7_hi[16][1];
 	__declspec(align(64)) VUINT32 _sSignMask[16][1];
-	__declspec(align(64)) VUINT32 _sAbsMask[16][1];
-	__declspec(align(64)) VUINT32 _iExpMantMask[16][1];
-	__declspec(align(64)) VUINT32 _iExpMask[16][1];
-	__declspec(align(64)) VUINT32 _iMinIdxOfsMask[16][1];
-	__declspec(align(64)) VUINT32 _iMaxIdxMask[16][1];
+	__declspec(align(64)) VUINT32 _sP6_lo[16][1];
+	__declspec(align(64)) VUINT32 _sP6_hi[16][1];
+	__declspec(align(64)) VUINT32 _sP5_lo[16][1];
+	__declspec(align(64)) VUINT32 _sP5_hi[16][1];
+	__declspec(align(64)) VUINT32 _sP4_lo[16][1];
+	__declspec(align(64)) VUINT32 _sP4_hi[16][1];
+	__declspec(align(64)) VUINT32 _sP3_lo[16][1];
+	__declspec(align(64)) VUINT32 _sP3_hi[16][1];
+	__declspec(align(64)) VUINT32 _sP2_lo[16][1];
+	__declspec(align(64)) VUINT32 _sP2_hi[16][1];
+	__declspec(align(64)) VUINT32 _sP0_lo[16][1];
+	__declspec(align(64)) VUINT32 _sP0_hi[16][1];
 } __svml_stanh_data_internal;
 #endif
+
 __svml_stanh_data_internal:
-	/* _sC */
+	.align	4
+	/* _iExpMantMask_UISA */
+	.long	0x7fe00000
+
+	.align	4
+	/* _iMinIdxOfsMask_UISA */
+	.long	0x3d400000
+
+	.align	4
+	/* _iMaxIdxMask_UISA */
+	.long	0x03e00000
+
+	.align	4
+	/* _iExpMask */
+	.long	0x7f000000
+
+	.align	64
+__svml_stanh_data_internal_al64:
+	.align	64
+	/* _sC_lo */
 	.long	0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
 	.long	0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
 	.long	0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
 	.long	0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
+
+	.align	64
+	/* _sC_hi */
 	.long	0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
 	.long	0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
 	.long	0x40500000, 0x40700000, 0x40900000, 0x40b00000
 	.long	0x40d00000, 0x40f00000, 0x41100000, 0x00000000
-	/* p0 */
-	.align	64
-	.long	0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
-	.long	0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
-	.long	0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
-	.long	0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
-	.long	0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
-	.long	0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
-	.long	0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
-	.long	0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
-	/* p2 */
-	.align	64
-	.long	0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
-	.long	0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
-	.long	0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
-	.long	0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
-	.long	0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
-	.long	0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
-	.long	0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
-	.long	0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
-	/* p3 */
+
 	.align	64
-	.long	0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
-	.long	0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
-	.long	0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
-	.long	0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
-	.long	0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
-	.long	0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
-	.long	0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
-	.long	0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
-	/* p4 */
+	/* _sP7_lo */
+	.long	0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
+	.long	0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
+	.long	0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
+	.long	0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
+
 	.align	64
-	.long	0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
-	.long	0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
-	.long	0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
-	.long	0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
-	.long	0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
-	.long	0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
-	.long	0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
-	.long	0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
-	/* p5 */
+	/* _sP7_hi */
+	.long	0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
+	.long	0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
+	.long	0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
+	.long	0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
+
 	.align	64
-	.long	0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
-	.long	0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
-	.long	0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
-	.long	0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
-	.long	0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
-	.long	0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
-	.long	0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
-	.long	0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
-	/* p6 */
+	/* _sSignMask */
+	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000
+	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000
+	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000
+	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000
+
 	.align	64
+	/* _sP6_lo */
 	.long	0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
 	.long	0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
 	.long	0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
 	.long	0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
+
+	.align	64
+	/* _sP6_hi */
 	.long	0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
 	.long	0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
 	.long	0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
 	.long	0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
-	/* p7 */
+
 	.align	64
-	.long	0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
-	.long	0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
-	.long	0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
-	.long	0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
-	.long	0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
-	.long	0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
-	.long	0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
-	.long	0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
+	/* _sP5_lo */
+	.long	0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
+	.long	0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
+	.long	0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
+	.long	0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
+
 	.align	64
-	.long	0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000 /* _iExpMantMask_UISA */
+	/* _sP5_hi */
+	.long	0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
+	.long	0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
+	.long	0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
+	.long	0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
+
 	.align	64
-	.long	0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000 /* _iMinIdxOfsMask_UISA */
+	/* _sP4_lo */
+	.long	0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
+	.long	0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
+	.long	0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
+	.long	0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
+
 	.align	64
-	.long	0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000 /* _iMaxIdxMask_UISA */
+	/* _sP4_hi */
+	.long	0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
+	.long	0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
+	.long	0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
+	.long	0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
+
 	.align	64
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSignMask */
+	/* _sP3_lo */
+	.long	0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
+	.long	0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
+	.long	0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
+	.long	0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
+
 	.align	64
-	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _sAbsMask */
+	/* _sP3_hi */
+	.long	0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
+	.long	0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
+	.long	0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
+	.long	0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
+
 	.align	64
-	.long	0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000 /* _iExpMantMask */
+	/* _sP2_lo */
+	.long	0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
+	.long	0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
+	.long	0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
+	.long	0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
+
 	.align	64
-	.long	0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000 /* _iExpMask */
+	/* _sP2_hi */
+	.long	0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
+	.long	0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
+	.long	0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
+	.long	0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
+
 	.align	64
-	.long	0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000 /* _iMinIdxOfsMask */
+	/* _sP0_lo */
+	.long	0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
+	.long	0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
+	.long	0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
+	.long	0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
+
 	.align	64
-	.long	0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000 /* _iMaxIdxMask */
+	/* _sP0_hi */
+	.long	0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
+	.long	0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
+	.long	0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
+	.long	0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
+
 	.align	64
+	.type	__svml_stanh_data_internal_al64, @object
+	.size	__svml_stanh_data_internal_al64, .-__svml_stanh_data_internal_al64
 	.type	__svml_stanh_data_internal, @object
 	.size	__svml_stanh_data_internal, .-__svml_stanh_data_internal
-- 
2.34.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH v1 5/7] x86: Add data file that can be shared by tanhf-avx2 and tanhf-sse4
  2022-06-07 20:06 [PATCH v1 1/7] x86: Improve svml_s_atanhf16_core_avx512.S Noah Goldstein
                   ` (2 preceding siblings ...)
  2022-06-07 20:06 ` [PATCH v1 4/7] x86: Optimize svml_s_tanhf16_core_avx512.S Noah Goldstein
@ 2022-06-07 20:06 ` Noah Goldstein
  2022-06-07 20:06 ` [PATCH v1 6/7] x86: Optimize svml_s_tanhf8_core_avx2.S Noah Goldstein
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 48+ messages in thread
From: Noah Goldstein @ 2022-06-07 20:06 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, Andrey.Kolesov

tanhf-avx2 and tanhf-sse4 use the same data tables so we can save
over 4kb using a shared datatable. This does increase the memory
footprint of the sse4 version (as now all the targets are 32 bytes
instead of 16), but generally it seems worth the code size savings.

NB: This patch doesn't do anything by itself; it is setup for future
patches.
---
 .../fpu/multiarch/svml_s_tanhf_rodata.S       | 621 ++++++++++++++++++
 1 file changed, 621 insertions(+)
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_tanhf_rodata.S

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf_rodata.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf_rodata.S
new file mode 100644
index 0000000000..904fe5f588
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf_rodata.S
@@ -0,0 +1,621 @@
+/* Data tables for tanhf AVX2 and tanhf SSE4.
+   Copyright (C) 2021-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   https://www.gnu.org/licenses/.  */
+
+/* Offsets are ordered by use in the function. On cold-starts this
+   might help the prefetcher. If the streaming prefetchers kick in it
+   will prefetch into the lookup table.  */
+#define _iExpMantMask			0
+#define _iMinIdxOfsMask			32
+#define _iMaxIdxMask			64
+#define _sAbsMask			96
+#define _iExpMask			128
+#define _lookupTable			160
+
+#define TANHF_DATA(offset)		((offset)+__svml_stanh_data_internal_avx2)
+#ifndef ONLY_DECL_OFFSET
+	.section .rodata, "a"
+	.align	32
+
+# ifdef __svml_stanh_data_internal_typedef
+typedef unsigned int VUINT32;
+typedef struct
+	{
+	__declspec(align(32)) VUINT32 _iExpMantMask[8][1];
+	__declspec(align(32)) VUINT32 _iMinIdxOfsMask[8][1];
+	__declspec(align(32)) VUINT32 _iMaxIdxMask[8][1];
+	__declspec(align(32)) VUINT32 _sAbsMask[8][1];
+	__declspec(align(32)) VUINT32 _iExpMask[8][1];
+	__declspec(align(32)) VUINT32 _lookupTable[(134*4)][2];
+} __svml_stanh_data_internal;
+# endif
+
+
+__svml_stanh_data_internal:
+	.globl	__svml_stanh_data_internal_avx2
+__svml_stanh_data_internal_avx2:
+	.align	32
+	/* _iExpMantMask.  */
+	.long	0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000
+	.long	0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000
+
+	.align	32
+	/* _iMinIdxOfsMask.  */
+	.long	0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000
+	.long	0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000
+
+	.align	32
+	/* _iMaxIdxMask.  */
+	.long	0x04280000, 0x04280000, 0x04280000, 0x04280000
+	.long	0x04280000, 0x04280000, 0x04280000, 0x04280000
+
+	.align	32
+	/* _sAbsMask.  */
+	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+
+	.align	32
+	/* _iExpMask.  */
+	.long	0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000
+	.long	0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000
+
+	.align	32
+	/* _lookupTable.  */
+	/* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500].  */
+	.quad	0x0000000000000000 /* A00 = +0.000000000000000000000e-01.  */
+	.quad	0x3FF00000022C70EB /* A01 = +1.000000008097283510367e+00.  */
+	.quad	0xBED00E878CFFA194 /* A02 = -3.828228912518614443549e-06.  */
+	.quad	0xBFD551766D0607A9 /* A03 = -3.330970825846813476723e-01.  */
+	.quad	0xBE53D60CE3E4C297 /* A00 = -1.847383956330407336230e-08.  */
+	.quad	0x3FF000024177CF5C /* A01 = +1.000002151235967140508e+00.  */
+	.quad	0xBF1758BC94A51A25 /* A02 = -8.906031613262943753568e-05.  */
+	.quad	0xBFD53EAE67E0D4F0 /* A03 = -3.319507612644221339337e-01.  */
+	.quad	0xBE5A9E47EF32D6FE /* A00 = -2.479020984039698285657e-08.  */
+	.quad	0x3FF00002DA983057 /* A01 = +1.000002721676556793895e+00.  */
+	.quad	0xBF1BD953509E94AA /* A02 = -1.062352277175377670507e-04.  */
+	.quad	0xBFD53BDB562EEDD5 /* A03 = -3.317783681520414806876e-01.  */
+	.quad	0xBE6191BBE496D294 /* A00 = -3.272532162914017685901e-08.  */
+	.quad	0x3FF0000390492017 /* A01 = +1.000003398528866105366e+00.  */
+	.quad	0xBF20727E814A57CE /* A02 = -1.254825043772153972919e-04.  */
+	.quad	0xBFD538DE060A6F22 /* A03 = -3.315959033004550748913e-01.  */
+	.quad	0xBE66DAFA2A893A25 /* A00 = -4.257146219278012568149e-08.  */
+	.quad	0x3FF0000465E08CD1 /* A01 = +1.000004194219219266770e+00.  */
+	.quad	0xBF2341C765EF91B6 /* A02 = -1.469188600530365522261e-04.  */
+	.quad	0xBFD535B6841FAF9E /* A03 = -3.314033785124993469751e-01.  */
+	.quad	0xBE6D5794E361E964 /* A00 = -5.465394929765249413434e-08.  */
+	.quad	0x3FF000055EE2A0CB /* A01 = +1.000005121846742950353e+00.  */
+	.quad	0xBF265E6C77E66C8B /* A02 = -1.706607253709506650304e-04.  */
+	.quad	0xBFD53264DDCCEDA6 /* A03 = -3.312008062382240103361e-01.  */
+	.quad	0xBE729C844D374A6E /* A00 = -6.933284462462096107184e-08.  */
+	.quad	0x3FF000067F019093 /* A01 = +1.000006195180536350264e+00.  */
+	.quad	0xBF29CC5348D6DCE5 /* A02 = -1.968242326435338705130e-04.  */
+	.quad	0xBFD52EE92121ED35 /* A03 = -3.309881995734998416658e-01.  */
+	.quad	0xBE775AEA17EAA872 /* A00 = -8.700465590574974405858e-08.  */
+	.quad	0x3FF00007CA1D66B8 /* A01 = +1.000007428656699559610e+00.  */
+	.quad	0xBF2D8F5EB98A2637 /* A02 = -2.255252009216044881395e-04.  */
+	.quad	0xBFD52B435CDF9128 /* A03 = -3.307655722585587376727e-01.  */
+	.quad	0xBE7D04DA28C343F0 /* A00 = -1.081040272327705484794e-07.  */
+	.quad	0x3FF000094443CCF5 /* A01 = +1.000008837375216730337e+00.  */
+	.quad	0xBF30D5B76C947AE5 /* A02 = -2.568791210978817814332e-04.  */
+	.quad	0xBFD52773A0776FAD /* A03 = -3.305329386764651045105e-01.  */
+	.quad	0xBE81DD77A12C51C7 /* A00 = -1.331054169875768625701e-07.  */
+	.quad	0x3FF0000AF1AFD2DA /* A01 = +1.000010437096696680470e+00.  */
+	.quad	0xBF331230624C1680 /* A02 = -2.910011410651516805537e-04.  */
+	.quad	0xBFD52379FC0B61DF /* A03 = -3.302903138515186909352e-01.  */
+	.quad	0xBE85D04EEEB3C435 /* A00 = -1.625247628488202841012e-07.  */
+	.quad	0x3FF0000CD6C9B1F2 /* A01 = +1.000012244238970726684e+00.  */
+	.quad	0xBF357F0742FADDD4 /* A02 = -3.280060509313874068243e-04.  */
+	.quad	0xBFD51F56806D0E81 /* A03 = -3.300377134475880880338e-01.  */
+	.quad	0xBE8A6E289B59681B /* A00 = -1.969211333326924655065e-07.  */
+	.quad	0x3FF0000EF8268F72 /* A01 = +1.000014275873550406715e+00.  */
+	.quad	0xBF381E277A1B747A /* A02 = -3.680082682942575423093e-04.  */
+	.quad	0xBFD51B093F1D6FD4 /* A03 = -3.297751537663746734808e-01.  */
+	.quad	0xBE8FCBC40EE9ABD5 /* A00 = -2.368983653301529373887e-07.  */
+	.quad	0x3FF000115A883B6C /* A01 = +1.000016549721943981410e+00.  */
+	.quad	0xBF3AF17AC974B3D9 /* A02 = -4.111218235774406434303e-04.  */
+	.quad	0xBFD516924A4C549C /* A03 = -3.295026517456081105450e-01.  */
+	.quad	0xBE92FFBC60A3F956 /* A00 = -2.831066871072026054144e-07.  */
+	.quad	0x3FF0001402DCED8A /* A01 = +1.000019084151832604590e+00.  */
+	.quad	0xBF3DFAE9390C4801 /* A02 = -4.574603454311488280083e-04.  */
+	.quad	0xBFD511F1B4D7DC3A /* A03 = -3.292202249571719585575e-01.  */
+	.quad	0xBE9690A22F96D5AD /* A00 = -3.362443262393081632612e-07.  */
+	.quad	0x3FF00016F63EFF5D /* A01 = +1.000021898173108825247e+00.  */
+	.quad	0xBF409E2C839605BB /* A02 = -5.071370461992499986334e-04.  */
+	.quad	0xBFD50D27924BEE00 /* A03 = -3.289278916051614487515e-01.  */
+	.quad	0xBE9AA56C65E72A73 /* A00 = -3.970591019557469835586e-07.  */
+	.quad	0x3FF0001A39F4A43E /* A01 = +1.000025011433776978009e+00.  */
+	.quad	0xBF425BD74C3D6667 /* A02 = -5.602647074553602319844e-04.  */
+	.quad	0xBFD50833F6E1ABA2 /* A03 = -3.286256705238718156536e-01.  */
+	.quad	0xBE9F4BD4FF1A83B0 /* A00 = -4.663500013744687071912e-07.  */
+	.quad	0x3FF0001DD36F9EC2 /* A01 = +1.000028444215715683896e+00.  */
+	.quad	0xBF44376634149405 /* A02 = -6.169556656102642569831e-04.  */
+	.quad	0xBFD50316F77EDEE5 /* A03 = -3.283135811757190158922e-01.  */
+	.quad	0xBEA3B625387BB079 /* A00 = -5.874486399249461304297e-07.  */
+	.quad	0x3FF00023E14CFBA9 /* A01 = +1.000034217911642153709e+00.  */
+	.quad	0xBF47392F923218D2 /* A02 = -7.087213783883111826306e-04.  */
+	.quad	0xBFD4FB1FACDEB938 /* A03 = -3.278273761924483942209e-01.  */
+	.quad	0xBEAA6E24F543500A /* A00 = -7.876828740601738750574e-07.  */
+	.quad	0x3FF0002D5C6E8412 /* A01 = +1.000043259679163742959e+00.  */
+	.quad	0xBF4BAF02BD7FDD70 /* A02 = -8.448375110664940040861e-04.  */
+	.quad	0xBFD4EFEE6527A7DE /* A03 = -3.271442401734229177279e-01.  */
+	.quad	0xBEB16E3EBE2157D0 /* A00 = -1.038947396133402500647e-06.  */
+	.quad	0x3FF00038990FEE2F /* A01 = +1.000053975962952312884e+00.  */
+	.quad	0xBF50569481C574CB /* A02 = -9.972048056490652716971e-04.  */
+	.quad	0xBFD4E419278DA2B4 /* A03 = -3.264220129263251113372e-01.  */
+	.quad	0xBEB6A7B6723165D4 /* A00 = -1.350350836279403750524e-06.  */
+	.quad	0x3FF00045CAB4158E /* A01 = +1.000066558657042303793e+00.  */
+	.quad	0xBF531D7C9C849108 /* A02 = -1.166698160951775212202e-03.  */
+	.quad	0xBFD4D7A0BB33B152 /* A03 = -3.256608799117844954552e-01.  */
+	.quad	0xBEBD0EE2A8654AFD /* A00 = -1.732000471561702711532e-06.  */
+	.quad	0x3FF00055276F18D6 /* A01 = +1.000081209219890521211e+00.  */
+	.quad	0xBF562FDBA3FB6C6C /* A02 = -1.354183666925102939860e-03.  */
+	.quad	0xBFD4CA85F1B93DB2 /* A03 = -3.248610363561638125773e-01.  */
+	.quad	0xBEC269D4036A207E /* A00 = -2.195047297096822741730e-06.  */
+	.quad	0x3FF00066E7DA6E4E /* A01 = +1.000098138500919997540e+00.  */
+	.quad	0xBF5991499FC36B3A /* A02 = -1.560518167983372759405e-03.  */
+	.quad	0xBFD4BCC9A72283D6 /* A03 = -3.240226871658341556426e-01.  */
+	.quad	0xBEC7154B6C09CFE1 /* A00 = -2.751729738565190291276e-06.  */
+	.quad	0x3FF0007B47086B80 /* A01 = +1.000117566559055148900e+00.  */
+	.quad	0xBF5D455433B4F8F4 /* A02 = -1.786548832412968197680e-03.  */
+	.quad	0xBFD4AE6CC1BFE145 /* A03 = -3.231460468373550942722e-01.  */
+	.quad	0xBECCA68CC64A0F8A /* A00 = -3.415415948561670285790e-06.  */
+	.quad	0x3FF00092827742F7 /* A01 = +1.000139722473418535387e+00.  */
+	.quad	0xBF60A7BF15A527AF /* A02 = -2.033112728132522705610e-03.  */
+	.quad	0xBFD49F703214084C /* A03 = -3.222313393636155876010e-01.  */
+	.quad	0xBED19E68676B241B /* A00 = -4.200644630977303616698e-06.  */
+	.quad	0x3FF000ACDA037B26 /* A01 = +1.000164844146362863597e+00.  */
+	.quad	0xBF62D99F836A02F8 /* A02 = -2.301036405072284102280e-03.  */
+	.quad	0xBFD48FD4F2B91B28 /* A03 = -3.212787981359945810311e-01.  */
+	.quad	0xBED57CF4B0C7AA54 /* A00 = -5.123164339408145209103e-06.  */
+	.quad	0x3FF000CA8FD9E1A1 /* A01 = +1.000193178099017865534e+00.  */
+	.quad	0xBF653A014548E686 /* A02 = -2.591135484433962181405e-03.  */
+	.quad	0xBFD47F9C0844B38F /* A03 = -3.202886658426046806447e-01.  */
+	.quad	0xBEDA012B1B1A41E2 /* A00 = -6.199971197454598722328e-06.  */
+	.quad	0x3FF000EBE868FDF4 /* A01 = +1.000224979259539459520e+00.  */
+	.quad	0xBF67CA9427E0A544 /* A02 = -2.904214255086275467410e-03.  */
+	.quad	0xBFD46EC6812ADB37 /* A03 = -3.192611943626845749655e-01.  */
+	.quad	0xBEDF3EAC5BF12194 /* A00 = -7.449344990702664567927e-06.  */
+	.quad	0x3FF001112A520784 /* A01 = +1.000260510744255704196e+00.  */
+	.quad	0xBF6A8D01ABDA4DC4 /* A02 = -3.241065277345108255891e-03.  */
+	.quad	0xBFD45D55759FFA4A /* A03 = -3.181966446572103146551e-01.  */
+	.quad	0xBEE2A541BC274267 /* A00 = -8.890883582164319970972e-06.  */
+	.quad	0x3FF0013A9E5961F2 /* A01 = +1.000300043631906721231e+00.  */
+	.quad	0xBF6D82ECD080C540 /* A02 = -3.602468994380686462264e-03.  */
+	.quad	0xBFD44B4A0779C0AD /* A03 = -3.170952866557950611259e-01.  */
+	.quad	0xBEE61D97609A27F4 /* A00 = -1.054553560499505625520e-05.  */
+	.quad	0x3FF001688F56A3AF /* A01 = +1.000343856731187974773e+00.  */
+	.quad	0xBF7056F8EFB683EC /* A02 = -3.989193351487490407647e-03.  */
+	.quad	0xBFD438A5620F0F74 /* A03 = -3.159573991399533543500e-01.  */
+	.quad	0xBEEA145429EDD370 /* A00 = -1.243563138839952927732e-05.  */
+	.quad	0x3FF0019B4A242A67 /* A01 = +1.000392236341804297339e+00.  */
+	.quad	0xBF7207D31CA78D9B /* A02 = -4.401993423445739288258e-03.  */
+	.quad	0xBFD42568BA16E7CD /* A03 = -3.147832696228050619602e-01.  */
+	.quad	0xBEEE96370D52680F /* A00 = -1.458491207477835326165e-05.  */
+	.quad	0x3FF001D31D8E4115 /* A01 = +1.000445476009251821736e+00.  */
+	.quad	0xBF73D4CC11EDC094 /* A02 = -4.841611050196221316400e-03.  */
+	.quad	0xBFD411954D8664E7 /* A03 = -3.135731942252974469021e-01.  */
+	.quad	0xBEF338C046215EF8 /* A00 = -1.833122622260562810219e-05.  */
+	.quad	0x3FF00230C32C2EC1 /* A01 = +1.000534784691737621998e+00.  */
+	.quad	0xBF76BD019BCC5DAF /* A02 = -5.551344188254799492943e-03.  */
+	.quad	0xBFD3F2C7156DC21E /* A03 = -3.116929730668135389848e-01.  */
+	.quad	0xBEF9B15EAE411EAE /* A00 = -2.450261207822986676092e-05.  */
+	.quad	0x3FF002C2DF057A4D /* A01 = +1.000674124886830940184e+00.  */
+	.quad	0xBF7B08CCD9AC1E30 /* A02 = -6.600189396301511801646e-03.  */
+	.quad	0xBFD3C7A7A114FED8 /* A03 = -3.090609620157755976777e-01.  */
+	.quad	0xBF00E36483C373B3 /* A00 = -3.221178528332122595812e-05.  */
+	.quad	0x3FF0036F419480D7 /* A01 = +1.000838524028997644777e+00.  */
+	.quad	0xBF7FD255D1777007 /* A02 = -7.768950679260206403087e-03.  */
+	.quad	0xBFD39A453911D6CE /* A03 = -3.062909180947429588215e-01.  */
+	.quad	0xBF05DFA04DD12059 /* A00 = -4.172046622180685472624e-05.  */
+	.quad	0x3FF00438B2A03D8D /* A01 = +1.001030633695197069599e+00.  */
+	.quad	0xBF828F8DBB4A9D10 /* A02 = -9.062869337255224921890e-03.  */
+	.quad	0xBFD36AAB704697D9 /* A03 = -3.033856007044711255993e-01.  */
+	.quad	0xBF0BF3E0C647DEFB /* A00 = -5.331544597092331081714e-05.  */
+	.quad	0x3FF005221063D36D /* A01 = +1.001253189109060359741e+00.  */
+	.quad	0xBF857A2CB3C96102 /* A02 = -1.048693584122917590862e-02.  */
+	.quad	0xBFD338E65BBB4FEC /* A03 = -3.003478904549854444639e-01.  */
+	.quad	0xBF11A506ED7C9D31 /* A00 = -6.730894835681591541979e-05.  */
+	.quad	0x3FF0062E4D0EA92A /* A01 = +1.001508999829250345925e+00.  */
+	.quad	0xBF88AB82C2761AF3 /* A02 = -1.204588085125866091241e-02.  */
+	.quad	0xBFD305028D6BD206 /* A03 = -2.971807843271395688234e-01.  */
+	.quad	0xBF1607C0922D9BF1 /* A00 = -8.403885708006799337092e-05.  */
+	.quad	0x3FF007606C341961 /* A01 = +1.001800940198869449560e+00.  */
+	.quad	0xBF8C25E6DA487BCF /* A02 = -1.374416688582682892494e-02.  */
+	.quad	0xBFD2CF0D0EE8F7B5 /* A03 = -2.938873906713255768075e-01.  */
+	.quad	0xBF1B3A8480A0A16D /* A00 = -1.038688061788578038307e-04.  */
+	.quad	0x3FF008BB802D02D6 /* A01 = +1.002131939589323561535e+00.  */
+	.quad	0xBF8FEB8AE99FD100 /* A02 = -1.558598065819483124983e-02.  */
+	.quad	0xBFD297135BD0911B /* A03 = -2.904709240558688843059e-01.  */
+	.quad	0xBF20ABB9BDB75C65 /* A00 = -1.271881327357976163798e-04.  */
+	.quad	0x3FF00A42A76D8CD1 /* A01 = +1.002504972472525901495e+00.  */
+	.quad	0xBF91FF3D752BB9E6 /* A02 = -1.757522609380570560722e-02.  */
+	.quad	0xBFD25D235C1F88B4 /* A03 = -2.869346999779154305799e-01.  */
+	.quad	0xBF243D3254425461 /* A00 = -1.544116913733432829448e-04.  */
+	.quad	0x3FF00BF909D1795E /* A01 = +1.002923048355647051011e+00.  */
+	.quad	0xBF94304E04D44942 /* A02 = -1.971551804042204897316e-02.  */
+	.quad	0xBFD2214B5E61CFA6 /* A03 = -2.832821294498394371075e-01.  */
+	.quad	0xBF286070011B61CE /* A00 = -1.859795307186510085994e-04.  */
+	.quad	0x3FF00DE1D5E1627E /* A01 = +1.003389201612804537689e+00.  */
+	.quad	0xBF9689D5F4163F59 /* A02 = -2.201017668045266231780e-02.  */
+	.quad	0xBFD1E39A11C3B42C /* A03 = -2.795167134743816728104e-01.  */
+	.quad	0xBF2D250B366A79E8 /* A00 = -2.223564326486314902259e-04.  */
+	.quad	0x3FF010003E134001 /* A01 = +1.003906481248123094829e+00.  */
+	.quad	0xBF990C9FF91F6F81 /* A02 = -2.446222265267250853271e-02.  */
+	.quad	0xBFD1A41E80084CDC /* A03 = -2.756420374218586655246e-01.  */
+	.quad	0xBF314DB5DDC2A30E /* A00 = -2.640313157465248123865e-04.  */
+	.quad	0x3FF012577608921B /* A01 = +1.004477940624503018441e+00.  */
+	.quad	0xBF9BB9626875B0C9 /* A02 = -2.707437288829409385849e-02.  */
+	.quad	0xBFD162E80768A9D0 /* A03 = -2.716617653228725615122e-01.  */
+	.quad	0xBF346A6133808864 /* A00 = -3.115165050094957730625e-04.  */
+	.quad	0x3FF014EAAFCC88A3 /* A01 = +1.005106627192198898157e+00.  */
+	.quad	0xBF9E90BEF9BF7419 /* A02 = -2.984903716411588595059e-02.  */
+	.quad	0xBFD12006545F7FAD /* A03 = -2.675796340899932457269e-01.  */
+	.quad	0xBF37F180DC3848EA /* A00 = -3.653468704395550778821e-04.  */
+	.quad	0x3FF017BD19147861 /* A01 = +1.005795572250939295955e+00.  */
+	.quad	0xBFA0C9A14C702E07 /* A02 = -3.278831537326359207851e-02.  */
+	.quad	0xBFD0DB895B650092 /* A03 = -2.633994476818851682154e-01.  */
+	.quad	0xBF3BEC6AAC6D7635 /* A00 = -4.260788377246944457107e-04.  */
+	.quad	0x3FF01AD1D884E719 /* A01 = +1.006547780778822565040e+00.  */
+	.quad	0xBFA260B2A1B1434A /* A02 = -3.589399551186163439542e-02.  */
+	.quad	0xBFD09581529E93D6 /* A03 = -2.591250712233067465817e-01.  */
+	.quad	0xBF4164E26167882B /* A00 = -5.308251737086202562063e-04.  */
+	.quad	0x3FF01FEF14B62B81 /* A01 = +1.007796364693348545316e+00.  */
+	.quad	0xBFA4EB014538AA42 /* A02 = -4.085544557559163403315e-02.  */
+	.quad	0xBFD029D36FEAF41F /* A03 = -2.525528519580024222613e-01.  */
+	.quad	0xBF46F6FFF4E53DC8 /* A00 = -7.008313930700277652464e-04.  */
+	.quad	0x3FF027CBB51CBBA0 /* A01 = +1.009715754956893363214e+00.  */
+	.quad	0xBFA89DEC9FEC112E /* A02 = -4.807986690687680864098e-02.  */
+	.quad	0xBFCF2A99464D0DB4 /* A03 = -2.434875100390009317053e-01.  */
+	.quad	0xBF4DCC9C4F66A4D9 /* A00 = -9.094012482836712945103e-04.  */
+	.quad	0x3FF030E7CFCCD583 /* A01 = +1.011939822882909068014e+00.  */
+	.quad	0xBFACAA3B95814081 /* A02 = -5.598627281199331645611e-02.  */
+	.quad	0xBFCDF78F156BE7CF /* A03 = -2.341173987004467604844e-01.  */
+	.quad	0xBF5308ED74E5C7A6 /* A00 = -1.161796466103906435435e-03.  */
+	.quad	0x3FF03B5986412ECB /* A01 = +1.014489674026594512313e+00.  */
+	.quad	0xBFB087EBA88DCC3F /* A02 = -6.457398285947223148806e-02.  */
+	.quad	0xBFCCBB9BD134862F /* A03 = -2.244753619680052991736e-01.  */
+	.quad	0xBF57FA23C00DF4B5 /* A00 = -1.463446533505758208674e-03.  */
+	.quad	0x3FF0473558A1BCC0 /* A01 = +1.017384859292903342975e+00.  */
+	.quad	0xBFB2E702BC6360EF /* A02 = -7.383744334527241048871e-02.  */
+	.quad	0xBFCB77D546379288 /* A03 = -2.145945160729250122955e-01.  */
+	.quad	0xBF5DD12971557F71 /* A00 = -1.819887610814388068450e-03.  */
+	.quad	0x3FF0548DDF5000A8 /* A01 = +1.020643112482540360020e+00.  */
+	.quad	0xBFB571B63DA186E1 /* A02 = -8.376635555898871710045e-02.  */
+	.quad	0xBFCA2D5202605148 /* A03 = -2.045080672838912594358e-01.  */
+	.quad	0xBF6252B1AD5D4F17 /* A00 = -2.236697221556737096709e-03.  */
+	.quad	0x3FF063738A910BF7 /* A01 = +1.024280110622155737232e+00.  */
+	.quad	0xBFB8270C8E6B601B /* A02 = -9.434584118878357184013e-02.  */
+	.quad	0xBFC8DD27D950A07E /* A03 = -1.942491351230763441116e-01.  */
+	.quad	0xBF66470C91730CFC /* A00 = -2.719425723258004842786e-03.  */
+	.quad	0x3FF073F468FCF331 /* A01 = +1.028309259519300633556e+00.  */
+	.quad	0xBFBB05C2952191E4 /* A02 = -1.055566419686964629854e-01.  */
+	.quad	0xBFC7886A770DE2BD /* A03 = -1.838505822486435070662e-01.  */
+	.quad	0xBF6AD114AC8E98EC /* A00 = -3.273525599485007861467e-03.  */
+	.quad	0x3FF0861BF53E5226 /* A01 = +1.032741506559554434119e+00.  */
+	.quad	0xBFBE0C4F9B461507 /* A02 = -1.173753503881763554650e-01.  */
+	.quad	0xBFC6302A037CDE3A /* A03 = -1.733448521642786954722e-01.  */
+	.quad	0xBF6FFBDE2A6C2AF8 /* A00 = -3.904279630096648551207e-03.  */
+	.quad	0x3FF099F2EB8E7DA3 /* A01 = +1.037585182326304034106e+00.  */
+	.quad	0xBFC09C74D192DDF0 /* A02 = -1.297746680554463516444e-01.  */
+	.quad	0xBFC4D571D8E3079F /* A03 = -1.627638157861470424859e-01.  */
+	.quad	0xBF72E8FDC0B952AA /* A00 = -4.616728994353872309042e-03.  */
+	.quad	0x3FF0AF7F273C9533 /* A01 = +1.042845872181101141152e+00.  */
+	.quad	0xBFC244C512736F10 /* A02 = -1.427236881344176033792e-01.  */
+	.quad	0xBFC379474F58B902 /* A03 = -1.521386277613104298645e-01.  */
+	.quad	0xBF762EABAF17395B /* A00 = -5.415602341101023557701e-03.  */
+	.quad	0x3FF0C6C3886F63FB /* A01 = +1.048526318502125631582e+00.  */
+	.quad	0xBFC3FDF9918EA12A /* A02 = -1.561881981590514389957e-01.  */
+	.quad	0xBFC21CA89ECAB895 /* A03 = -1.414995932913753196036e-01.  */
+	.quad	0xBF79D387CE5B2BAE /* A00 = -6.305246822828998107258e-03.  */
+	.quad	0x3FF0DFBFE2346376 /* A01 = +1.054626353847394337748e+00.  */
+	.quad	0xBFC5C6DA43602620 /* A02 = -1.701309994680721970894e-01.  */
+	.quad	0xBFC0C08BD8DB6631 /* A03 = -1.308760460731704100557e-01.  */
+	.quad	0xBF7DDBA8E8DA9060 /* A00 = -7.289562037531366334164e-03.  */
+	.quad	0x3FF0FA70F0D1B464 /* A01 = +1.061142864894713433443e+00.  */
+	.quad	0xBFC79E18D92BAA7C /* A02 = -1.845122394946264732241e-01.  */
+	.quad	0xBFBECBBBF74C2669 /* A03 = -1.202962378266875381749e-01.  */
+	.quad	0xBF81254E76EA25DA /* A00 = -8.371937755572145950511e-03.  */
+	.quad	0x3FF116D05835EBD0 /* A01 = +1.068069786618014660462e+00.  */
+	.quad	0xBFC982539E2ED224 /* A02 = -1.992897531869327609755e-01.  */
+	.quad	0xBFBC1B043C350159 /* A03 = -1.097872397413132278254e-01.  */
+	.quad	0xBF8391ACBA863403 /* A00 = -9.555196230190082448686e-03.  */
+	.quad	0x3FF134D4AA477FE2 /* A01 = +1.075398125794884141015e+00.  */
+	.quad	0xBFCB7218609FEAFB /* A02 = -2.144194099235717521079e-01.  */
+	.quad	0xBFB970A16CB88329 /* A03 = -9.937485603633135211599e-02.  */
+	.quad	0xBF87935088E48E8B /* A00 = -1.151144902957603431692e-02.  */
+	.quad	0x3FF1649892AD7DD3 /* A01 = +1.087059567413110938716e+00.  */
+	.quad	0xBFCE6971DDE75409 /* A02 = -2.375929196847723912089e-01.  */
+	.quad	0xBFB58291E88CB251 /* A03 = -8.402358939628952472223e-02.  */
+	.quad	0xBF8DB3A62C325325 /* A00 = -1.450280973794233242702e-02.  */
+	.quad	0x3FF1A9C900C6DEEA /* A01 = +1.103951457056548068891e+00.  */
+	.quad	0xBFD13DBC65B0E08E /* A02 = -2.693930619311765140012e-01.  */
+	.quad	0xBFB06696F62696D1 /* A03 = -6.406539449252625362252e-02.  */
+	.quad	0xBF92583699F2E27A /* A00 = -1.791463198307716858659e-02.  */
+	.quad	0x3FF1F451B85AA9F0 /* A01 = +1.122148246892376022288e+00.  */
+	.quad	0xBFD34FD5F8288180 /* A02 = -3.017477916164565954205e-01.  */
+	.quad	0xBFA6FB692825B683 /* A03 = -4.488686194495718900788e-02.  */
+	.quad	0xBF9641C26E673D6F /* A00 = -2.173522757385398448959e-02.  */
+	.quad	0x3FF24364DA5E2B07 /* A01 = +1.141453602790251542487e+00.  */
+	.quad	0xBFD564A5A5EF5890 /* A02 = -3.342680092295120530821e-01.  */
+	.quad	0xBF9B43712011A982 /* A03 = -2.662445791467283467968e-02.  */
+	.quad	0xBF9A901038EC2F39 /* A00 = -2.594018313816024226548e-02.  */
+	.quad	0x3FF2961356DFFEBA /* A01 = +1.161639537196534011088e+00.  */
+	.quad	0xBFD775EBB17198C7 /* A02 = -3.665723069046972759644e-01.  */
+	.quad	0xBF833B1A926CD462 /* A03 = -9.390075295963199591975e-03.  */
+	.quad	0xBF9F396A6A461B91 /* A00 = -3.049246095317987084727e-02.  */
+	.quad	0x3FF2EB53BAEF534B /* A01 = +1.182452898229899629357e+00.  */
+	.quad	0xBFD97DABF8AD8BBD /* A02 = -3.982953957076310058660e-01.  */
+	.quad	0x3F7B8F6A3E0F8837 /* A03 = +6.728568086119371925713e-03.  */
+	.quad	0xBFA21878590F8BAA /* A00 = -3.534294211546946951064e-02.  */
+	.quad	0x3FF34209790236E1 /* A01 = +1.203622315111197105253e+00.  */
+	.quad	0xBFDB764C0E71BECB /* A02 = -4.290952817018306997277e-01.  */
+	.quad	0x3F962FE0C03F84C0 /* A03 = +2.166701482190513949888e-02.  */
+	.quad	0xBFA4B36B9AD27ECC /* A00 = -4.043136849327097492868e-02.  */
+	.quad	0x3FF3990C5B12FC16 /* A01 = +1.224865298994477935679e+00.  */
+	.quad	0xBFDD5AABB0D01390 /* A02 = -4.586590983092770912322e-01.  */
+	.quad	0x3FA21DAF5CA162DB /* A03 = +3.538272863142363083844e-02.  */
+	.quad	0xBFA7645E4D7BF28B /* A00 = -4.568762489177399105378e-02.  */
+	.quad	0x3FF3EF2FD51C0D9F /* A01 = +1.245895225962932562069e+00.  */
+	.quad	0xBFDF26377E1B686E /* A02 = -4.867075664057044503963e-01.  */
+	.quad	0x3FA8803E756EE812 /* A03 = +4.785342391501513914509e-02.  */
+	.quad	0xBFAA210925C64413 /* A00 = -5.103329263796054643398e-02.  */
+	.quad	0x3FF44349F897D8E7 /* A01 = +1.266427966181760345066e+00.  */
+	.quad	0xBFE06A7B02C6D8E2 /* A02 = -5.129981092675530707226e-01.  */
+	.quad	0x3FAE3F194734F5D0 /* A03 = +5.907515520309980505687e-02.  */
+	.quad	0xBFACDE48F8A19BBB /* A00 = -5.638340029764018351832e-02.  */
+	.quad	0x3FF49439D5466582 /* A01 = +1.286187966447272845727e+00.  */
+	.quad	0xBFE131C7C1063DDC /* A02 = -5.373266954429101183166e-01.  */
+	.quad	0x3FB1ADEEC36AD805 /* A03 = +6.906025191241844940482e-02.  */
+	.quad	0xBFAF905D8F585680 /* A00 = -6.164829611604449866036e-02.  */
+	.quad	0x3FF4E0ED1FD27F99 /* A01 = +1.304913639360142818546e+00.  */
+	.quad	0xBFE1E7A859DC1D3D /* A02 = -5.595285182070380836095e-01.  */
+	.quad	0x3FB3ED018E4642A1 /* A03 = +7.783517573831001679086e-02.  */
+	.quad	0xBFB11595104160BA /* A00 = -6.673556944713512906198e-02.  */
+	.quad	0x3FF528650340490B /* A01 = +1.322361958217302513319e+00.  */
+	.quad	0xBFE28B14B40BC974 /* A02 = -5.794776455425521000109e-01.  */
+	.quad	0x3FB5DF49F5BAF6D7 /* A03 = +8.543836831355676453281e-02.  */
+	.quad	0xBFB2513A97344BA4 /* A00 = -7.155195418844911836587e-02.  */
+	.quad	0x3FF569BA0DB5EE14 /* A01 = +1.338312200124055273420e+00.  */
+	.quad	0xBFE31B53A8B67B20 /* A02 = -5.970857901737396389308e-01.  */
+	.quad	0x3FB787F297BB0544 /* A03 = +9.191814617499455275507e-02.  */
+	.quad	0xBFB37512E848FAFA /* A00 = -7.600515528700305112331e-02.  */
+	.quad	0x3FF5A41F33B403C8 /* A01 = +1.352568819013173495591e+00.  */
+	.quad	0xBFE397F6EA9A58A5 /* A02 = -6.123003561103997904880e-01.  */
+	.quad	0x3FB8EAA9FF25CA06 /* A03 = +9.733068923177520814782e-02.  */
+	.quad	0xBFB47B3E603AFC5D /* A00 = -8.000554894805263217439e-02.  */
+	.quad	0x3FF5D6E3EDE40487 /* A01 = +1.364963464031718975988e+00.  */
+	.quad	0xBFE400D5BCA6D631 /* A02 = -6.251019177058819709103e-01.  */
+	.quad	0x3FBA0B830ED567FE /* A03 = +1.017381583418739132707e-01.  */
+	.quad	0xBFB5BBFE8AC90496 /* A00 = -8.489981544791400103200e-02.  */
+	.quad	0x3FF612BA70107E95 /* A01 = +1.379572332145390989311e+00.  */
+	.quad	0xBFE477EAF1FA7693 /* A02 = -6.396383978023599814478e-01.  */
+	.quad	0x3FBB4784B7C08A95 /* A03 = +1.065600346196709652391e-01.  */
+	.quad	0xBFB6D5D940743939 /* A00 = -8.920057128509463473254e-02.  */
+	.quad	0x3FF644A8748F70CE /* A01 = +1.391762214006166953340e+00.  */
+	.quad	0xBFE4D646AB07EA37 /* A02 = -6.511567440459832267763e-01.  */
+	.quad	0x3FBC354F4E1D5292 /* A03 = +1.101884427747086558913e-01.  */
+	.quad	0xBFB7223D19E4F3D1 /* A00 = -9.036619074045339206069e-02.  */
+	.quad	0x3FF6518FEB42B7FA /* A01 = +1.394912642466350494175e+00.  */
+	.quad	0xBFE4ED86CB87498C /* A02 = -6.539949393430091184598e-01.  */
+	.quad	0x3FBC6D29F28CCA9B /* A03 = +1.110407082713131127205e-01.  */
+	.quad	0xBFB6878652FF6312 /* A00 = -8.800544287022329936754e-02.  */
+	.quad	0x3FF63948C302D040 /* A01 = +1.388985406648330922508e+00.  */
+	.quad	0xBFE4C4E2E7904E17 /* A02 = -6.490339777687407218920e-01.  */
+	.quad	0x3FBC127356CA1ABE /* A03 = +1.096565329445224612481e-01.  */
+	.quad	0xBFB4F5D18B0C91D6 /* A00 = -8.187589306596207427980e-02.  */
+	.quad	0x3FF5FD27EB7DD0B8 /* A01 = +1.374305648697413673176e+00.  */
+	.quad	0xBFE464E01A2B2FC6 /* A02 = -6.373138915164353601739e-01.  */
+	.quad	0x3FBB460547674A30 /* A03 = +1.065371798825160976065e-01.  */
+	.quad	0xBFB26642FA16A685 /* A00 = -7.187288861919156890412e-02.  */
+	.quad	0x3FF59F9BEDE1C95A /* A01 = +1.351467065073470141812e+00.  */
+	.quad	0xBFE3D67920C8FBEA /* A02 = -6.199308052381387046381e-01.  */
+	.quad	0x3FBA24F6A8D3CBC1 /* A03 = +1.021265184570401413078e-01.  */
+	.quad	0xBFADB5294794F097 /* A00 = -5.802277563859197656582e-02.  */
+	.quad	0x3FF523EA7B9CF453 /* A01 = +1.321268542159732772845e+00.  */
+	.quad	0xBFE322A8B55E35DB /* A02 = -5.979808370918208160205e-01.  */
+	.quad	0x3FB8C8673B1B3E37 /* A03 = +9.680791085269722928697e-02.  */
+	.quad	0xBFA4B7D661965C6A /* A00 = -4.046506825687219699450e-02.  */
+	.quad	0x3FF48DE3E2CE3122 /* A01 = +1.284641157110919085227e+00.  */
+	.quad	0xBFE251FED1A7F445 /* A02 = -5.725092024655472622285e-01.  */
+	.quad	0x3FB745699FCABDB9 /* A03 = +9.090290213747821701507e-02.  */
+	.quad	0xBF93E60456E4EE1D /* A00 = -1.943213253365004902773e-02.  */
+	.quad	0x3FF3E1A14E628A59 /* A01 = +1.242585474196536532432e+00.  */
+	.quad	0xBFE16C5AB660E876 /* A02 = -5.444768488007543094653e-01.  */
+	.quad	0x3FB5AD33AA8C188F /* A03 = +8.467410005332197397987e-02.  */
+	.quad	0x3F738C17C47C7961 /* A00 = +4.772274820224659853951e-03.  */
+	.quad	0x3FF3234DDE3BD146 /* A01 = +1.196119182682268355933e+00.  */
+	.quad	0xBFE078C0D77A9D3B /* A02 = -5.147403915952176722826e-01.  */
+	.quad	0x3FB40D74B3E276B8 /* A03 = +7.833032027925923568290e-02.  */
+	.quad	0x3FA0474BECC689C7 /* A00 = +3.179394975019849550746e-02.  */
+	.quad	0x3FF256FB4FA7D18A /* A01 = +1.146235762743432307076e+00.  */
+	.quad	0xBFDEFA8E3FB285E2 /* A02 = -4.840427038235174395098e-01.  */
+	.quad	0x3FB270C007493D59 /* A03 = +7.203293016322244446403e-02.  */
+	.quad	0x3FAF5BD51E479BDC /* A00 = +6.124750132203590768931e-02.  */
+	.quad	0x3FF18081D0B53BC5 /* A01 = +1.093873801484492647162e+00.  */
+	.quad	0xBFDCFE2439BD0C03 /* A02 = -4.530115665294831006626e-01.  */
+	.quad	0x3FB0DEFE5A45AFDD /* A03 = +6.590261176978580437424e-02.  */
+	.quad	0x3FB7BD5D2806EA26 /* A00 = +9.273321368429118805032e-02.  */
+	.quad	0x3FF0A369E35B4440 /* A01 = +1.039895904647224256223e+00.  */
+	.quad	0xBFDB04BC5C9951E7 /* A02 = -4.221640495573226181669e-01.  */
+	.quad	0x3FAEBBBAA9D6DEEF /* A03 = +6.002600978120919278380e-02.  */
+	.quad	0x3FC01BE411098DBC /* A00 = +1.258511622610124502941e-01.  */
+	.quad	0x3FEF85BDABC031C1 /* A01 = +9.850757936961188621083e-01.  */
+	.quad	0xBFD91521375097C2 /* A02 = -3.919146576102968682065e-01.  */
+	.quad	0x3FABE26F0086D982 /* A03 = +5.446192628317005068883e-02.  */
+	.quad	0x3FC481D7FF5776B9 /* A00 = +1.602125164781023347604e-01.  */
+	.quad	0x3FEDC3506C1E7218 /* A01 = +9.300920592973538347792e-01.  */
+	.quad	0xBFD7349A88DA7D4F /* A02 = -3.625856720409119104964e-01.  */
+	.quad	0x3FA936E2DFF8E2AE /* A03 = +4.924687370334389358018e-02.  */
+	.quad	0x3FC90471F96FA27A /* A00 = +1.954481571149420671141e-01.  */
+	.quad	0x3FEC0451601987A2 /* A01 = +8.755270840595026360376e-01.  */
+	.quad	0xBFD5671CD4B898DC /* A02 = -3.344184949259110251063e-01.  */
+	.quad	0x3FA6BB9594603B67 /* A03 = +4.439990459660841243261e-02.  */
+	.quad	0x3FCFD8ADB9ED944C /* A00 = +2.488000066615846384011e-01.  */
+	.quad	0x3FE978C073F6809A /* A01 = +7.959902062321078108909e-01.  */
+	.quad	0xBFD2DF7E00BCD5A9 /* A02 = -2.948908812716931060471e-01.  */
+	.quad	0x3FA3614033D490B2 /* A03 = +3.785133965200894456959e-02.  */
+	.quad	0x3FD4846A12AFE5A0 /* A00 = +3.205819303981005674586e-01.  */
+	.quad	0x3FE63A1147D40472 /* A01 = +6.945883181471244061100e-01.  */
+	.quad	0xBFCFA2268AD34450 /* A02 = -2.471359422548027318101e-01.  */
+	.quad	0x3F9F150201D9FFE0 /* A03 = +3.035357605267552383310e-02.  */
+	.quad	0x3FD9018641F82BEB /* A00 = +3.907180446846598154131e-01.  */
+	.quad	0x3FE33B7C220FFBDC /* A01 = +6.010113396913498995389e-01.  */
+	.quad	0xBFCA4E4187E29C86 /* A02 = -2.055131829740483584423e-01.  */
+	.quad	0x3F98C30CED19F8F4 /* A03 = +2.418155858185229434287e-02.  */
+	.quad	0x3FDD4B8255BEB078 /* A00 = +4.577337109901757905561e-01.  */
+	.quad	0x3FE0858B19D3A49B /* A01 = +5.163016800335243905451e-01.  */
+	.quad	0xBFC5BC929EACE564 /* A02 = -1.698172831327539045176e-01.  */
+	.quad	0x3F93A083CE57DE2B /* A03 = +1.916700312537337677621e-02.  */
+	.quad	0x3FE0A8E5E039295C /* A00 = +5.206174258576470315063e-01.  */
+	.quad	0x3FDC35E1234583FE /* A01 = +4.407885403107342225937e-01.  */
+	.quad	0xBFC1DE034E31AEB9 /* A02 = -1.395877963835710222629e-01.  */
+	.quad	0x3F8EFDEBB3471BDC /* A03 = +1.513275280821162888101e-02.  */
+	.quad	0x3FE2851B603CB2A5 /* A00 = +5.787484054213406503564e-01.  */
+	.quad	0x3FD7F4A44ABBB286 /* A01 = +3.743067483726821853551e-01.  */
+	.quad	0xBFBD3EEB67087DE7 /* A02 = -1.142413260026767657385e-01.  */
+	.quad	0x3F8864F38329E8BD /* A03 = +1.191129917173260922836e-02.  */
+	.quad	0x3FE437DBE3C34AC1 /* A00 = +6.318187187665317283702e-01.  */
+	.quad	0x3FD43F6F789441B5 /* A01 = +3.163717916040938438194e-01.  */
+	.quad	0xBFB7D92E7901B9A4 /* A02 = -9.315767721429907277653e-02.  */
+	.quad	0x3F8327ED342308E1 /* A03 = +9.353497651663324544136e-03.  */
+	.quad	0x3FE5C0977766D55C /* A00 = +6.797597248138731451661e-01.  */
+	.quad	0x3FD10B42A764D8F9 /* A01 = +2.663122782427219115142e-01.  */
+	.quad	0xBFB3633351D3D70F /* A02 = -7.573242900602060456716e-02.  */
+	.quad	0x3F7E079E30FF899C /* A03 = +7.331483779099558922843e-03.  */
+	.quad	0x3FE7202CE08A88C4 /* A00 = +7.226776490754436288455e-01.  */
+	.quad	0x3FCC973EB5662B01 /* A01 = +2.233656297433626314319e-01.  */
+	.quad	0xBFAF70A455F9920B /* A02 = -6.140626477716545211782e-02.  */
+	.quad	0x3F77812411CE99B6 /* A03 = +5.738392731393584730859e-03.  */
+	.quad	0x3FE85879424095B1 /* A00 = +7.608000082006382003286e-01.  */
+	.quad	0x3FC7E73BD1674D84 /* A01 = +1.867441914060742336190e-01.  */
+	.quad	0xBFA96F84E4BF333B /* A02 = -4.967894832916504993525e-02.  */
+	.quad	0x3F72606DDCA6E117 /* A03 = +4.486493251924870105662e-03.  */
+	.quad	0x3FE96BFE4957F4DD /* A00 = +7.944327766887472330737e-01.  */
+	.quad	0x3FC3ED4780D25478 /* A01 = +1.556786898624158421711e-01.  */
+	.quad	0xBFA489C5F9A56B58 /* A02 = -4.011362717093075458408e-02.  */
+	.quad	0x3F6CB5DC17E9AD2A /* A03 = +3.504686231556104931972e-03.  */
+	.quad	0x3FEA5D9CB2F41234 /* A00 = +8.239272589858672724006e-01.  */
+	.quad	0x3FC091A758374DCF /* A01 = +1.294449978582705440555e-01.  */
+	.quad	0xBFA08E436D4B5CE0 /* A02 = -3.233538350257858517978e-02.  */
+	.quad	0x3F666997AD53E6B7 /* A03 = +2.735897297154145629133e-03.  */
+	.quad	0x3FEB3060342CB850 /* A00 = +8.496552485501158713532e-01.  */
+	.quad	0x3FBB7D30BBC7DC1B /* A01 = +1.073790033768634993860e-01.  */
+	.quad	0xBF9AA6BA3443D9E3 /* A02 = -2.602663940430173170060e-02.  */
+	.quad	0x3F617CA764B7850B /* A03 = +2.134634914668814050648e-03.  */
+	.quad	0x3FEBE759A6A0C7B8 /* A00 = +8.719909910635044170135e-01.  */
+	.quad	0x3FB6C10DE6A703FF /* A01 = +8.888327485239243264115e-02.  */
+	.quad	0xBF956C566D8BE1F6 /* A02 = -2.092108768099084498138e-02.  */
+	.quad	0x3F5B46D1A4A59CF8 /* A03 = +1.664833764687232917079e-03.  */
+	.quad	0x3FEC858494887A04 /* A00 = +8.912985707318630268503e-01.  */
+	.quad	0x3FB2CC31F543394D /* A01 = +7.342827070099140762682e-02.  */
+	.quad	0xBF9133477FF69137 /* A02 = -1.679717749142747504343e-02.  */
+	.quad	0x3F5544482FBB4DA5 /* A03 = +1.298017973501022466823e-03.  */
+	.quad	0x3FED0DB59D0E32E9 /* A00 = +9.079235141267335551518e-01.  */
+	.quad	0x3FAF006BAFFC6EF4 /* A01 = +6.055008433597022787787e-02.  */
+	.quad	0xBF8B97146FA2B97A /* A02 = -1.347175565419144252499e-02.  */
+	.quad	0x3F5093B01F4CDC69 /* A03 = +1.011774057770665211434e-03.  */
+	.quad	0x3FEDB487C3EC457C /* A00 = +9.282873942012623835751e-01.  */
+	.quad	0x3FA7390C09D0BD1D /* A01 = +4.535710925881118044112e-02.  */
+	.quad	0xBF83D9F7C3181106 /* A02 = -9.693084374710735778846e-03.  */
+	.quad	0x3F46E34A0A3C0E64 /* A03 = +6.984817050299072134500e-04.  */
+	.quad	0x3FEE5FFCB4E6EB00 /* A00 = +9.492171796076434020506e-01.  */
+	.quad	0x3F9F4913ED00AADF /* A01 = +3.055220731782070861526e-02.  */
+	.quad	0xBF79670BD0E59B5C /* A02 = -6.201788097633133961528e-03.  */
+	.quad	0x3F3BC998EBCAF96D /* A03 = +4.240034429975534616304e-04.  */
+	.quad	0x3FEEDBA41E9542FE /* A00 = +9.643116566968215064293e-01.  */
+	.quad	0x3F94F5DD18D9C24D /* A01 = +2.046914543319848858727e-02.  */
+	.quad	0xBF7034896AA122B9 /* A02 = -3.956352980886528904192e-03.  */
+	.quad	0x3F30DCCB47810B39 /* A03 = +2.573009765038273091199e-04.  */
+	.quad	0x3FEF33F2882520ED /* A00 = +9.750912341196716903724e-01.  */
+	.quad	0x3F8BF37F2CF553FF /* A01 = +1.364802699996836392315e-02.  */
+	.quad	0xBF649F6F05A69619 /* A02 = -2.517430152880317534986e-03.  */
+	.quad	0x3F247623C950AAC9 /* A03 = +1.561087307505231250044e-04.  */
+	.quad	0x3FEF727757751741 /* A00 = +9.827229221489021115943e-01.  */
+	.quad	0x3F828E67912C4400 /* A01 = +9.060677640748693306705e-03.  */
+	.quad	0xBF5A2F51A806CC2C /* A02 = -1.598195784123355826789e-03.  */
+	.quad	0x3F18D35D7687E613 /* A03 = +9.470231965016282719549e-05.  */
+	.quad	0x3FEF9E6325C5942A /* A00 = +9.880843866091073568469e-01.  */
+	.quad	0x3F788AB117618F76 /* A01 = +5.991641772286606867914e-03.  */
+	.quad	0xBF5096EAB0B1EA89 /* A02 = -1.012543859160305046233e-03.  */
+	.quad	0x3F0E1E50EC4435AB /* A03 = +5.744633156910412119652e-05.  */
+	.quad	0x3FEFBD0784049369 /* A00 = +9.918248728250605994461e-01.  */
+	.quad	0x3F702BBD8294035F /* A01 = +3.947963975634432264028e-03.  */
+	.quad	0xBF44FB55E0F00593 /* A02 = -6.403130845457509273330e-04.  */
+	.quad	0x3F0244DCD723230A /* A03 = +3.484534217219031730379e-05.  */
+	.quad	0x3FEFD245E2366A43 /* A00 = +9.944180887426415926811e-01.  */
+	.quad	0x3F653D82EC088433 /* A01 = +2.592807490387838333795e-03.  */
+	.quad	0xBF3A7DF75E013CB8 /* A02 = -4.042366908878036561859e-04.  */
+	.quad	0x3EF6298E69F991CD /* A03 = +2.113564425911141559972e-05.  */
+	.quad	0x3FEFE0EAA508BC69 /* A00 = +9.962056372950317539861e-01.  */
+	.quad	0x3F5BD0771AF3FDDA /* A01 = +1.697651208644282514598e-03.  */
+	.quad	0xBF30B2E1254DE571 /* A02 = -2.548026725928887099328e-04.  */
+	.quad	0x3EEAE28B70EC0256 /* A03 = +1.281973848454955042307e-05.  */
+	.quad	0x3FEFEAF5303D7F96 /* A00 = +9.974313680831865536192e-01.  */
+	.quad	0x3F5229111365657E /* A01 = +1.108423877289460134782e-03.  */
+	.quad	0xBF250572D04DFE66 /* A02 = -1.603796628408704519168e-04.  */
+	.quad	0x3EE04E89BB57C981 /* A03 = +7.775682983689149966743e-06.  */
+	.quad	0x3FEFF1CF52F1CF44 /* A00 = +9.982678051005469122003e-01.  */
+	.quad	0x3F47A71316147CEB /* A01 = +7.218211359577819110842e-04.  */
+	.quad	0xBF1A6D7604055719 /* A02 = -1.008132248946049582547e-04.  */
+	.quad	0x3ED3C8047586A85C /* A03 = +4.716233739913014633626e-06.  */
+	.quad	0x3FEFF6770369EF69 /* A00 = +9.988360468555416149528e-01.  */
+	.quad	0x3F3EBB261180FBF0 /* A01 = +4.689186039321105101130e-04.  */
+	.quad	0xBF1097754FE19D7F /* A02 = -6.329206004950480057066e-05.  */
+	.quad	0x3EC7FEFF83BCA0A7 /* A03 = +2.860556404988488738366e-06.  */
+	.quad	0x3FEFF99D42371AC4 /* A00 = +9.992204945818561334647e-01.  */
+	.quad	0x3F33EB2AEC271F59 /* A01 = +3.039340773764907474054e-04.  */
+	.quad	0xBF04CF18E0FC0D79 /* A02 = -3.968996690952969588805e-05.  */
+	.quad	0x3EBD1BDBD6019BE9 /* A03 = +1.735021065507727833886e-06.  */
+	.quad	0x3FEFFBBCA32B0D91 /* A00 = +9.994795977476532700123e-01.  */
+	.quad	0x3F29C41E1615110A /* A01 = +1.965796209707565346710e-04.  */
+	.quad	0xBEFA11F93D9DCB5A /* A02 = -2.486248909101414873235e-05.  */
+	.quad	0x3EB1A7CA4546F7A7 /* A03 = +1.052345642723709228769e-06.  */
+	.quad	0x3FEFFD298B8E8DE2 /* A00 = +9.996535993308806045121e-01.  */
+	.quad	0x3F20A1C42D523C5B /* A01 = +1.268913244172078754520e-04.  */
+	.quad	0xBEF0507A364AFAE4 /* A02 = -1.555859070622834605755e-05.  */
+	.quad	0x3EA56ACA17E7CDF4 /* A03 = +6.382806956848098872313e-07.  */
+	.quad	0x3FEFFE1DC82BA5A3 /* A00 = +9.997700604991915929176e-01.  */
+	.quad	0x3F156E73B90F1769 /* A01 = +8.175450626798714452801e-05.  */
+	.quad	0xBEE4663579D0A09F /* A02 = -9.727122057226747625365e-06.  */
+	.quad	0x3E99FAF6FEC5D4C1 /* A03 = +3.871371052824002996020e-07.  */
+	.quad	0x3FEFFEF8D0BB5E81 /* A00 = +9.998745037837154514548e-01.  */
+	.quad	0x3F06686DA18D39C3 /* A01 = +4.273972098777251447726e-05.  */
+	.quad	0xBED46BC298073E90 /* A02 = -4.868731025855742842491e-06.  */
+	.quad	0x3E88E42286B9D0FD /* A03 = +1.854535328530838170114e-07.  */
+	.quad	0x3FEFFF8DBC68DDC7 /* A00 = +9.999455146670975791423e-01.  */
+	.quad	0x3EF26B2953A80AF0 /* A01 = +1.756534514108903368909e-05.  */
+	.quad	0xBEBFC4472D580F83 /* A02 = -1.893443529411295465239e-06.  */
+	.quad	0x3E72505B4553D19F /* A03 = +6.822456673547912277047e-08.  */
+	.quad	0x3FEFFFCED1276609 /* A00 = +9.999765477215883935358e-01.  */
+	.quad	0x3EDE1A94C7CC58F5 /* A01 = +7.177313020153979672606e-06.  */
+	.quad	0xBEA8A2C988744E57 /* A02 = -7.342066660497443762363e-07.  */
+	.quad	0x3E5AF30036BBBAF4 /* A03 = +2.509841882843541084885e-08.  */
+	.quad	0x3FEFFFEAFE70FCFC /* A00 = +9.999899835164849370983e-01.  */
+	.quad	0x3EC879175E3549F5 /* A01 = +2.917410471128503564412e-06.  */
+	.quad	0xBE930E36677D1813 /* A02 = -2.839493400307523115929e-07.  */
+	.quad	0x3E43D4005B42D48F /* A03 = +9.233192745401904898013e-09.  */
+	.quad	0x3ff0000000000000
+	.quad	0x0000000000000000
+	.quad	0x0000000000000000
+	.quad	0x0000000000000000
+
+	.align	32
+	.type	__svml_stanh_data_internal_avx2, @object
+	.size	__svml_stanh_data_internal_avx2, .-__svml_stanh_data_internal_avx2
+	.type	__svml_stanh_data_internal, @object
+	.size	__svml_stanh_data_internal, .-__svml_stanh_data_internal
+#endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH v1 6/7] x86: Optimize svml_s_tanhf8_core_avx2.S
  2022-06-07 20:06 [PATCH v1 1/7] x86: Improve svml_s_atanhf16_core_avx512.S Noah Goldstein
                   ` (3 preceding siblings ...)
  2022-06-07 20:06 ` [PATCH v1 5/7] x86: Add data file that can be shared by tanhf-avx2 and tanhf-sse4 Noah Goldstein
@ 2022-06-07 20:06 ` Noah Goldstein
  2022-06-07 20:06 ` [PATCH v1 7/7] x86: Optimize svml_s_tanhf4_core_sse4.S Noah Goldstein
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 48+ messages in thread
From: Noah Goldstein @ 2022-06-07 20:06 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, Andrey.Kolesov

Optimizations are:
    1. Reduce code size (-81 bytes).
    2. Remove redundant move instructions.
    3. Slightly improve instruction selection/scheduling where
       possible.
    4. Prefer registers which get short instruction encoding.
    5. Reduce rodata size (-32 bytes).

Result is roughly a 17-18% speedup:

       Function, New Time, Old Time, New / Old
_ZGVdN8v_tanhf,     1.977,    2.402,     0.823
---
 .../fpu/multiarch/svml_s_tanhf8_core_avx2.S   | 905 ++++--------------
 1 file changed, 164 insertions(+), 741 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
index c5c87bf5b0..0d61d0d2ac 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
@@ -70,773 +70,196 @@
  *
  */
 
-/* Offsets for data table __svml_stanh_data_internal
- */
-#define _dbP				0
-#define _sSignMask			4288
-#define _sAbsMask			4320
-#define _iExpMantMask			4352
-#define _iExpMask			4384
-#define _iMinIdxOfsMask			4416
-#define _iMaxIdxMask			4448
-
 #include <sysdep.h>
 
+/* tanhf data tables for avx2 and sse4 implementations defined here.
+ */
+#include "svml_s_tanhf_rodata.S"
+
 	.section .text.avx2, "ax", @progbits
 ENTRY(_ZGVdN8v_tanhf_avx2)
-	pushq	%rbp
-	cfi_def_cfa_offset(16)
-	movq	%rsp, %rbp
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
-	andq	$-32, %rsp
-	pushq	%r12
-	subq	$120, %rsp
-	lea	_dbP+16+__svml_stanh_data_internal(%rip), %r10
-	vmovaps	%ymm0, %ymm12
-
 	/* Here huge arguments, INF and NaNs are filtered out to callout. */
-	vpand	_iExpMantMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm14
+	vpand	TANHF_DATA(_iExpMantMask)(%rip), %ymm0, %ymm4
+	vpsubd	TANHF_DATA(_iMinIdxOfsMask)(%rip), %ymm4, %ymm2
+
+	/* Selection of arguments between [0, 0x04280000] into ymm2.  */
+	vpxor	%ymm3, %ymm3, %ymm3
+	vpmaxsd	%ymm3, %ymm2, %ymm2
+	vpminsd	TANHF_DATA(_iMaxIdxMask)(%rip), %ymm2, %ymm2
 
 	/*
 	 *  small table specific variables *
 	 *  Constant loading
 	 */
-	vmovups	_iMaxIdxMask+__svml_stanh_data_internal(%rip), %ymm8
-	vpsubd	_iMinIdxOfsMask+__svml_stanh_data_internal(%rip), %ymm14, %ymm9
-
-	/* if VMIN, VMAX is defined for I type */
-	vxorps	%ymm15, %ymm15, %ymm15
-	vpcmpgtd %ymm15, %ymm9, %ymm0
-	vpand	%ymm0, %ymm9, %ymm7
-	vpcmpgtd %ymm8, %ymm9, %ymm6
-	vblendvps %ymm6, %ymm8, %ymm7, %ymm3
-	vpsrld	$14, %ymm3, %ymm1
-	vpcmpgtd _iExpMask+__svml_stanh_data_internal(%rip), %ymm14, %ymm13
-	vmovmskps %ymm13, %r11d
-	vandps	_sAbsMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm10
-	vandps	_sSignMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm11
-	vextractf128 $1, %ymm1, %xmm2
-	vmovd	%xmm1, %r9d
-	vmovd	%xmm2, %ecx
-	vpextrd	$1, %xmm2, %edx
-	vpextrd	$1, %xmm1, %r8d
-	movslq	%r9d, %r9
-	movslq	%edx, %rdx
-	movslq	%r8d, %r8
-	vpextrd	$2, %xmm1, %edi
-	movslq	%ecx, %rcx
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
-	vpextrd	$3, %xmm2, %r12d
-	vpextrd	$3, %xmm1, %esi
-	vpextrd	$2, %xmm2, %eax
-	movslq	%edi, %rdi
-	movslq	%r12d, %r12
-	movslq	%esi, %rsi
-	movslq	%eax, %rax
-	vmovupd	-16(%r9, %r10), %xmm5
-	vmovupd	-16(%rdx, %r10), %xmm14
-	vmovupd	-16(%rcx, %r10), %xmm13
-	vmovupd	(%r9, %r10), %xmm1
-	vmovupd	(%r8, %r10), %xmm2
-	vmovupd	-16(%r8, %r10), %xmm4
-	vinsertf128 $1, -16(%rdi, %r10), %ymm5, %ymm15
-	vinsertf128 $1, -16(%r12, %r10), %ymm14, %ymm3
-	vinsertf128 $1, -16(%rax, %r10), %ymm13, %ymm6
-	vinsertf128 $1, (%rdi, %r10), %ymm1, %ymm5
-	vinsertf128 $1, (%rsi, %r10), %ymm2, %ymm14
-	vunpcklpd %ymm3, %ymm6, %ymm8
+	vpsrld	$14, %ymm2, %ymm1
+
+	/* We are splitting ymm1 into 8 GPRs. This may be faster to do with
+	   store/load as we can take advantage of store-forwarding.  */
+	vmovq	%xmm1, %r8
+	/* We have eliminated all negative values for ymm1 so no need to sign
+	   extend.  */
+	movl	%r8d, %r9d
+	shrq	$32, %r8
+
+	/* Store base of lookup table in rax.  */
+	leaq	TANHF_DATA(_lookupTable)(%rip), %rax
+
+	/* Instead of using cross-lane permutes on ymm vectors, use vinsertf128
+	   with memory operand. This helps alleviate bottleneck on p5.  */
+	vmovupd	16(%r9, %rax), %xmm5
+
+	vpextrq	$1, %xmm1, %rsi
+	movl	%esi, %edi
+	shrq	$32, %rsi
+
+	vinsertf128 $1, 16(%rdi, %rax), %ymm5, %ymm5
+
+	vextracti128 $1, %ymm1, %xmm2
+	vmovq	%xmm2, %rdx
+	movl	%edx, %ecx
+	shrq	$32, %rdx
+
+	vmovupd	(%rcx, %rax), %xmm6
+
+	vpextrq	$1, %xmm2, %r10
+	movl	%r10d, %r11d
+	shrq	$32, %r10
+
+	vinsertf128 $1, (%r11, %rax), %ymm6, %ymm6
+
+	vmovupd	16(%r8, %rax), %xmm1
+	vinsertf128 $1, 16(%rsi, %rax), %ymm1, %ymm1
+	vmovupd	(%rdx, %rax), %xmm3
+	vinsertf128 $1, (%r10, %rax), %ymm3, %ymm3
+
+	vunpcklpd %ymm3, %ymm6, %ymm7
 	vunpckhpd %ymm3, %ymm6, %ymm6
-	vunpcklpd %ymm14, %ymm5, %ymm3
-	vunpckhpd %ymm14, %ymm5, %ymm2
-	vmovupd	(%rcx, %r10), %xmm13
-	vcvtps2pd %xmm10, %ymm5
-	vextractf128 $1, %ymm10, %xmm10
-	vfmadd213pd %ymm3, %ymm5, %ymm2
-	vinsertf128 $1, -16(%rsi, %r10), %ymm4, %ymm0
-	vmovupd	(%rdx, %r10), %xmm4
-	vunpcklpd %ymm0, %ymm15, %ymm9
-	vunpckhpd %ymm0, %ymm15, %ymm7
-	vfmadd213pd %ymm7, %ymm5, %ymm2
-	vfmadd213pd %ymm9, %ymm5, %ymm2
-	vinsertf128 $1, (%r12, %r10), %ymm4, %ymm0
-	vcvtps2pd %xmm10, %ymm4
-	vinsertf128 $1, (%rax, %r10), %ymm13, %ymm15
-	vunpcklpd %ymm0, %ymm15, %ymm1
-	vunpckhpd %ymm0, %ymm15, %ymm0
-	vfmadd213pd %ymm1, %ymm4, %ymm0
-	vcvtpd2ps %ymm2, %xmm1
-	vfmadd213pd %ymm6, %ymm4, %ymm0
-	vfmadd213pd %ymm8, %ymm4, %ymm0
-	vcvtpd2ps %ymm0, %xmm0
-	vinsertf128 $1, %xmm0, %ymm1, %ymm2
-	vorps	%ymm11, %ymm2, %ymm0
-	testl	%r11d, %r11d
 
-	/* Go to special inputs processing branch */
-	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx r13 r14 r15 r11d ymm0 ymm12
+	vunpcklpd %ymm1, %ymm5, %ymm3
+	vunpckhpd %ymm1, %ymm5, %ymm1
 
-	/* Restore registers
-	 * and exit the function
-	 */
+	vmovaps	TANHF_DATA(_sAbsMask)(%rip), %ymm11
+	/* Store special cases in ymm15.  */
+	vpcmpgtd TANHF_DATA(_iExpMask)(%rip), %ymm4, %ymm15
 
-L(EXIT):
-	addq	$120, %rsp
-	cfi_restore(12)
-	popq	%r12
-	movq	%rbp, %rsp
-	popq	%rbp
-	cfi_def_cfa(7, 8)
-	cfi_restore(6)
-	ret
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
+	vandps	%ymm11, %ymm0, %ymm4
 
-	/* Branch to process
-	 * special inputs
-	 */
+	vcvtps2pd %xmm4, %ymm5
 
-L(SPECIAL_VALUES_BRANCH):
-	vmovups	%ymm12, 32(%rsp)
-	vmovups	%ymm0, 64(%rsp)
-	# LOE rbx r13 r14 r15 r11d ymm0
+	vextractf128 $1, %ymm4, %xmm4
+	vcvtps2pd %xmm4, %ymm4
 
-	xorl	%r12d, %r12d
-	# LOE rbx r13 r14 r15 r11d r12d
+	vmovupd	16(%rcx, %rax), %xmm2
+	vinsertf128 $1, 16(%r11, %rax), %ymm2, %ymm2
 
-	vzeroupper
-	movq	%r13, 8(%rsp)
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
-	movl	%r11d, %r13d
-	movq	%r14, (%rsp)
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r15 r12d r13d
-
-	/* Range mask
-	 * bits check
-	 */
+	vfmadd213pd %ymm3, %ymm5, %ymm1
+
+	vmovupd	16(%rdx, %rax), %xmm3
+	vinsertf128 $1, 16(%r10, %rax), %ymm3, %ymm3
+
+	vunpcklpd %ymm3, %ymm2, %ymm10
+	vunpckhpd %ymm3, %ymm2, %ymm2
+
+	vfmadd213pd %ymm10, %ymm4, %ymm2
+	vfmadd213pd %ymm6, %ymm4, %ymm2
+	vfmadd213pd %ymm7, %ymm4, %ymm2
+	vcvtpd2ps %ymm2, %xmm2
+
+	vmovupd	(%r9, %rax), %xmm7
+	vinsertf128 $1, (%rdi, %rax), %ymm7, %ymm7
+
+	vmovupd	(%r8, %rax), %xmm3
+	vinsertf128 $1, (%rsi, %rax), %ymm3, %ymm3
+
+	vunpckhpd %ymm3, %ymm7, %ymm4
+	vunpcklpd %ymm3, %ymm7, %ymm7
 
-L(RANGEMASK_CHECK):
-	btl	%r12d, %r13d
+	vfmadd213pd %ymm4, %ymm5, %ymm1
+	vfmadd213pd %ymm7, %ymm5, %ymm1
+
+
+	vcvtpd2ps %ymm1, %xmm1
+	vinsertf128 $1, %xmm2, %ymm1, %ymm1
+
+	vmovmskps %ymm15, %edx
+	vandnps	%ymm0, %ymm11, %ymm2
+	testl	%edx, %edx
+	/* Go to special inputs processing branch */
+	jne	L(SPECIAL_VALUES_BRANCH)
+	# LOE rbx r12 r13 r14 r15 ymm0 ymm1 ymm2
+	/* Wait until after the branch to overwrite ymm0.  */
+	vorps	%ymm2, %ymm1, %ymm0
+	/* No stack restoration on the fastpath.  */
+	ret
 
-	/* Call scalar math function */
-	jc	L(SCALAR_MATH_CALL)
-	# LOE rbx r15 r12d r13d
 
-	/* Special inputs
-	 * processing loop
+	/* Cold case. edx has 1s where there was a special value that
+	   needs to be handled by a tanhf call. Optimize for code size
+	   more so than speed here.  */
+L(SPECIAL_VALUES_BRANCH):
+	# LOE rbx rdx r12 r13 r14 r15 ymm0 ymm1 ymm2
+	/* Use r13 to save/restore the stack pointer. This frees rbp for use
+	   as a callee-saved register, which saves code size.  */
+	pushq	%r13
+	cfi_def_cfa(rsp, 16)
+	/* Need callee-saved registers to preserve state across tanhf calls.
 	 */
+	pushq	%rbx
+	cfi_def_cfa(rsp, 24)
+	pushq	%rbp
+	cfi_def_cfa(rsp, 32)
+	movq	%rsp, %r13
+	cfi_def_cfa(r13, 32)
+
+	/* Align stack and make room for 2x ymm vectors.  */
+	andq	$-32, %rsp
+	addq	$-64, %rsp
+
+	/* Save all already computed inputs.  */
+	vorps	%ymm2, %ymm1, %ymm1
+	vmovaps	%ymm1, (%rsp)
+	/* Save original input (ymm0 unchanged up to this point).  */
+	vmovaps	%ymm0, 32(%rsp)
+
+	vzeroupper
 
+	/* edx has 1s where there was a special value that needs to be handled
+	   by a tanhf call.  */
+	movl	%edx, %ebx
 L(SPECIAL_VALUES_LOOP):
-	incl	%r12d
-	cmpl	$8, %r12d
-
-	/* Check bits in range mask */
-	jl	L(RANGEMASK_CHECK)
-	# LOE rbx r15 r12d r13d
-
-	movq	8(%rsp), %r13
-	cfi_restore(13)
-	movq	(%rsp), %r14
-	cfi_restore(14)
-	vmovups	64(%rsp), %ymm0
-
-	/* Go to exit */
-	jmp	L(EXIT)
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r13 r14 r15 ymm0
-
-	/* Scalar math fucntion call
-	 * to process special input
-	 */
+	# LOE rbx rbp r12 r13 r14 r15
+	/* Use rbp as the index of the special value; it is saved across calls
+	   to tanhf. We technically don't need a callee save register here as
+	   offset to rsp is always [0, 28] so we can restore rsp by realigning to
+	   64. Essentially the tradeoff is 1 extra save/restore vs 2 extra
+	   instructions in the loop. Realigning also costs more code size.  */
+	xorl	%ebp, %ebp
+	tzcntl	%ebx, %ebp
 
-L(SCALAR_MATH_CALL):
-	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	/* Scalar math function call to process special input.  */
+	movss	32(%rsp, %rbp, 4), %xmm0
 	call	tanhf@PLT
-	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	/* No good way to avoid the store-forwarding fault this will cause on
+	   return. `lfence` avoids the SF fault but at greater cost as it
+	   serializes stack/callee save restoration.  */
+	movss	%xmm0, (%rsp, %rbp, 4)
+
+	blsrl   %ebx, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+	# LOE r12 r13 r14 r15
 
-	/* Process special inputs in loop */
-	jmp	L(SPECIAL_VALUES_LOOP)
-	# LOE rbx r15 r12d r13d
-END(_ZGVdN8v_tanhf_avx2)
 
-	.section .rodata, "a"
-	.align	32
-
-#ifdef __svml_stanh_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(32)) VUINT32 _dbP[(134*4)][2];
-	__declspec(align(32)) VUINT32 _sSignMask[8][1];
-	__declspec(align(32)) VUINT32 _sAbsMask[8][1];
-	__declspec(align(32)) VUINT32 _iExpMantMask[8][1];
-	__declspec(align(32)) VUINT32 _iExpMask[8][1];
-	__declspec(align(32)) VUINT32 _iMinIdxOfsMask[8][1];
-	__declspec(align(32)) VUINT32 _iMaxIdxMask[8][1];
-} __svml_stanh_data_internal;
-#endif
-__svml_stanh_data_internal:
-	/* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500]. */
-	.quad	0x0000000000000000 /* A00 = +0.000000000000000000000e-01 */
-	.quad	0x3FF00000022C70EB /* A01 = +1.000000008097283510367e+00 */
-	.quad	0xBED00E878CFFA194 /* A02 = -3.828228912518614443549e-06 */
-	.quad	0xBFD551766D0607A9 /* A03 = -3.330970825846813476723e-01 */
-	.quad	0xBE53D60CE3E4C297 /* A00 = -1.847383956330407336230e-08 */
-	.quad	0x3FF000024177CF5C /* A01 = +1.000002151235967140508e+00 */
-	.quad	0xBF1758BC94A51A25 /* A02 = -8.906031613262943753568e-05 */
-	.quad	0xBFD53EAE67E0D4F0 /* A03 = -3.319507612644221339337e-01 */
-	.quad	0xBE5A9E47EF32D6FE /* A00 = -2.479020984039698285657e-08 */
-	.quad	0x3FF00002DA983057 /* A01 = +1.000002721676556793895e+00 */
-	.quad	0xBF1BD953509E94AA /* A02 = -1.062352277175377670507e-04 */
-	.quad	0xBFD53BDB562EEDD5 /* A03 = -3.317783681520414806876e-01 */
-	.quad	0xBE6191BBE496D294 /* A00 = -3.272532162914017685901e-08 */
-	.quad	0x3FF0000390492017 /* A01 = +1.000003398528866105366e+00 */
-	.quad	0xBF20727E814A57CE /* A02 = -1.254825043772153972919e-04 */
-	.quad	0xBFD538DE060A6F22 /* A03 = -3.315959033004550748913e-01 */
-	.quad	0xBE66DAFA2A893A25 /* A00 = -4.257146219278012568149e-08 */
-	.quad	0x3FF0000465E08CD1 /* A01 = +1.000004194219219266770e+00 */
-	.quad	0xBF2341C765EF91B6 /* A02 = -1.469188600530365522261e-04 */
-	.quad	0xBFD535B6841FAF9E /* A03 = -3.314033785124993469751e-01 */
-	.quad	0xBE6D5794E361E964 /* A00 = -5.465394929765249413434e-08 */
-	.quad	0x3FF000055EE2A0CB /* A01 = +1.000005121846742950353e+00 */
-	.quad	0xBF265E6C77E66C8B /* A02 = -1.706607253709506650304e-04 */
-	.quad	0xBFD53264DDCCEDA6 /* A03 = -3.312008062382240103361e-01 */
-	.quad	0xBE729C844D374A6E /* A00 = -6.933284462462096107184e-08 */
-	.quad	0x3FF000067F019093 /* A01 = +1.000006195180536350264e+00 */
-	.quad	0xBF29CC5348D6DCE5 /* A02 = -1.968242326435338705130e-04 */
-	.quad	0xBFD52EE92121ED35 /* A03 = -3.309881995734998416658e-01 */
-	.quad	0xBE775AEA17EAA872 /* A00 = -8.700465590574974405858e-08 */
-	.quad	0x3FF00007CA1D66B8 /* A01 = +1.000007428656699559610e+00 */
-	.quad	0xBF2D8F5EB98A2637 /* A02 = -2.255252009216044881395e-04 */
-	.quad	0xBFD52B435CDF9128 /* A03 = -3.307655722585587376727e-01 */
-	.quad	0xBE7D04DA28C343F0 /* A00 = -1.081040272327705484794e-07 */
-	.quad	0x3FF000094443CCF5 /* A01 = +1.000008837375216730337e+00 */
-	.quad	0xBF30D5B76C947AE5 /* A02 = -2.568791210978817814332e-04 */
-	.quad	0xBFD52773A0776FAD /* A03 = -3.305329386764651045105e-01 */
-	.quad	0xBE81DD77A12C51C7 /* A00 = -1.331054169875768625701e-07 */
-	.quad	0x3FF0000AF1AFD2DA /* A01 = +1.000010437096696680470e+00 */
-	.quad	0xBF331230624C1680 /* A02 = -2.910011410651516805537e-04 */
-	.quad	0xBFD52379FC0B61DF /* A03 = -3.302903138515186909352e-01 */
-	.quad	0xBE85D04EEEB3C435 /* A00 = -1.625247628488202841012e-07 */
-	.quad	0x3FF0000CD6C9B1F2 /* A01 = +1.000012244238970726684e+00 */
-	.quad	0xBF357F0742FADDD4 /* A02 = -3.280060509313874068243e-04 */
-	.quad	0xBFD51F56806D0E81 /* A03 = -3.300377134475880880338e-01 */
-	.quad	0xBE8A6E289B59681B /* A00 = -1.969211333326924655065e-07 */
-	.quad	0x3FF0000EF8268F72 /* A01 = +1.000014275873550406715e+00 */
-	.quad	0xBF381E277A1B747A /* A02 = -3.680082682942575423093e-04 */
-	.quad	0xBFD51B093F1D6FD4 /* A03 = -3.297751537663746734808e-01 */
-	.quad	0xBE8FCBC40EE9ABD5 /* A00 = -2.368983653301529373887e-07 */
-	.quad	0x3FF000115A883B6C /* A01 = +1.000016549721943981410e+00 */
-	.quad	0xBF3AF17AC974B3D9 /* A02 = -4.111218235774406434303e-04 */
-	.quad	0xBFD516924A4C549C /* A03 = -3.295026517456081105450e-01 */
-	.quad	0xBE92FFBC60A3F956 /* A00 = -2.831066871072026054144e-07 */
-	.quad	0x3FF0001402DCED8A /* A01 = +1.000019084151832604590e+00 */
-	.quad	0xBF3DFAE9390C4801 /* A02 = -4.574603454311488280083e-04 */
-	.quad	0xBFD511F1B4D7DC3A /* A03 = -3.292202249571719585575e-01 */
-	.quad	0xBE9690A22F96D5AD /* A00 = -3.362443262393081632612e-07 */
-	.quad	0x3FF00016F63EFF5D /* A01 = +1.000021898173108825247e+00 */
-	.quad	0xBF409E2C839605BB /* A02 = -5.071370461992499986334e-04 */
-	.quad	0xBFD50D27924BEE00 /* A03 = -3.289278916051614487515e-01 */
-	.quad	0xBE9AA56C65E72A73 /* A00 = -3.970591019557469835586e-07 */
-	.quad	0x3FF0001A39F4A43E /* A01 = +1.000025011433776978009e+00 */
-	.quad	0xBF425BD74C3D6667 /* A02 = -5.602647074553602319844e-04 */
-	.quad	0xBFD50833F6E1ABA2 /* A03 = -3.286256705238718156536e-01 */
-	.quad	0xBE9F4BD4FF1A83B0 /* A00 = -4.663500013744687071912e-07 */
-	.quad	0x3FF0001DD36F9EC2 /* A01 = +1.000028444215715683896e+00 */
-	.quad	0xBF44376634149405 /* A02 = -6.169556656102642569831e-04 */
-	.quad	0xBFD50316F77EDEE5 /* A03 = -3.283135811757190158922e-01 */
-	.quad	0xBEA3B625387BB079 /* A00 = -5.874486399249461304297e-07 */
-	.quad	0x3FF00023E14CFBA9 /* A01 = +1.000034217911642153709e+00 */
-	.quad	0xBF47392F923218D2 /* A02 = -7.087213783883111826306e-04 */
-	.quad	0xBFD4FB1FACDEB938 /* A03 = -3.278273761924483942209e-01 */
-	.quad	0xBEAA6E24F543500A /* A00 = -7.876828740601738750574e-07 */
-	.quad	0x3FF0002D5C6E8412 /* A01 = +1.000043259679163742959e+00 */
-	.quad	0xBF4BAF02BD7FDD70 /* A02 = -8.448375110664940040861e-04 */
-	.quad	0xBFD4EFEE6527A7DE /* A03 = -3.271442401734229177279e-01 */
-	.quad	0xBEB16E3EBE2157D0 /* A00 = -1.038947396133402500647e-06 */
-	.quad	0x3FF00038990FEE2F /* A01 = +1.000053975962952312884e+00 */
-	.quad	0xBF50569481C574CB /* A02 = -9.972048056490652716971e-04 */
-	.quad	0xBFD4E419278DA2B4 /* A03 = -3.264220129263251113372e-01 */
-	.quad	0xBEB6A7B6723165D4 /* A00 = -1.350350836279403750524e-06 */
-	.quad	0x3FF00045CAB4158E /* A01 = +1.000066558657042303793e+00 */
-	.quad	0xBF531D7C9C849108 /* A02 = -1.166698160951775212202e-03 */
-	.quad	0xBFD4D7A0BB33B152 /* A03 = -3.256608799117844954552e-01 */
-	.quad	0xBEBD0EE2A8654AFD /* A00 = -1.732000471561702711532e-06 */
-	.quad	0x3FF00055276F18D6 /* A01 = +1.000081209219890521211e+00 */
-	.quad	0xBF562FDBA3FB6C6C /* A02 = -1.354183666925102939860e-03 */
-	.quad	0xBFD4CA85F1B93DB2 /* A03 = -3.248610363561638125773e-01 */
-	.quad	0xBEC269D4036A207E /* A00 = -2.195047297096822741730e-06 */
-	.quad	0x3FF00066E7DA6E4E /* A01 = +1.000098138500919997540e+00 */
-	.quad	0xBF5991499FC36B3A /* A02 = -1.560518167983372759405e-03 */
-	.quad	0xBFD4BCC9A72283D6 /* A03 = -3.240226871658341556426e-01 */
-	.quad	0xBEC7154B6C09CFE1 /* A00 = -2.751729738565190291276e-06 */
-	.quad	0x3FF0007B47086B80 /* A01 = +1.000117566559055148900e+00 */
-	.quad	0xBF5D455433B4F8F4 /* A02 = -1.786548832412968197680e-03 */
-	.quad	0xBFD4AE6CC1BFE145 /* A03 = -3.231460468373550942722e-01 */
-	.quad	0xBECCA68CC64A0F8A /* A00 = -3.415415948561670285790e-06 */
-	.quad	0x3FF00092827742F7 /* A01 = +1.000139722473418535387e+00 */
-	.quad	0xBF60A7BF15A527AF /* A02 = -2.033112728132522705610e-03 */
-	.quad	0xBFD49F703214084C /* A03 = -3.222313393636155876010e-01 */
-	.quad	0xBED19E68676B241B /* A00 = -4.200644630977303616698e-06 */
-	.quad	0x3FF000ACDA037B26 /* A01 = +1.000164844146362863597e+00 */
-	.quad	0xBF62D99F836A02F8 /* A02 = -2.301036405072284102280e-03 */
-	.quad	0xBFD48FD4F2B91B28 /* A03 = -3.212787981359945810311e-01 */
-	.quad	0xBED57CF4B0C7AA54 /* A00 = -5.123164339408145209103e-06 */
-	.quad	0x3FF000CA8FD9E1A1 /* A01 = +1.000193178099017865534e+00 */
-	.quad	0xBF653A014548E686 /* A02 = -2.591135484433962181405e-03 */
-	.quad	0xBFD47F9C0844B38F /* A03 = -3.202886658426046806447e-01 */
-	.quad	0xBEDA012B1B1A41E2 /* A00 = -6.199971197454598722328e-06 */
-	.quad	0x3FF000EBE868FDF4 /* A01 = +1.000224979259539459520e+00 */
-	.quad	0xBF67CA9427E0A544 /* A02 = -2.904214255086275467410e-03 */
-	.quad	0xBFD46EC6812ADB37 /* A03 = -3.192611943626845749655e-01 */
-	.quad	0xBEDF3EAC5BF12194 /* A00 = -7.449344990702664567927e-06 */
-	.quad	0x3FF001112A520784 /* A01 = +1.000260510744255704196e+00 */
-	.quad	0xBF6A8D01ABDA4DC4 /* A02 = -3.241065277345108255891e-03 */
-	.quad	0xBFD45D55759FFA4A /* A03 = -3.181966446572103146551e-01 */
-	.quad	0xBEE2A541BC274267 /* A00 = -8.890883582164319970972e-06 */
-	.quad	0x3FF0013A9E5961F2 /* A01 = +1.000300043631906721231e+00 */
-	.quad	0xBF6D82ECD080C540 /* A02 = -3.602468994380686462264e-03 */
-	.quad	0xBFD44B4A0779C0AD /* A03 = -3.170952866557950611259e-01 */
-	.quad	0xBEE61D97609A27F4 /* A00 = -1.054553560499505625520e-05 */
-	.quad	0x3FF001688F56A3AF /* A01 = +1.000343856731187974773e+00 */
-	.quad	0xBF7056F8EFB683EC /* A02 = -3.989193351487490407647e-03 */
-	.quad	0xBFD438A5620F0F74 /* A03 = -3.159573991399533543500e-01 */
-	.quad	0xBEEA145429EDD370 /* A00 = -1.243563138839952927732e-05 */
-	.quad	0x3FF0019B4A242A67 /* A01 = +1.000392236341804297339e+00 */
-	.quad	0xBF7207D31CA78D9B /* A02 = -4.401993423445739288258e-03 */
-	.quad	0xBFD42568BA16E7CD /* A03 = -3.147832696228050619602e-01 */
-	.quad	0xBEEE96370D52680F /* A00 = -1.458491207477835326165e-05 */
-	.quad	0x3FF001D31D8E4115 /* A01 = +1.000445476009251821736e+00 */
-	.quad	0xBF73D4CC11EDC094 /* A02 = -4.841611050196221316400e-03 */
-	.quad	0xBFD411954D8664E7 /* A03 = -3.135731942252974469021e-01 */
-	.quad	0xBEF338C046215EF8 /* A00 = -1.833122622260562810219e-05 */
-	.quad	0x3FF00230C32C2EC1 /* A01 = +1.000534784691737621998e+00 */
-	.quad	0xBF76BD019BCC5DAF /* A02 = -5.551344188254799492943e-03 */
-	.quad	0xBFD3F2C7156DC21E /* A03 = -3.116929730668135389848e-01 */
-	.quad	0xBEF9B15EAE411EAE /* A00 = -2.450261207822986676092e-05 */
-	.quad	0x3FF002C2DF057A4D /* A01 = +1.000674124886830940184e+00 */
-	.quad	0xBF7B08CCD9AC1E30 /* A02 = -6.600189396301511801646e-03 */
-	.quad	0xBFD3C7A7A114FED8 /* A03 = -3.090609620157755976777e-01 */
-	.quad	0xBF00E36483C373B3 /* A00 = -3.221178528332122595812e-05 */
-	.quad	0x3FF0036F419480D7 /* A01 = +1.000838524028997644777e+00 */
-	.quad	0xBF7FD255D1777007 /* A02 = -7.768950679260206403087e-03 */
-	.quad	0xBFD39A453911D6CE /* A03 = -3.062909180947429588215e-01 */
-	.quad	0xBF05DFA04DD12059 /* A00 = -4.172046622180685472624e-05 */
-	.quad	0x3FF00438B2A03D8D /* A01 = +1.001030633695197069599e+00 */
-	.quad	0xBF828F8DBB4A9D10 /* A02 = -9.062869337255224921890e-03 */
-	.quad	0xBFD36AAB704697D9 /* A03 = -3.033856007044711255993e-01 */
-	.quad	0xBF0BF3E0C647DEFB /* A00 = -5.331544597092331081714e-05 */
-	.quad	0x3FF005221063D36D /* A01 = +1.001253189109060359741e+00 */
-	.quad	0xBF857A2CB3C96102 /* A02 = -1.048693584122917590862e-02 */
-	.quad	0xBFD338E65BBB4FEC /* A03 = -3.003478904549854444639e-01 */
-	.quad	0xBF11A506ED7C9D31 /* A00 = -6.730894835681591541979e-05 */
-	.quad	0x3FF0062E4D0EA92A /* A01 = +1.001508999829250345925e+00 */
-	.quad	0xBF88AB82C2761AF3 /* A02 = -1.204588085125866091241e-02 */
-	.quad	0xBFD305028D6BD206 /* A03 = -2.971807843271395688234e-01 */
-	.quad	0xBF1607C0922D9BF1 /* A00 = -8.403885708006799337092e-05 */
-	.quad	0x3FF007606C341961 /* A01 = +1.001800940198869449560e+00 */
-	.quad	0xBF8C25E6DA487BCF /* A02 = -1.374416688582682892494e-02 */
-	.quad	0xBFD2CF0D0EE8F7B5 /* A03 = -2.938873906713255768075e-01 */
-	.quad	0xBF1B3A8480A0A16D /* A00 = -1.038688061788578038307e-04 */
-	.quad	0x3FF008BB802D02D6 /* A01 = +1.002131939589323561535e+00 */
-	.quad	0xBF8FEB8AE99FD100 /* A02 = -1.558598065819483124983e-02 */
-	.quad	0xBFD297135BD0911B /* A03 = -2.904709240558688843059e-01 */
-	.quad	0xBF20ABB9BDB75C65 /* A00 = -1.271881327357976163798e-04 */
-	.quad	0x3FF00A42A76D8CD1 /* A01 = +1.002504972472525901495e+00 */
-	.quad	0xBF91FF3D752BB9E6 /* A02 = -1.757522609380570560722e-02 */
-	.quad	0xBFD25D235C1F88B4 /* A03 = -2.869346999779154305799e-01 */
-	.quad	0xBF243D3254425461 /* A00 = -1.544116913733432829448e-04 */
-	.quad	0x3FF00BF909D1795E /* A01 = +1.002923048355647051011e+00 */
-	.quad	0xBF94304E04D44942 /* A02 = -1.971551804042204897316e-02 */
-	.quad	0xBFD2214B5E61CFA6 /* A03 = -2.832821294498394371075e-01 */
-	.quad	0xBF286070011B61CE /* A00 = -1.859795307186510085994e-04 */
-	.quad	0x3FF00DE1D5E1627E /* A01 = +1.003389201612804537689e+00 */
-	.quad	0xBF9689D5F4163F59 /* A02 = -2.201017668045266231780e-02 */
-	.quad	0xBFD1E39A11C3B42C /* A03 = -2.795167134743816728104e-01 */
-	.quad	0xBF2D250B366A79E8 /* A00 = -2.223564326486314902259e-04 */
-	.quad	0x3FF010003E134001 /* A01 = +1.003906481248123094829e+00 */
-	.quad	0xBF990C9FF91F6F81 /* A02 = -2.446222265267250853271e-02 */
-	.quad	0xBFD1A41E80084CDC /* A03 = -2.756420374218586655246e-01 */
-	.quad	0xBF314DB5DDC2A30E /* A00 = -2.640313157465248123865e-04 */
-	.quad	0x3FF012577608921B /* A01 = +1.004477940624503018441e+00 */
-	.quad	0xBF9BB9626875B0C9 /* A02 = -2.707437288829409385849e-02 */
-	.quad	0xBFD162E80768A9D0 /* A03 = -2.716617653228725615122e-01 */
-	.quad	0xBF346A6133808864 /* A00 = -3.115165050094957730625e-04 */
-	.quad	0x3FF014EAAFCC88A3 /* A01 = +1.005106627192198898157e+00 */
-	.quad	0xBF9E90BEF9BF7419 /* A02 = -2.984903716411588595059e-02 */
-	.quad	0xBFD12006545F7FAD /* A03 = -2.675796340899932457269e-01 */
-	.quad	0xBF37F180DC3848EA /* A00 = -3.653468704395550778821e-04 */
-	.quad	0x3FF017BD19147861 /* A01 = +1.005795572250939295955e+00 */
-	.quad	0xBFA0C9A14C702E07 /* A02 = -3.278831537326359207851e-02 */
-	.quad	0xBFD0DB895B650092 /* A03 = -2.633994476818851682154e-01 */
-	.quad	0xBF3BEC6AAC6D7635 /* A00 = -4.260788377246944457107e-04 */
-	.quad	0x3FF01AD1D884E719 /* A01 = +1.006547780778822565040e+00 */
-	.quad	0xBFA260B2A1B1434A /* A02 = -3.589399551186163439542e-02 */
-	.quad	0xBFD09581529E93D6 /* A03 = -2.591250712233067465817e-01 */
-	.quad	0xBF4164E26167882B /* A00 = -5.308251737086202562063e-04 */
-	.quad	0x3FF01FEF14B62B81 /* A01 = +1.007796364693348545316e+00 */
-	.quad	0xBFA4EB014538AA42 /* A02 = -4.085544557559163403315e-02 */
-	.quad	0xBFD029D36FEAF41F /* A03 = -2.525528519580024222613e-01 */
-	.quad	0xBF46F6FFF4E53DC8 /* A00 = -7.008313930700277652464e-04 */
-	.quad	0x3FF027CBB51CBBA0 /* A01 = +1.009715754956893363214e+00 */
-	.quad	0xBFA89DEC9FEC112E /* A02 = -4.807986690687680864098e-02 */
-	.quad	0xBFCF2A99464D0DB4 /* A03 = -2.434875100390009317053e-01 */
-	.quad	0xBF4DCC9C4F66A4D9 /* A00 = -9.094012482836712945103e-04 */
-	.quad	0x3FF030E7CFCCD583 /* A01 = +1.011939822882909068014e+00 */
-	.quad	0xBFACAA3B95814081 /* A02 = -5.598627281199331645611e-02 */
-	.quad	0xBFCDF78F156BE7CF /* A03 = -2.341173987004467604844e-01 */
-	.quad	0xBF5308ED74E5C7A6 /* A00 = -1.161796466103906435435e-03 */
-	.quad	0x3FF03B5986412ECB /* A01 = +1.014489674026594512313e+00 */
-	.quad	0xBFB087EBA88DCC3F /* A02 = -6.457398285947223148806e-02 */
-	.quad	0xBFCCBB9BD134862F /* A03 = -2.244753619680052991736e-01 */
-	.quad	0xBF57FA23C00DF4B5 /* A00 = -1.463446533505758208674e-03 */
-	.quad	0x3FF0473558A1BCC0 /* A01 = +1.017384859292903342975e+00 */
-	.quad	0xBFB2E702BC6360EF /* A02 = -7.383744334527241048871e-02 */
-	.quad	0xBFCB77D546379288 /* A03 = -2.145945160729250122955e-01 */
-	.quad	0xBF5DD12971557F71 /* A00 = -1.819887610814388068450e-03 */
-	.quad	0x3FF0548DDF5000A8 /* A01 = +1.020643112482540360020e+00 */
-	.quad	0xBFB571B63DA186E1 /* A02 = -8.376635555898871710045e-02 */
-	.quad	0xBFCA2D5202605148 /* A03 = -2.045080672838912594358e-01 */
-	.quad	0xBF6252B1AD5D4F17 /* A00 = -2.236697221556737096709e-03 */
-	.quad	0x3FF063738A910BF7 /* A01 = +1.024280110622155737232e+00 */
-	.quad	0xBFB8270C8E6B601B /* A02 = -9.434584118878357184013e-02 */
-	.quad	0xBFC8DD27D950A07E /* A03 = -1.942491351230763441116e-01 */
-	.quad	0xBF66470C91730CFC /* A00 = -2.719425723258004842786e-03 */
-	.quad	0x3FF073F468FCF331 /* A01 = +1.028309259519300633556e+00 */
-	.quad	0xBFBB05C2952191E4 /* A02 = -1.055566419686964629854e-01 */
-	.quad	0xBFC7886A770DE2BD /* A03 = -1.838505822486435070662e-01 */
-	.quad	0xBF6AD114AC8E98EC /* A00 = -3.273525599485007861467e-03 */
-	.quad	0x3FF0861BF53E5226 /* A01 = +1.032741506559554434119e+00 */
-	.quad	0xBFBE0C4F9B461507 /* A02 = -1.173753503881763554650e-01 */
-	.quad	0xBFC6302A037CDE3A /* A03 = -1.733448521642786954722e-01 */
-	.quad	0xBF6FFBDE2A6C2AF8 /* A00 = -3.904279630096648551207e-03 */
-	.quad	0x3FF099F2EB8E7DA3 /* A01 = +1.037585182326304034106e+00 */
-	.quad	0xBFC09C74D192DDF0 /* A02 = -1.297746680554463516444e-01 */
-	.quad	0xBFC4D571D8E3079F /* A03 = -1.627638157861470424859e-01 */
-	.quad	0xBF72E8FDC0B952AA /* A00 = -4.616728994353872309042e-03 */
-	.quad	0x3FF0AF7F273C9533 /* A01 = +1.042845872181101141152e+00 */
-	.quad	0xBFC244C512736F10 /* A02 = -1.427236881344176033792e-01 */
-	.quad	0xBFC379474F58B902 /* A03 = -1.521386277613104298645e-01 */
-	.quad	0xBF762EABAF17395B /* A00 = -5.415602341101023557701e-03 */
-	.quad	0x3FF0C6C3886F63FB /* A01 = +1.048526318502125631582e+00 */
-	.quad	0xBFC3FDF9918EA12A /* A02 = -1.561881981590514389957e-01 */
-	.quad	0xBFC21CA89ECAB895 /* A03 = -1.414995932913753196036e-01 */
-	.quad	0xBF79D387CE5B2BAE /* A00 = -6.305246822828998107258e-03 */
-	.quad	0x3FF0DFBFE2346376 /* A01 = +1.054626353847394337748e+00 */
-	.quad	0xBFC5C6DA43602620 /* A02 = -1.701309994680721970894e-01 */
-	.quad	0xBFC0C08BD8DB6631 /* A03 = -1.308760460731704100557e-01 */
-	.quad	0xBF7DDBA8E8DA9060 /* A00 = -7.289562037531366334164e-03 */
-	.quad	0x3FF0FA70F0D1B464 /* A01 = +1.061142864894713433443e+00 */
-	.quad	0xBFC79E18D92BAA7C /* A02 = -1.845122394946264732241e-01 */
-	.quad	0xBFBECBBBF74C2669 /* A03 = -1.202962378266875381749e-01 */
-	.quad	0xBF81254E76EA25DA /* A00 = -8.371937755572145950511e-03 */
-	.quad	0x3FF116D05835EBD0 /* A01 = +1.068069786618014660462e+00 */
-	.quad	0xBFC982539E2ED224 /* A02 = -1.992897531869327609755e-01 */
-	.quad	0xBFBC1B043C350159 /* A03 = -1.097872397413132278254e-01 */
-	.quad	0xBF8391ACBA863403 /* A00 = -9.555196230190082448686e-03 */
-	.quad	0x3FF134D4AA477FE2 /* A01 = +1.075398125794884141015e+00 */
-	.quad	0xBFCB7218609FEAFB /* A02 = -2.144194099235717521079e-01 */
-	.quad	0xBFB970A16CB88329 /* A03 = -9.937485603633135211599e-02 */
-	.quad	0xBF87935088E48E8B /* A00 = -1.151144902957603431692e-02 */
-	.quad	0x3FF1649892AD7DD3 /* A01 = +1.087059567413110938716e+00 */
-	.quad	0xBFCE6971DDE75409 /* A02 = -2.375929196847723912089e-01 */
-	.quad	0xBFB58291E88CB251 /* A03 = -8.402358939628952472223e-02 */
-	.quad	0xBF8DB3A62C325325 /* A00 = -1.450280973794233242702e-02 */
-	.quad	0x3FF1A9C900C6DEEA /* A01 = +1.103951457056548068891e+00 */
-	.quad	0xBFD13DBC65B0E08E /* A02 = -2.693930619311765140012e-01 */
-	.quad	0xBFB06696F62696D1 /* A03 = -6.406539449252625362252e-02 */
-	.quad	0xBF92583699F2E27A /* A00 = -1.791463198307716858659e-02 */
-	.quad	0x3FF1F451B85AA9F0 /* A01 = +1.122148246892376022288e+00 */
-	.quad	0xBFD34FD5F8288180 /* A02 = -3.017477916164565954205e-01 */
-	.quad	0xBFA6FB692825B683 /* A03 = -4.488686194495718900788e-02 */
-	.quad	0xBF9641C26E673D6F /* A00 = -2.173522757385398448959e-02 */
-	.quad	0x3FF24364DA5E2B07 /* A01 = +1.141453602790251542487e+00 */
-	.quad	0xBFD564A5A5EF5890 /* A02 = -3.342680092295120530821e-01 */
-	.quad	0xBF9B43712011A982 /* A03 = -2.662445791467283467968e-02 */
-	.quad	0xBF9A901038EC2F39 /* A00 = -2.594018313816024226548e-02 */
-	.quad	0x3FF2961356DFFEBA /* A01 = +1.161639537196534011088e+00 */
-	.quad	0xBFD775EBB17198C7 /* A02 = -3.665723069046972759644e-01 */
-	.quad	0xBF833B1A926CD462 /* A03 = -9.390075295963199591975e-03 */
-	.quad	0xBF9F396A6A461B91 /* A00 = -3.049246095317987084727e-02 */
-	.quad	0x3FF2EB53BAEF534B /* A01 = +1.182452898229899629357e+00 */
-	.quad	0xBFD97DABF8AD8BBD /* A02 = -3.982953957076310058660e-01 */
-	.quad	0x3F7B8F6A3E0F8837 /* A03 = +6.728568086119371925713e-03 */
-	.quad	0xBFA21878590F8BAA /* A00 = -3.534294211546946951064e-02 */
-	.quad	0x3FF34209790236E1 /* A01 = +1.203622315111197105253e+00 */
-	.quad	0xBFDB764C0E71BECB /* A02 = -4.290952817018306997277e-01 */
-	.quad	0x3F962FE0C03F84C0 /* A03 = +2.166701482190513949888e-02 */
-	.quad	0xBFA4B36B9AD27ECC /* A00 = -4.043136849327097492868e-02 */
-	.quad	0x3FF3990C5B12FC16 /* A01 = +1.224865298994477935679e+00 */
-	.quad	0xBFDD5AABB0D01390 /* A02 = -4.586590983092770912322e-01 */
-	.quad	0x3FA21DAF5CA162DB /* A03 = +3.538272863142363083844e-02 */
-	.quad	0xBFA7645E4D7BF28B /* A00 = -4.568762489177399105378e-02 */
-	.quad	0x3FF3EF2FD51C0D9F /* A01 = +1.245895225962932562069e+00 */
-	.quad	0xBFDF26377E1B686E /* A02 = -4.867075664057044503963e-01 */
-	.quad	0x3FA8803E756EE812 /* A03 = +4.785342391501513914509e-02 */
-	.quad	0xBFAA210925C64413 /* A00 = -5.103329263796054643398e-02 */
-	.quad	0x3FF44349F897D8E7 /* A01 = +1.266427966181760345066e+00 */
-	.quad	0xBFE06A7B02C6D8E2 /* A02 = -5.129981092675530707226e-01 */
-	.quad	0x3FAE3F194734F5D0 /* A03 = +5.907515520309980505687e-02 */
-	.quad	0xBFACDE48F8A19BBB /* A00 = -5.638340029764018351832e-02 */
-	.quad	0x3FF49439D5466582 /* A01 = +1.286187966447272845727e+00 */
-	.quad	0xBFE131C7C1063DDC /* A02 = -5.373266954429101183166e-01 */
-	.quad	0x3FB1ADEEC36AD805 /* A03 = +6.906025191241844940482e-02 */
-	.quad	0xBFAF905D8F585680 /* A00 = -6.164829611604449866036e-02 */
-	.quad	0x3FF4E0ED1FD27F99 /* A01 = +1.304913639360142818546e+00 */
-	.quad	0xBFE1E7A859DC1D3D /* A02 = -5.595285182070380836095e-01 */
-	.quad	0x3FB3ED018E4642A1 /* A03 = +7.783517573831001679086e-02 */
-	.quad	0xBFB11595104160BA /* A00 = -6.673556944713512906198e-02 */
-	.quad	0x3FF528650340490B /* A01 = +1.322361958217302513319e+00 */
-	.quad	0xBFE28B14B40BC974 /* A02 = -5.794776455425521000109e-01 */
-	.quad	0x3FB5DF49F5BAF6D7 /* A03 = +8.543836831355676453281e-02 */
-	.quad	0xBFB2513A97344BA4 /* A00 = -7.155195418844911836587e-02 */
-	.quad	0x3FF569BA0DB5EE14 /* A01 = +1.338312200124055273420e+00 */
-	.quad	0xBFE31B53A8B67B20 /* A02 = -5.970857901737396389308e-01 */
-	.quad	0x3FB787F297BB0544 /* A03 = +9.191814617499455275507e-02 */
-	.quad	0xBFB37512E848FAFA /* A00 = -7.600515528700305112331e-02 */
-	.quad	0x3FF5A41F33B403C8 /* A01 = +1.352568819013173495591e+00 */
-	.quad	0xBFE397F6EA9A58A5 /* A02 = -6.123003561103997904880e-01 */
-	.quad	0x3FB8EAA9FF25CA06 /* A03 = +9.733068923177520814782e-02 */
-	.quad	0xBFB47B3E603AFC5D /* A00 = -8.000554894805263217439e-02 */
-	.quad	0x3FF5D6E3EDE40487 /* A01 = +1.364963464031718975988e+00 */
-	.quad	0xBFE400D5BCA6D631 /* A02 = -6.251019177058819709103e-01 */
-	.quad	0x3FBA0B830ED567FE /* A03 = +1.017381583418739132707e-01 */
-	.quad	0xBFB5BBFE8AC90496 /* A00 = -8.489981544791400103200e-02 */
-	.quad	0x3FF612BA70107E95 /* A01 = +1.379572332145390989311e+00 */
-	.quad	0xBFE477EAF1FA7693 /* A02 = -6.396383978023599814478e-01 */
-	.quad	0x3FBB4784B7C08A95 /* A03 = +1.065600346196709652391e-01 */
-	.quad	0xBFB6D5D940743939 /* A00 = -8.920057128509463473254e-02 */
-	.quad	0x3FF644A8748F70CE /* A01 = +1.391762214006166953340e+00 */
-	.quad	0xBFE4D646AB07EA37 /* A02 = -6.511567440459832267763e-01 */
-	.quad	0x3FBC354F4E1D5292 /* A03 = +1.101884427747086558913e-01 */
-	.quad	0xBFB7223D19E4F3D1 /* A00 = -9.036619074045339206069e-02 */
-	.quad	0x3FF6518FEB42B7FA /* A01 = +1.394912642466350494175e+00 */
-	.quad	0xBFE4ED86CB87498C /* A02 = -6.539949393430091184598e-01 */
-	.quad	0x3FBC6D29F28CCA9B /* A03 = +1.110407082713131127205e-01 */
-	.quad	0xBFB6878652FF6312 /* A00 = -8.800544287022329936754e-02 */
-	.quad	0x3FF63948C302D040 /* A01 = +1.388985406648330922508e+00 */
-	.quad	0xBFE4C4E2E7904E17 /* A02 = -6.490339777687407218920e-01 */
-	.quad	0x3FBC127356CA1ABE /* A03 = +1.096565329445224612481e-01 */
-	.quad	0xBFB4F5D18B0C91D6 /* A00 = -8.187589306596207427980e-02 */
-	.quad	0x3FF5FD27EB7DD0B8 /* A01 = +1.374305648697413673176e+00 */
-	.quad	0xBFE464E01A2B2FC6 /* A02 = -6.373138915164353601739e-01 */
-	.quad	0x3FBB460547674A30 /* A03 = +1.065371798825160976065e-01 */
-	.quad	0xBFB26642FA16A685 /* A00 = -7.187288861919156890412e-02 */
-	.quad	0x3FF59F9BEDE1C95A /* A01 = +1.351467065073470141812e+00 */
-	.quad	0xBFE3D67920C8FBEA /* A02 = -6.199308052381387046381e-01 */
-	.quad	0x3FBA24F6A8D3CBC1 /* A03 = +1.021265184570401413078e-01 */
-	.quad	0xBFADB5294794F097 /* A00 = -5.802277563859197656582e-02 */
-	.quad	0x3FF523EA7B9CF453 /* A01 = +1.321268542159732772845e+00 */
-	.quad	0xBFE322A8B55E35DB /* A02 = -5.979808370918208160205e-01 */
-	.quad	0x3FB8C8673B1B3E37 /* A03 = +9.680791085269722928697e-02 */
-	.quad	0xBFA4B7D661965C6A /* A00 = -4.046506825687219699450e-02 */
-	.quad	0x3FF48DE3E2CE3122 /* A01 = +1.284641157110919085227e+00 */
-	.quad	0xBFE251FED1A7F445 /* A02 = -5.725092024655472622285e-01 */
-	.quad	0x3FB745699FCABDB9 /* A03 = +9.090290213747821701507e-02 */
-	.quad	0xBF93E60456E4EE1D /* A00 = -1.943213253365004902773e-02 */
-	.quad	0x3FF3E1A14E628A59 /* A01 = +1.242585474196536532432e+00 */
-	.quad	0xBFE16C5AB660E876 /* A02 = -5.444768488007543094653e-01 */
-	.quad	0x3FB5AD33AA8C188F /* A03 = +8.467410005332197397987e-02 */
-	.quad	0x3F738C17C47C7961 /* A00 = +4.772274820224659853951e-03 */
-	.quad	0x3FF3234DDE3BD146 /* A01 = +1.196119182682268355933e+00 */
-	.quad	0xBFE078C0D77A9D3B /* A02 = -5.147403915952176722826e-01 */
-	.quad	0x3FB40D74B3E276B8 /* A03 = +7.833032027925923568290e-02 */
-	.quad	0x3FA0474BECC689C7 /* A00 = +3.179394975019849550746e-02 */
-	.quad	0x3FF256FB4FA7D18A /* A01 = +1.146235762743432307076e+00 */
-	.quad	0xBFDEFA8E3FB285E2 /* A02 = -4.840427038235174395098e-01 */
-	.quad	0x3FB270C007493D59 /* A03 = +7.203293016322244446403e-02 */
-	.quad	0x3FAF5BD51E479BDC /* A00 = +6.124750132203590768931e-02 */
-	.quad	0x3FF18081D0B53BC5 /* A01 = +1.093873801484492647162e+00 */
-	.quad	0xBFDCFE2439BD0C03 /* A02 = -4.530115665294831006626e-01 */
-	.quad	0x3FB0DEFE5A45AFDD /* A03 = +6.590261176978580437424e-02 */
-	.quad	0x3FB7BD5D2806EA26 /* A00 = +9.273321368429118805032e-02 */
-	.quad	0x3FF0A369E35B4440 /* A01 = +1.039895904647224256223e+00 */
-	.quad	0xBFDB04BC5C9951E7 /* A02 = -4.221640495573226181669e-01 */
-	.quad	0x3FAEBBBAA9D6DEEF /* A03 = +6.002600978120919278380e-02 */
-	.quad	0x3FC01BE411098DBC /* A00 = +1.258511622610124502941e-01 */
-	.quad	0x3FEF85BDABC031C1 /* A01 = +9.850757936961188621083e-01 */
-	.quad	0xBFD91521375097C2 /* A02 = -3.919146576102968682065e-01 */
-	.quad	0x3FABE26F0086D982 /* A03 = +5.446192628317005068883e-02 */
-	.quad	0x3FC481D7FF5776B9 /* A00 = +1.602125164781023347604e-01 */
-	.quad	0x3FEDC3506C1E7218 /* A01 = +9.300920592973538347792e-01 */
-	.quad	0xBFD7349A88DA7D4F /* A02 = -3.625856720409119104964e-01 */
-	.quad	0x3FA936E2DFF8E2AE /* A03 = +4.924687370334389358018e-02 */
-	.quad	0x3FC90471F96FA27A /* A00 = +1.954481571149420671141e-01 */
-	.quad	0x3FEC0451601987A2 /* A01 = +8.755270840595026360376e-01 */
-	.quad	0xBFD5671CD4B898DC /* A02 = -3.344184949259110251063e-01 */
-	.quad	0x3FA6BB9594603B67 /* A03 = +4.439990459660841243261e-02 */
-	.quad	0x3FCFD8ADB9ED944C /* A00 = +2.488000066615846384011e-01 */
-	.quad	0x3FE978C073F6809A /* A01 = +7.959902062321078108909e-01 */
-	.quad	0xBFD2DF7E00BCD5A9 /* A02 = -2.948908812716931060471e-01 */
-	.quad	0x3FA3614033D490B2 /* A03 = +3.785133965200894456959e-02 */
-	.quad	0x3FD4846A12AFE5A0 /* A00 = +3.205819303981005674586e-01 */
-	.quad	0x3FE63A1147D40472 /* A01 = +6.945883181471244061100e-01 */
-	.quad	0xBFCFA2268AD34450 /* A02 = -2.471359422548027318101e-01 */
-	.quad	0x3F9F150201D9FFE0 /* A03 = +3.035357605267552383310e-02 */
-	.quad	0x3FD9018641F82BEB /* A00 = +3.907180446846598154131e-01 */
-	.quad	0x3FE33B7C220FFBDC /* A01 = +6.010113396913498995389e-01 */
-	.quad	0xBFCA4E4187E29C86 /* A02 = -2.055131829740483584423e-01 */
-	.quad	0x3F98C30CED19F8F4 /* A03 = +2.418155858185229434287e-02 */
-	.quad	0x3FDD4B8255BEB078 /* A00 = +4.577337109901757905561e-01 */
-	.quad	0x3FE0858B19D3A49B /* A01 = +5.163016800335243905451e-01 */
-	.quad	0xBFC5BC929EACE564 /* A02 = -1.698172831327539045176e-01 */
-	.quad	0x3F93A083CE57DE2B /* A03 = +1.916700312537337677621e-02 */
-	.quad	0x3FE0A8E5E039295C /* A00 = +5.206174258576470315063e-01 */
-	.quad	0x3FDC35E1234583FE /* A01 = +4.407885403107342225937e-01 */
-	.quad	0xBFC1DE034E31AEB9 /* A02 = -1.395877963835710222629e-01 */
-	.quad	0x3F8EFDEBB3471BDC /* A03 = +1.513275280821162888101e-02 */
-	.quad	0x3FE2851B603CB2A5 /* A00 = +5.787484054213406503564e-01 */
-	.quad	0x3FD7F4A44ABBB286 /* A01 = +3.743067483726821853551e-01 */
-	.quad	0xBFBD3EEB67087DE7 /* A02 = -1.142413260026767657385e-01 */
-	.quad	0x3F8864F38329E8BD /* A03 = +1.191129917173260922836e-02 */
-	.quad	0x3FE437DBE3C34AC1 /* A00 = +6.318187187665317283702e-01 */
-	.quad	0x3FD43F6F789441B5 /* A01 = +3.163717916040938438194e-01 */
-	.quad	0xBFB7D92E7901B9A4 /* A02 = -9.315767721429907277653e-02 */
-	.quad	0x3F8327ED342308E1 /* A03 = +9.353497651663324544136e-03 */
-	.quad	0x3FE5C0977766D55C /* A00 = +6.797597248138731451661e-01 */
-	.quad	0x3FD10B42A764D8F9 /* A01 = +2.663122782427219115142e-01 */
-	.quad	0xBFB3633351D3D70F /* A02 = -7.573242900602060456716e-02 */
-	.quad	0x3F7E079E30FF899C /* A03 = +7.331483779099558922843e-03 */
-	.quad	0x3FE7202CE08A88C4 /* A00 = +7.226776490754436288455e-01 */
-	.quad	0x3FCC973EB5662B01 /* A01 = +2.233656297433626314319e-01 */
-	.quad	0xBFAF70A455F9920B /* A02 = -6.140626477716545211782e-02 */
-	.quad	0x3F77812411CE99B6 /* A03 = +5.738392731393584730859e-03 */
-	.quad	0x3FE85879424095B1 /* A00 = +7.608000082006382003286e-01 */
-	.quad	0x3FC7E73BD1674D84 /* A01 = +1.867441914060742336190e-01 */
-	.quad	0xBFA96F84E4BF333B /* A02 = -4.967894832916504993525e-02 */
-	.quad	0x3F72606DDCA6E117 /* A03 = +4.486493251924870105662e-03 */
-	.quad	0x3FE96BFE4957F4DD /* A00 = +7.944327766887472330737e-01 */
-	.quad	0x3FC3ED4780D25478 /* A01 = +1.556786898624158421711e-01 */
-	.quad	0xBFA489C5F9A56B58 /* A02 = -4.011362717093075458408e-02 */
-	.quad	0x3F6CB5DC17E9AD2A /* A03 = +3.504686231556104931972e-03 */
-	.quad	0x3FEA5D9CB2F41234 /* A00 = +8.239272589858672724006e-01 */
-	.quad	0x3FC091A758374DCF /* A01 = +1.294449978582705440555e-01 */
-	.quad	0xBFA08E436D4B5CE0 /* A02 = -3.233538350257858517978e-02 */
-	.quad	0x3F666997AD53E6B7 /* A03 = +2.735897297154145629133e-03 */
-	.quad	0x3FEB3060342CB850 /* A00 = +8.496552485501158713532e-01 */
-	.quad	0x3FBB7D30BBC7DC1B /* A01 = +1.073790033768634993860e-01 */
-	.quad	0xBF9AA6BA3443D9E3 /* A02 = -2.602663940430173170060e-02 */
-	.quad	0x3F617CA764B7850B /* A03 = +2.134634914668814050648e-03 */
-	.quad	0x3FEBE759A6A0C7B8 /* A00 = +8.719909910635044170135e-01 */
-	.quad	0x3FB6C10DE6A703FF /* A01 = +8.888327485239243264115e-02 */
-	.quad	0xBF956C566D8BE1F6 /* A02 = -2.092108768099084498138e-02 */
-	.quad	0x3F5B46D1A4A59CF8 /* A03 = +1.664833764687232917079e-03 */
-	.quad	0x3FEC858494887A04 /* A00 = +8.912985707318630268503e-01 */
-	.quad	0x3FB2CC31F543394D /* A01 = +7.342827070099140762682e-02 */
-	.quad	0xBF9133477FF69137 /* A02 = -1.679717749142747504343e-02 */
-	.quad	0x3F5544482FBB4DA5 /* A03 = +1.298017973501022466823e-03 */
-	.quad	0x3FED0DB59D0E32E9 /* A00 = +9.079235141267335551518e-01 */
-	.quad	0x3FAF006BAFFC6EF4 /* A01 = +6.055008433597022787787e-02 */
-	.quad	0xBF8B97146FA2B97A /* A02 = -1.347175565419144252499e-02 */
-	.quad	0x3F5093B01F4CDC69 /* A03 = +1.011774057770665211434e-03 */
-	.quad	0x3FEDB487C3EC457C /* A00 = +9.282873942012623835751e-01 */
-	.quad	0x3FA7390C09D0BD1D /* A01 = +4.535710925881118044112e-02 */
-	.quad	0xBF83D9F7C3181106 /* A02 = -9.693084374710735778846e-03 */
-	.quad	0x3F46E34A0A3C0E64 /* A03 = +6.984817050299072134500e-04 */
-	.quad	0x3FEE5FFCB4E6EB00 /* A00 = +9.492171796076434020506e-01 */
-	.quad	0x3F9F4913ED00AADF /* A01 = +3.055220731782070861526e-02 */
-	.quad	0xBF79670BD0E59B5C /* A02 = -6.201788097633133961528e-03 */
-	.quad	0x3F3BC998EBCAF96D /* A03 = +4.240034429975534616304e-04 */
-	.quad	0x3FEEDBA41E9542FE /* A00 = +9.643116566968215064293e-01 */
-	.quad	0x3F94F5DD18D9C24D /* A01 = +2.046914543319848858727e-02 */
-	.quad	0xBF7034896AA122B9 /* A02 = -3.956352980886528904192e-03 */
-	.quad	0x3F30DCCB47810B39 /* A03 = +2.573009765038273091199e-04 */
-	.quad	0x3FEF33F2882520ED /* A00 = +9.750912341196716903724e-01 */
-	.quad	0x3F8BF37F2CF553FF /* A01 = +1.364802699996836392315e-02 */
-	.quad	0xBF649F6F05A69619 /* A02 = -2.517430152880317534986e-03 */
-	.quad	0x3F247623C950AAC9 /* A03 = +1.561087307505231250044e-04 */
-	.quad	0x3FEF727757751741 /* A00 = +9.827229221489021115943e-01 */
-	.quad	0x3F828E67912C4400 /* A01 = +9.060677640748693306705e-03 */
-	.quad	0xBF5A2F51A806CC2C /* A02 = -1.598195784123355826789e-03 */
-	.quad	0x3F18D35D7687E613 /* A03 = +9.470231965016282719549e-05 */
-	.quad	0x3FEF9E6325C5942A /* A00 = +9.880843866091073568469e-01 */
-	.quad	0x3F788AB117618F76 /* A01 = +5.991641772286606867914e-03 */
-	.quad	0xBF5096EAB0B1EA89 /* A02 = -1.012543859160305046233e-03 */
-	.quad	0x3F0E1E50EC4435AB /* A03 = +5.744633156910412119652e-05 */
-	.quad	0x3FEFBD0784049369 /* A00 = +9.918248728250605994461e-01 */
-	.quad	0x3F702BBD8294035F /* A01 = +3.947963975634432264028e-03 */
-	.quad	0xBF44FB55E0F00593 /* A02 = -6.403130845457509273330e-04 */
-	.quad	0x3F0244DCD723230A /* A03 = +3.484534217219031730379e-05 */
-	.quad	0x3FEFD245E2366A43 /* A00 = +9.944180887426415926811e-01 */
-	.quad	0x3F653D82EC088433 /* A01 = +2.592807490387838333795e-03 */
-	.quad	0xBF3A7DF75E013CB8 /* A02 = -4.042366908878036561859e-04 */
-	.quad	0x3EF6298E69F991CD /* A03 = +2.113564425911141559972e-05 */
-	.quad	0x3FEFE0EAA508BC69 /* A00 = +9.962056372950317539861e-01 */
-	.quad	0x3F5BD0771AF3FDDA /* A01 = +1.697651208644282514598e-03 */
-	.quad	0xBF30B2E1254DE571 /* A02 = -2.548026725928887099328e-04 */
-	.quad	0x3EEAE28B70EC0256 /* A03 = +1.281973848454955042307e-05 */
-	.quad	0x3FEFEAF5303D7F96 /* A00 = +9.974313680831865536192e-01 */
-	.quad	0x3F5229111365657E /* A01 = +1.108423877289460134782e-03 */
-	.quad	0xBF250572D04DFE66 /* A02 = -1.603796628408704519168e-04 */
-	.quad	0x3EE04E89BB57C981 /* A03 = +7.775682983689149966743e-06 */
-	.quad	0x3FEFF1CF52F1CF44 /* A00 = +9.982678051005469122003e-01 */
-	.quad	0x3F47A71316147CEB /* A01 = +7.218211359577819110842e-04 */
-	.quad	0xBF1A6D7604055719 /* A02 = -1.008132248946049582547e-04 */
-	.quad	0x3ED3C8047586A85C /* A03 = +4.716233739913014633626e-06 */
-	.quad	0x3FEFF6770369EF69 /* A00 = +9.988360468555416149528e-01 */
-	.quad	0x3F3EBB261180FBF0 /* A01 = +4.689186039321105101130e-04 */
-	.quad	0xBF1097754FE19D7F /* A02 = -6.329206004950480057066e-05 */
-	.quad	0x3EC7FEFF83BCA0A7 /* A03 = +2.860556404988488738366e-06 */
-	.quad	0x3FEFF99D42371AC4 /* A00 = +9.992204945818561334647e-01 */
-	.quad	0x3F33EB2AEC271F59 /* A01 = +3.039340773764907474054e-04 */
-	.quad	0xBF04CF18E0FC0D79 /* A02 = -3.968996690952969588805e-05 */
-	.quad	0x3EBD1BDBD6019BE9 /* A03 = +1.735021065507727833886e-06 */
-	.quad	0x3FEFFBBCA32B0D91 /* A00 = +9.994795977476532700123e-01 */
-	.quad	0x3F29C41E1615110A /* A01 = +1.965796209707565346710e-04 */
-	.quad	0xBEFA11F93D9DCB5A /* A02 = -2.486248909101414873235e-05 */
-	.quad	0x3EB1A7CA4546F7A7 /* A03 = +1.052345642723709228769e-06 */
-	.quad	0x3FEFFD298B8E8DE2 /* A00 = +9.996535993308806045121e-01 */
-	.quad	0x3F20A1C42D523C5B /* A01 = +1.268913244172078754520e-04 */
-	.quad	0xBEF0507A364AFAE4 /* A02 = -1.555859070622834605755e-05 */
-	.quad	0x3EA56ACA17E7CDF4 /* A03 = +6.382806956848098872313e-07 */
-	.quad	0x3FEFFE1DC82BA5A3 /* A00 = +9.997700604991915929176e-01 */
-	.quad	0x3F156E73B90F1769 /* A01 = +8.175450626798714452801e-05 */
-	.quad	0xBEE4663579D0A09F /* A02 = -9.727122057226747625365e-06 */
-	.quad	0x3E99FAF6FEC5D4C1 /* A03 = +3.871371052824002996020e-07 */
-	.quad	0x3FEFFEF8D0BB5E81 /* A00 = +9.998745037837154514548e-01 */
-	.quad	0x3F06686DA18D39C3 /* A01 = +4.273972098777251447726e-05 */
-	.quad	0xBED46BC298073E90 /* A02 = -4.868731025855742842491e-06 */
-	.quad	0x3E88E42286B9D0FD /* A03 = +1.854535328530838170114e-07 */
-	.quad	0x3FEFFF8DBC68DDC7 /* A00 = +9.999455146670975791423e-01 */
-	.quad	0x3EF26B2953A80AF0 /* A01 = +1.756534514108903368909e-05 */
-	.quad	0xBEBFC4472D580F83 /* A02 = -1.893443529411295465239e-06 */
-	.quad	0x3E72505B4553D19F /* A03 = +6.822456673547912277047e-08 */
-	.quad	0x3FEFFFCED1276609 /* A00 = +9.999765477215883935358e-01 */
-	.quad	0x3EDE1A94C7CC58F5 /* A01 = +7.177313020153979672606e-06 */
-	.quad	0xBEA8A2C988744E57 /* A02 = -7.342066660497443762363e-07 */
-	.quad	0x3E5AF30036BBBAF4 /* A03 = +2.509841882843541084885e-08 */
-	.quad	0x3FEFFFEAFE70FCFC /* A00 = +9.999899835164849370983e-01 */
-	.quad	0x3EC879175E3549F5 /* A01 = +2.917410471128503564412e-06 */
-	.quad	0xBE930E36677D1813 /* A02 = -2.839493400307523115929e-07 */
-	.quad	0x3E43D4005B42D48F /* A03 = +9.233192745401904898013e-09 */
-	.quad	0x3ff0000000000000
-	.quad	0x0000000000000000
-	.quad	0x0000000000000000
-	.quad	0x0000000000000000
-	.align	32
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSignMask */
-	.align	32
-	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _sAbsMask */
-	.align	32
-	.long	0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000 /* _iExpMantMask */
-	.align	32
-	.long	0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000 /* _iExpMask */
-	.align	32
-	.long	0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000 /* _iMinIdxOfsMask */
-	.align	32
-	.long	0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000 /* _iMaxIdxMask */
-	.align	32
-	.type	__svml_stanh_data_internal, @object
-	.size	__svml_stanh_data_internal, .-__svml_stanh_data_internal
+	/* All results have been written to 32(%rsp).  */
+	vmovups	(%rsp), %ymm0
+	/* Restore rsp.  */
+	movq	%r13, %rsp
+	cfi_def_cfa(rsp, 32)
+	/* Restore callee save registers.  */
+	popq	%rbp
+	cfi_def_cfa(rsp, 24)
+	popq	%rbx
+	cfi_def_cfa(rsp, 16)
+	popq	%r13
+	ret
+END(_ZGVdN8v_tanhf_avx2)
-- 
2.34.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH v1 7/7] x86: Optimize svml_s_tanhf4_core_sse4.S
  2022-06-07 20:06 [PATCH v1 1/7] x86: Improve svml_s_atanhf16_core_avx512.S Noah Goldstein
                   ` (4 preceding siblings ...)
  2022-06-07 20:06 ` [PATCH v1 6/7] x86: Optimize svml_s_tanhf8_core_avx2.S Noah Goldstein
@ 2022-06-07 20:06 ` Noah Goldstein
  2022-06-08  2:42   ` H.J. Lu
  2022-06-08  3:07   ` H.J. Lu
  2022-06-09  0:05 ` [PATCH v2 1/7] x86: Improve svml_s_atanhf16_core_avx512.S Noah Goldstein
                   ` (2 subsequent siblings)
  8 siblings, 2 replies; 48+ messages in thread
From: Noah Goldstein @ 2022-06-07 20:06 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, Andrey.Kolesov

Optimizations are:
    1. Reduce code size (-112 bytes).
    2. Remove redundant move instructions.
    3. Slightly improve instruction selection/scheduling where
       possible.
    4. Prefer registers which get short instruction encoding.
    5. Reduce rodata size (-4k+, as rodata is now shared with avx2).

Result is roughly a 15-16% speedup:

       Function, New Time, Old Time, New / Old
 _ZGVbN4v_tanhf,    3.158,    3.749,     0.842
---
 .../fpu/multiarch/svml_s_tanhf4_core_sse4.S   | 864 +++---------------
 1 file changed, 137 insertions(+), 727 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
index 532ebbac65..54580ebd79 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
@@ -70,761 +70,171 @@
  *
  */
 
-/* Offsets for data table __svml_stanh_data_internal
- */
-#define _dbP				0
-#define _sSignMask			4288
-#define _sAbsMask			4304
-#define _iExpMantMask			4320
-#define _iExpMask			4336
-#define _iMinIdxOfsMask			4352
-#define _iMaxIdxMask			4368
 
 #include <sysdep.h>
 
+/* tanhf data tables for avx2 and sse4 implementations defined here.
+ */
+#define ONLY_DECL_OFFSET
+#include "svml_s_tanhf_rodata.S"
+
 	.section .text.sse4, "ax", @progbits
 ENTRY(_ZGVbN4v_tanhf_sse4)
-	subq	$72, %rsp
-	cfi_def_cfa_offset(80)
-	movaps	%xmm0, %xmm5
+	/* Save copy of input in xmm12.  */
+	movaps	%xmm0, %xmm12
 
 	/* Here huge arguments, INF and NaNs are filtered out to callout. */
-	movdqu	_iExpMantMask+__svml_stanh_data_internal(%rip), %xmm9
-	lea	_dbP+16+__svml_stanh_data_internal(%rip), %r8
-	pand	%xmm5, %xmm9
+	movdqu	TANHF_DATA(_iExpMantMask)(%rip), %xmm3
+	pand	%xmm0, %xmm3
 
-	/* if VMIN, VMAX is defined for I type */
+
+	/* Selection of arguments between [0, 0x04280000] into xmm3.  */
 	pxor	%xmm7, %xmm7
-	movdqa	%xmm9, %xmm6
-	psubd	_iMinIdxOfsMask+__svml_stanh_data_internal(%rip), %xmm9
+	/* Save xmm3 for special values check at end.  */
+	movdqa	%xmm3, %xmm8
+	psubd	TANHF_DATA(_iMinIdxOfsMask)(%rip), %xmm3
+	pmaxsd	%xmm7, %xmm3
+	pminsd	TANHF_DATA(_iMaxIdxMask)(%rip), %xmm3
+	psrld	$14, %xmm3
+
+	movq	%xmm3, %rcx
+	movl	%ecx, %edx
+	shrq	$32, %rcx
+
+	pshufd	$0x0e, %xmm3, %xmm3
+	movq	%xmm3, %rdi
+	movl	%edi, %esi
+	shrq	$32, %rdi
+
+	movaps	TANHF_DATA(_sAbsMask)(%rip), %xmm1
+	andps	%xmm1, %xmm0
+
+	leaq	TANHF_DATA(_lookupTable)(%rip), %rax
+	movups	(%rdx, %rax), %xmm2
+	movups	(%rcx, %rax), %xmm6
 
 	/*
 	 *  small table specific variables *
 	 *  Constant loading
 	 */
-	movdqu	_iMaxIdxMask+__svml_stanh_data_internal(%rip), %xmm10
-	movdqa	%xmm9, %xmm11
-	movdqa	%xmm9, %xmm8
-	pcmpgtd	%xmm10, %xmm11
-	pcmpgtd	%xmm7, %xmm8
-	movdqa	%xmm11, %xmm14
-	pand	%xmm8, %xmm9
-	andps	%xmm11, %xmm10
-	andnps	%xmm9, %xmm14
-	orps	%xmm10, %xmm14
-	psrld	$14, %xmm14
-	movd	%xmm14, %edx
-	pshufd	$1, %xmm14, %xmm12
-	pshufd	$2, %xmm14, %xmm13
-	movd	%xmm12, %ecx
-	pshufd	$3, %xmm14, %xmm15
-	movups	_sAbsMask+__svml_stanh_data_internal(%rip), %xmm3
-	movslq	%edx, %rdx
-	andps	%xmm5, %xmm3
-	movslq	%ecx, %rcx
-	pcmpgtd	_iExpMask+__svml_stanh_data_internal(%rip), %xmm6
-	movd	%xmm13, %esi
-	movups	-16(%rdx, %r8), %xmm2
-	movaps	%xmm2, %xmm0
-	movd	%xmm15, %edi
-	movmskps %xmm6, %eax
-	movups	-16(%rcx, %r8), %xmm6
-	unpcklpd %xmm6, %xmm0
+	movaps	%xmm2, %xmm4
+	movlhps	%xmm6, %xmm4
 	unpckhpd %xmm6, %xmm2
-	cvtps2pd %xmm3, %xmm6
-	movhlps	%xmm3, %xmm3
-	cvtps2pd %xmm3, %xmm3
-	movslq	%esi, %rsi
-	movslq	%edi, %rdi
-	movups	(%rcx, %r8), %xmm8
-	movups	(%rdx, %r8), %xmm12
-	movups	(%rsi, %r8), %xmm13
-	movaps	%xmm12, %xmm10
-	movups	(%rdi, %r8), %xmm9
+
+	cvtps2pd %xmm0, %xmm6
+	movhlps	%xmm0, %xmm0
+	cvtps2pd %xmm0, %xmm0
+
+	movups	16(%rdx, %rax), %xmm5
+	movups	16(%rsi, %rax), %xmm13
+
+	movaps	%xmm5, %xmm10
 	movaps	%xmm13, %xmm11
-	unpckhpd %xmm8, %xmm12
-	unpckhpd %xmm9, %xmm13
-	mulpd	%xmm6, %xmm12
-	mulpd	%xmm3, %xmm13
-	unpcklpd %xmm8, %xmm10
-	unpcklpd %xmm9, %xmm11
-	addpd	%xmm10, %xmm12
+
+	movups	16(%rcx, %rax), %xmm7
+	movups	16(%rdi, %rax), %xmm3
+
+	unpckhpd %xmm7, %xmm5
+	unpckhpd %xmm3, %xmm13
+
+	mulpd	%xmm6, %xmm5
+	mulpd	%xmm0, %xmm13
+
+	movlhps	%xmm7, %xmm10
+	movlhps	%xmm3, %xmm11
+
+	addpd	%xmm10, %xmm5
 	addpd	%xmm11, %xmm13
-	mulpd	%xmm6, %xmm12
-	mulpd	%xmm3, %xmm13
-	addpd	%xmm2, %xmm12
-	movups	-16(%rsi, %r8), %xmm1
-	movups	-16(%rdi, %r8), %xmm7
-	movaps	%xmm1, %xmm14
-	unpckhpd %xmm7, %xmm1
-	addpd	%xmm1, %xmm13
-	mulpd	%xmm12, %xmm6
-	mulpd	%xmm13, %xmm3
-	addpd	%xmm0, %xmm6
-	unpcklpd %xmm7, %xmm14
-	addpd	%xmm14, %xmm3
-	cvtpd2ps %xmm6, %xmm0
-	cvtpd2ps %xmm3, %xmm1
-	movups	_sSignMask+__svml_stanh_data_internal(%rip), %xmm4
-	movlhps	%xmm1, %xmm0
-	andps	%xmm5, %xmm4
-	orps	%xmm4, %xmm0
-	testl	%eax, %eax
 
-	/* Go to special inputs processing branch */
-	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm5
+	mulpd	%xmm6, %xmm5
+	mulpd	%xmm0, %xmm13
 
-	/* Restore registers
-	 * and exit the function
-	 */
+	addpd	%xmm2, %xmm5
 
-L(EXIT):
-	addq	$72, %rsp
-	cfi_def_cfa_offset(8)
-	ret
-	cfi_def_cfa_offset(80)
+	movups	(%rsi, %rax), %xmm2
+	movups	(%rdi, %rax), %xmm7
 
-	/* Branch to process
-	 * special inputs
-	 */
+	movaps	%xmm2, %xmm3
 
-L(SPECIAL_VALUES_BRANCH):
-	movups	%xmm5, 32(%rsp)
-	movups	%xmm0, 48(%rsp)
-	# LOE rbx rbp r12 r13 r14 r15 eax
-
-	xorl	%edx, %edx
-	movq	%r12, 16(%rsp)
-	cfi_offset(12, -64)
-	movl	%edx, %r12d
-	movq	%r13, 8(%rsp)
-	cfi_offset(13, -72)
-	movl	%eax, %r13d
-	movq	%r14, (%rsp)
-	cfi_offset(14, -80)
-	# LOE rbx rbp r15 r12d r13d
-
-	/* Range mask
-	 * bits check
-	 */
+	unpckhpd %xmm7, %xmm2
+	movlhps	%xmm7, %xmm3
 
-L(RANGEMASK_CHECK):
-	btl	%r12d, %r13d
+	addpd	%xmm13, %xmm2
 
-	/* Call scalar math function */
-	jc	L(SCALAR_MATH_CALL)
-	# LOE rbx rbp r15 r12d r13d
+	mulpd	%xmm5, %xmm6
+	addpd	%xmm4, %xmm6
 
-	/* Special inputs
-	 * processing loop
-	 */
+	mulpd	%xmm2, %xmm0
+	addpd	%xmm3, %xmm0
 
-L(SPECIAL_VALUES_LOOP):
-	incl	%r12d
-	cmpl	$4, %r12d
-
-	/* Check bits in range mask */
-	jl	L(RANGEMASK_CHECK)
-	# LOE rbx rbp r15 r12d r13d
-
-	movq	16(%rsp), %r12
-	cfi_restore(12)
-	movq	8(%rsp), %r13
-	cfi_restore(13)
-	movq	(%rsp), %r14
-	cfi_restore(14)
-	movups	48(%rsp), %xmm0
-
-	/* Go to exit */
-	jmp	L(EXIT)
-	cfi_offset(12, -64)
-	cfi_offset(13, -72)
-	cfi_offset(14, -80)
-	# LOE rbx rbp r12 r13 r14 r15 xmm0
+	cvtpd2ps %xmm0, %xmm2
+	cvtpd2ps %xmm6, %xmm0
 
-	/* Scalar math fucntion call
-	 * to process special input
-	 */
+	movlhps	%xmm2, %xmm0
+	andnps	%xmm12, %xmm1
+	orps	%xmm1, %xmm0
 
-L(SCALAR_MATH_CALL):
-	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
-	call	tanhf@PLT
-	# LOE rbx rbp r14 r15 r12d r13d xmm0
+	/* xmm8 contains mask of special values.  */
+	pcmpgtd	TANHF_DATA(_iExpMask)(%rip), %xmm8
 
-	movss	%xmm0, 48(%rsp, %r14, 4)
+	movmskps %xmm8, %edx
+	testl	%edx, %edx
 
-	/* Process special inputs in loop */
-	jmp	L(SPECIAL_VALUES_LOOP)
-	# LOE rbx rbp r15 r12d r13d
-END(_ZGVbN4v_tanhf_sse4)
+	/* Go to special inputs processing branch */
+	jne	L(SPECIAL_VALUES_BRANCH)
+	# LOE rbx rbp r12 r13 r14 r15 xmm0
+	/* No stack restoration on the fastpath.  */
+	ret
 
-	.section .rodata, "a"
-	.align	16
-
-#ifdef __svml_stanh_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(16)) VUINT32 _dbP[(134*4)][2];
-	__declspec(align(16)) VUINT32 _sSignMask[4][1];
-	__declspec(align(16)) VUINT32 _sAbsMask[4][1];
-	__declspec(align(16)) VUINT32 _iExpMantMask[4][1];
-	__declspec(align(16)) VUINT32 _iExpMask[4][1];
-	__declspec(align(16)) VUINT32 _iMinIdxOfsMask[4][1];
-	__declspec(align(16)) VUINT32 _iMaxIdxMask[4][1];
-} __svml_stanh_data_internal;
-#endif
-__svml_stanh_data_internal:
-	/* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500]. */
-	.quad	0x0000000000000000 /* A00 = +0.000000000000000000000e-01 */
-	.quad	0x3FF00000022C70EB /* A01 = +1.000000008097283510367e+00 */
-	.quad	0xBED00E878CFFA194 /* A02 = -3.828228912518614443549e-06 */
-	.quad	0xBFD551766D0607A9 /* A03 = -3.330970825846813476723e-01 */
-	.quad	0xBE53D60CE3E4C297 /* A00 = -1.847383956330407336230e-08 */
-	.quad	0x3FF000024177CF5C /* A01 = +1.000002151235967140508e+00 */
-	.quad	0xBF1758BC94A51A25 /* A02 = -8.906031613262943753568e-05 */
-	.quad	0xBFD53EAE67E0D4F0 /* A03 = -3.319507612644221339337e-01 */
-	.quad	0xBE5A9E47EF32D6FE /* A00 = -2.479020984039698285657e-08 */
-	.quad	0x3FF00002DA983057 /* A01 = +1.000002721676556793895e+00 */
-	.quad	0xBF1BD953509E94AA /* A02 = -1.062352277175377670507e-04 */
-	.quad	0xBFD53BDB562EEDD5 /* A03 = -3.317783681520414806876e-01 */
-	.quad	0xBE6191BBE496D294 /* A00 = -3.272532162914017685901e-08 */
-	.quad	0x3FF0000390492017 /* A01 = +1.000003398528866105366e+00 */
-	.quad	0xBF20727E814A57CE /* A02 = -1.254825043772153972919e-04 */
-	.quad	0xBFD538DE060A6F22 /* A03 = -3.315959033004550748913e-01 */
-	.quad	0xBE66DAFA2A893A25 /* A00 = -4.257146219278012568149e-08 */
-	.quad	0x3FF0000465E08CD1 /* A01 = +1.000004194219219266770e+00 */
-	.quad	0xBF2341C765EF91B6 /* A02 = -1.469188600530365522261e-04 */
-	.quad	0xBFD535B6841FAF9E /* A03 = -3.314033785124993469751e-01 */
-	.quad	0xBE6D5794E361E964 /* A00 = -5.465394929765249413434e-08 */
-	.quad	0x3FF000055EE2A0CB /* A01 = +1.000005121846742950353e+00 */
-	.quad	0xBF265E6C77E66C8B /* A02 = -1.706607253709506650304e-04 */
-	.quad	0xBFD53264DDCCEDA6 /* A03 = -3.312008062382240103361e-01 */
-	.quad	0xBE729C844D374A6E /* A00 = -6.933284462462096107184e-08 */
-	.quad	0x3FF000067F019093 /* A01 = +1.000006195180536350264e+00 */
-	.quad	0xBF29CC5348D6DCE5 /* A02 = -1.968242326435338705130e-04 */
-	.quad	0xBFD52EE92121ED35 /* A03 = -3.309881995734998416658e-01 */
-	.quad	0xBE775AEA17EAA872 /* A00 = -8.700465590574974405858e-08 */
-	.quad	0x3FF00007CA1D66B8 /* A01 = +1.000007428656699559610e+00 */
-	.quad	0xBF2D8F5EB98A2637 /* A02 = -2.255252009216044881395e-04 */
-	.quad	0xBFD52B435CDF9128 /* A03 = -3.307655722585587376727e-01 */
-	.quad	0xBE7D04DA28C343F0 /* A00 = -1.081040272327705484794e-07 */
-	.quad	0x3FF000094443CCF5 /* A01 = +1.000008837375216730337e+00 */
-	.quad	0xBF30D5B76C947AE5 /* A02 = -2.568791210978817814332e-04 */
-	.quad	0xBFD52773A0776FAD /* A03 = -3.305329386764651045105e-01 */
-	.quad	0xBE81DD77A12C51C7 /* A00 = -1.331054169875768625701e-07 */
-	.quad	0x3FF0000AF1AFD2DA /* A01 = +1.000010437096696680470e+00 */
-	.quad	0xBF331230624C1680 /* A02 = -2.910011410651516805537e-04 */
-	.quad	0xBFD52379FC0B61DF /* A03 = -3.302903138515186909352e-01 */
-	.quad	0xBE85D04EEEB3C435 /* A00 = -1.625247628488202841012e-07 */
-	.quad	0x3FF0000CD6C9B1F2 /* A01 = +1.000012244238970726684e+00 */
-	.quad	0xBF357F0742FADDD4 /* A02 = -3.280060509313874068243e-04 */
-	.quad	0xBFD51F56806D0E81 /* A03 = -3.300377134475880880338e-01 */
-	.quad	0xBE8A6E289B59681B /* A00 = -1.969211333326924655065e-07 */
-	.quad	0x3FF0000EF8268F72 /* A01 = +1.000014275873550406715e+00 */
-	.quad	0xBF381E277A1B747A /* A02 = -3.680082682942575423093e-04 */
-	.quad	0xBFD51B093F1D6FD4 /* A03 = -3.297751537663746734808e-01 */
-	.quad	0xBE8FCBC40EE9ABD5 /* A00 = -2.368983653301529373887e-07 */
-	.quad	0x3FF000115A883B6C /* A01 = +1.000016549721943981410e+00 */
-	.quad	0xBF3AF17AC974B3D9 /* A02 = -4.111218235774406434303e-04 */
-	.quad	0xBFD516924A4C549C /* A03 = -3.295026517456081105450e-01 */
-	.quad	0xBE92FFBC60A3F956 /* A00 = -2.831066871072026054144e-07 */
-	.quad	0x3FF0001402DCED8A /* A01 = +1.000019084151832604590e+00 */
-	.quad	0xBF3DFAE9390C4801 /* A02 = -4.574603454311488280083e-04 */
-	.quad	0xBFD511F1B4D7DC3A /* A03 = -3.292202249571719585575e-01 */
-	.quad	0xBE9690A22F96D5AD /* A00 = -3.362443262393081632612e-07 */
-	.quad	0x3FF00016F63EFF5D /* A01 = +1.000021898173108825247e+00 */
-	.quad	0xBF409E2C839605BB /* A02 = -5.071370461992499986334e-04 */
-	.quad	0xBFD50D27924BEE00 /* A03 = -3.289278916051614487515e-01 */
-	.quad	0xBE9AA56C65E72A73 /* A00 = -3.970591019557469835586e-07 */
-	.quad	0x3FF0001A39F4A43E /* A01 = +1.000025011433776978009e+00 */
-	.quad	0xBF425BD74C3D6667 /* A02 = -5.602647074553602319844e-04 */
-	.quad	0xBFD50833F6E1ABA2 /* A03 = -3.286256705238718156536e-01 */
-	.quad	0xBE9F4BD4FF1A83B0 /* A00 = -4.663500013744687071912e-07 */
-	.quad	0x3FF0001DD36F9EC2 /* A01 = +1.000028444215715683896e+00 */
-	.quad	0xBF44376634149405 /* A02 = -6.169556656102642569831e-04 */
-	.quad	0xBFD50316F77EDEE5 /* A03 = -3.283135811757190158922e-01 */
-	.quad	0xBEA3B625387BB079 /* A00 = -5.874486399249461304297e-07 */
-	.quad	0x3FF00023E14CFBA9 /* A01 = +1.000034217911642153709e+00 */
-	.quad	0xBF47392F923218D2 /* A02 = -7.087213783883111826306e-04 */
-	.quad	0xBFD4FB1FACDEB938 /* A03 = -3.278273761924483942209e-01 */
-	.quad	0xBEAA6E24F543500A /* A00 = -7.876828740601738750574e-07 */
-	.quad	0x3FF0002D5C6E8412 /* A01 = +1.000043259679163742959e+00 */
-	.quad	0xBF4BAF02BD7FDD70 /* A02 = -8.448375110664940040861e-04 */
-	.quad	0xBFD4EFEE6527A7DE /* A03 = -3.271442401734229177279e-01 */
-	.quad	0xBEB16E3EBE2157D0 /* A00 = -1.038947396133402500647e-06 */
-	.quad	0x3FF00038990FEE2F /* A01 = +1.000053975962952312884e+00 */
-	.quad	0xBF50569481C574CB /* A02 = -9.972048056490652716971e-04 */
-	.quad	0xBFD4E419278DA2B4 /* A03 = -3.264220129263251113372e-01 */
-	.quad	0xBEB6A7B6723165D4 /* A00 = -1.350350836279403750524e-06 */
-	.quad	0x3FF00045CAB4158E /* A01 = +1.000066558657042303793e+00 */
-	.quad	0xBF531D7C9C849108 /* A02 = -1.166698160951775212202e-03 */
-	.quad	0xBFD4D7A0BB33B152 /* A03 = -3.256608799117844954552e-01 */
-	.quad	0xBEBD0EE2A8654AFD /* A00 = -1.732000471561702711532e-06 */
-	.quad	0x3FF00055276F18D6 /* A01 = +1.000081209219890521211e+00 */
-	.quad	0xBF562FDBA3FB6C6C /* A02 = -1.354183666925102939860e-03 */
-	.quad	0xBFD4CA85F1B93DB2 /* A03 = -3.248610363561638125773e-01 */
-	.quad	0xBEC269D4036A207E /* A00 = -2.195047297096822741730e-06 */
-	.quad	0x3FF00066E7DA6E4E /* A01 = +1.000098138500919997540e+00 */
-	.quad	0xBF5991499FC36B3A /* A02 = -1.560518167983372759405e-03 */
-	.quad	0xBFD4BCC9A72283D6 /* A03 = -3.240226871658341556426e-01 */
-	.quad	0xBEC7154B6C09CFE1 /* A00 = -2.751729738565190291276e-06 */
-	.quad	0x3FF0007B47086B80 /* A01 = +1.000117566559055148900e+00 */
-	.quad	0xBF5D455433B4F8F4 /* A02 = -1.786548832412968197680e-03 */
-	.quad	0xBFD4AE6CC1BFE145 /* A03 = -3.231460468373550942722e-01 */
-	.quad	0xBECCA68CC64A0F8A /* A00 = -3.415415948561670285790e-06 */
-	.quad	0x3FF00092827742F7 /* A01 = +1.000139722473418535387e+00 */
-	.quad	0xBF60A7BF15A527AF /* A02 = -2.033112728132522705610e-03 */
-	.quad	0xBFD49F703214084C /* A03 = -3.222313393636155876010e-01 */
-	.quad	0xBED19E68676B241B /* A00 = -4.200644630977303616698e-06 */
-	.quad	0x3FF000ACDA037B26 /* A01 = +1.000164844146362863597e+00 */
-	.quad	0xBF62D99F836A02F8 /* A02 = -2.301036405072284102280e-03 */
-	.quad	0xBFD48FD4F2B91B28 /* A03 = -3.212787981359945810311e-01 */
-	.quad	0xBED57CF4B0C7AA54 /* A00 = -5.123164339408145209103e-06 */
-	.quad	0x3FF000CA8FD9E1A1 /* A01 = +1.000193178099017865534e+00 */
-	.quad	0xBF653A014548E686 /* A02 = -2.591135484433962181405e-03 */
-	.quad	0xBFD47F9C0844B38F /* A03 = -3.202886658426046806447e-01 */
-	.quad	0xBEDA012B1B1A41E2 /* A00 = -6.199971197454598722328e-06 */
-	.quad	0x3FF000EBE868FDF4 /* A01 = +1.000224979259539459520e+00 */
-	.quad	0xBF67CA9427E0A544 /* A02 = -2.904214255086275467410e-03 */
-	.quad	0xBFD46EC6812ADB37 /* A03 = -3.192611943626845749655e-01 */
-	.quad	0xBEDF3EAC5BF12194 /* A00 = -7.449344990702664567927e-06 */
-	.quad	0x3FF001112A520784 /* A01 = +1.000260510744255704196e+00 */
-	.quad	0xBF6A8D01ABDA4DC4 /* A02 = -3.241065277345108255891e-03 */
-	.quad	0xBFD45D55759FFA4A /* A03 = -3.181966446572103146551e-01 */
-	.quad	0xBEE2A541BC274267 /* A00 = -8.890883582164319970972e-06 */
-	.quad	0x3FF0013A9E5961F2 /* A01 = +1.000300043631906721231e+00 */
-	.quad	0xBF6D82ECD080C540 /* A02 = -3.602468994380686462264e-03 */
-	.quad	0xBFD44B4A0779C0AD /* A03 = -3.170952866557950611259e-01 */
-	.quad	0xBEE61D97609A27F4 /* A00 = -1.054553560499505625520e-05 */
-	.quad	0x3FF001688F56A3AF /* A01 = +1.000343856731187974773e+00 */
-	.quad	0xBF7056F8EFB683EC /* A02 = -3.989193351487490407647e-03 */
-	.quad	0xBFD438A5620F0F74 /* A03 = -3.159573991399533543500e-01 */
-	.quad	0xBEEA145429EDD370 /* A00 = -1.243563138839952927732e-05 */
-	.quad	0x3FF0019B4A242A67 /* A01 = +1.000392236341804297339e+00 */
-	.quad	0xBF7207D31CA78D9B /* A02 = -4.401993423445739288258e-03 */
-	.quad	0xBFD42568BA16E7CD /* A03 = -3.147832696228050619602e-01 */
-	.quad	0xBEEE96370D52680F /* A00 = -1.458491207477835326165e-05 */
-	.quad	0x3FF001D31D8E4115 /* A01 = +1.000445476009251821736e+00 */
-	.quad	0xBF73D4CC11EDC094 /* A02 = -4.841611050196221316400e-03 */
-	.quad	0xBFD411954D8664E7 /* A03 = -3.135731942252974469021e-01 */
-	.quad	0xBEF338C046215EF8 /* A00 = -1.833122622260562810219e-05 */
-	.quad	0x3FF00230C32C2EC1 /* A01 = +1.000534784691737621998e+00 */
-	.quad	0xBF76BD019BCC5DAF /* A02 = -5.551344188254799492943e-03 */
-	.quad	0xBFD3F2C7156DC21E /* A03 = -3.116929730668135389848e-01 */
-	.quad	0xBEF9B15EAE411EAE /* A00 = -2.450261207822986676092e-05 */
-	.quad	0x3FF002C2DF057A4D /* A01 = +1.000674124886830940184e+00 */
-	.quad	0xBF7B08CCD9AC1E30 /* A02 = -6.600189396301511801646e-03 */
-	.quad	0xBFD3C7A7A114FED8 /* A03 = -3.090609620157755976777e-01 */
-	.quad	0xBF00E36483C373B3 /* A00 = -3.221178528332122595812e-05 */
-	.quad	0x3FF0036F419480D7 /* A01 = +1.000838524028997644777e+00 */
-	.quad	0xBF7FD255D1777007 /* A02 = -7.768950679260206403087e-03 */
-	.quad	0xBFD39A453911D6CE /* A03 = -3.062909180947429588215e-01 */
-	.quad	0xBF05DFA04DD12059 /* A00 = -4.172046622180685472624e-05 */
-	.quad	0x3FF00438B2A03D8D /* A01 = +1.001030633695197069599e+00 */
-	.quad	0xBF828F8DBB4A9D10 /* A02 = -9.062869337255224921890e-03 */
-	.quad	0xBFD36AAB704697D9 /* A03 = -3.033856007044711255993e-01 */
-	.quad	0xBF0BF3E0C647DEFB /* A00 = -5.331544597092331081714e-05 */
-	.quad	0x3FF005221063D36D /* A01 = +1.001253189109060359741e+00 */
-	.quad	0xBF857A2CB3C96102 /* A02 = -1.048693584122917590862e-02 */
-	.quad	0xBFD338E65BBB4FEC /* A03 = -3.003478904549854444639e-01 */
-	.quad	0xBF11A506ED7C9D31 /* A00 = -6.730894835681591541979e-05 */
-	.quad	0x3FF0062E4D0EA92A /* A01 = +1.001508999829250345925e+00 */
-	.quad	0xBF88AB82C2761AF3 /* A02 = -1.204588085125866091241e-02 */
-	.quad	0xBFD305028D6BD206 /* A03 = -2.971807843271395688234e-01 */
-	.quad	0xBF1607C0922D9BF1 /* A00 = -8.403885708006799337092e-05 */
-	.quad	0x3FF007606C341961 /* A01 = +1.001800940198869449560e+00 */
-	.quad	0xBF8C25E6DA487BCF /* A02 = -1.374416688582682892494e-02 */
-	.quad	0xBFD2CF0D0EE8F7B5 /* A03 = -2.938873906713255768075e-01 */
-	.quad	0xBF1B3A8480A0A16D /* A00 = -1.038688061788578038307e-04 */
-	.quad	0x3FF008BB802D02D6 /* A01 = +1.002131939589323561535e+00 */
-	.quad	0xBF8FEB8AE99FD100 /* A02 = -1.558598065819483124983e-02 */
-	.quad	0xBFD297135BD0911B /* A03 = -2.904709240558688843059e-01 */
-	.quad	0xBF20ABB9BDB75C65 /* A00 = -1.271881327357976163798e-04 */
-	.quad	0x3FF00A42A76D8CD1 /* A01 = +1.002504972472525901495e+00 */
-	.quad	0xBF91FF3D752BB9E6 /* A02 = -1.757522609380570560722e-02 */
-	.quad	0xBFD25D235C1F88B4 /* A03 = -2.869346999779154305799e-01 */
-	.quad	0xBF243D3254425461 /* A00 = -1.544116913733432829448e-04 */
-	.quad	0x3FF00BF909D1795E /* A01 = +1.002923048355647051011e+00 */
-	.quad	0xBF94304E04D44942 /* A02 = -1.971551804042204897316e-02 */
-	.quad	0xBFD2214B5E61CFA6 /* A03 = -2.832821294498394371075e-01 */
-	.quad	0xBF286070011B61CE /* A00 = -1.859795307186510085994e-04 */
-	.quad	0x3FF00DE1D5E1627E /* A01 = +1.003389201612804537689e+00 */
-	.quad	0xBF9689D5F4163F59 /* A02 = -2.201017668045266231780e-02 */
-	.quad	0xBFD1E39A11C3B42C /* A03 = -2.795167134743816728104e-01 */
-	.quad	0xBF2D250B366A79E8 /* A00 = -2.223564326486314902259e-04 */
-	.quad	0x3FF010003E134001 /* A01 = +1.003906481248123094829e+00 */
-	.quad	0xBF990C9FF91F6F81 /* A02 = -2.446222265267250853271e-02 */
-	.quad	0xBFD1A41E80084CDC /* A03 = -2.756420374218586655246e-01 */
-	.quad	0xBF314DB5DDC2A30E /* A00 = -2.640313157465248123865e-04 */
-	.quad	0x3FF012577608921B /* A01 = +1.004477940624503018441e+00 */
-	.quad	0xBF9BB9626875B0C9 /* A02 = -2.707437288829409385849e-02 */
-	.quad	0xBFD162E80768A9D0 /* A03 = -2.716617653228725615122e-01 */
-	.quad	0xBF346A6133808864 /* A00 = -3.115165050094957730625e-04 */
-	.quad	0x3FF014EAAFCC88A3 /* A01 = +1.005106627192198898157e+00 */
-	.quad	0xBF9E90BEF9BF7419 /* A02 = -2.984903716411588595059e-02 */
-	.quad	0xBFD12006545F7FAD /* A03 = -2.675796340899932457269e-01 */
-	.quad	0xBF37F180DC3848EA /* A00 = -3.653468704395550778821e-04 */
-	.quad	0x3FF017BD19147861 /* A01 = +1.005795572250939295955e+00 */
-	.quad	0xBFA0C9A14C702E07 /* A02 = -3.278831537326359207851e-02 */
-	.quad	0xBFD0DB895B650092 /* A03 = -2.633994476818851682154e-01 */
-	.quad	0xBF3BEC6AAC6D7635 /* A00 = -4.260788377246944457107e-04 */
-	.quad	0x3FF01AD1D884E719 /* A01 = +1.006547780778822565040e+00 */
-	.quad	0xBFA260B2A1B1434A /* A02 = -3.589399551186163439542e-02 */
-	.quad	0xBFD09581529E93D6 /* A03 = -2.591250712233067465817e-01 */
-	.quad	0xBF4164E26167882B /* A00 = -5.308251737086202562063e-04 */
-	.quad	0x3FF01FEF14B62B81 /* A01 = +1.007796364693348545316e+00 */
-	.quad	0xBFA4EB014538AA42 /* A02 = -4.085544557559163403315e-02 */
-	.quad	0xBFD029D36FEAF41F /* A03 = -2.525528519580024222613e-01 */
-	.quad	0xBF46F6FFF4E53DC8 /* A00 = -7.008313930700277652464e-04 */
-	.quad	0x3FF027CBB51CBBA0 /* A01 = +1.009715754956893363214e+00 */
-	.quad	0xBFA89DEC9FEC112E /* A02 = -4.807986690687680864098e-02 */
-	.quad	0xBFCF2A99464D0DB4 /* A03 = -2.434875100390009317053e-01 */
-	.quad	0xBF4DCC9C4F66A4D9 /* A00 = -9.094012482836712945103e-04 */
-	.quad	0x3FF030E7CFCCD583 /* A01 = +1.011939822882909068014e+00 */
-	.quad	0xBFACAA3B95814081 /* A02 = -5.598627281199331645611e-02 */
-	.quad	0xBFCDF78F156BE7CF /* A03 = -2.341173987004467604844e-01 */
-	.quad	0xBF5308ED74E5C7A6 /* A00 = -1.161796466103906435435e-03 */
-	.quad	0x3FF03B5986412ECB /* A01 = +1.014489674026594512313e+00 */
-	.quad	0xBFB087EBA88DCC3F /* A02 = -6.457398285947223148806e-02 */
-	.quad	0xBFCCBB9BD134862F /* A03 = -2.244753619680052991736e-01 */
-	.quad	0xBF57FA23C00DF4B5 /* A00 = -1.463446533505758208674e-03 */
-	.quad	0x3FF0473558A1BCC0 /* A01 = +1.017384859292903342975e+00 */
-	.quad	0xBFB2E702BC6360EF /* A02 = -7.383744334527241048871e-02 */
-	.quad	0xBFCB77D546379288 /* A03 = -2.145945160729250122955e-01 */
-	.quad	0xBF5DD12971557F71 /* A00 = -1.819887610814388068450e-03 */
-	.quad	0x3FF0548DDF5000A8 /* A01 = +1.020643112482540360020e+00 */
-	.quad	0xBFB571B63DA186E1 /* A02 = -8.376635555898871710045e-02 */
-	.quad	0xBFCA2D5202605148 /* A03 = -2.045080672838912594358e-01 */
-	.quad	0xBF6252B1AD5D4F17 /* A00 = -2.236697221556737096709e-03 */
-	.quad	0x3FF063738A910BF7 /* A01 = +1.024280110622155737232e+00 */
-	.quad	0xBFB8270C8E6B601B /* A02 = -9.434584118878357184013e-02 */
-	.quad	0xBFC8DD27D950A07E /* A03 = -1.942491351230763441116e-01 */
-	.quad	0xBF66470C91730CFC /* A00 = -2.719425723258004842786e-03 */
-	.quad	0x3FF073F468FCF331 /* A01 = +1.028309259519300633556e+00 */
-	.quad	0xBFBB05C2952191E4 /* A02 = -1.055566419686964629854e-01 */
-	.quad	0xBFC7886A770DE2BD /* A03 = -1.838505822486435070662e-01 */
-	.quad	0xBF6AD114AC8E98EC /* A00 = -3.273525599485007861467e-03 */
-	.quad	0x3FF0861BF53E5226 /* A01 = +1.032741506559554434119e+00 */
-	.quad	0xBFBE0C4F9B461507 /* A02 = -1.173753503881763554650e-01 */
-	.quad	0xBFC6302A037CDE3A /* A03 = -1.733448521642786954722e-01 */
-	.quad	0xBF6FFBDE2A6C2AF8 /* A00 = -3.904279630096648551207e-03 */
-	.quad	0x3FF099F2EB8E7DA3 /* A01 = +1.037585182326304034106e+00 */
-	.quad	0xBFC09C74D192DDF0 /* A02 = -1.297746680554463516444e-01 */
-	.quad	0xBFC4D571D8E3079F /* A03 = -1.627638157861470424859e-01 */
-	.quad	0xBF72E8FDC0B952AA /* A00 = -4.616728994353872309042e-03 */
-	.quad	0x3FF0AF7F273C9533 /* A01 = +1.042845872181101141152e+00 */
-	.quad	0xBFC244C512736F10 /* A02 = -1.427236881344176033792e-01 */
-	.quad	0xBFC379474F58B902 /* A03 = -1.521386277613104298645e-01 */
-	.quad	0xBF762EABAF17395B /* A00 = -5.415602341101023557701e-03 */
-	.quad	0x3FF0C6C3886F63FB /* A01 = +1.048526318502125631582e+00 */
-	.quad	0xBFC3FDF9918EA12A /* A02 = -1.561881981590514389957e-01 */
-	.quad	0xBFC21CA89ECAB895 /* A03 = -1.414995932913753196036e-01 */
-	.quad	0xBF79D387CE5B2BAE /* A00 = -6.305246822828998107258e-03 */
-	.quad	0x3FF0DFBFE2346376 /* A01 = +1.054626353847394337748e+00 */
-	.quad	0xBFC5C6DA43602620 /* A02 = -1.701309994680721970894e-01 */
-	.quad	0xBFC0C08BD8DB6631 /* A03 = -1.308760460731704100557e-01 */
-	.quad	0xBF7DDBA8E8DA9060 /* A00 = -7.289562037531366334164e-03 */
-	.quad	0x3FF0FA70F0D1B464 /* A01 = +1.061142864894713433443e+00 */
-	.quad	0xBFC79E18D92BAA7C /* A02 = -1.845122394946264732241e-01 */
-	.quad	0xBFBECBBBF74C2669 /* A03 = -1.202962378266875381749e-01 */
-	.quad	0xBF81254E76EA25DA /* A00 = -8.371937755572145950511e-03 */
-	.quad	0x3FF116D05835EBD0 /* A01 = +1.068069786618014660462e+00 */
-	.quad	0xBFC982539E2ED224 /* A02 = -1.992897531869327609755e-01 */
-	.quad	0xBFBC1B043C350159 /* A03 = -1.097872397413132278254e-01 */
-	.quad	0xBF8391ACBA863403 /* A00 = -9.555196230190082448686e-03 */
-	.quad	0x3FF134D4AA477FE2 /* A01 = +1.075398125794884141015e+00 */
-	.quad	0xBFCB7218609FEAFB /* A02 = -2.144194099235717521079e-01 */
-	.quad	0xBFB970A16CB88329 /* A03 = -9.937485603633135211599e-02 */
-	.quad	0xBF87935088E48E8B /* A00 = -1.151144902957603431692e-02 */
-	.quad	0x3FF1649892AD7DD3 /* A01 = +1.087059567413110938716e+00 */
-	.quad	0xBFCE6971DDE75409 /* A02 = -2.375929196847723912089e-01 */
-	.quad	0xBFB58291E88CB251 /* A03 = -8.402358939628952472223e-02 */
-	.quad	0xBF8DB3A62C325325 /* A00 = -1.450280973794233242702e-02 */
-	.quad	0x3FF1A9C900C6DEEA /* A01 = +1.103951457056548068891e+00 */
-	.quad	0xBFD13DBC65B0E08E /* A02 = -2.693930619311765140012e-01 */
-	.quad	0xBFB06696F62696D1 /* A03 = -6.406539449252625362252e-02 */
-	.quad	0xBF92583699F2E27A /* A00 = -1.791463198307716858659e-02 */
-	.quad	0x3FF1F451B85AA9F0 /* A01 = +1.122148246892376022288e+00 */
-	.quad	0xBFD34FD5F8288180 /* A02 = -3.017477916164565954205e-01 */
-	.quad	0xBFA6FB692825B683 /* A03 = -4.488686194495718900788e-02 */
-	.quad	0xBF9641C26E673D6F /* A00 = -2.173522757385398448959e-02 */
-	.quad	0x3FF24364DA5E2B07 /* A01 = +1.141453602790251542487e+00 */
-	.quad	0xBFD564A5A5EF5890 /* A02 = -3.342680092295120530821e-01 */
-	.quad	0xBF9B43712011A982 /* A03 = -2.662445791467283467968e-02 */
-	.quad	0xBF9A901038EC2F39 /* A00 = -2.594018313816024226548e-02 */
-	.quad	0x3FF2961356DFFEBA /* A01 = +1.161639537196534011088e+00 */
-	.quad	0xBFD775EBB17198C7 /* A02 = -3.665723069046972759644e-01 */
-	.quad	0xBF833B1A926CD462 /* A03 = -9.390075295963199591975e-03 */
-	.quad	0xBF9F396A6A461B91 /* A00 = -3.049246095317987084727e-02 */
-	.quad	0x3FF2EB53BAEF534B /* A01 = +1.182452898229899629357e+00 */
-	.quad	0xBFD97DABF8AD8BBD /* A02 = -3.982953957076310058660e-01 */
-	.quad	0x3F7B8F6A3E0F8837 /* A03 = +6.728568086119371925713e-03 */
-	.quad	0xBFA21878590F8BAA /* A00 = -3.534294211546946951064e-02 */
-	.quad	0x3FF34209790236E1 /* A01 = +1.203622315111197105253e+00 */
-	.quad	0xBFDB764C0E71BECB /* A02 = -4.290952817018306997277e-01 */
-	.quad	0x3F962FE0C03F84C0 /* A03 = +2.166701482190513949888e-02 */
-	.quad	0xBFA4B36B9AD27ECC /* A00 = -4.043136849327097492868e-02 */
-	.quad	0x3FF3990C5B12FC16 /* A01 = +1.224865298994477935679e+00 */
-	.quad	0xBFDD5AABB0D01390 /* A02 = -4.586590983092770912322e-01 */
-	.quad	0x3FA21DAF5CA162DB /* A03 = +3.538272863142363083844e-02 */
-	.quad	0xBFA7645E4D7BF28B /* A00 = -4.568762489177399105378e-02 */
-	.quad	0x3FF3EF2FD51C0D9F /* A01 = +1.245895225962932562069e+00 */
-	.quad	0xBFDF26377E1B686E /* A02 = -4.867075664057044503963e-01 */
-	.quad	0x3FA8803E756EE812 /* A03 = +4.785342391501513914509e-02 */
-	.quad	0xBFAA210925C64413 /* A00 = -5.103329263796054643398e-02 */
-	.quad	0x3FF44349F897D8E7 /* A01 = +1.266427966181760345066e+00 */
-	.quad	0xBFE06A7B02C6D8E2 /* A02 = -5.129981092675530707226e-01 */
-	.quad	0x3FAE3F194734F5D0 /* A03 = +5.907515520309980505687e-02 */
-	.quad	0xBFACDE48F8A19BBB /* A00 = -5.638340029764018351832e-02 */
-	.quad	0x3FF49439D5466582 /* A01 = +1.286187966447272845727e+00 */
-	.quad	0xBFE131C7C1063DDC /* A02 = -5.373266954429101183166e-01 */
-	.quad	0x3FB1ADEEC36AD805 /* A03 = +6.906025191241844940482e-02 */
-	.quad	0xBFAF905D8F585680 /* A00 = -6.164829611604449866036e-02 */
-	.quad	0x3FF4E0ED1FD27F99 /* A01 = +1.304913639360142818546e+00 */
-	.quad	0xBFE1E7A859DC1D3D /* A02 = -5.595285182070380836095e-01 */
-	.quad	0x3FB3ED018E4642A1 /* A03 = +7.783517573831001679086e-02 */
-	.quad	0xBFB11595104160BA /* A00 = -6.673556944713512906198e-02 */
-	.quad	0x3FF528650340490B /* A01 = +1.322361958217302513319e+00 */
-	.quad	0xBFE28B14B40BC974 /* A02 = -5.794776455425521000109e-01 */
-	.quad	0x3FB5DF49F5BAF6D7 /* A03 = +8.543836831355676453281e-02 */
-	.quad	0xBFB2513A97344BA4 /* A00 = -7.155195418844911836587e-02 */
-	.quad	0x3FF569BA0DB5EE14 /* A01 = +1.338312200124055273420e+00 */
-	.quad	0xBFE31B53A8B67B20 /* A02 = -5.970857901737396389308e-01 */
-	.quad	0x3FB787F297BB0544 /* A03 = +9.191814617499455275507e-02 */
-	.quad	0xBFB37512E848FAFA /* A00 = -7.600515528700305112331e-02 */
-	.quad	0x3FF5A41F33B403C8 /* A01 = +1.352568819013173495591e+00 */
-	.quad	0xBFE397F6EA9A58A5 /* A02 = -6.123003561103997904880e-01 */
-	.quad	0x3FB8EAA9FF25CA06 /* A03 = +9.733068923177520814782e-02 */
-	.quad	0xBFB47B3E603AFC5D /* A00 = -8.000554894805263217439e-02 */
-	.quad	0x3FF5D6E3EDE40487 /* A01 = +1.364963464031718975988e+00 */
-	.quad	0xBFE400D5BCA6D631 /* A02 = -6.251019177058819709103e-01 */
-	.quad	0x3FBA0B830ED567FE /* A03 = +1.017381583418739132707e-01 */
-	.quad	0xBFB5BBFE8AC90496 /* A00 = -8.489981544791400103200e-02 */
-	.quad	0x3FF612BA70107E95 /* A01 = +1.379572332145390989311e+00 */
-	.quad	0xBFE477EAF1FA7693 /* A02 = -6.396383978023599814478e-01 */
-	.quad	0x3FBB4784B7C08A95 /* A03 = +1.065600346196709652391e-01 */
-	.quad	0xBFB6D5D940743939 /* A00 = -8.920057128509463473254e-02 */
-	.quad	0x3FF644A8748F70CE /* A01 = +1.391762214006166953340e+00 */
-	.quad	0xBFE4D646AB07EA37 /* A02 = -6.511567440459832267763e-01 */
-	.quad	0x3FBC354F4E1D5292 /* A03 = +1.101884427747086558913e-01 */
-	.quad	0xBFB7223D19E4F3D1 /* A00 = -9.036619074045339206069e-02 */
-	.quad	0x3FF6518FEB42B7FA /* A01 = +1.394912642466350494175e+00 */
-	.quad	0xBFE4ED86CB87498C /* A02 = -6.539949393430091184598e-01 */
-	.quad	0x3FBC6D29F28CCA9B /* A03 = +1.110407082713131127205e-01 */
-	.quad	0xBFB6878652FF6312 /* A00 = -8.800544287022329936754e-02 */
-	.quad	0x3FF63948C302D040 /* A01 = +1.388985406648330922508e+00 */
-	.quad	0xBFE4C4E2E7904E17 /* A02 = -6.490339777687407218920e-01 */
-	.quad	0x3FBC127356CA1ABE /* A03 = +1.096565329445224612481e-01 */
-	.quad	0xBFB4F5D18B0C91D6 /* A00 = -8.187589306596207427980e-02 */
-	.quad	0x3FF5FD27EB7DD0B8 /* A01 = +1.374305648697413673176e+00 */
-	.quad	0xBFE464E01A2B2FC6 /* A02 = -6.373138915164353601739e-01 */
-	.quad	0x3FBB460547674A30 /* A03 = +1.065371798825160976065e-01 */
-	.quad	0xBFB26642FA16A685 /* A00 = -7.187288861919156890412e-02 */
-	.quad	0x3FF59F9BEDE1C95A /* A01 = +1.351467065073470141812e+00 */
-	.quad	0xBFE3D67920C8FBEA /* A02 = -6.199308052381387046381e-01 */
-	.quad	0x3FBA24F6A8D3CBC1 /* A03 = +1.021265184570401413078e-01 */
-	.quad	0xBFADB5294794F097 /* A00 = -5.802277563859197656582e-02 */
-	.quad	0x3FF523EA7B9CF453 /* A01 = +1.321268542159732772845e+00 */
-	.quad	0xBFE322A8B55E35DB /* A02 = -5.979808370918208160205e-01 */
-	.quad	0x3FB8C8673B1B3E37 /* A03 = +9.680791085269722928697e-02 */
-	.quad	0xBFA4B7D661965C6A /* A00 = -4.046506825687219699450e-02 */
-	.quad	0x3FF48DE3E2CE3122 /* A01 = +1.284641157110919085227e+00 */
-	.quad	0xBFE251FED1A7F445 /* A02 = -5.725092024655472622285e-01 */
-	.quad	0x3FB745699FCABDB9 /* A03 = +9.090290213747821701507e-02 */
-	.quad	0xBF93E60456E4EE1D /* A00 = -1.943213253365004902773e-02 */
-	.quad	0x3FF3E1A14E628A59 /* A01 = +1.242585474196536532432e+00 */
-	.quad	0xBFE16C5AB660E876 /* A02 = -5.444768488007543094653e-01 */
-	.quad	0x3FB5AD33AA8C188F /* A03 = +8.467410005332197397987e-02 */
-	.quad	0x3F738C17C47C7961 /* A00 = +4.772274820224659853951e-03 */
-	.quad	0x3FF3234DDE3BD146 /* A01 = +1.196119182682268355933e+00 */
-	.quad	0xBFE078C0D77A9D3B /* A02 = -5.147403915952176722826e-01 */
-	.quad	0x3FB40D74B3E276B8 /* A03 = +7.833032027925923568290e-02 */
-	.quad	0x3FA0474BECC689C7 /* A00 = +3.179394975019849550746e-02 */
-	.quad	0x3FF256FB4FA7D18A /* A01 = +1.146235762743432307076e+00 */
-	.quad	0xBFDEFA8E3FB285E2 /* A02 = -4.840427038235174395098e-01 */
-	.quad	0x3FB270C007493D59 /* A03 = +7.203293016322244446403e-02 */
-	.quad	0x3FAF5BD51E479BDC /* A00 = +6.124750132203590768931e-02 */
-	.quad	0x3FF18081D0B53BC5 /* A01 = +1.093873801484492647162e+00 */
-	.quad	0xBFDCFE2439BD0C03 /* A02 = -4.530115665294831006626e-01 */
-	.quad	0x3FB0DEFE5A45AFDD /* A03 = +6.590261176978580437424e-02 */
-	.quad	0x3FB7BD5D2806EA26 /* A00 = +9.273321368429118805032e-02 */
-	.quad	0x3FF0A369E35B4440 /* A01 = +1.039895904647224256223e+00 */
-	.quad	0xBFDB04BC5C9951E7 /* A02 = -4.221640495573226181669e-01 */
-	.quad	0x3FAEBBBAA9D6DEEF /* A03 = +6.002600978120919278380e-02 */
-	.quad	0x3FC01BE411098DBC /* A00 = +1.258511622610124502941e-01 */
-	.quad	0x3FEF85BDABC031C1 /* A01 = +9.850757936961188621083e-01 */
-	.quad	0xBFD91521375097C2 /* A02 = -3.919146576102968682065e-01 */
-	.quad	0x3FABE26F0086D982 /* A03 = +5.446192628317005068883e-02 */
-	.quad	0x3FC481D7FF5776B9 /* A00 = +1.602125164781023347604e-01 */
-	.quad	0x3FEDC3506C1E7218 /* A01 = +9.300920592973538347792e-01 */
-	.quad	0xBFD7349A88DA7D4F /* A02 = -3.625856720409119104964e-01 */
-	.quad	0x3FA936E2DFF8E2AE /* A03 = +4.924687370334389358018e-02 */
-	.quad	0x3FC90471F96FA27A /* A00 = +1.954481571149420671141e-01 */
-	.quad	0x3FEC0451601987A2 /* A01 = +8.755270840595026360376e-01 */
-	.quad	0xBFD5671CD4B898DC /* A02 = -3.344184949259110251063e-01 */
-	.quad	0x3FA6BB9594603B67 /* A03 = +4.439990459660841243261e-02 */
-	.quad	0x3FCFD8ADB9ED944C /* A00 = +2.488000066615846384011e-01 */
-	.quad	0x3FE978C073F6809A /* A01 = +7.959902062321078108909e-01 */
-	.quad	0xBFD2DF7E00BCD5A9 /* A02 = -2.948908812716931060471e-01 */
-	.quad	0x3FA3614033D490B2 /* A03 = +3.785133965200894456959e-02 */
-	.quad	0x3FD4846A12AFE5A0 /* A00 = +3.205819303981005674586e-01 */
-	.quad	0x3FE63A1147D40472 /* A01 = +6.945883181471244061100e-01 */
-	.quad	0xBFCFA2268AD34450 /* A02 = -2.471359422548027318101e-01 */
-	.quad	0x3F9F150201D9FFE0 /* A03 = +3.035357605267552383310e-02 */
-	.quad	0x3FD9018641F82BEB /* A00 = +3.907180446846598154131e-01 */
-	.quad	0x3FE33B7C220FFBDC /* A01 = +6.010113396913498995389e-01 */
-	.quad	0xBFCA4E4187E29C86 /* A02 = -2.055131829740483584423e-01 */
-	.quad	0x3F98C30CED19F8F4 /* A03 = +2.418155858185229434287e-02 */
-	.quad	0x3FDD4B8255BEB078 /* A00 = +4.577337109901757905561e-01 */
-	.quad	0x3FE0858B19D3A49B /* A01 = +5.163016800335243905451e-01 */
-	.quad	0xBFC5BC929EACE564 /* A02 = -1.698172831327539045176e-01 */
-	.quad	0x3F93A083CE57DE2B /* A03 = +1.916700312537337677621e-02 */
-	.quad	0x3FE0A8E5E039295C /* A00 = +5.206174258576470315063e-01 */
-	.quad	0x3FDC35E1234583FE /* A01 = +4.407885403107342225937e-01 */
-	.quad	0xBFC1DE034E31AEB9 /* A02 = -1.395877963835710222629e-01 */
-	.quad	0x3F8EFDEBB3471BDC /* A03 = +1.513275280821162888101e-02 */
-	.quad	0x3FE2851B603CB2A5 /* A00 = +5.787484054213406503564e-01 */
-	.quad	0x3FD7F4A44ABBB286 /* A01 = +3.743067483726821853551e-01 */
-	.quad	0xBFBD3EEB67087DE7 /* A02 = -1.142413260026767657385e-01 */
-	.quad	0x3F8864F38329E8BD /* A03 = +1.191129917173260922836e-02 */
-	.quad	0x3FE437DBE3C34AC1 /* A00 = +6.318187187665317283702e-01 */
-	.quad	0x3FD43F6F789441B5 /* A01 = +3.163717916040938438194e-01 */
-	.quad	0xBFB7D92E7901B9A4 /* A02 = -9.315767721429907277653e-02 */
-	.quad	0x3F8327ED342308E1 /* A03 = +9.353497651663324544136e-03 */
-	.quad	0x3FE5C0977766D55C /* A00 = +6.797597248138731451661e-01 */
-	.quad	0x3FD10B42A764D8F9 /* A01 = +2.663122782427219115142e-01 */
-	.quad	0xBFB3633351D3D70F /* A02 = -7.573242900602060456716e-02 */
-	.quad	0x3F7E079E30FF899C /* A03 = +7.331483779099558922843e-03 */
-	.quad	0x3FE7202CE08A88C4 /* A00 = +7.226776490754436288455e-01 */
-	.quad	0x3FCC973EB5662B01 /* A01 = +2.233656297433626314319e-01 */
-	.quad	0xBFAF70A455F9920B /* A02 = -6.140626477716545211782e-02 */
-	.quad	0x3F77812411CE99B6 /* A03 = +5.738392731393584730859e-03 */
-	.quad	0x3FE85879424095B1 /* A00 = +7.608000082006382003286e-01 */
-	.quad	0x3FC7E73BD1674D84 /* A01 = +1.867441914060742336190e-01 */
-	.quad	0xBFA96F84E4BF333B /* A02 = -4.967894832916504993525e-02 */
-	.quad	0x3F72606DDCA6E117 /* A03 = +4.486493251924870105662e-03 */
-	.quad	0x3FE96BFE4957F4DD /* A00 = +7.944327766887472330737e-01 */
-	.quad	0x3FC3ED4780D25478 /* A01 = +1.556786898624158421711e-01 */
-	.quad	0xBFA489C5F9A56B58 /* A02 = -4.011362717093075458408e-02 */
-	.quad	0x3F6CB5DC17E9AD2A /* A03 = +3.504686231556104931972e-03 */
-	.quad	0x3FEA5D9CB2F41234 /* A00 = +8.239272589858672724006e-01 */
-	.quad	0x3FC091A758374DCF /* A01 = +1.294449978582705440555e-01 */
-	.quad	0xBFA08E436D4B5CE0 /* A02 = -3.233538350257858517978e-02 */
-	.quad	0x3F666997AD53E6B7 /* A03 = +2.735897297154145629133e-03 */
-	.quad	0x3FEB3060342CB850 /* A00 = +8.496552485501158713532e-01 */
-	.quad	0x3FBB7D30BBC7DC1B /* A01 = +1.073790033768634993860e-01 */
-	.quad	0xBF9AA6BA3443D9E3 /* A02 = -2.602663940430173170060e-02 */
-	.quad	0x3F617CA764B7850B /* A03 = +2.134634914668814050648e-03 */
-	.quad	0x3FEBE759A6A0C7B8 /* A00 = +8.719909910635044170135e-01 */
-	.quad	0x3FB6C10DE6A703FF /* A01 = +8.888327485239243264115e-02 */
-	.quad	0xBF956C566D8BE1F6 /* A02 = -2.092108768099084498138e-02 */
-	.quad	0x3F5B46D1A4A59CF8 /* A03 = +1.664833764687232917079e-03 */
-	.quad	0x3FEC858494887A04 /* A00 = +8.912985707318630268503e-01 */
-	.quad	0x3FB2CC31F543394D /* A01 = +7.342827070099140762682e-02 */
-	.quad	0xBF9133477FF69137 /* A02 = -1.679717749142747504343e-02 */
-	.quad	0x3F5544482FBB4DA5 /* A03 = +1.298017973501022466823e-03 */
-	.quad	0x3FED0DB59D0E32E9 /* A00 = +9.079235141267335551518e-01 */
-	.quad	0x3FAF006BAFFC6EF4 /* A01 = +6.055008433597022787787e-02 */
-	.quad	0xBF8B97146FA2B97A /* A02 = -1.347175565419144252499e-02 */
-	.quad	0x3F5093B01F4CDC69 /* A03 = +1.011774057770665211434e-03 */
-	.quad	0x3FEDB487C3EC457C /* A00 = +9.282873942012623835751e-01 */
-	.quad	0x3FA7390C09D0BD1D /* A01 = +4.535710925881118044112e-02 */
-	.quad	0xBF83D9F7C3181106 /* A02 = -9.693084374710735778846e-03 */
-	.quad	0x3F46E34A0A3C0E64 /* A03 = +6.984817050299072134500e-04 */
-	.quad	0x3FEE5FFCB4E6EB00 /* A00 = +9.492171796076434020506e-01 */
-	.quad	0x3F9F4913ED00AADF /* A01 = +3.055220731782070861526e-02 */
-	.quad	0xBF79670BD0E59B5C /* A02 = -6.201788097633133961528e-03 */
-	.quad	0x3F3BC998EBCAF96D /* A03 = +4.240034429975534616304e-04 */
-	.quad	0x3FEEDBA41E9542FE /* A00 = +9.643116566968215064293e-01 */
-	.quad	0x3F94F5DD18D9C24D /* A01 = +2.046914543319848858727e-02 */
-	.quad	0xBF7034896AA122B9 /* A02 = -3.956352980886528904192e-03 */
-	.quad	0x3F30DCCB47810B39 /* A03 = +2.573009765038273091199e-04 */
-	.quad	0x3FEF33F2882520ED /* A00 = +9.750912341196716903724e-01 */
-	.quad	0x3F8BF37F2CF553FF /* A01 = +1.364802699996836392315e-02 */
-	.quad	0xBF649F6F05A69619 /* A02 = -2.517430152880317534986e-03 */
-	.quad	0x3F247623C950AAC9 /* A03 = +1.561087307505231250044e-04 */
-	.quad	0x3FEF727757751741 /* A00 = +9.827229221489021115943e-01 */
-	.quad	0x3F828E67912C4400 /* A01 = +9.060677640748693306705e-03 */
-	.quad	0xBF5A2F51A806CC2C /* A02 = -1.598195784123355826789e-03 */
-	.quad	0x3F18D35D7687E613 /* A03 = +9.470231965016282719549e-05 */
-	.quad	0x3FEF9E6325C5942A /* A00 = +9.880843866091073568469e-01 */
-	.quad	0x3F788AB117618F76 /* A01 = +5.991641772286606867914e-03 */
-	.quad	0xBF5096EAB0B1EA89 /* A02 = -1.012543859160305046233e-03 */
-	.quad	0x3F0E1E50EC4435AB /* A03 = +5.744633156910412119652e-05 */
-	.quad	0x3FEFBD0784049369 /* A00 = +9.918248728250605994461e-01 */
-	.quad	0x3F702BBD8294035F /* A01 = +3.947963975634432264028e-03 */
-	.quad	0xBF44FB55E0F00593 /* A02 = -6.403130845457509273330e-04 */
-	.quad	0x3F0244DCD723230A /* A03 = +3.484534217219031730379e-05 */
-	.quad	0x3FEFD245E2366A43 /* A00 = +9.944180887426415926811e-01 */
-	.quad	0x3F653D82EC088433 /* A01 = +2.592807490387838333795e-03 */
-	.quad	0xBF3A7DF75E013CB8 /* A02 = -4.042366908878036561859e-04 */
-	.quad	0x3EF6298E69F991CD /* A03 = +2.113564425911141559972e-05 */
-	.quad	0x3FEFE0EAA508BC69 /* A00 = +9.962056372950317539861e-01 */
-	.quad	0x3F5BD0771AF3FDDA /* A01 = +1.697651208644282514598e-03 */
-	.quad	0xBF30B2E1254DE571 /* A02 = -2.548026725928887099328e-04 */
-	.quad	0x3EEAE28B70EC0256 /* A03 = +1.281973848454955042307e-05 */
-	.quad	0x3FEFEAF5303D7F96 /* A00 = +9.974313680831865536192e-01 */
-	.quad	0x3F5229111365657E /* A01 = +1.108423877289460134782e-03 */
-	.quad	0xBF250572D04DFE66 /* A02 = -1.603796628408704519168e-04 */
-	.quad	0x3EE04E89BB57C981 /* A03 = +7.775682983689149966743e-06 */
-	.quad	0x3FEFF1CF52F1CF44 /* A00 = +9.982678051005469122003e-01 */
-	.quad	0x3F47A71316147CEB /* A01 = +7.218211359577819110842e-04 */
-	.quad	0xBF1A6D7604055719 /* A02 = -1.008132248946049582547e-04 */
-	.quad	0x3ED3C8047586A85C /* A03 = +4.716233739913014633626e-06 */
-	.quad	0x3FEFF6770369EF69 /* A00 = +9.988360468555416149528e-01 */
-	.quad	0x3F3EBB261180FBF0 /* A01 = +4.689186039321105101130e-04 */
-	.quad	0xBF1097754FE19D7F /* A02 = -6.329206004950480057066e-05 */
-	.quad	0x3EC7FEFF83BCA0A7 /* A03 = +2.860556404988488738366e-06 */
-	.quad	0x3FEFF99D42371AC4 /* A00 = +9.992204945818561334647e-01 */
-	.quad	0x3F33EB2AEC271F59 /* A01 = +3.039340773764907474054e-04 */
-	.quad	0xBF04CF18E0FC0D79 /* A02 = -3.968996690952969588805e-05 */
-	.quad	0x3EBD1BDBD6019BE9 /* A03 = +1.735021065507727833886e-06 */
-	.quad	0x3FEFFBBCA32B0D91 /* A00 = +9.994795977476532700123e-01 */
-	.quad	0x3F29C41E1615110A /* A01 = +1.965796209707565346710e-04 */
-	.quad	0xBEFA11F93D9DCB5A /* A02 = -2.486248909101414873235e-05 */
-	.quad	0x3EB1A7CA4546F7A7 /* A03 = +1.052345642723709228769e-06 */
-	.quad	0x3FEFFD298B8E8DE2 /* A00 = +9.996535993308806045121e-01 */
-	.quad	0x3F20A1C42D523C5B /* A01 = +1.268913244172078754520e-04 */
-	.quad	0xBEF0507A364AFAE4 /* A02 = -1.555859070622834605755e-05 */
-	.quad	0x3EA56ACA17E7CDF4 /* A03 = +6.382806956848098872313e-07 */
-	.quad	0x3FEFFE1DC82BA5A3 /* A00 = +9.997700604991915929176e-01 */
-	.quad	0x3F156E73B90F1769 /* A01 = +8.175450626798714452801e-05 */
-	.quad	0xBEE4663579D0A09F /* A02 = -9.727122057226747625365e-06 */
-	.quad	0x3E99FAF6FEC5D4C1 /* A03 = +3.871371052824002996020e-07 */
-	.quad	0x3FEFFEF8D0BB5E81 /* A00 = +9.998745037837154514548e-01 */
-	.quad	0x3F06686DA18D39C3 /* A01 = +4.273972098777251447726e-05 */
-	.quad	0xBED46BC298073E90 /* A02 = -4.868731025855742842491e-06 */
-	.quad	0x3E88E42286B9D0FD /* A03 = +1.854535328530838170114e-07 */
-	.quad	0x3FEFFF8DBC68DDC7 /* A00 = +9.999455146670975791423e-01 */
-	.quad	0x3EF26B2953A80AF0 /* A01 = +1.756534514108903368909e-05 */
-	.quad	0xBEBFC4472D580F83 /* A02 = -1.893443529411295465239e-06 */
-	.quad	0x3E72505B4553D19F /* A03 = +6.822456673547912277047e-08 */
-	.quad	0x3FEFFFCED1276609 /* A00 = +9.999765477215883935358e-01 */
-	.quad	0x3EDE1A94C7CC58F5 /* A01 = +7.177313020153979672606e-06 */
-	.quad	0xBEA8A2C988744E57 /* A02 = -7.342066660497443762363e-07 */
-	.quad	0x3E5AF30036BBBAF4 /* A03 = +2.509841882843541084885e-08 */
-	.quad	0x3FEFFFEAFE70FCFC /* A00 = +9.999899835164849370983e-01 */
-	.quad	0x3EC879175E3549F5 /* A01 = +2.917410471128503564412e-06 */
-	.quad	0xBE930E36677D1813 /* A02 = -2.839493400307523115929e-07 */
-	.quad	0x3E43D4005B42D48F /* A03 = +9.233192745401904898013e-09 */
-	.quad	0x3ff0000000000000
-	.quad	0x0000000000000000
-	.quad	0x0000000000000000
-	.quad	0x0000000000000000
-	.align	16
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSignMask */
-	.align	16
-	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _sAbsMask */
-	.align	16
-	.long	0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000 /* _iExpMantMask */
-	.align	16
-	.long	0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000 /* _iExpMask */
-	.align	16
-	.long	0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000 /* _iMinIdxOfsMask */
-	.align	16
-	.long	0x04280000, 0x04280000, 0x04280000, 0x04280000 /* _iMaxIdxMask */
-	.align	16
-	.type	__svml_stanh_data_internal, @object
-	.size	__svml_stanh_data_internal, .-__svml_stanh_data_internal
+	/* Cold case. edx has 1s where there was a special value that
+	   needs to be handled by a tanhf call. Optimize for code size
+	   moreso than speed here. */
+L(SPECIAL_VALUES_BRANCH):
+	# LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm12
+	/* Stack coming in 8-byte misaligned (rsp % 16 == 8). Subtract 56
+       so that rsp is 16-byte aligned at the call to tanhf. */
+	subq	$56, %rsp
+
+	movups	%xmm0, 24(%rsp)
+	movups	%xmm12, 40(%rsp)
+
+	/* Use rbx/rbp for callee save registers as they get short
+       encoding for many instructions (as compared with r12/r13). */
+	movq	%rbx, (%rsp)
+	cfi_offset(rbx, -16)
+	movq	%rbp, 8(%rsp)
+	cfi_offset(rbp, -8)
+	/* edx has 1s where there was a special value that needs to be handled
+	   by a tanhf call.  */
+	movl	%edx, %ebx
+L(SPECIAL_VALUES_LOOP):
+	# LOE rbx rbp r12 r13 r14 r15
+	/* use rbp as index for special value that is saved across calls to
+	   tanhf. We technically don't need a callee save register here as offset
+	   to rsp is always [0, 12] so we can restore rsp by realigning to 64.
+	   Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
+	   in the loop.  */
+	xorl	%ebp, %ebp
+	bsfl	%ebx, %ebp
+
+	/* Scalar math function call to process special input.  */
+	movss	40(%rsp, %rbp, 4), %xmm0
+	call	tanhf@PLT
+	/* No good way to avoid the store-forwarding fault this will cause on
+	   return. `lfence` avoids the SF fault but at greater cost as it
+	   serializes stack/callee save restoration.  */
+	movss	%xmm0, 24(%rsp, %rbp, 4)
+
+	leal	-1(%rbx), %eax
+	andl	%eax, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+	# LOE r12 r13 r14 r15
+	/* All results have been written to 24(%rsp).  */
+	movups	24(%rsp), %xmm0
+	movq	(%rsp), %rbx
+	cfi_restore(rbx)
+	movq	8(%rsp), %rbp
+	cfi_restore(rbp)
+	addq	$56, %rsp
+	ret
+END(_ZGVbN4v_tanhf_sse4)
-- 
2.34.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v1 7/7] x86: Optimize svml_s_tanhf4_core_sse4.S
  2022-06-07 20:06 ` [PATCH v1 7/7] x86: Optimize svml_s_tanhf4_core_sse4.S Noah Goldstein
@ 2022-06-08  2:42   ` H.J. Lu
  2022-06-08  3:07   ` H.J. Lu
  1 sibling, 0 replies; 48+ messages in thread
From: H.J. Lu @ 2022-06-08  2:42 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Kolesov, Andrey

On Tue, Jun 7, 2022 at 1:07 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Optimizations are:
>     1. Reduce code size (-112 bytes).
>     2. Remove redundant move instructions.
>     3. Slightly improve instruction selection/scheduling where
>        possible.
>     4. Prefer registers which get short instruction encoding.
>     5. Reduce rodata size (-4k+ rodata is shared with avx2).
>
> Result is roughly a 15-16% speedup:
>
>        Function, New Time, Old Time, New / Old
>  _ZGVbN4v_tanhf,    3.158,    3.749,     0.842
> ---
>  .../fpu/multiarch/svml_s_tanhf4_core_sse4.S   | 864 +++---------------
>  1 file changed, 137 insertions(+), 727 deletions(-)
>
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
> index 532ebbac65..54580ebd79 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
> @@ -70,761 +70,171 @@
>   *
>   */
>
> -/* Offsets for data table __svml_stanh_data_internal
> - */
> -#define _dbP                           0
> -#define _sSignMask                     4288
> -#define _sAbsMask                      4304
> -#define _iExpMantMask                  4320
> -#define _iExpMask                      4336
> -#define _iMinIdxOfsMask                        4352
> -#define _iMaxIdxMask                   4368
>
>  #include <sysdep.h>
>
> +/* tanhf data tables for avx2 and sse4 implementations defined here.
> + */
> +#define ONLY_DECL_OFFSET
> +#include "svml_s_tanhf_rodata.S"
> +
>         .section .text.sse4, "ax", @progbits
>  ENTRY(_ZGVbN4v_tanhf_sse4)
> -       subq    $72, %rsp
> -       cfi_def_cfa_offset(80)
> -       movaps  %xmm0, %xmm5
> +       /* Save copy of input in xmm12.  */
> +       movaps  %xmm0, %xmm12
>
>         /* Here huge arguments, INF and NaNs are filtered out to callout. */
> -       movdqu  _iExpMantMask+__svml_stanh_data_internal(%rip), %xmm9
> -       lea     _dbP+16+__svml_stanh_data_internal(%rip), %r8
> -       pand    %xmm5, %xmm9
> +       movdqu  TANHF_DATA(_iExpMantMask)(%rip), %xmm3
> +       pand    %xmm0, %xmm3
>
> -       /* if VMIN, VMAX is defined for I type */
> +
> +       /* Selection of arguments between [0, 0x04280000] into xmm3.  */
>         pxor    %xmm7, %xmm7
> -       movdqa  %xmm9, %xmm6
> -       psubd   _iMinIdxOfsMask+__svml_stanh_data_internal(%rip), %xmm9
> +       /* Save xmm3 for special values check at end.  */
> +       movdqa  %xmm3, %xmm8
> +       psubd   TANHF_DATA(_iMinIdxOfsMask)(%rip), %xmm3
> +       pmaxsd  %xmm7, %xmm3
> +       pminsd  TANHF_DATA(_iMaxIdxMask)(%rip), %xmm3
> +       psrld   $14, %xmm3
> +
> +       movq    %xmm3, %rcx
> +       movl    %ecx, %edx
> +       shrq    $32, %rcx
> +
> +       pshufd  $0x0e, %xmm3, %xmm3
> +       movq    %xmm3, %rdi
> +       movl    %edi, %esi
> +       shrq    $32, %rdi
> +
> +       movaps  TANHF_DATA(_sAbsMask)(%rip), %xmm1
> +       andps   %xmm1, %xmm0
> +
> +       leaq    TANHF_DATA(_lookupTable)(%rip), %rax
> +       movups  (%rdx, %rax), %xmm2
> +       movups  (%rcx, %rax), %xmm6
>
>         /*
>          *  small table specific variables *
>          *  Constant loading
>          */
> -       movdqu  _iMaxIdxMask+__svml_stanh_data_internal(%rip), %xmm10
> -       movdqa  %xmm9, %xmm11
> -       movdqa  %xmm9, %xmm8
> -       pcmpgtd %xmm10, %xmm11
> -       pcmpgtd %xmm7, %xmm8
> -       movdqa  %xmm11, %xmm14
> -       pand    %xmm8, %xmm9
> -       andps   %xmm11, %xmm10
> -       andnps  %xmm9, %xmm14
> -       orps    %xmm10, %xmm14
> -       psrld   $14, %xmm14
> -       movd    %xmm14, %edx
> -       pshufd  $1, %xmm14, %xmm12
> -       pshufd  $2, %xmm14, %xmm13
> -       movd    %xmm12, %ecx
> -       pshufd  $3, %xmm14, %xmm15
> -       movups  _sAbsMask+__svml_stanh_data_internal(%rip), %xmm3
> -       movslq  %edx, %rdx
> -       andps   %xmm5, %xmm3
> -       movslq  %ecx, %rcx
> -       pcmpgtd _iExpMask+__svml_stanh_data_internal(%rip), %xmm6
> -       movd    %xmm13, %esi
> -       movups  -16(%rdx, %r8), %xmm2
> -       movaps  %xmm2, %xmm0
> -       movd    %xmm15, %edi
> -       movmskps %xmm6, %eax
> -       movups  -16(%rcx, %r8), %xmm6
> -       unpcklpd %xmm6, %xmm0
> +       movaps  %xmm2, %xmm4
> +       movlhps %xmm6, %xmm4
>         unpckhpd %xmm6, %xmm2
> -       cvtps2pd %xmm3, %xmm6
> -       movhlps %xmm3, %xmm3
> -       cvtps2pd %xmm3, %xmm3
> -       movslq  %esi, %rsi
> -       movslq  %edi, %rdi
> -       movups  (%rcx, %r8), %xmm8
> -       movups  (%rdx, %r8), %xmm12
> -       movups  (%rsi, %r8), %xmm13
> -       movaps  %xmm12, %xmm10
> -       movups  (%rdi, %r8), %xmm9
> +
> +       cvtps2pd %xmm0, %xmm6
> +       movhlps %xmm0, %xmm0
> +       cvtps2pd %xmm0, %xmm0
> +
> +       movups  16(%rdx, %rax), %xmm5
> +       movups  16(%rsi, %rax), %xmm13
> +
> +       movaps  %xmm5, %xmm10
>         movaps  %xmm13, %xmm11
> -       unpckhpd %xmm8, %xmm12
> -       unpckhpd %xmm9, %xmm13
> -       mulpd   %xmm6, %xmm12
> -       mulpd   %xmm3, %xmm13
> -       unpcklpd %xmm8, %xmm10
> -       unpcklpd %xmm9, %xmm11
> -       addpd   %xmm10, %xmm12
> +
> +       movups  16(%rcx, %rax), %xmm7
> +       movups  16(%rdi, %rax), %xmm3
> +
> +       unpckhpd %xmm7, %xmm5
> +       unpckhpd %xmm3, %xmm13
> +
> +       mulpd   %xmm6, %xmm5
> +       mulpd   %xmm0, %xmm13
> +
> +       movlhps %xmm7, %xmm10
> +       movlhps %xmm3, %xmm11
> +
> +       addpd   %xmm10, %xmm5
>         addpd   %xmm11, %xmm13
> -       mulpd   %xmm6, %xmm12
> -       mulpd   %xmm3, %xmm13
> -       addpd   %xmm2, %xmm12
> -       movups  -16(%rsi, %r8), %xmm1
> -       movups  -16(%rdi, %r8), %xmm7
> -       movaps  %xmm1, %xmm14
> -       unpckhpd %xmm7, %xmm1
> -       addpd   %xmm1, %xmm13
> -       mulpd   %xmm12, %xmm6
> -       mulpd   %xmm13, %xmm3
> -       addpd   %xmm0, %xmm6
> -       unpcklpd %xmm7, %xmm14
> -       addpd   %xmm14, %xmm3
> -       cvtpd2ps %xmm6, %xmm0
> -       cvtpd2ps %xmm3, %xmm1
> -       movups  _sSignMask+__svml_stanh_data_internal(%rip), %xmm4
> -       movlhps %xmm1, %xmm0
> -       andps   %xmm5, %xmm4
> -       orps    %xmm4, %xmm0
> -       testl   %eax, %eax
>
> -       /* Go to special inputs processing branch */
> -       jne     L(SPECIAL_VALUES_BRANCH)
> -       # LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm5
> +       mulpd   %xmm6, %xmm5
> +       mulpd   %xmm0, %xmm13
>
> -       /* Restore registers
> -        * and exit the function
> -        */
> +       addpd   %xmm2, %xmm5
>
> -L(EXIT):
> -       addq    $72, %rsp
> -       cfi_def_cfa_offset(8)
> -       ret
> -       cfi_def_cfa_offset(80)
> +       movups  (%rsi, %rax), %xmm2
> +       movups  (%rdi, %rax), %xmm7
>
> -       /* Branch to process
> -        * special inputs
> -        */
> +       movaps  %xmm2, %xmm3
>
> -L(SPECIAL_VALUES_BRANCH):
> -       movups  %xmm5, 32(%rsp)
> -       movups  %xmm0, 48(%rsp)
> -       # LOE rbx rbp r12 r13 r14 r15 eax
> -
> -       xorl    %edx, %edx
> -       movq    %r12, 16(%rsp)
> -       cfi_offset(12, -64)
> -       movl    %edx, %r12d
> -       movq    %r13, 8(%rsp)
> -       cfi_offset(13, -72)
> -       movl    %eax, %r13d
> -       movq    %r14, (%rsp)
> -       cfi_offset(14, -80)
> -       # LOE rbx rbp r15 r12d r13d
> -
> -       /* Range mask
> -        * bits check
> -        */
> +       unpckhpd %xmm7, %xmm2
> +       movlhps %xmm7, %xmm3
>
> -L(RANGEMASK_CHECK):
> -       btl     %r12d, %r13d
> +       addpd   %xmm13, %xmm2
>
> -       /* Call scalar math function */
> -       jc      L(SCALAR_MATH_CALL)
> -       # LOE rbx rbp r15 r12d r13d
> +       mulpd   %xmm5, %xmm6
> +       addpd   %xmm4, %xmm6
>
> -       /* Special inputs
> -        * processing loop
> -        */
> +       mulpd   %xmm2, %xmm0
> +       addpd   %xmm3, %xmm0
>
> -L(SPECIAL_VALUES_LOOP):
> -       incl    %r12d
> -       cmpl    $4, %r12d
> -
> -       /* Check bits in range mask */
> -       jl      L(RANGEMASK_CHECK)
> -       # LOE rbx rbp r15 r12d r13d
> -
> -       movq    16(%rsp), %r12
> -       cfi_restore(12)
> -       movq    8(%rsp), %r13
> -       cfi_restore(13)
> -       movq    (%rsp), %r14
> -       cfi_restore(14)
> -       movups  48(%rsp), %xmm0
> -
> -       /* Go to exit */
> -       jmp     L(EXIT)
> -       cfi_offset(12, -64)
> -       cfi_offset(13, -72)
> -       cfi_offset(14, -80)
> -       # LOE rbx rbp r12 r13 r14 r15 xmm0
> +       cvtpd2ps %xmm0, %xmm2
> +       cvtpd2ps %xmm6, %xmm0
>
> -       /* Scalar math fucntion call
> -        * to process special input
> -        */
> +       movlhps %xmm2, %xmm0
> +       andnps  %xmm12, %xmm1
> +       orps    %xmm1, %xmm0
>
> -L(SCALAR_MATH_CALL):
> -       movl    %r12d, %r14d
> -       movss   32(%rsp, %r14, 4), %xmm0
> -       call    tanhf@PLT
> -       # LOE rbx rbp r14 r15 r12d r13d xmm0
> +       /* xmm8 contains mask of special values.  */
> +       pcmpgtd TANHF_DATA(_iExpMask)(%rip), %xmm8
>
> -       movss   %xmm0, 48(%rsp, %r14, 4)
> +       movmskps %xmm8, %edx
> +       testl   %edx, %edx
>
> -       /* Process special inputs in loop */
> -       jmp     L(SPECIAL_VALUES_LOOP)
> -       # LOE rbx rbp r15 r12d r13d
> -END(_ZGVbN4v_tanhf_sse4)
> +       /* Go to special inputs processing branch */
> +       jne     L(SPECIAL_VALUES_BRANCH)
> +       # LOE rbx rbp r12 r13 r14 r15 xmm0
> +       /* No stack restoration on the fastpath.  */
> +       ret
>
> -       .section .rodata, "a"
> -       .align  16
> -
> -#ifdef __svml_stanh_data_internal_typedef
> -typedef unsigned int VUINT32;
> -typedef struct {
> -       __declspec(align(16)) VUINT32 _dbP[(134*4)][2];
> -       __declspec(align(16)) VUINT32 _sSignMask[4][1];
> -       __declspec(align(16)) VUINT32 _sAbsMask[4][1];
> -       __declspec(align(16)) VUINT32 _iExpMantMask[4][1];
> -       __declspec(align(16)) VUINT32 _iExpMask[4][1];
> -       __declspec(align(16)) VUINT32 _iMinIdxOfsMask[4][1];
> -       __declspec(align(16)) VUINT32 _iMaxIdxMask[4][1];
> -} __svml_stanh_data_internal;
> -#endif
> -__svml_stanh_data_internal:
> -       /* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500]. */
> -       .quad   0x0000000000000000 /* A00 = +0.000000000000000000000e-01 */
> -       .quad   0x3FF00000022C70EB /* A01 = +1.000000008097283510367e+00 */
> -       .quad   0xBED00E878CFFA194 /* A02 = -3.828228912518614443549e-06 */
> -       .quad   0xBFD551766D0607A9 /* A03 = -3.330970825846813476723e-01 */
> -       .quad   0xBE53D60CE3E4C297 /* A00 = -1.847383956330407336230e-08 */
> -       .quad   0x3FF000024177CF5C /* A01 = +1.000002151235967140508e+00 */
> -       .quad   0xBF1758BC94A51A25 /* A02 = -8.906031613262943753568e-05 */
> -       .quad   0xBFD53EAE67E0D4F0 /* A03 = -3.319507612644221339337e-01 */
> -       .quad   0xBE5A9E47EF32D6FE /* A00 = -2.479020984039698285657e-08 */
> -       .quad   0x3FF00002DA983057 /* A01 = +1.000002721676556793895e+00 */
> -       .quad   0xBF1BD953509E94AA /* A02 = -1.062352277175377670507e-04 */
> -       .quad   0xBFD53BDB562EEDD5 /* A03 = -3.317783681520414806876e-01 */
> -       .quad   0xBE6191BBE496D294 /* A00 = -3.272532162914017685901e-08 */
> -       .quad   0x3FF0000390492017 /* A01 = +1.000003398528866105366e+00 */
> -       .quad   0xBF20727E814A57CE /* A02 = -1.254825043772153972919e-04 */
> -       .quad   0xBFD538DE060A6F22 /* A03 = -3.315959033004550748913e-01 */
> -       .quad   0xBE66DAFA2A893A25 /* A00 = -4.257146219278012568149e-08 */
> -       .quad   0x3FF0000465E08CD1 /* A01 = +1.000004194219219266770e+00 */
> -       .quad   0xBF2341C765EF91B6 /* A02 = -1.469188600530365522261e-04 */
> -       .quad   0xBFD535B6841FAF9E /* A03 = -3.314033785124993469751e-01 */
> -       .quad   0xBE6D5794E361E964 /* A00 = -5.465394929765249413434e-08 */
> -       .quad   0x3FF000055EE2A0CB /* A01 = +1.000005121846742950353e+00 */
> -       .quad   0xBF265E6C77E66C8B /* A02 = -1.706607253709506650304e-04 */
> -       .quad   0xBFD53264DDCCEDA6 /* A03 = -3.312008062382240103361e-01 */
> -       .quad   0xBE729C844D374A6E /* A00 = -6.933284462462096107184e-08 */
> -       .quad   0x3FF000067F019093 /* A01 = +1.000006195180536350264e+00 */
> -       .quad   0xBF29CC5348D6DCE5 /* A02 = -1.968242326435338705130e-04 */
> -       .quad   0xBFD52EE92121ED35 /* A03 = -3.309881995734998416658e-01 */
> -       .quad   0xBE775AEA17EAA872 /* A00 = -8.700465590574974405858e-08 */
> -       .quad   0x3FF00007CA1D66B8 /* A01 = +1.000007428656699559610e+00 */
> -       .quad   0xBF2D8F5EB98A2637 /* A02 = -2.255252009216044881395e-04 */
> -       .quad   0xBFD52B435CDF9128 /* A03 = -3.307655722585587376727e-01 */
> -       .quad   0xBE7D04DA28C343F0 /* A00 = -1.081040272327705484794e-07 */
> -       .quad   0x3FF000094443CCF5 /* A01 = +1.000008837375216730337e+00 */
> -       .quad   0xBF30D5B76C947AE5 /* A02 = -2.568791210978817814332e-04 */
> -       .quad   0xBFD52773A0776FAD /* A03 = -3.305329386764651045105e-01 */
> -       .quad   0xBE81DD77A12C51C7 /* A00 = -1.331054169875768625701e-07 */
> -       .quad   0x3FF0000AF1AFD2DA /* A01 = +1.000010437096696680470e+00 */
> -       .quad   0xBF331230624C1680 /* A02 = -2.910011410651516805537e-04 */
> -       .quad   0xBFD52379FC0B61DF /* A03 = -3.302903138515186909352e-01 */
> -       .quad   0xBE85D04EEEB3C435 /* A00 = -1.625247628488202841012e-07 */
> -       .quad   0x3FF0000CD6C9B1F2 /* A01 = +1.000012244238970726684e+00 */
> -       .quad   0xBF357F0742FADDD4 /* A02 = -3.280060509313874068243e-04 */
> -       .quad   0xBFD51F56806D0E81 /* A03 = -3.300377134475880880338e-01 */
> -       .quad   0xBE8A6E289B59681B /* A00 = -1.969211333326924655065e-07 */
> -       .quad   0x3FF0000EF8268F72 /* A01 = +1.000014275873550406715e+00 */
> -       .quad   0xBF381E277A1B747A /* A02 = -3.680082682942575423093e-04 */
> -       .quad   0xBFD51B093F1D6FD4 /* A03 = -3.297751537663746734808e-01 */
> -       .quad   0xBE8FCBC40EE9ABD5 /* A00 = -2.368983653301529373887e-07 */
> -       .quad   0x3FF000115A883B6C /* A01 = +1.000016549721943981410e+00 */
> -       .quad   0xBF3AF17AC974B3D9 /* A02 = -4.111218235774406434303e-04 */
> -       .quad   0xBFD516924A4C549C /* A03 = -3.295026517456081105450e-01 */
> -       .quad   0xBE92FFBC60A3F956 /* A00 = -2.831066871072026054144e-07 */
> -       .quad   0x3FF0001402DCED8A /* A01 = +1.000019084151832604590e+00 */
> -       .quad   0xBF3DFAE9390C4801 /* A02 = -4.574603454311488280083e-04 */
> -       .quad   0xBFD511F1B4D7DC3A /* A03 = -3.292202249571719585575e-01 */
> -       .quad   0xBE9690A22F96D5AD /* A00 = -3.362443262393081632612e-07 */
> -       .quad   0x3FF00016F63EFF5D /* A01 = +1.000021898173108825247e+00 */
> -       .quad   0xBF409E2C839605BB /* A02 = -5.071370461992499986334e-04 */
> -       .quad   0xBFD50D27924BEE00 /* A03 = -3.289278916051614487515e-01 */
> -       .quad   0xBE9AA56C65E72A73 /* A00 = -3.970591019557469835586e-07 */
> -       .quad   0x3FF0001A39F4A43E /* A01 = +1.000025011433776978009e+00 */
> -       .quad   0xBF425BD74C3D6667 /* A02 = -5.602647074553602319844e-04 */
> -       .quad   0xBFD50833F6E1ABA2 /* A03 = -3.286256705238718156536e-01 */
> -       .quad   0xBE9F4BD4FF1A83B0 /* A00 = -4.663500013744687071912e-07 */
> -       .quad   0x3FF0001DD36F9EC2 /* A01 = +1.000028444215715683896e+00 */
> -       .quad   0xBF44376634149405 /* A02 = -6.169556656102642569831e-04 */
> -       .quad   0xBFD50316F77EDEE5 /* A03 = -3.283135811757190158922e-01 */
> -       .quad   0xBEA3B625387BB079 /* A00 = -5.874486399249461304297e-07 */
> -       .quad   0x3FF00023E14CFBA9 /* A01 = +1.000034217911642153709e+00 */
> -       .quad   0xBF47392F923218D2 /* A02 = -7.087213783883111826306e-04 */
> -       .quad   0xBFD4FB1FACDEB938 /* A03 = -3.278273761924483942209e-01 */
> -       .quad   0xBEAA6E24F543500A /* A00 = -7.876828740601738750574e-07 */
> -       .quad   0x3FF0002D5C6E8412 /* A01 = +1.000043259679163742959e+00 */
> -       .quad   0xBF4BAF02BD7FDD70 /* A02 = -8.448375110664940040861e-04 */
> -       .quad   0xBFD4EFEE6527A7DE /* A03 = -3.271442401734229177279e-01 */
> -       .quad   0xBEB16E3EBE2157D0 /* A00 = -1.038947396133402500647e-06 */
> -       .quad   0x3FF00038990FEE2F /* A01 = +1.000053975962952312884e+00 */
> -       .quad   0xBF50569481C574CB /* A02 = -9.972048056490652716971e-04 */
> -       .quad   0xBFD4E419278DA2B4 /* A03 = -3.264220129263251113372e-01 */
> -       .quad   0xBEB6A7B6723165D4 /* A00 = -1.350350836279403750524e-06 */
> -       .quad   0x3FF00045CAB4158E /* A01 = +1.000066558657042303793e+00 */
> -       .quad   0xBF531D7C9C849108 /* A02 = -1.166698160951775212202e-03 */
> -       .quad   0xBFD4D7A0BB33B152 /* A03 = -3.256608799117844954552e-01 */
> -       .quad   0xBEBD0EE2A8654AFD /* A00 = -1.732000471561702711532e-06 */
> -       .quad   0x3FF00055276F18D6 /* A01 = +1.000081209219890521211e+00 */
> -       .quad   0xBF562FDBA3FB6C6C /* A02 = -1.354183666925102939860e-03 */
> -       .quad   0xBFD4CA85F1B93DB2 /* A03 = -3.248610363561638125773e-01 */
> -       .quad   0xBEC269D4036A207E /* A00 = -2.195047297096822741730e-06 */
> -       .quad   0x3FF00066E7DA6E4E /* A01 = +1.000098138500919997540e+00 */
> -       .quad   0xBF5991499FC36B3A /* A02 = -1.560518167983372759405e-03 */
> -       .quad   0xBFD4BCC9A72283D6 /* A03 = -3.240226871658341556426e-01 */
> -       .quad   0xBEC7154B6C09CFE1 /* A00 = -2.751729738565190291276e-06 */
> -       .quad   0x3FF0007B47086B80 /* A01 = +1.000117566559055148900e+00 */
> -       .quad   0xBF5D455433B4F8F4 /* A02 = -1.786548832412968197680e-03 */
> -       .quad   0xBFD4AE6CC1BFE145 /* A03 = -3.231460468373550942722e-01 */
> -       .quad   0xBECCA68CC64A0F8A /* A00 = -3.415415948561670285790e-06 */
> -       .quad   0x3FF00092827742F7 /* A01 = +1.000139722473418535387e+00 */
> -       .quad   0xBF60A7BF15A527AF /* A02 = -2.033112728132522705610e-03 */
> -       .quad   0xBFD49F703214084C /* A03 = -3.222313393636155876010e-01 */
> -       .quad   0xBED19E68676B241B /* A00 = -4.200644630977303616698e-06 */
> -       .quad   0x3FF000ACDA037B26 /* A01 = +1.000164844146362863597e+00 */
> -       .quad   0xBF62D99F836A02F8 /* A02 = -2.301036405072284102280e-03 */
> -       .quad   0xBFD48FD4F2B91B28 /* A03 = -3.212787981359945810311e-01 */
> -       .quad   0xBED57CF4B0C7AA54 /* A00 = -5.123164339408145209103e-06 */
> -       .quad   0x3FF000CA8FD9E1A1 /* A01 = +1.000193178099017865534e+00 */
> -       .quad   0xBF653A014548E686 /* A02 = -2.591135484433962181405e-03 */
> -       .quad   0xBFD47F9C0844B38F /* A03 = -3.202886658426046806447e-01 */
> -       .quad   0xBEDA012B1B1A41E2 /* A00 = -6.199971197454598722328e-06 */
> -       .quad   0x3FF000EBE868FDF4 /* A01 = +1.000224979259539459520e+00 */
> -       .quad   0xBF67CA9427E0A544 /* A02 = -2.904214255086275467410e-03 */
> -       .quad   0xBFD46EC6812ADB37 /* A03 = -3.192611943626845749655e-01 */
> -       .quad   0xBEDF3EAC5BF12194 /* A00 = -7.449344990702664567927e-06 */
> -       .quad   0x3FF001112A520784 /* A01 = +1.000260510744255704196e+00 */
> -       .quad   0xBF6A8D01ABDA4DC4 /* A02 = -3.241065277345108255891e-03 */
> -       .quad   0xBFD45D55759FFA4A /* A03 = -3.181966446572103146551e-01 */
> -       .quad   0xBEE2A541BC274267 /* A00 = -8.890883582164319970972e-06 */
> -       .quad   0x3FF0013A9E5961F2 /* A01 = +1.000300043631906721231e+00 */
> -       .quad   0xBF6D82ECD080C540 /* A02 = -3.602468994380686462264e-03 */
> -       .quad   0xBFD44B4A0779C0AD /* A03 = -3.170952866557950611259e-01 */
> -       .quad   0xBEE61D97609A27F4 /* A00 = -1.054553560499505625520e-05 */
> -       .quad   0x3FF001688F56A3AF /* A01 = +1.000343856731187974773e+00 */
> -       .quad   0xBF7056F8EFB683EC /* A02 = -3.989193351487490407647e-03 */
> -       .quad   0xBFD438A5620F0F74 /* A03 = -3.159573991399533543500e-01 */
> -       .quad   0xBEEA145429EDD370 /* A00 = -1.243563138839952927732e-05 */
> -       .quad   0x3FF0019B4A242A67 /* A01 = +1.000392236341804297339e+00 */
> -       .quad   0xBF7207D31CA78D9B /* A02 = -4.401993423445739288258e-03 */
> -       .quad   0xBFD42568BA16E7CD /* A03 = -3.147832696228050619602e-01 */
> -       .quad   0xBEEE96370D52680F /* A00 = -1.458491207477835326165e-05 */
> -       .quad   0x3FF001D31D8E4115 /* A01 = +1.000445476009251821736e+00 */
> -       .quad   0xBF73D4CC11EDC094 /* A02 = -4.841611050196221316400e-03 */
> -       .quad   0xBFD411954D8664E7 /* A03 = -3.135731942252974469021e-01 */
> -       .quad   0xBEF338C046215EF8 /* A00 = -1.833122622260562810219e-05 */
> -       .quad   0x3FF00230C32C2EC1 /* A01 = +1.000534784691737621998e+00 */
> -       .quad   0xBF76BD019BCC5DAF /* A02 = -5.551344188254799492943e-03 */
> -       .quad   0xBFD3F2C7156DC21E /* A03 = -3.116929730668135389848e-01 */
> -       .quad   0xBEF9B15EAE411EAE /* A00 = -2.450261207822986676092e-05 */
> -       .quad   0x3FF002C2DF057A4D /* A01 = +1.000674124886830940184e+00 */
> -       .quad   0xBF7B08CCD9AC1E30 /* A02 = -6.600189396301511801646e-03 */
> -       .quad   0xBFD3C7A7A114FED8 /* A03 = -3.090609620157755976777e-01 */
> -       .quad   0xBF00E36483C373B3 /* A00 = -3.221178528332122595812e-05 */
> -       .quad   0x3FF0036F419480D7 /* A01 = +1.000838524028997644777e+00 */
> -       .quad   0xBF7FD255D1777007 /* A02 = -7.768950679260206403087e-03 */
> -       .quad   0xBFD39A453911D6CE /* A03 = -3.062909180947429588215e-01 */
> -       .quad   0xBF05DFA04DD12059 /* A00 = -4.172046622180685472624e-05 */
> -       .quad   0x3FF00438B2A03D8D /* A01 = +1.001030633695197069599e+00 */
> -       .quad   0xBF828F8DBB4A9D10 /* A02 = -9.062869337255224921890e-03 */
> -       .quad   0xBFD36AAB704697D9 /* A03 = -3.033856007044711255993e-01 */
> -       .quad   0xBF0BF3E0C647DEFB /* A00 = -5.331544597092331081714e-05 */
> -       .quad   0x3FF005221063D36D /* A01 = +1.001253189109060359741e+00 */
> -       .quad   0xBF857A2CB3C96102 /* A02 = -1.048693584122917590862e-02 */
> -       .quad   0xBFD338E65BBB4FEC /* A03 = -3.003478904549854444639e-01 */
> -       .quad   0xBF11A506ED7C9D31 /* A00 = -6.730894835681591541979e-05 */
> -       .quad   0x3FF0062E4D0EA92A /* A01 = +1.001508999829250345925e+00 */
> -       .quad   0xBF88AB82C2761AF3 /* A02 = -1.204588085125866091241e-02 */
> -       .quad   0xBFD305028D6BD206 /* A03 = -2.971807843271395688234e-01 */
> -       .quad   0xBF1607C0922D9BF1 /* A00 = -8.403885708006799337092e-05 */
> -       .quad   0x3FF007606C341961 /* A01 = +1.001800940198869449560e+00 */
> -       .quad   0xBF8C25E6DA487BCF /* A02 = -1.374416688582682892494e-02 */
> -       .quad   0xBFD2CF0D0EE8F7B5 /* A03 = -2.938873906713255768075e-01 */
> -       .quad   0xBF1B3A8480A0A16D /* A00 = -1.038688061788578038307e-04 */
> -       .quad   0x3FF008BB802D02D6 /* A01 = +1.002131939589323561535e+00 */
> -       .quad   0xBF8FEB8AE99FD100 /* A02 = -1.558598065819483124983e-02 */
> -       .quad   0xBFD297135BD0911B /* A03 = -2.904709240558688843059e-01 */
> -       .quad   0xBF20ABB9BDB75C65 /* A00 = -1.271881327357976163798e-04 */
> -       .quad   0x3FF00A42A76D8CD1 /* A01 = +1.002504972472525901495e+00 */
> -       .quad   0xBF91FF3D752BB9E6 /* A02 = -1.757522609380570560722e-02 */
> -       .quad   0xBFD25D235C1F88B4 /* A03 = -2.869346999779154305799e-01 */
> -       .quad   0xBF243D3254425461 /* A00 = -1.544116913733432829448e-04 */
> -       .quad   0x3FF00BF909D1795E /* A01 = +1.002923048355647051011e+00 */
> -       .quad   0xBF94304E04D44942 /* A02 = -1.971551804042204897316e-02 */
> -       .quad   0xBFD2214B5E61CFA6 /* A03 = -2.832821294498394371075e-01 */
> -       .quad   0xBF286070011B61CE /* A00 = -1.859795307186510085994e-04 */
> -       .quad   0x3FF00DE1D5E1627E /* A01 = +1.003389201612804537689e+00 */
> -       .quad   0xBF9689D5F4163F59 /* A02 = -2.201017668045266231780e-02 */
> -       .quad   0xBFD1E39A11C3B42C /* A03 = -2.795167134743816728104e-01 */
> -       .quad   0xBF2D250B366A79E8 /* A00 = -2.223564326486314902259e-04 */
> -       .quad   0x3FF010003E134001 /* A01 = +1.003906481248123094829e+00 */
> -       .quad   0xBF990C9FF91F6F81 /* A02 = -2.446222265267250853271e-02 */
> -       .quad   0xBFD1A41E80084CDC /* A03 = -2.756420374218586655246e-01 */
> -       .quad   0xBF314DB5DDC2A30E /* A00 = -2.640313157465248123865e-04 */
> -       .quad   0x3FF012577608921B /* A01 = +1.004477940624503018441e+00 */
> -       .quad   0xBF9BB9626875B0C9 /* A02 = -2.707437288829409385849e-02 */
> -       .quad   0xBFD162E80768A9D0 /* A03 = -2.716617653228725615122e-01 */
> -       .quad   0xBF346A6133808864 /* A00 = -3.115165050094957730625e-04 */
> -       .quad   0x3FF014EAAFCC88A3 /* A01 = +1.005106627192198898157e+00 */
> -       .quad   0xBF9E90BEF9BF7419 /* A02 = -2.984903716411588595059e-02 */
> -       .quad   0xBFD12006545F7FAD /* A03 = -2.675796340899932457269e-01 */
> -       .quad   0xBF37F180DC3848EA /* A00 = -3.653468704395550778821e-04 */
> -       .quad   0x3FF017BD19147861 /* A01 = +1.005795572250939295955e+00 */
> -       .quad   0xBFA0C9A14C702E07 /* A02 = -3.278831537326359207851e-02 */
> -       .quad   0xBFD0DB895B650092 /* A03 = -2.633994476818851682154e-01 */
> -       .quad   0xBF3BEC6AAC6D7635 /* A00 = -4.260788377246944457107e-04 */
> -       .quad   0x3FF01AD1D884E719 /* A01 = +1.006547780778822565040e+00 */
> -       .quad   0xBFA260B2A1B1434A /* A02 = -3.589399551186163439542e-02 */
> -       .quad   0xBFD09581529E93D6 /* A03 = -2.591250712233067465817e-01 */
> -       .quad   0xBF4164E26167882B /* A00 = -5.308251737086202562063e-04 */
> -       .quad   0x3FF01FEF14B62B81 /* A01 = +1.007796364693348545316e+00 */
> -       .quad   0xBFA4EB014538AA42 /* A02 = -4.085544557559163403315e-02 */
> -       .quad   0xBFD029D36FEAF41F /* A03 = -2.525528519580024222613e-01 */
> -       .quad   0xBF46F6FFF4E53DC8 /* A00 = -7.008313930700277652464e-04 */
> -       .quad   0x3FF027CBB51CBBA0 /* A01 = +1.009715754956893363214e+00 */
> -       .quad   0xBFA89DEC9FEC112E /* A02 = -4.807986690687680864098e-02 */
> -       .quad   0xBFCF2A99464D0DB4 /* A03 = -2.434875100390009317053e-01 */
> -       .quad   0xBF4DCC9C4F66A4D9 /* A00 = -9.094012482836712945103e-04 */
> -       .quad   0x3FF030E7CFCCD583 /* A01 = +1.011939822882909068014e+00 */
> -       .quad   0xBFACAA3B95814081 /* A02 = -5.598627281199331645611e-02 */
> -       .quad   0xBFCDF78F156BE7CF /* A03 = -2.341173987004467604844e-01 */
> -       .quad   0xBF5308ED74E5C7A6 /* A00 = -1.161796466103906435435e-03 */
> -       .quad   0x3FF03B5986412ECB /* A01 = +1.014489674026594512313e+00 */
> -       .quad   0xBFB087EBA88DCC3F /* A02 = -6.457398285947223148806e-02 */
> -       .quad   0xBFCCBB9BD134862F /* A03 = -2.244753619680052991736e-01 */
> -       .quad   0xBF57FA23C00DF4B5 /* A00 = -1.463446533505758208674e-03 */
> -       .quad   0x3FF0473558A1BCC0 /* A01 = +1.017384859292903342975e+00 */
> -       .quad   0xBFB2E702BC6360EF /* A02 = -7.383744334527241048871e-02 */
> -       .quad   0xBFCB77D546379288 /* A03 = -2.145945160729250122955e-01 */
> -       .quad   0xBF5DD12971557F71 /* A00 = -1.819887610814388068450e-03 */
> -       .quad   0x3FF0548DDF5000A8 /* A01 = +1.020643112482540360020e+00 */
> -       .quad   0xBFB571B63DA186E1 /* A02 = -8.376635555898871710045e-02 */
> -       .quad   0xBFCA2D5202605148 /* A03 = -2.045080672838912594358e-01 */
> -       .quad   0xBF6252B1AD5D4F17 /* A00 = -2.236697221556737096709e-03 */
> -       .quad   0x3FF063738A910BF7 /* A01 = +1.024280110622155737232e+00 */
> -       .quad   0xBFB8270C8E6B601B /* A02 = -9.434584118878357184013e-02 */
> -       .quad   0xBFC8DD27D950A07E /* A03 = -1.942491351230763441116e-01 */
> -       .quad   0xBF66470C91730CFC /* A00 = -2.719425723258004842786e-03 */
> -       .quad   0x3FF073F468FCF331 /* A01 = +1.028309259519300633556e+00 */
> -       .quad   0xBFBB05C2952191E4 /* A02 = -1.055566419686964629854e-01 */
> -       .quad   0xBFC7886A770DE2BD /* A03 = -1.838505822486435070662e-01 */
> -       .quad   0xBF6AD114AC8E98EC /* A00 = -3.273525599485007861467e-03 */
> -       .quad   0x3FF0861BF53E5226 /* A01 = +1.032741506559554434119e+00 */
> -       .quad   0xBFBE0C4F9B461507 /* A02 = -1.173753503881763554650e-01 */
> -       .quad   0xBFC6302A037CDE3A /* A03 = -1.733448521642786954722e-01 */
> -       .quad   0xBF6FFBDE2A6C2AF8 /* A00 = -3.904279630096648551207e-03 */
> -       .quad   0x3FF099F2EB8E7DA3 /* A01 = +1.037585182326304034106e+00 */
> -       .quad   0xBFC09C74D192DDF0 /* A02 = -1.297746680554463516444e-01 */
> -       .quad   0xBFC4D571D8E3079F /* A03 = -1.627638157861470424859e-01 */
> -       .quad   0xBF72E8FDC0B952AA /* A00 = -4.616728994353872309042e-03 */
> -       .quad   0x3FF0AF7F273C9533 /* A01 = +1.042845872181101141152e+00 */
> -       .quad   0xBFC244C512736F10 /* A02 = -1.427236881344176033792e-01 */
> -       .quad   0xBFC379474F58B902 /* A03 = -1.521386277613104298645e-01 */
> -       .quad   0xBF762EABAF17395B /* A00 = -5.415602341101023557701e-03 */
> -       .quad   0x3FF0C6C3886F63FB /* A01 = +1.048526318502125631582e+00 */
> -       .quad   0xBFC3FDF9918EA12A /* A02 = -1.561881981590514389957e-01 */
> -       .quad   0xBFC21CA89ECAB895 /* A03 = -1.414995932913753196036e-01 */
> -       .quad   0xBF79D387CE5B2BAE /* A00 = -6.305246822828998107258e-03 */
> -       .quad   0x3FF0DFBFE2346376 /* A01 = +1.054626353847394337748e+00 */
> -       .quad   0xBFC5C6DA43602620 /* A02 = -1.701309994680721970894e-01 */
> -       .quad   0xBFC0C08BD8DB6631 /* A03 = -1.308760460731704100557e-01 */
> -       .quad   0xBF7DDBA8E8DA9060 /* A00 = -7.289562037531366334164e-03 */
> -       .quad   0x3FF0FA70F0D1B464 /* A01 = +1.061142864894713433443e+00 */
> -       .quad   0xBFC79E18D92BAA7C /* A02 = -1.845122394946264732241e-01 */
> -       .quad   0xBFBECBBBF74C2669 /* A03 = -1.202962378266875381749e-01 */
> -       .quad   0xBF81254E76EA25DA /* A00 = -8.371937755572145950511e-03 */
> -       .quad   0x3FF116D05835EBD0 /* A01 = +1.068069786618014660462e+00 */
> -       .quad   0xBFC982539E2ED224 /* A02 = -1.992897531869327609755e-01 */
> -       .quad   0xBFBC1B043C350159 /* A03 = -1.097872397413132278254e-01 */
> -       .quad   0xBF8391ACBA863403 /* A00 = -9.555196230190082448686e-03 */
> -       .quad   0x3FF134D4AA477FE2 /* A01 = +1.075398125794884141015e+00 */
> -       .quad   0xBFCB7218609FEAFB /* A02 = -2.144194099235717521079e-01 */
> -       .quad   0xBFB970A16CB88329 /* A03 = -9.937485603633135211599e-02 */
> -       .quad   0xBF87935088E48E8B /* A00 = -1.151144902957603431692e-02 */
> -       .quad   0x3FF1649892AD7DD3 /* A01 = +1.087059567413110938716e+00 */
> -       .quad   0xBFCE6971DDE75409 /* A02 = -2.375929196847723912089e-01 */
> -       .quad   0xBFB58291E88CB251 /* A03 = -8.402358939628952472223e-02 */
> -       .quad   0xBF8DB3A62C325325 /* A00 = -1.450280973794233242702e-02 */
> -       .quad   0x3FF1A9C900C6DEEA /* A01 = +1.103951457056548068891e+00 */
> -       .quad   0xBFD13DBC65B0E08E /* A02 = -2.693930619311765140012e-01 */
> -       .quad   0xBFB06696F62696D1 /* A03 = -6.406539449252625362252e-02 */
> -       .quad   0xBF92583699F2E27A /* A00 = -1.791463198307716858659e-02 */
> -       .quad   0x3FF1F451B85AA9F0 /* A01 = +1.122148246892376022288e+00 */
> -       .quad   0xBFD34FD5F8288180 /* A02 = -3.017477916164565954205e-01 */
> -       .quad   0xBFA6FB692825B683 /* A03 = -4.488686194495718900788e-02 */
> -       .quad   0xBF9641C26E673D6F /* A00 = -2.173522757385398448959e-02 */
> -       .quad   0x3FF24364DA5E2B07 /* A01 = +1.141453602790251542487e+00 */
> -       .quad   0xBFD564A5A5EF5890 /* A02 = -3.342680092295120530821e-01 */
> -       .quad   0xBF9B43712011A982 /* A03 = -2.662445791467283467968e-02 */
> -       .quad   0xBF9A901038EC2F39 /* A00 = -2.594018313816024226548e-02 */
> -       .quad   0x3FF2961356DFFEBA /* A01 = +1.161639537196534011088e+00 */
> -       .quad   0xBFD775EBB17198C7 /* A02 = -3.665723069046972759644e-01 */
> -       .quad   0xBF833B1A926CD462 /* A03 = -9.390075295963199591975e-03 */
> -       .quad   0xBF9F396A6A461B91 /* A00 = -3.049246095317987084727e-02 */
> -       .quad   0x3FF2EB53BAEF534B /* A01 = +1.182452898229899629357e+00 */
> -       .quad   0xBFD97DABF8AD8BBD /* A02 = -3.982953957076310058660e-01 */
> -       .quad   0x3F7B8F6A3E0F8837 /* A03 = +6.728568086119371925713e-03 */
> -       .quad   0xBFA21878590F8BAA /* A00 = -3.534294211546946951064e-02 */
> -       .quad   0x3FF34209790236E1 /* A01 = +1.203622315111197105253e+00 */
> -       .quad   0xBFDB764C0E71BECB /* A02 = -4.290952817018306997277e-01 */
> -       .quad   0x3F962FE0C03F84C0 /* A03 = +2.166701482190513949888e-02 */
> -       .quad   0xBFA4B36B9AD27ECC /* A00 = -4.043136849327097492868e-02 */
> -       .quad   0x3FF3990C5B12FC16 /* A01 = +1.224865298994477935679e+00 */
> -       .quad   0xBFDD5AABB0D01390 /* A02 = -4.586590983092770912322e-01 */
> -       .quad   0x3FA21DAF5CA162DB /* A03 = +3.538272863142363083844e-02 */
> -       .quad   0xBFA7645E4D7BF28B /* A00 = -4.568762489177399105378e-02 */
> -       .quad   0x3FF3EF2FD51C0D9F /* A01 = +1.245895225962932562069e+00 */
> -       .quad   0xBFDF26377E1B686E /* A02 = -4.867075664057044503963e-01 */
> -       .quad   0x3FA8803E756EE812 /* A03 = +4.785342391501513914509e-02 */
> -       .quad   0xBFAA210925C64413 /* A00 = -5.103329263796054643398e-02 */
> -       .quad   0x3FF44349F897D8E7 /* A01 = +1.266427966181760345066e+00 */
> -       .quad   0xBFE06A7B02C6D8E2 /* A02 = -5.129981092675530707226e-01 */
> -       .quad   0x3FAE3F194734F5D0 /* A03 = +5.907515520309980505687e-02 */
> -       .quad   0xBFACDE48F8A19BBB /* A00 = -5.638340029764018351832e-02 */
> -       .quad   0x3FF49439D5466582 /* A01 = +1.286187966447272845727e+00 */
> -       .quad   0xBFE131C7C1063DDC /* A02 = -5.373266954429101183166e-01 */
> -       .quad   0x3FB1ADEEC36AD805 /* A03 = +6.906025191241844940482e-02 */
> -       .quad   0xBFAF905D8F585680 /* A00 = -6.164829611604449866036e-02 */
> -       .quad   0x3FF4E0ED1FD27F99 /* A01 = +1.304913639360142818546e+00 */
> -       .quad   0xBFE1E7A859DC1D3D /* A02 = -5.595285182070380836095e-01 */
> -       .quad   0x3FB3ED018E4642A1 /* A03 = +7.783517573831001679086e-02 */
> -       .quad   0xBFB11595104160BA /* A00 = -6.673556944713512906198e-02 */
> -       .quad   0x3FF528650340490B /* A01 = +1.322361958217302513319e+00 */
> -       .quad   0xBFE28B14B40BC974 /* A02 = -5.794776455425521000109e-01 */
> -       .quad   0x3FB5DF49F5BAF6D7 /* A03 = +8.543836831355676453281e-02 */
> -       .quad   0xBFB2513A97344BA4 /* A00 = -7.155195418844911836587e-02 */
> -       .quad   0x3FF569BA0DB5EE14 /* A01 = +1.338312200124055273420e+00 */
> -       .quad   0xBFE31B53A8B67B20 /* A02 = -5.970857901737396389308e-01 */
> -       .quad   0x3FB787F297BB0544 /* A03 = +9.191814617499455275507e-02 */
> -       .quad   0xBFB37512E848FAFA /* A00 = -7.600515528700305112331e-02 */
> -       .quad   0x3FF5A41F33B403C8 /* A01 = +1.352568819013173495591e+00 */
> -       .quad   0xBFE397F6EA9A58A5 /* A02 = -6.123003561103997904880e-01 */
> -       .quad   0x3FB8EAA9FF25CA06 /* A03 = +9.733068923177520814782e-02 */
> -       .quad   0xBFB47B3E603AFC5D /* A00 = -8.000554894805263217439e-02 */
> -       .quad   0x3FF5D6E3EDE40487 /* A01 = +1.364963464031718975988e+00 */
> -       .quad   0xBFE400D5BCA6D631 /* A02 = -6.251019177058819709103e-01 */
> -       .quad   0x3FBA0B830ED567FE /* A03 = +1.017381583418739132707e-01 */
> -       .quad   0xBFB5BBFE8AC90496 /* A00 = -8.489981544791400103200e-02 */
> -       .quad   0x3FF612BA70107E95 /* A01 = +1.379572332145390989311e+00 */
> -       .quad   0xBFE477EAF1FA7693 /* A02 = -6.396383978023599814478e-01 */
> -       .quad   0x3FBB4784B7C08A95 /* A03 = +1.065600346196709652391e-01 */
> -       .quad   0xBFB6D5D940743939 /* A00 = -8.920057128509463473254e-02 */
> -       .quad   0x3FF644A8748F70CE /* A01 = +1.391762214006166953340e+00 */
> -       .quad   0xBFE4D646AB07EA37 /* A02 = -6.511567440459832267763e-01 */
> -       .quad   0x3FBC354F4E1D5292 /* A03 = +1.101884427747086558913e-01 */
> -       .quad   0xBFB7223D19E4F3D1 /* A00 = -9.036619074045339206069e-02 */
> -       .quad   0x3FF6518FEB42B7FA /* A01 = +1.394912642466350494175e+00 */
> -       .quad   0xBFE4ED86CB87498C /* A02 = -6.539949393430091184598e-01 */
> -       .quad   0x3FBC6D29F28CCA9B /* A03 = +1.110407082713131127205e-01 */
> -       .quad   0xBFB6878652FF6312 /* A00 = -8.800544287022329936754e-02 */
> -       .quad   0x3FF63948C302D040 /* A01 = +1.388985406648330922508e+00 */
> -       .quad   0xBFE4C4E2E7904E17 /* A02 = -6.490339777687407218920e-01 */
> -       .quad   0x3FBC127356CA1ABE /* A03 = +1.096565329445224612481e-01 */
> -       .quad   0xBFB4F5D18B0C91D6 /* A00 = -8.187589306596207427980e-02 */
> -       .quad   0x3FF5FD27EB7DD0B8 /* A01 = +1.374305648697413673176e+00 */
> -       .quad   0xBFE464E01A2B2FC6 /* A02 = -6.373138915164353601739e-01 */
> -       .quad   0x3FBB460547674A30 /* A03 = +1.065371798825160976065e-01 */
> -       .quad   0xBFB26642FA16A685 /* A00 = -7.187288861919156890412e-02 */
> -       .quad   0x3FF59F9BEDE1C95A /* A01 = +1.351467065073470141812e+00 */
> -       .quad   0xBFE3D67920C8FBEA /* A02 = -6.199308052381387046381e-01 */
> -       .quad   0x3FBA24F6A8D3CBC1 /* A03 = +1.021265184570401413078e-01 */
> -       .quad   0xBFADB5294794F097 /* A00 = -5.802277563859197656582e-02 */
> -       .quad   0x3FF523EA7B9CF453 /* A01 = +1.321268542159732772845e+00 */
> -       .quad   0xBFE322A8B55E35DB /* A02 = -5.979808370918208160205e-01 */
> -       .quad   0x3FB8C8673B1B3E37 /* A03 = +9.680791085269722928697e-02 */
> -       .quad   0xBFA4B7D661965C6A /* A00 = -4.046506825687219699450e-02 */
> -       .quad   0x3FF48DE3E2CE3122 /* A01 = +1.284641157110919085227e+00 */
> -       .quad   0xBFE251FED1A7F445 /* A02 = -5.725092024655472622285e-01 */
> -       .quad   0x3FB745699FCABDB9 /* A03 = +9.090290213747821701507e-02 */
> -       .quad   0xBF93E60456E4EE1D /* A00 = -1.943213253365004902773e-02 */
> -       .quad   0x3FF3E1A14E628A59 /* A01 = +1.242585474196536532432e+00 */
> -       .quad   0xBFE16C5AB660E876 /* A02 = -5.444768488007543094653e-01 */
> -       .quad   0x3FB5AD33AA8C188F /* A03 = +8.467410005332197397987e-02 */
> -       .quad   0x3F738C17C47C7961 /* A00 = +4.772274820224659853951e-03 */
> -       .quad   0x3FF3234DDE3BD146 /* A01 = +1.196119182682268355933e+00 */
> -       .quad   0xBFE078C0D77A9D3B /* A02 = -5.147403915952176722826e-01 */
> -       .quad   0x3FB40D74B3E276B8 /* A03 = +7.833032027925923568290e-02 */
> -       .quad   0x3FA0474BECC689C7 /* A00 = +3.179394975019849550746e-02 */
> -       .quad   0x3FF256FB4FA7D18A /* A01 = +1.146235762743432307076e+00 */
> -       .quad   0xBFDEFA8E3FB285E2 /* A02 = -4.840427038235174395098e-01 */
> -       .quad   0x3FB270C007493D59 /* A03 = +7.203293016322244446403e-02 */
> -       .quad   0x3FAF5BD51E479BDC /* A00 = +6.124750132203590768931e-02 */
> -       .quad   0x3FF18081D0B53BC5 /* A01 = +1.093873801484492647162e+00 */
> -       .quad   0xBFDCFE2439BD0C03 /* A02 = -4.530115665294831006626e-01 */
> -       .quad   0x3FB0DEFE5A45AFDD /* A03 = +6.590261176978580437424e-02 */
> -       .quad   0x3FB7BD5D2806EA26 /* A00 = +9.273321368429118805032e-02 */
> -       .quad   0x3FF0A369E35B4440 /* A01 = +1.039895904647224256223e+00 */
> -       .quad   0xBFDB04BC5C9951E7 /* A02 = -4.221640495573226181669e-01 */
> -       .quad   0x3FAEBBBAA9D6DEEF /* A03 = +6.002600978120919278380e-02 */
> -       .quad   0x3FC01BE411098DBC /* A00 = +1.258511622610124502941e-01 */
> -       .quad   0x3FEF85BDABC031C1 /* A01 = +9.850757936961188621083e-01 */
> -       .quad   0xBFD91521375097C2 /* A02 = -3.919146576102968682065e-01 */
> -       .quad   0x3FABE26F0086D982 /* A03 = +5.446192628317005068883e-02 */
> -       .quad   0x3FC481D7FF5776B9 /* A00 = +1.602125164781023347604e-01 */
> -       .quad   0x3FEDC3506C1E7218 /* A01 = +9.300920592973538347792e-01 */
> -       .quad   0xBFD7349A88DA7D4F /* A02 = -3.625856720409119104964e-01 */
> -       .quad   0x3FA936E2DFF8E2AE /* A03 = +4.924687370334389358018e-02 */
> -       .quad   0x3FC90471F96FA27A /* A00 = +1.954481571149420671141e-01 */
> -       .quad   0x3FEC0451601987A2 /* A01 = +8.755270840595026360376e-01 */
> -       .quad   0xBFD5671CD4B898DC /* A02 = -3.344184949259110251063e-01 */
> -       .quad   0x3FA6BB9594603B67 /* A03 = +4.439990459660841243261e-02 */
> -       .quad   0x3FCFD8ADB9ED944C /* A00 = +2.488000066615846384011e-01 */
> -       .quad   0x3FE978C073F6809A /* A01 = +7.959902062321078108909e-01 */
> -       .quad   0xBFD2DF7E00BCD5A9 /* A02 = -2.948908812716931060471e-01 */
> -       .quad   0x3FA3614033D490B2 /* A03 = +3.785133965200894456959e-02 */
> -       .quad   0x3FD4846A12AFE5A0 /* A00 = +3.205819303981005674586e-01 */
> -       .quad   0x3FE63A1147D40472 /* A01 = +6.945883181471244061100e-01 */
> -       .quad   0xBFCFA2268AD34450 /* A02 = -2.471359422548027318101e-01 */
> -       .quad   0x3F9F150201D9FFE0 /* A03 = +3.035357605267552383310e-02 */
> -       .quad   0x3FD9018641F82BEB /* A00 = +3.907180446846598154131e-01 */
> -       .quad   0x3FE33B7C220FFBDC /* A01 = +6.010113396913498995389e-01 */
> -       .quad   0xBFCA4E4187E29C86 /* A02 = -2.055131829740483584423e-01 */
> -       .quad   0x3F98C30CED19F8F4 /* A03 = +2.418155858185229434287e-02 */
> -       .quad   0x3FDD4B8255BEB078 /* A00 = +4.577337109901757905561e-01 */
> -       .quad   0x3FE0858B19D3A49B /* A01 = +5.163016800335243905451e-01 */
> -       .quad   0xBFC5BC929EACE564 /* A02 = -1.698172831327539045176e-01 */
> -       .quad   0x3F93A083CE57DE2B /* A03 = +1.916700312537337677621e-02 */
> -       .quad   0x3FE0A8E5E039295C /* A00 = +5.206174258576470315063e-01 */
> -       .quad   0x3FDC35E1234583FE /* A01 = +4.407885403107342225937e-01 */
> -       .quad   0xBFC1DE034E31AEB9 /* A02 = -1.395877963835710222629e-01 */
> -       .quad   0x3F8EFDEBB3471BDC /* A03 = +1.513275280821162888101e-02 */
> -       .quad   0x3FE2851B603CB2A5 /* A00 = +5.787484054213406503564e-01 */
> -       .quad   0x3FD7F4A44ABBB286 /* A01 = +3.743067483726821853551e-01 */
> -       .quad   0xBFBD3EEB67087DE7 /* A02 = -1.142413260026767657385e-01 */
> -       .quad   0x3F8864F38329E8BD /* A03 = +1.191129917173260922836e-02 */
> -       .quad   0x3FE437DBE3C34AC1 /* A00 = +6.318187187665317283702e-01 */
> -       .quad   0x3FD43F6F789441B5 /* A01 = +3.163717916040938438194e-01 */
> -       .quad   0xBFB7D92E7901B9A4 /* A02 = -9.315767721429907277653e-02 */
> -       .quad   0x3F8327ED342308E1 /* A03 = +9.353497651663324544136e-03 */
> -       .quad   0x3FE5C0977766D55C /* A00 = +6.797597248138731451661e-01 */
> -       .quad   0x3FD10B42A764D8F9 /* A01 = +2.663122782427219115142e-01 */
> -       .quad   0xBFB3633351D3D70F /* A02 = -7.573242900602060456716e-02 */
> -       .quad   0x3F7E079E30FF899C /* A03 = +7.331483779099558922843e-03 */
> -       .quad   0x3FE7202CE08A88C4 /* A00 = +7.226776490754436288455e-01 */
> -       .quad   0x3FCC973EB5662B01 /* A01 = +2.233656297433626314319e-01 */
> -       .quad   0xBFAF70A455F9920B /* A02 = -6.140626477716545211782e-02 */
> -       .quad   0x3F77812411CE99B6 /* A03 = +5.738392731393584730859e-03 */
> -       .quad   0x3FE85879424095B1 /* A00 = +7.608000082006382003286e-01 */
> -       .quad   0x3FC7E73BD1674D84 /* A01 = +1.867441914060742336190e-01 */
> -       .quad   0xBFA96F84E4BF333B /* A02 = -4.967894832916504993525e-02 */
> -       .quad   0x3F72606DDCA6E117 /* A03 = +4.486493251924870105662e-03 */
> -       .quad   0x3FE96BFE4957F4DD /* A00 = +7.944327766887472330737e-01 */
> -       .quad   0x3FC3ED4780D25478 /* A01 = +1.556786898624158421711e-01 */
> -       .quad   0xBFA489C5F9A56B58 /* A02 = -4.011362717093075458408e-02 */
> -       .quad   0x3F6CB5DC17E9AD2A /* A03 = +3.504686231556104931972e-03 */
> -       .quad   0x3FEA5D9CB2F41234 /* A00 = +8.239272589858672724006e-01 */
> -       .quad   0x3FC091A758374DCF /* A01 = +1.294449978582705440555e-01 */
> -       .quad   0xBFA08E436D4B5CE0 /* A02 = -3.233538350257858517978e-02 */
> -       .quad   0x3F666997AD53E6B7 /* A03 = +2.735897297154145629133e-03 */
> -       .quad   0x3FEB3060342CB850 /* A00 = +8.496552485501158713532e-01 */
> -       .quad   0x3FBB7D30BBC7DC1B /* A01 = +1.073790033768634993860e-01 */
> -       .quad   0xBF9AA6BA3443D9E3 /* A02 = -2.602663940430173170060e-02 */
> -       .quad   0x3F617CA764B7850B /* A03 = +2.134634914668814050648e-03 */
> -       .quad   0x3FEBE759A6A0C7B8 /* A00 = +8.719909910635044170135e-01 */
> -       .quad   0x3FB6C10DE6A703FF /* A01 = +8.888327485239243264115e-02 */
> -       .quad   0xBF956C566D8BE1F6 /* A02 = -2.092108768099084498138e-02 */
> -       .quad   0x3F5B46D1A4A59CF8 /* A03 = +1.664833764687232917079e-03 */
> -       .quad   0x3FEC858494887A04 /* A00 = +8.912985707318630268503e-01 */
> -       .quad   0x3FB2CC31F543394D /* A01 = +7.342827070099140762682e-02 */
> -       .quad   0xBF9133477FF69137 /* A02 = -1.679717749142747504343e-02 */
> -       .quad   0x3F5544482FBB4DA5 /* A03 = +1.298017973501022466823e-03 */
> -       .quad   0x3FED0DB59D0E32E9 /* A00 = +9.079235141267335551518e-01 */
> -       .quad   0x3FAF006BAFFC6EF4 /* A01 = +6.055008433597022787787e-02 */
> -       .quad   0xBF8B97146FA2B97A /* A02 = -1.347175565419144252499e-02 */
> -       .quad   0x3F5093B01F4CDC69 /* A03 = +1.011774057770665211434e-03 */
> -       .quad   0x3FEDB487C3EC457C /* A00 = +9.282873942012623835751e-01 */
> -       .quad   0x3FA7390C09D0BD1D /* A01 = +4.535710925881118044112e-02 */
> -       .quad   0xBF83D9F7C3181106 /* A02 = -9.693084374710735778846e-03 */
> -       .quad   0x3F46E34A0A3C0E64 /* A03 = +6.984817050299072134500e-04 */
> -       .quad   0x3FEE5FFCB4E6EB00 /* A00 = +9.492171796076434020506e-01 */
> -       .quad   0x3F9F4913ED00AADF /* A01 = +3.055220731782070861526e-02 */
> -       .quad   0xBF79670BD0E59B5C /* A02 = -6.201788097633133961528e-03 */
> -       .quad   0x3F3BC998EBCAF96D /* A03 = +4.240034429975534616304e-04 */
> -       .quad   0x3FEEDBA41E9542FE /* A00 = +9.643116566968215064293e-01 */
> -       .quad   0x3F94F5DD18D9C24D /* A01 = +2.046914543319848858727e-02 */
> -       .quad   0xBF7034896AA122B9 /* A02 = -3.956352980886528904192e-03 */
> -       .quad   0x3F30DCCB47810B39 /* A03 = +2.573009765038273091199e-04 */
> -       .quad   0x3FEF33F2882520ED /* A00 = +9.750912341196716903724e-01 */
> -       .quad   0x3F8BF37F2CF553FF /* A01 = +1.364802699996836392315e-02 */
> -       .quad   0xBF649F6F05A69619 /* A02 = -2.517430152880317534986e-03 */
> -       .quad   0x3F247623C950AAC9 /* A03 = +1.561087307505231250044e-04 */
> -       .quad   0x3FEF727757751741 /* A00 = +9.827229221489021115943e-01 */
> -       .quad   0x3F828E67912C4400 /* A01 = +9.060677640748693306705e-03 */
> -       .quad   0xBF5A2F51A806CC2C /* A02 = -1.598195784123355826789e-03 */
> -       .quad   0x3F18D35D7687E613 /* A03 = +9.470231965016282719549e-05 */
> -       .quad   0x3FEF9E6325C5942A /* A00 = +9.880843866091073568469e-01 */
> -       .quad   0x3F788AB117618F76 /* A01 = +5.991641772286606867914e-03 */
> -       .quad   0xBF5096EAB0B1EA89 /* A02 = -1.012543859160305046233e-03 */
> -       .quad   0x3F0E1E50EC4435AB /* A03 = +5.744633156910412119652e-05 */
> -       .quad   0x3FEFBD0784049369 /* A00 = +9.918248728250605994461e-01 */
> -       .quad   0x3F702BBD8294035F /* A01 = +3.947963975634432264028e-03 */
> -       .quad   0xBF44FB55E0F00593 /* A02 = -6.403130845457509273330e-04 */
> -       .quad   0x3F0244DCD723230A /* A03 = +3.484534217219031730379e-05 */
> -       .quad   0x3FEFD245E2366A43 /* A00 = +9.944180887426415926811e-01 */
> -       .quad   0x3F653D82EC088433 /* A01 = +2.592807490387838333795e-03 */
> -       .quad   0xBF3A7DF75E013CB8 /* A02 = -4.042366908878036561859e-04 */
> -       .quad   0x3EF6298E69F991CD /* A03 = +2.113564425911141559972e-05 */
> -       .quad   0x3FEFE0EAA508BC69 /* A00 = +9.962056372950317539861e-01 */
> -       .quad   0x3F5BD0771AF3FDDA /* A01 = +1.697651208644282514598e-03 */
> -       .quad   0xBF30B2E1254DE571 /* A02 = -2.548026725928887099328e-04 */
> -       .quad   0x3EEAE28B70EC0256 /* A03 = +1.281973848454955042307e-05 */
> -       .quad   0x3FEFEAF5303D7F96 /* A00 = +9.974313680831865536192e-01 */
> -       .quad   0x3F5229111365657E /* A01 = +1.108423877289460134782e-03 */
> -       .quad   0xBF250572D04DFE66 /* A02 = -1.603796628408704519168e-04 */
> -       .quad   0x3EE04E89BB57C981 /* A03 = +7.775682983689149966743e-06 */
> -       .quad   0x3FEFF1CF52F1CF44 /* A00 = +9.982678051005469122003e-01 */
> -       .quad   0x3F47A71316147CEB /* A01 = +7.218211359577819110842e-04 */
> -       .quad   0xBF1A6D7604055719 /* A02 = -1.008132248946049582547e-04 */
> -       .quad   0x3ED3C8047586A85C /* A03 = +4.716233739913014633626e-06 */
> -       .quad   0x3FEFF6770369EF69 /* A00 = +9.988360468555416149528e-01 */
> -       .quad   0x3F3EBB261180FBF0 /* A01 = +4.689186039321105101130e-04 */
> -       .quad   0xBF1097754FE19D7F /* A02 = -6.329206004950480057066e-05 */
> -       .quad   0x3EC7FEFF83BCA0A7 /* A03 = +2.860556404988488738366e-06 */
> -       .quad   0x3FEFF99D42371AC4 /* A00 = +9.992204945818561334647e-01 */
> -       .quad   0x3F33EB2AEC271F59 /* A01 = +3.039340773764907474054e-04 */
> -       .quad   0xBF04CF18E0FC0D79 /* A02 = -3.968996690952969588805e-05 */
> -       .quad   0x3EBD1BDBD6019BE9 /* A03 = +1.735021065507727833886e-06 */
> -       .quad   0x3FEFFBBCA32B0D91 /* A00 = +9.994795977476532700123e-01 */
> -       .quad   0x3F29C41E1615110A /* A01 = +1.965796209707565346710e-04 */
> -       .quad   0xBEFA11F93D9DCB5A /* A02 = -2.486248909101414873235e-05 */
> -       .quad   0x3EB1A7CA4546F7A7 /* A03 = +1.052345642723709228769e-06 */
> -       .quad   0x3FEFFD298B8E8DE2 /* A00 = +9.996535993308806045121e-01 */
> -       .quad   0x3F20A1C42D523C5B /* A01 = +1.268913244172078754520e-04 */
> -       .quad   0xBEF0507A364AFAE4 /* A02 = -1.555859070622834605755e-05 */
> -       .quad   0x3EA56ACA17E7CDF4 /* A03 = +6.382806956848098872313e-07 */
> -       .quad   0x3FEFFE1DC82BA5A3 /* A00 = +9.997700604991915929176e-01 */
> -       .quad   0x3F156E73B90F1769 /* A01 = +8.175450626798714452801e-05 */
> -       .quad   0xBEE4663579D0A09F /* A02 = -9.727122057226747625365e-06 */
> -       .quad   0x3E99FAF6FEC5D4C1 /* A03 = +3.871371052824002996020e-07 */
> -       .quad   0x3FEFFEF8D0BB5E81 /* A00 = +9.998745037837154514548e-01 */
> -       .quad   0x3F06686DA18D39C3 /* A01 = +4.273972098777251447726e-05 */
> -       .quad   0xBED46BC298073E90 /* A02 = -4.868731025855742842491e-06 */
> -       .quad   0x3E88E42286B9D0FD /* A03 = +1.854535328530838170114e-07 */
> -       .quad   0x3FEFFF8DBC68DDC7 /* A00 = +9.999455146670975791423e-01 */
> -       .quad   0x3EF26B2953A80AF0 /* A01 = +1.756534514108903368909e-05 */
> -       .quad   0xBEBFC4472D580F83 /* A02 = -1.893443529411295465239e-06 */
> -       .quad   0x3E72505B4553D19F /* A03 = +6.822456673547912277047e-08 */
> -       .quad   0x3FEFFFCED1276609 /* A00 = +9.999765477215883935358e-01 */
> -       .quad   0x3EDE1A94C7CC58F5 /* A01 = +7.177313020153979672606e-06 */
> -       .quad   0xBEA8A2C988744E57 /* A02 = -7.342066660497443762363e-07 */
> -       .quad   0x3E5AF30036BBBAF4 /* A03 = +2.509841882843541084885e-08 */
> -       .quad   0x3FEFFFEAFE70FCFC /* A00 = +9.999899835164849370983e-01 */
> -       .quad   0x3EC879175E3549F5 /* A01 = +2.917410471128503564412e-06 */
> -       .quad   0xBE930E36677D1813 /* A02 = -2.839493400307523115929e-07 */
> -       .quad   0x3E43D4005B42D48F /* A03 = +9.233192745401904898013e-09 */
> -       .quad   0x3ff0000000000000
> -       .quad   0x0000000000000000
> -       .quad   0x0000000000000000
> -       .quad   0x0000000000000000
> -       .align  16
> -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSignMask */
> -       .align  16
> -       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _sAbsMask */
> -       .align  16
> -       .long   0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000 /* _iExpMantMask */
> -       .align  16
> -       .long   0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000 /* _iExpMask */
> -       .align  16
> -       .long   0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000 /* _iMinIdxOfsMask */
> -       .align  16
> -       .long   0x04280000, 0x04280000, 0x04280000, 0x04280000 /* _iMaxIdxMask */
> -       .align  16
> -       .type   __svml_stanh_data_internal, @object
> -       .size   __svml_stanh_data_internal, .-__svml_stanh_data_internal
> +       /* Cold case. edx has 1s where there was a special value that
> +          needs to be handled by a tanhf call. Optimize for code size
> +          more so than speed here. */
> +L(SPECIAL_VALUES_BRANCH):
> +       # LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm12
> +       /* Stack coming in 16-byte aligned. Set 8-byte misaligned so on
> +       call entry will be 16-byte aligned. */
> +       subq    $56, %rsp
> +
> +       movups  %xmm0, 24(%rsp)
> +       movups  %xmm12, 40(%rsp)
> +
> +       /* Use rbx/rbp for callee save registers as they get short
> +       encoding for many instructions (as compared with r12/r13). */
> +       movq    %rbx, (%rsp)
> +       cfi_offset(rbx, -16)
> +       movq    %rbp, 8(%rsp)
> +       cfi_offset(rbp, -8)
> +       /* edx has 1s where there was a special value that needs to be handled
> +          by a tanhf call.  */
> +       movl    %edx, %ebx
> +L(SPECIAL_VALUES_LOOP):
> +       # LOE rbx rbp r12 r13 r14 r15
> +       /* use rbp as index for special value that is saved across calls to
> +          tanhf. We technically don't need a callee save register here as offset
> +          to rsp is always [0, 12] so we can restore rsp by realigning to 64.
> +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> +          in the loop.  */
> +       xorl    %ebp, %ebp
> +       bsfl    %ebx, %ebp
> +
> +       /* Scalar math fucntion call to process special input.  */
                                   function
> +       movss   40(%rsp, %rbp, 4), %xmm0
> +       call    tanhf@PLT
> +       /* No good way to avoid the store-forwarding fault this will cause on
> +          return. `lfence` avoids the SF fault but at greater cost as it
> +          serializes stack/callee save restoration.  */
> +       movss   %xmm0, 24(%rsp, %rbp, 4)
> +
> +       leal    -1(%rbx), %eax
> +       andl    %eax, %ebx
> +       jnz     L(SPECIAL_VALUES_LOOP)
> +       # LOE r12 r13 r14 r15
> +       /* All results have been written to 16(%rsp).  */
                                                                 ^^^ 24?
> +       movups  24(%rsp), %xmm0
> +       movq    (%rsp), %rbx
> +       cfi_restore(rbx)
> +       movq    8(%rsp), %rbp
> +       cfi_restore(rbp)
> +       addq    $56, %rsp
> +       ret
> +END(_ZGVbN4v_tanhf_sse4)
> --
> 2.34.1
>


-- 
H.J.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v1 7/7] x86: Optimize svml_s_tanhf4_core_sse4.S
  2022-06-07 20:06 ` [PATCH v1 7/7] x86: Optimize svml_s_tanhf4_core_sse4.S Noah Goldstein
  2022-06-08  2:42   ` H.J. Lu
@ 2022-06-08  3:07   ` H.J. Lu
  2022-06-09  0:06     ` Noah Goldstein
  1 sibling, 1 reply; 48+ messages in thread
From: H.J. Lu @ 2022-06-08  3:07 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Kolesov, Andrey

On Tue, Jun 7, 2022 at 1:07 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Optimizations are:
>     1. Reduce code size (-112 bytes).
>     2. Remove redundant move instructions.
>     3. Slightly improve instruction selection/scheduling where
>        possible.
>     4. Prefer registers which get short instruction encoding.
>     5. Reduce rodata size (-4k+ rodata is shared with avx2).
>
> Result is roughly a 15-16% speedup:
>
>        Function, New Time, Old Time, New / Old
>  _ZGVbN4v_tanhf,    3.158,    3.749,     0.842
> ---
>  .../fpu/multiarch/svml_s_tanhf4_core_sse4.S   | 864 +++---------------
>  1 file changed, 137 insertions(+), 727 deletions(-)
>
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
> index 532ebbac65..54580ebd79 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
> @@ -70,761 +70,171 @@
>   *
>   */
>
> -/* Offsets for data table __svml_stanh_data_internal
> - */
> -#define _dbP                           0
> -#define _sSignMask                     4288
> -#define _sAbsMask                      4304
> -#define _iExpMantMask                  4320
> -#define _iExpMask                      4336
> -#define _iMinIdxOfsMask                        4352
> -#define _iMaxIdxMask                   4368
>
>  #include <sysdep.h>
>
> +/* tanhf data tables for avx2 and sse4 implementations defined here.
> + */
> +#define ONLY_DECL_OFFSET
> +#include "svml_s_tanhf_rodata.S"
> +
>         .section .text.sse4, "ax", @progbits
>  ENTRY(_ZGVbN4v_tanhf_sse4)
> -       subq    $72, %rsp
> -       cfi_def_cfa_offset(80)
> -       movaps  %xmm0, %xmm5
> +       /* Save copy of input in xmm12.  */
> +       movaps  %xmm0, %xmm12
>
>         /* Here huge arguments, INF and NaNs are filtered out to callout. */
> -       movdqu  _iExpMantMask+__svml_stanh_data_internal(%rip), %xmm9
> -       lea     _dbP+16+__svml_stanh_data_internal(%rip), %r8
> -       pand    %xmm5, %xmm9
> +       movdqu  TANHF_DATA(_iExpMantMask)(%rip), %xmm3
> +       pand    %xmm0, %xmm3
>
> -       /* if VMIN, VMAX is defined for I type */
> +
> +       /* Selection of arguments between [0, 0x04280000] into xmm3.  */
>         pxor    %xmm7, %xmm7
> -       movdqa  %xmm9, %xmm6
> -       psubd   _iMinIdxOfsMask+__svml_stanh_data_internal(%rip), %xmm9
> +       /* Save xmm3 for special values check at end.  */
> +       movdqa  %xmm3, %xmm8
> +       psubd   TANHF_DATA(_iMinIdxOfsMask)(%rip), %xmm3
> +       pmaxsd  %xmm7, %xmm3
> +       pminsd  TANHF_DATA(_iMaxIdxMask)(%rip), %xmm3
> +       psrld   $14, %xmm3
> +
> +       movq    %xmm3, %rcx
> +       movl    %ecx, %edx
> +       shrq    $32, %rcx
> +
> +       pshufd  $0x0e, %xmm3, %xmm3
> +       movq    %xmm3, %rdi
> +       movl    %edi, %esi
> +       shrq    $32, %rdi
> +
> +       movaps  TANHF_DATA(_sAbsMask)(%rip), %xmm1
> +       andps   %xmm1, %xmm0
> +
> +       leaq    TANHF_DATA(_lookupTable)(%rip), %rax
> +       movups  (%rdx, %rax), %xmm2
> +       movups  (%rcx, %rax), %xmm6
>
>         /*
>          *  small table specific variables *
>          *  Constant loading
>          */
> -       movdqu  _iMaxIdxMask+__svml_stanh_data_internal(%rip), %xmm10
> -       movdqa  %xmm9, %xmm11
> -       movdqa  %xmm9, %xmm8
> -       pcmpgtd %xmm10, %xmm11
> -       pcmpgtd %xmm7, %xmm8
> -       movdqa  %xmm11, %xmm14
> -       pand    %xmm8, %xmm9
> -       andps   %xmm11, %xmm10
> -       andnps  %xmm9, %xmm14
> -       orps    %xmm10, %xmm14
> -       psrld   $14, %xmm14
> -       movd    %xmm14, %edx
> -       pshufd  $1, %xmm14, %xmm12
> -       pshufd  $2, %xmm14, %xmm13
> -       movd    %xmm12, %ecx
> -       pshufd  $3, %xmm14, %xmm15
> -       movups  _sAbsMask+__svml_stanh_data_internal(%rip), %xmm3
> -       movslq  %edx, %rdx
> -       andps   %xmm5, %xmm3
> -       movslq  %ecx, %rcx
> -       pcmpgtd _iExpMask+__svml_stanh_data_internal(%rip), %xmm6
> -       movd    %xmm13, %esi
> -       movups  -16(%rdx, %r8), %xmm2
> -       movaps  %xmm2, %xmm0
> -       movd    %xmm15, %edi
> -       movmskps %xmm6, %eax
> -       movups  -16(%rcx, %r8), %xmm6
> -       unpcklpd %xmm6, %xmm0
> +       movaps  %xmm2, %xmm4
> +       movlhps %xmm6, %xmm4
>         unpckhpd %xmm6, %xmm2
> -       cvtps2pd %xmm3, %xmm6
> -       movhlps %xmm3, %xmm3
> -       cvtps2pd %xmm3, %xmm3
> -       movslq  %esi, %rsi
> -       movslq  %edi, %rdi
> -       movups  (%rcx, %r8), %xmm8
> -       movups  (%rdx, %r8), %xmm12
> -       movups  (%rsi, %r8), %xmm13
> -       movaps  %xmm12, %xmm10
> -       movups  (%rdi, %r8), %xmm9
> +
> +       cvtps2pd %xmm0, %xmm6
> +       movhlps %xmm0, %xmm0
> +       cvtps2pd %xmm0, %xmm0
> +
> +       movups  16(%rdx, %rax), %xmm5
> +       movups  16(%rsi, %rax), %xmm13
> +
> +       movaps  %xmm5, %xmm10
>         movaps  %xmm13, %xmm11
> -       unpckhpd %xmm8, %xmm12
> -       unpckhpd %xmm9, %xmm13
> -       mulpd   %xmm6, %xmm12
> -       mulpd   %xmm3, %xmm13
> -       unpcklpd %xmm8, %xmm10
> -       unpcklpd %xmm9, %xmm11
> -       addpd   %xmm10, %xmm12
> +
> +       movups  16(%rcx, %rax), %xmm7
> +       movups  16(%rdi, %rax), %xmm3
> +
> +       unpckhpd %xmm7, %xmm5
> +       unpckhpd %xmm3, %xmm13
> +
> +       mulpd   %xmm6, %xmm5
> +       mulpd   %xmm0, %xmm13
> +
> +       movlhps %xmm7, %xmm10
> +       movlhps %xmm3, %xmm11
> +
> +       addpd   %xmm10, %xmm5
>         addpd   %xmm11, %xmm13
> -       mulpd   %xmm6, %xmm12
> -       mulpd   %xmm3, %xmm13
> -       addpd   %xmm2, %xmm12
> -       movups  -16(%rsi, %r8), %xmm1
> -       movups  -16(%rdi, %r8), %xmm7
> -       movaps  %xmm1, %xmm14
> -       unpckhpd %xmm7, %xmm1
> -       addpd   %xmm1, %xmm13
> -       mulpd   %xmm12, %xmm6
> -       mulpd   %xmm13, %xmm3
> -       addpd   %xmm0, %xmm6
> -       unpcklpd %xmm7, %xmm14
> -       addpd   %xmm14, %xmm3
> -       cvtpd2ps %xmm6, %xmm0
> -       cvtpd2ps %xmm3, %xmm1
> -       movups  _sSignMask+__svml_stanh_data_internal(%rip), %xmm4
> -       movlhps %xmm1, %xmm0
> -       andps   %xmm5, %xmm4
> -       orps    %xmm4, %xmm0
> -       testl   %eax, %eax
>
> -       /* Go to special inputs processing branch */
> -       jne     L(SPECIAL_VALUES_BRANCH)
> -       # LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm5
> +       mulpd   %xmm6, %xmm5
> +       mulpd   %xmm0, %xmm13
>
> -       /* Restore registers
> -        * and exit the function
> -        */
> +       addpd   %xmm2, %xmm5
>
> -L(EXIT):
> -       addq    $72, %rsp
> -       cfi_def_cfa_offset(8)
> -       ret
> -       cfi_def_cfa_offset(80)
> +       movups  (%rsi, %rax), %xmm2
> +       movups  (%rdi, %rax), %xmm7
>
> -       /* Branch to process
> -        * special inputs
> -        */
> +       movaps  %xmm2, %xmm3
>
> -L(SPECIAL_VALUES_BRANCH):
> -       movups  %xmm5, 32(%rsp)
> -       movups  %xmm0, 48(%rsp)
> -       # LOE rbx rbp r12 r13 r14 r15 eax
> -
> -       xorl    %edx, %edx
> -       movq    %r12, 16(%rsp)
> -       cfi_offset(12, -64)
> -       movl    %edx, %r12d
> -       movq    %r13, 8(%rsp)
> -       cfi_offset(13, -72)
> -       movl    %eax, %r13d
> -       movq    %r14, (%rsp)
> -       cfi_offset(14, -80)
> -       # LOE rbx rbp r15 r12d r13d
> -
> -       /* Range mask
> -        * bits check
> -        */
> +       unpckhpd %xmm7, %xmm2
> +       movlhps %xmm7, %xmm3
>
> -L(RANGEMASK_CHECK):
> -       btl     %r12d, %r13d
> +       addpd   %xmm13, %xmm2
>
> -       /* Call scalar math function */
> -       jc      L(SCALAR_MATH_CALL)
> -       # LOE rbx rbp r15 r12d r13d
> +       mulpd   %xmm5, %xmm6
> +       addpd   %xmm4, %xmm6
>
> -       /* Special inputs
> -        * processing loop
> -        */
> +       mulpd   %xmm2, %xmm0
> +       addpd   %xmm3, %xmm0
>
> -L(SPECIAL_VALUES_LOOP):
> -       incl    %r12d
> -       cmpl    $4, %r12d
> -
> -       /* Check bits in range mask */
> -       jl      L(RANGEMASK_CHECK)
> -       # LOE rbx rbp r15 r12d r13d
> -
> -       movq    16(%rsp), %r12
> -       cfi_restore(12)
> -       movq    8(%rsp), %r13
> -       cfi_restore(13)
> -       movq    (%rsp), %r14
> -       cfi_restore(14)
> -       movups  48(%rsp), %xmm0
> -
> -       /* Go to exit */
> -       jmp     L(EXIT)
> -       cfi_offset(12, -64)
> -       cfi_offset(13, -72)
> -       cfi_offset(14, -80)
> -       # LOE rbx rbp r12 r13 r14 r15 xmm0
> +       cvtpd2ps %xmm0, %xmm2
> +       cvtpd2ps %xmm6, %xmm0
>
> -       /* Scalar math fucntion call
> -        * to process special input
> -        */
> +       movlhps %xmm2, %xmm0
> +       andnps  %xmm12, %xmm1
> +       orps    %xmm1, %xmm0
>
> -L(SCALAR_MATH_CALL):
> -       movl    %r12d, %r14d
> -       movss   32(%rsp, %r14, 4), %xmm0
> -       call    tanhf@PLT
> -       # LOE rbx rbp r14 r15 r12d r13d xmm0
> +       /* xmm8 contains mask of special values.  */
> +       pcmpgtd TANHF_DATA(_iExpMask)(%rip), %xmm8
>
> -       movss   %xmm0, 48(%rsp, %r14, 4)
> +       movmskps %xmm8, %edx
> +       testl   %edx, %edx
>
> -       /* Process special inputs in loop */
> -       jmp     L(SPECIAL_VALUES_LOOP)
> -       # LOE rbx rbp r15 r12d r13d
> -END(_ZGVbN4v_tanhf_sse4)
> +       /* Go to special inputs processing branch */
> +       jne     L(SPECIAL_VALUES_BRANCH)
> +       # LOE rbx rbp r12 r13 r14 r15 xmm0
> +       /* No stack restoration on the fastpath.  */
> +       ret
>
> -       .section .rodata, "a"
> -       .align  16
> -
> -#ifdef __svml_stanh_data_internal_typedef
> -typedef unsigned int VUINT32;
> -typedef struct {
> -       __declspec(align(16)) VUINT32 _dbP[(134*4)][2];
> -       __declspec(align(16)) VUINT32 _sSignMask[4][1];
> -       __declspec(align(16)) VUINT32 _sAbsMask[4][1];
> -       __declspec(align(16)) VUINT32 _iExpMantMask[4][1];
> -       __declspec(align(16)) VUINT32 _iExpMask[4][1];
> -       __declspec(align(16)) VUINT32 _iMinIdxOfsMask[4][1];
> -       __declspec(align(16)) VUINT32 _iMaxIdxMask[4][1];
> -} __svml_stanh_data_internal;
> -#endif
> -__svml_stanh_data_internal:
> -       /* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500]. */
> -       .quad   0x0000000000000000 /* A00 = +0.000000000000000000000e-01 */
> -       .quad   0x3FF00000022C70EB /* A01 = +1.000000008097283510367e+00 */
> -       .quad   0xBED00E878CFFA194 /* A02 = -3.828228912518614443549e-06 */
> -       .quad   0xBFD551766D0607A9 /* A03 = -3.330970825846813476723e-01 */
> -       .quad   0xBE53D60CE3E4C297 /* A00 = -1.847383956330407336230e-08 */
> -       .quad   0x3FF000024177CF5C /* A01 = +1.000002151235967140508e+00 */
> -       .quad   0xBF1758BC94A51A25 /* A02 = -8.906031613262943753568e-05 */
> -       .quad   0xBFD53EAE67E0D4F0 /* A03 = -3.319507612644221339337e-01 */
> -       .quad   0xBE5A9E47EF32D6FE /* A00 = -2.479020984039698285657e-08 */
> -       .quad   0x3FF00002DA983057 /* A01 = +1.000002721676556793895e+00 */
> -       .quad   0xBF1BD953509E94AA /* A02 = -1.062352277175377670507e-04 */
> -       .quad   0xBFD53BDB562EEDD5 /* A03 = -3.317783681520414806876e-01 */
> -       .quad   0xBE6191BBE496D294 /* A00 = -3.272532162914017685901e-08 */
> -       .quad   0x3FF0000390492017 /* A01 = +1.000003398528866105366e+00 */
> -       .quad   0xBF20727E814A57CE /* A02 = -1.254825043772153972919e-04 */
> -       .quad   0xBFD538DE060A6F22 /* A03 = -3.315959033004550748913e-01 */
> -       .quad   0xBE66DAFA2A893A25 /* A00 = -4.257146219278012568149e-08 */
> -       .quad   0x3FF0000465E08CD1 /* A01 = +1.000004194219219266770e+00 */
> -       .quad   0xBF2341C765EF91B6 /* A02 = -1.469188600530365522261e-04 */
> -       .quad   0xBFD535B6841FAF9E /* A03 = -3.314033785124993469751e-01 */
> -       .quad   0xBE6D5794E361E964 /* A00 = -5.465394929765249413434e-08 */
> -       .quad   0x3FF000055EE2A0CB /* A01 = +1.000005121846742950353e+00 */
> -       .quad   0xBF265E6C77E66C8B /* A02 = -1.706607253709506650304e-04 */
> -       .quad   0xBFD53264DDCCEDA6 /* A03 = -3.312008062382240103361e-01 */
> -       .quad   0xBE729C844D374A6E /* A00 = -6.933284462462096107184e-08 */
> -       .quad   0x3FF000067F019093 /* A01 = +1.000006195180536350264e+00 */
> -       .quad   0xBF29CC5348D6DCE5 /* A02 = -1.968242326435338705130e-04 */
> -       .quad   0xBFD52EE92121ED35 /* A03 = -3.309881995734998416658e-01 */
> -       .quad   0xBE775AEA17EAA872 /* A00 = -8.700465590574974405858e-08 */
> -       .quad   0x3FF00007CA1D66B8 /* A01 = +1.000007428656699559610e+00 */
> -       .quad   0xBF2D8F5EB98A2637 /* A02 = -2.255252009216044881395e-04 */
> -       .quad   0xBFD52B435CDF9128 /* A03 = -3.307655722585587376727e-01 */
> -       .quad   0xBE7D04DA28C343F0 /* A00 = -1.081040272327705484794e-07 */
> -       .quad   0x3FF000094443CCF5 /* A01 = +1.000008837375216730337e+00 */
> -       .quad   0xBF30D5B76C947AE5 /* A02 = -2.568791210978817814332e-04 */
> -       .quad   0xBFD52773A0776FAD /* A03 = -3.305329386764651045105e-01 */
> -       .quad   0xBE81DD77A12C51C7 /* A00 = -1.331054169875768625701e-07 */
> -       .quad   0x3FF0000AF1AFD2DA /* A01 = +1.000010437096696680470e+00 */
> -       .quad   0xBF331230624C1680 /* A02 = -2.910011410651516805537e-04 */
> -       .quad   0xBFD52379FC0B61DF /* A03 = -3.302903138515186909352e-01 */
> -       .quad   0xBE85D04EEEB3C435 /* A00 = -1.625247628488202841012e-07 */
> -       .quad   0x3FF0000CD6C9B1F2 /* A01 = +1.000012244238970726684e+00 */
> -       .quad   0xBF357F0742FADDD4 /* A02 = -3.280060509313874068243e-04 */
> -       .quad   0xBFD51F56806D0E81 /* A03 = -3.300377134475880880338e-01 */
> -       .quad   0xBE8A6E289B59681B /* A00 = -1.969211333326924655065e-07 */
> -       .quad   0x3FF0000EF8268F72 /* A01 = +1.000014275873550406715e+00 */
> -       .quad   0xBF381E277A1B747A /* A02 = -3.680082682942575423093e-04 */
> -       .quad   0xBFD51B093F1D6FD4 /* A03 = -3.297751537663746734808e-01 */
> -       .quad   0xBE8FCBC40EE9ABD5 /* A00 = -2.368983653301529373887e-07 */
> -       .quad   0x3FF000115A883B6C /* A01 = +1.000016549721943981410e+00 */
> -       .quad   0xBF3AF17AC974B3D9 /* A02 = -4.111218235774406434303e-04 */
> -       .quad   0xBFD516924A4C549C /* A03 = -3.295026517456081105450e-01 */
> -       .quad   0xBE92FFBC60A3F956 /* A00 = -2.831066871072026054144e-07 */
> -       .quad   0x3FF0001402DCED8A /* A01 = +1.000019084151832604590e+00 */
> -       .quad   0xBF3DFAE9390C4801 /* A02 = -4.574603454311488280083e-04 */
> -       .quad   0xBFD511F1B4D7DC3A /* A03 = -3.292202249571719585575e-01 */
> -       .quad   0xBE9690A22F96D5AD /* A00 = -3.362443262393081632612e-07 */
> -       .quad   0x3FF00016F63EFF5D /* A01 = +1.000021898173108825247e+00 */
> -       .quad   0xBF409E2C839605BB /* A02 = -5.071370461992499986334e-04 */
> -       .quad   0xBFD50D27924BEE00 /* A03 = -3.289278916051614487515e-01 */
> -       .quad   0xBE9AA56C65E72A73 /* A00 = -3.970591019557469835586e-07 */
> -       .quad   0x3FF0001A39F4A43E /* A01 = +1.000025011433776978009e+00 */
> -       .quad   0xBF425BD74C3D6667 /* A02 = -5.602647074553602319844e-04 */
> -       .quad   0xBFD50833F6E1ABA2 /* A03 = -3.286256705238718156536e-01 */
> -       .quad   0xBE9F4BD4FF1A83B0 /* A00 = -4.663500013744687071912e-07 */
> -       .quad   0x3FF0001DD36F9EC2 /* A01 = +1.000028444215715683896e+00 */
> -       .quad   0xBF44376634149405 /* A02 = -6.169556656102642569831e-04 */
> -       .quad   0xBFD50316F77EDEE5 /* A03 = -3.283135811757190158922e-01 */
> -       .quad   0xBEA3B625387BB079 /* A00 = -5.874486399249461304297e-07 */
> -       .quad   0x3FF00023E14CFBA9 /* A01 = +1.000034217911642153709e+00 */
> -       .quad   0xBF47392F923218D2 /* A02 = -7.087213783883111826306e-04 */
> -       .quad   0xBFD4FB1FACDEB938 /* A03 = -3.278273761924483942209e-01 */
> -       .quad   0xBEAA6E24F543500A /* A00 = -7.876828740601738750574e-07 */
> -       .quad   0x3FF0002D5C6E8412 /* A01 = +1.000043259679163742959e+00 */
> -       .quad   0xBF4BAF02BD7FDD70 /* A02 = -8.448375110664940040861e-04 */
> -       .quad   0xBFD4EFEE6527A7DE /* A03 = -3.271442401734229177279e-01 */
> -       .quad   0xBEB16E3EBE2157D0 /* A00 = -1.038947396133402500647e-06 */
> -       .quad   0x3FF00038990FEE2F /* A01 = +1.000053975962952312884e+00 */
> -       .quad   0xBF50569481C574CB /* A02 = -9.972048056490652716971e-04 */
> -       .quad   0xBFD4E419278DA2B4 /* A03 = -3.264220129263251113372e-01 */
> -       .quad   0xBEB6A7B6723165D4 /* A00 = -1.350350836279403750524e-06 */
> -       .quad   0x3FF00045CAB4158E /* A01 = +1.000066558657042303793e+00 */
> -       .quad   0xBF531D7C9C849108 /* A02 = -1.166698160951775212202e-03 */
> -       .quad   0xBFD4D7A0BB33B152 /* A03 = -3.256608799117844954552e-01 */
> -       .quad   0xBEBD0EE2A8654AFD /* A00 = -1.732000471561702711532e-06 */
> -       .quad   0x3FF00055276F18D6 /* A01 = +1.000081209219890521211e+00 */
> -       .quad   0xBF562FDBA3FB6C6C /* A02 = -1.354183666925102939860e-03 */
> -       .quad   0xBFD4CA85F1B93DB2 /* A03 = -3.248610363561638125773e-01 */
> -       .quad   0xBEC269D4036A207E /* A00 = -2.195047297096822741730e-06 */
> -       .quad   0x3FF00066E7DA6E4E /* A01 = +1.000098138500919997540e+00 */
> -       .quad   0xBF5991499FC36B3A /* A02 = -1.560518167983372759405e-03 */
> -       .quad   0xBFD4BCC9A72283D6 /* A03 = -3.240226871658341556426e-01 */
> -       .quad   0xBEC7154B6C09CFE1 /* A00 = -2.751729738565190291276e-06 */
> -       .quad   0x3FF0007B47086B80 /* A01 = +1.000117566559055148900e+00 */
> -       .quad   0xBF5D455433B4F8F4 /* A02 = -1.786548832412968197680e-03 */
> -       .quad   0xBFD4AE6CC1BFE145 /* A03 = -3.231460468373550942722e-01 */
> -       .quad   0xBECCA68CC64A0F8A /* A00 = -3.415415948561670285790e-06 */
> -       .quad   0x3FF00092827742F7 /* A01 = +1.000139722473418535387e+00 */
> -       .quad   0xBF60A7BF15A527AF /* A02 = -2.033112728132522705610e-03 */
> -       .quad   0xBFD49F703214084C /* A03 = -3.222313393636155876010e-01 */
> -       .quad   0xBED19E68676B241B /* A00 = -4.200644630977303616698e-06 */
> -       .quad   0x3FF000ACDA037B26 /* A01 = +1.000164844146362863597e+00 */
> -       .quad   0xBF62D99F836A02F8 /* A02 = -2.301036405072284102280e-03 */
> -       .quad   0xBFD48FD4F2B91B28 /* A03 = -3.212787981359945810311e-01 */
> -       .quad   0xBED57CF4B0C7AA54 /* A00 = -5.123164339408145209103e-06 */
> -       .quad   0x3FF000CA8FD9E1A1 /* A01 = +1.000193178099017865534e+00 */
> -       .quad   0xBF653A014548E686 /* A02 = -2.591135484433962181405e-03 */
> -       .quad   0xBFD47F9C0844B38F /* A03 = -3.202886658426046806447e-01 */
> -       .quad   0xBEDA012B1B1A41E2 /* A00 = -6.199971197454598722328e-06 */
> -       .quad   0x3FF000EBE868FDF4 /* A01 = +1.000224979259539459520e+00 */
> -       .quad   0xBF67CA9427E0A544 /* A02 = -2.904214255086275467410e-03 */
> -       .quad   0xBFD46EC6812ADB37 /* A03 = -3.192611943626845749655e-01 */
> -       .quad   0xBEDF3EAC5BF12194 /* A00 = -7.449344990702664567927e-06 */
> -       .quad   0x3FF001112A520784 /* A01 = +1.000260510744255704196e+00 */
> -       .quad   0xBF6A8D01ABDA4DC4 /* A02 = -3.241065277345108255891e-03 */
> -       .quad   0xBFD45D55759FFA4A /* A03 = -3.181966446572103146551e-01 */
> -       .quad   0xBEE2A541BC274267 /* A00 = -8.890883582164319970972e-06 */
> -       .quad   0x3FF0013A9E5961F2 /* A01 = +1.000300043631906721231e+00 */
> -       .quad   0xBF6D82ECD080C540 /* A02 = -3.602468994380686462264e-03 */
> -       .quad   0xBFD44B4A0779C0AD /* A03 = -3.170952866557950611259e-01 */
> -       .quad   0xBEE61D97609A27F4 /* A00 = -1.054553560499505625520e-05 */
> -       .quad   0x3FF001688F56A3AF /* A01 = +1.000343856731187974773e+00 */
> -       .quad   0xBF7056F8EFB683EC /* A02 = -3.989193351487490407647e-03 */
> -       .quad   0xBFD438A5620F0F74 /* A03 = -3.159573991399533543500e-01 */
> -       .quad   0xBEEA145429EDD370 /* A00 = -1.243563138839952927732e-05 */
> -       .quad   0x3FF0019B4A242A67 /* A01 = +1.000392236341804297339e+00 */
> -       .quad   0xBF7207D31CA78D9B /* A02 = -4.401993423445739288258e-03 */
> -       .quad   0xBFD42568BA16E7CD /* A03 = -3.147832696228050619602e-01 */
> -       .quad   0xBEEE96370D52680F /* A00 = -1.458491207477835326165e-05 */
> -       .quad   0x3FF001D31D8E4115 /* A01 = +1.000445476009251821736e+00 */
> -       .quad   0xBF73D4CC11EDC094 /* A02 = -4.841611050196221316400e-03 */
> -       .quad   0xBFD411954D8664E7 /* A03 = -3.135731942252974469021e-01 */
> -       .quad   0xBEF338C046215EF8 /* A00 = -1.833122622260562810219e-05 */
> -       .quad   0x3FF00230C32C2EC1 /* A01 = +1.000534784691737621998e+00 */
> -       .quad   0xBF76BD019BCC5DAF /* A02 = -5.551344188254799492943e-03 */
> -       .quad   0xBFD3F2C7156DC21E /* A03 = -3.116929730668135389848e-01 */
> -       .quad   0xBEF9B15EAE411EAE /* A00 = -2.450261207822986676092e-05 */
> -       .quad   0x3FF002C2DF057A4D /* A01 = +1.000674124886830940184e+00 */
> -       .quad   0xBF7B08CCD9AC1E30 /* A02 = -6.600189396301511801646e-03 */
> -       .quad   0xBFD3C7A7A114FED8 /* A03 = -3.090609620157755976777e-01 */
> -       .quad   0xBF00E36483C373B3 /* A00 = -3.221178528332122595812e-05 */
> -       .quad   0x3FF0036F419480D7 /* A01 = +1.000838524028997644777e+00 */
> -       .quad   0xBF7FD255D1777007 /* A02 = -7.768950679260206403087e-03 */
> -       .quad   0xBFD39A453911D6CE /* A03 = -3.062909180947429588215e-01 */
> -       .quad   0xBF05DFA04DD12059 /* A00 = -4.172046622180685472624e-05 */
> -       .quad   0x3FF00438B2A03D8D /* A01 = +1.001030633695197069599e+00 */
> -       .quad   0xBF828F8DBB4A9D10 /* A02 = -9.062869337255224921890e-03 */
> -       .quad   0xBFD36AAB704697D9 /* A03 = -3.033856007044711255993e-01 */
> -       .quad   0xBF0BF3E0C647DEFB /* A00 = -5.331544597092331081714e-05 */
> -       .quad   0x3FF005221063D36D /* A01 = +1.001253189109060359741e+00 */
> -       .quad   0xBF857A2CB3C96102 /* A02 = -1.048693584122917590862e-02 */
> -       .quad   0xBFD338E65BBB4FEC /* A03 = -3.003478904549854444639e-01 */
> -       .quad   0xBF11A506ED7C9D31 /* A00 = -6.730894835681591541979e-05 */
> -       .quad   0x3FF0062E4D0EA92A /* A01 = +1.001508999829250345925e+00 */
> -       .quad   0xBF88AB82C2761AF3 /* A02 = -1.204588085125866091241e-02 */
> -       .quad   0xBFD305028D6BD206 /* A03 = -2.971807843271395688234e-01 */
> -       .quad   0xBF1607C0922D9BF1 /* A00 = -8.403885708006799337092e-05 */
> -       .quad   0x3FF007606C341961 /* A01 = +1.001800940198869449560e+00 */
> -       .quad   0xBF8C25E6DA487BCF /* A02 = -1.374416688582682892494e-02 */
> -       .quad   0xBFD2CF0D0EE8F7B5 /* A03 = -2.938873906713255768075e-01 */
> -       .quad   0xBF1B3A8480A0A16D /* A00 = -1.038688061788578038307e-04 */
> -       .quad   0x3FF008BB802D02D6 /* A01 = +1.002131939589323561535e+00 */
> -       .quad   0xBF8FEB8AE99FD100 /* A02 = -1.558598065819483124983e-02 */
> -       .quad   0xBFD297135BD0911B /* A03 = -2.904709240558688843059e-01 */
> -       .quad   0xBF20ABB9BDB75C65 /* A00 = -1.271881327357976163798e-04 */
> -       .quad   0x3FF00A42A76D8CD1 /* A01 = +1.002504972472525901495e+00 */
> -       .quad   0xBF91FF3D752BB9E6 /* A02 = -1.757522609380570560722e-02 */
> -       .quad   0xBFD25D235C1F88B4 /* A03 = -2.869346999779154305799e-01 */
> -       .quad   0xBF243D3254425461 /* A00 = -1.544116913733432829448e-04 */
> -       .quad   0x3FF00BF909D1795E /* A01 = +1.002923048355647051011e+00 */
> -       .quad   0xBF94304E04D44942 /* A02 = -1.971551804042204897316e-02 */
> -       .quad   0xBFD2214B5E61CFA6 /* A03 = -2.832821294498394371075e-01 */
> -       .quad   0xBF286070011B61CE /* A00 = -1.859795307186510085994e-04 */
> -       .quad   0x3FF00DE1D5E1627E /* A01 = +1.003389201612804537689e+00 */
> -       .quad   0xBF9689D5F4163F59 /* A02 = -2.201017668045266231780e-02 */
> -       .quad   0xBFD1E39A11C3B42C /* A03 = -2.795167134743816728104e-01 */
> -       .quad   0xBF2D250B366A79E8 /* A00 = -2.223564326486314902259e-04 */
> -       .quad   0x3FF010003E134001 /* A01 = +1.003906481248123094829e+00 */
> -       .quad   0xBF990C9FF91F6F81 /* A02 = -2.446222265267250853271e-02 */
> -       .quad   0xBFD1A41E80084CDC /* A03 = -2.756420374218586655246e-01 */
> -       .quad   0xBF314DB5DDC2A30E /* A00 = -2.640313157465248123865e-04 */
> -       .quad   0x3FF012577608921B /* A01 = +1.004477940624503018441e+00 */
> -       .quad   0xBF9BB9626875B0C9 /* A02 = -2.707437288829409385849e-02 */
> -       .quad   0xBFD162E80768A9D0 /* A03 = -2.716617653228725615122e-01 */
> -       .quad   0xBF346A6133808864 /* A00 = -3.115165050094957730625e-04 */
> -       .quad   0x3FF014EAAFCC88A3 /* A01 = +1.005106627192198898157e+00 */
> -       .quad   0xBF9E90BEF9BF7419 /* A02 = -2.984903716411588595059e-02 */
> -       .quad   0xBFD12006545F7FAD /* A03 = -2.675796340899932457269e-01 */
> -       .quad   0xBF37F180DC3848EA /* A00 = -3.653468704395550778821e-04 */
> -       .quad   0x3FF017BD19147861 /* A01 = +1.005795572250939295955e+00 */
> -       .quad   0xBFA0C9A14C702E07 /* A02 = -3.278831537326359207851e-02 */
> -       .quad   0xBFD0DB895B650092 /* A03 = -2.633994476818851682154e-01 */
> -       .quad   0xBF3BEC6AAC6D7635 /* A00 = -4.260788377246944457107e-04 */
> -       .quad   0x3FF01AD1D884E719 /* A01 = +1.006547780778822565040e+00 */
> -       .quad   0xBFA260B2A1B1434A /* A02 = -3.589399551186163439542e-02 */
> -       .quad   0xBFD09581529E93D6 /* A03 = -2.591250712233067465817e-01 */
> -       .quad   0xBF4164E26167882B /* A00 = -5.308251737086202562063e-04 */
> -       .quad   0x3FF01FEF14B62B81 /* A01 = +1.007796364693348545316e+00 */
> -       .quad   0xBFA4EB014538AA42 /* A02 = -4.085544557559163403315e-02 */
> -       .quad   0xBFD029D36FEAF41F /* A03 = -2.525528519580024222613e-01 */
> -       .quad   0xBF46F6FFF4E53DC8 /* A00 = -7.008313930700277652464e-04 */
> -       .quad   0x3FF027CBB51CBBA0 /* A01 = +1.009715754956893363214e+00 */
> -       .quad   0xBFA89DEC9FEC112E /* A02 = -4.807986690687680864098e-02 */
> -       .quad   0xBFCF2A99464D0DB4 /* A03 = -2.434875100390009317053e-01 */
> -       .quad   0xBF4DCC9C4F66A4D9 /* A00 = -9.094012482836712945103e-04 */
> -       .quad   0x3FF030E7CFCCD583 /* A01 = +1.011939822882909068014e+00 */
> -       .quad   0xBFACAA3B95814081 /* A02 = -5.598627281199331645611e-02 */
> -       .quad   0xBFCDF78F156BE7CF /* A03 = -2.341173987004467604844e-01 */
> -       .quad   0xBF5308ED74E5C7A6 /* A00 = -1.161796466103906435435e-03 */
> -       .quad   0x3FF03B5986412ECB /* A01 = +1.014489674026594512313e+00 */
> -       .quad   0xBFB087EBA88DCC3F /* A02 = -6.457398285947223148806e-02 */
> -       .quad   0xBFCCBB9BD134862F /* A03 = -2.244753619680052991736e-01 */
> -       .quad   0xBF57FA23C00DF4B5 /* A00 = -1.463446533505758208674e-03 */
> -       .quad   0x3FF0473558A1BCC0 /* A01 = +1.017384859292903342975e+00 */
> -       .quad   0xBFB2E702BC6360EF /* A02 = -7.383744334527241048871e-02 */
> -       .quad   0xBFCB77D546379288 /* A03 = -2.145945160729250122955e-01 */
> -       .quad   0xBF5DD12971557F71 /* A00 = -1.819887610814388068450e-03 */
> -       .quad   0x3FF0548DDF5000A8 /* A01 = +1.020643112482540360020e+00 */
> -       .quad   0xBFB571B63DA186E1 /* A02 = -8.376635555898871710045e-02 */
> -       .quad   0xBFCA2D5202605148 /* A03 = -2.045080672838912594358e-01 */
> -       .quad   0xBF6252B1AD5D4F17 /* A00 = -2.236697221556737096709e-03 */
> -       .quad   0x3FF063738A910BF7 /* A01 = +1.024280110622155737232e+00 */
> -       .quad   0xBFB8270C8E6B601B /* A02 = -9.434584118878357184013e-02 */
> -       .quad   0xBFC8DD27D950A07E /* A03 = -1.942491351230763441116e-01 */
> -       .quad   0xBF66470C91730CFC /* A00 = -2.719425723258004842786e-03 */
> -       .quad   0x3FF073F468FCF331 /* A01 = +1.028309259519300633556e+00 */
> -       .quad   0xBFBB05C2952191E4 /* A02 = -1.055566419686964629854e-01 */
> -       .quad   0xBFC7886A770DE2BD /* A03 = -1.838505822486435070662e-01 */
> -       .quad   0xBF6AD114AC8E98EC /* A00 = -3.273525599485007861467e-03 */
> -       .quad   0x3FF0861BF53E5226 /* A01 = +1.032741506559554434119e+00 */
> -       .quad   0xBFBE0C4F9B461507 /* A02 = -1.173753503881763554650e-01 */
> -       .quad   0xBFC6302A037CDE3A /* A03 = -1.733448521642786954722e-01 */
> -       .quad   0xBF6FFBDE2A6C2AF8 /* A00 = -3.904279630096648551207e-03 */
> -       .quad   0x3FF099F2EB8E7DA3 /* A01 = +1.037585182326304034106e+00 */
> -       .quad   0xBFC09C74D192DDF0 /* A02 = -1.297746680554463516444e-01 */
> -       .quad   0xBFC4D571D8E3079F /* A03 = -1.627638157861470424859e-01 */
> -       .quad   0xBF72E8FDC0B952AA /* A00 = -4.616728994353872309042e-03 */
> -       .quad   0x3FF0AF7F273C9533 /* A01 = +1.042845872181101141152e+00 */
> -       .quad   0xBFC244C512736F10 /* A02 = -1.427236881344176033792e-01 */
> -       .quad   0xBFC379474F58B902 /* A03 = -1.521386277613104298645e-01 */
> -       .quad   0xBF762EABAF17395B /* A00 = -5.415602341101023557701e-03 */
> -       .quad   0x3FF0C6C3886F63FB /* A01 = +1.048526318502125631582e+00 */
> -       .quad   0xBFC3FDF9918EA12A /* A02 = -1.561881981590514389957e-01 */
> -       .quad   0xBFC21CA89ECAB895 /* A03 = -1.414995932913753196036e-01 */
> -       .quad   0xBF79D387CE5B2BAE /* A00 = -6.305246822828998107258e-03 */
> -       .quad   0x3FF0DFBFE2346376 /* A01 = +1.054626353847394337748e+00 */
> -       .quad   0xBFC5C6DA43602620 /* A02 = -1.701309994680721970894e-01 */
> -       .quad   0xBFC0C08BD8DB6631 /* A03 = -1.308760460731704100557e-01 */
> -       .quad   0xBF7DDBA8E8DA9060 /* A00 = -7.289562037531366334164e-03 */
> -       .quad   0x3FF0FA70F0D1B464 /* A01 = +1.061142864894713433443e+00 */
> -       .quad   0xBFC79E18D92BAA7C /* A02 = -1.845122394946264732241e-01 */
> -       .quad   0xBFBECBBBF74C2669 /* A03 = -1.202962378266875381749e-01 */
> -       .quad   0xBF81254E76EA25DA /* A00 = -8.371937755572145950511e-03 */
> -       .quad   0x3FF116D05835EBD0 /* A01 = +1.068069786618014660462e+00 */
> -       .quad   0xBFC982539E2ED224 /* A02 = -1.992897531869327609755e-01 */
> -       .quad   0xBFBC1B043C350159 /* A03 = -1.097872397413132278254e-01 */
> -       .quad   0xBF8391ACBA863403 /* A00 = -9.555196230190082448686e-03 */
> -       .quad   0x3FF134D4AA477FE2 /* A01 = +1.075398125794884141015e+00 */
> -       .quad   0xBFCB7218609FEAFB /* A02 = -2.144194099235717521079e-01 */
> -       .quad   0xBFB970A16CB88329 /* A03 = -9.937485603633135211599e-02 */
> -       .quad   0xBF87935088E48E8B /* A00 = -1.151144902957603431692e-02 */
> -       .quad   0x3FF1649892AD7DD3 /* A01 = +1.087059567413110938716e+00 */
> -       .quad   0xBFCE6971DDE75409 /* A02 = -2.375929196847723912089e-01 */
> -       .quad   0xBFB58291E88CB251 /* A03 = -8.402358939628952472223e-02 */
> -       .quad   0xBF8DB3A62C325325 /* A00 = -1.450280973794233242702e-02 */
> -       .quad   0x3FF1A9C900C6DEEA /* A01 = +1.103951457056548068891e+00 */
> -       .quad   0xBFD13DBC65B0E08E /* A02 = -2.693930619311765140012e-01 */
> -       .quad   0xBFB06696F62696D1 /* A03 = -6.406539449252625362252e-02 */
> -       .quad   0xBF92583699F2E27A /* A00 = -1.791463198307716858659e-02 */
> -       .quad   0x3FF1F451B85AA9F0 /* A01 = +1.122148246892376022288e+00 */
> -       .quad   0xBFD34FD5F8288180 /* A02 = -3.017477916164565954205e-01 */
> -       .quad   0xBFA6FB692825B683 /* A03 = -4.488686194495718900788e-02 */
> -       .quad   0xBF9641C26E673D6F /* A00 = -2.173522757385398448959e-02 */
> -       .quad   0x3FF24364DA5E2B07 /* A01 = +1.141453602790251542487e+00 */
> -       .quad   0xBFD564A5A5EF5890 /* A02 = -3.342680092295120530821e-01 */
> -       .quad   0xBF9B43712011A982 /* A03 = -2.662445791467283467968e-02 */
> -       .quad   0xBF9A901038EC2F39 /* A00 = -2.594018313816024226548e-02 */
> -       .quad   0x3FF2961356DFFEBA /* A01 = +1.161639537196534011088e+00 */
> -       .quad   0xBFD775EBB17198C7 /* A02 = -3.665723069046972759644e-01 */
> -       .quad   0xBF833B1A926CD462 /* A03 = -9.390075295963199591975e-03 */
> -       .quad   0xBF9F396A6A461B91 /* A00 = -3.049246095317987084727e-02 */
> -       .quad   0x3FF2EB53BAEF534B /* A01 = +1.182452898229899629357e+00 */
> -       .quad   0xBFD97DABF8AD8BBD /* A02 = -3.982953957076310058660e-01 */
> -       .quad   0x3F7B8F6A3E0F8837 /* A03 = +6.728568086119371925713e-03 */
> -       .quad   0xBFA21878590F8BAA /* A00 = -3.534294211546946951064e-02 */
> -       .quad   0x3FF34209790236E1 /* A01 = +1.203622315111197105253e+00 */
> -       .quad   0xBFDB764C0E71BECB /* A02 = -4.290952817018306997277e-01 */
> -       .quad   0x3F962FE0C03F84C0 /* A03 = +2.166701482190513949888e-02 */
> -       .quad   0xBFA4B36B9AD27ECC /* A00 = -4.043136849327097492868e-02 */
> -       .quad   0x3FF3990C5B12FC16 /* A01 = +1.224865298994477935679e+00 */
> -       .quad   0xBFDD5AABB0D01390 /* A02 = -4.586590983092770912322e-01 */
> -       .quad   0x3FA21DAF5CA162DB /* A03 = +3.538272863142363083844e-02 */
> -       .quad   0xBFA7645E4D7BF28B /* A00 = -4.568762489177399105378e-02 */
> -       .quad   0x3FF3EF2FD51C0D9F /* A01 = +1.245895225962932562069e+00 */
> -       .quad   0xBFDF26377E1B686E /* A02 = -4.867075664057044503963e-01 */
> -       .quad   0x3FA8803E756EE812 /* A03 = +4.785342391501513914509e-02 */
> -       .quad   0xBFAA210925C64413 /* A00 = -5.103329263796054643398e-02 */
> -       .quad   0x3FF44349F897D8E7 /* A01 = +1.266427966181760345066e+00 */
> -       .quad   0xBFE06A7B02C6D8E2 /* A02 = -5.129981092675530707226e-01 */
> -       .quad   0x3FAE3F194734F5D0 /* A03 = +5.907515520309980505687e-02 */
> -       .quad   0xBFACDE48F8A19BBB /* A00 = -5.638340029764018351832e-02 */
> -       .quad   0x3FF49439D5466582 /* A01 = +1.286187966447272845727e+00 */
> -       .quad   0xBFE131C7C1063DDC /* A02 = -5.373266954429101183166e-01 */
> -       .quad   0x3FB1ADEEC36AD805 /* A03 = +6.906025191241844940482e-02 */
> -       .quad   0xBFAF905D8F585680 /* A00 = -6.164829611604449866036e-02 */
> -       .quad   0x3FF4E0ED1FD27F99 /* A01 = +1.304913639360142818546e+00 */
> -       .quad   0xBFE1E7A859DC1D3D /* A02 = -5.595285182070380836095e-01 */
> -       .quad   0x3FB3ED018E4642A1 /* A03 = +7.783517573831001679086e-02 */
> -       .quad   0xBFB11595104160BA /* A00 = -6.673556944713512906198e-02 */
> -       .quad   0x3FF528650340490B /* A01 = +1.322361958217302513319e+00 */
> -       .quad   0xBFE28B14B40BC974 /* A02 = -5.794776455425521000109e-01 */
> -       .quad   0x3FB5DF49F5BAF6D7 /* A03 = +8.543836831355676453281e-02 */
> -       .quad   0xBFB2513A97344BA4 /* A00 = -7.155195418844911836587e-02 */
> -       .quad   0x3FF569BA0DB5EE14 /* A01 = +1.338312200124055273420e+00 */
> -       .quad   0xBFE31B53A8B67B20 /* A02 = -5.970857901737396389308e-01 */
> -       .quad   0x3FB787F297BB0544 /* A03 = +9.191814617499455275507e-02 */
> -       .quad   0xBFB37512E848FAFA /* A00 = -7.600515528700305112331e-02 */
> -       .quad   0x3FF5A41F33B403C8 /* A01 = +1.352568819013173495591e+00 */
> -       .quad   0xBFE397F6EA9A58A5 /* A02 = -6.123003561103997904880e-01 */
> -       .quad   0x3FB8EAA9FF25CA06 /* A03 = +9.733068923177520814782e-02 */
> -       .quad   0xBFB47B3E603AFC5D /* A00 = -8.000554894805263217439e-02 */
> -       .quad   0x3FF5D6E3EDE40487 /* A01 = +1.364963464031718975988e+00 */
> -       .quad   0xBFE400D5BCA6D631 /* A02 = -6.251019177058819709103e-01 */
> -       .quad   0x3FBA0B830ED567FE /* A03 = +1.017381583418739132707e-01 */
> -       .quad   0xBFB5BBFE8AC90496 /* A00 = -8.489981544791400103200e-02 */
> -       .quad   0x3FF612BA70107E95 /* A01 = +1.379572332145390989311e+00 */
> -       .quad   0xBFE477EAF1FA7693 /* A02 = -6.396383978023599814478e-01 */
> -       .quad   0x3FBB4784B7C08A95 /* A03 = +1.065600346196709652391e-01 */
> -       .quad   0xBFB6D5D940743939 /* A00 = -8.920057128509463473254e-02 */
> -       .quad   0x3FF644A8748F70CE /* A01 = +1.391762214006166953340e+00 */
> -       .quad   0xBFE4D646AB07EA37 /* A02 = -6.511567440459832267763e-01 */
> -       .quad   0x3FBC354F4E1D5292 /* A03 = +1.101884427747086558913e-01 */
> -       .quad   0xBFB7223D19E4F3D1 /* A00 = -9.036619074045339206069e-02 */
> -       .quad   0x3FF6518FEB42B7FA /* A01 = +1.394912642466350494175e+00 */
> -       .quad   0xBFE4ED86CB87498C /* A02 = -6.539949393430091184598e-01 */
> -       .quad   0x3FBC6D29F28CCA9B /* A03 = +1.110407082713131127205e-01 */
> -       .quad   0xBFB6878652FF6312 /* A00 = -8.800544287022329936754e-02 */
> -       .quad   0x3FF63948C302D040 /* A01 = +1.388985406648330922508e+00 */
> -       .quad   0xBFE4C4E2E7904E17 /* A02 = -6.490339777687407218920e-01 */
> -       .quad   0x3FBC127356CA1ABE /* A03 = +1.096565329445224612481e-01 */
> -       .quad   0xBFB4F5D18B0C91D6 /* A00 = -8.187589306596207427980e-02 */
> -       .quad   0x3FF5FD27EB7DD0B8 /* A01 = +1.374305648697413673176e+00 */
> -       .quad   0xBFE464E01A2B2FC6 /* A02 = -6.373138915164353601739e-01 */
> -       .quad   0x3FBB460547674A30 /* A03 = +1.065371798825160976065e-01 */
> -       .quad   0xBFB26642FA16A685 /* A00 = -7.187288861919156890412e-02 */
> -       .quad   0x3FF59F9BEDE1C95A /* A01 = +1.351467065073470141812e+00 */
> -       .quad   0xBFE3D67920C8FBEA /* A02 = -6.199308052381387046381e-01 */
> -       .quad   0x3FBA24F6A8D3CBC1 /* A03 = +1.021265184570401413078e-01 */
> -       .quad   0xBFADB5294794F097 /* A00 = -5.802277563859197656582e-02 */
> -       .quad   0x3FF523EA7B9CF453 /* A01 = +1.321268542159732772845e+00 */
> -       .quad   0xBFE322A8B55E35DB /* A02 = -5.979808370918208160205e-01 */
> -       .quad   0x3FB8C8673B1B3E37 /* A03 = +9.680791085269722928697e-02 */
> -       .quad   0xBFA4B7D661965C6A /* A00 = -4.046506825687219699450e-02 */
> -       .quad   0x3FF48DE3E2CE3122 /* A01 = +1.284641157110919085227e+00 */
> -       .quad   0xBFE251FED1A7F445 /* A02 = -5.725092024655472622285e-01 */
> -       .quad   0x3FB745699FCABDB9 /* A03 = +9.090290213747821701507e-02 */
> -       .quad   0xBF93E60456E4EE1D /* A00 = -1.943213253365004902773e-02 */
> -       .quad   0x3FF3E1A14E628A59 /* A01 = +1.242585474196536532432e+00 */
> -       .quad   0xBFE16C5AB660E876 /* A02 = -5.444768488007543094653e-01 */
> -       .quad   0x3FB5AD33AA8C188F /* A03 = +8.467410005332197397987e-02 */
> -       .quad   0x3F738C17C47C7961 /* A00 = +4.772274820224659853951e-03 */
> -       .quad   0x3FF3234DDE3BD146 /* A01 = +1.196119182682268355933e+00 */
> -       .quad   0xBFE078C0D77A9D3B /* A02 = -5.147403915952176722826e-01 */
> -       .quad   0x3FB40D74B3E276B8 /* A03 = +7.833032027925923568290e-02 */
> -       .quad   0x3FA0474BECC689C7 /* A00 = +3.179394975019849550746e-02 */
> -       .quad   0x3FF256FB4FA7D18A /* A01 = +1.146235762743432307076e+00 */
> -       .quad   0xBFDEFA8E3FB285E2 /* A02 = -4.840427038235174395098e-01 */
> -       .quad   0x3FB270C007493D59 /* A03 = +7.203293016322244446403e-02 */
> -       .quad   0x3FAF5BD51E479BDC /* A00 = +6.124750132203590768931e-02 */
> -       .quad   0x3FF18081D0B53BC5 /* A01 = +1.093873801484492647162e+00 */
> -       .quad   0xBFDCFE2439BD0C03 /* A02 = -4.530115665294831006626e-01 */
> -       .quad   0x3FB0DEFE5A45AFDD /* A03 = +6.590261176978580437424e-02 */
> -       .quad   0x3FB7BD5D2806EA26 /* A00 = +9.273321368429118805032e-02 */
> -       .quad   0x3FF0A369E35B4440 /* A01 = +1.039895904647224256223e+00 */
> -       .quad   0xBFDB04BC5C9951E7 /* A02 = -4.221640495573226181669e-01 */
> -       .quad   0x3FAEBBBAA9D6DEEF /* A03 = +6.002600978120919278380e-02 */
> -       .quad   0x3FC01BE411098DBC /* A00 = +1.258511622610124502941e-01 */
> -       .quad   0x3FEF85BDABC031C1 /* A01 = +9.850757936961188621083e-01 */
> -       .quad   0xBFD91521375097C2 /* A02 = -3.919146576102968682065e-01 */
> -       .quad   0x3FABE26F0086D982 /* A03 = +5.446192628317005068883e-02 */
> -       .quad   0x3FC481D7FF5776B9 /* A00 = +1.602125164781023347604e-01 */
> -       .quad   0x3FEDC3506C1E7218 /* A01 = +9.300920592973538347792e-01 */
> -       .quad   0xBFD7349A88DA7D4F /* A02 = -3.625856720409119104964e-01 */
> -       .quad   0x3FA936E2DFF8E2AE /* A03 = +4.924687370334389358018e-02 */
> -       .quad   0x3FC90471F96FA27A /* A00 = +1.954481571149420671141e-01 */
> -       .quad   0x3FEC0451601987A2 /* A01 = +8.755270840595026360376e-01 */
> -       .quad   0xBFD5671CD4B898DC /* A02 = -3.344184949259110251063e-01 */
> -       .quad   0x3FA6BB9594603B67 /* A03 = +4.439990459660841243261e-02 */
> -       .quad   0x3FCFD8ADB9ED944C /* A00 = +2.488000066615846384011e-01 */
> -       .quad   0x3FE978C073F6809A /* A01 = +7.959902062321078108909e-01 */
> -       .quad   0xBFD2DF7E00BCD5A9 /* A02 = -2.948908812716931060471e-01 */
> -       .quad   0x3FA3614033D490B2 /* A03 = +3.785133965200894456959e-02 */
> -       .quad   0x3FD4846A12AFE5A0 /* A00 = +3.205819303981005674586e-01 */
> -       .quad   0x3FE63A1147D40472 /* A01 = +6.945883181471244061100e-01 */
> -       .quad   0xBFCFA2268AD34450 /* A02 = -2.471359422548027318101e-01 */
> -       .quad   0x3F9F150201D9FFE0 /* A03 = +3.035357605267552383310e-02 */
> -       .quad   0x3FD9018641F82BEB /* A00 = +3.907180446846598154131e-01 */
> -       .quad   0x3FE33B7C220FFBDC /* A01 = +6.010113396913498995389e-01 */
> -       .quad   0xBFCA4E4187E29C86 /* A02 = -2.055131829740483584423e-01 */
> -       .quad   0x3F98C30CED19F8F4 /* A03 = +2.418155858185229434287e-02 */
> -       .quad   0x3FDD4B8255BEB078 /* A00 = +4.577337109901757905561e-01 */
> -       .quad   0x3FE0858B19D3A49B /* A01 = +5.163016800335243905451e-01 */
> -       .quad   0xBFC5BC929EACE564 /* A02 = -1.698172831327539045176e-01 */
> -       .quad   0x3F93A083CE57DE2B /* A03 = +1.916700312537337677621e-02 */
> -       .quad   0x3FE0A8E5E039295C /* A00 = +5.206174258576470315063e-01 */
> -       .quad   0x3FDC35E1234583FE /* A01 = +4.407885403107342225937e-01 */
> -       .quad   0xBFC1DE034E31AEB9 /* A02 = -1.395877963835710222629e-01 */
> -       .quad   0x3F8EFDEBB3471BDC /* A03 = +1.513275280821162888101e-02 */
> -       .quad   0x3FE2851B603CB2A5 /* A00 = +5.787484054213406503564e-01 */
> -       .quad   0x3FD7F4A44ABBB286 /* A01 = +3.743067483726821853551e-01 */
> -       .quad   0xBFBD3EEB67087DE7 /* A02 = -1.142413260026767657385e-01 */
> -       .quad   0x3F8864F38329E8BD /* A03 = +1.191129917173260922836e-02 */
> -       .quad   0x3FE437DBE3C34AC1 /* A00 = +6.318187187665317283702e-01 */
> -       .quad   0x3FD43F6F789441B5 /* A01 = +3.163717916040938438194e-01 */
> -       .quad   0xBFB7D92E7901B9A4 /* A02 = -9.315767721429907277653e-02 */
> -       .quad   0x3F8327ED342308E1 /* A03 = +9.353497651663324544136e-03 */
> -       .quad   0x3FE5C0977766D55C /* A00 = +6.797597248138731451661e-01 */
> -       .quad   0x3FD10B42A764D8F9 /* A01 = +2.663122782427219115142e-01 */
> -       .quad   0xBFB3633351D3D70F /* A02 = -7.573242900602060456716e-02 */
> -       .quad   0x3F7E079E30FF899C /* A03 = +7.331483779099558922843e-03 */
> -       .quad   0x3FE7202CE08A88C4 /* A00 = +7.226776490754436288455e-01 */
> -       .quad   0x3FCC973EB5662B01 /* A01 = +2.233656297433626314319e-01 */
> -       .quad   0xBFAF70A455F9920B /* A02 = -6.140626477716545211782e-02 */
> -       .quad   0x3F77812411CE99B6 /* A03 = +5.738392731393584730859e-03 */
> -       .quad   0x3FE85879424095B1 /* A00 = +7.608000082006382003286e-01 */
> -       .quad   0x3FC7E73BD1674D84 /* A01 = +1.867441914060742336190e-01 */
> -       .quad   0xBFA96F84E4BF333B /* A02 = -4.967894832916504993525e-02 */
> -       .quad   0x3F72606DDCA6E117 /* A03 = +4.486493251924870105662e-03 */
> -       .quad   0x3FE96BFE4957F4DD /* A00 = +7.944327766887472330737e-01 */
> -       .quad   0x3FC3ED4780D25478 /* A01 = +1.556786898624158421711e-01 */
> -       .quad   0xBFA489C5F9A56B58 /* A02 = -4.011362717093075458408e-02 */
> -       .quad   0x3F6CB5DC17E9AD2A /* A03 = +3.504686231556104931972e-03 */
> -       .quad   0x3FEA5D9CB2F41234 /* A00 = +8.239272589858672724006e-01 */
> -       .quad   0x3FC091A758374DCF /* A01 = +1.294449978582705440555e-01 */
> -       .quad   0xBFA08E436D4B5CE0 /* A02 = -3.233538350257858517978e-02 */
> -       .quad   0x3F666997AD53E6B7 /* A03 = +2.735897297154145629133e-03 */
> -       .quad   0x3FEB3060342CB850 /* A00 = +8.496552485501158713532e-01 */
> -       .quad   0x3FBB7D30BBC7DC1B /* A01 = +1.073790033768634993860e-01 */
> -       .quad   0xBF9AA6BA3443D9E3 /* A02 = -2.602663940430173170060e-02 */
> -       .quad   0x3F617CA764B7850B /* A03 = +2.134634914668814050648e-03 */
> -       .quad   0x3FEBE759A6A0C7B8 /* A00 = +8.719909910635044170135e-01 */
> -       .quad   0x3FB6C10DE6A703FF /* A01 = +8.888327485239243264115e-02 */
> -       .quad   0xBF956C566D8BE1F6 /* A02 = -2.092108768099084498138e-02 */
> -       .quad   0x3F5B46D1A4A59CF8 /* A03 = +1.664833764687232917079e-03 */
> -       .quad   0x3FEC858494887A04 /* A00 = +8.912985707318630268503e-01 */
> -       .quad   0x3FB2CC31F543394D /* A01 = +7.342827070099140762682e-02 */
> -       .quad   0xBF9133477FF69137 /* A02 = -1.679717749142747504343e-02 */
> -       .quad   0x3F5544482FBB4DA5 /* A03 = +1.298017973501022466823e-03 */
> -       .quad   0x3FED0DB59D0E32E9 /* A00 = +9.079235141267335551518e-01 */
> -       .quad   0x3FAF006BAFFC6EF4 /* A01 = +6.055008433597022787787e-02 */
> -       .quad   0xBF8B97146FA2B97A /* A02 = -1.347175565419144252499e-02 */
> -       .quad   0x3F5093B01F4CDC69 /* A03 = +1.011774057770665211434e-03 */
> -       .quad   0x3FEDB487C3EC457C /* A00 = +9.282873942012623835751e-01 */
> -       .quad   0x3FA7390C09D0BD1D /* A01 = +4.535710925881118044112e-02 */
> -       .quad   0xBF83D9F7C3181106 /* A02 = -9.693084374710735778846e-03 */
> -       .quad   0x3F46E34A0A3C0E64 /* A03 = +6.984817050299072134500e-04 */
> -       .quad   0x3FEE5FFCB4E6EB00 /* A00 = +9.492171796076434020506e-01 */
> -       .quad   0x3F9F4913ED00AADF /* A01 = +3.055220731782070861526e-02 */
> -       .quad   0xBF79670BD0E59B5C /* A02 = -6.201788097633133961528e-03 */
> -       .quad   0x3F3BC998EBCAF96D /* A03 = +4.240034429975534616304e-04 */
> -       .quad   0x3FEEDBA41E9542FE /* A00 = +9.643116566968215064293e-01 */
> -       .quad   0x3F94F5DD18D9C24D /* A01 = +2.046914543319848858727e-02 */
> -       .quad   0xBF7034896AA122B9 /* A02 = -3.956352980886528904192e-03 */
> -       .quad   0x3F30DCCB47810B39 /* A03 = +2.573009765038273091199e-04 */
> -       .quad   0x3FEF33F2882520ED /* A00 = +9.750912341196716903724e-01 */
> -       .quad   0x3F8BF37F2CF553FF /* A01 = +1.364802699996836392315e-02 */
> -       .quad   0xBF649F6F05A69619 /* A02 = -2.517430152880317534986e-03 */
> -       .quad   0x3F247623C950AAC9 /* A03 = +1.561087307505231250044e-04 */
> -       .quad   0x3FEF727757751741 /* A00 = +9.827229221489021115943e-01 */
> -       .quad   0x3F828E67912C4400 /* A01 = +9.060677640748693306705e-03 */
> -       .quad   0xBF5A2F51A806CC2C /* A02 = -1.598195784123355826789e-03 */
> -       .quad   0x3F18D35D7687E613 /* A03 = +9.470231965016282719549e-05 */
> -       .quad   0x3FEF9E6325C5942A /* A00 = +9.880843866091073568469e-01 */
> -       .quad   0x3F788AB117618F76 /* A01 = +5.991641772286606867914e-03 */
> -       .quad   0xBF5096EAB0B1EA89 /* A02 = -1.012543859160305046233e-03 */
> -       .quad   0x3F0E1E50EC4435AB /* A03 = +5.744633156910412119652e-05 */
> -       .quad   0x3FEFBD0784049369 /* A00 = +9.918248728250605994461e-01 */
> -       .quad   0x3F702BBD8294035F /* A01 = +3.947963975634432264028e-03 */
> -       .quad   0xBF44FB55E0F00593 /* A02 = -6.403130845457509273330e-04 */
> -       .quad   0x3F0244DCD723230A /* A03 = +3.484534217219031730379e-05 */
> -       .quad   0x3FEFD245E2366A43 /* A00 = +9.944180887426415926811e-01 */
> -       .quad   0x3F653D82EC088433 /* A01 = +2.592807490387838333795e-03 */
> -       .quad   0xBF3A7DF75E013CB8 /* A02 = -4.042366908878036561859e-04 */
> -       .quad   0x3EF6298E69F991CD /* A03 = +2.113564425911141559972e-05 */
> -       .quad   0x3FEFE0EAA508BC69 /* A00 = +9.962056372950317539861e-01 */
> -       .quad   0x3F5BD0771AF3FDDA /* A01 = +1.697651208644282514598e-03 */
> -       .quad   0xBF30B2E1254DE571 /* A02 = -2.548026725928887099328e-04 */
> -       .quad   0x3EEAE28B70EC0256 /* A03 = +1.281973848454955042307e-05 */
> -       .quad   0x3FEFEAF5303D7F96 /* A00 = +9.974313680831865536192e-01 */
> -       .quad   0x3F5229111365657E /* A01 = +1.108423877289460134782e-03 */
> -       .quad   0xBF250572D04DFE66 /* A02 = -1.603796628408704519168e-04 */
> -       .quad   0x3EE04E89BB57C981 /* A03 = +7.775682983689149966743e-06 */
> -       .quad   0x3FEFF1CF52F1CF44 /* A00 = +9.982678051005469122003e-01 */
> -       .quad   0x3F47A71316147CEB /* A01 = +7.218211359577819110842e-04 */
> -       .quad   0xBF1A6D7604055719 /* A02 = -1.008132248946049582547e-04 */
> -       .quad   0x3ED3C8047586A85C /* A03 = +4.716233739913014633626e-06 */
> -       .quad   0x3FEFF6770369EF69 /* A00 = +9.988360468555416149528e-01 */
> -       .quad   0x3F3EBB261180FBF0 /* A01 = +4.689186039321105101130e-04 */
> -       .quad   0xBF1097754FE19D7F /* A02 = -6.329206004950480057066e-05 */
> -       .quad   0x3EC7FEFF83BCA0A7 /* A03 = +2.860556404988488738366e-06 */
> -       .quad   0x3FEFF99D42371AC4 /* A00 = +9.992204945818561334647e-01 */
> -       .quad   0x3F33EB2AEC271F59 /* A01 = +3.039340773764907474054e-04 */
> -       .quad   0xBF04CF18E0FC0D79 /* A02 = -3.968996690952969588805e-05 */
> -       .quad   0x3EBD1BDBD6019BE9 /* A03 = +1.735021065507727833886e-06 */
> -       .quad   0x3FEFFBBCA32B0D91 /* A00 = +9.994795977476532700123e-01 */
> -       .quad   0x3F29C41E1615110A /* A01 = +1.965796209707565346710e-04 */
> -       .quad   0xBEFA11F93D9DCB5A /* A02 = -2.486248909101414873235e-05 */
> -       .quad   0x3EB1A7CA4546F7A7 /* A03 = +1.052345642723709228769e-06 */
> -       .quad   0x3FEFFD298B8E8DE2 /* A00 = +9.996535993308806045121e-01 */
> -       .quad   0x3F20A1C42D523C5B /* A01 = +1.268913244172078754520e-04 */
> -       .quad   0xBEF0507A364AFAE4 /* A02 = -1.555859070622834605755e-05 */
> -       .quad   0x3EA56ACA17E7CDF4 /* A03 = +6.382806956848098872313e-07 */
> -       .quad   0x3FEFFE1DC82BA5A3 /* A00 = +9.997700604991915929176e-01 */
> -       .quad   0x3F156E73B90F1769 /* A01 = +8.175450626798714452801e-05 */
> -       .quad   0xBEE4663579D0A09F /* A02 = -9.727122057226747625365e-06 */
> -       .quad   0x3E99FAF6FEC5D4C1 /* A03 = +3.871371052824002996020e-07 */
> -       .quad   0x3FEFFEF8D0BB5E81 /* A00 = +9.998745037837154514548e-01 */
> -       .quad   0x3F06686DA18D39C3 /* A01 = +4.273972098777251447726e-05 */
> -       .quad   0xBED46BC298073E90 /* A02 = -4.868731025855742842491e-06 */
> -       .quad   0x3E88E42286B9D0FD /* A03 = +1.854535328530838170114e-07 */
> -       .quad   0x3FEFFF8DBC68DDC7 /* A00 = +9.999455146670975791423e-01 */
> -       .quad   0x3EF26B2953A80AF0 /* A01 = +1.756534514108903368909e-05 */
> -       .quad   0xBEBFC4472D580F83 /* A02 = -1.893443529411295465239e-06 */
> -       .quad   0x3E72505B4553D19F /* A03 = +6.822456673547912277047e-08 */
> -       .quad   0x3FEFFFCED1276609 /* A00 = +9.999765477215883935358e-01 */
> -       .quad   0x3EDE1A94C7CC58F5 /* A01 = +7.177313020153979672606e-06 */
> -       .quad   0xBEA8A2C988744E57 /* A02 = -7.342066660497443762363e-07 */
> -       .quad   0x3E5AF30036BBBAF4 /* A03 = +2.509841882843541084885e-08 */
> -       .quad   0x3FEFFFEAFE70FCFC /* A00 = +9.999899835164849370983e-01 */
> -       .quad   0x3EC879175E3549F5 /* A01 = +2.917410471128503564412e-06 */
> -       .quad   0xBE930E36677D1813 /* A02 = -2.839493400307523115929e-07 */
> -       .quad   0x3E43D4005B42D48F /* A03 = +9.233192745401904898013e-09 */
> -       .quad   0x3ff0000000000000
> -       .quad   0x0000000000000000
> -       .quad   0x0000000000000000
> -       .quad   0x0000000000000000
> -       .align  16
> -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSignMask */
> -       .align  16
> -       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _sAbsMask */
> -       .align  16
> -       .long   0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000 /* _iExpMantMask */
> -       .align  16
> -       .long   0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000 /* _iExpMask */
> -       .align  16
> -       .long   0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000 /* _iMinIdxOfsMask */
> -       .align  16
> -       .long   0x04280000, 0x04280000, 0x04280000, 0x04280000 /* _iMaxIdxMask */
> -       .align  16
> -       .type   __svml_stanh_data_internal, @object
> -       .size   __svml_stanh_data_internal, .-__svml_stanh_data_internal
> +       /* Cold case. edx has 1s where there was a special value that
> +          needs to be handled by a tanhf call. Optimize for code size
> +          moreso than speed here. */
> +L(SPECIAL_VALUES_BRANCH):
> +       # LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm12
> +       /* Stack coming in 16-byte aligned. Set 8-byte misaligned so on
> +       call entry will be 16-byte aligned. */
> +       subq    $56, %rsp

There is no CFI adjustment.

> +
> +       movups  %xmm0, 24(%rsp)
> +       movups  %xmm12, 40(%rsp)
> +
> +       /* Use rbx/rbp for callee save registers as they get short
> +       encoding for many instructions (as compared with r12/r13). */
> +       movq    %rbx, (%rsp)
> +       cfi_offset(rbx, -16)

Is this CFI correct?

> +       movq    %rbp, 8(%rsp)
> +       cfi_offset(rbp, -8)
> +       /* edx has 1s where there was a special value that needs to be handled
> +          by a tanhf call.  */
> +       movl    %edx, %ebx
> +L(SPECIAL_VALUES_LOOP):
> +       # LOE rbx rbp r12 r13 r14 r15
> +       /* use rbp as index for special value that is saved across calls to
> +          tanhf. We technically don't need a callee save register here as offset
> +          to rsp is always [0, 12] so we can restore rsp by realigning to 64.
> +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> +          in the loop.  */
> +       xorl    %ebp, %ebp
> +       bsfl    %ebx, %ebp
> +
> +       /* Scalar math function call to process special input.  */
> +       movss   40(%rsp, %rbp, 4), %xmm0
> +       call    tanhf@PLT
> +       /* No good way to avoid the store-forwarding fault this will cause on
> +          return. `lfence` avoids the SF fault but at greater cost as it
> +          serialized stack/callee save restoration.  */
> +       movss   %xmm0, 24(%rsp, %rbp, 4)
> +
> +       leal    -1(%rbx), %eax
> +       andl    %eax, %ebx
> +       jnz     L(SPECIAL_VALUES_LOOP)
> +       # LOE r12 r13 r14 r15
> +       /* All results have been written to 24(%rsp).  */
> +       movups  24(%rsp), %xmm0
> +       movq    (%rsp), %rbx
> +       cfi_restore(rbx)
> +       movq    8(%rsp), %rbp
> +       cfi_restore(rbp)
> +       addq    $56, %rsp
> +       ret
> +END(_ZGVbN4v_tanhf_sse4)
> --
> 2.34.1
>


-- 
H.J.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH v2 1/7] x86: Improve svml_s_atanhf16_core_avx512.S
  2022-06-07 20:06 [PATCH v1 1/7] x86: Improve svml_s_atanhf16_core_avx512.S Noah Goldstein
                   ` (5 preceding siblings ...)
  2022-06-07 20:06 ` [PATCH v1 7/7] x86: Optimize svml_s_tanhf4_core_sse4.S Noah Goldstein
@ 2022-06-09  0:05 ` Noah Goldstein
  2022-06-09  0:05   ` [PATCH v2 2/7] x86: Improvement svml_s_atanhf8_core_avx2.S Noah Goldstein
                     ` (6 more replies)
  2022-06-09 16:58 ` [PATCH v3 " Noah Goldstein
  2022-06-09 18:16 ` [PATCH v4 " Noah Goldstein
  8 siblings, 7 replies; 48+ messages in thread
From: Noah Goldstein @ 2022-06-09  0:05 UTC (permalink / raw)
  To: libc-alpha

Improvements are:
    1. Reduce code size (-64 bytes).
    2. Remove redundant move instructions.
    3. Slightly improve instruction selection/scheduling where
       possible.
    4. Reduce rodata size ([-128, -188] bytes).

The throughput improvement is not significant as the port 0 bottleneck
is unavoidable.

        Function, New Time, Old Time, New / Old
_ZGVeN16v_atanhf,     1.39,    1.408,     0.987
---
 .../multiarch/svml_s_atanhf16_core_avx512.S   | 474 +++++++++---------
 1 file changed, 244 insertions(+), 230 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
index a1cd920a0f..3d808ac2bd 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
@@ -31,53 +31,50 @@
  *
  */
 
-/* Offsets for data table __svml_satanh_data_internal_avx512
- */
-#define Log_tbl_H			0
-#define Log_tbl_L			128
-#define One				256
-#define AbsMask				320
-#define AddB5				384
-#define RcpBitMask			448
-#define poly_coeff3			512
-#define poly_coeff2			576
-#define poly_coeff1			640
-#define poly_coeff0			704
-#define Half				768
-#define L2H				832
-#define L2L				896
+/* Offsets for data table __svml_satanh_data_internal_avx512 and
+   __svml_satanh_data_internal_avx512_al64. Ordered by use in the
+   function. On cold-starts this might help the prefetcher. Possibly
+   a better idea is to interleave start/end so that the prefetcher is
+   less likely to detect a stream and pull irrelevant lines into
+   cache.  */
+
+/* Offset into __svml_satanh_data_internal_avx512. 4-byte aligned as
+   the memory is broadcast to {1to16}.  */
+#define AbsMask				0
+
+/* Offset into __svml_satanh_data_internal_avx512_al64. The full value
+   is used here.  */
+#define One				0
+#define AddB5				64
+#define RcpBitMask			128
+#define Log_tbl_L_lo			192
+#define Log_tbl_L_hi			256
+#define Log_tbl_H_lo			320
+#define Log_tbl_H_hi			384
+#define L2H				448
+#define L2L				512
+#define poly_coeff3			576
+#define poly_coeff2			640
+#define poly_coeff1			704
 
 #include <sysdep.h>
 
+#define ATANHF_DATA(x)			((x)+__svml_satanh_data_internal_avx512_al64)
+
 	.section .text.exex512, "ax", @progbits
 ENTRY(_ZGVeN16v_atanhf_skx)
-	pushq	%rbp
-	cfi_def_cfa_offset(16)
-	movq	%rsp, %rbp
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
-	andq	$-64, %rsp
-	subq	$192, %rsp
-	vmovups	One+__svml_satanh_data_internal_avx512(%rip), %zmm4
-
-	/* round reciprocals to 1+5b mantissas */
-	vmovups	AddB5+__svml_satanh_data_internal_avx512(%rip), %zmm14
-	vmovups	RcpBitMask+__svml_satanh_data_internal_avx512(%rip), %zmm1
-	vmovaps	%zmm0, %zmm11
-	vandps	AbsMask+__svml_satanh_data_internal_avx512(%rip), %zmm11, %zmm6
+	vandps	AbsMask+__svml_satanh_data_internal_avx512(%rip){1to16}, %zmm0, %zmm6
+	vmovups	ATANHF_DATA(One)(%rip), %zmm4
 
 	/* 1+y */
 	vaddps	{rn-sae}, %zmm4, %zmm6, %zmm9
 
 	/* 1-y */
 	vsubps	{rn-sae}, %zmm6, %zmm4, %zmm8
-	vxorps	%zmm6, %zmm11, %zmm10
-
-	/* Yp_high */
-	vsubps	{rn-sae}, %zmm4, %zmm9, %zmm2
 
-	/* -Ym_high */
-	vsubps	{rn-sae}, %zmm4, %zmm8, %zmm5
+	/* round reciprocals to 1+5b mantissas */
+	vmovups	ATANHF_DATA(AddB5)(%rip), %zmm14
+	vmovups	ATANHF_DATA(RcpBitMask)(%rip), %zmm1
 
 	/* RcpP ~ 1/Yp */
 	vrcp14ps %zmm9, %zmm12
@@ -85,15 +82,21 @@ ENTRY(_ZGVeN16v_atanhf_skx)
 	/* RcpM ~ 1/Ym */
 	vrcp14ps %zmm8, %zmm13
 
+	/* Yp_high */
+	vsubps	{rn-sae}, %zmm4, %zmm9, %zmm2
+
+	/* -Ym_high */
+	vsubps	{rn-sae}, %zmm4, %zmm8, %zmm5
+
+
 	/* input outside (-1, 1) ? */
-	vcmpps	$21, {sae}, %zmm4, %zmm6, %k0
 	vpaddd	%zmm14, %zmm12, %zmm15
-	vpaddd	%zmm14, %zmm13, %zmm0
+	vpaddd	%zmm14, %zmm13, %zmm12
 
 	/* Yp_low */
 	vsubps	{rn-sae}, %zmm2, %zmm6, %zmm3
 	vandps	%zmm1, %zmm15, %zmm7
-	vandps	%zmm1, %zmm0, %zmm12
+	vandps	%zmm1, %zmm12, %zmm12
 
 	/* Ym_low */
 	vaddps	{rn-sae}, %zmm5, %zmm6, %zmm5
@@ -102,225 +105,199 @@ ENTRY(_ZGVeN16v_atanhf_skx)
 	vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9
 
 	/* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low */
-	vfmsub231ps {rn-sae}, %zmm12, %zmm8, %zmm4
-	vmovups	Log_tbl_L+__svml_satanh_data_internal_avx512(%rip), %zmm8
-	vmovups	Log_tbl_L+64+__svml_satanh_data_internal_avx512(%rip), %zmm13
+	vfmsub213ps {rn-sae}, %zmm4, %zmm12, %zmm8
+
+	vmovups	ATANHF_DATA(Log_tbl_L_lo)(%rip), %zmm10
+	vmovups	ATANHF_DATA(Log_tbl_L_hi)(%rip), %zmm13
 
 	/* exponents */
-	vgetexpps {sae}, %zmm7, %zmm15
 	vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9
+	vgetexpps {sae}, %zmm7, %zmm15
+
 
 	/* Table lookups */
-	vmovups	__svml_satanh_data_internal_avx512(%rip), %zmm6
+	vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm8
 	vgetexpps {sae}, %zmm12, %zmm14
-	vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm4
+
 
 	/* Prepare table index */
 	vpsrld	$18, %zmm7, %zmm3
 	vpsrld	$18, %zmm12, %zmm2
-	vmovups	Log_tbl_H+64+__svml_satanh_data_internal_avx512(%rip), %zmm7
-	vmovups	poly_coeff1+__svml_satanh_data_internal_avx512(%rip), %zmm12
-
+	vmovups	ATANHF_DATA(Log_tbl_H_lo)(%rip), %zmm11
+	vmovups	ATANHF_DATA(Log_tbl_H_hi)(%rip), %zmm7
 	/* Km-Kp */
+
+	vmovaps	%zmm3, %zmm5
+	vpermi2ps %zmm13, %zmm10, %zmm3
+	vpermt2ps %zmm13, %zmm2, %zmm10
+	vpermi2ps %zmm7, %zmm11, %zmm5
+	vpermt2ps %zmm7, %zmm2, %zmm11
 	vsubps	{rn-sae}, %zmm15, %zmm14, %zmm1
-	kmovw	%k0, %edx
-	vmovaps	%zmm3, %zmm0
-	vpermi2ps %zmm13, %zmm8, %zmm3
-	vpermt2ps %zmm13, %zmm2, %zmm8
-	vpermi2ps %zmm7, %zmm6, %zmm0
-	vpermt2ps %zmm7, %zmm2, %zmm6
-	vsubps	{rn-sae}, %zmm3, %zmm8, %zmm5
+	vsubps	{rn-sae}, %zmm3, %zmm10, %zmm7
 
 	/* K*L2H + Th */
-	vmovups	L2H+__svml_satanh_data_internal_avx512(%rip), %zmm2
+	vmovups	ATANHF_DATA(L2H)(%rip), %zmm2
 
 	/* K*L2L + Tl */
-	vmovups	L2L+__svml_satanh_data_internal_avx512(%rip), %zmm3
-
-	/* polynomials */
-	vmovups	poly_coeff3+__svml_satanh_data_internal_avx512(%rip), %zmm7
-	vmovups	poly_coeff0+__svml_satanh_data_internal_avx512(%rip), %zmm13
+	vmovups	ATANHF_DATA(L2L)(%rip), %zmm3
 
 	/* table values */
-	vsubps	{rn-sae}, %zmm0, %zmm6, %zmm0
-	vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm0
-	vfmadd213ps {rn-sae}, %zmm5, %zmm3, %zmm1
-	vmovups	poly_coeff2+__svml_satanh_data_internal_avx512(%rip), %zmm3
-	vmovaps	%zmm3, %zmm2
-	vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm2
-	vfmadd231ps {rn-sae}, %zmm4, %zmm7, %zmm3
-	vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm2
-	vfmadd213ps {rn-sae}, %zmm12, %zmm4, %zmm3
-	vfmadd213ps {rn-sae}, %zmm13, %zmm9, %zmm2
-	vfmadd213ps {rn-sae}, %zmm13, %zmm4, %zmm3
+	vsubps	{rn-sae}, %zmm5, %zmm11, %zmm5
+	vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm5
+	vfmadd213ps {rn-sae}, %zmm7, %zmm3, %zmm1
+	/* polynomials */
+	vmovups	ATANHF_DATA(poly_coeff3)(%rip), %zmm7
+	vmovups	ATANHF_DATA(poly_coeff2)(%rip), %zmm10
+	vmovaps	%zmm10, %zmm14
+	vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm10
+	vfmadd231ps {rn-sae}, %zmm8, %zmm7, %zmm14
+	vmovups	ATANHF_DATA(poly_coeff1)(%rip), %zmm12
+	vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm10
+	vfmadd213ps {rn-sae}, %zmm12, %zmm8, %zmm14
+	vfmadd213ps {rn-sae}, %zmm4, %zmm9, %zmm10
+	vfmadd213ps {rn-sae}, %zmm4, %zmm8, %zmm14
 
 	/* (K*L2L + Tl) + Rp*PolyP */
-	vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm2
-	vorps	Half+__svml_satanh_data_internal_avx512(%rip), %zmm10, %zmm9
+	vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm10
+
+	/* zmm12 = zmm12 & (zmm4 | zmm0).  */
+	vpternlogq $0xe0, %zmm0, %zmm4, %zmm12
 
 	/* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM */
-	vfnmadd213ps {rn-sae}, %zmm2, %zmm4, %zmm3
-	vaddps	{rn-sae}, %zmm3, %zmm0, %zmm4
-	vmulps	{rn-sae}, %zmm9, %zmm4, %zmm0
+	vfnmadd213ps {rn-sae}, %zmm5, %zmm8, %zmm14
+	vaddps	{rn-sae}, %zmm14, %zmm10, %zmm8
+
+	vcmpps	$21, {sae}, %zmm4, %zmm6, %k0
+	kmovw	%k0, %edx
 	testl	%edx, %edx
 
 	/* Go to special inputs processing branch */
 	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm11
+	# LOE rbx r12 r13 r14 r15 zmm0 zmm8 zmm12
+	vmulps	{rn-sae}, %zmm12, %zmm8, %zmm0
 
-	/* Restore registers
-	 * and exit the function
-	 */
-
-L(EXIT):
-	movq	%rbp, %rsp
-	popq	%rbp
-	cfi_def_cfa(7, 8)
-	cfi_restore(6)
+	/* No register to restore on fast path.  */
 	ret
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
-
-	/* Branch to process
-	 * special inputs
-	 */
 
+	/* Cold case. edx has 1s where there was a special value that
+	   needs to be handled by an atanhf call. Optimize for code size
+	   moreso than speed here. */
 L(SPECIAL_VALUES_BRANCH):
-	vmovups	%zmm11, 64(%rsp)
-	vmovups	%zmm0, 128(%rsp)
-	# LOE rbx r12 r13 r14 r15 edx zmm0
-
-	xorl	%eax, %eax
-	# LOE rbx r12 r13 r14 r15 eax edx
-
-	vzeroupper
-	movq	%r12, 16(%rsp)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
-	movl	%eax, %r12d
-	movq	%r13, 8(%rsp)
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-	movl	%edx, %r13d
-	movq	%r14, (%rsp)
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r15 r12d r13d
-
-	/* Range mask
-	 * bits check
+	# LOE rbx rdx r12 r13 r14 r15 zmm0 zmm8 zmm12
+    /* Use r13 to save/restore the stack. This allows us to use rbp as
+       callee save register saving code size. */
+	pushq	%r13
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(r13, -16)
+	/* Need to callee save registers to preserve state across atanhf calls.
 	 */
+	pushq	%rbx
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(rbx, -24)
+	pushq	%rbp
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(rbp, -32)
+	movq	%rsp, %r13
+	cfi_def_cfa_register(r13)
 
-L(RANGEMASK_CHECK):
-	btl	%r12d, %r13d
-
-	/* Call scalar math function */
-	jc	L(SCALAR_MATH_CALL)
-	# LOE rbx r15 r12d r13d
-
-	/* Special inputs
-	 * processing loop
-	 */
+	/* Align stack and make room for 2x zmm vectors.  */
+	andq	$-64, %rsp
+	addq	$-128, %rsp
+	vmulps	{rn-sae}, %zmm12, %zmm8, %zmm1
+	vmovaps	%zmm1, (%rsp)
+	vmovaps	%zmm0, 64(%rsp)
+	vzeroupper
 
+	/* edx has 1s where there was a special value that needs to be handled
+	   by a atanhf call.  */
+	movl	%edx, %ebx
 L(SPECIAL_VALUES_LOOP):
-	incl	%r12d
-	cmpl	$16, %r12d
-
-	/* Check bits in range mask */
-	jl	L(RANGEMASK_CHECK)
-	# LOE rbx r15 r12d r13d
-
-	movq	16(%rsp), %r12
-	cfi_restore(12)
-	movq	8(%rsp), %r13
-	cfi_restore(13)
-	movq	(%rsp), %r14
-	cfi_restore(14)
-	vmovups	128(%rsp), %zmm0
-
-	/* Go to exit */
-	jmp	L(EXIT)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r12 r13 r14 r15 zmm0
-
-	/* Scalar math fucntion call
-	 * to process special input
-	 */
-
-L(SCALAR_MATH_CALL):
-	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	# LOE rbx rbp r12 r13 r14 r15
+	/* use rbp as index for special value that is saved across calls to
+	   atanhf. We technically don't need a callee save register here as offset
+	   to rsp is always [0, 56] so we can restore rsp by realigning to 64.
+	   Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
+	   in the loop. Realigning also costs more code size.  */
+	xorl	%ebp, %ebp
+	tzcntl	%ebx, %ebp
+
+	/* Scalar math function call to process special input.  */
+	movss	64(%rsp, %rbp, 4), %xmm0
 	call	atanhf@PLT
-	# LOE rbx r14 r15 r12d r13d xmm0
-
-	movss	%xmm0, 128(%rsp, %r14, 4)
 
-	/* Process special inputs in loop */
-	jmp	L(SPECIAL_VALUES_LOOP)
-	# LOE rbx r15 r12d r13d
+	/* No good way to avoid the store-forwarding fault this will cause on
+	   return. `lfence` avoids the SF fault but at greater cost as it
+	   serialized stack/callee save restoration.  */
+	movss	%xmm0, (%rsp, %rbp, 4)
+
+	blsrl   %ebx, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+	# LOE r12 r13 r14 r15
+
+	/* All results have been written to (%rsp).  */
+	vmovaps	(%rsp), %zmm0
+	/* Restore rsp.  */
+	movq	%r13, %rsp
+	cfi_def_cfa_register(rsp)
+	/* Restore callee save registers.  */
+	popq	%rbp
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(rbp)
+	popq	%rbx
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(rbp)
+	popq	%r13
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(r13)
+	ret
 END(_ZGVeN16v_atanhf_skx)
 
 	.section .rodata, "a"
-	.align	64
-
+	.align	4
 #ifdef __svml_satanh_data_internal_avx512_typedef
 typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(64)) VUINT32 Log_tbl_H[32][1];
-	__declspec(align(64)) VUINT32 Log_tbl_L[32][1];
+typedef struct{
+	__declspec(align(4)) VUINT32 AbsMask[1][1];
 	__declspec(align(64)) VUINT32 One[16][1];
-	__declspec(align(64)) VUINT32 AbsMask[16][1];
 	__declspec(align(64)) VUINT32 AddB5[16][1];
 	__declspec(align(64)) VUINT32 RcpBitMask[16][1];
+	__declspec(align(64)) VUINT32 Log_tbl_L_lo[16][1];
+	__declspec(align(64)) VUINT32 Log_tbl_L_hi[16][1];
+	__declspec(align(64)) VUINT32 Log_tbl_H_lo[16][1];
+	__declspec(align(64)) VUINT32 Log_tbl_H_hi[16][1];
+	__declspec(align(64)) VUINT32 L2H[16][1];
+	__declspec(align(64)) VUINT32 L2L[16][1];
 	__declspec(align(64)) VUINT32 poly_coeff3[16][1];
 	__declspec(align(64)) VUINT32 poly_coeff2[16][1];
 	__declspec(align(64)) VUINT32 poly_coeff1[16][1];
-	__declspec(align(64)) VUINT32 poly_coeff0[16][1];
-	__declspec(align(64)) VUINT32 Half[16][1];
-	__declspec(align(64)) VUINT32 L2H[16][1];
-	__declspec(align(64)) VUINT32 L2L[16][1];
 } __svml_satanh_data_internal_avx512;
 #endif
 __svml_satanh_data_internal_avx512:
-	/* Log_tbl_H */
-	.long	0x00000000
-	.long	0x3cfc0000
-	.long	0x3d780000
-	.long	0x3db78000
-	.long	0x3df10000
-	.long	0x3e14c000
-	.long	0x3e300000
-	.long	0x3e4a8000
-	.long	0x3e648000
-	.long	0x3e7dc000
-	.long	0x3e8b4000
-	.long	0x3e974000
-	.long	0x3ea30000
-	.long	0x3eae8000
-	.long	0x3eb9c000
-	.long	0x3ec4e000
-	.long	0x3ecfa000
-	.long	0x3eda2000
-	.long	0x3ee48000
-	.long	0x3eeea000
-	.long	0x3ef8a000
-	.long	0x3f013000
-	.long	0x3f05f000
-	.long	0x3f0aa000
-	.long	0x3f0f4000
-	.long	0x3f13d000
-	.long	0x3f184000
-	.long	0x3f1ca000
-	.long	0x3f20f000
-	.long	0x3f252000
-	.long	0x3f295000
-	.long	0x3f2d7000
-	/* Log_tbl_L */
+	/* Leave this at front so we can potentially save space due to
+	   smaller alignment constraint.  */
+	.align	4
+    /* AbsMask */
+	.long	0x7fffffff
+	.align	64
+__svml_satanh_data_internal_avx512_al64:
+	/* One */
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	/* AddB5 */
+	.align	64
+	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
+	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
+	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
+	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
+	/* RcpBitMask */
+	.align	64
+	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
+	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
+	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
+	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
+	/* Log_tbl_L_lo */
 	.align	64
 	.long	0x00000000
 	.long	0x3726c39e
@@ -338,6 +315,8 @@ __svml_satanh_data_internal_avx512:
 	.long	0x38dedfac
 	.long	0x38ebfb5e
 	.long	0xb8e63c9f
+	/* Log_tbl_L_hi */
+	.align	64
 	.long	0xb85c1340
 	.long	0x38777bcd
 	.long	0xb6038656
@@ -354,39 +333,74 @@ __svml_satanh_data_internal_avx512:
 	.long	0x38f85db0
 	.long	0x37b4996f
 	.long	0xb8bfb3ca
-	/* One */
+	/* Log_tbl_H_lo */
 	.align	64
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	/* AbsMask */
+	.long	0x00000000
+	.long	0x3cfc0000
+	.long	0x3d780000
+	.long	0x3db78000
+	.long	0x3df10000
+	.long	0x3e14c000
+	.long	0x3e300000
+	.long	0x3e4a8000
+	.long	0x3e648000
+	.long	0x3e7dc000
+	.long	0x3e8b4000
+	.long	0x3e974000
+	.long	0x3ea30000
+	.long	0x3eae8000
+	.long	0x3eb9c000
+	.long	0x3ec4e000
+	/* Log_tbl_H_hi */
 	.align	64
-	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
-	/* AddB5 */
+	.long	0x3ecfa000
+	.long	0x3eda2000
+	.long	0x3ee48000
+	.long	0x3eeea000
+	.long	0x3ef8a000
+	.long	0x3f013000
+	.long	0x3f05f000
+	.long	0x3f0aa000
+	.long	0x3f0f4000
+	.long	0x3f13d000
+	.long	0x3f184000
+	.long	0x3f1ca000
+	.long	0x3f20f000
+	.long	0x3f252000
+	.long	0x3f295000
+	.long	0x3f2d7000
+	/* L2H = log(2)_high */
 	.align	64
-	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000
-	/* RcpBitMask */
+	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
+	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
+	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
+	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
+	/* L2L = log(2)_low */
 	.align	64
-	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
+	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
+	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
+	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
+	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
 	/* poly_coeff3 */
 	.align	64
-	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
+	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
+	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
+	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
+	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
 	/* poly_coeff2 */
 	.align	64
-	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
+	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
+	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
+	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
+	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
 	/* poly_coeff1 */
 	.align	64
-	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
-	/* poly_coeff0 */
-	.align	64
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	/* Half */
-	.align	64
-	.long	0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
-	/* L2H = log(2)_high */
-	.align	64
-	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
-	/* L2L = log(2)_low */
-	.align	64
-	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
 	.align	64
+	.type	__svml_satanh_data_internal_avx512_al64, @object
+	.size	__svml_satanh_data_internal_avx512_al64, .-__svml_satanh_data_internal_avx512_al64
 	.type	__svml_satanh_data_internal_avx512, @object
 	.size	__svml_satanh_data_internal_avx512, .-__svml_satanh_data_internal_avx512
-- 
2.34.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH v2 2/7] x86: Improvement svml_s_atanhf8_core_avx2.S
  2022-06-09  0:05 ` [PATCH v2 1/7] x86: Improve svml_s_atanhf16_core_avx512.S Noah Goldstein
@ 2022-06-09  0:05   ` Noah Goldstein
  2022-06-09 16:01     ` H.J. Lu
  2022-06-09  0:05   ` [PATCH v2 3/7] x86: Improve svml_s_atanhf4_core_sse4.S Noah Goldstein
                     ` (5 subsequent siblings)
  6 siblings, 1 reply; 48+ messages in thread
From: Noah Goldstein @ 2022-06-09  0:05 UTC (permalink / raw)
  To: libc-alpha

Improvements are:
    1. Reduce code size (-60 bytes).
    2. Remove redundant move instructions.
    3. Slightly improve instruction selection/scheduling where
       possible.
    4. Prefer registers which get short instruction encoding.
    5. Shrink rodata usage (-32 bytes).

The throughput improvement is not that significant (3-5%) as the
port 0 bottleneck is unavoidable.

       Function, New Time, Old Time, New / Old
_ZGVdN8v_atanhf,    2.799,    2.923,     0.958
---
 .../fpu/multiarch/svml_s_atanhf8_core_avx2.S  | 405 +++++++++---------
 1 file changed, 202 insertions(+), 203 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
index c1ea1c3353..6113d366c2 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
@@ -30,305 +30,304 @@
  *
  */
 
-/* Offsets for data table __svml_satanh_data_internal
- */
+/* Offsets for data table __svml_satanh_data_internal. Ordered
+   by use in the function. On cold-starts this might help the
+   prefetcher. Possibly a better idea is to interleave start/end so
+   that the prefetcher is less likely to detect a stream and pull
+   irrelevant lines into cache.  */
 #define SgnMask				0
 #define sOne				32
-#define sPoly				64
-#define iBrkValue			320
-#define iOffExpoMask			352
-#define sHalf				384
-#define sSign				416
-#define sTopMask12			448
-#define TinyRange			480
-#define sLn2				512
+#define sTopMask12			64
+#define TinyRange			96
+#define iBrkValue			128
+#define iOffExpoMask			160
+#define sPoly				192
+#define sLn2				448
+#define sHalf				480
 
 #include <sysdep.h>
+#define ATANHF_DATA(x)			((x)+__svml_satanh_data_internal)
 
 	.section .text.avx2, "ax", @progbits
 ENTRY(_ZGVdN8v_atanhf_avx2)
-	pushq	%rbp
-	cfi_def_cfa_offset(16)
-	movq	%rsp, %rbp
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
-	andq	$-32, %rsp
-	subq	$96, %rsp
-
+	/* Strip off the sign, so treat X as positive until right at the end */
+	vmovaps	ATANHF_DATA(SgnMask)(%rip), %ymm2
+	vandps	%ymm2, %ymm0, %ymm3
 	/* Load constants including One = 1 */
-	vmovups	sOne+__svml_satanh_data_internal(%rip), %ymm5
-	vmovups	sTopMask12+__svml_satanh_data_internal(%rip), %ymm13
-	vmovaps	%ymm0, %ymm6
+	vmovups	ATANHF_DATA(sOne)(%rip), %ymm5
+	vsubps	%ymm3, %ymm5, %ymm1
+	vmovups	ATANHF_DATA(sTopMask12)(%rip), %ymm4
 
-	/* Strip off the sign, so treat X as positive until right at the end */
-	vandps	SgnMask+__svml_satanh_data_internal(%rip), %ymm6, %ymm10
-	vsubps	%ymm10, %ymm5, %ymm1
+	vrcpps	%ymm1, %ymm7
+	vsubps	%ymm1, %ymm5, %ymm9
+	vandps	%ymm4, %ymm7, %ymm6
+	vsubps	%ymm3, %ymm9, %ymm7
 
-	/*
-	 * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
-	 * the upper part UHi being <= 12 bits long. Then we have
-	 * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
-	 */
-	vaddps	%ymm10, %ymm10, %ymm14
+	/* No need to split sU when FMA is available */
+	vfnmadd213ps %ymm5, %ymm6, %ymm1
+	vmovaps	%ymm0, %ymm8
+	vfmadd213ps %ymm0, %ymm0, %ymm0
+	vfnmadd231ps %ymm6, %ymm7, %ymm1
 
 	/*
 	 * Check whether |X| < 1, in which case we use the main function.
 	 * Otherwise set the rangemask so that the callout will get used.
 	 * Note that this will also use the callout for NaNs since not(NaN < 1).
 	 */
-	vcmpnlt_uqps %ymm5, %ymm10, %ymm7
-	vsubps	%ymm1, %ymm5, %ymm9
-	vcmplt_oqps TinyRange+__svml_satanh_data_internal(%rip), %ymm10, %ymm4
-	vrcpps	%ymm1, %ymm11
-	vsubps	%ymm10, %ymm9, %ymm12
-	vandps	%ymm13, %ymm11, %ymm0
+	vcmpnlt_uqps %ymm5, %ymm3, %ymm14
+	vcmplt_oqps ATANHF_DATA(TinyRange)(%rip), %ymm3, %ymm15
 
-	/* No need to split sU when FMA is available */
-	vfnmadd213ps %ymm5, %ymm0, %ymm1
-	vmovaps	%ymm6, %ymm8
-	vfmadd213ps %ymm6, %ymm6, %ymm8
-	vfnmadd231ps %ymm0, %ymm12, %ymm1
+	/*
+	 * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
+	 * the upper part UHi being <= 12 bits long. Then we have
+	 * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
+	 */
+	vaddps	%ymm3, %ymm3, %ymm3
 
 	/*
 	 * Split V as well into upper 12 bits and lower part, so that we can get
 	 * a preliminary quotient estimate without rounding error.
 	 */
-	vandps	%ymm13, %ymm14, %ymm15
-	vmovmskps %ymm7, %edx
-	vsubps	%ymm15, %ymm14, %ymm7
+	vandps	%ymm4, %ymm3, %ymm4
+	vsubps	%ymm4, %ymm3, %ymm7
 
 	/* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
-	vmulps	%ymm15, %ymm0, %ymm10
+	vmulps	%ymm4, %ymm6, %ymm4
 
 	/* Compute D = E + E^2 */
 	vfmadd213ps %ymm1, %ymm1, %ymm1
 
-	/* Record the sign for eventual reincorporation. */
-	vandps	sSign+__svml_satanh_data_internal(%rip), %ymm6, %ymm3
+	/* Record the sign for eventual reincorporation.  */
+	vandnps	%ymm8, %ymm2, %ymm3
 
 	/* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
-	vorps	%ymm3, %ymm8, %ymm2
-	vmulps	%ymm7, %ymm0, %ymm8
+	vorps	%ymm3, %ymm0, %ymm13
+	vmulps	%ymm7, %ymm6, %ymm2
 
 	/*
 	 * Compute R * (VHi + VLo) * (1 + E + E^2)
 	 * = R *  (VHi + VLo) * (1 + D)
 	 * = QHi + (QHi * D + QLo + QLo * D)
 	 */
-	vmulps	%ymm1, %ymm10, %ymm9
-	vfmadd213ps %ymm8, %ymm8, %ymm1
-	vaddps	%ymm1, %ymm9, %ymm1
 
-	/* reduction: compute r, n */
-	vmovups	iBrkValue+__svml_satanh_data_internal(%rip), %ymm9
+	/*
+	 * If less precision is acceptable the `vmulps %ymm1, %ymm4, %ymm6;
+	 * vaddps %ymm1, %ymm6, %ymm1` can be replaced with
+	 * `vfmadd231ps %ymm1, %ymm4, %ymm4`.
+	 */
+	vmulps	%ymm1, %ymm4, %ymm6
+	vfmadd213ps %ymm2, %ymm2, %ymm1
+	vaddps	%ymm1, %ymm6, %ymm1
 
 	/*
 	 * Now finally accumulate the high and low parts of the
 	 * argument to log1p, H + L, with a final compensated summation.
 	 */
-	vaddps	%ymm1, %ymm10, %ymm12
-	vsubps	%ymm12, %ymm10, %ymm11
+	vaddps	%ymm1, %ymm4, %ymm2
+
+	/* reduction: compute r, n */
+	vmovups	ATANHF_DATA(iBrkValue)(%rip), %ymm9
 
 	/*
 	 * Now we feed into the log1p code, using H in place of _VARG1 and
 	 * later incorporating L into the reduced argument.
 	 * compute 1+x as high, low parts
 	 */
-	vmaxps	%ymm12, %ymm5, %ymm13
-	vminps	%ymm12, %ymm5, %ymm14
-	vaddps	%ymm11, %ymm1, %ymm0
-	vaddps	%ymm14, %ymm13, %ymm1
-	vpsubd	%ymm9, %ymm1, %ymm7
-	vsubps	%ymm1, %ymm13, %ymm15
-	vpsrad	$23, %ymm7, %ymm10
-	vpand	iOffExpoMask+__svml_satanh_data_internal(%rip), %ymm7, %ymm8
-	vaddps	%ymm15, %ymm14, %ymm13
-	vpslld	$23, %ymm10, %ymm11
-	vpaddd	%ymm9, %ymm8, %ymm15
-	vaddps	%ymm13, %ymm0, %ymm14
-	vcvtdq2ps %ymm10, %ymm0
-	vpsubd	%ymm11, %ymm5, %ymm12
+	vmaxps	%ymm2, %ymm5, %ymm0
+	vminps	%ymm2, %ymm5, %ymm6
+
+	/* This is needed for rounding (see `vaddps %ymm1, %ymm4, %ymm2`).  */
+	vsubps	%ymm2, %ymm4, %ymm2
+	vaddps	%ymm6, %ymm0, %ymm4
+	vpsubd	%ymm9, %ymm4, %ymm7
+	vsubps	%ymm4, %ymm0, %ymm4
+	vaddps	%ymm2, %ymm1, %ymm2
+	vmovaps	ATANHF_DATA(iOffExpoMask)(%rip), %ymm1
+
+	vandps	%ymm1, %ymm7, %ymm0
+	vaddps	%ymm4, %ymm6, %ymm4
+	vandnps	%ymm7, %ymm1, %ymm6
+	vmovups	ATANHF_DATA(sPoly+0)(%rip), %ymm1
+	vpaddd	%ymm9, %ymm0, %ymm0
+	vaddps	%ymm4, %ymm2, %ymm4
+	vpsubd	%ymm6, %ymm5, %ymm6
 
 	/* polynomial evaluation */
-	vsubps	%ymm5, %ymm15, %ymm5
-	vmulps	%ymm14, %ymm12, %ymm1
-	vaddps	%ymm5, %ymm1, %ymm5
-	vmovups	sPoly+224+__svml_satanh_data_internal(%rip), %ymm1
-	vfmadd213ps sPoly+192+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-	vfmadd213ps sPoly+160+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-	vfmadd213ps sPoly+128+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-	vfmadd213ps sPoly+96+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-	vfmadd213ps sPoly+64+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-	vfmadd213ps sPoly+32+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-	vfmadd213ps sPoly+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-	vmulps	%ymm1, %ymm5, %ymm7
-	vfmadd213ps %ymm5, %ymm5, %ymm7
+	vsubps	%ymm5, %ymm0, %ymm2
+	vfmadd231ps %ymm4, %ymm6, %ymm2
+	vfmadd213ps ATANHF_DATA(sPoly+32)(%rip), %ymm2, %ymm1
+	vfmadd213ps ATANHF_DATA(sPoly+64)(%rip), %ymm2, %ymm1
+	vfmadd213ps ATANHF_DATA(sPoly+96)(%rip), %ymm2, %ymm1
+	vfmadd213ps ATANHF_DATA(sPoly+128)(%rip), %ymm2, %ymm1
+	vfmadd213ps ATANHF_DATA(sPoly+160)(%rip), %ymm2, %ymm1
+	vfmadd213ps ATANHF_DATA(sPoly+192)(%rip), %ymm2, %ymm1
+	vfmadd213ps ATANHF_DATA(sPoly+224)(%rip), %ymm2, %ymm1
+
+	vmulps	%ymm1, %ymm2, %ymm1
+	vfmadd213ps %ymm2, %ymm2, %ymm1
 
 	/* final reconstruction */
-	vfmadd132ps sLn2+__svml_satanh_data_internal(%rip), %ymm7, %ymm0
+	vpsrad	$23, %ymm7, %ymm6
+	vcvtdq2ps %ymm6, %ymm2
+	vfmadd132ps ATANHF_DATA(sLn2)(%rip), %ymm1, %ymm2
 
 	/* Finally, halve the result and reincorporate the sign */
-	vxorps	sHalf+__svml_satanh_data_internal(%rip), %ymm3, %ymm3
-	vmulps	%ymm0, %ymm3, %ymm0
-	vblendvps %ymm4, %ymm2, %ymm0, %ymm0
+	vxorps	ATANHF_DATA(sHalf)(%rip), %ymm3, %ymm3
+	vmulps	%ymm2, %ymm3, %ymm2
+	vmovmskps %ymm14, %edx
 	testl	%edx, %edx
 
+	vblendvps %ymm15, %ymm13, %ymm2, %ymm0
 	/* Go to special inputs processing branch */
 	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx r12 r13 r14 r15 edx ymm0 ymm6
-
-	/* Restore registers
-	 * and exit the function
-	 */
-
-L(EXIT):
-	movq	%rbp, %rsp
-	popq	%rbp
-	cfi_def_cfa(7, 8)
-	cfi_restore(6)
+	# LOE rbx rdx r12 r13 r14 r15 ymm0
+	/* No registers to restore on fast path.  */
 	ret
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
 
-	/* Branch to process
-	 * special inputs
-	 */
 
+	/* Cold case. edx has 1s where there was a special value that
+	   needs to be handled by an atanhf call. Optimize for code size
+	   more so than speed here. */
 L(SPECIAL_VALUES_BRANCH):
-	vmovups	%ymm6, 32(%rsp)
-	vmovups	%ymm0, 64(%rsp)
-	# LOE rbx r12 r13 r14 r15 edx ymm0
-
-	xorl	%eax, %eax
-	# LOE rbx r12 r13 r14 r15 eax edx
-
-	vzeroupper
-	movq	%r12, 16(%rsp)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
-	movl	%eax, %r12d
-	movq	%r13, 8(%rsp)
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
-	movl	%edx, %r13d
-	movq	%r14, (%rsp)
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r15 r12d r13d
-
-	/* Range mask
-	 * bits check
+	# LOE rbx rdx r12 r13 r14 r15 ymm0 ymm8
+    /* Use r13 to save/restore the stack. This allows us to use rbp as
+       callee save register saving code size. */
+	pushq	%r13
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(r13, -16)
+	/* Need to callee save registers to preserve state across atanhf calls.
 	 */
+	pushq	%rbx
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(rbx, -24)
+	pushq	%rbp
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(rbp, -32)
+	movq	%rsp, %r13
+	cfi_def_cfa_register(r13)
 
-L(RANGEMASK_CHECK):
-	btl	%r12d, %r13d
+	/* Align stack and make room for 2x ymm vectors.  */
+	andq	$-32, %rsp
+	addq	$-64, %rsp
 
-	/* Call scalar math function */
-	jc	L(SCALAR_MATH_CALL)
-	# LOE rbx r15 r12d r13d
+	/* Save all already computed inputs.  */
+	vmovups	%ymm0, (%rsp)
+	/* Save original input (ymm8 unchanged up to this point).  */
+	vmovups	%ymm8, 32(%rsp)
 
-	/* Special inputs
-	 * processing loop
-	 */
+	vzeroupper
 
+	/* edx has 1s where there was a special value that needs to be handled
+	   by an atanhf call.  */
+	movl	%edx, %ebx
 L(SPECIAL_VALUES_LOOP):
-	incl	%r12d
-	cmpl	$8, %r12d
-
-	/* Check bits in range mask */
-	jl	L(RANGEMASK_CHECK)
-	# LOE rbx r15 r12d r13d
-
-	movq	16(%rsp), %r12
-	cfi_restore(12)
-	movq	8(%rsp), %r13
-	cfi_restore(13)
-	movq	(%rsp), %r14
-	cfi_restore(14)
-	vmovups	64(%rsp), %ymm0
-
-	/* Go to exit */
-	jmp	L(EXIT)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r12 r13 r14 r15 ymm0
-
-	/* Scalar math fucntion call
-	 * to process special input
-	 */
-
-L(SCALAR_MATH_CALL):
-	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	# LOE rbx rbp r12 r13 r14 r15
+	/* use rbp as index for special value that is saved across calls to
+	   atanhf. We technically don't need a callee save register here as offset
+	   to rsp is always [0, 28] so we can restore rsp by realigning to 64.
+	   Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
+	   in the loop. Realigning also costs more code size.  */
+	xorl	%ebp, %ebp
+	tzcntl	%ebx, %ebp
+
+	/* Scalar math function call to process special input.  */
+	movss	32(%rsp, %rbp, 4), %xmm0
 	call	atanhf@PLT
-	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	/* No good way to avoid the store-forwarding fault this will cause on
+	   return. `lfence` avoids the SF fault but at greater cost as it
+	   serializes stack/callee save restoration.  */
+	movss	%xmm0, (%rsp, %rbp, 4)
+
+	blsrl   %ebx, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+	# LOE r12 r13 r14 r15
+
 
-	/* Process special inputs in loop */
-	jmp	L(SPECIAL_VALUES_LOOP)
-	# LOE rbx r15 r12d r13d
+	/* All results have been written to (%rsp).  */
+	vmovups	(%rsp), %ymm0
+	/* Restore rsp.  */
+	movq	%r13, %rsp
+	cfi_def_cfa_register(rsp)
+	/* Restore callee save registers.  */
+	popq	%rbp
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(rbp)
+	popq	%rbx
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(rbx)
+	popq	%r13
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(r13)
+	ret
 END(_ZGVdN8v_atanhf_avx2)
 
 	.section .rodata, "a"
 	.align	32
-
 #ifdef __svml_satanh_data_internal_typedef
 typedef unsigned int VUINT32;
-typedef struct {
+typedef struct{
 	__declspec(align(32)) VUINT32 SgnMask[8][1];
 	__declspec(align(32)) VUINT32 sOne[8][1];
-	__declspec(align(32)) VUINT32 sPoly[8][8][1];
-	__declspec(align(32)) VUINT32 iBrkValue[8][1];
-	__declspec(align(32)) VUINT32 iOffExpoMask[8][1];
-	__declspec(align(32)) VUINT32 sHalf[8][1];
-	__declspec(align(32)) VUINT32 sSign[8][1];
 	__declspec(align(32)) VUINT32 sTopMask12[8][1];
 	__declspec(align(32)) VUINT32 TinyRange[8][1];
+	__declspec(align(32)) VUINT32 iBrkValue[8][1];
+	__declspec(align(32)) VUINT32 iOffExpoMask[8][1];
+	__declspec(align(32)) VUINT32 sPoly[8][8][1];
 	__declspec(align(32)) VUINT32 sLn2[8][1];
+	__declspec(align(32)) VUINT32 sHalf[8][1];
 } __svml_satanh_data_internal;
 #endif
 __svml_satanh_data_internal:
 	/* SgnMask */
-	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
 	/* sOne = SP 1.0 */
 	.align	32
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	/* sPoly[] = SP polynomial */
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	/* sTopMask12 */
+	.align	32
+	.long	0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
+	.long	0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
+	/* TinyRange */
 	.align	32
-	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
-	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
-	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
-	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
-	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
-	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
-	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
-	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
+	.long	0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
+	.long	0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
 	/* iBrkValue = SP 2/3 */
 	.align	32
-	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
+	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
+	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
 	/* iOffExpoMask = SP significand mask */
 	.align	32
-	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
-	/* sHalf */
-	.align	32
-	.long	0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
-	/* sSign */
-	.align	32
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
-	/* sTopMask12 */
-	.align	32
-	.long	0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
-	/* TinyRange */
+	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
+	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
+	/* sPoly[] = SP polynomial */
 	.align	32
-	.long	0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
+	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed
+	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
+	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3
+	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
+	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12
+	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
+	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37
+	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
+	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190
+	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
+	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e
+	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
+	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94
+	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
 	/* sLn2 = SP ln(2) */
 	.align	32
-	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
+	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
+	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
+	/* sHalf */
+	.align	32
+	.long	0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
+	.long	0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
 	.align	32
 	.type	__svml_satanh_data_internal, @object
 	.size	__svml_satanh_data_internal, .-__svml_satanh_data_internal
-- 
2.34.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH v2 3/7] x86: Improve svml_s_atanhf4_core_sse4.S
  2022-06-09  0:05 ` [PATCH v2 1/7] x86: Improve svml_s_atanhf16_core_avx512.S Noah Goldstein
  2022-06-09  0:05   ` [PATCH v2 2/7] x86: Improvement svml_s_atanhf8_core_avx2.S Noah Goldstein
@ 2022-06-09  0:05   ` Noah Goldstein
  2022-06-09 16:03     ` H.J. Lu
  2022-06-09  0:05   ` [PATCH v2 4/7] x86: Optimize svml_s_tanhf16_core_avx512.S Noah Goldstein
                     ` (4 subsequent siblings)
  6 siblings, 1 reply; 48+ messages in thread
From: Noah Goldstein @ 2022-06-09  0:05 UTC (permalink / raw)
  To: libc-alpha

Improvements are:
    1. Reduce code size (-62 bytes).
    2. Remove redundant move instructions.
    3. Slightly improve instruction selection/scheduling where
       possible.
    4. Prefer registers which get short instruction encoding.
    5. Reduce rodata usage (-16 bytes).

The throughput improvement is not significant as the port 0 bottleneck
is unavoidable.

       Function, New Time, Old Time, New / Old
_ZGVbN4v_atanhf,    8.821,    8.903,     0.991
---
 .../fpu/multiarch/svml_s_atanhf4_core_sse4.S  | 378 ++++++++----------
 1 file changed, 169 insertions(+), 209 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
index 2d3ad2617f..e6683785fb 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
@@ -30,96 +30,80 @@
  *
  */
 
-/* Offsets for data table __svml_satanh_data_internal
- */
-#define SgnMask				0
-#define sOne				16
-#define sPoly				32
-#define iBrkValue			160
-#define iOffExpoMask			176
-#define sHalf				192
-#define sSign				208
-#define sTopMask12			224
-#define TinyRange			240
-#define sLn2				256
+/* Offsets for data table __svml_satanh_data_internal. Ordered
+   by use in the function. On cold-starts this might help the
+   prefetcher. Possibly a better idea is to interleave start/end so
+   that the prefetcher is less likely to detect a stream and pull
+   irrelevant lines into cache.  */
+#define sOne				0
+#define SgnMask				16
+#define sTopMask12			32
+#define iBrkValue			48
+#define iOffExpoMask			64
+#define sPoly				80
+#define sLn2				208
+#define TinyRange			224
 
 #include <sysdep.h>
+#define ATANHF_DATA(x)			((x)+__svml_satanh_data_internal)
 
 	.section .text.sse4, "ax", @progbits
 ENTRY(_ZGVbN4v_atanhf_sse4)
-	subq	$72, %rsp
-	cfi_def_cfa_offset(80)
 	movaps	%xmm0, %xmm5
 
 	/* Load constants including One = 1 */
-	movups	sOne+__svml_satanh_data_internal(%rip), %xmm4
+	movups	ATANHF_DATA(sOne)(%rip), %xmm4
 	movaps	%xmm5, %xmm3
 
 	/* Strip off the sign, so treat X as positive until right at the end */
-	movups	SgnMask+__svml_satanh_data_internal(%rip), %xmm7
-	movaps	%xmm4, %xmm8
-	andps	%xmm5, %xmm7
+	movups	ATANHF_DATA(SgnMask)(%rip), %xmm1
+	movaps	%xmm4, %xmm2
+	andps	%xmm1, %xmm0
 	movaps	%xmm4, %xmm10
-	movups	sTopMask12+__svml_satanh_data_internal(%rip), %xmm11
+	movups	ATANHF_DATA(sTopMask12)(%rip), %xmm11
 	movaps	%xmm4, %xmm14
 	movaps	%xmm11, %xmm9
 
+
 	/*
 	 * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
 	 * the upper part UHi being <= 12 bits long. Then we have
 	 * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
 	 */
-	movaps	%xmm7, %xmm12
+	movaps	%xmm0, %xmm6
+	mulps	%xmm5, %xmm3
+	subps	%xmm0, %xmm2
+	addps	%xmm0, %xmm6
+	subps	%xmm2, %xmm10
+	addps	%xmm5, %xmm3
+	subps	%xmm0, %xmm10
+	andps	%xmm2, %xmm9
+
 
 	/*
 	 * Check whether |X| < 1, in which case we use the main function.
 	 * Otherwise set the rangemask so that the callout will get used.
 	 * Note that this will also use the callout for NaNs since not(NaN < 1).
 	 */
-	movaps	%xmm7, %xmm6
-	movaps	%xmm7, %xmm2
-	cmpnltps %xmm4, %xmm6
-	cmpltps	TinyRange+__svml_satanh_data_internal(%rip), %xmm2
-	mulps	%xmm5, %xmm3
-	subps	%xmm7, %xmm8
-	addps	%xmm7, %xmm12
-	movmskps %xmm6, %edx
-	subps	%xmm8, %xmm10
-	addps	%xmm5, %xmm3
-	subps	%xmm7, %xmm10
-	andps	%xmm8, %xmm9
+	rcpps	%xmm9, %xmm7
+	subps	%xmm9, %xmm2
+	andps	%xmm11, %xmm7
 
-	/*
-	 * Now we feed into the log1p code, using H in place of _VARG1 and
-	 * later incorporating L into the reduced argument.
-	 * compute 1+x as high, low parts
-	 */
-	movaps	%xmm4, %xmm7
-
-	/*
-	 * Now compute R = 1/(UHi+ULo) * (1 - E) and the error term E
-	 * The first FMR is exact (we force R to 12 bits just in case it
-	 * isn't already, to make absolutely sure), and since E is ~ 2^-12,
-	 * the rounding error in the other one is acceptable.
-	 */
-	rcpps	%xmm9, %xmm15
-	subps	%xmm9, %xmm8
-	andps	%xmm11, %xmm15
 
 	/*
 	 * Split V as well into upper 12 bits and lower part, so that we can get
 	 * a preliminary quotient estimate without rounding error.
 	 */
-	andps	%xmm12, %xmm11
-	mulps	%xmm15, %xmm9
-	addps	%xmm8, %xmm10
-	subps	%xmm11, %xmm12
+	andps	%xmm6, %xmm11
+	mulps	%xmm7, %xmm9
+	addps	%xmm2, %xmm10
+	subps	%xmm11, %xmm6
 
 	/* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
-	mulps	%xmm15, %xmm11
-	mulps	%xmm15, %xmm10
+	mulps	%xmm7, %xmm11
+	mulps	%xmm7, %xmm10
 	subps	%xmm9, %xmm14
-	mulps	%xmm12, %xmm15
+	mulps	%xmm6, %xmm7
 	subps	%xmm10, %xmm14
 
 	/* Compute D = E + E^2 */
@@ -127,8 +111,8 @@ ENTRY(_ZGVbN4v_atanhf_sse4)
 	movaps	%xmm4, %xmm8
 	mulps	%xmm14, %xmm13
 
-	/* reduction: compute r, n */
-	movdqu	iBrkValue+__svml_satanh_data_internal(%rip), %xmm9
+	/* reduction: compute r,n */
+	movdqu	ATANHF_DATA(iBrkValue)(%rip), %xmm9
 	addps	%xmm13, %xmm14
 
 	/*
@@ -136,168 +120,149 @@ ENTRY(_ZGVbN4v_atanhf_sse4)
 	 * = R *  (VHi + VLo) * (1 + D)
 	 * = QHi + (QHi * D + QLo + QLo * D)
 	 */
-	movaps	%xmm14, %xmm0
-	mulps	%xmm15, %xmm14
-	mulps	%xmm11, %xmm0
-	addps	%xmm14, %xmm15
-	movdqu	iOffExpoMask+__svml_satanh_data_internal(%rip), %xmm12
+	movaps	%xmm14, %xmm2
+	mulps	%xmm7, %xmm14
+	mulps	%xmm11, %xmm2
+	addps	%xmm14, %xmm7
+	movdqu	ATANHF_DATA(iOffExpoMask)(%rip), %xmm12
 	movaps	%xmm4, %xmm14
 
 	/* Record the sign for eventual reincorporation. */
-	movups	sSign+__svml_satanh_data_internal(%rip), %xmm1
-	addps	%xmm15, %xmm0
+	addps	%xmm7, %xmm2
+
 
 	/*
 	 * Now finally accumulate the high and low parts of the
 	 * argument to log1p, H + L, with a final compensated summation.
 	 */
-	movaps	%xmm0, %xmm6
-	andps	%xmm5, %xmm1
-
+	movaps	%xmm2, %xmm6
+	andnps	%xmm5, %xmm1
+	movaps	%xmm4, %xmm7
 	/* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
-	orps	%xmm1, %xmm3
 	addps	%xmm11, %xmm6
 	maxps	%xmm6, %xmm7
 	minps	%xmm6, %xmm8
 	subps	%xmm6, %xmm11
 	movaps	%xmm7, %xmm10
-	andps	%xmm2, %xmm3
 	addps	%xmm8, %xmm10
-	addps	%xmm11, %xmm0
+	addps	%xmm11, %xmm2
 	subps	%xmm10, %xmm7
 	psubd	%xmm9, %xmm10
-	addps	%xmm7, %xmm8
+	addps	%xmm8, %xmm7
 	pand	%xmm10, %xmm12
 	psrad	$23, %xmm10
 	cvtdq2ps %xmm10, %xmm13
-	addps	%xmm8, %xmm0
+	addps	%xmm7, %xmm2
 
 	/* final reconstruction */
-	mulps	sLn2+__svml_satanh_data_internal(%rip), %xmm13
 	pslld	$23, %xmm10
 	paddd	%xmm9, %xmm12
 	psubd	%xmm10, %xmm14
 
 	/* polynomial evaluation */
 	subps	%xmm4, %xmm12
-	mulps	%xmm0, %xmm14
-	movups	sPoly+112+__svml_satanh_data_internal(%rip), %xmm0
-	addps	%xmm12, %xmm14
-	mulps	%xmm14, %xmm0
+	mulps	%xmm14, %xmm2
+	movups	ATANHF_DATA(sPoly+0)(%rip), %xmm7
+	addps	%xmm12, %xmm2
+	mulps	%xmm2, %xmm7
+
 
 	/* Finally, halve the result and reincorporate the sign */
-	movups	sHalf+__svml_satanh_data_internal(%rip), %xmm4
-	pxor	%xmm1, %xmm4
-	addps	sPoly+96+__svml_satanh_data_internal(%rip), %xmm0
-	mulps	%xmm14, %xmm0
-	addps	sPoly+80+__svml_satanh_data_internal(%rip), %xmm0
-	mulps	%xmm14, %xmm0
-	addps	sPoly+64+__svml_satanh_data_internal(%rip), %xmm0
-	mulps	%xmm14, %xmm0
-	addps	sPoly+48+__svml_satanh_data_internal(%rip), %xmm0
-	mulps	%xmm14, %xmm0
-	addps	sPoly+32+__svml_satanh_data_internal(%rip), %xmm0
-	mulps	%xmm14, %xmm0
-	addps	sPoly+16+__svml_satanh_data_internal(%rip), %xmm0
-	mulps	%xmm14, %xmm0
-	addps	sPoly+__svml_satanh_data_internal(%rip), %xmm0
-	mulps	%xmm14, %xmm0
-	mulps	%xmm14, %xmm0
-	addps	%xmm0, %xmm14
-	movaps	%xmm2, %xmm0
-	addps	%xmm13, %xmm14
-	mulps	%xmm14, %xmm4
-	andnps	%xmm4, %xmm0
-	orps	%xmm3, %xmm0
-	testl	%edx, %edx
+	addps	ATANHF_DATA(sPoly+16)(%rip), %xmm7
+	mulps	%xmm2, %xmm7
+	addps	ATANHF_DATA(sPoly+32)(%rip), %xmm7
+	mulps	%xmm2, %xmm7
+	addps	ATANHF_DATA(sPoly+48)(%rip), %xmm7
+	mulps	%xmm2, %xmm7
+	addps	ATANHF_DATA(sPoly+64)(%rip), %xmm7
+	mulps	%xmm2, %xmm7
+	addps	ATANHF_DATA(sPoly+80)(%rip), %xmm7
+	mulps	%xmm2, %xmm7
+	addps	ATANHF_DATA(sPoly+96)(%rip), %xmm7
+	mulps	%xmm2, %xmm7
+	movaps	ATANHF_DATA(sPoly+112)(%rip), %xmm6
+	addps	%xmm6, %xmm7
+	mulps	%xmm2, %xmm7
+	mulps	%xmm2, %xmm7
+	mulps	ATANHF_DATA(sLn2)(%rip), %xmm13
+	/* We can build `sHalf` with `sPoly & sOne`.  */
+	andps	%xmm4, %xmm6
+	orps	%xmm1, %xmm3
+	xorps	%xmm6, %xmm1
 
-	/* Go to special inputs processing branch */
-	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm5
+	addps	%xmm2, %xmm7
+	addps	%xmm13, %xmm7
+	mulps	%xmm7, %xmm1
 
-	/* Restore registers
-	 * and exit the function
-	 */
+	/* Finish check of NaNs.  */
+	cmpleps	%xmm0, %xmm4
+	movmskps %xmm4, %edx
+	cmpltps	ATANHF_DATA(TinyRange)(%rip), %xmm0
 
-L(EXIT):
-	addq	$72, %rsp
-	cfi_def_cfa_offset(8)
+	andps	%xmm0, %xmm3
+	andnps	%xmm1, %xmm0
+	orps	%xmm3, %xmm0
+
+	testl	%edx, %edx
+	/* Go to special inputs processing branch.  */
+	jne	L(SPECIAL_VALUES_BRANCH)
+	# LOE rbx rbp r12 r13 r14 r15 xmm0
+	/* No registers to restore on fast path.  */
 	ret
-	cfi_def_cfa_offset(80)
 
-	/* Branch to process
-	 * special inputs
-	 */
 
+	/* Cold case. edx has 1s where there was a special value that
+	   needs to be handled by an atanhf call. Optimize for code size
+	   more so than speed here. */
 L(SPECIAL_VALUES_BRANCH):
-	movups	%xmm5, 32(%rsp)
-	movups	%xmm0, 48(%rsp)
-	# LOE rbx rbp r12 r13 r14 r15 edx
-
-	xorl	%eax, %eax
-	movq	%r12, 16(%rsp)
-	cfi_offset(12, -64)
-	movl	%eax, %r12d
-	movq	%r13, 8(%rsp)
-	cfi_offset(13, -72)
-	movl	%edx, %r13d
-	movq	%r14, (%rsp)
-	cfi_offset(14, -80)
-	# LOE rbx rbp r15 r12d r13d
-
-	/* Range mask
-	 * bits check
-	 */
-
-L(RANGEMASK_CHECK):
-	btl	%r12d, %r13d
-
-	/* Call scalar math function */
-	jc	L(SCALAR_MATH_CALL)
-	# LOE rbx rbp r15 r12d r13d
-
-	/* Special inputs
-	 * processing loop
-	 */
-
+	# LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm5
+	/* rsp is 8 bytes off 16-byte alignment on entry (return address).
+       Subtracting 56 makes it 16-byte aligned at the call below. */
+	subq	$56, %rsp
+	cfi_def_cfa_offset(64)
+	movups	%xmm0, 24(%rsp)
+	movups	%xmm5, 40(%rsp)
+
+	/* Use rbx/rbp for callee save registers as they get short
+       encoding for many instructions (as compared with r12/r13). */
+	movq	%rbx, (%rsp)
+	cfi_offset(rbx, -64)
+	movq	%rbp, 8(%rsp)
+	cfi_offset(rbp, -56)
+	/* edx has 1s where there was a special value that needs to be handled
+	   by an atanhf call.  */
+	movl	%edx, %ebx
 L(SPECIAL_VALUES_LOOP):
-	incl	%r12d
-	cmpl	$4, %r12d
-
-	/* Check bits in range mask */
-	jl	L(RANGEMASK_CHECK)
-	# LOE rbx rbp r15 r12d r13d
-
-	movq	16(%rsp), %r12
-	cfi_restore(12)
-	movq	8(%rsp), %r13
-	cfi_restore(13)
-	movq	(%rsp), %r14
-	cfi_restore(14)
-	movups	48(%rsp), %xmm0
-
-	/* Go to exit */
-	jmp	L(EXIT)
-	cfi_offset(12, -64)
-	cfi_offset(13, -72)
-	cfi_offset(14, -80)
-	# LOE rbx rbp r12 r13 r14 r15 xmm0
-
-	/* Scalar math fucntion call
-	 * to process special input
-	 */
-
-L(SCALAR_MATH_CALL):
-	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	# LOE rbx rbp r12 r13 r14 r15
+	/* use rbp as index for special value that is saved across calls to
+	   atanhf. We technically don't need a callee save register here as offset
+	   to rsp is always [0, 12] so we can restore rsp by realigning to 64.
+	   Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
+	   in the loop.  */
+	xorl	%ebp, %ebp
+	bsfl	%ebx, %ebp
+
+	/* Scalar math function call to process special input.  */
+	movss	40(%rsp, %rbp, 4), %xmm0
 	call	atanhf@PLT
-	# LOE rbx rbp r14 r15 r12d r13d xmm0
-
-	movss	%xmm0, 48(%rsp, %r14, 4)
-
-	/* Process special inputs in loop */
-	jmp	L(SPECIAL_VALUES_LOOP)
-	# LOE rbx rbp r15 r12d r13d
+	/* No good way to avoid the store-forwarding fault this will cause on
+	   return. `lfence` avoids the SF fault but at greater cost as it
+	   serialized stack/callee save restoration.  */
+	movss	%xmm0, 24(%rsp, %rbp, 4)
+
+	leal	-1(%rbx), %eax
+	andl	%eax, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+	# LOE r12 r13 r14 r15
+	/* All results have been written to 24(%rsp).  */
+	movups	24(%rsp), %xmm0
+	movq	(%rsp), %rbx
+	cfi_restore(rbx)
+	movq	8(%rsp), %rbp
+	cfi_restore(rbp)
+	addq	$56, %rsp
+	cfi_def_cfa_offset(8)
+	ret
 END(_ZGVbN4v_atanhf_sse4)
 
 	.section .rodata, "a"
@@ -305,56 +270,51 @@ END(_ZGVbN4v_atanhf_sse4)
 
 #ifdef __svml_satanh_data_internal_typedef
 typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(16)) VUINT32 SgnMask[4][1];
+typedef struct{
 	__declspec(align(16)) VUINT32 sOne[4][1];
-	__declspec(align(16)) VUINT32 sPoly[8][4][1];
+	__declspec(align(16)) VUINT32 SgnMask[4][1];
+	__declspec(align(16)) VUINT32 sTopMask12[4][1];
 	__declspec(align(16)) VUINT32 iBrkValue[4][1];
 	__declspec(align(16)) VUINT32 iOffExpoMask[4][1];
-	__declspec(align(16)) VUINT32 sHalf[4][1];
-	__declspec(align(16)) VUINT32 sSign[4][1];
-	__declspec(align(16)) VUINT32 sTopMask12[4][1];
-	__declspec(align(16)) VUINT32 TinyRange[4][1];
+	__declspec(align(16)) VUINT32 sPoly[8][4][1];
 	__declspec(align(16)) VUINT32 sLn2[4][1];
+	__declspec(align(16)) VUINT32 TinyRange[4][1];
 } __svml_satanh_data_internal;
 #endif
+
 __svml_satanh_data_internal:
-	/* SgnMask */
-	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
 	/* sOne = SP 1.0 */
 	.align	16
 	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	/* sPoly[] = SP polynomial */
+	/* SgnMask */
+	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+	/* sTopMask12 */
 	.align	16
-	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
-	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
-	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
-	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
-	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
-	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
-	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
-	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
+	.long	0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
 	/* iBrkValue = SP 2/3 */
 	.align	16
 	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
-	/* iOffExpoMask = SP significand mask */
+	/* iOffExpoMask = SP significand mask */
 	.align	16
 	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
-	/* sHalf */
-	.align	16
-	.long	0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
-	/* sSign */
+
+	/* sPoly[] = SP polynomial */
 	.align	16
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000
-	/* sTopMask12 */
+	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
+	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
+	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
+	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
+	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
+	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
+	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
+
+	/* sLn2 = SP ln(2) */
 	.align	16
-	.long	0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
+	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
 	/* TinyRange */
 	.align	16
 	.long	0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
-	/* sLn2 = SP ln(2) */
-	.align	16
-	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
 	.align	16
 	.type	__svml_satanh_data_internal, @object
 	.size	__svml_satanh_data_internal, .-__svml_satanh_data_internal
-- 
2.34.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH v2 4/7] x86: Optimize svml_s_tanhf16_core_avx512.S
  2022-06-09  0:05 ` [PATCH v2 1/7] x86: Improve svml_s_atanhf16_core_avx512.S Noah Goldstein
  2022-06-09  0:05   ` [PATCH v2 2/7] x86: Improvement svml_s_atanhf8_core_avx2.S Noah Goldstein
  2022-06-09  0:05   ` [PATCH v2 3/7] x86: Improve svml_s_atanhf4_core_sse4.S Noah Goldstein
@ 2022-06-09  0:05   ` Noah Goldstein
  2022-06-09 16:04     ` H.J. Lu
  2022-06-09  0:05   ` [PATCH v2 5/7] x86: Add data file that can be shared by tanhf-avx2 and tanhf-sse4 Noah Goldstein
                     ` (3 subsequent siblings)
  6 siblings, 1 reply; 48+ messages in thread
From: Noah Goldstein @ 2022-06-09  0:05 UTC (permalink / raw)
  To: libc-alpha

Optimizations are:
    1. Reduce code size (-67 bytes).
    2. Remove redundant move instructions.
    3. Slightly improve instruction selection/scheduling where
       possible.
    4. Reduce rodata usage (-448 bytes).

Result is roughly a 14% speedup:

       Function, New Time, Old Time, New / Old
_ZGVeN16v_tanhf,    0.649,    0.752,     0.863
---
 .../multiarch/svml_s_tanhf16_core_avx512.S    | 527 ++++++++++--------
 1 file changed, 287 insertions(+), 240 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
index 5b1f9f151c..d55798767c 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
@@ -70,310 +70,357 @@
  *
  */
 
-/* Offsets for data table __svml_stanh_data_internal
+/* Offsets for data table __svml_stanh_data_internal_avx512. Ordered
+   by use in the function. On cold-starts this might help the
+   prefetcher. Possibly a better idea is to interleave start/end so
+   that the prefetcher is less likely to detect a stream and pull
+   irrelevant lines into cache.  */
+
+/* Offsets for data table __svml_stanh_data_internal. 4 bytes each.
  */
-#define _sC				0
-#define _sP0				128
-#define _sP2				256
-#define _sP3				384
-#define _sP4				512
-#define _sP5				640
-#define _sP6				768
-#define _sP7				896
-#define _iExpMantMask_UISA		1024
-#define _iMinIdxOfsMask_UISA		1088
-#define _iMaxIdxMask_UISA		1152
-#define _sSignMask			1216
-#define _sAbsMask			1280
-#define _iExpMantMask			1344
-#define _iExpMask			1408
-#define _iMinIdxOfsMask			1472
-#define _iMaxIdxMask			1536
+#define _iExpMantMask_UISA		0
+#define _iMinIdxOfsMask_UISA		4
+#define _iMaxIdxMask_UISA		8
+#define _iExpMask			12
+
+/* Offsets for data table __svml_stanh_data_internal_al64. 64 bytes
+   each.  */
+#define _sC_lo				0
+#define _sC_hi				64
+#define _sP7_lo				128
+#define _sP7_hi				192
+#define _sSignMask			256
+#define _sP6_lo				320
+#define _sP6_hi				384
+#define _sP5_lo				448
+#define _sP5_hi				512
+#define _sP4_lo				576
+#define _sP4_hi				640
+#define _sP3_lo				704
+#define _sP3_hi				768
+#define _sP2_lo				832
+#define _sP2_hi				896
+#define _sP0_lo				960
+#define _sP0_hi				1024
 
 #include <sysdep.h>
+#define TANHF_DATA(x)			((x)+__svml_stanh_data_internal_al64)
+#define TANHF_DATA_UNALIGNED(x)		((x)+__svml_stanh_data_internal)
 
 	.section .text.exex512, "ax", @progbits
 ENTRY(_ZGVeN16v_tanhf_skx)
-	pushq	%rbp
-	cfi_def_cfa_offset(16)
-	movq	%rsp, %rbp
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
-	andq	$-64, %rsp
-	subq	$192, %rsp
-	vmovaps	%zmm0, %zmm1
-	vmovups	__svml_stanh_data_internal(%rip), %zmm9
-	vmovups	_sP6+__svml_stanh_data_internal(%rip), %zmm11
-	vmovups	_sP5+__svml_stanh_data_internal(%rip), %zmm12
-	vmovups	_sP4+__svml_stanh_data_internal(%rip), %zmm13
-	vmovups	_sP3+__svml_stanh_data_internal(%rip), %zmm14
-	vmovups	_sP2+__svml_stanh_data_internal(%rip), %zmm15
-	vpternlogd $255, %zmm2, %zmm2, %zmm2
-	vandps	_sAbsMask+__svml_stanh_data_internal(%rip), %zmm1, %zmm8
-	vandps	_sSignMask+__svml_stanh_data_internal(%rip), %zmm1, %zmm0
-
 	/* Here huge arguments, INF and NaNs are filtered out to callout. */
-	vpandd	_iExpMantMask_UISA+__svml_stanh_data_internal(%rip), %zmm1, %zmm3
-	vpsubd	_iMinIdxOfsMask_UISA+__svml_stanh_data_internal(%rip), %zmm3, %zmm4
-	vpcmpd	$2, _iExpMask+__svml_stanh_data_internal(%rip), %zmm3, %k1
+	vpandd	TANHF_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
+	vpsubd	TANHF_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
 
-	/*
-	 *  small table specific variables *
-	 *  Constant loading
-	 */
-	vpxord	%zmm5, %zmm5, %zmm5
-
-	/* if VMIN, VMAX is defined for I type */
-	vpmaxsd	%zmm5, %zmm4, %zmm6
-	vpminsd	_iMaxIdxMask_UISA+__svml_stanh_data_internal(%rip), %zmm6, %zmm7
-	vpsrld	$21, %zmm7, %zmm10
-	vmovups	_sP7+__svml_stanh_data_internal(%rip), %zmm4
-	vpermt2ps _sC+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm9
-	vpermt2ps _sP6+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm11
-	vpermt2ps _sP7+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm4
-	vpermt2ps _sP5+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm12
-	vpermt2ps _sP4+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm13
-	vpermt2ps _sP3+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm14
-	vpermt2ps _sP2+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm15
-	vpandnd	%zmm3, %zmm3, %zmm2{%k1}
-	vptestmd %zmm2, %zmm2, %k0
-	vmovups	_sP0+__svml_stanh_data_internal(%rip), %zmm3
-	vsubps	{rn-sae}, %zmm9, %zmm8, %zmm2
-	kmovw	%k0, %edx
-	vfmadd213ps {rn-sae}, %zmm11, %zmm2, %zmm4
-	vpermt2ps _sP0+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm3
-	vfmadd213ps {rn-sae}, %zmm12, %zmm2, %zmm4
-	vfmadd213ps {rn-sae}, %zmm13, %zmm2, %zmm4
-	vfmadd213ps {rn-sae}, %zmm14, %zmm2, %zmm4
-	vfmadd213ps {rn-sae}, %zmm15, %zmm2, %zmm4
-	vfmadd213ps {rn-sae}, %zmm3, %zmm2, %zmm4
-	vorps	%zmm0, %zmm4, %zmm0
-	testl	%edx, %edx
+	/* Selection arguments between [0, 0x03e00000] into zmm3.  */
+	vpxord	%zmm3, %zmm3, %zmm3
+	vpmaxsd	%zmm3, %zmm2, %zmm3
+	vpminsd	TANHF_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
 
-	/* Go to special inputs processing branch */
-	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm1
+	/* Setup permute indices in zmm3.  */
+	vpsrld	$21, %zmm3, %zmm3
 
-	/* Restore registers
-	 * and exit the function
-	 */
+	/* Store if there are any special cases in k1.  */
+	vpcmpd	$6, TANHF_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
 
-L(EXIT):
-	movq	%rbp, %rsp
-	popq	%rbp
-	cfi_def_cfa(7, 8)
-	cfi_restore(6)
-	ret
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
+	vmovaps	TANHF_DATA(_sC_lo)(%rip), %zmm5
+	vpermt2ps TANHF_DATA(_sC_hi)(%rip), %zmm3, %zmm5
 
-	/* Branch to process
-	 * special inputs
-	 */
+	vmovaps	TANHF_DATA(_sP7_lo)(%rip), %zmm2
+	vpermt2ps TANHF_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
 
-L(SPECIAL_VALUES_BRANCH):
-	vmovups	%zmm1, 64(%rsp)
-	vmovups	%zmm0, 128(%rsp)
-	# LOE rbx r12 r13 r14 r15 edx zmm0
+	/* Store absolute values of inputs in zmm1.  */
+	vmovaps	TANHF_DATA(_sSignMask)(%rip), %zmm4
+	vandnps	%zmm0, %zmm4, %zmm1
+	vsubps	{rn-sae}, %zmm5, %zmm1, %zmm1
 
-	xorl	%eax, %eax
-	# LOE rbx r12 r13 r14 r15 eax edx
+	vmovaps	TANHF_DATA(_sP6_lo)(%rip), %zmm5
+	vpermt2ps TANHF_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
 
-	vzeroupper
-	movq	%r12, 16(%rsp)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
-	movl	%eax, %r12d
-	movq	%r13, 8(%rsp)
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-	movl	%edx, %r13d
-	movq	%r14, (%rsp)
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r15 r12d r13d
-
-	/* Range mask
-	 * bits check
-	 */
+	vmovaps	TANHF_DATA(_sP5_lo)(%rip), %zmm6
+	vpermt2ps TANHF_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
+
+	vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm2
+	vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm2
 
-L(RANGEMASK_CHECK):
-	btl	%r12d, %r13d
+	vmovaps	TANHF_DATA(_sP4_lo)(%rip), %zmm7
+	vpermt2ps TANHF_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
 
-	/* Call scalar math function */
-	jc	L(SCALAR_MATH_CALL)
-	# LOE rbx r15 r12d r13d
+	vmovaps	TANHF_DATA(_sP3_lo)(%rip), %zmm8
+	vpermt2ps TANHF_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
 
-	/* Special inputs
-	 * processing loop
+	vfmadd213ps {rn-sae}, %zmm7, %zmm1, %zmm2
+	vfmadd213ps {rn-sae}, %zmm8, %zmm1, %zmm2
+
+	vmovaps	TANHF_DATA(_sP2_lo)(%rip), %zmm9
+	vpermt2ps TANHF_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
+
+	vmovaps	TANHF_DATA(_sP0_lo)(%rip), %zmm10
+	vpermt2ps TANHF_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
+
+	vfmadd213ps {rn-sae}, %zmm9, %zmm1, %zmm2
+	vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm2
+
+	kmovw	%k1, %edx
+	testl	%edx, %edx
+
+	/* Go to special inputs processing branch.  */
+	jne	L(SPECIAL_VALUES_BRANCH)
+	# LOE rbx r12 r13 r14 r15 zmm0 zmm2 zmm4
+	/* Wait until after the branch to write over zmm0.  */
+	vpternlogd $0xec, %zmm4, %zmm2, %zmm0
+
+	/* No stack restoration on the fastpath.  */
+	ret
+
+	/* Cold case. edx has 1s where there was a special value that
+	   needs to be handled by a tanhf call. Optimize for code size
+	   more so than speed here. */
+L(SPECIAL_VALUES_BRANCH):
+	# LOE rbx rdx r12 r13 r14 r15 zmm0 zmm2 zmm4
+    /* Use r13 to save/restore the stack. This allows us to use rbp as
+       callee save register saving code size. */
+	pushq	%r13
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(r13, -16)
+	/* Need to callee save registers to preserve state across tanhf calls.
 	 */
+	pushq	%rbx
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(rbx, -24)
+	pushq	%rbp
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(rbp, -32)
+	movq	%rsp, %r13
+	cfi_def_cfa_register(r13)
+
+	/* Align stack and make room for 2x zmm vectors.  */
+	andq	$-64, %rsp
+	addq	$-128, %rsp
+
+	/* Save original input (zmm0 unchanged up to this point).  */
+	vmovaps	%zmm0, 64(%rsp)
+	/* Save all already computed results.  */
+	vpternlogd $0xec, %zmm4, %zmm2, %zmm0
+	vmovaps	%zmm0, (%rsp)
 
+	vzeroupper
+
+	/* edx has 1s where there was a special value that needs to be handled
+	   by a tanhf call.  */
+	movl	%edx, %ebx
 L(SPECIAL_VALUES_LOOP):
-	incl	%r12d
-	cmpl	$16, %r12d
-
-	/* Check bits in range mask */
-	jl	L(RANGEMASK_CHECK)
-	# LOE rbx r15 r12d r13d
-
-	movq	16(%rsp), %r12
-	cfi_restore(12)
-	movq	8(%rsp), %r13
-	cfi_restore(13)
-	movq	(%rsp), %r14
-	cfi_restore(14)
-	vmovups	128(%rsp), %zmm0
-
-	/* Go to exit */
-	jmp	L(EXIT)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r12 r13 r14 r15 zmm0
-
-	/* Scalar math fucntion call
-	 * to process special input
-	 */
+	# LOE rbx rbp r12 r13 r14 r15
+	/* use rbp as index for special value that is saved across calls to
+	   tanhf. We technically don't need a callee save register here as offset
+	   to rsp is always [0, 60] so we can restore rsp by realigning to 64.
+	   Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
+	   in the loop. Realigning also costs more code size.  */
+	xorl	%ebp, %ebp
+	tzcntl	%ebx, %ebp
 
-L(SCALAR_MATH_CALL):
-	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	/* Scalar math function call to process special input.  */
+	movss	64(%rsp, %rbp, 4), %xmm0
 	call	tanhf@PLT
-	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	/* No good way to avoid the store-forwarding fault this will cause on
+	   return. `lfence` avoids the SF fault but at greater cost as it
+	   serialized stack/callee save restoration.  */
+	movss	%xmm0, (%rsp, %rbp, 4)
 
-	/* Process special inputs in loop */
-	jmp	L(SPECIAL_VALUES_LOOP)
-	# LOE rbx r15 r12d r13d
+	blsrl   %ebx, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+	# LOE r12 r13 r14 r15
+
+	/* All results have been written to (%rsp); reload the full vector.  */
+	vmovaps	(%rsp), %zmm0
+	/* Restore rsp from r13 (saved before the 64-byte realignment).  */
+	movq	%r13, %rsp
+	cfi_def_cfa_register(rsp)
+	/* Restore callee save registers (reverse of the push order).  */
+	popq	%rbp
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(rbp)
+	popq	%rbx
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(rbx)
+	popq	%r13
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(r13)
+	ret
 END(_ZGVeN16v_tanhf_skx)
 
 	.section .rodata, "a"
-	.align	64
-
+	.align	16
 #ifdef __svml_stanh_data_internal_typedef
 typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(64)) VUINT32 _sC[32][1];
-	__declspec(align(64)) VUINT32 _sP0[32][1];
-	__declspec(align(64)) VUINT32 _sP2[32][1];
-	__declspec(align(64)) VUINT32 _sP3[32][1];
-	__declspec(align(64)) VUINT32 _sP4[32][1];
-	__declspec(align(64)) VUINT32 _sP5[32][1];
-	__declspec(align(64)) VUINT32 _sP6[32][1];
-	__declspec(align(64)) VUINT32 _sP7[32][1];
-	__declspec(align(64)) VUINT32 _iExpMantMask_UISA[16][1];
-	__declspec(align(64)) VUINT32 _iMinIdxOfsMask_UISA[16][1];
-	__declspec(align(64)) VUINT32 _iMaxIdxMask_UISA[16][1];
+typedef struct
+	{
+	__declspec(align(4)) VUINT32 _iExpMantMask_UISA[1][1];
+	__declspec(align(4)) VUINT32 _iMinIdxOfsMask_UISA[1][1];
+	__declspec(align(4)) VUINT32 _iMaxIdxMask_UISA[1][1];
+	__declspec(align(4)) VUINT32 _iExpMask[1][1];
+	__declspec(align(64)) VUINT32 _sC_lo[16][1];
+	__declspec(align(64)) VUINT32 _sC_hi[16][1];
+	__declspec(align(64)) VUINT32 _sP7_lo[16][1];
+	__declspec(align(64)) VUINT32 _sP7_hi[16][1];
 	__declspec(align(64)) VUINT32 _sSignMask[16][1];
-	__declspec(align(64)) VUINT32 _sAbsMask[16][1];
-	__declspec(align(64)) VUINT32 _iExpMantMask[16][1];
-	__declspec(align(64)) VUINT32 _iExpMask[16][1];
-	__declspec(align(64)) VUINT32 _iMinIdxOfsMask[16][1];
-	__declspec(align(64)) VUINT32 _iMaxIdxMask[16][1];
+	__declspec(align(64)) VUINT32 _sP6_lo[16][1];
+	__declspec(align(64)) VUINT32 _sP6_hi[16][1];
+	__declspec(align(64)) VUINT32 _sP5_lo[16][1];
+	__declspec(align(64)) VUINT32 _sP5_hi[16][1];
+	__declspec(align(64)) VUINT32 _sP4_lo[16][1];
+	__declspec(align(64)) VUINT32 _sP4_hi[16][1];
+	__declspec(align(64)) VUINT32 _sP3_lo[16][1];
+	__declspec(align(64)) VUINT32 _sP3_hi[16][1];
+	__declspec(align(64)) VUINT32 _sP2_lo[16][1];
+	__declspec(align(64)) VUINT32 _sP2_hi[16][1];
+	__declspec(align(64)) VUINT32 _sP0_lo[16][1];
+	__declspec(align(64)) VUINT32 _sP0_hi[16][1];
 } __svml_stanh_data_internal;
 #endif
+
 __svml_stanh_data_internal:
-	/* _sC */
+	.align	4
+	/* _iExpMantMask_UISA */
+	.long	0x7fe00000
+
+	.align	4
+	/* _iMinIdxOfsMask_UISA */
+	.long	0x3d400000
+
+	.align	4
+	/* _iMaxIdxMask_UISA */
+	.long	0x03e00000
+
+	.align	4
+	/* _iExpMask */
+	.long	0x7f000000
+
+	.align	64
+__svml_stanh_data_internal_al64:
+	.align	64
+	/* _sC_lo */
 	.long	0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
 	.long	0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
 	.long	0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
 	.long	0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
+
+	.align	64
+	/* _sC_hi */
 	.long	0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
 	.long	0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
 	.long	0x40500000, 0x40700000, 0x40900000, 0x40b00000
 	.long	0x40d00000, 0x40f00000, 0x41100000, 0x00000000
-	/* p0 */
-	.align	64
-	.long	0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
-	.long	0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
-	.long	0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
-	.long	0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
-	.long	0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
-	.long	0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
-	.long	0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
-	.long	0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
-	/* p2 */
-	.align	64
-	.long	0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
-	.long	0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
-	.long	0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
-	.long	0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
-	.long	0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
-	.long	0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
-	.long	0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
-	.long	0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
-	/* p3 */
+
 	.align	64
-	.long	0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
-	.long	0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
-	.long	0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
-	.long	0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
-	.long	0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
-	.long	0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
-	.long	0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
-	.long	0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
-	/* p4 */
+	/* _sP7_lo */
+	.long	0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
+	.long	0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
+	.long	0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
+	.long	0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
+
 	.align	64
-	.long	0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
-	.long	0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
-	.long	0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
-	.long	0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
-	.long	0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
-	.long	0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
-	.long	0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
-	.long	0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
-	/* p5 */
+	/* _sP7_hi */
+	.long	0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
+	.long	0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
+	.long	0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
+	.long	0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
+
 	.align	64
-	.long	0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
-	.long	0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
-	.long	0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
-	.long	0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
-	.long	0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
-	.long	0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
-	.long	0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
-	.long	0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
-	/* p6 */
+	/* _sSignMask */
+	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000
+	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000
+	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000
+	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000
+
 	.align	64
+	/* _sP6_lo */
 	.long	0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
 	.long	0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
 	.long	0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
 	.long	0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
+
+	.align	64
+	/* _sP6_hi */
 	.long	0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
 	.long	0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
 	.long	0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
 	.long	0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
-	/* p7 */
+
 	.align	64
-	.long	0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
-	.long	0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
-	.long	0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
-	.long	0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
-	.long	0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
-	.long	0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
-	.long	0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
-	.long	0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
+	/* _sP5_lo */
+	.long	0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
+	.long	0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
+	.long	0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
+	.long	0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
+
 	.align	64
-	.long	0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000 /* _iExpMantMask_UISA */
+	/* _sP5_hi */
+	.long	0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
+	.long	0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
+	.long	0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
+	.long	0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
+
 	.align	64
-	.long	0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000 /* _iMinIdxOfsMask_UISA */
+	/* _sP4_lo */
+	.long	0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
+	.long	0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
+	.long	0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
+	.long	0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
+
 	.align	64
-	.long	0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000 /* _iMaxIdxMask_UISA */
+	/* _sP4_hi */
+	.long	0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
+	.long	0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
+	.long	0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
+	.long	0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
+
 	.align	64
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSignMask */
+	/* _sP3_lo */
+	.long	0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
+	.long	0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
+	.long	0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
+	.long	0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
+
 	.align	64
-	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _sAbsMask */
+	/* _sP3_hi */
+	.long	0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
+	.long	0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
+	.long	0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
+	.long	0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
+
 	.align	64
-	.long	0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000 /* _iExpMantMask */
+	/* _sP2_lo */
+	.long	0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
+	.long	0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
+	.long	0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
+	.long	0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
+
 	.align	64
-	.long	0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000 /* _iExpMask */
+	/* _sP2_hi */
+	.long	0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
+	.long	0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
+	.long	0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
+	.long	0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
+
 	.align	64
-	.long	0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000 /* _iMinIdxOfsMask */
+	/* _sP0_lo */
+	.long	0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
+	.long	0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
+	.long	0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
+	.long	0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
+
 	.align	64
-	.long	0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000 /* _iMaxIdxMask */
+	/* _sP0_hi */
+	.long	0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
+	.long	0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
+	.long	0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
+	.long	0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
+
 	.align	64
+	.type	__svml_stanh_data_internal_al64, @object
+	.size	__svml_stanh_data_internal_al64, .-__svml_stanh_data_internal_al64
 	.type	__svml_stanh_data_internal, @object
 	.size	__svml_stanh_data_internal, .-__svml_stanh_data_internal
-- 
2.34.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH v2 5/7] x86: Add data file that can be shared by tanhf-avx2 and tanhf-sse4
  2022-06-09  0:05 ` [PATCH v2 1/7] x86: Improve svml_s_atanhf16_core_avx512.S Noah Goldstein
                     ` (2 preceding siblings ...)
  2022-06-09  0:05   ` [PATCH v2 4/7] x86: Optimize svml_s_tanhf16_core_avx512.S Noah Goldstein
@ 2022-06-09  0:05   ` Noah Goldstein
  2022-06-09 16:05     ` H.J. Lu
  2022-06-09  0:05   ` [PATCH v2 6/7] x86: Optimize svml_s_tanhf8_core_avx2.S Noah Goldstein
                     ` (2 subsequent siblings)
  6 siblings, 1 reply; 48+ messages in thread
From: Noah Goldstein @ 2022-06-09  0:05 UTC (permalink / raw)
  To: libc-alpha

tanhf-avx2 and tanhf-sse4 use the same data tables, so we can save
over 4KB by using a shared data table. This does increase the memory
footprint of the sse4 version (as all the targets are now 32 bytes
instead of 16), but generally it seems worth the code size savings.

NB: This patch doesn't do anything by itself; it is groundwork for
future patches.
---
 .../fpu/multiarch/svml_s_tanhf_rodata.S       | 621 ++++++++++++++++++
 1 file changed, 621 insertions(+)
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_tanhf_rodata.S

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf_rodata.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf_rodata.S
new file mode 100644
index 0000000000..904fe5f588
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf_rodata.S
@@ -0,0 +1,621 @@
+/* Data tables for tanhf AVX2 and tanhf SSE4.
+   Copyright (C) 2021-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   https://www.gnu.org/licenses/.  */
+
+/* Byte offsets into the data table, ordered by use in the function.
+   On cold starts this might help the prefetcher: if the streaming
+   prefetchers kick in, they will prefetch into the lookup table.  */
+#define _iExpMantMask			0
+#define _iMinIdxOfsMask			32
+#define _iMaxIdxMask			64
+#define _sAbsMask			96
+#define _iExpMask			128
+#define _lookupTable			160	/* Follows the five 32-byte mask vectors.  */
+
+#define TANHF_DATA(offset)		((offset)+__svml_stanh_data_internal_avx2)	/* Table symbol plus a field offset from above.  */
+#ifndef ONLY_DECL_OFFSET
+	.section .rodata, "a"
+	.align	32
+
+# ifdef __svml_stanh_data_internal_typedef	/* C-style description of the table layout.  */
+typedef unsigned int VUINT32;
+typedef struct
+	{
+	__declspec(align(32)) VUINT32 _iExpMantMask[8][1];
+	__declspec(align(32)) VUINT32 _iMinIdxOfsMask[8][1];
+	__declspec(align(32)) VUINT32 _iMaxIdxMask[8][1];
+	__declspec(align(32)) VUINT32 _sAbsMask[8][1];
+	__declspec(align(32)) VUINT32 _iExpMask[8][1];
+	__declspec(align(32)) VUINT32 _lookupTable[(134*4)][2];	/* Two VUINT32 words per entry; the table data is emitted as .quad values.  */
+} __svml_stanh_data_internal;
+# endif
+
+
+__svml_stanh_data_internal:
+	.globl	__svml_stanh_data_internal_avx2
+__svml_stanh_data_internal_avx2:
+	.align	32
+	/* _iExpMantMask.  */
+	.long	0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000
+	.long	0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000
+
+	.align	32
+	/* _iMinIdxOfsMask.  */
+	.long	0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000
+	.long	0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000
+
+	.align	32
+	/* _iMaxIdxMask.  */
+	.long	0x04280000, 0x04280000, 0x04280000, 0x04280000
+	.long	0x04280000, 0x04280000, 0x04280000, 0x04280000
+
+	.align	32
+	/* _sAbsMask.  */
+	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+
+	.align	32
+	/* _iExpMask.  */
+	.long	0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000
+	.long	0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000
+
+	.align	32
+	/* _lookupTable.  */
+	/* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500].  */
+	.quad	0x0000000000000000 /* A00 = +0.000000000000000000000e-01.  */
+	.quad	0x3FF00000022C70EB /* A01 = +1.000000008097283510367e+00.  */
+	.quad	0xBED00E878CFFA194 /* A02 = -3.828228912518614443549e-06.  */
+	.quad	0xBFD551766D0607A9 /* A03 = -3.330970825846813476723e-01.  */
+	.quad	0xBE53D60CE3E4C297 /* A00 = -1.847383956330407336230e-08.  */
+	.quad	0x3FF000024177CF5C /* A01 = +1.000002151235967140508e+00.  */
+	.quad	0xBF1758BC94A51A25 /* A02 = -8.906031613262943753568e-05.  */
+	.quad	0xBFD53EAE67E0D4F0 /* A03 = -3.319507612644221339337e-01.  */
+	.quad	0xBE5A9E47EF32D6FE /* A00 = -2.479020984039698285657e-08.  */
+	.quad	0x3FF00002DA983057 /* A01 = +1.000002721676556793895e+00.  */
+	.quad	0xBF1BD953509E94AA /* A02 = -1.062352277175377670507e-04.  */
+	.quad	0xBFD53BDB562EEDD5 /* A03 = -3.317783681520414806876e-01.  */
+	.quad	0xBE6191BBE496D294 /* A00 = -3.272532162914017685901e-08.  */
+	.quad	0x3FF0000390492017 /* A01 = +1.000003398528866105366e+00.  */
+	.quad	0xBF20727E814A57CE /* A02 = -1.254825043772153972919e-04.  */
+	.quad	0xBFD538DE060A6F22 /* A03 = -3.315959033004550748913e-01.  */
+	.quad	0xBE66DAFA2A893A25 /* A00 = -4.257146219278012568149e-08.  */
+	.quad	0x3FF0000465E08CD1 /* A01 = +1.000004194219219266770e+00.  */
+	.quad	0xBF2341C765EF91B6 /* A02 = -1.469188600530365522261e-04.  */
+	.quad	0xBFD535B6841FAF9E /* A03 = -3.314033785124993469751e-01.  */
+	.quad	0xBE6D5794E361E964 /* A00 = -5.465394929765249413434e-08.  */
+	.quad	0x3FF000055EE2A0CB /* A01 = +1.000005121846742950353e+00.  */
+	.quad	0xBF265E6C77E66C8B /* A02 = -1.706607253709506650304e-04.  */
+	.quad	0xBFD53264DDCCEDA6 /* A03 = -3.312008062382240103361e-01.  */
+	.quad	0xBE729C844D374A6E /* A00 = -6.933284462462096107184e-08.  */
+	.quad	0x3FF000067F019093 /* A01 = +1.000006195180536350264e+00.  */
+	.quad	0xBF29CC5348D6DCE5 /* A02 = -1.968242326435338705130e-04.  */
+	.quad	0xBFD52EE92121ED35 /* A03 = -3.309881995734998416658e-01.  */
+	.quad	0xBE775AEA17EAA872 /* A00 = -8.700465590574974405858e-08.  */
+	.quad	0x3FF00007CA1D66B8 /* A01 = +1.000007428656699559610e+00.  */
+	.quad	0xBF2D8F5EB98A2637 /* A02 = -2.255252009216044881395e-04.  */
+	.quad	0xBFD52B435CDF9128 /* A03 = -3.307655722585587376727e-01.  */
+	.quad	0xBE7D04DA28C343F0 /* A00 = -1.081040272327705484794e-07.  */
+	.quad	0x3FF000094443CCF5 /* A01 = +1.000008837375216730337e+00.  */
+	.quad	0xBF30D5B76C947AE5 /* A02 = -2.568791210978817814332e-04.  */
+	.quad	0xBFD52773A0776FAD /* A03 = -3.305329386764651045105e-01.  */
+	.quad	0xBE81DD77A12C51C7 /* A00 = -1.331054169875768625701e-07.  */
+	.quad	0x3FF0000AF1AFD2DA /* A01 = +1.000010437096696680470e+00.  */
+	.quad	0xBF331230624C1680 /* A02 = -2.910011410651516805537e-04.  */
+	.quad	0xBFD52379FC0B61DF /* A03 = -3.302903138515186909352e-01.  */
+	.quad	0xBE85D04EEEB3C435 /* A00 = -1.625247628488202841012e-07.  */
+	.quad	0x3FF0000CD6C9B1F2 /* A01 = +1.000012244238970726684e+00.  */
+	.quad	0xBF357F0742FADDD4 /* A02 = -3.280060509313874068243e-04.  */
+	.quad	0xBFD51F56806D0E81 /* A03 = -3.300377134475880880338e-01.  */
+	.quad	0xBE8A6E289B59681B /* A00 = -1.969211333326924655065e-07.  */
+	.quad	0x3FF0000EF8268F72 /* A01 = +1.000014275873550406715e+00.  */
+	.quad	0xBF381E277A1B747A /* A02 = -3.680082682942575423093e-04.  */
+	.quad	0xBFD51B093F1D6FD4 /* A03 = -3.297751537663746734808e-01.  */
+	.quad	0xBE8FCBC40EE9ABD5 /* A00 = -2.368983653301529373887e-07.  */
+	.quad	0x3FF000115A883B6C /* A01 = +1.000016549721943981410e+00.  */
+	.quad	0xBF3AF17AC974B3D9 /* A02 = -4.111218235774406434303e-04.  */
+	.quad	0xBFD516924A4C549C /* A03 = -3.295026517456081105450e-01.  */
+	.quad	0xBE92FFBC60A3F956 /* A00 = -2.831066871072026054144e-07.  */
+	.quad	0x3FF0001402DCED8A /* A01 = +1.000019084151832604590e+00.  */
+	.quad	0xBF3DFAE9390C4801 /* A02 = -4.574603454311488280083e-04.  */
+	.quad	0xBFD511F1B4D7DC3A /* A03 = -3.292202249571719585575e-01.  */
+	.quad	0xBE9690A22F96D5AD /* A00 = -3.362443262393081632612e-07.  */
+	.quad	0x3FF00016F63EFF5D /* A01 = +1.000021898173108825247e+00.  */
+	.quad	0xBF409E2C839605BB /* A02 = -5.071370461992499986334e-04.  */
+	.quad	0xBFD50D27924BEE00 /* A03 = -3.289278916051614487515e-01.  */
+	.quad	0xBE9AA56C65E72A73 /* A00 = -3.970591019557469835586e-07.  */
+	.quad	0x3FF0001A39F4A43E /* A01 = +1.000025011433776978009e+00.  */
+	.quad	0xBF425BD74C3D6667 /* A02 = -5.602647074553602319844e-04.  */
+	.quad	0xBFD50833F6E1ABA2 /* A03 = -3.286256705238718156536e-01.  */
+	.quad	0xBE9F4BD4FF1A83B0 /* A00 = -4.663500013744687071912e-07.  */
+	.quad	0x3FF0001DD36F9EC2 /* A01 = +1.000028444215715683896e+00.  */
+	.quad	0xBF44376634149405 /* A02 = -6.169556656102642569831e-04.  */
+	.quad	0xBFD50316F77EDEE5 /* A03 = -3.283135811757190158922e-01.  */
+	.quad	0xBEA3B625387BB079 /* A00 = -5.874486399249461304297e-07.  */
+	.quad	0x3FF00023E14CFBA9 /* A01 = +1.000034217911642153709e+00.  */
+	.quad	0xBF47392F923218D2 /* A02 = -7.087213783883111826306e-04.  */
+	.quad	0xBFD4FB1FACDEB938 /* A03 = -3.278273761924483942209e-01.  */
+	.quad	0xBEAA6E24F543500A /* A00 = -7.876828740601738750574e-07.  */
+	.quad	0x3FF0002D5C6E8412 /* A01 = +1.000043259679163742959e+00.  */
+	.quad	0xBF4BAF02BD7FDD70 /* A02 = -8.448375110664940040861e-04.  */
+	.quad	0xBFD4EFEE6527A7DE /* A03 = -3.271442401734229177279e-01.  */
+	.quad	0xBEB16E3EBE2157D0 /* A00 = -1.038947396133402500647e-06.  */
+	.quad	0x3FF00038990FEE2F /* A01 = +1.000053975962952312884e+00.  */
+	.quad	0xBF50569481C574CB /* A02 = -9.972048056490652716971e-04.  */
+	.quad	0xBFD4E419278DA2B4 /* A03 = -3.264220129263251113372e-01.  */
+	.quad	0xBEB6A7B6723165D4 /* A00 = -1.350350836279403750524e-06.  */
+	.quad	0x3FF00045CAB4158E /* A01 = +1.000066558657042303793e+00.  */
+	.quad	0xBF531D7C9C849108 /* A02 = -1.166698160951775212202e-03.  */
+	.quad	0xBFD4D7A0BB33B152 /* A03 = -3.256608799117844954552e-01.  */
+	.quad	0xBEBD0EE2A8654AFD /* A00 = -1.732000471561702711532e-06.  */
+	.quad	0x3FF00055276F18D6 /* A01 = +1.000081209219890521211e+00.  */
+	.quad	0xBF562FDBA3FB6C6C /* A02 = -1.354183666925102939860e-03.  */
+	.quad	0xBFD4CA85F1B93DB2 /* A03 = -3.248610363561638125773e-01.  */
+	.quad	0xBEC269D4036A207E /* A00 = -2.195047297096822741730e-06.  */
+	.quad	0x3FF00066E7DA6E4E /* A01 = +1.000098138500919997540e+00.  */
+	.quad	0xBF5991499FC36B3A /* A02 = -1.560518167983372759405e-03.  */
+	.quad	0xBFD4BCC9A72283D6 /* A03 = -3.240226871658341556426e-01.  */
+	.quad	0xBEC7154B6C09CFE1 /* A00 = -2.751729738565190291276e-06.  */
+	.quad	0x3FF0007B47086B80 /* A01 = +1.000117566559055148900e+00.  */
+	.quad	0xBF5D455433B4F8F4 /* A02 = -1.786548832412968197680e-03.  */
+	.quad	0xBFD4AE6CC1BFE145 /* A03 = -3.231460468373550942722e-01.  */
+	.quad	0xBECCA68CC64A0F8A /* A00 = -3.415415948561670285790e-06.  */
+	.quad	0x3FF00092827742F7 /* A01 = +1.000139722473418535387e+00.  */
+	.quad	0xBF60A7BF15A527AF /* A02 = -2.033112728132522705610e-03.  */
+	.quad	0xBFD49F703214084C /* A03 = -3.222313393636155876010e-01.  */
+	.quad	0xBED19E68676B241B /* A00 = -4.200644630977303616698e-06.  */
+	.quad	0x3FF000ACDA037B26 /* A01 = +1.000164844146362863597e+00.  */
+	.quad	0xBF62D99F836A02F8 /* A02 = -2.301036405072284102280e-03.  */
+	.quad	0xBFD48FD4F2B91B28 /* A03 = -3.212787981359945810311e-01.  */
+	.quad	0xBED57CF4B0C7AA54 /* A00 = -5.123164339408145209103e-06.  */
+	.quad	0x3FF000CA8FD9E1A1 /* A01 = +1.000193178099017865534e+00.  */
+	.quad	0xBF653A014548E686 /* A02 = -2.591135484433962181405e-03.  */
+	.quad	0xBFD47F9C0844B38F /* A03 = -3.202886658426046806447e-01.  */
+	.quad	0xBEDA012B1B1A41E2 /* A00 = -6.199971197454598722328e-06.  */
+	.quad	0x3FF000EBE868FDF4 /* A01 = +1.000224979259539459520e+00.  */
+	.quad	0xBF67CA9427E0A544 /* A02 = -2.904214255086275467410e-03.  */
+	.quad	0xBFD46EC6812ADB37 /* A03 = -3.192611943626845749655e-01.  */
+	.quad	0xBEDF3EAC5BF12194 /* A00 = -7.449344990702664567927e-06.  */
+	.quad	0x3FF001112A520784 /* A01 = +1.000260510744255704196e+00.  */
+	.quad	0xBF6A8D01ABDA4DC4 /* A02 = -3.241065277345108255891e-03.  */
+	.quad	0xBFD45D55759FFA4A /* A03 = -3.181966446572103146551e-01.  */
+	.quad	0xBEE2A541BC274267 /* A00 = -8.890883582164319970972e-06.  */
+	.quad	0x3FF0013A9E5961F2 /* A01 = +1.000300043631906721231e+00.  */
+	.quad	0xBF6D82ECD080C540 /* A02 = -3.602468994380686462264e-03.  */
+	.quad	0xBFD44B4A0779C0AD /* A03 = -3.170952866557950611259e-01.  */
+	.quad	0xBEE61D97609A27F4 /* A00 = -1.054553560499505625520e-05.  */
+	.quad	0x3FF001688F56A3AF /* A01 = +1.000343856731187974773e+00.  */
+	.quad	0xBF7056F8EFB683EC /* A02 = -3.989193351487490407647e-03.  */
+	.quad	0xBFD438A5620F0F74 /* A03 = -3.159573991399533543500e-01.  */
+	.quad	0xBEEA145429EDD370 /* A00 = -1.243563138839952927732e-05.  */
+	.quad	0x3FF0019B4A242A67 /* A01 = +1.000392236341804297339e+00.  */
+	.quad	0xBF7207D31CA78D9B /* A02 = -4.401993423445739288258e-03.  */
+	.quad	0xBFD42568BA16E7CD /* A03 = -3.147832696228050619602e-01.  */
+	.quad	0xBEEE96370D52680F /* A00 = -1.458491207477835326165e-05.  */
+	.quad	0x3FF001D31D8E4115 /* A01 = +1.000445476009251821736e+00.  */
+	.quad	0xBF73D4CC11EDC094 /* A02 = -4.841611050196221316400e-03.  */
+	.quad	0xBFD411954D8664E7 /* A03 = -3.135731942252974469021e-01.  */
+	.quad	0xBEF338C046215EF8 /* A00 = -1.833122622260562810219e-05.  */
+	.quad	0x3FF00230C32C2EC1 /* A01 = +1.000534784691737621998e+00.  */
+	.quad	0xBF76BD019BCC5DAF /* A02 = -5.551344188254799492943e-03.  */
+	.quad	0xBFD3F2C7156DC21E /* A03 = -3.116929730668135389848e-01.  */
+	.quad	0xBEF9B15EAE411EAE /* A00 = -2.450261207822986676092e-05.  */
+	.quad	0x3FF002C2DF057A4D /* A01 = +1.000674124886830940184e+00.  */
+	.quad	0xBF7B08CCD9AC1E30 /* A02 = -6.600189396301511801646e-03.  */
+	.quad	0xBFD3C7A7A114FED8 /* A03 = -3.090609620157755976777e-01.  */
+	.quad	0xBF00E36483C373B3 /* A00 = -3.221178528332122595812e-05.  */
+	.quad	0x3FF0036F419480D7 /* A01 = +1.000838524028997644777e+00.  */
+	.quad	0xBF7FD255D1777007 /* A02 = -7.768950679260206403087e-03.  */
+	.quad	0xBFD39A453911D6CE /* A03 = -3.062909180947429588215e-01.  */
+	.quad	0xBF05DFA04DD12059 /* A00 = -4.172046622180685472624e-05.  */
+	.quad	0x3FF00438B2A03D8D /* A01 = +1.001030633695197069599e+00.  */
+	.quad	0xBF828F8DBB4A9D10 /* A02 = -9.062869337255224921890e-03.  */
+	.quad	0xBFD36AAB704697D9 /* A03 = -3.033856007044711255993e-01.  */
+	.quad	0xBF0BF3E0C647DEFB /* A00 = -5.331544597092331081714e-05.  */
+	.quad	0x3FF005221063D36D /* A01 = +1.001253189109060359741e+00.  */
+	.quad	0xBF857A2CB3C96102 /* A02 = -1.048693584122917590862e-02.  */
+	.quad	0xBFD338E65BBB4FEC /* A03 = -3.003478904549854444639e-01.  */
+	.quad	0xBF11A506ED7C9D31 /* A00 = -6.730894835681591541979e-05.  */
+	.quad	0x3FF0062E4D0EA92A /* A01 = +1.001508999829250345925e+00.  */
+	.quad	0xBF88AB82C2761AF3 /* A02 = -1.204588085125866091241e-02.  */
+	.quad	0xBFD305028D6BD206 /* A03 = -2.971807843271395688234e-01.  */
+	.quad	0xBF1607C0922D9BF1 /* A00 = -8.403885708006799337092e-05.  */
+	.quad	0x3FF007606C341961 /* A01 = +1.001800940198869449560e+00.  */
+	.quad	0xBF8C25E6DA487BCF /* A02 = -1.374416688582682892494e-02.  */
+	.quad	0xBFD2CF0D0EE8F7B5 /* A03 = -2.938873906713255768075e-01.  */
+	.quad	0xBF1B3A8480A0A16D /* A00 = -1.038688061788578038307e-04.  */
+	.quad	0x3FF008BB802D02D6 /* A01 = +1.002131939589323561535e+00.  */
+	.quad	0xBF8FEB8AE99FD100 /* A02 = -1.558598065819483124983e-02.  */
+	.quad	0xBFD297135BD0911B /* A03 = -2.904709240558688843059e-01.  */
+	.quad	0xBF20ABB9BDB75C65 /* A00 = -1.271881327357976163798e-04.  */
+	.quad	0x3FF00A42A76D8CD1 /* A01 = +1.002504972472525901495e+00.  */
+	.quad	0xBF91FF3D752BB9E6 /* A02 = -1.757522609380570560722e-02.  */
+	.quad	0xBFD25D235C1F88B4 /* A03 = -2.869346999779154305799e-01.  */
+	.quad	0xBF243D3254425461 /* A00 = -1.544116913733432829448e-04.  */
+	.quad	0x3FF00BF909D1795E /* A01 = +1.002923048355647051011e+00.  */
+	.quad	0xBF94304E04D44942 /* A02 = -1.971551804042204897316e-02.  */
+	.quad	0xBFD2214B5E61CFA6 /* A03 = -2.832821294498394371075e-01.  */
+	.quad	0xBF286070011B61CE /* A00 = -1.859795307186510085994e-04.  */
+	.quad	0x3FF00DE1D5E1627E /* A01 = +1.003389201612804537689e+00.  */
+	.quad	0xBF9689D5F4163F59 /* A02 = -2.201017668045266231780e-02.  */
+	.quad	0xBFD1E39A11C3B42C /* A03 = -2.795167134743816728104e-01.  */
+	.quad	0xBF2D250B366A79E8 /* A00 = -2.223564326486314902259e-04.  */
+	.quad	0x3FF010003E134001 /* A01 = +1.003906481248123094829e+00.  */
+	.quad	0xBF990C9FF91F6F81 /* A02 = -2.446222265267250853271e-02.  */
+	.quad	0xBFD1A41E80084CDC /* A03 = -2.756420374218586655246e-01.  */
+	.quad	0xBF314DB5DDC2A30E /* A00 = -2.640313157465248123865e-04.  */
+	.quad	0x3FF012577608921B /* A01 = +1.004477940624503018441e+00.  */
+	.quad	0xBF9BB9626875B0C9 /* A02 = -2.707437288829409385849e-02.  */
+	.quad	0xBFD162E80768A9D0 /* A03 = -2.716617653228725615122e-01.  */
+	.quad	0xBF346A6133808864 /* A00 = -3.115165050094957730625e-04.  */
+	.quad	0x3FF014EAAFCC88A3 /* A01 = +1.005106627192198898157e+00.  */
+	.quad	0xBF9E90BEF9BF7419 /* A02 = -2.984903716411588595059e-02.  */
+	.quad	0xBFD12006545F7FAD /* A03 = -2.675796340899932457269e-01.  */
+	.quad	0xBF37F180DC3848EA /* A00 = -3.653468704395550778821e-04.  */
+	.quad	0x3FF017BD19147861 /* A01 = +1.005795572250939295955e+00.  */
+	.quad	0xBFA0C9A14C702E07 /* A02 = -3.278831537326359207851e-02.  */
+	.quad	0xBFD0DB895B650092 /* A03 = -2.633994476818851682154e-01.  */
+	.quad	0xBF3BEC6AAC6D7635 /* A00 = -4.260788377246944457107e-04.  */
+	.quad	0x3FF01AD1D884E719 /* A01 = +1.006547780778822565040e+00.  */
+	.quad	0xBFA260B2A1B1434A /* A02 = -3.589399551186163439542e-02.  */
+	.quad	0xBFD09581529E93D6 /* A03 = -2.591250712233067465817e-01.  */
+	.quad	0xBF4164E26167882B /* A00 = -5.308251737086202562063e-04.  */
+	.quad	0x3FF01FEF14B62B81 /* A01 = +1.007796364693348545316e+00.  */
+	.quad	0xBFA4EB014538AA42 /* A02 = -4.085544557559163403315e-02.  */
+	.quad	0xBFD029D36FEAF41F /* A03 = -2.525528519580024222613e-01.  */
+	.quad	0xBF46F6FFF4E53DC8 /* A00 = -7.008313930700277652464e-04.  */
+	.quad	0x3FF027CBB51CBBA0 /* A01 = +1.009715754956893363214e+00.  */
+	.quad	0xBFA89DEC9FEC112E /* A02 = -4.807986690687680864098e-02.  */
+	.quad	0xBFCF2A99464D0DB4 /* A03 = -2.434875100390009317053e-01.  */
+	.quad	0xBF4DCC9C4F66A4D9 /* A00 = -9.094012482836712945103e-04.  */
+	.quad	0x3FF030E7CFCCD583 /* A01 = +1.011939822882909068014e+00.  */
+	.quad	0xBFACAA3B95814081 /* A02 = -5.598627281199331645611e-02.  */
+	.quad	0xBFCDF78F156BE7CF /* A03 = -2.341173987004467604844e-01.  */
+	.quad	0xBF5308ED74E5C7A6 /* A00 = -1.161796466103906435435e-03.  */
+	.quad	0x3FF03B5986412ECB /* A01 = +1.014489674026594512313e+00.  */
+	.quad	0xBFB087EBA88DCC3F /* A02 = -6.457398285947223148806e-02.  */
+	.quad	0xBFCCBB9BD134862F /* A03 = -2.244753619680052991736e-01.  */
+	.quad	0xBF57FA23C00DF4B5 /* A00 = -1.463446533505758208674e-03.  */
+	.quad	0x3FF0473558A1BCC0 /* A01 = +1.017384859292903342975e+00.  */
+	.quad	0xBFB2E702BC6360EF /* A02 = -7.383744334527241048871e-02.  */
+	.quad	0xBFCB77D546379288 /* A03 = -2.145945160729250122955e-01.  */
+	.quad	0xBF5DD12971557F71 /* A00 = -1.819887610814388068450e-03.  */
+	.quad	0x3FF0548DDF5000A8 /* A01 = +1.020643112482540360020e+00.  */
+	.quad	0xBFB571B63DA186E1 /* A02 = -8.376635555898871710045e-02.  */
+	.quad	0xBFCA2D5202605148 /* A03 = -2.045080672838912594358e-01.  */
+	.quad	0xBF6252B1AD5D4F17 /* A00 = -2.236697221556737096709e-03.  */
+	.quad	0x3FF063738A910BF7 /* A01 = +1.024280110622155737232e+00.  */
+	.quad	0xBFB8270C8E6B601B /* A02 = -9.434584118878357184013e-02.  */
+	.quad	0xBFC8DD27D950A07E /* A03 = -1.942491351230763441116e-01.  */
+	.quad	0xBF66470C91730CFC /* A00 = -2.719425723258004842786e-03.  */
+	.quad	0x3FF073F468FCF331 /* A01 = +1.028309259519300633556e+00.  */
+	.quad	0xBFBB05C2952191E4 /* A02 = -1.055566419686964629854e-01.  */
+	.quad	0xBFC7886A770DE2BD /* A03 = -1.838505822486435070662e-01.  */
+	.quad	0xBF6AD114AC8E98EC /* A00 = -3.273525599485007861467e-03.  */
+	.quad	0x3FF0861BF53E5226 /* A01 = +1.032741506559554434119e+00.  */
+	.quad	0xBFBE0C4F9B461507 /* A02 = -1.173753503881763554650e-01.  */
+	.quad	0xBFC6302A037CDE3A /* A03 = -1.733448521642786954722e-01.  */
+	.quad	0xBF6FFBDE2A6C2AF8 /* A00 = -3.904279630096648551207e-03.  */
+	.quad	0x3FF099F2EB8E7DA3 /* A01 = +1.037585182326304034106e+00.  */
+	.quad	0xBFC09C74D192DDF0 /* A02 = -1.297746680554463516444e-01.  */
+	.quad	0xBFC4D571D8E3079F /* A03 = -1.627638157861470424859e-01.  */
+	.quad	0xBF72E8FDC0B952AA /* A00 = -4.616728994353872309042e-03.  */
+	.quad	0x3FF0AF7F273C9533 /* A01 = +1.042845872181101141152e+00.  */
+	.quad	0xBFC244C512736F10 /* A02 = -1.427236881344176033792e-01.  */
+	.quad	0xBFC379474F58B902 /* A03 = -1.521386277613104298645e-01.  */
+	.quad	0xBF762EABAF17395B /* A00 = -5.415602341101023557701e-03.  */
+	.quad	0x3FF0C6C3886F63FB /* A01 = +1.048526318502125631582e+00.  */
+	.quad	0xBFC3FDF9918EA12A /* A02 = -1.561881981590514389957e-01.  */
+	.quad	0xBFC21CA89ECAB895 /* A03 = -1.414995932913753196036e-01.  */
+	.quad	0xBF79D387CE5B2BAE /* A00 = -6.305246822828998107258e-03.  */
+	.quad	0x3FF0DFBFE2346376 /* A01 = +1.054626353847394337748e+00.  */
+	.quad	0xBFC5C6DA43602620 /* A02 = -1.701309994680721970894e-01.  */
+	.quad	0xBFC0C08BD8DB6631 /* A03 = -1.308760460731704100557e-01.  */
+	.quad	0xBF7DDBA8E8DA9060 /* A00 = -7.289562037531366334164e-03.  */
+	.quad	0x3FF0FA70F0D1B464 /* A01 = +1.061142864894713433443e+00.  */
+	.quad	0xBFC79E18D92BAA7C /* A02 = -1.845122394946264732241e-01.  */
+	.quad	0xBFBECBBBF74C2669 /* A03 = -1.202962378266875381749e-01.  */
+	.quad	0xBF81254E76EA25DA /* A00 = -8.371937755572145950511e-03.  */
+	.quad	0x3FF116D05835EBD0 /* A01 = +1.068069786618014660462e+00.  */
+	.quad	0xBFC982539E2ED224 /* A02 = -1.992897531869327609755e-01.  */
+	.quad	0xBFBC1B043C350159 /* A03 = -1.097872397413132278254e-01.  */
+	.quad	0xBF8391ACBA863403 /* A00 = -9.555196230190082448686e-03.  */
+	.quad	0x3FF134D4AA477FE2 /* A01 = +1.075398125794884141015e+00.  */
+	.quad	0xBFCB7218609FEAFB /* A02 = -2.144194099235717521079e-01.  */
+	.quad	0xBFB970A16CB88329 /* A03 = -9.937485603633135211599e-02.  */
+	.quad	0xBF87935088E48E8B /* A00 = -1.151144902957603431692e-02.  */
+	.quad	0x3FF1649892AD7DD3 /* A01 = +1.087059567413110938716e+00.  */
+	.quad	0xBFCE6971DDE75409 /* A02 = -2.375929196847723912089e-01.  */
+	.quad	0xBFB58291E88CB251 /* A03 = -8.402358939628952472223e-02.  */
+	.quad	0xBF8DB3A62C325325 /* A00 = -1.450280973794233242702e-02.  */
+	.quad	0x3FF1A9C900C6DEEA /* A01 = +1.103951457056548068891e+00.  */
+	.quad	0xBFD13DBC65B0E08E /* A02 = -2.693930619311765140012e-01.  */
+	.quad	0xBFB06696F62696D1 /* A03 = -6.406539449252625362252e-02.  */
+	.quad	0xBF92583699F2E27A /* A00 = -1.791463198307716858659e-02.  */
+	.quad	0x3FF1F451B85AA9F0 /* A01 = +1.122148246892376022288e+00.  */
+	.quad	0xBFD34FD5F8288180 /* A02 = -3.017477916164565954205e-01.  */
+	.quad	0xBFA6FB692825B683 /* A03 = -4.488686194495718900788e-02.  */
+	.quad	0xBF9641C26E673D6F /* A00 = -2.173522757385398448959e-02.  */
+	.quad	0x3FF24364DA5E2B07 /* A01 = +1.141453602790251542487e+00.  */
+	.quad	0xBFD564A5A5EF5890 /* A02 = -3.342680092295120530821e-01.  */
+	.quad	0xBF9B43712011A982 /* A03 = -2.662445791467283467968e-02.  */
+	.quad	0xBF9A901038EC2F39 /* A00 = -2.594018313816024226548e-02.  */
+	.quad	0x3FF2961356DFFEBA /* A01 = +1.161639537196534011088e+00.  */
+	.quad	0xBFD775EBB17198C7 /* A02 = -3.665723069046972759644e-01.  */
+	.quad	0xBF833B1A926CD462 /* A03 = -9.390075295963199591975e-03.  */
+	.quad	0xBF9F396A6A461B91 /* A00 = -3.049246095317987084727e-02.  */
+	.quad	0x3FF2EB53BAEF534B /* A01 = +1.182452898229899629357e+00.  */
+	.quad	0xBFD97DABF8AD8BBD /* A02 = -3.982953957076310058660e-01.  */
+	.quad	0x3F7B8F6A3E0F8837 /* A03 = +6.728568086119371925713e-03.  */
+	.quad	0xBFA21878590F8BAA /* A00 = -3.534294211546946951064e-02.  */
+	.quad	0x3FF34209790236E1 /* A01 = +1.203622315111197105253e+00.  */
+	.quad	0xBFDB764C0E71BECB /* A02 = -4.290952817018306997277e-01.  */
+	.quad	0x3F962FE0C03F84C0 /* A03 = +2.166701482190513949888e-02.  */
+	.quad	0xBFA4B36B9AD27ECC /* A00 = -4.043136849327097492868e-02.  */
+	.quad	0x3FF3990C5B12FC16 /* A01 = +1.224865298994477935679e+00.  */
+	.quad	0xBFDD5AABB0D01390 /* A02 = -4.586590983092770912322e-01.  */
+	.quad	0x3FA21DAF5CA162DB /* A03 = +3.538272863142363083844e-02.  */
+	.quad	0xBFA7645E4D7BF28B /* A00 = -4.568762489177399105378e-02.  */
+	.quad	0x3FF3EF2FD51C0D9F /* A01 = +1.245895225962932562069e+00.  */
+	.quad	0xBFDF26377E1B686E /* A02 = -4.867075664057044503963e-01.  */
+	.quad	0x3FA8803E756EE812 /* A03 = +4.785342391501513914509e-02.  */
+	.quad	0xBFAA210925C64413 /* A00 = -5.103329263796054643398e-02.  */
+	.quad	0x3FF44349F897D8E7 /* A01 = +1.266427966181760345066e+00.  */
+	.quad	0xBFE06A7B02C6D8E2 /* A02 = -5.129981092675530707226e-01.  */
+	.quad	0x3FAE3F194734F5D0 /* A03 = +5.907515520309980505687e-02.  */
+	.quad	0xBFACDE48F8A19BBB /* A00 = -5.638340029764018351832e-02.  */
+	.quad	0x3FF49439D5466582 /* A01 = +1.286187966447272845727e+00.  */
+	.quad	0xBFE131C7C1063DDC /* A02 = -5.373266954429101183166e-01.  */
+	.quad	0x3FB1ADEEC36AD805 /* A03 = +6.906025191241844940482e-02.  */
+	.quad	0xBFAF905D8F585680 /* A00 = -6.164829611604449866036e-02.  */
+	.quad	0x3FF4E0ED1FD27F99 /* A01 = +1.304913639360142818546e+00.  */
+	.quad	0xBFE1E7A859DC1D3D /* A02 = -5.595285182070380836095e-01.  */
+	.quad	0x3FB3ED018E4642A1 /* A03 = +7.783517573831001679086e-02.  */
+	.quad	0xBFB11595104160BA /* A00 = -6.673556944713512906198e-02.  */
+	.quad	0x3FF528650340490B /* A01 = +1.322361958217302513319e+00.  */
+	.quad	0xBFE28B14B40BC974 /* A02 = -5.794776455425521000109e-01.  */
+	.quad	0x3FB5DF49F5BAF6D7 /* A03 = +8.543836831355676453281e-02.  */
+	.quad	0xBFB2513A97344BA4 /* A00 = -7.155195418844911836587e-02.  */
+	.quad	0x3FF569BA0DB5EE14 /* A01 = +1.338312200124055273420e+00.  */
+	.quad	0xBFE31B53A8B67B20 /* A02 = -5.970857901737396389308e-01.  */
+	.quad	0x3FB787F297BB0544 /* A03 = +9.191814617499455275507e-02.  */
+	.quad	0xBFB37512E848FAFA /* A00 = -7.600515528700305112331e-02.  */
+	.quad	0x3FF5A41F33B403C8 /* A01 = +1.352568819013173495591e+00.  */
+	.quad	0xBFE397F6EA9A58A5 /* A02 = -6.123003561103997904880e-01.  */
+	.quad	0x3FB8EAA9FF25CA06 /* A03 = +9.733068923177520814782e-02.  */
+	.quad	0xBFB47B3E603AFC5D /* A00 = -8.000554894805263217439e-02.  */
+	.quad	0x3FF5D6E3EDE40487 /* A01 = +1.364963464031718975988e+00.  */
+	.quad	0xBFE400D5BCA6D631 /* A02 = -6.251019177058819709103e-01.  */
+	.quad	0x3FBA0B830ED567FE /* A03 = +1.017381583418739132707e-01.  */
+	.quad	0xBFB5BBFE8AC90496 /* A00 = -8.489981544791400103200e-02.  */
+	.quad	0x3FF612BA70107E95 /* A01 = +1.379572332145390989311e+00.  */
+	.quad	0xBFE477EAF1FA7693 /* A02 = -6.396383978023599814478e-01.  */
+	.quad	0x3FBB4784B7C08A95 /* A03 = +1.065600346196709652391e-01.  */
+	.quad	0xBFB6D5D940743939 /* A00 = -8.920057128509463473254e-02.  */
+	.quad	0x3FF644A8748F70CE /* A01 = +1.391762214006166953340e+00.  */
+	.quad	0xBFE4D646AB07EA37 /* A02 = -6.511567440459832267763e-01.  */
+	.quad	0x3FBC354F4E1D5292 /* A03 = +1.101884427747086558913e-01.  */
+	.quad	0xBFB7223D19E4F3D1 /* A00 = -9.036619074045339206069e-02.  */
+	.quad	0x3FF6518FEB42B7FA /* A01 = +1.394912642466350494175e+00.  */
+	.quad	0xBFE4ED86CB87498C /* A02 = -6.539949393430091184598e-01.  */
+	.quad	0x3FBC6D29F28CCA9B /* A03 = +1.110407082713131127205e-01.  */
+	.quad	0xBFB6878652FF6312 /* A00 = -8.800544287022329936754e-02.  */
+	.quad	0x3FF63948C302D040 /* A01 = +1.388985406648330922508e+00.  */
+	.quad	0xBFE4C4E2E7904E17 /* A02 = -6.490339777687407218920e-01.  */
+	.quad	0x3FBC127356CA1ABE /* A03 = +1.096565329445224612481e-01.  */
+	.quad	0xBFB4F5D18B0C91D6 /* A00 = -8.187589306596207427980e-02.  */
+	.quad	0x3FF5FD27EB7DD0B8 /* A01 = +1.374305648697413673176e+00.  */
+	.quad	0xBFE464E01A2B2FC6 /* A02 = -6.373138915164353601739e-01.  */
+	.quad	0x3FBB460547674A30 /* A03 = +1.065371798825160976065e-01.  */
+	.quad	0xBFB26642FA16A685 /* A00 = -7.187288861919156890412e-02.  */
+	.quad	0x3FF59F9BEDE1C95A /* A01 = +1.351467065073470141812e+00.  */
+	.quad	0xBFE3D67920C8FBEA /* A02 = -6.199308052381387046381e-01.  */
+	.quad	0x3FBA24F6A8D3CBC1 /* A03 = +1.021265184570401413078e-01.  */
+	.quad	0xBFADB5294794F097 /* A00 = -5.802277563859197656582e-02.  */
+	.quad	0x3FF523EA7B9CF453 /* A01 = +1.321268542159732772845e+00.  */
+	.quad	0xBFE322A8B55E35DB /* A02 = -5.979808370918208160205e-01.  */
+	.quad	0x3FB8C8673B1B3E37 /* A03 = +9.680791085269722928697e-02.  */
+	.quad	0xBFA4B7D661965C6A /* A00 = -4.046506825687219699450e-02.  */
+	.quad	0x3FF48DE3E2CE3122 /* A01 = +1.284641157110919085227e+00.  */
+	.quad	0xBFE251FED1A7F445 /* A02 = -5.725092024655472622285e-01.  */
+	.quad	0x3FB745699FCABDB9 /* A03 = +9.090290213747821701507e-02.  */
+	.quad	0xBF93E60456E4EE1D /* A00 = -1.943213253365004902773e-02.  */
+	.quad	0x3FF3E1A14E628A59 /* A01 = +1.242585474196536532432e+00.  */
+	.quad	0xBFE16C5AB660E876 /* A02 = -5.444768488007543094653e-01.  */
+	.quad	0x3FB5AD33AA8C188F /* A03 = +8.467410005332197397987e-02.  */
+	.quad	0x3F738C17C47C7961 /* A00 = +4.772274820224659853951e-03.  */
+	.quad	0x3FF3234DDE3BD146 /* A01 = +1.196119182682268355933e+00.  */
+	.quad	0xBFE078C0D77A9D3B /* A02 = -5.147403915952176722826e-01.  */
+	.quad	0x3FB40D74B3E276B8 /* A03 = +7.833032027925923568290e-02.  */
+	.quad	0x3FA0474BECC689C7 /* A00 = +3.179394975019849550746e-02.  */
+	.quad	0x3FF256FB4FA7D18A /* A01 = +1.146235762743432307076e+00.  */
+	.quad	0xBFDEFA8E3FB285E2 /* A02 = -4.840427038235174395098e-01.  */
+	.quad	0x3FB270C007493D59 /* A03 = +7.203293016322244446403e-02.  */
+	.quad	0x3FAF5BD51E479BDC /* A00 = +6.124750132203590768931e-02.  */
+	.quad	0x3FF18081D0B53BC5 /* A01 = +1.093873801484492647162e+00.  */
+	.quad	0xBFDCFE2439BD0C03 /* A02 = -4.530115665294831006626e-01.  */
+	.quad	0x3FB0DEFE5A45AFDD /* A03 = +6.590261176978580437424e-02.  */
+	.quad	0x3FB7BD5D2806EA26 /* A00 = +9.273321368429118805032e-02.  */
+	.quad	0x3FF0A369E35B4440 /* A01 = +1.039895904647224256223e+00.  */
+	.quad	0xBFDB04BC5C9951E7 /* A02 = -4.221640495573226181669e-01.  */
+	.quad	0x3FAEBBBAA9D6DEEF /* A03 = +6.002600978120919278380e-02.  */
+	.quad	0x3FC01BE411098DBC /* A00 = +1.258511622610124502941e-01.  */
+	.quad	0x3FEF85BDABC031C1 /* A01 = +9.850757936961188621083e-01.  */
+	.quad	0xBFD91521375097C2 /* A02 = -3.919146576102968682065e-01.  */
+	.quad	0x3FABE26F0086D982 /* A03 = +5.446192628317005068883e-02.  */
+	.quad	0x3FC481D7FF5776B9 /* A00 = +1.602125164781023347604e-01.  */
+	.quad	0x3FEDC3506C1E7218 /* A01 = +9.300920592973538347792e-01.  */
+	.quad	0xBFD7349A88DA7D4F /* A02 = -3.625856720409119104964e-01.  */
+	.quad	0x3FA936E2DFF8E2AE /* A03 = +4.924687370334389358018e-02.  */
+	.quad	0x3FC90471F96FA27A /* A00 = +1.954481571149420671141e-01.  */
+	.quad	0x3FEC0451601987A2 /* A01 = +8.755270840595026360376e-01.  */
+	.quad	0xBFD5671CD4B898DC /* A02 = -3.344184949259110251063e-01.  */
+	.quad	0x3FA6BB9594603B67 /* A03 = +4.439990459660841243261e-02.  */
+	.quad	0x3FCFD8ADB9ED944C /* A00 = +2.488000066615846384011e-01.  */
+	.quad	0x3FE978C073F6809A /* A01 = +7.959902062321078108909e-01.  */
+	.quad	0xBFD2DF7E00BCD5A9 /* A02 = -2.948908812716931060471e-01.  */
+	.quad	0x3FA3614033D490B2 /* A03 = +3.785133965200894456959e-02.  */
+	.quad	0x3FD4846A12AFE5A0 /* A00 = +3.205819303981005674586e-01.  */
+	.quad	0x3FE63A1147D40472 /* A01 = +6.945883181471244061100e-01.  */
+	.quad	0xBFCFA2268AD34450 /* A02 = -2.471359422548027318101e-01.  */
+	.quad	0x3F9F150201D9FFE0 /* A03 = +3.035357605267552383310e-02.  */
+	.quad	0x3FD9018641F82BEB /* A00 = +3.907180446846598154131e-01.  */
+	.quad	0x3FE33B7C220FFBDC /* A01 = +6.010113396913498995389e-01.  */
+	.quad	0xBFCA4E4187E29C86 /* A02 = -2.055131829740483584423e-01.  */
+	.quad	0x3F98C30CED19F8F4 /* A03 = +2.418155858185229434287e-02.  */
+	.quad	0x3FDD4B8255BEB078 /* A00 = +4.577337109901757905561e-01.  */
+	.quad	0x3FE0858B19D3A49B /* A01 = +5.163016800335243905451e-01.  */
+	.quad	0xBFC5BC929EACE564 /* A02 = -1.698172831327539045176e-01.  */
+	.quad	0x3F93A083CE57DE2B /* A03 = +1.916700312537337677621e-02.  */
+	.quad	0x3FE0A8E5E039295C /* A00 = +5.206174258576470315063e-01.  */
+	.quad	0x3FDC35E1234583FE /* A01 = +4.407885403107342225937e-01.  */
+	.quad	0xBFC1DE034E31AEB9 /* A02 = -1.395877963835710222629e-01.  */
+	.quad	0x3F8EFDEBB3471BDC /* A03 = +1.513275280821162888101e-02.  */
+	.quad	0x3FE2851B603CB2A5 /* A00 = +5.787484054213406503564e-01.  */
+	.quad	0x3FD7F4A44ABBB286 /* A01 = +3.743067483726821853551e-01.  */
+	.quad	0xBFBD3EEB67087DE7 /* A02 = -1.142413260026767657385e-01.  */
+	.quad	0x3F8864F38329E8BD /* A03 = +1.191129917173260922836e-02.  */
+	.quad	0x3FE437DBE3C34AC1 /* A00 = +6.318187187665317283702e-01.  */
+	.quad	0x3FD43F6F789441B5 /* A01 = +3.163717916040938438194e-01.  */
+	.quad	0xBFB7D92E7901B9A4 /* A02 = -9.315767721429907277653e-02.  */
+	.quad	0x3F8327ED342308E1 /* A03 = +9.353497651663324544136e-03.  */
+	.quad	0x3FE5C0977766D55C /* A00 = +6.797597248138731451661e-01.  */
+	.quad	0x3FD10B42A764D8F9 /* A01 = +2.663122782427219115142e-01.  */
+	.quad	0xBFB3633351D3D70F /* A02 = -7.573242900602060456716e-02.  */
+	.quad	0x3F7E079E30FF899C /* A03 = +7.331483779099558922843e-03.  */
+	.quad	0x3FE7202CE08A88C4 /* A00 = +7.226776490754436288455e-01.  */
+	.quad	0x3FCC973EB5662B01 /* A01 = +2.233656297433626314319e-01.  */
+	.quad	0xBFAF70A455F9920B /* A02 = -6.140626477716545211782e-02.  */
+	.quad	0x3F77812411CE99B6 /* A03 = +5.738392731393584730859e-03.  */
+	.quad	0x3FE85879424095B1 /* A00 = +7.608000082006382003286e-01.  */
+	.quad	0x3FC7E73BD1674D84 /* A01 = +1.867441914060742336190e-01.  */
+	.quad	0xBFA96F84E4BF333B /* A02 = -4.967894832916504993525e-02.  */
+	.quad	0x3F72606DDCA6E117 /* A03 = +4.486493251924870105662e-03.  */
+	.quad	0x3FE96BFE4957F4DD /* A00 = +7.944327766887472330737e-01.  */
+	.quad	0x3FC3ED4780D25478 /* A01 = +1.556786898624158421711e-01.  */
+	.quad	0xBFA489C5F9A56B58 /* A02 = -4.011362717093075458408e-02.  */
+	.quad	0x3F6CB5DC17E9AD2A /* A03 = +3.504686231556104931972e-03.  */
+	.quad	0x3FEA5D9CB2F41234 /* A00 = +8.239272589858672724006e-01.  */
+	.quad	0x3FC091A758374DCF /* A01 = +1.294449978582705440555e-01.  */
+	.quad	0xBFA08E436D4B5CE0 /* A02 = -3.233538350257858517978e-02.  */
+	.quad	0x3F666997AD53E6B7 /* A03 = +2.735897297154145629133e-03.  */
+	.quad	0x3FEB3060342CB850 /* A00 = +8.496552485501158713532e-01.  */
+	.quad	0x3FBB7D30BBC7DC1B /* A01 = +1.073790033768634993860e-01.  */
+	.quad	0xBF9AA6BA3443D9E3 /* A02 = -2.602663940430173170060e-02.  */
+	.quad	0x3F617CA764B7850B /* A03 = +2.134634914668814050648e-03.  */
+	.quad	0x3FEBE759A6A0C7B8 /* A00 = +8.719909910635044170135e-01.  */
+	.quad	0x3FB6C10DE6A703FF /* A01 = +8.888327485239243264115e-02.  */
+	.quad	0xBF956C566D8BE1F6 /* A02 = -2.092108768099084498138e-02.  */
+	.quad	0x3F5B46D1A4A59CF8 /* A03 = +1.664833764687232917079e-03.  */
+	.quad	0x3FEC858494887A04 /* A00 = +8.912985707318630268503e-01.  */
+	.quad	0x3FB2CC31F543394D /* A01 = +7.342827070099140762682e-02.  */
+	.quad	0xBF9133477FF69137 /* A02 = -1.679717749142747504343e-02.  */
+	.quad	0x3F5544482FBB4DA5 /* A03 = +1.298017973501022466823e-03.  */
+	.quad	0x3FED0DB59D0E32E9 /* A00 = +9.079235141267335551518e-01.  */
+	.quad	0x3FAF006BAFFC6EF4 /* A01 = +6.055008433597022787787e-02.  */
+	.quad	0xBF8B97146FA2B97A /* A02 = -1.347175565419144252499e-02.  */
+	.quad	0x3F5093B01F4CDC69 /* A03 = +1.011774057770665211434e-03.  */
+	.quad	0x3FEDB487C3EC457C /* A00 = +9.282873942012623835751e-01.  */
+	.quad	0x3FA7390C09D0BD1D /* A01 = +4.535710925881118044112e-02.  */
+	.quad	0xBF83D9F7C3181106 /* A02 = -9.693084374710735778846e-03.  */
+	.quad	0x3F46E34A0A3C0E64 /* A03 = +6.984817050299072134500e-04.  */
+	.quad	0x3FEE5FFCB4E6EB00 /* A00 = +9.492171796076434020506e-01.  */
+	.quad	0x3F9F4913ED00AADF /* A01 = +3.055220731782070861526e-02.  */
+	.quad	0xBF79670BD0E59B5C /* A02 = -6.201788097633133961528e-03.  */
+	.quad	0x3F3BC998EBCAF96D /* A03 = +4.240034429975534616304e-04.  */
+	.quad	0x3FEEDBA41E9542FE /* A00 = +9.643116566968215064293e-01.  */
+	.quad	0x3F94F5DD18D9C24D /* A01 = +2.046914543319848858727e-02.  */
+	.quad	0xBF7034896AA122B9 /* A02 = -3.956352980886528904192e-03.  */
+	.quad	0x3F30DCCB47810B39 /* A03 = +2.573009765038273091199e-04.  */
+	.quad	0x3FEF33F2882520ED /* A00 = +9.750912341196716903724e-01.  */
+	.quad	0x3F8BF37F2CF553FF /* A01 = +1.364802699996836392315e-02.  */
+	.quad	0xBF649F6F05A69619 /* A02 = -2.517430152880317534986e-03.  */
+	.quad	0x3F247623C950AAC9 /* A03 = +1.561087307505231250044e-04.  */
+	.quad	0x3FEF727757751741 /* A00 = +9.827229221489021115943e-01.  */
+	.quad	0x3F828E67912C4400 /* A01 = +9.060677640748693306705e-03.  */
+	.quad	0xBF5A2F51A806CC2C /* A02 = -1.598195784123355826789e-03.  */
+	.quad	0x3F18D35D7687E613 /* A03 = +9.470231965016282719549e-05.  */
+	.quad	0x3FEF9E6325C5942A /* A00 = +9.880843866091073568469e-01.  */
+	.quad	0x3F788AB117618F76 /* A01 = +5.991641772286606867914e-03.  */
+	.quad	0xBF5096EAB0B1EA89 /* A02 = -1.012543859160305046233e-03.  */
+	.quad	0x3F0E1E50EC4435AB /* A03 = +5.744633156910412119652e-05.  */
+	.quad	0x3FEFBD0784049369 /* A00 = +9.918248728250605994461e-01.  */
+	.quad	0x3F702BBD8294035F /* A01 = +3.947963975634432264028e-03.  */
+	.quad	0xBF44FB55E0F00593 /* A02 = -6.403130845457509273330e-04.  */
+	.quad	0x3F0244DCD723230A /* A03 = +3.484534217219031730379e-05.  */
+	.quad	0x3FEFD245E2366A43 /* A00 = +9.944180887426415926811e-01.  */
+	.quad	0x3F653D82EC088433 /* A01 = +2.592807490387838333795e-03.  */
+	.quad	0xBF3A7DF75E013CB8 /* A02 = -4.042366908878036561859e-04.  */
+	.quad	0x3EF6298E69F991CD /* A03 = +2.113564425911141559972e-05.  */
+	.quad	0x3FEFE0EAA508BC69 /* A00 = +9.962056372950317539861e-01.  */
+	.quad	0x3F5BD0771AF3FDDA /* A01 = +1.697651208644282514598e-03.  */
+	.quad	0xBF30B2E1254DE571 /* A02 = -2.548026725928887099328e-04.  */
+	.quad	0x3EEAE28B70EC0256 /* A03 = +1.281973848454955042307e-05.  */
+	.quad	0x3FEFEAF5303D7F96 /* A00 = +9.974313680831865536192e-01.  */
+	.quad	0x3F5229111365657E /* A01 = +1.108423877289460134782e-03.  */
+	.quad	0xBF250572D04DFE66 /* A02 = -1.603796628408704519168e-04.  */
+	.quad	0x3EE04E89BB57C981 /* A03 = +7.775682983689149966743e-06.  */
+	.quad	0x3FEFF1CF52F1CF44 /* A00 = +9.982678051005469122003e-01.  */
+	.quad	0x3F47A71316147CEB /* A01 = +7.218211359577819110842e-04.  */
+	.quad	0xBF1A6D7604055719 /* A02 = -1.008132248946049582547e-04.  */
+	.quad	0x3ED3C8047586A85C /* A03 = +4.716233739913014633626e-06.  */
+	.quad	0x3FEFF6770369EF69 /* A00 = +9.988360468555416149528e-01.  */
+	.quad	0x3F3EBB261180FBF0 /* A01 = +4.689186039321105101130e-04.  */
+	.quad	0xBF1097754FE19D7F /* A02 = -6.329206004950480057066e-05.  */
+	.quad	0x3EC7FEFF83BCA0A7 /* A03 = +2.860556404988488738366e-06.  */
+	.quad	0x3FEFF99D42371AC4 /* A00 = +9.992204945818561334647e-01.  */
+	.quad	0x3F33EB2AEC271F59 /* A01 = +3.039340773764907474054e-04.  */
+	.quad	0xBF04CF18E0FC0D79 /* A02 = -3.968996690952969588805e-05.  */
+	.quad	0x3EBD1BDBD6019BE9 /* A03 = +1.735021065507727833886e-06.  */
+	.quad	0x3FEFFBBCA32B0D91 /* A00 = +9.994795977476532700123e-01.  */
+	.quad	0x3F29C41E1615110A /* A01 = +1.965796209707565346710e-04.  */
+	.quad	0xBEFA11F93D9DCB5A /* A02 = -2.486248909101414873235e-05.  */
+	.quad	0x3EB1A7CA4546F7A7 /* A03 = +1.052345642723709228769e-06.  */
+	.quad	0x3FEFFD298B8E8DE2 /* A00 = +9.996535993308806045121e-01.  */
+	.quad	0x3F20A1C42D523C5B /* A01 = +1.268913244172078754520e-04.  */
+	.quad	0xBEF0507A364AFAE4 /* A02 = -1.555859070622834605755e-05.  */
+	.quad	0x3EA56ACA17E7CDF4 /* A03 = +6.382806956848098872313e-07.  */
+	.quad	0x3FEFFE1DC82BA5A3 /* A00 = +9.997700604991915929176e-01.  */
+	.quad	0x3F156E73B90F1769 /* A01 = +8.175450626798714452801e-05.  */
+	.quad	0xBEE4663579D0A09F /* A02 = -9.727122057226747625365e-06.  */
+	.quad	0x3E99FAF6FEC5D4C1 /* A03 = +3.871371052824002996020e-07.  */
+	.quad	0x3FEFFEF8D0BB5E81 /* A00 = +9.998745037837154514548e-01.  */
+	.quad	0x3F06686DA18D39C3 /* A01 = +4.273972098777251447726e-05.  */
+	.quad	0xBED46BC298073E90 /* A02 = -4.868731025855742842491e-06.  */
+	.quad	0x3E88E42286B9D0FD /* A03 = +1.854535328530838170114e-07.  */
+	.quad	0x3FEFFF8DBC68DDC7 /* A00 = +9.999455146670975791423e-01.  */
+	.quad	0x3EF26B2953A80AF0 /* A01 = +1.756534514108903368909e-05.  */
+	.quad	0xBEBFC4472D580F83 /* A02 = -1.893443529411295465239e-06.  */
+	.quad	0x3E72505B4553D19F /* A03 = +6.822456673547912277047e-08.  */
+	.quad	0x3FEFFFCED1276609 /* A00 = +9.999765477215883935358e-01.  */
+	.quad	0x3EDE1A94C7CC58F5 /* A01 = +7.177313020153979672606e-06.  */
+	.quad	0xBEA8A2C988744E57 /* A02 = -7.342066660497443762363e-07.  */
+	.quad	0x3E5AF30036BBBAF4 /* A03 = +2.509841882843541084885e-08.  */
+	.quad	0x3FEFFFEAFE70FCFC /* A00 = +9.999899835164849370983e-01.  */
+	.quad	0x3EC879175E3549F5 /* A01 = +2.917410471128503564412e-06.  */
+	.quad	0xBE930E36677D1813 /* A02 = -2.839493400307523115929e-07.  */
+	.quad	0x3E43D4005B42D48F /* A03 = +9.233192745401904898013e-09.  */
+	.quad	0x3ff0000000000000
+	.quad	0x0000000000000000
+	.quad	0x0000000000000000
+	.quad	0x0000000000000000
+
+	.align	32
+	.type	__svml_stanh_data_internal_avx2, @object
+	.size	__svml_stanh_data_internal_avx2, .-__svml_stanh_data_internal_avx2
+	.type	__svml_stanh_data_internal, @object
+	.size	__svml_stanh_data_internal, .-__svml_stanh_data_internal
+#endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH v2 6/7] x86: Optimize svml_s_tanhf8_core_avx2.S
  2022-06-09  0:05 ` [PATCH v2 1/7] x86: Improve svml_s_atanhf16_core_avx512.S Noah Goldstein
                     ` (3 preceding siblings ...)
  2022-06-09  0:05   ` [PATCH v2 5/7] x86: Add data file that can be shared by tanhf-avx2 and tanhf-sse4 Noah Goldstein
@ 2022-06-09  0:05   ` Noah Goldstein
  2022-06-09 16:10     ` H.J. Lu
  2022-06-09  0:05   ` [PATCH v2 7/7] x86: Optimize svml_s_tanhf4_core_sse4.S Noah Goldstein
  2022-06-09 15:59   ` [PATCH v2 1/7] x86: Improve svml_s_atanhf16_core_avx512.S H.J. Lu
  6 siblings, 1 reply; 48+ messages in thread
From: Noah Goldstein @ 2022-06-09  0:05 UTC (permalink / raw)
  To: libc-alpha

Optimizations are:
    1. Reduce code size (-81 bytes).
    2. Remove redundant move instructions.
    3. Slightly improve instruction selection/scheduling where
       possible.
    4. Prefer registers which get short instruction encoding.
    5. Reduce rodata size (-32 bytes).

Result is roughly a 17-18% speedup:

       Function, New Time, Old Time, New / Old
_ZGVdN8v_tanhf,     1.977,    2.402,     0.823
---
 .../fpu/multiarch/svml_s_tanhf8_core_avx2.S   | 912 ++++--------------
 1 file changed, 171 insertions(+), 741 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
index c5c87bf5b0..a47ede0501 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
@@ -70,773 +70,203 @@
  *
  */
 
-/* Offsets for data table __svml_stanh_data_internal
- */
-#define _dbP				0
-#define _sSignMask			4288
-#define _sAbsMask			4320
-#define _iExpMantMask			4352
-#define _iExpMask			4384
-#define _iMinIdxOfsMask			4416
-#define _iMaxIdxMask			4448
-
 #include <sysdep.h>
 
+/* tanhf data tables for avx2 and sse4 implementations defined here.
+ */
+#include "svml_s_tanhf_rodata.S"
+
 	.section .text.avx2, "ax", @progbits
 ENTRY(_ZGVdN8v_tanhf_avx2)
-	pushq	%rbp
-	cfi_def_cfa_offset(16)
-	movq	%rsp, %rbp
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
-	andq	$-32, %rsp
-	pushq	%r12
-	subq	$120, %rsp
-	lea	_dbP+16+__svml_stanh_data_internal(%rip), %r10
-	vmovaps	%ymm0, %ymm12
-
 	/* Here huge arguments, INF and NaNs are filtered out to callout. */
-	vpand	_iExpMantMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm14
+	vpand	TANHF_DATA(_iExpMantMask)(%rip), %ymm0, %ymm4
+	vpsubd	TANHF_DATA(_iMinIdxOfsMask)(%rip), %ymm4, %ymm2
+
+	/* Selection of arguments between [0, 0x04280000] into ymm2.  */
+	vpxor	%ymm3, %ymm3, %ymm3
+	vpmaxsd	%ymm3, %ymm2, %ymm2
+	vpminsd	TANHF_DATA(_iMaxIdxMask)(%rip), %ymm2, %ymm2
 
 	/*
 	 *  small table specific variables *
 	 *  Constant loading
 	 */
-	vmovups	_iMaxIdxMask+__svml_stanh_data_internal(%rip), %ymm8
-	vpsubd	_iMinIdxOfsMask+__svml_stanh_data_internal(%rip), %ymm14, %ymm9
-
-	/* if VMIN, VMAX is defined for I type */
-	vxorps	%ymm15, %ymm15, %ymm15
-	vpcmpgtd %ymm15, %ymm9, %ymm0
-	vpand	%ymm0, %ymm9, %ymm7
-	vpcmpgtd %ymm8, %ymm9, %ymm6
-	vblendvps %ymm6, %ymm8, %ymm7, %ymm3
-	vpsrld	$14, %ymm3, %ymm1
-	vpcmpgtd _iExpMask+__svml_stanh_data_internal(%rip), %ymm14, %ymm13
-	vmovmskps %ymm13, %r11d
-	vandps	_sAbsMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm10
-	vandps	_sSignMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm11
-	vextractf128 $1, %ymm1, %xmm2
-	vmovd	%xmm1, %r9d
-	vmovd	%xmm2, %ecx
-	vpextrd	$1, %xmm2, %edx
-	vpextrd	$1, %xmm1, %r8d
-	movslq	%r9d, %r9
-	movslq	%edx, %rdx
-	movslq	%r8d, %r8
-	vpextrd	$2, %xmm1, %edi
-	movslq	%ecx, %rcx
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
-	vpextrd	$3, %xmm2, %r12d
-	vpextrd	$3, %xmm1, %esi
-	vpextrd	$2, %xmm2, %eax
-	movslq	%edi, %rdi
-	movslq	%r12d, %r12
-	movslq	%esi, %rsi
-	movslq	%eax, %rax
-	vmovupd	-16(%r9, %r10), %xmm5
-	vmovupd	-16(%rdx, %r10), %xmm14
-	vmovupd	-16(%rcx, %r10), %xmm13
-	vmovupd	(%r9, %r10), %xmm1
-	vmovupd	(%r8, %r10), %xmm2
-	vmovupd	-16(%r8, %r10), %xmm4
-	vinsertf128 $1, -16(%rdi, %r10), %ymm5, %ymm15
-	vinsertf128 $1, -16(%r12, %r10), %ymm14, %ymm3
-	vinsertf128 $1, -16(%rax, %r10), %ymm13, %ymm6
-	vinsertf128 $1, (%rdi, %r10), %ymm1, %ymm5
-	vinsertf128 $1, (%rsi, %r10), %ymm2, %ymm14
-	vunpcklpd %ymm3, %ymm6, %ymm8
+	vpsrld	$14, %ymm2, %ymm1
+
+	/* We are splitting ymm1 into 8 GPRs. This may be faster to do with
+	   store/load as we can take advantage of store-forwarding.  */
+	vmovq	%xmm1, %r8
+	/* We have eliminated all negative values for ymm1 so no need to sign
+	   extend.  */
+	movl	%r8d, %r9d
+	shrq	$32, %r8
+
+	/* Store base of lookup table in rax.  */
+	leaq	TANHF_DATA(_lookupTable)(%rip), %rax
+
+	/* Instead of using cross-lane permutes on ymm vectors, use vinsertf128
+	   with memory operand. This helps alleviate bottleneck on p5.  */
+	vmovupd	16(%r9, %rax), %xmm5
+
+	vpextrq	$1, %xmm1, %rsi
+	movl	%esi, %edi
+	shrq	$32, %rsi
+
+	vinsertf128 $1, 16(%rdi, %rax), %ymm5, %ymm5
+
+	vextracti128 $1, %ymm1, %xmm2
+	vmovq	%xmm2, %rdx
+	movl	%edx, %ecx
+	shrq	$32, %rdx
+
+	vmovupd	(%rcx, %rax), %xmm6
+
+	vpextrq	$1, %xmm2, %r10
+	movl	%r10d, %r11d
+	shrq	$32, %r10
+
+	vinsertf128 $1, (%r11, %rax), %ymm6, %ymm6
+
+	vmovupd	16(%r8, %rax), %xmm1
+	vinsertf128 $1, 16(%rsi, %rax), %ymm1, %ymm1
+	vmovupd	(%rdx, %rax), %xmm3
+	vinsertf128 $1, (%r10, %rax), %ymm3, %ymm3
+
+	vunpcklpd %ymm3, %ymm6, %ymm7
 	vunpckhpd %ymm3, %ymm6, %ymm6
-	vunpcklpd %ymm14, %ymm5, %ymm3
-	vunpckhpd %ymm14, %ymm5, %ymm2
-	vmovupd	(%rcx, %r10), %xmm13
-	vcvtps2pd %xmm10, %ymm5
-	vextractf128 $1, %ymm10, %xmm10
-	vfmadd213pd %ymm3, %ymm5, %ymm2
-	vinsertf128 $1, -16(%rsi, %r10), %ymm4, %ymm0
-	vmovupd	(%rdx, %r10), %xmm4
-	vunpcklpd %ymm0, %ymm15, %ymm9
-	vunpckhpd %ymm0, %ymm15, %ymm7
-	vfmadd213pd %ymm7, %ymm5, %ymm2
-	vfmadd213pd %ymm9, %ymm5, %ymm2
-	vinsertf128 $1, (%r12, %r10), %ymm4, %ymm0
-	vcvtps2pd %xmm10, %ymm4
-	vinsertf128 $1, (%rax, %r10), %ymm13, %ymm15
-	vunpcklpd %ymm0, %ymm15, %ymm1
-	vunpckhpd %ymm0, %ymm15, %ymm0
-	vfmadd213pd %ymm1, %ymm4, %ymm0
-	vcvtpd2ps %ymm2, %xmm1
-	vfmadd213pd %ymm6, %ymm4, %ymm0
-	vfmadd213pd %ymm8, %ymm4, %ymm0
-	vcvtpd2ps %ymm0, %xmm0
-	vinsertf128 $1, %xmm0, %ymm1, %ymm2
-	vorps	%ymm11, %ymm2, %ymm0
-	testl	%r11d, %r11d
 
-	/* Go to special inputs processing branch */
-	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx r13 r14 r15 r11d ymm0 ymm12
+	vunpcklpd %ymm1, %ymm5, %ymm3
+	vunpckhpd %ymm1, %ymm5, %ymm1
 
-	/* Restore registers
-	 * and exit the function
-	 */
+	vmovaps	TANHF_DATA(_sAbsMask)(%rip), %ymm11
+	/* Store special cases in ymm15.  */
+	vpcmpgtd TANHF_DATA(_iExpMask)(%rip), %ymm4, %ymm15
 
-L(EXIT):
-	addq	$120, %rsp
-	cfi_restore(12)
-	popq	%r12
-	movq	%rbp, %rsp
-	popq	%rbp
-	cfi_def_cfa(7, 8)
-	cfi_restore(6)
-	ret
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
+	vandps	%ymm11, %ymm0, %ymm4
 
-	/* Branch to process
-	 * special inputs
-	 */
+	vcvtps2pd %xmm4, %ymm5
 
-L(SPECIAL_VALUES_BRANCH):
-	vmovups	%ymm12, 32(%rsp)
-	vmovups	%ymm0, 64(%rsp)
-	# LOE rbx r13 r14 r15 r11d ymm0
+	vextractf128 $1, %ymm4, %xmm4
+	vcvtps2pd %xmm4, %ymm4
 
-	xorl	%r12d, %r12d
-	# LOE rbx r13 r14 r15 r11d r12d
+	vmovupd	16(%rcx, %rax), %xmm2
+	vinsertf128 $1, 16(%r11, %rax), %ymm2, %ymm2
 
-	vzeroupper
-	movq	%r13, 8(%rsp)
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
-	movl	%r11d, %r13d
-	movq	%r14, (%rsp)
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r15 r12d r13d
-
-	/* Range mask
-	 * bits check
-	 */
+	vfmadd213pd %ymm3, %ymm5, %ymm1
+
+	vmovupd	16(%rdx, %rax), %xmm3
+	vinsertf128 $1, 16(%r10, %rax), %ymm3, %ymm3
+
+	vunpcklpd %ymm3, %ymm2, %ymm10
+	vunpckhpd %ymm3, %ymm2, %ymm2
+
+	vfmadd213pd %ymm10, %ymm4, %ymm2
+	vfmadd213pd %ymm6, %ymm4, %ymm2
+	vfmadd213pd %ymm7, %ymm4, %ymm2
+	vcvtpd2ps %ymm2, %xmm2
+
+	vmovupd	(%r9, %rax), %xmm7
+	vinsertf128 $1, (%rdi, %rax), %ymm7, %ymm7
+
+	vmovupd	(%r8, %rax), %xmm3
+	vinsertf128 $1, (%rsi, %rax), %ymm3, %ymm3
+
+	vunpckhpd %ymm3, %ymm7, %ymm4
+	vunpcklpd %ymm3, %ymm7, %ymm7
 
-L(RANGEMASK_CHECK):
-	btl	%r12d, %r13d
+	vfmadd213pd %ymm4, %ymm5, %ymm1
+	vfmadd213pd %ymm7, %ymm5, %ymm1
+
+
+	vcvtpd2ps %ymm1, %xmm1
+	vinsertf128 $1, %xmm2, %ymm1, %ymm1
+
+	vmovmskps %ymm15, %edx
+	vandnps	%ymm0, %ymm11, %ymm2
+	testl	%edx, %edx
+	/* Go to special inputs processing branch */
+	jne	L(SPECIAL_VALUES_BRANCH)
+	# LOE rbx r12 r13 r14 r15 ymm0 ymm1 ymm2
+	/* Wait until after the branch to write over ymm0.  */
+	vorps	%ymm2, %ymm1, %ymm0
+	/* No stack restoration on the fastpath.  */
+	ret
 
-	/* Call scalar math function */
-	jc	L(SCALAR_MATH_CALL)
-	# LOE rbx r15 r12d r13d
 
-	/* Special inputs
-	 * processing loop
+	/* Cold case. edx has 1s where there was a special value that
+	   needs to be handled by a tanhf call. Optimize for code size
+	   moreso than speed here. */
+L(SPECIAL_VALUES_BRANCH):
+	# LOE rbx rdx r12 r13 r14 r15 ymm0 ymm1 ymm2
+    /* Use r13 to save/restore the stack. This allows us to use rbp as
+       callee save register saving code size. */
+	pushq	%r13
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(r13, -16)
+	/* Need to callee save registers to preserve state across tanhf calls.
 	 */
+	pushq	%rbx
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(rbx, -24)
+	pushq	%rbp
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(rbp, -32)
+	movq	%rsp, %r13
+	cfi_def_cfa_register(r13)
+
+	/* Align stack and make room for 2x ymm vectors.  */
+	andq	$-32, %rsp
+	addq	$-64, %rsp
+
+	/* Save all already computed inputs.  */
+	vorps	%ymm2, %ymm1, %ymm1
+	vmovaps	%ymm1, (%rsp)
+	/* Save original input (ymm0 unchanged up to this point).  */
+	vmovaps	%ymm0, 32(%rsp)
+
+	vzeroupper
 
+	/* edx has 1s where there was a special value that needs to be handled
+	   by a tanhf call.  */
+	movl	%edx, %ebx
 L(SPECIAL_VALUES_LOOP):
-	incl	%r12d
-	cmpl	$8, %r12d
-
-	/* Check bits in range mask */
-	jl	L(RANGEMASK_CHECK)
-	# LOE rbx r15 r12d r13d
-
-	movq	8(%rsp), %r13
-	cfi_restore(13)
-	movq	(%rsp), %r14
-	cfi_restore(14)
-	vmovups	64(%rsp), %ymm0
-
-	/* Go to exit */
-	jmp	L(EXIT)
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r13 r14 r15 ymm0
-
-	/* Scalar math fucntion call
-	 * to process special input
-	 */
+	# LOE rbx rbp r12 r13 r14 r15
+	/* use rbp as index for special value that is saved across calls to
+	   tanhf. We technically don't need a callee save register here as offset
+	   to rsp is always [0, 28] so we can restore rsp by realigning to 64.
+	   Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
+	   in the loop. Realigning also costs more code size.  */
+	xorl	%ebp, %ebp
+	tzcntl	%ebx, %ebp
 
-L(SCALAR_MATH_CALL):
-	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	/* Scalar math function call to process special input.  */
+	movss	32(%rsp, %rbp, 4), %xmm0
 	call	tanhf@PLT
-	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	/* No good way to avoid the store-forwarding fault this will cause on
+	   return. `lfence` avoids the SF fault but at greater cost as it
+	   serializes stack/callee save restoration.  */
+	movss	%xmm0, (%rsp, %rbp, 4)
+
+	blsrl   %ebx, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+	# LOE r12 r13 r14 r15
 
-	/* Process special inputs in loop */
-	jmp	L(SPECIAL_VALUES_LOOP)
-	# LOE rbx r15 r12d r13d
-END(_ZGVdN8v_tanhf_avx2)
 
-	.section .rodata, "a"
-	.align	32
-
-#ifdef __svml_stanh_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(32)) VUINT32 _dbP[(134*4)][2];
-	__declspec(align(32)) VUINT32 _sSignMask[8][1];
-	__declspec(align(32)) VUINT32 _sAbsMask[8][1];
-	__declspec(align(32)) VUINT32 _iExpMantMask[8][1];
-	__declspec(align(32)) VUINT32 _iExpMask[8][1];
-	__declspec(align(32)) VUINT32 _iMinIdxOfsMask[8][1];
-	__declspec(align(32)) VUINT32 _iMaxIdxMask[8][1];
-} __svml_stanh_data_internal;
-#endif
-__svml_stanh_data_internal:
-	/* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500]. */
-	.quad	0x0000000000000000 /* A00 = +0.000000000000000000000e-01 */
-	.quad	0x3FF00000022C70EB /* A01 = +1.000000008097283510367e+00 */
-	.quad	0xBED00E878CFFA194 /* A02 = -3.828228912518614443549e-06 */
-	.quad	0xBFD551766D0607A9 /* A03 = -3.330970825846813476723e-01 */
-	.quad	0xBE53D60CE3E4C297 /* A00 = -1.847383956330407336230e-08 */
-	.quad	0x3FF000024177CF5C /* A01 = +1.000002151235967140508e+00 */
-	.quad	0xBF1758BC94A51A25 /* A02 = -8.906031613262943753568e-05 */
-	.quad	0xBFD53EAE67E0D4F0 /* A03 = -3.319507612644221339337e-01 */
-	.quad	0xBE5A9E47EF32D6FE /* A00 = -2.479020984039698285657e-08 */
-	.quad	0x3FF00002DA983057 /* A01 = +1.000002721676556793895e+00 */
-	.quad	0xBF1BD953509E94AA /* A02 = -1.062352277175377670507e-04 */
-	.quad	0xBFD53BDB562EEDD5 /* A03 = -3.317783681520414806876e-01 */
-	.quad	0xBE6191BBE496D294 /* A00 = -3.272532162914017685901e-08 */
-	.quad	0x3FF0000390492017 /* A01 = +1.000003398528866105366e+00 */
-	.quad	0xBF20727E814A57CE /* A02 = -1.254825043772153972919e-04 */
-	.quad	0xBFD538DE060A6F22 /* A03 = -3.315959033004550748913e-01 */
-	.quad	0xBE66DAFA2A893A25 /* A00 = -4.257146219278012568149e-08 */
-	.quad	0x3FF0000465E08CD1 /* A01 = +1.000004194219219266770e+00 */
-	.quad	0xBF2341C765EF91B6 /* A02 = -1.469188600530365522261e-04 */
-	.quad	0xBFD535B6841FAF9E /* A03 = -3.314033785124993469751e-01 */
-	.quad	0xBE6D5794E361E964 /* A00 = -5.465394929765249413434e-08 */
-	.quad	0x3FF000055EE2A0CB /* A01 = +1.000005121846742950353e+00 */
-	.quad	0xBF265E6C77E66C8B /* A02 = -1.706607253709506650304e-04 */
-	.quad	0xBFD53264DDCCEDA6 /* A03 = -3.312008062382240103361e-01 */
-	.quad	0xBE729C844D374A6E /* A00 = -6.933284462462096107184e-08 */
-	.quad	0x3FF000067F019093 /* A01 = +1.000006195180536350264e+00 */
-	.quad	0xBF29CC5348D6DCE5 /* A02 = -1.968242326435338705130e-04 */
-	.quad	0xBFD52EE92121ED35 /* A03 = -3.309881995734998416658e-01 */
-	.quad	0xBE775AEA17EAA872 /* A00 = -8.700465590574974405858e-08 */
-	.quad	0x3FF00007CA1D66B8 /* A01 = +1.000007428656699559610e+00 */
-	.quad	0xBF2D8F5EB98A2637 /* A02 = -2.255252009216044881395e-04 */
-	.quad	0xBFD52B435CDF9128 /* A03 = -3.307655722585587376727e-01 */
-	.quad	0xBE7D04DA28C343F0 /* A00 = -1.081040272327705484794e-07 */
-	.quad	0x3FF000094443CCF5 /* A01 = +1.000008837375216730337e+00 */
-	.quad	0xBF30D5B76C947AE5 /* A02 = -2.568791210978817814332e-04 */
-	.quad	0xBFD52773A0776FAD /* A03 = -3.305329386764651045105e-01 */
-	.quad	0xBE81DD77A12C51C7 /* A00 = -1.331054169875768625701e-07 */
-	.quad	0x3FF0000AF1AFD2DA /* A01 = +1.000010437096696680470e+00 */
-	.quad	0xBF331230624C1680 /* A02 = -2.910011410651516805537e-04 */
-	.quad	0xBFD52379FC0B61DF /* A03 = -3.302903138515186909352e-01 */
-	.quad	0xBE85D04EEEB3C435 /* A00 = -1.625247628488202841012e-07 */
-	.quad	0x3FF0000CD6C9B1F2 /* A01 = +1.000012244238970726684e+00 */
-	.quad	0xBF357F0742FADDD4 /* A02 = -3.280060509313874068243e-04 */
-	.quad	0xBFD51F56806D0E81 /* A03 = -3.300377134475880880338e-01 */
-	.quad	0xBE8A6E289B59681B /* A00 = -1.969211333326924655065e-07 */
-	.quad	0x3FF0000EF8268F72 /* A01 = +1.000014275873550406715e+00 */
-	.quad	0xBF381E277A1B747A /* A02 = -3.680082682942575423093e-04 */
-	.quad	0xBFD51B093F1D6FD4 /* A03 = -3.297751537663746734808e-01 */
-	.quad	0xBE8FCBC40EE9ABD5 /* A00 = -2.368983653301529373887e-07 */
-	.quad	0x3FF000115A883B6C /* A01 = +1.000016549721943981410e+00 */
-	.quad	0xBF3AF17AC974B3D9 /* A02 = -4.111218235774406434303e-04 */
-	.quad	0xBFD516924A4C549C /* A03 = -3.295026517456081105450e-01 */
-	.quad	0xBE92FFBC60A3F956 /* A00 = -2.831066871072026054144e-07 */
-	.quad	0x3FF0001402DCED8A /* A01 = +1.000019084151832604590e+00 */
-	.quad	0xBF3DFAE9390C4801 /* A02 = -4.574603454311488280083e-04 */
-	.quad	0xBFD511F1B4D7DC3A /* A03 = -3.292202249571719585575e-01 */
-	.quad	0xBE9690A22F96D5AD /* A00 = -3.362443262393081632612e-07 */
-	.quad	0x3FF00016F63EFF5D /* A01 = +1.000021898173108825247e+00 */
-	.quad	0xBF409E2C839605BB /* A02 = -5.071370461992499986334e-04 */
-	.quad	0xBFD50D27924BEE00 /* A03 = -3.289278916051614487515e-01 */
-	.quad	0xBE9AA56C65E72A73 /* A00 = -3.970591019557469835586e-07 */
-	.quad	0x3FF0001A39F4A43E /* A01 = +1.000025011433776978009e+00 */
-	.quad	0xBF425BD74C3D6667 /* A02 = -5.602647074553602319844e-04 */
-	.quad	0xBFD50833F6E1ABA2 /* A03 = -3.286256705238718156536e-01 */
-	.quad	0xBE9F4BD4FF1A83B0 /* A00 = -4.663500013744687071912e-07 */
-	.quad	0x3FF0001DD36F9EC2 /* A01 = +1.000028444215715683896e+00 */
-	.quad	0xBF44376634149405 /* A02 = -6.169556656102642569831e-04 */
-	.quad	0xBFD50316F77EDEE5 /* A03 = -3.283135811757190158922e-01 */
-	.quad	0xBEA3B625387BB079 /* A00 = -5.874486399249461304297e-07 */
-	.quad	0x3FF00023E14CFBA9 /* A01 = +1.000034217911642153709e+00 */
-	.quad	0xBF47392F923218D2 /* A02 = -7.087213783883111826306e-04 */
-	.quad	0xBFD4FB1FACDEB938 /* A03 = -3.278273761924483942209e-01 */
-	.quad	0xBEAA6E24F543500A /* A00 = -7.876828740601738750574e-07 */
-	.quad	0x3FF0002D5C6E8412 /* A01 = +1.000043259679163742959e+00 */
-	.quad	0xBF4BAF02BD7FDD70 /* A02 = -8.448375110664940040861e-04 */
-	.quad	0xBFD4EFEE6527A7DE /* A03 = -3.271442401734229177279e-01 */
-	.quad	0xBEB16E3EBE2157D0 /* A00 = -1.038947396133402500647e-06 */
-	.quad	0x3FF00038990FEE2F /* A01 = +1.000053975962952312884e+00 */
-	.quad	0xBF50569481C574CB /* A02 = -9.972048056490652716971e-04 */
-	.quad	0xBFD4E419278DA2B4 /* A03 = -3.264220129263251113372e-01 */
-	.quad	0xBEB6A7B6723165D4 /* A00 = -1.350350836279403750524e-06 */
-	.quad	0x3FF00045CAB4158E /* A01 = +1.000066558657042303793e+00 */
-	.quad	0xBF531D7C9C849108 /* A02 = -1.166698160951775212202e-03 */
-	.quad	0xBFD4D7A0BB33B152 /* A03 = -3.256608799117844954552e-01 */
-	.quad	0xBEBD0EE2A8654AFD /* A00 = -1.732000471561702711532e-06 */
-	.quad	0x3FF00055276F18D6 /* A01 = +1.000081209219890521211e+00 */
-	.quad	0xBF562FDBA3FB6C6C /* A02 = -1.354183666925102939860e-03 */
-	.quad	0xBFD4CA85F1B93DB2 /* A03 = -3.248610363561638125773e-01 */
-	.quad	0xBEC269D4036A207E /* A00 = -2.195047297096822741730e-06 */
-	.quad	0x3FF00066E7DA6E4E /* A01 = +1.000098138500919997540e+00 */
-	.quad	0xBF5991499FC36B3A /* A02 = -1.560518167983372759405e-03 */
-	.quad	0xBFD4BCC9A72283D6 /* A03 = -3.240226871658341556426e-01 */
-	.quad	0xBEC7154B6C09CFE1 /* A00 = -2.751729738565190291276e-06 */
-	.quad	0x3FF0007B47086B80 /* A01 = +1.000117566559055148900e+00 */
-	.quad	0xBF5D455433B4F8F4 /* A02 = -1.786548832412968197680e-03 */
-	.quad	0xBFD4AE6CC1BFE145 /* A03 = -3.231460468373550942722e-01 */
-	.quad	0xBECCA68CC64A0F8A /* A00 = -3.415415948561670285790e-06 */
-	.quad	0x3FF00092827742F7 /* A01 = +1.000139722473418535387e+00 */
-	.quad	0xBF60A7BF15A527AF /* A02 = -2.033112728132522705610e-03 */
-	.quad	0xBFD49F703214084C /* A03 = -3.222313393636155876010e-01 */
-	.quad	0xBED19E68676B241B /* A00 = -4.200644630977303616698e-06 */
-	.quad	0x3FF000ACDA037B26 /* A01 = +1.000164844146362863597e+00 */
-	.quad	0xBF62D99F836A02F8 /* A02 = -2.301036405072284102280e-03 */
-	.quad	0xBFD48FD4F2B91B28 /* A03 = -3.212787981359945810311e-01 */
-	.quad	0xBED57CF4B0C7AA54 /* A00 = -5.123164339408145209103e-06 */
-	.quad	0x3FF000CA8FD9E1A1 /* A01 = +1.000193178099017865534e+00 */
-	.quad	0xBF653A014548E686 /* A02 = -2.591135484433962181405e-03 */
-	.quad	0xBFD47F9C0844B38F /* A03 = -3.202886658426046806447e-01 */
-	.quad	0xBEDA012B1B1A41E2 /* A00 = -6.199971197454598722328e-06 */
-	.quad	0x3FF000EBE868FDF4 /* A01 = +1.000224979259539459520e+00 */
-	.quad	0xBF67CA9427E0A544 /* A02 = -2.904214255086275467410e-03 */
-	.quad	0xBFD46EC6812ADB37 /* A03 = -3.192611943626845749655e-01 */
-	.quad	0xBEDF3EAC5BF12194 /* A00 = -7.449344990702664567927e-06 */
-	.quad	0x3FF001112A520784 /* A01 = +1.000260510744255704196e+00 */
-	.quad	0xBF6A8D01ABDA4DC4 /* A02 = -3.241065277345108255891e-03 */
-	.quad	0xBFD45D55759FFA4A /* A03 = -3.181966446572103146551e-01 */
-	.quad	0xBEE2A541BC274267 /* A00 = -8.890883582164319970972e-06 */
-	.quad	0x3FF0013A9E5961F2 /* A01 = +1.000300043631906721231e+00 */
-	.quad	0xBF6D82ECD080C540 /* A02 = -3.602468994380686462264e-03 */
-	.quad	0xBFD44B4A0779C0AD /* A03 = -3.170952866557950611259e-01 */
-	.quad	0xBEE61D97609A27F4 /* A00 = -1.054553560499505625520e-05 */
-	.quad	0x3FF001688F56A3AF /* A01 = +1.000343856731187974773e+00 */
-	.quad	0xBF7056F8EFB683EC /* A02 = -3.989193351487490407647e-03 */
-	.quad	0xBFD438A5620F0F74 /* A03 = -3.159573991399533543500e-01 */
-	.quad	0xBEEA145429EDD370 /* A00 = -1.243563138839952927732e-05 */
-	.quad	0x3FF0019B4A242A67 /* A01 = +1.000392236341804297339e+00 */
-	.quad	0xBF7207D31CA78D9B /* A02 = -4.401993423445739288258e-03 */
-	.quad	0xBFD42568BA16E7CD /* A03 = -3.147832696228050619602e-01 */
-	.quad	0xBEEE96370D52680F /* A00 = -1.458491207477835326165e-05 */
-	.quad	0x3FF001D31D8E4115 /* A01 = +1.000445476009251821736e+00 */
-	.quad	0xBF73D4CC11EDC094 /* A02 = -4.841611050196221316400e-03 */
-	.quad	0xBFD411954D8664E7 /* A03 = -3.135731942252974469021e-01 */
-	.quad	0xBEF338C046215EF8 /* A00 = -1.833122622260562810219e-05 */
-	.quad	0x3FF00230C32C2EC1 /* A01 = +1.000534784691737621998e+00 */
-	.quad	0xBF76BD019BCC5DAF /* A02 = -5.551344188254799492943e-03 */
-	.quad	0xBFD3F2C7156DC21E /* A03 = -3.116929730668135389848e-01 */
-	.quad	0xBEF9B15EAE411EAE /* A00 = -2.450261207822986676092e-05 */
-	.quad	0x3FF002C2DF057A4D /* A01 = +1.000674124886830940184e+00 */
-	.quad	0xBF7B08CCD9AC1E30 /* A02 = -6.600189396301511801646e-03 */
-	.quad	0xBFD3C7A7A114FED8 /* A03 = -3.090609620157755976777e-01 */
-	.quad	0xBF00E36483C373B3 /* A00 = -3.221178528332122595812e-05 */
-	.quad	0x3FF0036F419480D7 /* A01 = +1.000838524028997644777e+00 */
-	.quad	0xBF7FD255D1777007 /* A02 = -7.768950679260206403087e-03 */
-	.quad	0xBFD39A453911D6CE /* A03 = -3.062909180947429588215e-01 */
-	.quad	0xBF05DFA04DD12059 /* A00 = -4.172046622180685472624e-05 */
-	.quad	0x3FF00438B2A03D8D /* A01 = +1.001030633695197069599e+00 */
-	.quad	0xBF828F8DBB4A9D10 /* A02 = -9.062869337255224921890e-03 */
-	.quad	0xBFD36AAB704697D9 /* A03 = -3.033856007044711255993e-01 */
-	.quad	0xBF0BF3E0C647DEFB /* A00 = -5.331544597092331081714e-05 */
-	.quad	0x3FF005221063D36D /* A01 = +1.001253189109060359741e+00 */
-	.quad	0xBF857A2CB3C96102 /* A02 = -1.048693584122917590862e-02 */
-	.quad	0xBFD338E65BBB4FEC /* A03 = -3.003478904549854444639e-01 */
-	.quad	0xBF11A506ED7C9D31 /* A00 = -6.730894835681591541979e-05 */
-	.quad	0x3FF0062E4D0EA92A /* A01 = +1.001508999829250345925e+00 */
-	.quad	0xBF88AB82C2761AF3 /* A02 = -1.204588085125866091241e-02 */
-	.quad	0xBFD305028D6BD206 /* A03 = -2.971807843271395688234e-01 */
-	.quad	0xBF1607C0922D9BF1 /* A00 = -8.403885708006799337092e-05 */
-	.quad	0x3FF007606C341961 /* A01 = +1.001800940198869449560e+00 */
-	.quad	0xBF8C25E6DA487BCF /* A02 = -1.374416688582682892494e-02 */
-	.quad	0xBFD2CF0D0EE8F7B5 /* A03 = -2.938873906713255768075e-01 */
-	.quad	0xBF1B3A8480A0A16D /* A00 = -1.038688061788578038307e-04 */
-	.quad	0x3FF008BB802D02D6 /* A01 = +1.002131939589323561535e+00 */
-	.quad	0xBF8FEB8AE99FD100 /* A02 = -1.558598065819483124983e-02 */
-	.quad	0xBFD297135BD0911B /* A03 = -2.904709240558688843059e-01 */
-	.quad	0xBF20ABB9BDB75C65 /* A00 = -1.271881327357976163798e-04 */
-	.quad	0x3FF00A42A76D8CD1 /* A01 = +1.002504972472525901495e+00 */
-	.quad	0xBF91FF3D752BB9E6 /* A02 = -1.757522609380570560722e-02 */
-	.quad	0xBFD25D235C1F88B4 /* A03 = -2.869346999779154305799e-01 */
-	.quad	0xBF243D3254425461 /* A00 = -1.544116913733432829448e-04 */
-	.quad	0x3FF00BF909D1795E /* A01 = +1.002923048355647051011e+00 */
-	.quad	0xBF94304E04D44942 /* A02 = -1.971551804042204897316e-02 */
-	.quad	0xBFD2214B5E61CFA6 /* A03 = -2.832821294498394371075e-01 */
-	.quad	0xBF286070011B61CE /* A00 = -1.859795307186510085994e-04 */
-	.quad	0x3FF00DE1D5E1627E /* A01 = +1.003389201612804537689e+00 */
-	.quad	0xBF9689D5F4163F59 /* A02 = -2.201017668045266231780e-02 */
-	.quad	0xBFD1E39A11C3B42C /* A03 = -2.795167134743816728104e-01 */
-	.quad	0xBF2D250B366A79E8 /* A00 = -2.223564326486314902259e-04 */
-	.quad	0x3FF010003E134001 /* A01 = +1.003906481248123094829e+00 */
-	.quad	0xBF990C9FF91F6F81 /* A02 = -2.446222265267250853271e-02 */
-	.quad	0xBFD1A41E80084CDC /* A03 = -2.756420374218586655246e-01 */
-	.quad	0xBF314DB5DDC2A30E /* A00 = -2.640313157465248123865e-04 */
-	.quad	0x3FF012577608921B /* A01 = +1.004477940624503018441e+00 */
-	.quad	0xBF9BB9626875B0C9 /* A02 = -2.707437288829409385849e-02 */
-	.quad	0xBFD162E80768A9D0 /* A03 = -2.716617653228725615122e-01 */
-	.quad	0xBF346A6133808864 /* A00 = -3.115165050094957730625e-04 */
-	.quad	0x3FF014EAAFCC88A3 /* A01 = +1.005106627192198898157e+00 */
-	.quad	0xBF9E90BEF9BF7419 /* A02 = -2.984903716411588595059e-02 */
-	.quad	0xBFD12006545F7FAD /* A03 = -2.675796340899932457269e-01 */
-	.quad	0xBF37F180DC3848EA /* A00 = -3.653468704395550778821e-04 */
-	.quad	0x3FF017BD19147861 /* A01 = +1.005795572250939295955e+00 */
-	.quad	0xBFA0C9A14C702E07 /* A02 = -3.278831537326359207851e-02 */
-	.quad	0xBFD0DB895B650092 /* A03 = -2.633994476818851682154e-01 */
-	.quad	0xBF3BEC6AAC6D7635 /* A00 = -4.260788377246944457107e-04 */
-	.quad	0x3FF01AD1D884E719 /* A01 = +1.006547780778822565040e+00 */
-	.quad	0xBFA260B2A1B1434A /* A02 = -3.589399551186163439542e-02 */
-	.quad	0xBFD09581529E93D6 /* A03 = -2.591250712233067465817e-01 */
-	.quad	0xBF4164E26167882B /* A00 = -5.308251737086202562063e-04 */
-	.quad	0x3FF01FEF14B62B81 /* A01 = +1.007796364693348545316e+00 */
-	.quad	0xBFA4EB014538AA42 /* A02 = -4.085544557559163403315e-02 */
-	.quad	0xBFD029D36FEAF41F /* A03 = -2.525528519580024222613e-01 */
-	.quad	0xBF46F6FFF4E53DC8 /* A00 = -7.008313930700277652464e-04 */
-	.quad	0x3FF027CBB51CBBA0 /* A01 = +1.009715754956893363214e+00 */
-	.quad	0xBFA89DEC9FEC112E /* A02 = -4.807986690687680864098e-02 */
-	.quad	0xBFCF2A99464D0DB4 /* A03 = -2.434875100390009317053e-01 */
-	.quad	0xBF4DCC9C4F66A4D9 /* A00 = -9.094012482836712945103e-04 */
-	.quad	0x3FF030E7CFCCD583 /* A01 = +1.011939822882909068014e+00 */
-	.quad	0xBFACAA3B95814081 /* A02 = -5.598627281199331645611e-02 */
-	.quad	0xBFCDF78F156BE7CF /* A03 = -2.341173987004467604844e-01 */
-	.quad	0xBF5308ED74E5C7A6 /* A00 = -1.161796466103906435435e-03 */
-	.quad	0x3FF03B5986412ECB /* A01 = +1.014489674026594512313e+00 */
-	.quad	0xBFB087EBA88DCC3F /* A02 = -6.457398285947223148806e-02 */
-	.quad	0xBFCCBB9BD134862F /* A03 = -2.244753619680052991736e-01 */
-	.quad	0xBF57FA23C00DF4B5 /* A00 = -1.463446533505758208674e-03 */
-	.quad	0x3FF0473558A1BCC0 /* A01 = +1.017384859292903342975e+00 */
-	.quad	0xBFB2E702BC6360EF /* A02 = -7.383744334527241048871e-02 */
-	.quad	0xBFCB77D546379288 /* A03 = -2.145945160729250122955e-01 */
-	.quad	0xBF5DD12971557F71 /* A00 = -1.819887610814388068450e-03 */
-	.quad	0x3FF0548DDF5000A8 /* A01 = +1.020643112482540360020e+00 */
-	.quad	0xBFB571B63DA186E1 /* A02 = -8.376635555898871710045e-02 */
-	.quad	0xBFCA2D5202605148 /* A03 = -2.045080672838912594358e-01 */
-	.quad	0xBF6252B1AD5D4F17 /* A00 = -2.236697221556737096709e-03 */
-	.quad	0x3FF063738A910BF7 /* A01 = +1.024280110622155737232e+00 */
-	.quad	0xBFB8270C8E6B601B /* A02 = -9.434584118878357184013e-02 */
-	.quad	0xBFC8DD27D950A07E /* A03 = -1.942491351230763441116e-01 */
-	.quad	0xBF66470C91730CFC /* A00 = -2.719425723258004842786e-03 */
-	.quad	0x3FF073F468FCF331 /* A01 = +1.028309259519300633556e+00 */
-	.quad	0xBFBB05C2952191E4 /* A02 = -1.055566419686964629854e-01 */
-	.quad	0xBFC7886A770DE2BD /* A03 = -1.838505822486435070662e-01 */
-	.quad	0xBF6AD114AC8E98EC /* A00 = -3.273525599485007861467e-03 */
-	.quad	0x3FF0861BF53E5226 /* A01 = +1.032741506559554434119e+00 */
-	.quad	0xBFBE0C4F9B461507 /* A02 = -1.173753503881763554650e-01 */
-	.quad	0xBFC6302A037CDE3A /* A03 = -1.733448521642786954722e-01 */
-	.quad	0xBF6FFBDE2A6C2AF8 /* A00 = -3.904279630096648551207e-03 */
-	.quad	0x3FF099F2EB8E7DA3 /* A01 = +1.037585182326304034106e+00 */
-	.quad	0xBFC09C74D192DDF0 /* A02 = -1.297746680554463516444e-01 */
-	.quad	0xBFC4D571D8E3079F /* A03 = -1.627638157861470424859e-01 */
-	.quad	0xBF72E8FDC0B952AA /* A00 = -4.616728994353872309042e-03 */
-	.quad	0x3FF0AF7F273C9533 /* A01 = +1.042845872181101141152e+00 */
-	.quad	0xBFC244C512736F10 /* A02 = -1.427236881344176033792e-01 */
-	.quad	0xBFC379474F58B902 /* A03 = -1.521386277613104298645e-01 */
-	.quad	0xBF762EABAF17395B /* A00 = -5.415602341101023557701e-03 */
-	.quad	0x3FF0C6C3886F63FB /* A01 = +1.048526318502125631582e+00 */
-	.quad	0xBFC3FDF9918EA12A /* A02 = -1.561881981590514389957e-01 */
-	.quad	0xBFC21CA89ECAB895 /* A03 = -1.414995932913753196036e-01 */
-	.quad	0xBF79D387CE5B2BAE /* A00 = -6.305246822828998107258e-03 */
-	.quad	0x3FF0DFBFE2346376 /* A01 = +1.054626353847394337748e+00 */
-	.quad	0xBFC5C6DA43602620 /* A02 = -1.701309994680721970894e-01 */
-	.quad	0xBFC0C08BD8DB6631 /* A03 = -1.308760460731704100557e-01 */
-	.quad	0xBF7DDBA8E8DA9060 /* A00 = -7.289562037531366334164e-03 */
-	.quad	0x3FF0FA70F0D1B464 /* A01 = +1.061142864894713433443e+00 */
-	.quad	0xBFC79E18D92BAA7C /* A02 = -1.845122394946264732241e-01 */
-	.quad	0xBFBECBBBF74C2669 /* A03 = -1.202962378266875381749e-01 */
-	.quad	0xBF81254E76EA25DA /* A00 = -8.371937755572145950511e-03 */
-	.quad	0x3FF116D05835EBD0 /* A01 = +1.068069786618014660462e+00 */
-	.quad	0xBFC982539E2ED224 /* A02 = -1.992897531869327609755e-01 */
-	.quad	0xBFBC1B043C350159 /* A03 = -1.097872397413132278254e-01 */
-	.quad	0xBF8391ACBA863403 /* A00 = -9.555196230190082448686e-03 */
-	.quad	0x3FF134D4AA477FE2 /* A01 = +1.075398125794884141015e+00 */
-	.quad	0xBFCB7218609FEAFB /* A02 = -2.144194099235717521079e-01 */
-	.quad	0xBFB970A16CB88329 /* A03 = -9.937485603633135211599e-02 */
-	.quad	0xBF87935088E48E8B /* A00 = -1.151144902957603431692e-02 */
-	.quad	0x3FF1649892AD7DD3 /* A01 = +1.087059567413110938716e+00 */
-	.quad	0xBFCE6971DDE75409 /* A02 = -2.375929196847723912089e-01 */
-	.quad	0xBFB58291E88CB251 /* A03 = -8.402358939628952472223e-02 */
-	.quad	0xBF8DB3A62C325325 /* A00 = -1.450280973794233242702e-02 */
-	.quad	0x3FF1A9C900C6DEEA /* A01 = +1.103951457056548068891e+00 */
-	.quad	0xBFD13DBC65B0E08E /* A02 = -2.693930619311765140012e-01 */
-	.quad	0xBFB06696F62696D1 /* A03 = -6.406539449252625362252e-02 */
-	.quad	0xBF92583699F2E27A /* A00 = -1.791463198307716858659e-02 */
-	.quad	0x3FF1F451B85AA9F0 /* A01 = +1.122148246892376022288e+00 */
-	.quad	0xBFD34FD5F8288180 /* A02 = -3.017477916164565954205e-01 */
-	.quad	0xBFA6FB692825B683 /* A03 = -4.488686194495718900788e-02 */
-	.quad	0xBF9641C26E673D6F /* A00 = -2.173522757385398448959e-02 */
-	.quad	0x3FF24364DA5E2B07 /* A01 = +1.141453602790251542487e+00 */
-	.quad	0xBFD564A5A5EF5890 /* A02 = -3.342680092295120530821e-01 */
-	.quad	0xBF9B43712011A982 /* A03 = -2.662445791467283467968e-02 */
-	.quad	0xBF9A901038EC2F39 /* A00 = -2.594018313816024226548e-02 */
-	.quad	0x3FF2961356DFFEBA /* A01 = +1.161639537196534011088e+00 */
-	.quad	0xBFD775EBB17198C7 /* A02 = -3.665723069046972759644e-01 */
-	.quad	0xBF833B1A926CD462 /* A03 = -9.390075295963199591975e-03 */
-	.quad	0xBF9F396A6A461B91 /* A00 = -3.049246095317987084727e-02 */
-	.quad	0x3FF2EB53BAEF534B /* A01 = +1.182452898229899629357e+00 */
-	.quad	0xBFD97DABF8AD8BBD /* A02 = -3.982953957076310058660e-01 */
-	.quad	0x3F7B8F6A3E0F8837 /* A03 = +6.728568086119371925713e-03 */
-	.quad	0xBFA21878590F8BAA /* A00 = -3.534294211546946951064e-02 */
-	.quad	0x3FF34209790236E1 /* A01 = +1.203622315111197105253e+00 */
-	.quad	0xBFDB764C0E71BECB /* A02 = -4.290952817018306997277e-01 */
-	.quad	0x3F962FE0C03F84C0 /* A03 = +2.166701482190513949888e-02 */
-	.quad	0xBFA4B36B9AD27ECC /* A00 = -4.043136849327097492868e-02 */
-	.quad	0x3FF3990C5B12FC16 /* A01 = +1.224865298994477935679e+00 */
-	.quad	0xBFDD5AABB0D01390 /* A02 = -4.586590983092770912322e-01 */
-	.quad	0x3FA21DAF5CA162DB /* A03 = +3.538272863142363083844e-02 */
-	.quad	0xBFA7645E4D7BF28B /* A00 = -4.568762489177399105378e-02 */
-	.quad	0x3FF3EF2FD51C0D9F /* A01 = +1.245895225962932562069e+00 */
-	.quad	0xBFDF26377E1B686E /* A02 = -4.867075664057044503963e-01 */
-	.quad	0x3FA8803E756EE812 /* A03 = +4.785342391501513914509e-02 */
-	.quad	0xBFAA210925C64413 /* A00 = -5.103329263796054643398e-02 */
-	.quad	0x3FF44349F897D8E7 /* A01 = +1.266427966181760345066e+00 */
-	.quad	0xBFE06A7B02C6D8E2 /* A02 = -5.129981092675530707226e-01 */
-	.quad	0x3FAE3F194734F5D0 /* A03 = +5.907515520309980505687e-02 */
-	.quad	0xBFACDE48F8A19BBB /* A00 = -5.638340029764018351832e-02 */
-	.quad	0x3FF49439D5466582 /* A01 = +1.286187966447272845727e+00 */
-	.quad	0xBFE131C7C1063DDC /* A02 = -5.373266954429101183166e-01 */
-	.quad	0x3FB1ADEEC36AD805 /* A03 = +6.906025191241844940482e-02 */
-	.quad	0xBFAF905D8F585680 /* A00 = -6.164829611604449866036e-02 */
-	.quad	0x3FF4E0ED1FD27F99 /* A01 = +1.304913639360142818546e+00 */
-	.quad	0xBFE1E7A859DC1D3D /* A02 = -5.595285182070380836095e-01 */
-	.quad	0x3FB3ED018E4642A1 /* A03 = +7.783517573831001679086e-02 */
-	.quad	0xBFB11595104160BA /* A00 = -6.673556944713512906198e-02 */
-	.quad	0x3FF528650340490B /* A01 = +1.322361958217302513319e+00 */
-	.quad	0xBFE28B14B40BC974 /* A02 = -5.794776455425521000109e-01 */
-	.quad	0x3FB5DF49F5BAF6D7 /* A03 = +8.543836831355676453281e-02 */
-	.quad	0xBFB2513A97344BA4 /* A00 = -7.155195418844911836587e-02 */
-	.quad	0x3FF569BA0DB5EE14 /* A01 = +1.338312200124055273420e+00 */
-	.quad	0xBFE31B53A8B67B20 /* A02 = -5.970857901737396389308e-01 */
-	.quad	0x3FB787F297BB0544 /* A03 = +9.191814617499455275507e-02 */
-	.quad	0xBFB37512E848FAFA /* A00 = -7.600515528700305112331e-02 */
-	.quad	0x3FF5A41F33B403C8 /* A01 = +1.352568819013173495591e+00 */
-	.quad	0xBFE397F6EA9A58A5 /* A02 = -6.123003561103997904880e-01 */
-	.quad	0x3FB8EAA9FF25CA06 /* A03 = +9.733068923177520814782e-02 */
-	.quad	0xBFB47B3E603AFC5D /* A00 = -8.000554894805263217439e-02 */
-	.quad	0x3FF5D6E3EDE40487 /* A01 = +1.364963464031718975988e+00 */
-	.quad	0xBFE400D5BCA6D631 /* A02 = -6.251019177058819709103e-01 */
-	.quad	0x3FBA0B830ED567FE /* A03 = +1.017381583418739132707e-01 */
-	.quad	0xBFB5BBFE8AC90496 /* A00 = -8.489981544791400103200e-02 */
-	.quad	0x3FF612BA70107E95 /* A01 = +1.379572332145390989311e+00 */
-	.quad	0xBFE477EAF1FA7693 /* A02 = -6.396383978023599814478e-01 */
-	.quad	0x3FBB4784B7C08A95 /* A03 = +1.065600346196709652391e-01 */
-	.quad	0xBFB6D5D940743939 /* A00 = -8.920057128509463473254e-02 */
-	.quad	0x3FF644A8748F70CE /* A01 = +1.391762214006166953340e+00 */
-	.quad	0xBFE4D646AB07EA37 /* A02 = -6.511567440459832267763e-01 */
-	.quad	0x3FBC354F4E1D5292 /* A03 = +1.101884427747086558913e-01 */
-	.quad	0xBFB7223D19E4F3D1 /* A00 = -9.036619074045339206069e-02 */
-	.quad	0x3FF6518FEB42B7FA /* A01 = +1.394912642466350494175e+00 */
-	.quad	0xBFE4ED86CB87498C /* A02 = -6.539949393430091184598e-01 */
-	.quad	0x3FBC6D29F28CCA9B /* A03 = +1.110407082713131127205e-01 */
-	.quad	0xBFB6878652FF6312 /* A00 = -8.800544287022329936754e-02 */
-	.quad	0x3FF63948C302D040 /* A01 = +1.388985406648330922508e+00 */
-	.quad	0xBFE4C4E2E7904E17 /* A02 = -6.490339777687407218920e-01 */
-	.quad	0x3FBC127356CA1ABE /* A03 = +1.096565329445224612481e-01 */
-	.quad	0xBFB4F5D18B0C91D6 /* A00 = -8.187589306596207427980e-02 */
-	.quad	0x3FF5FD27EB7DD0B8 /* A01 = +1.374305648697413673176e+00 */
-	.quad	0xBFE464E01A2B2FC6 /* A02 = -6.373138915164353601739e-01 */
-	.quad	0x3FBB460547674A30 /* A03 = +1.065371798825160976065e-01 */
-	.quad	0xBFB26642FA16A685 /* A00 = -7.187288861919156890412e-02 */
-	.quad	0x3FF59F9BEDE1C95A /* A01 = +1.351467065073470141812e+00 */
-	.quad	0xBFE3D67920C8FBEA /* A02 = -6.199308052381387046381e-01 */
-	.quad	0x3FBA24F6A8D3CBC1 /* A03 = +1.021265184570401413078e-01 */
-	.quad	0xBFADB5294794F097 /* A00 = -5.802277563859197656582e-02 */
-	.quad	0x3FF523EA7B9CF453 /* A01 = +1.321268542159732772845e+00 */
-	.quad	0xBFE322A8B55E35DB /* A02 = -5.979808370918208160205e-01 */
-	.quad	0x3FB8C8673B1B3E37 /* A03 = +9.680791085269722928697e-02 */
-	.quad	0xBFA4B7D661965C6A /* A00 = -4.046506825687219699450e-02 */
-	.quad	0x3FF48DE3E2CE3122 /* A01 = +1.284641157110919085227e+00 */
-	.quad	0xBFE251FED1A7F445 /* A02 = -5.725092024655472622285e-01 */
-	.quad	0x3FB745699FCABDB9 /* A03 = +9.090290213747821701507e-02 */
-	.quad	0xBF93E60456E4EE1D /* A00 = -1.943213253365004902773e-02 */
-	.quad	0x3FF3E1A14E628A59 /* A01 = +1.242585474196536532432e+00 */
-	.quad	0xBFE16C5AB660E876 /* A02 = -5.444768488007543094653e-01 */
-	.quad	0x3FB5AD33AA8C188F /* A03 = +8.467410005332197397987e-02 */
-	.quad	0x3F738C17C47C7961 /* A00 = +4.772274820224659853951e-03 */
-	.quad	0x3FF3234DDE3BD146 /* A01 = +1.196119182682268355933e+00 */
-	.quad	0xBFE078C0D77A9D3B /* A02 = -5.147403915952176722826e-01 */
-	.quad	0x3FB40D74B3E276B8 /* A03 = +7.833032027925923568290e-02 */
-	.quad	0x3FA0474BECC689C7 /* A00 = +3.179394975019849550746e-02 */
-	.quad	0x3FF256FB4FA7D18A /* A01 = +1.146235762743432307076e+00 */
-	.quad	0xBFDEFA8E3FB285E2 /* A02 = -4.840427038235174395098e-01 */
-	.quad	0x3FB270C007493D59 /* A03 = +7.203293016322244446403e-02 */
-	.quad	0x3FAF5BD51E479BDC /* A00 = +6.124750132203590768931e-02 */
-	.quad	0x3FF18081D0B53BC5 /* A01 = +1.093873801484492647162e+00 */
-	.quad	0xBFDCFE2439BD0C03 /* A02 = -4.530115665294831006626e-01 */
-	.quad	0x3FB0DEFE5A45AFDD /* A03 = +6.590261176978580437424e-02 */
-	.quad	0x3FB7BD5D2806EA26 /* A00 = +9.273321368429118805032e-02 */
-	.quad	0x3FF0A369E35B4440 /* A01 = +1.039895904647224256223e+00 */
-	.quad	0xBFDB04BC5C9951E7 /* A02 = -4.221640495573226181669e-01 */
-	.quad	0x3FAEBBBAA9D6DEEF /* A03 = +6.002600978120919278380e-02 */
-	.quad	0x3FC01BE411098DBC /* A00 = +1.258511622610124502941e-01 */
-	.quad	0x3FEF85BDABC031C1 /* A01 = +9.850757936961188621083e-01 */
-	.quad	0xBFD91521375097C2 /* A02 = -3.919146576102968682065e-01 */
-	.quad	0x3FABE26F0086D982 /* A03 = +5.446192628317005068883e-02 */
-	.quad	0x3FC481D7FF5776B9 /* A00 = +1.602125164781023347604e-01 */
-	.quad	0x3FEDC3506C1E7218 /* A01 = +9.300920592973538347792e-01 */
-	.quad	0xBFD7349A88DA7D4F /* A02 = -3.625856720409119104964e-01 */
-	.quad	0x3FA936E2DFF8E2AE /* A03 = +4.924687370334389358018e-02 */
-	.quad	0x3FC90471F96FA27A /* A00 = +1.954481571149420671141e-01 */
-	.quad	0x3FEC0451601987A2 /* A01 = +8.755270840595026360376e-01 */
-	.quad	0xBFD5671CD4B898DC /* A02 = -3.344184949259110251063e-01 */
-	.quad	0x3FA6BB9594603B67 /* A03 = +4.439990459660841243261e-02 */
-	.quad	0x3FCFD8ADB9ED944C /* A00 = +2.488000066615846384011e-01 */
-	.quad	0x3FE978C073F6809A /* A01 = +7.959902062321078108909e-01 */
-	.quad	0xBFD2DF7E00BCD5A9 /* A02 = -2.948908812716931060471e-01 */
-	.quad	0x3FA3614033D490B2 /* A03 = +3.785133965200894456959e-02 */
-	.quad	0x3FD4846A12AFE5A0 /* A00 = +3.205819303981005674586e-01 */
-	.quad	0x3FE63A1147D40472 /* A01 = +6.945883181471244061100e-01 */
-	.quad	0xBFCFA2268AD34450 /* A02 = -2.471359422548027318101e-01 */
-	.quad	0x3F9F150201D9FFE0 /* A03 = +3.035357605267552383310e-02 */
-	.quad	0x3FD9018641F82BEB /* A00 = +3.907180446846598154131e-01 */
-	.quad	0x3FE33B7C220FFBDC /* A01 = +6.010113396913498995389e-01 */
-	.quad	0xBFCA4E4187E29C86 /* A02 = -2.055131829740483584423e-01 */
-	.quad	0x3F98C30CED19F8F4 /* A03 = +2.418155858185229434287e-02 */
-	.quad	0x3FDD4B8255BEB078 /* A00 = +4.577337109901757905561e-01 */
-	.quad	0x3FE0858B19D3A49B /* A01 = +5.163016800335243905451e-01 */
-	.quad	0xBFC5BC929EACE564 /* A02 = -1.698172831327539045176e-01 */
-	.quad	0x3F93A083CE57DE2B /* A03 = +1.916700312537337677621e-02 */
-	.quad	0x3FE0A8E5E039295C /* A00 = +5.206174258576470315063e-01 */
-	.quad	0x3FDC35E1234583FE /* A01 = +4.407885403107342225937e-01 */
-	.quad	0xBFC1DE034E31AEB9 /* A02 = -1.395877963835710222629e-01 */
-	.quad	0x3F8EFDEBB3471BDC /* A03 = +1.513275280821162888101e-02 */
-	.quad	0x3FE2851B603CB2A5 /* A00 = +5.787484054213406503564e-01 */
-	.quad	0x3FD7F4A44ABBB286 /* A01 = +3.743067483726821853551e-01 */
-	.quad	0xBFBD3EEB67087DE7 /* A02 = -1.142413260026767657385e-01 */
-	.quad	0x3F8864F38329E8BD /* A03 = +1.191129917173260922836e-02 */
-	.quad	0x3FE437DBE3C34AC1 /* A00 = +6.318187187665317283702e-01 */
-	.quad	0x3FD43F6F789441B5 /* A01 = +3.163717916040938438194e-01 */
-	.quad	0xBFB7D92E7901B9A4 /* A02 = -9.315767721429907277653e-02 */
-	.quad	0x3F8327ED342308E1 /* A03 = +9.353497651663324544136e-03 */
-	.quad	0x3FE5C0977766D55C /* A00 = +6.797597248138731451661e-01 */
-	.quad	0x3FD10B42A764D8F9 /* A01 = +2.663122782427219115142e-01 */
-	.quad	0xBFB3633351D3D70F /* A02 = -7.573242900602060456716e-02 */
-	.quad	0x3F7E079E30FF899C /* A03 = +7.331483779099558922843e-03 */
-	.quad	0x3FE7202CE08A88C4 /* A00 = +7.226776490754436288455e-01 */
-	.quad	0x3FCC973EB5662B01 /* A01 = +2.233656297433626314319e-01 */
-	.quad	0xBFAF70A455F9920B /* A02 = -6.140626477716545211782e-02 */
-	.quad	0x3F77812411CE99B6 /* A03 = +5.738392731393584730859e-03 */
-	.quad	0x3FE85879424095B1 /* A00 = +7.608000082006382003286e-01 */
-	.quad	0x3FC7E73BD1674D84 /* A01 = +1.867441914060742336190e-01 */
-	.quad	0xBFA96F84E4BF333B /* A02 = -4.967894832916504993525e-02 */
-	.quad	0x3F72606DDCA6E117 /* A03 = +4.486493251924870105662e-03 */
-	.quad	0x3FE96BFE4957F4DD /* A00 = +7.944327766887472330737e-01 */
-	.quad	0x3FC3ED4780D25478 /* A01 = +1.556786898624158421711e-01 */
-	.quad	0xBFA489C5F9A56B58 /* A02 = -4.011362717093075458408e-02 */
-	.quad	0x3F6CB5DC17E9AD2A /* A03 = +3.504686231556104931972e-03 */
-	.quad	0x3FEA5D9CB2F41234 /* A00 = +8.239272589858672724006e-01 */
-	.quad	0x3FC091A758374DCF /* A01 = +1.294449978582705440555e-01 */
-	.quad	0xBFA08E436D4B5CE0 /* A02 = -3.233538350257858517978e-02 */
-	.quad	0x3F666997AD53E6B7 /* A03 = +2.735897297154145629133e-03 */
-	.quad	0x3FEB3060342CB850 /* A00 = +8.496552485501158713532e-01 */
-	.quad	0x3FBB7D30BBC7DC1B /* A01 = +1.073790033768634993860e-01 */
-	.quad	0xBF9AA6BA3443D9E3 /* A02 = -2.602663940430173170060e-02 */
-	.quad	0x3F617CA764B7850B /* A03 = +2.134634914668814050648e-03 */
-	.quad	0x3FEBE759A6A0C7B8 /* A00 = +8.719909910635044170135e-01 */
-	.quad	0x3FB6C10DE6A703FF /* A01 = +8.888327485239243264115e-02 */
-	.quad	0xBF956C566D8BE1F6 /* A02 = -2.092108768099084498138e-02 */
-	.quad	0x3F5B46D1A4A59CF8 /* A03 = +1.664833764687232917079e-03 */
-	.quad	0x3FEC858494887A04 /* A00 = +8.912985707318630268503e-01 */
-	.quad	0x3FB2CC31F543394D /* A01 = +7.342827070099140762682e-02 */
-	.quad	0xBF9133477FF69137 /* A02 = -1.679717749142747504343e-02 */
-	.quad	0x3F5544482FBB4DA5 /* A03 = +1.298017973501022466823e-03 */
-	.quad	0x3FED0DB59D0E32E9 /* A00 = +9.079235141267335551518e-01 */
-	.quad	0x3FAF006BAFFC6EF4 /* A01 = +6.055008433597022787787e-02 */
-	.quad	0xBF8B97146FA2B97A /* A02 = -1.347175565419144252499e-02 */
-	.quad	0x3F5093B01F4CDC69 /* A03 = +1.011774057770665211434e-03 */
-	.quad	0x3FEDB487C3EC457C /* A00 = +9.282873942012623835751e-01 */
-	.quad	0x3FA7390C09D0BD1D /* A01 = +4.535710925881118044112e-02 */
-	.quad	0xBF83D9F7C3181106 /* A02 = -9.693084374710735778846e-03 */
-	.quad	0x3F46E34A0A3C0E64 /* A03 = +6.984817050299072134500e-04 */
-	.quad	0x3FEE5FFCB4E6EB00 /* A00 = +9.492171796076434020506e-01 */
-	.quad	0x3F9F4913ED00AADF /* A01 = +3.055220731782070861526e-02 */
-	.quad	0xBF79670BD0E59B5C /* A02 = -6.201788097633133961528e-03 */
-	.quad	0x3F3BC998EBCAF96D /* A03 = +4.240034429975534616304e-04 */
-	.quad	0x3FEEDBA41E9542FE /* A00 = +9.643116566968215064293e-01 */
-	.quad	0x3F94F5DD18D9C24D /* A01 = +2.046914543319848858727e-02 */
-	.quad	0xBF7034896AA122B9 /* A02 = -3.956352980886528904192e-03 */
-	.quad	0x3F30DCCB47810B39 /* A03 = +2.573009765038273091199e-04 */
-	.quad	0x3FEF33F2882520ED /* A00 = +9.750912341196716903724e-01 */
-	.quad	0x3F8BF37F2CF553FF /* A01 = +1.364802699996836392315e-02 */
-	.quad	0xBF649F6F05A69619 /* A02 = -2.517430152880317534986e-03 */
-	.quad	0x3F247623C950AAC9 /* A03 = +1.561087307505231250044e-04 */
-	.quad	0x3FEF727757751741 /* A00 = +9.827229221489021115943e-01 */
-	.quad	0x3F828E67912C4400 /* A01 = +9.060677640748693306705e-03 */
-	.quad	0xBF5A2F51A806CC2C /* A02 = -1.598195784123355826789e-03 */
-	.quad	0x3F18D35D7687E613 /* A03 = +9.470231965016282719549e-05 */
-	.quad	0x3FEF9E6325C5942A /* A00 = +9.880843866091073568469e-01 */
-	.quad	0x3F788AB117618F76 /* A01 = +5.991641772286606867914e-03 */
-	.quad	0xBF5096EAB0B1EA89 /* A02 = -1.012543859160305046233e-03 */
-	.quad	0x3F0E1E50EC4435AB /* A03 = +5.744633156910412119652e-05 */
-	.quad	0x3FEFBD0784049369 /* A00 = +9.918248728250605994461e-01 */
-	.quad	0x3F702BBD8294035F /* A01 = +3.947963975634432264028e-03 */
-	.quad	0xBF44FB55E0F00593 /* A02 = -6.403130845457509273330e-04 */
-	.quad	0x3F0244DCD723230A /* A03 = +3.484534217219031730379e-05 */
-	.quad	0x3FEFD245E2366A43 /* A00 = +9.944180887426415926811e-01 */
-	.quad	0x3F653D82EC088433 /* A01 = +2.592807490387838333795e-03 */
-	.quad	0xBF3A7DF75E013CB8 /* A02 = -4.042366908878036561859e-04 */
-	.quad	0x3EF6298E69F991CD /* A03 = +2.113564425911141559972e-05 */
-	.quad	0x3FEFE0EAA508BC69 /* A00 = +9.962056372950317539861e-01 */
-	.quad	0x3F5BD0771AF3FDDA /* A01 = +1.697651208644282514598e-03 */
-	.quad	0xBF30B2E1254DE571 /* A02 = -2.548026725928887099328e-04 */
-	.quad	0x3EEAE28B70EC0256 /* A03 = +1.281973848454955042307e-05 */
-	.quad	0x3FEFEAF5303D7F96 /* A00 = +9.974313680831865536192e-01 */
-	.quad	0x3F5229111365657E /* A01 = +1.108423877289460134782e-03 */
-	.quad	0xBF250572D04DFE66 /* A02 = -1.603796628408704519168e-04 */
-	.quad	0x3EE04E89BB57C981 /* A03 = +7.775682983689149966743e-06 */
-	.quad	0x3FEFF1CF52F1CF44 /* A00 = +9.982678051005469122003e-01 */
-	.quad	0x3F47A71316147CEB /* A01 = +7.218211359577819110842e-04 */
-	.quad	0xBF1A6D7604055719 /* A02 = -1.008132248946049582547e-04 */
-	.quad	0x3ED3C8047586A85C /* A03 = +4.716233739913014633626e-06 */
-	.quad	0x3FEFF6770369EF69 /* A00 = +9.988360468555416149528e-01 */
-	.quad	0x3F3EBB261180FBF0 /* A01 = +4.689186039321105101130e-04 */
-	.quad	0xBF1097754FE19D7F /* A02 = -6.329206004950480057066e-05 */
-	.quad	0x3EC7FEFF83BCA0A7 /* A03 = +2.860556404988488738366e-06 */
-	.quad	0x3FEFF99D42371AC4 /* A00 = +9.992204945818561334647e-01 */
-	.quad	0x3F33EB2AEC271F59 /* A01 = +3.039340773764907474054e-04 */
-	.quad	0xBF04CF18E0FC0D79 /* A02 = -3.968996690952969588805e-05 */
-	.quad	0x3EBD1BDBD6019BE9 /* A03 = +1.735021065507727833886e-06 */
-	.quad	0x3FEFFBBCA32B0D91 /* A00 = +9.994795977476532700123e-01 */
-	.quad	0x3F29C41E1615110A /* A01 = +1.965796209707565346710e-04 */
-	.quad	0xBEFA11F93D9DCB5A /* A02 = -2.486248909101414873235e-05 */
-	.quad	0x3EB1A7CA4546F7A7 /* A03 = +1.052345642723709228769e-06 */
-	.quad	0x3FEFFD298B8E8DE2 /* A00 = +9.996535993308806045121e-01 */
-	.quad	0x3F20A1C42D523C5B /* A01 = +1.268913244172078754520e-04 */
-	.quad	0xBEF0507A364AFAE4 /* A02 = -1.555859070622834605755e-05 */
-	.quad	0x3EA56ACA17E7CDF4 /* A03 = +6.382806956848098872313e-07 */
-	.quad	0x3FEFFE1DC82BA5A3 /* A00 = +9.997700604991915929176e-01 */
-	.quad	0x3F156E73B90F1769 /* A01 = +8.175450626798714452801e-05 */
-	.quad	0xBEE4663579D0A09F /* A02 = -9.727122057226747625365e-06 */
-	.quad	0x3E99FAF6FEC5D4C1 /* A03 = +3.871371052824002996020e-07 */
-	.quad	0x3FEFFEF8D0BB5E81 /* A00 = +9.998745037837154514548e-01 */
-	.quad	0x3F06686DA18D39C3 /* A01 = +4.273972098777251447726e-05 */
-	.quad	0xBED46BC298073E90 /* A02 = -4.868731025855742842491e-06 */
-	.quad	0x3E88E42286B9D0FD /* A03 = +1.854535328530838170114e-07 */
-	.quad	0x3FEFFF8DBC68DDC7 /* A00 = +9.999455146670975791423e-01 */
-	.quad	0x3EF26B2953A80AF0 /* A01 = +1.756534514108903368909e-05 */
-	.quad	0xBEBFC4472D580F83 /* A02 = -1.893443529411295465239e-06 */
-	.quad	0x3E72505B4553D19F /* A03 = +6.822456673547912277047e-08 */
-	.quad	0x3FEFFFCED1276609 /* A00 = +9.999765477215883935358e-01 */
-	.quad	0x3EDE1A94C7CC58F5 /* A01 = +7.177313020153979672606e-06 */
-	.quad	0xBEA8A2C988744E57 /* A02 = -7.342066660497443762363e-07 */
-	.quad	0x3E5AF30036BBBAF4 /* A03 = +2.509841882843541084885e-08 */
-	.quad	0x3FEFFFEAFE70FCFC /* A00 = +9.999899835164849370983e-01 */
-	.quad	0x3EC879175E3549F5 /* A01 = +2.917410471128503564412e-06 */
-	.quad	0xBE930E36677D1813 /* A02 = -2.839493400307523115929e-07 */
-	.quad	0x3E43D4005B42D48F /* A03 = +9.233192745401904898013e-09 */
-	.quad	0x3ff0000000000000
-	.quad	0x0000000000000000
-	.quad	0x0000000000000000
-	.quad	0x0000000000000000
-	.align	32
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSignMask */
-	.align	32
-	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _sAbsMask */
-	.align	32
-	.long	0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000 /* _iExpMantMask */
-	.align	32
-	.long	0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000 /* _iExpMask */
-	.align	32
-	.long	0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000 /* _iMinIdxOfsMask */
-	.align	32
-	.long	0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000 /* _iMaxIdxMask */
-	.align	32
-	.type	__svml_stanh_data_internal, @object
-	.size	__svml_stanh_data_internal, .-__svml_stanh_data_internal
+	/* All results have been written to 32(%rsp).  */
+	vmovups	(%rsp), %ymm0
+	/* Restore rsp.  */
+	movq	%r13, %rsp
+	cfi_def_cfa_register(rsp)
+	/* Restore callee save registers.  */
+	popq	%rbp
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(rbp)
+	popq	%rbx
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(rbp)
+	popq	%r13
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(r13)
+	ret
+END(_ZGVdN8v_tanhf_avx2)
-- 
2.34.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH v2 7/7] x86: Optimize svml_s_tanhf4_core_sse4.S
  2022-06-09  0:05 ` [PATCH v2 1/7] x86: Improve svml_s_atanhf16_core_avx512.S Noah Goldstein
                     ` (4 preceding siblings ...)
  2022-06-09  0:05   ` [PATCH v2 6/7] x86: Optimize svml_s_tanhf8_core_avx2.S Noah Goldstein
@ 2022-06-09  0:05   ` Noah Goldstein
  2022-06-09 15:59   ` [PATCH v2 1/7] x86: Improve svml_s_atanhf16_core_avx512.S H.J. Lu
  6 siblings, 0 replies; 48+ messages in thread
From: Noah Goldstein @ 2022-06-09  0:05 UTC (permalink / raw)
  To: libc-alpha

Optimizations are:
    1. Reduce code size (-112 bytes).
    2. Remove redundant move instructions.
    3. Slightly improve instruction selection/scheduling where
       possible.
    4. Prefer registers which get short instruction encoding.
    5. Reduce rodata size (-4k+, as the rodata is now shared with avx2).

Result is roughly a 15-16% speedup:

       Function, New Time, Old Time, New / Old
 _ZGVbN4v_tanhf,    3.158,    3.749,     0.842
---
 .../fpu/multiarch/svml_s_tanhf4_core_sse4.S   | 865 +++---------------
 1 file changed, 138 insertions(+), 727 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
index 532ebbac65..da56a9d794 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
@@ -70,761 +70,172 @@
  *
  */
 
-/* Offsets for data table __svml_stanh_data_internal
- */
-#define _dbP				0
-#define _sSignMask			4288
-#define _sAbsMask			4304
-#define _iExpMantMask			4320
-#define _iExpMask			4336
-#define _iMinIdxOfsMask			4352
-#define _iMaxIdxMask			4368
 
 #include <sysdep.h>
 
+/* tanhf data tables for avx2 and sse4 implementations defined here.
+ */
+#define ONLY_DECL_OFFSET
+#include "svml_s_tanhf_rodata.S"
+
 	.section .text.sse4, "ax", @progbits
 ENTRY(_ZGVbN4v_tanhf_sse4)
-	subq	$72, %rsp
-	cfi_def_cfa_offset(80)
-	movaps	%xmm0, %xmm5
+	/* Save copy of input in xmm12.  */
+	movaps	%xmm0, %xmm12
 
 	/* Here huge arguments, INF and NaNs are filtered out to callout. */
-	movdqu	_iExpMantMask+__svml_stanh_data_internal(%rip), %xmm9
-	lea	_dbP+16+__svml_stanh_data_internal(%rip), %r8
-	pand	%xmm5, %xmm9
+	movdqu	TANHF_DATA(_iExpMantMask)(%rip), %xmm3
+	pand	%xmm0, %xmm3
 
-	/* if VMIN, VMAX is defined for I type */
+
+	/* Selection of arguments between [0, 0x04280000] into xmm3.  */
 	pxor	%xmm7, %xmm7
-	movdqa	%xmm9, %xmm6
-	psubd	_iMinIdxOfsMask+__svml_stanh_data_internal(%rip), %xmm9
+	/* Save xmm3 for special values check at end.  */
+	movdqa	%xmm3, %xmm8
+	psubd	TANHF_DATA(_iMinIdxOfsMask)(%rip), %xmm3
+	pmaxsd	%xmm7, %xmm3
+	pminsd	TANHF_DATA(_iMaxIdxMask)(%rip), %xmm3
+	psrld	$14, %xmm3
+
+	movq	%xmm3, %rcx
+	movl	%ecx, %edx
+	shrq	$32, %rcx
+
+	pshufd	$0x0e, %xmm3, %xmm3
+	movq	%xmm3, %rdi
+	movl	%edi, %esi
+	shrq	$32, %rdi
+
+	movaps	TANHF_DATA(_sAbsMask)(%rip), %xmm1
+	andps	%xmm1, %xmm0
+
+	leaq	TANHF_DATA(_lookupTable)(%rip), %rax
+	movups	(%rdx, %rax), %xmm2
+	movups	(%rcx, %rax), %xmm6
 
 	/*
 	 *  small table specific variables *
 	 *  Constant loading
 	 */
-	movdqu	_iMaxIdxMask+__svml_stanh_data_internal(%rip), %xmm10
-	movdqa	%xmm9, %xmm11
-	movdqa	%xmm9, %xmm8
-	pcmpgtd	%xmm10, %xmm11
-	pcmpgtd	%xmm7, %xmm8
-	movdqa	%xmm11, %xmm14
-	pand	%xmm8, %xmm9
-	andps	%xmm11, %xmm10
-	andnps	%xmm9, %xmm14
-	orps	%xmm10, %xmm14
-	psrld	$14, %xmm14
-	movd	%xmm14, %edx
-	pshufd	$1, %xmm14, %xmm12
-	pshufd	$2, %xmm14, %xmm13
-	movd	%xmm12, %ecx
-	pshufd	$3, %xmm14, %xmm15
-	movups	_sAbsMask+__svml_stanh_data_internal(%rip), %xmm3
-	movslq	%edx, %rdx
-	andps	%xmm5, %xmm3
-	movslq	%ecx, %rcx
-	pcmpgtd	_iExpMask+__svml_stanh_data_internal(%rip), %xmm6
-	movd	%xmm13, %esi
-	movups	-16(%rdx, %r8), %xmm2
-	movaps	%xmm2, %xmm0
-	movd	%xmm15, %edi
-	movmskps %xmm6, %eax
-	movups	-16(%rcx, %r8), %xmm6
-	unpcklpd %xmm6, %xmm0
+	movaps	%xmm2, %xmm4
+	movlhps	%xmm6, %xmm4
 	unpckhpd %xmm6, %xmm2
-	cvtps2pd %xmm3, %xmm6
-	movhlps	%xmm3, %xmm3
-	cvtps2pd %xmm3, %xmm3
-	movslq	%esi, %rsi
-	movslq	%edi, %rdi
-	movups	(%rcx, %r8), %xmm8
-	movups	(%rdx, %r8), %xmm12
-	movups	(%rsi, %r8), %xmm13
-	movaps	%xmm12, %xmm10
-	movups	(%rdi, %r8), %xmm9
+
+	cvtps2pd %xmm0, %xmm6
+	movhlps	%xmm0, %xmm0
+	cvtps2pd %xmm0, %xmm0
+
+	movups	16(%rdx, %rax), %xmm5
+	movups	16(%rsi, %rax), %xmm13
+
+	movaps	%xmm5, %xmm10
 	movaps	%xmm13, %xmm11
-	unpckhpd %xmm8, %xmm12
-	unpckhpd %xmm9, %xmm13
-	mulpd	%xmm6, %xmm12
-	mulpd	%xmm3, %xmm13
-	unpcklpd %xmm8, %xmm10
-	unpcklpd %xmm9, %xmm11
-	addpd	%xmm10, %xmm12
+
+	movups	16(%rcx, %rax), %xmm7
+	movups	16(%rdi, %rax), %xmm3
+
+	unpckhpd %xmm7, %xmm5
+	unpckhpd %xmm3, %xmm13
+
+	mulpd	%xmm6, %xmm5
+	mulpd	%xmm0, %xmm13
+
+	movlhps	%xmm7, %xmm10
+	movlhps	%xmm3, %xmm11
+
+	addpd	%xmm10, %xmm5
 	addpd	%xmm11, %xmm13
-	mulpd	%xmm6, %xmm12
-	mulpd	%xmm3, %xmm13
-	addpd	%xmm2, %xmm12
-	movups	-16(%rsi, %r8), %xmm1
-	movups	-16(%rdi, %r8), %xmm7
-	movaps	%xmm1, %xmm14
-	unpckhpd %xmm7, %xmm1
-	addpd	%xmm1, %xmm13
-	mulpd	%xmm12, %xmm6
-	mulpd	%xmm13, %xmm3
-	addpd	%xmm0, %xmm6
-	unpcklpd %xmm7, %xmm14
-	addpd	%xmm14, %xmm3
-	cvtpd2ps %xmm6, %xmm0
-	cvtpd2ps %xmm3, %xmm1
-	movups	_sSignMask+__svml_stanh_data_internal(%rip), %xmm4
-	movlhps	%xmm1, %xmm0
-	andps	%xmm5, %xmm4
-	orps	%xmm4, %xmm0
-	testl	%eax, %eax
 
-	/* Go to special inputs processing branch */
-	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm5
+	mulpd	%xmm6, %xmm5
+	mulpd	%xmm0, %xmm13
 
-	/* Restore registers
-	 * and exit the function
-	 */
+	addpd	%xmm2, %xmm5
 
-L(EXIT):
-	addq	$72, %rsp
-	cfi_def_cfa_offset(8)
-	ret
-	cfi_def_cfa_offset(80)
+	movups	(%rsi, %rax), %xmm2
+	movups	(%rdi, %rax), %xmm7
 
-	/* Branch to process
-	 * special inputs
-	 */
+	movaps	%xmm2, %xmm3
 
-L(SPECIAL_VALUES_BRANCH):
-	movups	%xmm5, 32(%rsp)
-	movups	%xmm0, 48(%rsp)
-	# LOE rbx rbp r12 r13 r14 r15 eax
-
-	xorl	%edx, %edx
-	movq	%r12, 16(%rsp)
-	cfi_offset(12, -64)
-	movl	%edx, %r12d
-	movq	%r13, 8(%rsp)
-	cfi_offset(13, -72)
-	movl	%eax, %r13d
-	movq	%r14, (%rsp)
-	cfi_offset(14, -80)
-	# LOE rbx rbp r15 r12d r13d
-
-	/* Range mask
-	 * bits check
-	 */
+	unpckhpd %xmm7, %xmm2
+	movlhps	%xmm7, %xmm3
 
-L(RANGEMASK_CHECK):
-	btl	%r12d, %r13d
+	addpd	%xmm13, %xmm2
 
-	/* Call scalar math function */
-	jc	L(SCALAR_MATH_CALL)
-	# LOE rbx rbp r15 r12d r13d
+	mulpd	%xmm5, %xmm6
+	addpd	%xmm4, %xmm6
 
-	/* Special inputs
-	 * processing loop
-	 */
+	mulpd	%xmm2, %xmm0
+	addpd	%xmm3, %xmm0
 
-L(SPECIAL_VALUES_LOOP):
-	incl	%r12d
-	cmpl	$4, %r12d
-
-	/* Check bits in range mask */
-	jl	L(RANGEMASK_CHECK)
-	# LOE rbx rbp r15 r12d r13d
-
-	movq	16(%rsp), %r12
-	cfi_restore(12)
-	movq	8(%rsp), %r13
-	cfi_restore(13)
-	movq	(%rsp), %r14
-	cfi_restore(14)
-	movups	48(%rsp), %xmm0
-
-	/* Go to exit */
-	jmp	L(EXIT)
-	cfi_offset(12, -64)
-	cfi_offset(13, -72)
-	cfi_offset(14, -80)
-	# LOE rbx rbp r12 r13 r14 r15 xmm0
+	cvtpd2ps %xmm0, %xmm2
+	cvtpd2ps %xmm6, %xmm0
 
-	/* Scalar math fucntion call
-	 * to process special input
-	 */
+	movlhps	%xmm2, %xmm0
+	andnps	%xmm12, %xmm1
+	orps	%xmm1, %xmm0
 
-L(SCALAR_MATH_CALL):
-	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
-	call	tanhf@PLT
-	# LOE rbx rbp r14 r15 r12d r13d xmm0
+	/* xmm8 contains mask of special values.  */
+	pcmpgtd	TANHF_DATA(_iExpMask)(%rip), %xmm8
 
-	movss	%xmm0, 48(%rsp, %r14, 4)
+	movmskps %xmm8, %edx
+	testl	%edx, %edx
 
-	/* Process special inputs in loop */
-	jmp	L(SPECIAL_VALUES_LOOP)
-	# LOE rbx rbp r15 r12d r13d
-END(_ZGVbN4v_tanhf_sse4)
+	/* Go to special inputs processing branch */
+	jne	L(SPECIAL_VALUES_BRANCH)
+	# LOE rbx rbp r12 r13 r14 r15 xmm0
+	/* No stack restoration on the fastpath.  */
+	ret
 
-	.section .rodata, "a"
-	.align	16
-
-#ifdef __svml_stanh_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(16)) VUINT32 _dbP[(134*4)][2];
-	__declspec(align(16)) VUINT32 _sSignMask[4][1];
-	__declspec(align(16)) VUINT32 _sAbsMask[4][1];
-	__declspec(align(16)) VUINT32 _iExpMantMask[4][1];
-	__declspec(align(16)) VUINT32 _iExpMask[4][1];
-	__declspec(align(16)) VUINT32 _iMinIdxOfsMask[4][1];
-	__declspec(align(16)) VUINT32 _iMaxIdxMask[4][1];
-} __svml_stanh_data_internal;
-#endif
-__svml_stanh_data_internal:
-	/* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500]. */
-	.quad	0x0000000000000000 /* A00 = +0.000000000000000000000e-01 */
-	.quad	0x3FF00000022C70EB /* A01 = +1.000000008097283510367e+00 */
-	.quad	0xBED00E878CFFA194 /* A02 = -3.828228912518614443549e-06 */
-	.quad	0xBFD551766D0607A9 /* A03 = -3.330970825846813476723e-01 */
-	.quad	0xBE53D60CE3E4C297 /* A00 = -1.847383956330407336230e-08 */
-	.quad	0x3FF000024177CF5C /* A01 = +1.000002151235967140508e+00 */
-	.quad	0xBF1758BC94A51A25 /* A02 = -8.906031613262943753568e-05 */
-	.quad	0xBFD53EAE67E0D4F0 /* A03 = -3.319507612644221339337e-01 */
-	.quad	0xBE5A9E47EF32D6FE /* A00 = -2.479020984039698285657e-08 */
-	.quad	0x3FF00002DA983057 /* A01 = +1.000002721676556793895e+00 */
-	.quad	0xBF1BD953509E94AA /* A02 = -1.062352277175377670507e-04 */
-	.quad	0xBFD53BDB562EEDD5 /* A03 = -3.317783681520414806876e-01 */
-	.quad	0xBE6191BBE496D294 /* A00 = -3.272532162914017685901e-08 */
-	.quad	0x3FF0000390492017 /* A01 = +1.000003398528866105366e+00 */
-	.quad	0xBF20727E814A57CE /* A02 = -1.254825043772153972919e-04 */
-	.quad	0xBFD538DE060A6F22 /* A03 = -3.315959033004550748913e-01 */
-	.quad	0xBE66DAFA2A893A25 /* A00 = -4.257146219278012568149e-08 */
-	.quad	0x3FF0000465E08CD1 /* A01 = +1.000004194219219266770e+00 */
-	.quad	0xBF2341C765EF91B6 /* A02 = -1.469188600530365522261e-04 */
-	.quad	0xBFD535B6841FAF9E /* A03 = -3.314033785124993469751e-01 */
-	.quad	0xBE6D5794E361E964 /* A00 = -5.465394929765249413434e-08 */
-	.quad	0x3FF000055EE2A0CB /* A01 = +1.000005121846742950353e+00 */
-	.quad	0xBF265E6C77E66C8B /* A02 = -1.706607253709506650304e-04 */
-	.quad	0xBFD53264DDCCEDA6 /* A03 = -3.312008062382240103361e-01 */
-	.quad	0xBE729C844D374A6E /* A00 = -6.933284462462096107184e-08 */
-	.quad	0x3FF000067F019093 /* A01 = +1.000006195180536350264e+00 */
-	.quad	0xBF29CC5348D6DCE5 /* A02 = -1.968242326435338705130e-04 */
-	.quad	0xBFD52EE92121ED35 /* A03 = -3.309881995734998416658e-01 */
-	.quad	0xBE775AEA17EAA872 /* A00 = -8.700465590574974405858e-08 */
-	.quad	0x3FF00007CA1D66B8 /* A01 = +1.000007428656699559610e+00 */
-	.quad	0xBF2D8F5EB98A2637 /* A02 = -2.255252009216044881395e-04 */
-	.quad	0xBFD52B435CDF9128 /* A03 = -3.307655722585587376727e-01 */
-	.quad	0xBE7D04DA28C343F0 /* A00 = -1.081040272327705484794e-07 */
-	.quad	0x3FF000094443CCF5 /* A01 = +1.000008837375216730337e+00 */
-	.quad	0xBF30D5B76C947AE5 /* A02 = -2.568791210978817814332e-04 */
-	.quad	0xBFD52773A0776FAD /* A03 = -3.305329386764651045105e-01 */
-	.quad	0xBE81DD77A12C51C7 /* A00 = -1.331054169875768625701e-07 */
-	.quad	0x3FF0000AF1AFD2DA /* A01 = +1.000010437096696680470e+00 */
-	.quad	0xBF331230624C1680 /* A02 = -2.910011410651516805537e-04 */
-	.quad	0xBFD52379FC0B61DF /* A03 = -3.302903138515186909352e-01 */
-	.quad	0xBE85D04EEEB3C435 /* A00 = -1.625247628488202841012e-07 */
-	.quad	0x3FF0000CD6C9B1F2 /* A01 = +1.000012244238970726684e+00 */
-	.quad	0xBF357F0742FADDD4 /* A02 = -3.280060509313874068243e-04 */
-	.quad	0xBFD51F56806D0E81 /* A03 = -3.300377134475880880338e-01 */
-	.quad	0xBE8A6E289B59681B /* A00 = -1.969211333326924655065e-07 */
-	.quad	0x3FF0000EF8268F72 /* A01 = +1.000014275873550406715e+00 */
-	.quad	0xBF381E277A1B747A /* A02 = -3.680082682942575423093e-04 */
-	.quad	0xBFD51B093F1D6FD4 /* A03 = -3.297751537663746734808e-01 */
-	.quad	0xBE8FCBC40EE9ABD5 /* A00 = -2.368983653301529373887e-07 */
-	.quad	0x3FF000115A883B6C /* A01 = +1.000016549721943981410e+00 */
-	.quad	0xBF3AF17AC974B3D9 /* A02 = -4.111218235774406434303e-04 */
-	.quad	0xBFD516924A4C549C /* A03 = -3.295026517456081105450e-01 */
-	.quad	0xBE92FFBC60A3F956 /* A00 = -2.831066871072026054144e-07 */
-	.quad	0x3FF0001402DCED8A /* A01 = +1.000019084151832604590e+00 */
-	.quad	0xBF3DFAE9390C4801 /* A02 = -4.574603454311488280083e-04 */
-	.quad	0xBFD511F1B4D7DC3A /* A03 = -3.292202249571719585575e-01 */
-	.quad	0xBE9690A22F96D5AD /* A00 = -3.362443262393081632612e-07 */
-	.quad	0x3FF00016F63EFF5D /* A01 = +1.000021898173108825247e+00 */
-	.quad	0xBF409E2C839605BB /* A02 = -5.071370461992499986334e-04 */
-	.quad	0xBFD50D27924BEE00 /* A03 = -3.289278916051614487515e-01 */
-	.quad	0xBE9AA56C65E72A73 /* A00 = -3.970591019557469835586e-07 */
-	.quad	0x3FF0001A39F4A43E /* A01 = +1.000025011433776978009e+00 */
-	.quad	0xBF425BD74C3D6667 /* A02 = -5.602647074553602319844e-04 */
-	.quad	0xBFD50833F6E1ABA2 /* A03 = -3.286256705238718156536e-01 */
-	.quad	0xBE9F4BD4FF1A83B0 /* A00 = -4.663500013744687071912e-07 */
-	.quad	0x3FF0001DD36F9EC2 /* A01 = +1.000028444215715683896e+00 */
-	.quad	0xBF44376634149405 /* A02 = -6.169556656102642569831e-04 */
-	.quad	0xBFD50316F77EDEE5 /* A03 = -3.283135811757190158922e-01 */
-	.quad	0xBEA3B625387BB079 /* A00 = -5.874486399249461304297e-07 */
-	.quad	0x3FF00023E14CFBA9 /* A01 = +1.000034217911642153709e+00 */
-	.quad	0xBF47392F923218D2 /* A02 = -7.087213783883111826306e-04 */
-	.quad	0xBFD4FB1FACDEB938 /* A03 = -3.278273761924483942209e-01 */
-	.quad	0xBEAA6E24F543500A /* A00 = -7.876828740601738750574e-07 */
-	.quad	0x3FF0002D5C6E8412 /* A01 = +1.000043259679163742959e+00 */
-	.quad	0xBF4BAF02BD7FDD70 /* A02 = -8.448375110664940040861e-04 */
-	.quad	0xBFD4EFEE6527A7DE /* A03 = -3.271442401734229177279e-01 */
-	.quad	0xBEB16E3EBE2157D0 /* A00 = -1.038947396133402500647e-06 */
-	.quad	0x3FF00038990FEE2F /* A01 = +1.000053975962952312884e+00 */
-	.quad	0xBF50569481C574CB /* A02 = -9.972048056490652716971e-04 */
-	.quad	0xBFD4E419278DA2B4 /* A03 = -3.264220129263251113372e-01 */
-	.quad	0xBEB6A7B6723165D4 /* A00 = -1.350350836279403750524e-06 */
-	.quad	0x3FF00045CAB4158E /* A01 = +1.000066558657042303793e+00 */
-	.quad	0xBF531D7C9C849108 /* A02 = -1.166698160951775212202e-03 */
-	.quad	0xBFD4D7A0BB33B152 /* A03 = -3.256608799117844954552e-01 */
-	.quad	0xBEBD0EE2A8654AFD /* A00 = -1.732000471561702711532e-06 */
-	.quad	0x3FF00055276F18D6 /* A01 = +1.000081209219890521211e+00 */
-	.quad	0xBF562FDBA3FB6C6C /* A02 = -1.354183666925102939860e-03 */
-	.quad	0xBFD4CA85F1B93DB2 /* A03 = -3.248610363561638125773e-01 */
-	.quad	0xBEC269D4036A207E /* A00 = -2.195047297096822741730e-06 */
-	.quad	0x3FF00066E7DA6E4E /* A01 = +1.000098138500919997540e+00 */
-	.quad	0xBF5991499FC36B3A /* A02 = -1.560518167983372759405e-03 */
-	.quad	0xBFD4BCC9A72283D6 /* A03 = -3.240226871658341556426e-01 */
-	.quad	0xBEC7154B6C09CFE1 /* A00 = -2.751729738565190291276e-06 */
-	.quad	0x3FF0007B47086B80 /* A01 = +1.000117566559055148900e+00 */
-	.quad	0xBF5D455433B4F8F4 /* A02 = -1.786548832412968197680e-03 */
-	.quad	0xBFD4AE6CC1BFE145 /* A03 = -3.231460468373550942722e-01 */
-	.quad	0xBECCA68CC64A0F8A /* A00 = -3.415415948561670285790e-06 */
-	.quad	0x3FF00092827742F7 /* A01 = +1.000139722473418535387e+00 */
-	.quad	0xBF60A7BF15A527AF /* A02 = -2.033112728132522705610e-03 */
-	.quad	0xBFD49F703214084C /* A03 = -3.222313393636155876010e-01 */
-	.quad	0xBED19E68676B241B /* A00 = -4.200644630977303616698e-06 */
-	.quad	0x3FF000ACDA037B26 /* A01 = +1.000164844146362863597e+00 */
-	.quad	0xBF62D99F836A02F8 /* A02 = -2.301036405072284102280e-03 */
-	.quad	0xBFD48FD4F2B91B28 /* A03 = -3.212787981359945810311e-01 */
-	.quad	0xBED57CF4B0C7AA54 /* A00 = -5.123164339408145209103e-06 */
-	.quad	0x3FF000CA8FD9E1A1 /* A01 = +1.000193178099017865534e+00 */
-	.quad	0xBF653A014548E686 /* A02 = -2.591135484433962181405e-03 */
-	.quad	0xBFD47F9C0844B38F /* A03 = -3.202886658426046806447e-01 */
-	.quad	0xBEDA012B1B1A41E2 /* A00 = -6.199971197454598722328e-06 */
-	.quad	0x3FF000EBE868FDF4 /* A01 = +1.000224979259539459520e+00 */
-	.quad	0xBF67CA9427E0A544 /* A02 = -2.904214255086275467410e-03 */
-	.quad	0xBFD46EC6812ADB37 /* A03 = -3.192611943626845749655e-01 */
-	.quad	0xBEDF3EAC5BF12194 /* A00 = -7.449344990702664567927e-06 */
-	.quad	0x3FF001112A520784 /* A01 = +1.000260510744255704196e+00 */
-	.quad	0xBF6A8D01ABDA4DC4 /* A02 = -3.241065277345108255891e-03 */
-	.quad	0xBFD45D55759FFA4A /* A03 = -3.181966446572103146551e-01 */
-	.quad	0xBEE2A541BC274267 /* A00 = -8.890883582164319970972e-06 */
-	.quad	0x3FF0013A9E5961F2 /* A01 = +1.000300043631906721231e+00 */
-	.quad	0xBF6D82ECD080C540 /* A02 = -3.602468994380686462264e-03 */
-	.quad	0xBFD44B4A0779C0AD /* A03 = -3.170952866557950611259e-01 */
-	.quad	0xBEE61D97609A27F4 /* A00 = -1.054553560499505625520e-05 */
-	.quad	0x3FF001688F56A3AF /* A01 = +1.000343856731187974773e+00 */
-	.quad	0xBF7056F8EFB683EC /* A02 = -3.989193351487490407647e-03 */
-	.quad	0xBFD438A5620F0F74 /* A03 = -3.159573991399533543500e-01 */
-	.quad	0xBEEA145429EDD370 /* A00 = -1.243563138839952927732e-05 */
-	.quad	0x3FF0019B4A242A67 /* A01 = +1.000392236341804297339e+00 */
-	.quad	0xBF7207D31CA78D9B /* A02 = -4.401993423445739288258e-03 */
-	.quad	0xBFD42568BA16E7CD /* A03 = -3.147832696228050619602e-01 */
-	.quad	0xBEEE96370D52680F /* A00 = -1.458491207477835326165e-05 */
-	.quad	0x3FF001D31D8E4115 /* A01 = +1.000445476009251821736e+00 */
-	.quad	0xBF73D4CC11EDC094 /* A02 = -4.841611050196221316400e-03 */
-	.quad	0xBFD411954D8664E7 /* A03 = -3.135731942252974469021e-01 */
-	.quad	0xBEF338C046215EF8 /* A00 = -1.833122622260562810219e-05 */
-	.quad	0x3FF00230C32C2EC1 /* A01 = +1.000534784691737621998e+00 */
-	.quad	0xBF76BD019BCC5DAF /* A02 = -5.551344188254799492943e-03 */
-	.quad	0xBFD3F2C7156DC21E /* A03 = -3.116929730668135389848e-01 */
-	.quad	0xBEF9B15EAE411EAE /* A00 = -2.450261207822986676092e-05 */
-	.quad	0x3FF002C2DF057A4D /* A01 = +1.000674124886830940184e+00 */
-	.quad	0xBF7B08CCD9AC1E30 /* A02 = -6.600189396301511801646e-03 */
-	.quad	0xBFD3C7A7A114FED8 /* A03 = -3.090609620157755976777e-01 */
-	.quad	0xBF00E36483C373B3 /* A00 = -3.221178528332122595812e-05 */
-	.quad	0x3FF0036F419480D7 /* A01 = +1.000838524028997644777e+00 */
-	.quad	0xBF7FD255D1777007 /* A02 = -7.768950679260206403087e-03 */
-	.quad	0xBFD39A453911D6CE /* A03 = -3.062909180947429588215e-01 */
-	.quad	0xBF05DFA04DD12059 /* A00 = -4.172046622180685472624e-05 */
-	.quad	0x3FF00438B2A03D8D /* A01 = +1.001030633695197069599e+00 */
-	.quad	0xBF828F8DBB4A9D10 /* A02 = -9.062869337255224921890e-03 */
-	.quad	0xBFD36AAB704697D9 /* A03 = -3.033856007044711255993e-01 */
-	.quad	0xBF0BF3E0C647DEFB /* A00 = -5.331544597092331081714e-05 */
-	.quad	0x3FF005221063D36D /* A01 = +1.001253189109060359741e+00 */
-	.quad	0xBF857A2CB3C96102 /* A02 = -1.048693584122917590862e-02 */
-	.quad	0xBFD338E65BBB4FEC /* A03 = -3.003478904549854444639e-01 */
-	.quad	0xBF11A506ED7C9D31 /* A00 = -6.730894835681591541979e-05 */
-	.quad	0x3FF0062E4D0EA92A /* A01 = +1.001508999829250345925e+00 */
-	.quad	0xBF88AB82C2761AF3 /* A02 = -1.204588085125866091241e-02 */
-	.quad	0xBFD305028D6BD206 /* A03 = -2.971807843271395688234e-01 */
-	.quad	0xBF1607C0922D9BF1 /* A00 = -8.403885708006799337092e-05 */
-	.quad	0x3FF007606C341961 /* A01 = +1.001800940198869449560e+00 */
-	.quad	0xBF8C25E6DA487BCF /* A02 = -1.374416688582682892494e-02 */
-	.quad	0xBFD2CF0D0EE8F7B5 /* A03 = -2.938873906713255768075e-01 */
-	.quad	0xBF1B3A8480A0A16D /* A00 = -1.038688061788578038307e-04 */
-	.quad	0x3FF008BB802D02D6 /* A01 = +1.002131939589323561535e+00 */
-	.quad	0xBF8FEB8AE99FD100 /* A02 = -1.558598065819483124983e-02 */
-	.quad	0xBFD297135BD0911B /* A03 = -2.904709240558688843059e-01 */
-	.quad	0xBF20ABB9BDB75C65 /* A00 = -1.271881327357976163798e-04 */
-	.quad	0x3FF00A42A76D8CD1 /* A01 = +1.002504972472525901495e+00 */
-	.quad	0xBF91FF3D752BB9E6 /* A02 = -1.757522609380570560722e-02 */
-	.quad	0xBFD25D235C1F88B4 /* A03 = -2.869346999779154305799e-01 */
-	.quad	0xBF243D3254425461 /* A00 = -1.544116913733432829448e-04 */
-	.quad	0x3FF00BF909D1795E /* A01 = +1.002923048355647051011e+00 */
-	.quad	0xBF94304E04D44942 /* A02 = -1.971551804042204897316e-02 */
-	.quad	0xBFD2214B5E61CFA6 /* A03 = -2.832821294498394371075e-01 */
-	.quad	0xBF286070011B61CE /* A00 = -1.859795307186510085994e-04 */
-	.quad	0x3FF00DE1D5E1627E /* A01 = +1.003389201612804537689e+00 */
-	.quad	0xBF9689D5F4163F59 /* A02 = -2.201017668045266231780e-02 */
-	.quad	0xBFD1E39A11C3B42C /* A03 = -2.795167134743816728104e-01 */
-	.quad	0xBF2D250B366A79E8 /* A00 = -2.223564326486314902259e-04 */
-	.quad	0x3FF010003E134001 /* A01 = +1.003906481248123094829e+00 */
-	.quad	0xBF990C9FF91F6F81 /* A02 = -2.446222265267250853271e-02 */
-	.quad	0xBFD1A41E80084CDC /* A03 = -2.756420374218586655246e-01 */
-	.quad	0xBF314DB5DDC2A30E /* A00 = -2.640313157465248123865e-04 */
-	.quad	0x3FF012577608921B /* A01 = +1.004477940624503018441e+00 */
-	.quad	0xBF9BB9626875B0C9 /* A02 = -2.707437288829409385849e-02 */
-	.quad	0xBFD162E80768A9D0 /* A03 = -2.716617653228725615122e-01 */
-	.quad	0xBF346A6133808864 /* A00 = -3.115165050094957730625e-04 */
-	.quad	0x3FF014EAAFCC88A3 /* A01 = +1.005106627192198898157e+00 */
-	.quad	0xBF9E90BEF9BF7419 /* A02 = -2.984903716411588595059e-02 */
-	.quad	0xBFD12006545F7FAD /* A03 = -2.675796340899932457269e-01 */
-	.quad	0xBF37F180DC3848EA /* A00 = -3.653468704395550778821e-04 */
-	.quad	0x3FF017BD19147861 /* A01 = +1.005795572250939295955e+00 */
-	.quad	0xBFA0C9A14C702E07 /* A02 = -3.278831537326359207851e-02 */
-	.quad	0xBFD0DB895B650092 /* A03 = -2.633994476818851682154e-01 */
-	.quad	0xBF3BEC6AAC6D7635 /* A00 = -4.260788377246944457107e-04 */
-	.quad	0x3FF01AD1D884E719 /* A01 = +1.006547780778822565040e+00 */
-	.quad	0xBFA260B2A1B1434A /* A02 = -3.589399551186163439542e-02 */
-	.quad	0xBFD09581529E93D6 /* A03 = -2.591250712233067465817e-01 */
-	.quad	0xBF4164E26167882B /* A00 = -5.308251737086202562063e-04 */
-	.quad	0x3FF01FEF14B62B81 /* A01 = +1.007796364693348545316e+00 */
-	.quad	0xBFA4EB014538AA42 /* A02 = -4.085544557559163403315e-02 */
-	.quad	0xBFD029D36FEAF41F /* A03 = -2.525528519580024222613e-01 */
-	.quad	0xBF46F6FFF4E53DC8 /* A00 = -7.008313930700277652464e-04 */
-	.quad	0x3FF027CBB51CBBA0 /* A01 = +1.009715754956893363214e+00 */
-	.quad	0xBFA89DEC9FEC112E /* A02 = -4.807986690687680864098e-02 */
-	.quad	0xBFCF2A99464D0DB4 /* A03 = -2.434875100390009317053e-01 */
-	.quad	0xBF4DCC9C4F66A4D9 /* A00 = -9.094012482836712945103e-04 */
-	.quad	0x3FF030E7CFCCD583 /* A01 = +1.011939822882909068014e+00 */
-	.quad	0xBFACAA3B95814081 /* A02 = -5.598627281199331645611e-02 */
-	.quad	0xBFCDF78F156BE7CF /* A03 = -2.341173987004467604844e-01 */
-	.quad	0xBF5308ED74E5C7A6 /* A00 = -1.161796466103906435435e-03 */
-	.quad	0x3FF03B5986412ECB /* A01 = +1.014489674026594512313e+00 */
-	.quad	0xBFB087EBA88DCC3F /* A02 = -6.457398285947223148806e-02 */
-	.quad	0xBFCCBB9BD134862F /* A03 = -2.244753619680052991736e-01 */
-	.quad	0xBF57FA23C00DF4B5 /* A00 = -1.463446533505758208674e-03 */
-	.quad	0x3FF0473558A1BCC0 /* A01 = +1.017384859292903342975e+00 */
-	.quad	0xBFB2E702BC6360EF /* A02 = -7.383744334527241048871e-02 */
-	.quad	0xBFCB77D546379288 /* A03 = -2.145945160729250122955e-01 */
-	.quad	0xBF5DD12971557F71 /* A00 = -1.819887610814388068450e-03 */
-	.quad	0x3FF0548DDF5000A8 /* A01 = +1.020643112482540360020e+00 */
-	.quad	0xBFB571B63DA186E1 /* A02 = -8.376635555898871710045e-02 */
-	.quad	0xBFCA2D5202605148 /* A03 = -2.045080672838912594358e-01 */
-	.quad	0xBF6252B1AD5D4F17 /* A00 = -2.236697221556737096709e-03 */
-	.quad	0x3FF063738A910BF7 /* A01 = +1.024280110622155737232e+00 */
-	.quad	0xBFB8270C8E6B601B /* A02 = -9.434584118878357184013e-02 */
-	.quad	0xBFC8DD27D950A07E /* A03 = -1.942491351230763441116e-01 */
-	.quad	0xBF66470C91730CFC /* A00 = -2.719425723258004842786e-03 */
-	.quad	0x3FF073F468FCF331 /* A01 = +1.028309259519300633556e+00 */
-	.quad	0xBFBB05C2952191E4 /* A02 = -1.055566419686964629854e-01 */
-	.quad	0xBFC7886A770DE2BD /* A03 = -1.838505822486435070662e-01 */
-	.quad	0xBF6AD114AC8E98EC /* A00 = -3.273525599485007861467e-03 */
-	.quad	0x3FF0861BF53E5226 /* A01 = +1.032741506559554434119e+00 */
-	.quad	0xBFBE0C4F9B461507 /* A02 = -1.173753503881763554650e-01 */
-	.quad	0xBFC6302A037CDE3A /* A03 = -1.733448521642786954722e-01 */
-	.quad	0xBF6FFBDE2A6C2AF8 /* A00 = -3.904279630096648551207e-03 */
-	.quad	0x3FF099F2EB8E7DA3 /* A01 = +1.037585182326304034106e+00 */
-	.quad	0xBFC09C74D192DDF0 /* A02 = -1.297746680554463516444e-01 */
-	.quad	0xBFC4D571D8E3079F /* A03 = -1.627638157861470424859e-01 */
-	.quad	0xBF72E8FDC0B952AA /* A00 = -4.616728994353872309042e-03 */
-	.quad	0x3FF0AF7F273C9533 /* A01 = +1.042845872181101141152e+00 */
-	.quad	0xBFC244C512736F10 /* A02 = -1.427236881344176033792e-01 */
-	.quad	0xBFC379474F58B902 /* A03 = -1.521386277613104298645e-01 */
-	.quad	0xBF762EABAF17395B /* A00 = -5.415602341101023557701e-03 */
-	.quad	0x3FF0C6C3886F63FB /* A01 = +1.048526318502125631582e+00 */
-	.quad	0xBFC3FDF9918EA12A /* A02 = -1.561881981590514389957e-01 */
-	.quad	0xBFC21CA89ECAB895 /* A03 = -1.414995932913753196036e-01 */
-	.quad	0xBF79D387CE5B2BAE /* A00 = -6.305246822828998107258e-03 */
-	.quad	0x3FF0DFBFE2346376 /* A01 = +1.054626353847394337748e+00 */
-	.quad	0xBFC5C6DA43602620 /* A02 = -1.701309994680721970894e-01 */
-	.quad	0xBFC0C08BD8DB6631 /* A03 = -1.308760460731704100557e-01 */
-	.quad	0xBF7DDBA8E8DA9060 /* A00 = -7.289562037531366334164e-03 */
-	.quad	0x3FF0FA70F0D1B464 /* A01 = +1.061142864894713433443e+00 */
-	.quad	0xBFC79E18D92BAA7C /* A02 = -1.845122394946264732241e-01 */
-	.quad	0xBFBECBBBF74C2669 /* A03 = -1.202962378266875381749e-01 */
-	.quad	0xBF81254E76EA25DA /* A00 = -8.371937755572145950511e-03 */
-	.quad	0x3FF116D05835EBD0 /* A01 = +1.068069786618014660462e+00 */
-	.quad	0xBFC982539E2ED224 /* A02 = -1.992897531869327609755e-01 */
-	.quad	0xBFBC1B043C350159 /* A03 = -1.097872397413132278254e-01 */
-	.quad	0xBF8391ACBA863403 /* A00 = -9.555196230190082448686e-03 */
-	.quad	0x3FF134D4AA477FE2 /* A01 = +1.075398125794884141015e+00 */
-	.quad	0xBFCB7218609FEAFB /* A02 = -2.144194099235717521079e-01 */
-	.quad	0xBFB970A16CB88329 /* A03 = -9.937485603633135211599e-02 */
-	.quad	0xBF87935088E48E8B /* A00 = -1.151144902957603431692e-02 */
-	.quad	0x3FF1649892AD7DD3 /* A01 = +1.087059567413110938716e+00 */
-	.quad	0xBFCE6971DDE75409 /* A02 = -2.375929196847723912089e-01 */
-	.quad	0xBFB58291E88CB251 /* A03 = -8.402358939628952472223e-02 */
-	.quad	0xBF8DB3A62C325325 /* A00 = -1.450280973794233242702e-02 */
-	.quad	0x3FF1A9C900C6DEEA /* A01 = +1.103951457056548068891e+00 */
-	.quad	0xBFD13DBC65B0E08E /* A02 = -2.693930619311765140012e-01 */
-	.quad	0xBFB06696F62696D1 /* A03 = -6.406539449252625362252e-02 */
-	.quad	0xBF92583699F2E27A /* A00 = -1.791463198307716858659e-02 */
-	.quad	0x3FF1F451B85AA9F0 /* A01 = +1.122148246892376022288e+00 */
-	.quad	0xBFD34FD5F8288180 /* A02 = -3.017477916164565954205e-01 */
-	.quad	0xBFA6FB692825B683 /* A03 = -4.488686194495718900788e-02 */
-	.quad	0xBF9641C26E673D6F /* A00 = -2.173522757385398448959e-02 */
-	.quad	0x3FF24364DA5E2B07 /* A01 = +1.141453602790251542487e+00 */
-	.quad	0xBFD564A5A5EF5890 /* A02 = -3.342680092295120530821e-01 */
-	.quad	0xBF9B43712011A982 /* A03 = -2.662445791467283467968e-02 */
-	.quad	0xBF9A901038EC2F39 /* A00 = -2.594018313816024226548e-02 */
-	.quad	0x3FF2961356DFFEBA /* A01 = +1.161639537196534011088e+00 */
-	.quad	0xBFD775EBB17198C7 /* A02 = -3.665723069046972759644e-01 */
-	.quad	0xBF833B1A926CD462 /* A03 = -9.390075295963199591975e-03 */
-	.quad	0xBF9F396A6A461B91 /* A00 = -3.049246095317987084727e-02 */
-	.quad	0x3FF2EB53BAEF534B /* A01 = +1.182452898229899629357e+00 */
-	.quad	0xBFD97DABF8AD8BBD /* A02 = -3.982953957076310058660e-01 */
-	.quad	0x3F7B8F6A3E0F8837 /* A03 = +6.728568086119371925713e-03 */
-	.quad	0xBFA21878590F8BAA /* A00 = -3.534294211546946951064e-02 */
-	.quad	0x3FF34209790236E1 /* A01 = +1.203622315111197105253e+00 */
-	.quad	0xBFDB764C0E71BECB /* A02 = -4.290952817018306997277e-01 */
-	.quad	0x3F962FE0C03F84C0 /* A03 = +2.166701482190513949888e-02 */
-	.quad	0xBFA4B36B9AD27ECC /* A00 = -4.043136849327097492868e-02 */
-	.quad	0x3FF3990C5B12FC16 /* A01 = +1.224865298994477935679e+00 */
-	.quad	0xBFDD5AABB0D01390 /* A02 = -4.586590983092770912322e-01 */
-	.quad	0x3FA21DAF5CA162DB /* A03 = +3.538272863142363083844e-02 */
-	.quad	0xBFA7645E4D7BF28B /* A00 = -4.568762489177399105378e-02 */
-	.quad	0x3FF3EF2FD51C0D9F /* A01 = +1.245895225962932562069e+00 */
-	.quad	0xBFDF26377E1B686E /* A02 = -4.867075664057044503963e-01 */
-	.quad	0x3FA8803E756EE812 /* A03 = +4.785342391501513914509e-02 */
-	.quad	0xBFAA210925C64413 /* A00 = -5.103329263796054643398e-02 */
-	.quad	0x3FF44349F897D8E7 /* A01 = +1.266427966181760345066e+00 */
-	.quad	0xBFE06A7B02C6D8E2 /* A02 = -5.129981092675530707226e-01 */
-	.quad	0x3FAE3F194734F5D0 /* A03 = +5.907515520309980505687e-02 */
-	.quad	0xBFACDE48F8A19BBB /* A00 = -5.638340029764018351832e-02 */
-	.quad	0x3FF49439D5466582 /* A01 = +1.286187966447272845727e+00 */
-	.quad	0xBFE131C7C1063DDC /* A02 = -5.373266954429101183166e-01 */
-	.quad	0x3FB1ADEEC36AD805 /* A03 = +6.906025191241844940482e-02 */
-	.quad	0xBFAF905D8F585680 /* A00 = -6.164829611604449866036e-02 */
-	.quad	0x3FF4E0ED1FD27F99 /* A01 = +1.304913639360142818546e+00 */
-	.quad	0xBFE1E7A859DC1D3D /* A02 = -5.595285182070380836095e-01 */
-	.quad	0x3FB3ED018E4642A1 /* A03 = +7.783517573831001679086e-02 */
-	.quad	0xBFB11595104160BA /* A00 = -6.673556944713512906198e-02 */
-	.quad	0x3FF528650340490B /* A01 = +1.322361958217302513319e+00 */
-	.quad	0xBFE28B14B40BC974 /* A02 = -5.794776455425521000109e-01 */
-	.quad	0x3FB5DF49F5BAF6D7 /* A03 = +8.543836831355676453281e-02 */
-	.quad	0xBFB2513A97344BA4 /* A00 = -7.155195418844911836587e-02 */
-	.quad	0x3FF569BA0DB5EE14 /* A01 = +1.338312200124055273420e+00 */
-	.quad	0xBFE31B53A8B67B20 /* A02 = -5.970857901737396389308e-01 */
-	.quad	0x3FB787F297BB0544 /* A03 = +9.191814617499455275507e-02 */
-	.quad	0xBFB37512E848FAFA /* A00 = -7.600515528700305112331e-02 */
-	.quad	0x3FF5A41F33B403C8 /* A01 = +1.352568819013173495591e+00 */
-	.quad	0xBFE397F6EA9A58A5 /* A02 = -6.123003561103997904880e-01 */
-	.quad	0x3FB8EAA9FF25CA06 /* A03 = +9.733068923177520814782e-02 */
-	.quad	0xBFB47B3E603AFC5D /* A00 = -8.000554894805263217439e-02 */
-	.quad	0x3FF5D6E3EDE40487 /* A01 = +1.364963464031718975988e+00 */
-	.quad	0xBFE400D5BCA6D631 /* A02 = -6.251019177058819709103e-01 */
-	.quad	0x3FBA0B830ED567FE /* A03 = +1.017381583418739132707e-01 */
-	.quad	0xBFB5BBFE8AC90496 /* A00 = -8.489981544791400103200e-02 */
-	.quad	0x3FF612BA70107E95 /* A01 = +1.379572332145390989311e+00 */
-	.quad	0xBFE477EAF1FA7693 /* A02 = -6.396383978023599814478e-01 */
-	.quad	0x3FBB4784B7C08A95 /* A03 = +1.065600346196709652391e-01 */
-	.quad	0xBFB6D5D940743939 /* A00 = -8.920057128509463473254e-02 */
-	.quad	0x3FF644A8748F70CE /* A01 = +1.391762214006166953340e+00 */
-	.quad	0xBFE4D646AB07EA37 /* A02 = -6.511567440459832267763e-01 */
-	.quad	0x3FBC354F4E1D5292 /* A03 = +1.101884427747086558913e-01 */
-	.quad	0xBFB7223D19E4F3D1 /* A00 = -9.036619074045339206069e-02 */
-	.quad	0x3FF6518FEB42B7FA /* A01 = +1.394912642466350494175e+00 */
-	.quad	0xBFE4ED86CB87498C /* A02 = -6.539949393430091184598e-01 */
-	.quad	0x3FBC6D29F28CCA9B /* A03 = +1.110407082713131127205e-01 */
-	.quad	0xBFB6878652FF6312 /* A00 = -8.800544287022329936754e-02 */
-	.quad	0x3FF63948C302D040 /* A01 = +1.388985406648330922508e+00 */
-	.quad	0xBFE4C4E2E7904E17 /* A02 = -6.490339777687407218920e-01 */
-	.quad	0x3FBC127356CA1ABE /* A03 = +1.096565329445224612481e-01 */
-	.quad	0xBFB4F5D18B0C91D6 /* A00 = -8.187589306596207427980e-02 */
-	.quad	0x3FF5FD27EB7DD0B8 /* A01 = +1.374305648697413673176e+00 */
-	.quad	0xBFE464E01A2B2FC6 /* A02 = -6.373138915164353601739e-01 */
-	.quad	0x3FBB460547674A30 /* A03 = +1.065371798825160976065e-01 */
-	.quad	0xBFB26642FA16A685 /* A00 = -7.187288861919156890412e-02 */
-	.quad	0x3FF59F9BEDE1C95A /* A01 = +1.351467065073470141812e+00 */
-	.quad	0xBFE3D67920C8FBEA /* A02 = -6.199308052381387046381e-01 */
-	.quad	0x3FBA24F6A8D3CBC1 /* A03 = +1.021265184570401413078e-01 */
-	.quad	0xBFADB5294794F097 /* A00 = -5.802277563859197656582e-02 */
-	.quad	0x3FF523EA7B9CF453 /* A01 = +1.321268542159732772845e+00 */
-	.quad	0xBFE322A8B55E35DB /* A02 = -5.979808370918208160205e-01 */
-	.quad	0x3FB8C8673B1B3E37 /* A03 = +9.680791085269722928697e-02 */
-	.quad	0xBFA4B7D661965C6A /* A00 = -4.046506825687219699450e-02 */
-	.quad	0x3FF48DE3E2CE3122 /* A01 = +1.284641157110919085227e+00 */
-	.quad	0xBFE251FED1A7F445 /* A02 = -5.725092024655472622285e-01 */
-	.quad	0x3FB745699FCABDB9 /* A03 = +9.090290213747821701507e-02 */
-	.quad	0xBF93E60456E4EE1D /* A00 = -1.943213253365004902773e-02 */
-	.quad	0x3FF3E1A14E628A59 /* A01 = +1.242585474196536532432e+00 */
-	.quad	0xBFE16C5AB660E876 /* A02 = -5.444768488007543094653e-01 */
-	.quad	0x3FB5AD33AA8C188F /* A03 = +8.467410005332197397987e-02 */
-	.quad	0x3F738C17C47C7961 /* A00 = +4.772274820224659853951e-03 */
-	.quad	0x3FF3234DDE3BD146 /* A01 = +1.196119182682268355933e+00 */
-	.quad	0xBFE078C0D77A9D3B /* A02 = -5.147403915952176722826e-01 */
-	.quad	0x3FB40D74B3E276B8 /* A03 = +7.833032027925923568290e-02 */
-	.quad	0x3FA0474BECC689C7 /* A00 = +3.179394975019849550746e-02 */
-	.quad	0x3FF256FB4FA7D18A /* A01 = +1.146235762743432307076e+00 */
-	.quad	0xBFDEFA8E3FB285E2 /* A02 = -4.840427038235174395098e-01 */
-	.quad	0x3FB270C007493D59 /* A03 = +7.203293016322244446403e-02 */
-	.quad	0x3FAF5BD51E479BDC /* A00 = +6.124750132203590768931e-02 */
-	.quad	0x3FF18081D0B53BC5 /* A01 = +1.093873801484492647162e+00 */
-	.quad	0xBFDCFE2439BD0C03 /* A02 = -4.530115665294831006626e-01 */
-	.quad	0x3FB0DEFE5A45AFDD /* A03 = +6.590261176978580437424e-02 */
-	.quad	0x3FB7BD5D2806EA26 /* A00 = +9.273321368429118805032e-02 */
-	.quad	0x3FF0A369E35B4440 /* A01 = +1.039895904647224256223e+00 */
-	.quad	0xBFDB04BC5C9951E7 /* A02 = -4.221640495573226181669e-01 */
-	.quad	0x3FAEBBBAA9D6DEEF /* A03 = +6.002600978120919278380e-02 */
-	.quad	0x3FC01BE411098DBC /* A00 = +1.258511622610124502941e-01 */
-	.quad	0x3FEF85BDABC031C1 /* A01 = +9.850757936961188621083e-01 */
-	.quad	0xBFD91521375097C2 /* A02 = -3.919146576102968682065e-01 */
-	.quad	0x3FABE26F0086D982 /* A03 = +5.446192628317005068883e-02 */
-	.quad	0x3FC481D7FF5776B9 /* A00 = +1.602125164781023347604e-01 */
-	.quad	0x3FEDC3506C1E7218 /* A01 = +9.300920592973538347792e-01 */
-	.quad	0xBFD7349A88DA7D4F /* A02 = -3.625856720409119104964e-01 */
-	.quad	0x3FA936E2DFF8E2AE /* A03 = +4.924687370334389358018e-02 */
-	.quad	0x3FC90471F96FA27A /* A00 = +1.954481571149420671141e-01 */
-	.quad	0x3FEC0451601987A2 /* A01 = +8.755270840595026360376e-01 */
-	.quad	0xBFD5671CD4B898DC /* A02 = -3.344184949259110251063e-01 */
-	.quad	0x3FA6BB9594603B67 /* A03 = +4.439990459660841243261e-02 */
-	.quad	0x3FCFD8ADB9ED944C /* A00 = +2.488000066615846384011e-01 */
-	.quad	0x3FE978C073F6809A /* A01 = +7.959902062321078108909e-01 */
-	.quad	0xBFD2DF7E00BCD5A9 /* A02 = -2.948908812716931060471e-01 */
-	.quad	0x3FA3614033D490B2 /* A03 = +3.785133965200894456959e-02 */
-	.quad	0x3FD4846A12AFE5A0 /* A00 = +3.205819303981005674586e-01 */
-	.quad	0x3FE63A1147D40472 /* A01 = +6.945883181471244061100e-01 */
-	.quad	0xBFCFA2268AD34450 /* A02 = -2.471359422548027318101e-01 */
-	.quad	0x3F9F150201D9FFE0 /* A03 = +3.035357605267552383310e-02 */
-	.quad	0x3FD9018641F82BEB /* A00 = +3.907180446846598154131e-01 */
-	.quad	0x3FE33B7C220FFBDC /* A01 = +6.010113396913498995389e-01 */
-	.quad	0xBFCA4E4187E29C86 /* A02 = -2.055131829740483584423e-01 */
-	.quad	0x3F98C30CED19F8F4 /* A03 = +2.418155858185229434287e-02 */
-	.quad	0x3FDD4B8255BEB078 /* A00 = +4.577337109901757905561e-01 */
-	.quad	0x3FE0858B19D3A49B /* A01 = +5.163016800335243905451e-01 */
-	.quad	0xBFC5BC929EACE564 /* A02 = -1.698172831327539045176e-01 */
-	.quad	0x3F93A083CE57DE2B /* A03 = +1.916700312537337677621e-02 */
-	.quad	0x3FE0A8E5E039295C /* A00 = +5.206174258576470315063e-01 */
-	.quad	0x3FDC35E1234583FE /* A01 = +4.407885403107342225937e-01 */
-	.quad	0xBFC1DE034E31AEB9 /* A02 = -1.395877963835710222629e-01 */
-	.quad	0x3F8EFDEBB3471BDC /* A03 = +1.513275280821162888101e-02 */
-	.quad	0x3FE2851B603CB2A5 /* A00 = +5.787484054213406503564e-01 */
-	.quad	0x3FD7F4A44ABBB286 /* A01 = +3.743067483726821853551e-01 */
-	.quad	0xBFBD3EEB67087DE7 /* A02 = -1.142413260026767657385e-01 */
-	.quad	0x3F8864F38329E8BD /* A03 = +1.191129917173260922836e-02 */
-	.quad	0x3FE437DBE3C34AC1 /* A00 = +6.318187187665317283702e-01 */
-	.quad	0x3FD43F6F789441B5 /* A01 = +3.163717916040938438194e-01 */
-	.quad	0xBFB7D92E7901B9A4 /* A02 = -9.315767721429907277653e-02 */
-	.quad	0x3F8327ED342308E1 /* A03 = +9.353497651663324544136e-03 */
-	.quad	0x3FE5C0977766D55C /* A00 = +6.797597248138731451661e-01 */
-	.quad	0x3FD10B42A764D8F9 /* A01 = +2.663122782427219115142e-01 */
-	.quad	0xBFB3633351D3D70F /* A02 = -7.573242900602060456716e-02 */
-	.quad	0x3F7E079E30FF899C /* A03 = +7.331483779099558922843e-03 */
-	.quad	0x3FE7202CE08A88C4 /* A00 = +7.226776490754436288455e-01 */
-	.quad	0x3FCC973EB5662B01 /* A01 = +2.233656297433626314319e-01 */
-	.quad	0xBFAF70A455F9920B /* A02 = -6.140626477716545211782e-02 */
-	.quad	0x3F77812411CE99B6 /* A03 = +5.738392731393584730859e-03 */
-	.quad	0x3FE85879424095B1 /* A00 = +7.608000082006382003286e-01 */
-	.quad	0x3FC7E73BD1674D84 /* A01 = +1.867441914060742336190e-01 */
-	.quad	0xBFA96F84E4BF333B /* A02 = -4.967894832916504993525e-02 */
-	.quad	0x3F72606DDCA6E117 /* A03 = +4.486493251924870105662e-03 */
-	.quad	0x3FE96BFE4957F4DD /* A00 = +7.944327766887472330737e-01 */
-	.quad	0x3FC3ED4780D25478 /* A01 = +1.556786898624158421711e-01 */
-	.quad	0xBFA489C5F9A56B58 /* A02 = -4.011362717093075458408e-02 */
-	.quad	0x3F6CB5DC17E9AD2A /* A03 = +3.504686231556104931972e-03 */
-	.quad	0x3FEA5D9CB2F41234 /* A00 = +8.239272589858672724006e-01 */
-	.quad	0x3FC091A758374DCF /* A01 = +1.294449978582705440555e-01 */
-	.quad	0xBFA08E436D4B5CE0 /* A02 = -3.233538350257858517978e-02 */
-	.quad	0x3F666997AD53E6B7 /* A03 = +2.735897297154145629133e-03 */
-	.quad	0x3FEB3060342CB850 /* A00 = +8.496552485501158713532e-01 */
-	.quad	0x3FBB7D30BBC7DC1B /* A01 = +1.073790033768634993860e-01 */
-	.quad	0xBF9AA6BA3443D9E3 /* A02 = -2.602663940430173170060e-02 */
-	.quad	0x3F617CA764B7850B /* A03 = +2.134634914668814050648e-03 */
-	.quad	0x3FEBE759A6A0C7B8 /* A00 = +8.719909910635044170135e-01 */
-	.quad	0x3FB6C10DE6A703FF /* A01 = +8.888327485239243264115e-02 */
-	.quad	0xBF956C566D8BE1F6 /* A02 = -2.092108768099084498138e-02 */
-	.quad	0x3F5B46D1A4A59CF8 /* A03 = +1.664833764687232917079e-03 */
-	.quad	0x3FEC858494887A04 /* A00 = +8.912985707318630268503e-01 */
-	.quad	0x3FB2CC31F543394D /* A01 = +7.342827070099140762682e-02 */
-	.quad	0xBF9133477FF69137 /* A02 = -1.679717749142747504343e-02 */
-	.quad	0x3F5544482FBB4DA5 /* A03 = +1.298017973501022466823e-03 */
-	.quad	0x3FED0DB59D0E32E9 /* A00 = +9.079235141267335551518e-01 */
-	.quad	0x3FAF006BAFFC6EF4 /* A01 = +6.055008433597022787787e-02 */
-	.quad	0xBF8B97146FA2B97A /* A02 = -1.347175565419144252499e-02 */
-	.quad	0x3F5093B01F4CDC69 /* A03 = +1.011774057770665211434e-03 */
-	.quad	0x3FEDB487C3EC457C /* A00 = +9.282873942012623835751e-01 */
-	.quad	0x3FA7390C09D0BD1D /* A01 = +4.535710925881118044112e-02 */
-	.quad	0xBF83D9F7C3181106 /* A02 = -9.693084374710735778846e-03 */
-	.quad	0x3F46E34A0A3C0E64 /* A03 = +6.984817050299072134500e-04 */
-	.quad	0x3FEE5FFCB4E6EB00 /* A00 = +9.492171796076434020506e-01 */
-	.quad	0x3F9F4913ED00AADF /* A01 = +3.055220731782070861526e-02 */
-	.quad	0xBF79670BD0E59B5C /* A02 = -6.201788097633133961528e-03 */
-	.quad	0x3F3BC998EBCAF96D /* A03 = +4.240034429975534616304e-04 */
-	.quad	0x3FEEDBA41E9542FE /* A00 = +9.643116566968215064293e-01 */
-	.quad	0x3F94F5DD18D9C24D /* A01 = +2.046914543319848858727e-02 */
-	.quad	0xBF7034896AA122B9 /* A02 = -3.956352980886528904192e-03 */
-	.quad	0x3F30DCCB47810B39 /* A03 = +2.573009765038273091199e-04 */
-	.quad	0x3FEF33F2882520ED /* A00 = +9.750912341196716903724e-01 */
-	.quad	0x3F8BF37F2CF553FF /* A01 = +1.364802699996836392315e-02 */
-	.quad	0xBF649F6F05A69619 /* A02 = -2.517430152880317534986e-03 */
-	.quad	0x3F247623C950AAC9 /* A03 = +1.561087307505231250044e-04 */
-	.quad	0x3FEF727757751741 /* A00 = +9.827229221489021115943e-01 */
-	.quad	0x3F828E67912C4400 /* A01 = +9.060677640748693306705e-03 */
-	.quad	0xBF5A2F51A806CC2C /* A02 = -1.598195784123355826789e-03 */
-	.quad	0x3F18D35D7687E613 /* A03 = +9.470231965016282719549e-05 */
-	.quad	0x3FEF9E6325C5942A /* A00 = +9.880843866091073568469e-01 */
-	.quad	0x3F788AB117618F76 /* A01 = +5.991641772286606867914e-03 */
-	.quad	0xBF5096EAB0B1EA89 /* A02 = -1.012543859160305046233e-03 */
-	.quad	0x3F0E1E50EC4435AB /* A03 = +5.744633156910412119652e-05 */
-	.quad	0x3FEFBD0784049369 /* A00 = +9.918248728250605994461e-01 */
-	.quad	0x3F702BBD8294035F /* A01 = +3.947963975634432264028e-03 */
-	.quad	0xBF44FB55E0F00593 /* A02 = -6.403130845457509273330e-04 */
-	.quad	0x3F0244DCD723230A /* A03 = +3.484534217219031730379e-05 */
-	.quad	0x3FEFD245E2366A43 /* A00 = +9.944180887426415926811e-01 */
-	.quad	0x3F653D82EC088433 /* A01 = +2.592807490387838333795e-03 */
-	.quad	0xBF3A7DF75E013CB8 /* A02 = -4.042366908878036561859e-04 */
-	.quad	0x3EF6298E69F991CD /* A03 = +2.113564425911141559972e-05 */
-	.quad	0x3FEFE0EAA508BC69 /* A00 = +9.962056372950317539861e-01 */
-	.quad	0x3F5BD0771AF3FDDA /* A01 = +1.697651208644282514598e-03 */
-	.quad	0xBF30B2E1254DE571 /* A02 = -2.548026725928887099328e-04 */
-	.quad	0x3EEAE28B70EC0256 /* A03 = +1.281973848454955042307e-05 */
-	.quad	0x3FEFEAF5303D7F96 /* A00 = +9.974313680831865536192e-01 */
-	.quad	0x3F5229111365657E /* A01 = +1.108423877289460134782e-03 */
-	.quad	0xBF250572D04DFE66 /* A02 = -1.603796628408704519168e-04 */
-	.quad	0x3EE04E89BB57C981 /* A03 = +7.775682983689149966743e-06 */
-	.quad	0x3FEFF1CF52F1CF44 /* A00 = +9.982678051005469122003e-01 */
-	.quad	0x3F47A71316147CEB /* A01 = +7.218211359577819110842e-04 */
-	.quad	0xBF1A6D7604055719 /* A02 = -1.008132248946049582547e-04 */
-	.quad	0x3ED3C8047586A85C /* A03 = +4.716233739913014633626e-06 */
-	.quad	0x3FEFF6770369EF69 /* A00 = +9.988360468555416149528e-01 */
-	.quad	0x3F3EBB261180FBF0 /* A01 = +4.689186039321105101130e-04 */
-	.quad	0xBF1097754FE19D7F /* A02 = -6.329206004950480057066e-05 */
-	.quad	0x3EC7FEFF83BCA0A7 /* A03 = +2.860556404988488738366e-06 */
-	.quad	0x3FEFF99D42371AC4 /* A00 = +9.992204945818561334647e-01 */
-	.quad	0x3F33EB2AEC271F59 /* A01 = +3.039340773764907474054e-04 */
-	.quad	0xBF04CF18E0FC0D79 /* A02 = -3.968996690952969588805e-05 */
-	.quad	0x3EBD1BDBD6019BE9 /* A03 = +1.735021065507727833886e-06 */
-	.quad	0x3FEFFBBCA32B0D91 /* A00 = +9.994795977476532700123e-01 */
-	.quad	0x3F29C41E1615110A /* A01 = +1.965796209707565346710e-04 */
-	.quad	0xBEFA11F93D9DCB5A /* A02 = -2.486248909101414873235e-05 */
-	.quad	0x3EB1A7CA4546F7A7 /* A03 = +1.052345642723709228769e-06 */
-	.quad	0x3FEFFD298B8E8DE2 /* A00 = +9.996535993308806045121e-01 */
-	.quad	0x3F20A1C42D523C5B /* A01 = +1.268913244172078754520e-04 */
-	.quad	0xBEF0507A364AFAE4 /* A02 = -1.555859070622834605755e-05 */
-	.quad	0x3EA56ACA17E7CDF4 /* A03 = +6.382806956848098872313e-07 */
-	.quad	0x3FEFFE1DC82BA5A3 /* A00 = +9.997700604991915929176e-01 */
-	.quad	0x3F156E73B90F1769 /* A01 = +8.175450626798714452801e-05 */
-	.quad	0xBEE4663579D0A09F /* A02 = -9.727122057226747625365e-06 */
-	.quad	0x3E99FAF6FEC5D4C1 /* A03 = +3.871371052824002996020e-07 */
-	.quad	0x3FEFFEF8D0BB5E81 /* A00 = +9.998745037837154514548e-01 */
-	.quad	0x3F06686DA18D39C3 /* A01 = +4.273972098777251447726e-05 */
-	.quad	0xBED46BC298073E90 /* A02 = -4.868731025855742842491e-06 */
-	.quad	0x3E88E42286B9D0FD /* A03 = +1.854535328530838170114e-07 */
-	.quad	0x3FEFFF8DBC68DDC7 /* A00 = +9.999455146670975791423e-01 */
-	.quad	0x3EF26B2953A80AF0 /* A01 = +1.756534514108903368909e-05 */
-	.quad	0xBEBFC4472D580F83 /* A02 = -1.893443529411295465239e-06 */
-	.quad	0x3E72505B4553D19F /* A03 = +6.822456673547912277047e-08 */
-	.quad	0x3FEFFFCED1276609 /* A00 = +9.999765477215883935358e-01 */
-	.quad	0x3EDE1A94C7CC58F5 /* A01 = +7.177313020153979672606e-06 */
-	.quad	0xBEA8A2C988744E57 /* A02 = -7.342066660497443762363e-07 */
-	.quad	0x3E5AF30036BBBAF4 /* A03 = +2.509841882843541084885e-08 */
-	.quad	0x3FEFFFEAFE70FCFC /* A00 = +9.999899835164849370983e-01 */
-	.quad	0x3EC879175E3549F5 /* A01 = +2.917410471128503564412e-06 */
-	.quad	0xBE930E36677D1813 /* A02 = -2.839493400307523115929e-07 */
-	.quad	0x3E43D4005B42D48F /* A03 = +9.233192745401904898013e-09 */
-	.quad	0x3ff0000000000000
-	.quad	0x0000000000000000
-	.quad	0x0000000000000000
-	.quad	0x0000000000000000
-	.align	16
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSignMask */
-	.align	16
-	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _sAbsMask */
-	.align	16
-	.long	0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000 /* _iExpMantMask */
-	.align	16
-	.long	0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000 /* _iExpMask */
-	.align	16
-	.long	0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000 /* _iMinIdxOfsMask */
-	.align	16
-	.long	0x04280000, 0x04280000, 0x04280000, 0x04280000 /* _iMaxIdxMask */
-	.align	16
-	.type	__svml_stanh_data_internal, @object
-	.size	__svml_stanh_data_internal, .-__svml_stanh_data_internal
+	/* Cold case. edx has 1s where there was a special value that
+	   needs to be handled by a tanhf call. Optimize for code size
+	   more so than speed here. */
+L(SPECIAL_VALUES_BRANCH):
+	# LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm12
+	/* Entry rsp is 16-byte aligned + 8 (return address). Subtract 56
+       so rsp is 16-byte aligned at the `call tanhf` below. */
+	subq	$56, %rsp
+	cfi_def_cfa_offset(64)
+	movups	%xmm0, 24(%rsp)
+	movups	%xmm12, 40(%rsp)
+
+	/* Use rbx/rbp for callee save registers as they get short
+       encoding for many instructions (as compared with r12/r13). */
+	movq	%rbx, (%rsp)
+	cfi_offset(rbx, -64)
+	movq	%rbp, 8(%rsp)
+	cfi_offset(rbp, -56)
+	/* edx has 1s where there was a special value that needs to be handled
+	   by a tanhf call.  */
+	movl	%edx, %ebx
+L(SPECIAL_VALUES_LOOP):
+	# LOE rbx rbp r12 r13 r14 r15
+	/* use rbp as index for special value that is saved across calls to
+	   tanhf. We technically don't need a callee save register here as offset
+	   to rsp is always [0, 12] so we can restore rsp by realigning to 64.
+	   Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
+	   in the loop.  */
+	xorl	%ebp, %ebp
+	bsfl	%ebx, %ebp
+
+	/* Scalar math function call to process special input.  */
+	movss	40(%rsp, %rbp, 4), %xmm0
+	call	tanhf@PLT
+	/* No good way to avoid the store-forwarding fault this will cause on
+	   return. `lfence` avoids the SF fault but at greater cost as it
+	   serializes stack/callee save restoration.  */
+	movss	%xmm0, 24(%rsp, %rbp, 4)
+
+	leal	-1(%rbx), %eax
+	andl	%eax, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+	# LOE r12 r13 r14 r15
+	/* All results have been written to 24(%rsp).  */
+	movups	24(%rsp), %xmm0
+	movq	(%rsp), %rbx
+	cfi_restore(rbx)
+	movq	8(%rsp), %rbp
+	cfi_restore(rbp)
+	addq	$56, %rsp
+	cfi_def_cfa_offset(8)
+	ret
+END(_ZGVbN4v_tanhf_sse4)
-- 
2.34.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v1 7/7] x86: Optimize svml_s_tanhf4_core_sse4.S
  2022-06-08  3:07   ` H.J. Lu
@ 2022-06-09  0:06     ` Noah Goldstein
  0 siblings, 0 replies; 48+ messages in thread
From: Noah Goldstein @ 2022-06-09  0:06 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Kolesov, Andrey

On Tue, Jun 7, 2022 at 8:07 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Jun 7, 2022 at 1:07 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Optimizations are:
> >     1. Reduce code size (-112 bytes).
> >     2. Remove redundant move instructions.
> >     3. Slightly improve instruction selection/scheduling where
> >        possible.
> >     4. Prefer registers which get short instruction encoding.
> >     5. Reduce rodata size (-4k+ rodata is shared with avx2).
> >
> > Result is roughly a 15-16% speedup:
> >
> >        Function, New Time, Old Time, New / Old
> >  _ZGVbN4v_tanhf,    3.158,    3.749,     0.842
> > ---
> >  .../fpu/multiarch/svml_s_tanhf4_core_sse4.S   | 864 +++---------------
> >  1 file changed, 137 insertions(+), 727 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
> > index 532ebbac65..54580ebd79 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
> > @@ -70,761 +70,171 @@
> >   *
> >   */
> >
> > -/* Offsets for data table __svml_stanh_data_internal
> > - */
> > -#define _dbP                           0
> > -#define _sSignMask                     4288
> > -#define _sAbsMask                      4304
> > -#define _iExpMantMask                  4320
> > -#define _iExpMask                      4336
> > -#define _iMinIdxOfsMask                        4352
> > -#define _iMaxIdxMask                   4368
> >
> >  #include <sysdep.h>
> >
> > +/* tanhf data tables for avx2 and sse4 implementations defined here.
> > + */
> > +#define ONLY_DECL_OFFSET
> > +#include "svml_s_tanhf_rodata.S"
> > +
> >         .section .text.sse4, "ax", @progbits
> >  ENTRY(_ZGVbN4v_tanhf_sse4)
> > -       subq    $72, %rsp
> > -       cfi_def_cfa_offset(80)
> > -       movaps  %xmm0, %xmm5
> > +       /* Save copy of input in xmm12.  */
> > +       movaps  %xmm0, %xmm12
> >
> >         /* Here huge arguments, INF and NaNs are filtered out to callout. */
> > -       movdqu  _iExpMantMask+__svml_stanh_data_internal(%rip), %xmm9
> > -       lea     _dbP+16+__svml_stanh_data_internal(%rip), %r8
> > -       pand    %xmm5, %xmm9
> > +       movdqu  TANHF_DATA(_iExpMantMask)(%rip), %xmm3
> > +       pand    %xmm0, %xmm3
> >
> > -       /* if VMIN, VMAX is defined for I type */
> > +
> > +       /* Selection of arguments between [0, 0x04280000] into xmm3.  */
> >         pxor    %xmm7, %xmm7
> > -       movdqa  %xmm9, %xmm6
> > -       psubd   _iMinIdxOfsMask+__svml_stanh_data_internal(%rip), %xmm9
> > +       /* Save xmm3 for special values check at end.  */
> > +       movdqa  %xmm3, %xmm8
> > +       psubd   TANHF_DATA(_iMinIdxOfsMask)(%rip), %xmm3
> > +       pmaxsd  %xmm7, %xmm3
> > +       pminsd  TANHF_DATA(_iMaxIdxMask)(%rip), %xmm3
> > +       psrld   $14, %xmm3
> > +
> > +       movq    %xmm3, %rcx
> > +       movl    %ecx, %edx
> > +       shrq    $32, %rcx
> > +
> > +       pshufd  $0x0e, %xmm3, %xmm3
> > +       movq    %xmm3, %rdi
> > +       movl    %edi, %esi
> > +       shrq    $32, %rdi
> > +
> > +       movaps  TANHF_DATA(_sAbsMask)(%rip), %xmm1
> > +       andps   %xmm1, %xmm0
> > +
> > +       leaq    TANHF_DATA(_lookupTable)(%rip), %rax
> > +       movups  (%rdx, %rax), %xmm2
> > +       movups  (%rcx, %rax), %xmm6
> >
> >         /*
> >          *  small table specific variables *
> >          *  Constant loading
> >          */
> > -       movdqu  _iMaxIdxMask+__svml_stanh_data_internal(%rip), %xmm10
> > -       movdqa  %xmm9, %xmm11
> > -       movdqa  %xmm9, %xmm8
> > -       pcmpgtd %xmm10, %xmm11
> > -       pcmpgtd %xmm7, %xmm8
> > -       movdqa  %xmm11, %xmm14
> > -       pand    %xmm8, %xmm9
> > -       andps   %xmm11, %xmm10
> > -       andnps  %xmm9, %xmm14
> > -       orps    %xmm10, %xmm14
> > -       psrld   $14, %xmm14
> > -       movd    %xmm14, %edx
> > -       pshufd  $1, %xmm14, %xmm12
> > -       pshufd  $2, %xmm14, %xmm13
> > -       movd    %xmm12, %ecx
> > -       pshufd  $3, %xmm14, %xmm15
> > -       movups  _sAbsMask+__svml_stanh_data_internal(%rip), %xmm3
> > -       movslq  %edx, %rdx
> > -       andps   %xmm5, %xmm3
> > -       movslq  %ecx, %rcx
> > -       pcmpgtd _iExpMask+__svml_stanh_data_internal(%rip), %xmm6
> > -       movd    %xmm13, %esi
> > -       movups  -16(%rdx, %r8), %xmm2
> > -       movaps  %xmm2, %xmm0
> > -       movd    %xmm15, %edi
> > -       movmskps %xmm6, %eax
> > -       movups  -16(%rcx, %r8), %xmm6
> > -       unpcklpd %xmm6, %xmm0
> > +       movaps  %xmm2, %xmm4
> > +       movlhps %xmm6, %xmm4
> >         unpckhpd %xmm6, %xmm2
> > -       cvtps2pd %xmm3, %xmm6
> > -       movhlps %xmm3, %xmm3
> > -       cvtps2pd %xmm3, %xmm3
> > -       movslq  %esi, %rsi
> > -       movslq  %edi, %rdi
> > -       movups  (%rcx, %r8), %xmm8
> > -       movups  (%rdx, %r8), %xmm12
> > -       movups  (%rsi, %r8), %xmm13
> > -       movaps  %xmm12, %xmm10
> > -       movups  (%rdi, %r8), %xmm9
> > +
> > +       cvtps2pd %xmm0, %xmm6
> > +       movhlps %xmm0, %xmm0
> > +       cvtps2pd %xmm0, %xmm0
> > +
> > +       movups  16(%rdx, %rax), %xmm5
> > +       movups  16(%rsi, %rax), %xmm13
> > +
> > +       movaps  %xmm5, %xmm10
> >         movaps  %xmm13, %xmm11
> > -       unpckhpd %xmm8, %xmm12
> > -       unpckhpd %xmm9, %xmm13
> > -       mulpd   %xmm6, %xmm12
> > -       mulpd   %xmm3, %xmm13
> > -       unpcklpd %xmm8, %xmm10
> > -       unpcklpd %xmm9, %xmm11
> > -       addpd   %xmm10, %xmm12
> > +
> > +       movups  16(%rcx, %rax), %xmm7
> > +       movups  16(%rdi, %rax), %xmm3
> > +
> > +       unpckhpd %xmm7, %xmm5
> > +       unpckhpd %xmm3, %xmm13
> > +
> > +       mulpd   %xmm6, %xmm5
> > +       mulpd   %xmm0, %xmm13
> > +
> > +       movlhps %xmm7, %xmm10
> > +       movlhps %xmm3, %xmm11
> > +
> > +       addpd   %xmm10, %xmm5
> >         addpd   %xmm11, %xmm13
> > -       mulpd   %xmm6, %xmm12
> > -       mulpd   %xmm3, %xmm13
> > -       addpd   %xmm2, %xmm12
> > -       movups  -16(%rsi, %r8), %xmm1
> > -       movups  -16(%rdi, %r8), %xmm7
> > -       movaps  %xmm1, %xmm14
> > -       unpckhpd %xmm7, %xmm1
> > -       addpd   %xmm1, %xmm13
> > -       mulpd   %xmm12, %xmm6
> > -       mulpd   %xmm13, %xmm3
> > -       addpd   %xmm0, %xmm6
> > -       unpcklpd %xmm7, %xmm14
> > -       addpd   %xmm14, %xmm3
> > -       cvtpd2ps %xmm6, %xmm0
> > -       cvtpd2ps %xmm3, %xmm1
> > -       movups  _sSignMask+__svml_stanh_data_internal(%rip), %xmm4
> > -       movlhps %xmm1, %xmm0
> > -       andps   %xmm5, %xmm4
> > -       orps    %xmm4, %xmm0
> > -       testl   %eax, %eax
> >
> > -       /* Go to special inputs processing branch */
> > -       jne     L(SPECIAL_VALUES_BRANCH)
> > -       # LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm5
> > +       mulpd   %xmm6, %xmm5
> > +       mulpd   %xmm0, %xmm13
> >
> > -       /* Restore registers
> > -        * and exit the function
> > -        */
> > +       addpd   %xmm2, %xmm5
> >
> > -L(EXIT):
> > -       addq    $72, %rsp
> > -       cfi_def_cfa_offset(8)
> > -       ret
> > -       cfi_def_cfa_offset(80)
> > +       movups  (%rsi, %rax), %xmm2
> > +       movups  (%rdi, %rax), %xmm7
> >
> > -       /* Branch to process
> > -        * special inputs
> > -        */
> > +       movaps  %xmm2, %xmm3
> >
> > -L(SPECIAL_VALUES_BRANCH):
> > -       movups  %xmm5, 32(%rsp)
> > -       movups  %xmm0, 48(%rsp)
> > -       # LOE rbx rbp r12 r13 r14 r15 eax
> > -
> > -       xorl    %edx, %edx
> > -       movq    %r12, 16(%rsp)
> > -       cfi_offset(12, -64)
> > -       movl    %edx, %r12d
> > -       movq    %r13, 8(%rsp)
> > -       cfi_offset(13, -72)
> > -       movl    %eax, %r13d
> > -       movq    %r14, (%rsp)
> > -       cfi_offset(14, -80)
> > -       # LOE rbx rbp r15 r12d r13d
> > -
> > -       /* Range mask
> > -        * bits check
> > -        */
> > +       unpckhpd %xmm7, %xmm2
> > +       movlhps %xmm7, %xmm3
> >
> > -L(RANGEMASK_CHECK):
> > -       btl     %r12d, %r13d
> > +       addpd   %xmm13, %xmm2
> >
> > -       /* Call scalar math function */
> > -       jc      L(SCALAR_MATH_CALL)
> > -       # LOE rbx rbp r15 r12d r13d
> > +       mulpd   %xmm5, %xmm6
> > +       addpd   %xmm4, %xmm6
> >
> > -       /* Special inputs
> > -        * processing loop
> > -        */
> > +       mulpd   %xmm2, %xmm0
> > +       addpd   %xmm3, %xmm0
> >
> > -L(SPECIAL_VALUES_LOOP):
> > -       incl    %r12d
> > -       cmpl    $4, %r12d
> > -
> > -       /* Check bits in range mask */
> > -       jl      L(RANGEMASK_CHECK)
> > -       # LOE rbx rbp r15 r12d r13d
> > -
> > -       movq    16(%rsp), %r12
> > -       cfi_restore(12)
> > -       movq    8(%rsp), %r13
> > -       cfi_restore(13)
> > -       movq    (%rsp), %r14
> > -       cfi_restore(14)
> > -       movups  48(%rsp), %xmm0
> > -
> > -       /* Go to exit */
> > -       jmp     L(EXIT)
> > -       cfi_offset(12, -64)
> > -       cfi_offset(13, -72)
> > -       cfi_offset(14, -80)
> > -       # LOE rbx rbp r12 r13 r14 r15 xmm0
> > +       cvtpd2ps %xmm0, %xmm2
> > +       cvtpd2ps %xmm6, %xmm0
> >
> > -       /* Scalar math fucntion call
> > -        * to process special input
> > -        */
> > +       movlhps %xmm2, %xmm0
> > +       andnps  %xmm12, %xmm1
> > +       orps    %xmm1, %xmm0
> >
> > -L(SCALAR_MATH_CALL):
> > -       movl    %r12d, %r14d
> > -       movss   32(%rsp, %r14, 4), %xmm0
> > -       call    tanhf@PLT
> > -       # LOE rbx rbp r14 r15 r12d r13d xmm0
> > +       /* xmm8 contains mask of special values.  */
> > +       pcmpgtd TANHF_DATA(_iExpMask)(%rip), %xmm8
> >
> > -       movss   %xmm0, 48(%rsp, %r14, 4)
> > +       movmskps %xmm8, %edx
> > +       testl   %edx, %edx
> >
> > -       /* Process special inputs in loop */
> > -       jmp     L(SPECIAL_VALUES_LOOP)
> > -       # LOE rbx rbp r15 r12d r13d
> > -END(_ZGVbN4v_tanhf_sse4)
> > +       /* Go to special inputs processing branch */
> > +       jne     L(SPECIAL_VALUES_BRANCH)
> > +       # LOE rbx rbp r12 r13 r14 r15 xmm0
> > +       /* No stack restoration on the fastpath.  */
> > +       ret
> >
> > -       .section .rodata, "a"
> > -       .align  16
> > -
> > -#ifdef __svml_stanh_data_internal_typedef
> > -typedef unsigned int VUINT32;
> > -typedef struct {
> > -       __declspec(align(16)) VUINT32 _dbP[(134*4)][2];
> > -       __declspec(align(16)) VUINT32 _sSignMask[4][1];
> > -       __declspec(align(16)) VUINT32 _sAbsMask[4][1];
> > -       __declspec(align(16)) VUINT32 _iExpMantMask[4][1];
> > -       __declspec(align(16)) VUINT32 _iExpMask[4][1];
> > -       __declspec(align(16)) VUINT32 _iMinIdxOfsMask[4][1];
> > -       __declspec(align(16)) VUINT32 _iMaxIdxMask[4][1];
> > -} __svml_stanh_data_internal;
> > -#endif
> > -__svml_stanh_data_internal:
> > -       /* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500]. */
> > -       .quad   0x0000000000000000 /* A00 = +0.000000000000000000000e-01 */
> > -       .quad   0x3FF00000022C70EB /* A01 = +1.000000008097283510367e+00 */
> > -       .quad   0xBED00E878CFFA194 /* A02 = -3.828228912518614443549e-06 */
> > -       .quad   0xBFD551766D0607A9 /* A03 = -3.330970825846813476723e-01 */
> > -       .quad   0xBE53D60CE3E4C297 /* A00 = -1.847383956330407336230e-08 */
> > -       .quad   0x3FF000024177CF5C /* A01 = +1.000002151235967140508e+00 */
> > -       .quad   0xBF1758BC94A51A25 /* A02 = -8.906031613262943753568e-05 */
> > -       .quad   0xBFD53EAE67E0D4F0 /* A03 = -3.319507612644221339337e-01 */
> > -       .quad   0xBE5A9E47EF32D6FE /* A00 = -2.479020984039698285657e-08 */
> > -       .quad   0x3FF00002DA983057 /* A01 = +1.000002721676556793895e+00 */
> > -       .quad   0xBF1BD953509E94AA /* A02 = -1.062352277175377670507e-04 */
> > -       .quad   0xBFD53BDB562EEDD5 /* A03 = -3.317783681520414806876e-01 */
> > -       .quad   0xBE6191BBE496D294 /* A00 = -3.272532162914017685901e-08 */
> > -       .quad   0x3FF0000390492017 /* A01 = +1.000003398528866105366e+00 */
> > -       .quad   0xBF20727E814A57CE /* A02 = -1.254825043772153972919e-04 */
> > -       .quad   0xBFD538DE060A6F22 /* A03 = -3.315959033004550748913e-01 */
> > -       .quad   0xBE66DAFA2A893A25 /* A00 = -4.257146219278012568149e-08 */
> > -       .quad   0x3FF0000465E08CD1 /* A01 = +1.000004194219219266770e+00 */
> > -       .quad   0xBF2341C765EF91B6 /* A02 = -1.469188600530365522261e-04 */
> > -       .quad   0xBFD535B6841FAF9E /* A03 = -3.314033785124993469751e-01 */
> > -       .quad   0xBE6D5794E361E964 /* A00 = -5.465394929765249413434e-08 */
> > -       .quad   0x3FF000055EE2A0CB /* A01 = +1.000005121846742950353e+00 */
> > -       .quad   0xBF265E6C77E66C8B /* A02 = -1.706607253709506650304e-04 */
> > -       .quad   0xBFD53264DDCCEDA6 /* A03 = -3.312008062382240103361e-01 */
> > -       .quad   0xBE729C844D374A6E /* A00 = -6.933284462462096107184e-08 */
> > -       .quad   0x3FF000067F019093 /* A01 = +1.000006195180536350264e+00 */
> > -       .quad   0xBF29CC5348D6DCE5 /* A02 = -1.968242326435338705130e-04 */
> > -       .quad   0xBFD52EE92121ED35 /* A03 = -3.309881995734998416658e-01 */
> > -       .quad   0xBE775AEA17EAA872 /* A00 = -8.700465590574974405858e-08 */
> > -       .quad   0x3FF00007CA1D66B8 /* A01 = +1.000007428656699559610e+00 */
> > -       .quad   0xBF2D8F5EB98A2637 /* A02 = -2.255252009216044881395e-04 */
> > -       .quad   0xBFD52B435CDF9128 /* A03 = -3.307655722585587376727e-01 */
> > -       .quad   0xBE7D04DA28C343F0 /* A00 = -1.081040272327705484794e-07 */
> > -       .quad   0x3FF000094443CCF5 /* A01 = +1.000008837375216730337e+00 */
> > -       .quad   0xBF30D5B76C947AE5 /* A02 = -2.568791210978817814332e-04 */
> > -       .quad   0xBFD52773A0776FAD /* A03 = -3.305329386764651045105e-01 */
> > -       .quad   0xBE81DD77A12C51C7 /* A00 = -1.331054169875768625701e-07 */
> > -       .quad   0x3FF0000AF1AFD2DA /* A01 = +1.000010437096696680470e+00 */
> > -       .quad   0xBF331230624C1680 /* A02 = -2.910011410651516805537e-04 */
> > -       .quad   0xBFD52379FC0B61DF /* A03 = -3.302903138515186909352e-01 */
> > -       .quad   0xBE85D04EEEB3C435 /* A00 = -1.625247628488202841012e-07 */
> > -       .quad   0x3FF0000CD6C9B1F2 /* A01 = +1.000012244238970726684e+00 */
> > -       .quad   0xBF357F0742FADDD4 /* A02 = -3.280060509313874068243e-04 */
> > -       .quad   0xBFD51F56806D0E81 /* A03 = -3.300377134475880880338e-01 */
> > -       .quad   0xBE8A6E289B59681B /* A00 = -1.969211333326924655065e-07 */
> > -       .quad   0x3FF0000EF8268F72 /* A01 = +1.000014275873550406715e+00 */
> > -       .quad   0xBF381E277A1B747A /* A02 = -3.680082682942575423093e-04 */
> > -       .quad   0xBFD51B093F1D6FD4 /* A03 = -3.297751537663746734808e-01 */
> > -       .quad   0xBE8FCBC40EE9ABD5 /* A00 = -2.368983653301529373887e-07 */
> > -       .quad   0x3FF000115A883B6C /* A01 = +1.000016549721943981410e+00 */
> > -       .quad   0xBF3AF17AC974B3D9 /* A02 = -4.111218235774406434303e-04 */
> > -       .quad   0xBFD516924A4C549C /* A03 = -3.295026517456081105450e-01 */
> > -       .quad   0xBE92FFBC60A3F956 /* A00 = -2.831066871072026054144e-07 */
> > -       .quad   0x3FF0001402DCED8A /* A01 = +1.000019084151832604590e+00 */
> > -       .quad   0xBF3DFAE9390C4801 /* A02 = -4.574603454311488280083e-04 */
> > -       .quad   0xBFD511F1B4D7DC3A /* A03 = -3.292202249571719585575e-01 */
> > -       .quad   0xBE9690A22F96D5AD /* A00 = -3.362443262393081632612e-07 */
> > -       .quad   0x3FF00016F63EFF5D /* A01 = +1.000021898173108825247e+00 */
> > -       .quad   0xBF409E2C839605BB /* A02 = -5.071370461992499986334e-04 */
> > -       .quad   0xBFD50D27924BEE00 /* A03 = -3.289278916051614487515e-01 */
> > -       .quad   0xBE9AA56C65E72A73 /* A00 = -3.970591019557469835586e-07 */
> > -       .quad   0x3FF0001A39F4A43E /* A01 = +1.000025011433776978009e+00 */
> > -       .quad   0xBF425BD74C3D6667 /* A02 = -5.602647074553602319844e-04 */
> > -       .quad   0xBFD50833F6E1ABA2 /* A03 = -3.286256705238718156536e-01 */
> > -       .quad   0xBE9F4BD4FF1A83B0 /* A00 = -4.663500013744687071912e-07 */
> > -       .quad   0x3FF0001DD36F9EC2 /* A01 = +1.000028444215715683896e+00 */
> > -       .quad   0xBF44376634149405 /* A02 = -6.169556656102642569831e-04 */
> > -       .quad   0xBFD50316F77EDEE5 /* A03 = -3.283135811757190158922e-01 */
> > -       .quad   0xBEA3B625387BB079 /* A00 = -5.874486399249461304297e-07 */
> > -       .quad   0x3FF00023E14CFBA9 /* A01 = +1.000034217911642153709e+00 */
> > -       .quad   0xBF47392F923218D2 /* A02 = -7.087213783883111826306e-04 */
> > -       .quad   0xBFD4FB1FACDEB938 /* A03 = -3.278273761924483942209e-01 */
> > -       .quad   0xBEAA6E24F543500A /* A00 = -7.876828740601738750574e-07 */
> > -       .quad   0x3FF0002D5C6E8412 /* A01 = +1.000043259679163742959e+00 */
> > -       .quad   0xBF4BAF02BD7FDD70 /* A02 = -8.448375110664940040861e-04 */
> > -       .quad   0xBFD4EFEE6527A7DE /* A03 = -3.271442401734229177279e-01 */
> > -       .quad   0xBEB16E3EBE2157D0 /* A00 = -1.038947396133402500647e-06 */
> > -       .quad   0x3FF00038990FEE2F /* A01 = +1.000053975962952312884e+00 */
> > -       .quad   0xBF50569481C574CB /* A02 = -9.972048056490652716971e-04 */
> > -       .quad   0xBFD4E419278DA2B4 /* A03 = -3.264220129263251113372e-01 */
> > -       .quad   0xBEB6A7B6723165D4 /* A00 = -1.350350836279403750524e-06 */
> > -       .quad   0x3FF00045CAB4158E /* A01 = +1.000066558657042303793e+00 */
> > -       .quad   0xBF531D7C9C849108 /* A02 = -1.166698160951775212202e-03 */
> > -       .quad   0xBFD4D7A0BB33B152 /* A03 = -3.256608799117844954552e-01 */
> > -       .quad   0xBEBD0EE2A8654AFD /* A00 = -1.732000471561702711532e-06 */
> > -       .quad   0x3FF00055276F18D6 /* A01 = +1.000081209219890521211e+00 */
> > -       .quad   0xBF562FDBA3FB6C6C /* A02 = -1.354183666925102939860e-03 */
> > -       .quad   0xBFD4CA85F1B93DB2 /* A03 = -3.248610363561638125773e-01 */
> > -       .quad   0xBEC269D4036A207E /* A00 = -2.195047297096822741730e-06 */
> > -       .quad   0x3FF00066E7DA6E4E /* A01 = +1.000098138500919997540e+00 */
> > -       .quad   0xBF5991499FC36B3A /* A02 = -1.560518167983372759405e-03 */
> > -       .quad   0xBFD4BCC9A72283D6 /* A03 = -3.240226871658341556426e-01 */
> > -       .quad   0xBEC7154B6C09CFE1 /* A00 = -2.751729738565190291276e-06 */
> > -       .quad   0x3FF0007B47086B80 /* A01 = +1.000117566559055148900e+00 */
> > -       .quad   0xBF5D455433B4F8F4 /* A02 = -1.786548832412968197680e-03 */
> > -       .quad   0xBFD4AE6CC1BFE145 /* A03 = -3.231460468373550942722e-01 */
> > -       .quad   0xBECCA68CC64A0F8A /* A00 = -3.415415948561670285790e-06 */
> > -       .quad   0x3FF00092827742F7 /* A01 = +1.000139722473418535387e+00 */
> > -       .quad   0xBF60A7BF15A527AF /* A02 = -2.033112728132522705610e-03 */
> > -       .quad   0xBFD49F703214084C /* A03 = -3.222313393636155876010e-01 */
> > -       .quad   0xBED19E68676B241B /* A00 = -4.200644630977303616698e-06 */
> > -       .quad   0x3FF000ACDA037B26 /* A01 = +1.000164844146362863597e+00 */
> > -       .quad   0xBF62D99F836A02F8 /* A02 = -2.301036405072284102280e-03 */
> > -       .quad   0xBFD48FD4F2B91B28 /* A03 = -3.212787981359945810311e-01 */
> > -       .quad   0xBED57CF4B0C7AA54 /* A00 = -5.123164339408145209103e-06 */
> > -       .quad   0x3FF000CA8FD9E1A1 /* A01 = +1.000193178099017865534e+00 */
> > -       .quad   0xBF653A014548E686 /* A02 = -2.591135484433962181405e-03 */
> > -       .quad   0xBFD47F9C0844B38F /* A03 = -3.202886658426046806447e-01 */
> > -       .quad   0xBEDA012B1B1A41E2 /* A00 = -6.199971197454598722328e-06 */
> > -       .quad   0x3FF000EBE868FDF4 /* A01 = +1.000224979259539459520e+00 */
> > -       .quad   0xBF67CA9427E0A544 /* A02 = -2.904214255086275467410e-03 */
> > -       .quad   0xBFD46EC6812ADB37 /* A03 = -3.192611943626845749655e-01 */
> > -       .quad   0xBEDF3EAC5BF12194 /* A00 = -7.449344990702664567927e-06 */
> > -       .quad   0x3FF001112A520784 /* A01 = +1.000260510744255704196e+00 */
> > -       .quad   0xBF6A8D01ABDA4DC4 /* A02 = -3.241065277345108255891e-03 */
> > -       .quad   0xBFD45D55759FFA4A /* A03 = -3.181966446572103146551e-01 */
> > -       .quad   0xBEE2A541BC274267 /* A00 = -8.890883582164319970972e-06 */
> > -       .quad   0x3FF0013A9E5961F2 /* A01 = +1.000300043631906721231e+00 */
> > -       .quad   0xBF6D82ECD080C540 /* A02 = -3.602468994380686462264e-03 */
> > -       .quad   0xBFD44B4A0779C0AD /* A03 = -3.170952866557950611259e-01 */
> > -       .quad   0xBEE61D97609A27F4 /* A00 = -1.054553560499505625520e-05 */
> > -       .quad   0x3FF001688F56A3AF /* A01 = +1.000343856731187974773e+00 */
> > -       .quad   0xBF7056F8EFB683EC /* A02 = -3.989193351487490407647e-03 */
> > -       .quad   0xBFD438A5620F0F74 /* A03 = -3.159573991399533543500e-01 */
> > -       .quad   0xBEEA145429EDD370 /* A00 = -1.243563138839952927732e-05 */
> > -       .quad   0x3FF0019B4A242A67 /* A01 = +1.000392236341804297339e+00 */
> > -       .quad   0xBF7207D31CA78D9B /* A02 = -4.401993423445739288258e-03 */
> > -       .quad   0xBFD42568BA16E7CD /* A03 = -3.147832696228050619602e-01 */
> > -       .quad   0xBEEE96370D52680F /* A00 = -1.458491207477835326165e-05 */
> > -       .quad   0x3FF001D31D8E4115 /* A01 = +1.000445476009251821736e+00 */
> > -       .quad   0xBF73D4CC11EDC094 /* A02 = -4.841611050196221316400e-03 */
> > -       .quad   0xBFD411954D8664E7 /* A03 = -3.135731942252974469021e-01 */
> > -       .quad   0xBEF338C046215EF8 /* A00 = -1.833122622260562810219e-05 */
> > -       .quad   0x3FF00230C32C2EC1 /* A01 = +1.000534784691737621998e+00 */
> > -       .quad   0xBF76BD019BCC5DAF /* A02 = -5.551344188254799492943e-03 */
> > -       .quad   0xBFD3F2C7156DC21E /* A03 = -3.116929730668135389848e-01 */
> > -       .quad   0xBEF9B15EAE411EAE /* A00 = -2.450261207822986676092e-05 */
> > -       .quad   0x3FF002C2DF057A4D /* A01 = +1.000674124886830940184e+00 */
> > -       .quad   0xBF7B08CCD9AC1E30 /* A02 = -6.600189396301511801646e-03 */
> > -       .quad   0xBFD3C7A7A114FED8 /* A03 = -3.090609620157755976777e-01 */
> > -       .quad   0xBF00E36483C373B3 /* A00 = -3.221178528332122595812e-05 */
> > -       .quad   0x3FF0036F419480D7 /* A01 = +1.000838524028997644777e+00 */
> > -       .quad   0xBF7FD255D1777007 /* A02 = -7.768950679260206403087e-03 */
> > -       .quad   0xBFD39A453911D6CE /* A03 = -3.062909180947429588215e-01 */
> > -       .quad   0xBF05DFA04DD12059 /* A00 = -4.172046622180685472624e-05 */
> > -       .quad   0x3FF00438B2A03D8D /* A01 = +1.001030633695197069599e+00 */
> > -       .quad   0xBF828F8DBB4A9D10 /* A02 = -9.062869337255224921890e-03 */
> > -       .quad   0xBFD36AAB704697D9 /* A03 = -3.033856007044711255993e-01 */
> > -       .quad   0xBF0BF3E0C647DEFB /* A00 = -5.331544597092331081714e-05 */
> > -       .quad   0x3FF005221063D36D /* A01 = +1.001253189109060359741e+00 */
> > -       .quad   0xBF857A2CB3C96102 /* A02 = -1.048693584122917590862e-02 */
> > -       .quad   0xBFD338E65BBB4FEC /* A03 = -3.003478904549854444639e-01 */
> > -       .quad   0xBF11A506ED7C9D31 /* A00 = -6.730894835681591541979e-05 */
> > -       .quad   0x3FF0062E4D0EA92A /* A01 = +1.001508999829250345925e+00 */
> > -       .quad   0xBF88AB82C2761AF3 /* A02 = -1.204588085125866091241e-02 */
> > -       .quad   0xBFD305028D6BD206 /* A03 = -2.971807843271395688234e-01 */
> > -       .quad   0xBF1607C0922D9BF1 /* A00 = -8.403885708006799337092e-05 */
> > -       .quad   0x3FF007606C341961 /* A01 = +1.001800940198869449560e+00 */
> > -       .quad   0xBF8C25E6DA487BCF /* A02 = -1.374416688582682892494e-02 */
> > -       .quad   0xBFD2CF0D0EE8F7B5 /* A03 = -2.938873906713255768075e-01 */
> > -       .quad   0xBF1B3A8480A0A16D /* A00 = -1.038688061788578038307e-04 */
> > -       .quad   0x3FF008BB802D02D6 /* A01 = +1.002131939589323561535e+00 */
> > -       .quad   0xBF8FEB8AE99FD100 /* A02 = -1.558598065819483124983e-02 */
> > -       .quad   0xBFD297135BD0911B /* A03 = -2.904709240558688843059e-01 */
> > -       .quad   0xBF20ABB9BDB75C65 /* A00 = -1.271881327357976163798e-04 */
> > -       .quad   0x3FF00A42A76D8CD1 /* A01 = +1.002504972472525901495e+00 */
> > -       .quad   0xBF91FF3D752BB9E6 /* A02 = -1.757522609380570560722e-02 */
> > -       .quad   0xBFD25D235C1F88B4 /* A03 = -2.869346999779154305799e-01 */
> > -       .quad   0xBF243D3254425461 /* A00 = -1.544116913733432829448e-04 */
> > -       .quad   0x3FF00BF909D1795E /* A01 = +1.002923048355647051011e+00 */
> > -       .quad   0xBF94304E04D44942 /* A02 = -1.971551804042204897316e-02 */
> > -       .quad   0xBFD2214B5E61CFA6 /* A03 = -2.832821294498394371075e-01 */
> > -       .quad   0xBF286070011B61CE /* A00 = -1.859795307186510085994e-04 */
> > -       .quad   0x3FF00DE1D5E1627E /* A01 = +1.003389201612804537689e+00 */
> > -       .quad   0xBF9689D5F4163F59 /* A02 = -2.201017668045266231780e-02 */
> > -       .quad   0xBFD1E39A11C3B42C /* A03 = -2.795167134743816728104e-01 */
> > -       .quad   0xBF2D250B366A79E8 /* A00 = -2.223564326486314902259e-04 */
> > -       .quad   0x3FF010003E134001 /* A01 = +1.003906481248123094829e+00 */
> > -       .quad   0xBF990C9FF91F6F81 /* A02 = -2.446222265267250853271e-02 */
> > -       .quad   0xBFD1A41E80084CDC /* A03 = -2.756420374218586655246e-01 */
> > -       .quad   0xBF314DB5DDC2A30E /* A00 = -2.640313157465248123865e-04 */
> > -       .quad   0x3FF012577608921B /* A01 = +1.004477940624503018441e+00 */
> > -       .quad   0xBF9BB9626875B0C9 /* A02 = -2.707437288829409385849e-02 */
> > -       .quad   0xBFD162E80768A9D0 /* A03 = -2.716617653228725615122e-01 */
> > -       .quad   0xBF346A6133808864 /* A00 = -3.115165050094957730625e-04 */
> > -       .quad   0x3FF014EAAFCC88A3 /* A01 = +1.005106627192198898157e+00 */
> > -       .quad   0xBF9E90BEF9BF7419 /* A02 = -2.984903716411588595059e-02 */
> > -       .quad   0xBFD12006545F7FAD /* A03 = -2.675796340899932457269e-01 */
> > -       .quad   0xBF37F180DC3848EA /* A00 = -3.653468704395550778821e-04 */
> > -       .quad   0x3FF017BD19147861 /* A01 = +1.005795572250939295955e+00 */
> > -       .quad   0xBFA0C9A14C702E07 /* A02 = -3.278831537326359207851e-02 */
> > -       .quad   0xBFD0DB895B650092 /* A03 = -2.633994476818851682154e-01 */
> > -       .quad   0xBF3BEC6AAC6D7635 /* A00 = -4.260788377246944457107e-04 */
> > -       .quad   0x3FF01AD1D884E719 /* A01 = +1.006547780778822565040e+00 */
> > -       .quad   0xBFA260B2A1B1434A /* A02 = -3.589399551186163439542e-02 */
> > -       .quad   0xBFD09581529E93D6 /* A03 = -2.591250712233067465817e-01 */
> > -       .quad   0xBF4164E26167882B /* A00 = -5.308251737086202562063e-04 */
> > -       .quad   0x3FF01FEF14B62B81 /* A01 = +1.007796364693348545316e+00 */
> > -       .quad   0xBFA4EB014538AA42 /* A02 = -4.085544557559163403315e-02 */
> > -       .quad   0xBFD029D36FEAF41F /* A03 = -2.525528519580024222613e-01 */
> > -       .quad   0xBF46F6FFF4E53DC8 /* A00 = -7.008313930700277652464e-04 */
> > -       .quad   0x3FF027CBB51CBBA0 /* A01 = +1.009715754956893363214e+00 */
> > -       .quad   0xBFA89DEC9FEC112E /* A02 = -4.807986690687680864098e-02 */
> > -       .quad   0xBFCF2A99464D0DB4 /* A03 = -2.434875100390009317053e-01 */
> > -       .quad   0xBF4DCC9C4F66A4D9 /* A00 = -9.094012482836712945103e-04 */
> > -       .quad   0x3FF030E7CFCCD583 /* A01 = +1.011939822882909068014e+00 */
> > -       .quad   0xBFACAA3B95814081 /* A02 = -5.598627281199331645611e-02 */
> > -       .quad   0xBFCDF78F156BE7CF /* A03 = -2.341173987004467604844e-01 */
> > -       .quad   0xBF5308ED74E5C7A6 /* A00 = -1.161796466103906435435e-03 */
> > -       .quad   0x3FF03B5986412ECB /* A01 = +1.014489674026594512313e+00 */
> > -       .quad   0xBFB087EBA88DCC3F /* A02 = -6.457398285947223148806e-02 */
> > -       .quad   0xBFCCBB9BD134862F /* A03 = -2.244753619680052991736e-01 */
> > -       .quad   0xBF57FA23C00DF4B5 /* A00 = -1.463446533505758208674e-03 */
> > -       .quad   0x3FF0473558A1BCC0 /* A01 = +1.017384859292903342975e+00 */
> > -       .quad   0xBFB2E702BC6360EF /* A02 = -7.383744334527241048871e-02 */
> > -       .quad   0xBFCB77D546379288 /* A03 = -2.145945160729250122955e-01 */
> > -       .quad   0xBF5DD12971557F71 /* A00 = -1.819887610814388068450e-03 */
> > -       .quad   0x3FF0548DDF5000A8 /* A01 = +1.020643112482540360020e+00 */
> > -       .quad   0xBFB571B63DA186E1 /* A02 = -8.376635555898871710045e-02 */
> > -       .quad   0xBFCA2D5202605148 /* A03 = -2.045080672838912594358e-01 */
> > -       .quad   0xBF6252B1AD5D4F17 /* A00 = -2.236697221556737096709e-03 */
> > -       .quad   0x3FF063738A910BF7 /* A01 = +1.024280110622155737232e+00 */
> > -       .quad   0xBFB8270C8E6B601B /* A02 = -9.434584118878357184013e-02 */
> > -       .quad   0xBFC8DD27D950A07E /* A03 = -1.942491351230763441116e-01 */
> > -       .quad   0xBF66470C91730CFC /* A00 = -2.719425723258004842786e-03 */
> > -       .quad   0x3FF073F468FCF331 /* A01 = +1.028309259519300633556e+00 */
> > -       .quad   0xBFBB05C2952191E4 /* A02 = -1.055566419686964629854e-01 */
> > -       .quad   0xBFC7886A770DE2BD /* A03 = -1.838505822486435070662e-01 */
> > -       .quad   0xBF6AD114AC8E98EC /* A00 = -3.273525599485007861467e-03 */
> > -       .quad   0x3FF0861BF53E5226 /* A01 = +1.032741506559554434119e+00 */
> > -       .quad   0xBFBE0C4F9B461507 /* A02 = -1.173753503881763554650e-01 */
> > -       .quad   0xBFC6302A037CDE3A /* A03 = -1.733448521642786954722e-01 */
> > -       .quad   0xBF6FFBDE2A6C2AF8 /* A00 = -3.904279630096648551207e-03 */
> > -       .quad   0x3FF099F2EB8E7DA3 /* A01 = +1.037585182326304034106e+00 */
> > -       .quad   0xBFC09C74D192DDF0 /* A02 = -1.297746680554463516444e-01 */
> > -       .quad   0xBFC4D571D8E3079F /* A03 = -1.627638157861470424859e-01 */
> > -       .quad   0xBF72E8FDC0B952AA /* A00 = -4.616728994353872309042e-03 */
> > -       .quad   0x3FF0AF7F273C9533 /* A01 = +1.042845872181101141152e+00 */
> > -       .quad   0xBFC244C512736F10 /* A02 = -1.427236881344176033792e-01 */
> > -       .quad   0xBFC379474F58B902 /* A03 = -1.521386277613104298645e-01 */
> > -       .quad   0xBF762EABAF17395B /* A00 = -5.415602341101023557701e-03 */
> > -       .quad   0x3FF0C6C3886F63FB /* A01 = +1.048526318502125631582e+00 */
> > -       .quad   0xBFC3FDF9918EA12A /* A02 = -1.561881981590514389957e-01 */
> > -       .quad   0xBFC21CA89ECAB895 /* A03 = -1.414995932913753196036e-01 */
> > -       .quad   0xBF79D387CE5B2BAE /* A00 = -6.305246822828998107258e-03 */
> > -       .quad   0x3FF0DFBFE2346376 /* A01 = +1.054626353847394337748e+00 */
> > -       .quad   0xBFC5C6DA43602620 /* A02 = -1.701309994680721970894e-01 */
> > -       .quad   0xBFC0C08BD8DB6631 /* A03 = -1.308760460731704100557e-01 */
> > -       .quad   0xBF7DDBA8E8DA9060 /* A00 = -7.289562037531366334164e-03 */
> > -       .quad   0x3FF0FA70F0D1B464 /* A01 = +1.061142864894713433443e+00 */
> > -       .quad   0xBFC79E18D92BAA7C /* A02 = -1.845122394946264732241e-01 */
> > -       .quad   0xBFBECBBBF74C2669 /* A03 = -1.202962378266875381749e-01 */
> > -       .quad   0xBF81254E76EA25DA /* A00 = -8.371937755572145950511e-03 */
> > -       .quad   0x3FF116D05835EBD0 /* A01 = +1.068069786618014660462e+00 */
> > -       .quad   0xBFC982539E2ED224 /* A02 = -1.992897531869327609755e-01 */
> > -       .quad   0xBFBC1B043C350159 /* A03 = -1.097872397413132278254e-01 */
> > -       .quad   0xBF8391ACBA863403 /* A00 = -9.555196230190082448686e-03 */
> > -       .quad   0x3FF134D4AA477FE2 /* A01 = +1.075398125794884141015e+00 */
> > -       .quad   0xBFCB7218609FEAFB /* A02 = -2.144194099235717521079e-01 */
> > -       .quad   0xBFB970A16CB88329 /* A03 = -9.937485603633135211599e-02 */
> > -       .quad   0xBF87935088E48E8B /* A00 = -1.151144902957603431692e-02 */
> > -       .quad   0x3FF1649892AD7DD3 /* A01 = +1.087059567413110938716e+00 */
> > -       .quad   0xBFCE6971DDE75409 /* A02 = -2.375929196847723912089e-01 */
> > -       .quad   0xBFB58291E88CB251 /* A03 = -8.402358939628952472223e-02 */
> > -       .quad   0xBF8DB3A62C325325 /* A00 = -1.450280973794233242702e-02 */
> > -       .quad   0x3FF1A9C900C6DEEA /* A01 = +1.103951457056548068891e+00 */
> > -       .quad   0xBFD13DBC65B0E08E /* A02 = -2.693930619311765140012e-01 */
> > -       .quad   0xBFB06696F62696D1 /* A03 = -6.406539449252625362252e-02 */
> > -       .quad   0xBF92583699F2E27A /* A00 = -1.791463198307716858659e-02 */
> > -       .quad   0x3FF1F451B85AA9F0 /* A01 = +1.122148246892376022288e+00 */
> > -       .quad   0xBFD34FD5F8288180 /* A02 = -3.017477916164565954205e-01 */
> > -       .quad   0xBFA6FB692825B683 /* A03 = -4.488686194495718900788e-02 */
> > -       .quad   0xBF9641C26E673D6F /* A00 = -2.173522757385398448959e-02 */
> > -       .quad   0x3FF24364DA5E2B07 /* A01 = +1.141453602790251542487e+00 */
> > -       .quad   0xBFD564A5A5EF5890 /* A02 = -3.342680092295120530821e-01 */
> > -       .quad   0xBF9B43712011A982 /* A03 = -2.662445791467283467968e-02 */
> > -       .quad   0xBF9A901038EC2F39 /* A00 = -2.594018313816024226548e-02 */
> > -       .quad   0x3FF2961356DFFEBA /* A01 = +1.161639537196534011088e+00 */
> > -       .quad   0xBFD775EBB17198C7 /* A02 = -3.665723069046972759644e-01 */
> > -       .quad   0xBF833B1A926CD462 /* A03 = -9.390075295963199591975e-03 */
> > -       .quad   0xBF9F396A6A461B91 /* A00 = -3.049246095317987084727e-02 */
> > -       .quad   0x3FF2EB53BAEF534B /* A01 = +1.182452898229899629357e+00 */
> > -       .quad   0xBFD97DABF8AD8BBD /* A02 = -3.982953957076310058660e-01 */
> > -       .quad   0x3F7B8F6A3E0F8837 /* A03 = +6.728568086119371925713e-03 */
> > -       .quad   0xBFA21878590F8BAA /* A00 = -3.534294211546946951064e-02 */
> > -       .quad   0x3FF34209790236E1 /* A01 = +1.203622315111197105253e+00 */
> > -       .quad   0xBFDB764C0E71BECB /* A02 = -4.290952817018306997277e-01 */
> > -       .quad   0x3F962FE0C03F84C0 /* A03 = +2.166701482190513949888e-02 */
> > -       .quad   0xBFA4B36B9AD27ECC /* A00 = -4.043136849327097492868e-02 */
> > -       .quad   0x3FF3990C5B12FC16 /* A01 = +1.224865298994477935679e+00 */
> > -       .quad   0xBFDD5AABB0D01390 /* A02 = -4.586590983092770912322e-01 */
> > -       .quad   0x3FA21DAF5CA162DB /* A03 = +3.538272863142363083844e-02 */
> > -       .quad   0xBFA7645E4D7BF28B /* A00 = -4.568762489177399105378e-02 */
> > -       .quad   0x3FF3EF2FD51C0D9F /* A01 = +1.245895225962932562069e+00 */
> > -       .quad   0xBFDF26377E1B686E /* A02 = -4.867075664057044503963e-01 */
> > -       .quad   0x3FA8803E756EE812 /* A03 = +4.785342391501513914509e-02 */
> > -       .quad   0xBFAA210925C64413 /* A00 = -5.103329263796054643398e-02 */
> > -       .quad   0x3FF44349F897D8E7 /* A01 = +1.266427966181760345066e+00 */
> > -       .quad   0xBFE06A7B02C6D8E2 /* A02 = -5.129981092675530707226e-01 */
> > -       .quad   0x3FAE3F194734F5D0 /* A03 = +5.907515520309980505687e-02 */
> > -       .quad   0xBFACDE48F8A19BBB /* A00 = -5.638340029764018351832e-02 */
> > -       .quad   0x3FF49439D5466582 /* A01 = +1.286187966447272845727e+00 */
> > -       .quad   0xBFE131C7C1063DDC /* A02 = -5.373266954429101183166e-01 */
> > -       .quad   0x3FB1ADEEC36AD805 /* A03 = +6.906025191241844940482e-02 */
> > -       .quad   0xBFAF905D8F585680 /* A00 = -6.164829611604449866036e-02 */
> > -       .quad   0x3FF4E0ED1FD27F99 /* A01 = +1.304913639360142818546e+00 */
> > -       .quad   0xBFE1E7A859DC1D3D /* A02 = -5.595285182070380836095e-01 */
> > -       .quad   0x3FB3ED018E4642A1 /* A03 = +7.783517573831001679086e-02 */
> > -       .quad   0xBFB11595104160BA /* A00 = -6.673556944713512906198e-02 */
> > -       .quad   0x3FF528650340490B /* A01 = +1.322361958217302513319e+00 */
> > -       .quad   0xBFE28B14B40BC974 /* A02 = -5.794776455425521000109e-01 */
> > -       .quad   0x3FB5DF49F5BAF6D7 /* A03 = +8.543836831355676453281e-02 */
> > -       .quad   0xBFB2513A97344BA4 /* A00 = -7.155195418844911836587e-02 */
> > -       .quad   0x3FF569BA0DB5EE14 /* A01 = +1.338312200124055273420e+00 */
> > -       .quad   0xBFE31B53A8B67B20 /* A02 = -5.970857901737396389308e-01 */
> > -       .quad   0x3FB787F297BB0544 /* A03 = +9.191814617499455275507e-02 */
> > -       .quad   0xBFB37512E848FAFA /* A00 = -7.600515528700305112331e-02 */
> > -       .quad   0x3FF5A41F33B403C8 /* A01 = +1.352568819013173495591e+00 */
> > -       .quad   0xBFE397F6EA9A58A5 /* A02 = -6.123003561103997904880e-01 */
> > -       .quad   0x3FB8EAA9FF25CA06 /* A03 = +9.733068923177520814782e-02 */
> > -       .quad   0xBFB47B3E603AFC5D /* A00 = -8.000554894805263217439e-02 */
> > -       .quad   0x3FF5D6E3EDE40487 /* A01 = +1.364963464031718975988e+00 */
> > -       .quad   0xBFE400D5BCA6D631 /* A02 = -6.251019177058819709103e-01 */
> > -       .quad   0x3FBA0B830ED567FE /* A03 = +1.017381583418739132707e-01 */
> > -       .quad   0xBFB5BBFE8AC90496 /* A00 = -8.489981544791400103200e-02 */
> > -       .quad   0x3FF612BA70107E95 /* A01 = +1.379572332145390989311e+00 */
> > -       .quad   0xBFE477EAF1FA7693 /* A02 = -6.396383978023599814478e-01 */
> > -       .quad   0x3FBB4784B7C08A95 /* A03 = +1.065600346196709652391e-01 */
> > -       .quad   0xBFB6D5D940743939 /* A00 = -8.920057128509463473254e-02 */
> > -       .quad   0x3FF644A8748F70CE /* A01 = +1.391762214006166953340e+00 */
> > -       .quad   0xBFE4D646AB07EA37 /* A02 = -6.511567440459832267763e-01 */
> > -       .quad   0x3FBC354F4E1D5292 /* A03 = +1.101884427747086558913e-01 */
> > -       .quad   0xBFB7223D19E4F3D1 /* A00 = -9.036619074045339206069e-02 */
> > -       .quad   0x3FF6518FEB42B7FA /* A01 = +1.394912642466350494175e+00 */
> > -       .quad   0xBFE4ED86CB87498C /* A02 = -6.539949393430091184598e-01 */
> > -       .quad   0x3FBC6D29F28CCA9B /* A03 = +1.110407082713131127205e-01 */
> > -       .quad   0xBFB6878652FF6312 /* A00 = -8.800544287022329936754e-02 */
> > -       .quad   0x3FF63948C302D040 /* A01 = +1.388985406648330922508e+00 */
> > -       .quad   0xBFE4C4E2E7904E17 /* A02 = -6.490339777687407218920e-01 */
> > -       .quad   0x3FBC127356CA1ABE /* A03 = +1.096565329445224612481e-01 */
> > -       .quad   0xBFB4F5D18B0C91D6 /* A00 = -8.187589306596207427980e-02 */
> > -       .quad   0x3FF5FD27EB7DD0B8 /* A01 = +1.374305648697413673176e+00 */
> > -       .quad   0xBFE464E01A2B2FC6 /* A02 = -6.373138915164353601739e-01 */
> > -       .quad   0x3FBB460547674A30 /* A03 = +1.065371798825160976065e-01 */
> > -       .quad   0xBFB26642FA16A685 /* A00 = -7.187288861919156890412e-02 */
> > -       .quad   0x3FF59F9BEDE1C95A /* A01 = +1.351467065073470141812e+00 */
> > -       .quad   0xBFE3D67920C8FBEA /* A02 = -6.199308052381387046381e-01 */
> > -       .quad   0x3FBA24F6A8D3CBC1 /* A03 = +1.021265184570401413078e-01 */
> > -       .quad   0xBFADB5294794F097 /* A00 = -5.802277563859197656582e-02 */
> > -       .quad   0x3FF523EA7B9CF453 /* A01 = +1.321268542159732772845e+00 */
> > -       .quad   0xBFE322A8B55E35DB /* A02 = -5.979808370918208160205e-01 */
> > -       .quad   0x3FB8C8673B1B3E37 /* A03 = +9.680791085269722928697e-02 */
> > -       .quad   0xBFA4B7D661965C6A /* A00 = -4.046506825687219699450e-02 */
> > -       .quad   0x3FF48DE3E2CE3122 /* A01 = +1.284641157110919085227e+00 */
> > -       .quad   0xBFE251FED1A7F445 /* A02 = -5.725092024655472622285e-01 */
> > -       .quad   0x3FB745699FCABDB9 /* A03 = +9.090290213747821701507e-02 */
> > -       .quad   0xBF93E60456E4EE1D /* A00 = -1.943213253365004902773e-02 */
> > -       .quad   0x3FF3E1A14E628A59 /* A01 = +1.242585474196536532432e+00 */
> > -       .quad   0xBFE16C5AB660E876 /* A02 = -5.444768488007543094653e-01 */
> > -       .quad   0x3FB5AD33AA8C188F /* A03 = +8.467410005332197397987e-02 */
> > -       .quad   0x3F738C17C47C7961 /* A00 = +4.772274820224659853951e-03 */
> > -       .quad   0x3FF3234DDE3BD146 /* A01 = +1.196119182682268355933e+00 */
> > -       .quad   0xBFE078C0D77A9D3B /* A02 = -5.147403915952176722826e-01 */
> > -       .quad   0x3FB40D74B3E276B8 /* A03 = +7.833032027925923568290e-02 */
> > -       .quad   0x3FA0474BECC689C7 /* A00 = +3.179394975019849550746e-02 */
> > -       .quad   0x3FF256FB4FA7D18A /* A01 = +1.146235762743432307076e+00 */
> > -       .quad   0xBFDEFA8E3FB285E2 /* A02 = -4.840427038235174395098e-01 */
> > -       .quad   0x3FB270C007493D59 /* A03 = +7.203293016322244446403e-02 */
> > -       .quad   0x3FAF5BD51E479BDC /* A00 = +6.124750132203590768931e-02 */
> > -       .quad   0x3FF18081D0B53BC5 /* A01 = +1.093873801484492647162e+00 */
> > -       .quad   0xBFDCFE2439BD0C03 /* A02 = -4.530115665294831006626e-01 */
> > -       .quad   0x3FB0DEFE5A45AFDD /* A03 = +6.590261176978580437424e-02 */
> > -       .quad   0x3FB7BD5D2806EA26 /* A00 = +9.273321368429118805032e-02 */
> > -       .quad   0x3FF0A369E35B4440 /* A01 = +1.039895904647224256223e+00 */
> > -       .quad   0xBFDB04BC5C9951E7 /* A02 = -4.221640495573226181669e-01 */
> > -       .quad   0x3FAEBBBAA9D6DEEF /* A03 = +6.002600978120919278380e-02 */
> > -       .quad   0x3FC01BE411098DBC /* A00 = +1.258511622610124502941e-01 */
> > -       .quad   0x3FEF85BDABC031C1 /* A01 = +9.850757936961188621083e-01 */
> > -       .quad   0xBFD91521375097C2 /* A02 = -3.919146576102968682065e-01 */
> > -       .quad   0x3FABE26F0086D982 /* A03 = +5.446192628317005068883e-02 */
> > -       .quad   0x3FC481D7FF5776B9 /* A00 = +1.602125164781023347604e-01 */
> > -       .quad   0x3FEDC3506C1E7218 /* A01 = +9.300920592973538347792e-01 */
> > -       .quad   0xBFD7349A88DA7D4F /* A02 = -3.625856720409119104964e-01 */
> > -       .quad   0x3FA936E2DFF8E2AE /* A03 = +4.924687370334389358018e-02 */
> > -       .quad   0x3FC90471F96FA27A /* A00 = +1.954481571149420671141e-01 */
> > -       .quad   0x3FEC0451601987A2 /* A01 = +8.755270840595026360376e-01 */
> > -       .quad   0xBFD5671CD4B898DC /* A02 = -3.344184949259110251063e-01 */
> > -       .quad   0x3FA6BB9594603B67 /* A03 = +4.439990459660841243261e-02 */
> > -       .quad   0x3FCFD8ADB9ED944C /* A00 = +2.488000066615846384011e-01 */
> > -       .quad   0x3FE978C073F6809A /* A01 = +7.959902062321078108909e-01 */
> > -       .quad   0xBFD2DF7E00BCD5A9 /* A02 = -2.948908812716931060471e-01 */
> > -       .quad   0x3FA3614033D490B2 /* A03 = +3.785133965200894456959e-02 */
> > -       .quad   0x3FD4846A12AFE5A0 /* A00 = +3.205819303981005674586e-01 */
> > -       .quad   0x3FE63A1147D40472 /* A01 = +6.945883181471244061100e-01 */
> > -       .quad   0xBFCFA2268AD34450 /* A02 = -2.471359422548027318101e-01 */
> > -       .quad   0x3F9F150201D9FFE0 /* A03 = +3.035357605267552383310e-02 */
> > -       .quad   0x3FD9018641F82BEB /* A00 = +3.907180446846598154131e-01 */
> > -       .quad   0x3FE33B7C220FFBDC /* A01 = +6.010113396913498995389e-01 */
> > -       .quad   0xBFCA4E4187E29C86 /* A02 = -2.055131829740483584423e-01 */
> > -       .quad   0x3F98C30CED19F8F4 /* A03 = +2.418155858185229434287e-02 */
> > -       .quad   0x3FDD4B8255BEB078 /* A00 = +4.577337109901757905561e-01 */
> > -       .quad   0x3FE0858B19D3A49B /* A01 = +5.163016800335243905451e-01 */
> > -       .quad   0xBFC5BC929EACE564 /* A02 = -1.698172831327539045176e-01 */
> > -       .quad   0x3F93A083CE57DE2B /* A03 = +1.916700312537337677621e-02 */
> > -       .quad   0x3FE0A8E5E039295C /* A00 = +5.206174258576470315063e-01 */
> > -       .quad   0x3FDC35E1234583FE /* A01 = +4.407885403107342225937e-01 */
> > -       .quad   0xBFC1DE034E31AEB9 /* A02 = -1.395877963835710222629e-01 */
> > -       .quad   0x3F8EFDEBB3471BDC /* A03 = +1.513275280821162888101e-02 */
> > -       .quad   0x3FE2851B603CB2A5 /* A00 = +5.787484054213406503564e-01 */
> > -       .quad   0x3FD7F4A44ABBB286 /* A01 = +3.743067483726821853551e-01 */
> > -       .quad   0xBFBD3EEB67087DE7 /* A02 = -1.142413260026767657385e-01 */
> > -       .quad   0x3F8864F38329E8BD /* A03 = +1.191129917173260922836e-02 */
> > -       .quad   0x3FE437DBE3C34AC1 /* A00 = +6.318187187665317283702e-01 */
> > -       .quad   0x3FD43F6F789441B5 /* A01 = +3.163717916040938438194e-01 */
> > -       .quad   0xBFB7D92E7901B9A4 /* A02 = -9.315767721429907277653e-02 */
> > -       .quad   0x3F8327ED342308E1 /* A03 = +9.353497651663324544136e-03 */
> > -       .quad   0x3FE5C0977766D55C /* A00 = +6.797597248138731451661e-01 */
> > -       .quad   0x3FD10B42A764D8F9 /* A01 = +2.663122782427219115142e-01 */
> > -       .quad   0xBFB3633351D3D70F /* A02 = -7.573242900602060456716e-02 */
> > -       .quad   0x3F7E079E30FF899C /* A03 = +7.331483779099558922843e-03 */
> > -       .quad   0x3FE7202CE08A88C4 /* A00 = +7.226776490754436288455e-01 */
> > -       .quad   0x3FCC973EB5662B01 /* A01 = +2.233656297433626314319e-01 */
> > -       .quad   0xBFAF70A455F9920B /* A02 = -6.140626477716545211782e-02 */
> > -       .quad   0x3F77812411CE99B6 /* A03 = +5.738392731393584730859e-03 */
> > -       .quad   0x3FE85879424095B1 /* A00 = +7.608000082006382003286e-01 */
> > -       .quad   0x3FC7E73BD1674D84 /* A01 = +1.867441914060742336190e-01 */
> > -       .quad   0xBFA96F84E4BF333B /* A02 = -4.967894832916504993525e-02 */
> > -       .quad   0x3F72606DDCA6E117 /* A03 = +4.486493251924870105662e-03 */
> > -       .quad   0x3FE96BFE4957F4DD /* A00 = +7.944327766887472330737e-01 */
> > -       .quad   0x3FC3ED4780D25478 /* A01 = +1.556786898624158421711e-01 */
> > -       .quad   0xBFA489C5F9A56B58 /* A02 = -4.011362717093075458408e-02 */
> > -       .quad   0x3F6CB5DC17E9AD2A /* A03 = +3.504686231556104931972e-03 */
> > -       .quad   0x3FEA5D9CB2F41234 /* A00 = +8.239272589858672724006e-01 */
> > -       .quad   0x3FC091A758374DCF /* A01 = +1.294449978582705440555e-01 */
> > -       .quad   0xBFA08E436D4B5CE0 /* A02 = -3.233538350257858517978e-02 */
> > -       .quad   0x3F666997AD53E6B7 /* A03 = +2.735897297154145629133e-03 */
> > -       .quad   0x3FEB3060342CB850 /* A00 = +8.496552485501158713532e-01 */
> > -       .quad   0x3FBB7D30BBC7DC1B /* A01 = +1.073790033768634993860e-01 */
> > -       .quad   0xBF9AA6BA3443D9E3 /* A02 = -2.602663940430173170060e-02 */
> > -       .quad   0x3F617CA764B7850B /* A03 = +2.134634914668814050648e-03 */
> > -       .quad   0x3FEBE759A6A0C7B8 /* A00 = +8.719909910635044170135e-01 */
> > -       .quad   0x3FB6C10DE6A703FF /* A01 = +8.888327485239243264115e-02 */
> > -       .quad   0xBF956C566D8BE1F6 /* A02 = -2.092108768099084498138e-02 */
> > -       .quad   0x3F5B46D1A4A59CF8 /* A03 = +1.664833764687232917079e-03 */
> > -       .quad   0x3FEC858494887A04 /* A00 = +8.912985707318630268503e-01 */
> > -       .quad   0x3FB2CC31F543394D /* A01 = +7.342827070099140762682e-02 */
> > -       .quad   0xBF9133477FF69137 /* A02 = -1.679717749142747504343e-02 */
> > -       .quad   0x3F5544482FBB4DA5 /* A03 = +1.298017973501022466823e-03 */
> > -       .quad   0x3FED0DB59D0E32E9 /* A00 = +9.079235141267335551518e-01 */
> > -       .quad   0x3FAF006BAFFC6EF4 /* A01 = +6.055008433597022787787e-02 */
> > -       .quad   0xBF8B97146FA2B97A /* A02 = -1.347175565419144252499e-02 */
> > -       .quad   0x3F5093B01F4CDC69 /* A03 = +1.011774057770665211434e-03 */
> > -       .quad   0x3FEDB487C3EC457C /* A00 = +9.282873942012623835751e-01 */
> > -       .quad   0x3FA7390C09D0BD1D /* A01 = +4.535710925881118044112e-02 */
> > -       .quad   0xBF83D9F7C3181106 /* A02 = -9.693084374710735778846e-03 */
> > -       .quad   0x3F46E34A0A3C0E64 /* A03 = +6.984817050299072134500e-04 */
> > -       .quad   0x3FEE5FFCB4E6EB00 /* A00 = +9.492171796076434020506e-01 */
> > -       .quad   0x3F9F4913ED00AADF /* A01 = +3.055220731782070861526e-02 */
> > -       .quad   0xBF79670BD0E59B5C /* A02 = -6.201788097633133961528e-03 */
> > -       .quad   0x3F3BC998EBCAF96D /* A03 = +4.240034429975534616304e-04 */
> > -       .quad   0x3FEEDBA41E9542FE /* A00 = +9.643116566968215064293e-01 */
> > -       .quad   0x3F94F5DD18D9C24D /* A01 = +2.046914543319848858727e-02 */
> > -       .quad   0xBF7034896AA122B9 /* A02 = -3.956352980886528904192e-03 */
> > -       .quad   0x3F30DCCB47810B39 /* A03 = +2.573009765038273091199e-04 */
> > -       .quad   0x3FEF33F2882520ED /* A00 = +9.750912341196716903724e-01 */
> > -       .quad   0x3F8BF37F2CF553FF /* A01 = +1.364802699996836392315e-02 */
> > -       .quad   0xBF649F6F05A69619 /* A02 = -2.517430152880317534986e-03 */
> > -       .quad   0x3F247623C950AAC9 /* A03 = +1.561087307505231250044e-04 */
> > -       .quad   0x3FEF727757751741 /* A00 = +9.827229221489021115943e-01 */
> > -       .quad   0x3F828E67912C4400 /* A01 = +9.060677640748693306705e-03 */
> > -       .quad   0xBF5A2F51A806CC2C /* A02 = -1.598195784123355826789e-03 */
> > -       .quad   0x3F18D35D7687E613 /* A03 = +9.470231965016282719549e-05 */
> > -       .quad   0x3FEF9E6325C5942A /* A00 = +9.880843866091073568469e-01 */
> > -       .quad   0x3F788AB117618F76 /* A01 = +5.991641772286606867914e-03 */
> > -       .quad   0xBF5096EAB0B1EA89 /* A02 = -1.012543859160305046233e-03 */
> > -       .quad   0x3F0E1E50EC4435AB /* A03 = +5.744633156910412119652e-05 */
> > -       .quad   0x3FEFBD0784049369 /* A00 = +9.918248728250605994461e-01 */
> > -       .quad   0x3F702BBD8294035F /* A01 = +3.947963975634432264028e-03 */
> > -       .quad   0xBF44FB55E0F00593 /* A02 = -6.403130845457509273330e-04 */
> > -       .quad   0x3F0244DCD723230A /* A03 = +3.484534217219031730379e-05 */
> > -       .quad   0x3FEFD245E2366A43 /* A00 = +9.944180887426415926811e-01 */
> > -       .quad   0x3F653D82EC088433 /* A01 = +2.592807490387838333795e-03 */
> > -       .quad   0xBF3A7DF75E013CB8 /* A02 = -4.042366908878036561859e-04 */
> > -       .quad   0x3EF6298E69F991CD /* A03 = +2.113564425911141559972e-05 */
> > -       .quad   0x3FEFE0EAA508BC69 /* A00 = +9.962056372950317539861e-01 */
> > -       .quad   0x3F5BD0771AF3FDDA /* A01 = +1.697651208644282514598e-03 */
> > -       .quad   0xBF30B2E1254DE571 /* A02 = -2.548026725928887099328e-04 */
> > -       .quad   0x3EEAE28B70EC0256 /* A03 = +1.281973848454955042307e-05 */
> > -       .quad   0x3FEFEAF5303D7F96 /* A00 = +9.974313680831865536192e-01 */
> > -       .quad   0x3F5229111365657E /* A01 = +1.108423877289460134782e-03 */
> > -       .quad   0xBF250572D04DFE66 /* A02 = -1.603796628408704519168e-04 */
> > -       .quad   0x3EE04E89BB57C981 /* A03 = +7.775682983689149966743e-06 */
> > -       .quad   0x3FEFF1CF52F1CF44 /* A00 = +9.982678051005469122003e-01 */
> > -       .quad   0x3F47A71316147CEB /* A01 = +7.218211359577819110842e-04 */
> > -       .quad   0xBF1A6D7604055719 /* A02 = -1.008132248946049582547e-04 */
> > -       .quad   0x3ED3C8047586A85C /* A03 = +4.716233739913014633626e-06 */
> > -       .quad   0x3FEFF6770369EF69 /* A00 = +9.988360468555416149528e-01 */
> > -       .quad   0x3F3EBB261180FBF0 /* A01 = +4.689186039321105101130e-04 */
> > -       .quad   0xBF1097754FE19D7F /* A02 = -6.329206004950480057066e-05 */
> > -       .quad   0x3EC7FEFF83BCA0A7 /* A03 = +2.860556404988488738366e-06 */
> > -       .quad   0x3FEFF99D42371AC4 /* A00 = +9.992204945818561334647e-01 */
> > -       .quad   0x3F33EB2AEC271F59 /* A01 = +3.039340773764907474054e-04 */
> > -       .quad   0xBF04CF18E0FC0D79 /* A02 = -3.968996690952969588805e-05 */
> > -       .quad   0x3EBD1BDBD6019BE9 /* A03 = +1.735021065507727833886e-06 */
> > -       .quad   0x3FEFFBBCA32B0D91 /* A00 = +9.994795977476532700123e-01 */
> > -       .quad   0x3F29C41E1615110A /* A01 = +1.965796209707565346710e-04 */
> > -       .quad   0xBEFA11F93D9DCB5A /* A02 = -2.486248909101414873235e-05 */
> > -       .quad   0x3EB1A7CA4546F7A7 /* A03 = +1.052345642723709228769e-06 */
> > -       .quad   0x3FEFFD298B8E8DE2 /* A00 = +9.996535993308806045121e-01 */
> > -       .quad   0x3F20A1C42D523C5B /* A01 = +1.268913244172078754520e-04 */
> > -       .quad   0xBEF0507A364AFAE4 /* A02 = -1.555859070622834605755e-05 */
> > -       .quad   0x3EA56ACA17E7CDF4 /* A03 = +6.382806956848098872313e-07 */
> > -       .quad   0x3FEFFE1DC82BA5A3 /* A00 = +9.997700604991915929176e-01 */
> > -       .quad   0x3F156E73B90F1769 /* A01 = +8.175450626798714452801e-05 */
> > -       .quad   0xBEE4663579D0A09F /* A02 = -9.727122057226747625365e-06 */
> > -       .quad   0x3E99FAF6FEC5D4C1 /* A03 = +3.871371052824002996020e-07 */
> > -       .quad   0x3FEFFEF8D0BB5E81 /* A00 = +9.998745037837154514548e-01 */
> > -       .quad   0x3F06686DA18D39C3 /* A01 = +4.273972098777251447726e-05 */
> > -       .quad   0xBED46BC298073E90 /* A02 = -4.868731025855742842491e-06 */
> > -       .quad   0x3E88E42286B9D0FD /* A03 = +1.854535328530838170114e-07 */
> > -       .quad   0x3FEFFF8DBC68DDC7 /* A00 = +9.999455146670975791423e-01 */
> > -       .quad   0x3EF26B2953A80AF0 /* A01 = +1.756534514108903368909e-05 */
> > -       .quad   0xBEBFC4472D580F83 /* A02 = -1.893443529411295465239e-06 */
> > -       .quad   0x3E72505B4553D19F /* A03 = +6.822456673547912277047e-08 */
> > -       .quad   0x3FEFFFCED1276609 /* A00 = +9.999765477215883935358e-01 */
> > -       .quad   0x3EDE1A94C7CC58F5 /* A01 = +7.177313020153979672606e-06 */
> > -       .quad   0xBEA8A2C988744E57 /* A02 = -7.342066660497443762363e-07 */
> > -       .quad   0x3E5AF30036BBBAF4 /* A03 = +2.509841882843541084885e-08 */
> > -       .quad   0x3FEFFFEAFE70FCFC /* A00 = +9.999899835164849370983e-01 */
> > -       .quad   0x3EC879175E3549F5 /* A01 = +2.917410471128503564412e-06 */
> > -       .quad   0xBE930E36677D1813 /* A02 = -2.839493400307523115929e-07 */
> > -       .quad   0x3E43D4005B42D48F /* A03 = +9.233192745401904898013e-09 */
> > -       .quad   0x3ff0000000000000
> > -       .quad   0x0000000000000000
> > -       .quad   0x0000000000000000
> > -       .quad   0x0000000000000000
> > -       .align  16
> > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSignMask */
> > -       .align  16
> > -       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _sAbsMask */
> > -       .align  16
> > -       .long   0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000 /* _iExpMantMask */
> > -       .align  16
> > -       .long   0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000 /* _iExpMask */
> > -       .align  16
> > -       .long   0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000 /* _iMinIdxOfsMask */
> > -       .align  16
> > -       .long   0x04280000, 0x04280000, 0x04280000, 0x04280000 /* _iMaxIdxMask */
> > -       .align  16
> > -       .type   __svml_stanh_data_internal, @object
> > -       .size   __svml_stanh_data_internal, .-__svml_stanh_data_internal
> > +       /* Cold case. edx has 1s where there was a special value that
> > +          needs to be handled by a tanhf call. Optimize for code size
> > +          moreso than speed here. */
> > +L(SPECIAL_VALUES_BRANCH):
> > +       # LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm12
> > +       /* Stack coming in 16-byte aligned. Set 8-byte misaligned so on
> > +       call entry will be 16-byte aligned. */
> > +       subq    $56, %rsp
>
> There is no CFI adjustment.

I think I fixed up the CFI issues in all files in V2.
>
> > +
> > +       movups  %xmm0, 24(%rsp)
> > +       movups  %xmm12, 40(%rsp)
> > +
> > +       /* Use rbx/rbp for callee save registers as they get short
> > +       encoding for many instructions (as compared with r12/r13). */
> > +       movq    %rbx, (%rsp)
> > +       cfi_offset(rbx, -16)
>
> Is this CFI correct?
>
> > +       movq    %rbp, 8(%rsp)
> > +       cfi_offset(rbp, -8)
> > +       /* edx has 1s where there was a special value that needs to be handled
> > +          by a tanhf call.  */
> > +       movl    %edx, %ebx
> > +L(SPECIAL_VALUES_LOOP):
> > +       # LOE rbx rbp r12 r13 r14 r15
> > +       /* use rbp as index for special value that is saved across calls to
> > +          tanhf. We technically don't need a callee save register here as offset
> > +          to rsp is always [0, 12] so we can restore rsp by realigning to 64.
> > +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> > +          in the loop.  */
> > +       xorl    %ebp, %ebp
> > +       bsfl    %ebx, %ebp
> > +
> > +       /* Scalar math function call to process special input.  */
> > +       movss   40(%rsp, %rbp, 4), %xmm0
> > +       call    tanhf@PLT
> > +       /* No good way to avoid the store-forwarding fault this will cause on
> > +          return. `lfence` avoids the SF fault but at greater cost as it
> > +          serialized stack/callee save restoration.  */
> > +       movss   %xmm0, 24(%rsp, %rbp, 4)
> > +
> > +       leal    -1(%rbx), %eax
> > +       andl    %eax, %ebx
> > +       jnz     L(SPECIAL_VALUES_LOOP)
> > +       # LOE r12 r13 r14 r15
> > +       /* All results have been written to 24(%rsp).  */
> > +       movups  24(%rsp), %xmm0
> > +       movq    (%rsp), %rbx
> > +       cfi_restore(rbx)
> > +       movq    8(%rsp), %rbp
> > +       cfi_restore(rbp)
> > +       addq    $56, %rsp
> > +       ret
> > +END(_ZGVbN4v_tanhf_sse4)
> > --
> > 2.34.1
> >
>
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v2 1/7] x86: Improve svml_s_atanhf16_core_avx512.S
  2022-06-09  0:05 ` [PATCH v2 1/7] x86: Improve svml_s_atanhf16_core_avx512.S Noah Goldstein
                     ` (5 preceding siblings ...)
  2022-06-09  0:05   ` [PATCH v2 7/7] x86: Optimize svml_s_tanhf4_core_sse4.S Noah Goldstein
@ 2022-06-09 15:59   ` H.J. Lu
  2022-06-09 16:56     ` Noah Goldstein
  6 siblings, 1 reply; 48+ messages in thread
From: H.J. Lu @ 2022-06-09 15:59 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Jun 8, 2022 at 5:05 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Improvements are:
>     1. Reduce code size (-64 bytes).
>     2. Remove redundant move instructions.
>     3. Slightly improve instruction selection/scheduling where
>        possible.
>     4. Reduce rodata size ([-128, -188] bytes).
>
> The throughput improvement is not significant as the port 0 bottleneck
> is unavoidable.
>         Function, New Time, Old Time, New / Old
> _ZGVeN16v_atanhf,     1.39,    1.408,     0.987
> ---
>  .../multiarch/svml_s_atanhf16_core_avx512.S   | 474 +++++++++---------
>  1 file changed, 244 insertions(+), 230 deletions(-)
>
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> index a1cd920a0f..3d808ac2bd 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> @@ -31,53 +31,50 @@
>   *
>   */
>
> -/* Offsets for data table __svml_satanh_data_internal_avx512
> - */
> -#define Log_tbl_H                      0
> -#define Log_tbl_L                      128
> -#define One                            256
> -#define AbsMask                                320
> -#define AddB5                          384
> -#define RcpBitMask                     448
> -#define poly_coeff3                    512
> -#define poly_coeff2                    576
> -#define poly_coeff1                    640
> -#define poly_coeff0                    704
> -#define Half                           768
> -#define L2H                            832
> -#define L2L                            896
> +/* Offsets for data table __svml_satanh_data_internal_avx512 and
> +   __svml_satanh_data_internal_avx512_al64. Ordered by use in the
> +   function. On cold-starts this might help the prefetcher. Possibly
> +   a better idea is to interleave start/end so that the prefetcher is
> +   less likely to detect a stream and pull irrelevant lines into
> +   cache.  */
> +
> +/* Offset into __svml_satanh_data_internal_avx512. 4-byte aligned as
> +   the memory is broadcast to {1to16}.  */
> +#define AbsMask                                0
> +
> +/* Offset into __svml_satanh_data_internal_avx512_al64. The full value
> +   is used here.  */
> +#define One                            0
> +#define AddB5                          64
> +#define RcpBitMask                     128
> +#define Log_tbl_L_lo                   192
> +#define Log_tbl_L_hi                   256
> +#define Log_tbl_H_lo                   320
> +#define Log_tbl_H_hi                   384
> +#define L2H                            448
> +#define L2L                            512
> +#define poly_coeff3                    576
> +#define poly_coeff2                    640
> +#define poly_coeff1                    704
>
>  #include <sysdep.h>
>
> +#define ATANHF_DATA(x)                 ((x)+__svml_satanh_data_internal_avx512_al64)
> +
>         .section .text.exex512, "ax", @progbits
>  ENTRY(_ZGVeN16v_atanhf_skx)
> -       pushq   %rbp
> -       cfi_def_cfa_offset(16)
> -       movq    %rsp, %rbp
> -       cfi_def_cfa(6, 16)
> -       cfi_offset(6, -16)
> -       andq    $-64, %rsp
> -       subq    $192, %rsp
> -       vmovups One+__svml_satanh_data_internal_avx512(%rip), %zmm4
> -
> -       /* round reciprocals to 1+5b mantissas */
> -       vmovups AddB5+__svml_satanh_data_internal_avx512(%rip), %zmm14
> -       vmovups RcpBitMask+__svml_satanh_data_internal_avx512(%rip), %zmm1
> -       vmovaps %zmm0, %zmm11
> -       vandps  AbsMask+__svml_satanh_data_internal_avx512(%rip), %zmm11, %zmm6
> +       vandps  AbsMask+__svml_satanh_data_internal_avx512(%rip){1to16}, %zmm0, %zmm6
> +       vmovups ATANHF_DATA(One)(%rip), %zmm4
>
>         /* 1+y */
>         vaddps  {rn-sae}, %zmm4, %zmm6, %zmm9
>
>         /* 1-y */
>         vsubps  {rn-sae}, %zmm6, %zmm4, %zmm8
> -       vxorps  %zmm6, %zmm11, %zmm10
> -
> -       /* Yp_high */
> -       vsubps  {rn-sae}, %zmm4, %zmm9, %zmm2
>
> -       /* -Ym_high */
> -       vsubps  {rn-sae}, %zmm4, %zmm8, %zmm5
> +       /* round reciprocals to 1+5b mantissas */
> +       vmovups ATANHF_DATA(AddB5)(%rip), %zmm14
> +       vmovups ATANHF_DATA(RcpBitMask)(%rip), %zmm1
>
>         /* RcpP ~ 1/Yp */
>         vrcp14ps %zmm9, %zmm12
> @@ -85,15 +82,21 @@ ENTRY(_ZGVeN16v_atanhf_skx)
>         /* RcpM ~ 1/Ym */
>         vrcp14ps %zmm8, %zmm13
>
> +       /* Yp_high */
> +       vsubps  {rn-sae}, %zmm4, %zmm9, %zmm2
> +
> +       /* -Ym_high */
> +       vsubps  {rn-sae}, %zmm4, %zmm8, %zmm5
> +
> +
>         /* input outside (-1, 1) ? */
> -       vcmpps  $21, {sae}, %zmm4, %zmm6, %k0
>         vpaddd  %zmm14, %zmm12, %zmm15
> -       vpaddd  %zmm14, %zmm13, %zmm0
> +       vpaddd  %zmm14, %zmm13, %zmm12
>
>         /* Yp_low */
>         vsubps  {rn-sae}, %zmm2, %zmm6, %zmm3
>         vandps  %zmm1, %zmm15, %zmm7
> -       vandps  %zmm1, %zmm0, %zmm12
> +       vandps  %zmm1, %zmm12, %zmm12
>
>         /* Ym_low */
>         vaddps  {rn-sae}, %zmm5, %zmm6, %zmm5
> @@ -102,225 +105,199 @@ ENTRY(_ZGVeN16v_atanhf_skx)
>         vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9
>
>         /* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low */
> -       vfmsub231ps {rn-sae}, %zmm12, %zmm8, %zmm4
> -       vmovups Log_tbl_L+__svml_satanh_data_internal_avx512(%rip), %zmm8
> -       vmovups Log_tbl_L+64+__svml_satanh_data_internal_avx512(%rip), %zmm13
> +       vfmsub213ps {rn-sae}, %zmm4, %zmm12, %zmm8
> +
> +       vmovups ATANHF_DATA(Log_tbl_L_lo)(%rip), %zmm10
> +       vmovups ATANHF_DATA(Log_tbl_L_hi)(%rip), %zmm13
>
>         /* exponents */
> -       vgetexpps {sae}, %zmm7, %zmm15
>         vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9
> +       vgetexpps {sae}, %zmm7, %zmm15
> +
>
>         /* Table lookups */
> -       vmovups __svml_satanh_data_internal_avx512(%rip), %zmm6
> +       vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm8
>         vgetexpps {sae}, %zmm12, %zmm14
> -       vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm4
> +
>
>         /* Prepare table index */
>         vpsrld  $18, %zmm7, %zmm3
>         vpsrld  $18, %zmm12, %zmm2
> -       vmovups Log_tbl_H+64+__svml_satanh_data_internal_avx512(%rip), %zmm7
> -       vmovups poly_coeff1+__svml_satanh_data_internal_avx512(%rip), %zmm12
> -
> +       vmovups ATANHF_DATA(Log_tbl_H_lo)(%rip), %zmm11
> +       vmovups ATANHF_DATA(Log_tbl_H_hi)(%rip), %zmm7
>         /* Km-Kp */
> +
> +       vmovaps %zmm3, %zmm5
> +       vpermi2ps %zmm13, %zmm10, %zmm3
> +       vpermt2ps %zmm13, %zmm2, %zmm10
> +       vpermi2ps %zmm7, %zmm11, %zmm5
> +       vpermt2ps %zmm7, %zmm2, %zmm11
>         vsubps  {rn-sae}, %zmm15, %zmm14, %zmm1
> -       kmovw   %k0, %edx
> -       vmovaps %zmm3, %zmm0
> -       vpermi2ps %zmm13, %zmm8, %zmm3
> -       vpermt2ps %zmm13, %zmm2, %zmm8
> -       vpermi2ps %zmm7, %zmm6, %zmm0
> -       vpermt2ps %zmm7, %zmm2, %zmm6
> -       vsubps  {rn-sae}, %zmm3, %zmm8, %zmm5
> +       vsubps  {rn-sae}, %zmm3, %zmm10, %zmm7
>
>         /* K*L2H + Th */
> -       vmovups L2H+__svml_satanh_data_internal_avx512(%rip), %zmm2
> +       vmovups ATANHF_DATA(L2H)(%rip), %zmm2
>
>         /* K*L2L + Tl */
> -       vmovups L2L+__svml_satanh_data_internal_avx512(%rip), %zmm3
> -
> -       /* polynomials */
> -       vmovups poly_coeff3+__svml_satanh_data_internal_avx512(%rip), %zmm7
> -       vmovups poly_coeff0+__svml_satanh_data_internal_avx512(%rip), %zmm13
> +       vmovups ATANHF_DATA(L2L)(%rip), %zmm3
>
>         /* table values */
> -       vsubps  {rn-sae}, %zmm0, %zmm6, %zmm0
> -       vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm0
> -       vfmadd213ps {rn-sae}, %zmm5, %zmm3, %zmm1
> -       vmovups poly_coeff2+__svml_satanh_data_internal_avx512(%rip), %zmm3
> -       vmovaps %zmm3, %zmm2
> -       vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm2
> -       vfmadd231ps {rn-sae}, %zmm4, %zmm7, %zmm3
> -       vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm2
> -       vfmadd213ps {rn-sae}, %zmm12, %zmm4, %zmm3
> -       vfmadd213ps {rn-sae}, %zmm13, %zmm9, %zmm2
> -       vfmadd213ps {rn-sae}, %zmm13, %zmm4, %zmm3
> +       vsubps  {rn-sae}, %zmm5, %zmm11, %zmm5
> +       vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm5
> +       vfmadd213ps {rn-sae}, %zmm7, %zmm3, %zmm1
> +       /* polynomials */
> +       vmovups ATANHF_DATA(poly_coeff3)(%rip), %zmm7
> +       vmovups ATANHF_DATA(poly_coeff2)(%rip), %zmm10
> +       vmovaps %zmm10, %zmm14
> +       vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm10
> +       vfmadd231ps {rn-sae}, %zmm8, %zmm7, %zmm14
> +       vmovups ATANHF_DATA(poly_coeff1)(%rip), %zmm12
> +       vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm10
> +       vfmadd213ps {rn-sae}, %zmm12, %zmm8, %zmm14
> +       vfmadd213ps {rn-sae}, %zmm4, %zmm9, %zmm10
> +       vfmadd213ps {rn-sae}, %zmm4, %zmm8, %zmm14
>
>         /* (K*L2L + Tl) + Rp*PolyP */
> -       vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm2
> -       vorps   Half+__svml_satanh_data_internal_avx512(%rip), %zmm10, %zmm9
> +       vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm10
> +
> +       /* zmm12 = zmm12 & (zmm4 | zmm0).  */
> +       vpternlogq $0xe0, %zmm0, %zmm4, %zmm12
>
>         /* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM */
> -       vfnmadd213ps {rn-sae}, %zmm2, %zmm4, %zmm3
> -       vaddps  {rn-sae}, %zmm3, %zmm0, %zmm4
> -       vmulps  {rn-sae}, %zmm9, %zmm4, %zmm0
> +       vfnmadd213ps {rn-sae}, %zmm5, %zmm8, %zmm14
> +       vaddps  {rn-sae}, %zmm14, %zmm10, %zmm8
> +
> +       vcmpps  $21, {sae}, %zmm4, %zmm6, %k0
> +       kmovw   %k0, %edx
>         testl   %edx, %edx
>
>         /* Go to special inputs processing branch */
>         jne     L(SPECIAL_VALUES_BRANCH)
> -       # LOE rbx r12 r13 r14 r15 edx zmm0 zmm11
> +       # LOE rbx r12 r13 r14 r15 zmm0 zmm8 zmm12
> +       vmulps  {rn-sae}, %zmm12, %zmm8, %zmm0
>
> -       /* Restore registers
> -        * and exit the function
> -        */
> -
> -L(EXIT):
> -       movq    %rbp, %rsp
> -       popq    %rbp
> -       cfi_def_cfa(7, 8)
> -       cfi_restore(6)
> +       /* No register to restore on fast path.  */
>         ret
> -       cfi_def_cfa(6, 16)
> -       cfi_offset(6, -16)
> -
> -       /* Branch to process
> -        * special inputs
> -        */
>
> +       /* Cold case. edx has 1s where there was a special value that
> +          needs to be handled by a atanhf call. Optimize for code size
> +          moreso than speed here. */
>  L(SPECIAL_VALUES_BRANCH):
> -       vmovups %zmm11, 64(%rsp)
> -       vmovups %zmm0, 128(%rsp)
> -       # LOE rbx r12 r13 r14 r15 edx zmm0
> -
> -       xorl    %eax, %eax
> -       # LOE rbx r12 r13 r14 r15 eax edx
> -
> -       vzeroupper
> -       movq    %r12, 16(%rsp)
> -       /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> -       movl    %eax, %r12d
> -       movq    %r13, 8(%rsp)
> -       /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> -       movl    %edx, %r13d
> -       movq    %r14, (%rsp)
> -       /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> -       # LOE rbx r15 r12d r13d
> -
> -       /* Range mask
> -        * bits check
> +       # LOE rbx rdx r12 r13 r14 r15 zmm0 zmm8 zmm12
> +    /* Use r13 to save/restore the stack. This allows us to use rbp as
> +       callee save register saving code size. */
> +       pushq   %r13
> +       cfi_adjust_cfa_offset(8)
> +       cfi_offset(r13, -16)
> +       /* Need to callee save registers to preserve state across atanhf calls.
>          */
> +       pushq   %rbx
> +       cfi_adjust_cfa_offset(8)
> +       cfi_offset(rbx, -24)
> +       pushq   %rbp
> +       cfi_adjust_cfa_offset(8)
> +       cfi_offset(rbp, -32)
> +       movq    %rsp, %r13
> +       cfi_def_cfa_register(r13)
>
> -L(RANGEMASK_CHECK):
> -       btl     %r12d, %r13d
> -
> -       /* Call scalar math function */
> -       jc      L(SCALAR_MATH_CALL)
> -       # LOE rbx r15 r12d r13d
> -
> -       /* Special inputs
> -        * processing loop
> -        */
> +       /* Align stack and make room for 2x zmm vectors.  */
> +       andq    $-64, %rsp
> +       addq    $-128, %rsp
> +       vmulps  {rn-sae}, %zmm12, %zmm8, %zmm1
> +       vmovaps %zmm1, (%rsp)
> +       vmovaps %zmm0, 64(%rsp)
> +       vzeroupper
>
> +       /* edx has 1s where there was a special value that needs to be handled
> +          by a atanhf call.  */
> +       movl    %edx, %ebx
>  L(SPECIAL_VALUES_LOOP):
> -       incl    %r12d
> -       cmpl    $16, %r12d
> -
> -       /* Check bits in range mask */
> -       jl      L(RANGEMASK_CHECK)
> -       # LOE rbx r15 r12d r13d
> -
> -       movq    16(%rsp), %r12
> -       cfi_restore(12)
> -       movq    8(%rsp), %r13
> -       cfi_restore(13)
> -       movq    (%rsp), %r14
> -       cfi_restore(14)
> -       vmovups 128(%rsp), %zmm0
> -
> -       /* Go to exit */
> -       jmp     L(EXIT)
> -       /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> -       /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> -       /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> -       # LOE rbx r12 r13 r14 r15 zmm0
> -
> -       /* Scalar math fucntion call
> -        * to process special input
> -        */
> -
> -L(SCALAR_MATH_CALL):
> -       movl    %r12d, %r14d
> -       movss   64(%rsp, %r14, 4), %xmm0
> +       # LOE rbx rbp r12 r13 r14 r15
> +       /* use rbp as index for special value that is saved across calls to
> +          atanhf. We technically don't need a callee save register here as offset
> +          to rsp is always [0, 56] so we can restore rsp by realigning to 64.
> +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> +          in the loop. Realigning also costs more code size.  */
> +       xorl    %ebp, %ebp
> +       tzcntl  %ebx, %ebp
> +
> +       /* Scalar math function call to process special input.  */
> +       movss   64(%rsp, %rbp, 4), %xmm0
>         call    atanhf@PLT
> -       # LOE rbx r14 r15 r12d r13d xmm0
> -
> -       movss   %xmm0, 128(%rsp, %r14, 4)
>
> -       /* Process special inputs in loop */
> -       jmp     L(SPECIAL_VALUES_LOOP)
> -       # LOE rbx r15 r12d r13d
> +       /* No good way to avoid the store-forwarding fault this will cause on
> +          return. `lfence` avoids the SF fault but at greater cost as it
> +          serialized stack/callee save restoration.  */
> +       movss   %xmm0, (%rsp, %rbp, 4)
> +
> +       blsrl   %ebx, %ebx
> +       jnz     L(SPECIAL_VALUES_LOOP)
> +       # LOE r12 r13 r14 r15
> +
> +       /* All results have been written to 64(%rsp).  */

The return value is loaded from (%rsp), but the comment above says
64(%rsp).  Should the comment say that all results have been written
to (%rsp)?

> +       vmovaps (%rsp), %zmm0
> +       /* Restore rsp.  */
> +       movq    %r13, %rsp
> +       cfi_def_cfa_register(rsp)
> +       /* Restore callee save registers.  */
> +       popq    %rbp
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_restore(rbp)
> +       popq    %rbx
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_restore(rbp)
> +       popq    %r13
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_restore(r13)
> +       ret
>  END(_ZGVeN16v_atanhf_skx)
>
>         .section .rodata, "a"
> -       .align  64
> -
> +       .align  4
>  #ifdef __svml_satanh_data_internal_avx512_typedef
>  typedef unsigned int VUINT32;
> -typedef struct {
> -       __declspec(align(64)) VUINT32 Log_tbl_H[32][1];
> -       __declspec(align(64)) VUINT32 Log_tbl_L[32][1];
> +typedef struct{
> +       __declspec(align(4)) VUINT32 AbsMask[1][1];
>         __declspec(align(64)) VUINT32 One[16][1];
> -       __declspec(align(64)) VUINT32 AbsMask[16][1];
>         __declspec(align(64)) VUINT32 AddB5[16][1];
>         __declspec(align(64)) VUINT32 RcpBitMask[16][1];
> +       __declspec(align(64)) VUINT32 Log_tbl_L_lo[16][1];
> +       __declspec(align(64)) VUINT32 Log_tbl_L_hi[16][1];
> +       __declspec(align(64)) VUINT32 Log_tbl_H_lo[16][1];
> +       __declspec(align(64)) VUINT32 Log_tbl_H_hi[16][1];
> +       __declspec(align(64)) VUINT32 L2H[16][1];
> +       __declspec(align(64)) VUINT32 L2L[16][1];
>         __declspec(align(64)) VUINT32 poly_coeff3[16][1];
>         __declspec(align(64)) VUINT32 poly_coeff2[16][1];
>         __declspec(align(64)) VUINT32 poly_coeff1[16][1];
> -       __declspec(align(64)) VUINT32 poly_coeff0[16][1];
> -       __declspec(align(64)) VUINT32 Half[16][1];
> -       __declspec(align(64)) VUINT32 L2H[16][1];
> -       __declspec(align(64)) VUINT32 L2L[16][1];
>  } __svml_satanh_data_internal_avx512;
>  #endif
>  __svml_satanh_data_internal_avx512:
> -       /* Log_tbl_H */
> -       .long   0x00000000
> -       .long   0x3cfc0000
> -       .long   0x3d780000
> -       .long   0x3db78000
> -       .long   0x3df10000
> -       .long   0x3e14c000
> -       .long   0x3e300000
> -       .long   0x3e4a8000
> -       .long   0x3e648000
> -       .long   0x3e7dc000
> -       .long   0x3e8b4000
> -       .long   0x3e974000
> -       .long   0x3ea30000
> -       .long   0x3eae8000
> -       .long   0x3eb9c000
> -       .long   0x3ec4e000
> -       .long   0x3ecfa000
> -       .long   0x3eda2000
> -       .long   0x3ee48000
> -       .long   0x3eeea000
> -       .long   0x3ef8a000
> -       .long   0x3f013000
> -       .long   0x3f05f000
> -       .long   0x3f0aa000
> -       .long   0x3f0f4000
> -       .long   0x3f13d000
> -       .long   0x3f184000
> -       .long   0x3f1ca000
> -       .long   0x3f20f000
> -       .long   0x3f252000
> -       .long   0x3f295000
> -       .long   0x3f2d7000
> -       /* Log_tbl_L */
> +       /* Leave this at front so we can potentially save space due to
> +          smaller alignment constraint.  */
> +       .align  4
> +    /* AbsMask */
> +       .long   0x7fffffff
> +       .align  64
> +__svml_satanh_data_internal_avx512_al64:
> +       /* One */
> +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> +       /* AddB5 */
> +       .align  64
> +       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000
> +       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000
> +       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000
> +       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000
> +       /* RcpBitMask */
> +       .align  64
> +       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> +       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> +       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> +       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> +       /* Log_tbl_L_lo */
>         .align  64
>         .long   0x00000000
>         .long   0x3726c39e
> @@ -338,6 +315,8 @@ __svml_satanh_data_internal_avx512:
>         .long   0x38dedfac
>         .long   0x38ebfb5e
>         .long   0xb8e63c9f
> +       /* Log_tbl_L_hi */
> +       .align  64
>         .long   0xb85c1340
>         .long   0x38777bcd
>         .long   0xb6038656
> @@ -354,39 +333,74 @@ __svml_satanh_data_internal_avx512:
>         .long   0x38f85db0
>         .long   0x37b4996f
>         .long   0xb8bfb3ca
> -       /* One */
> +       /* Log_tbl_H_lo */
>         .align  64
> -       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> -       /* AbsMask */
> +       .long   0x00000000
> +       .long   0x3cfc0000
> +       .long   0x3d780000
> +       .long   0x3db78000
> +       .long   0x3df10000
> +       .long   0x3e14c000
> +       .long   0x3e300000
> +       .long   0x3e4a8000
> +       .long   0x3e648000
> +       .long   0x3e7dc000
> +       .long   0x3e8b4000
> +       .long   0x3e974000
> +       .long   0x3ea30000
> +       .long   0x3eae8000
> +       .long   0x3eb9c000
> +       .long   0x3ec4e000
> +       /* Log_tbl_H_hi */
>         .align  64
> -       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> -       /* AddB5 */
> +       .long   0x3ecfa000
> +       .long   0x3eda2000
> +       .long   0x3ee48000
> +       .long   0x3eeea000
> +       .long   0x3ef8a000
> +       .long   0x3f013000
> +       .long   0x3f05f000
> +       .long   0x3f0aa000
> +       .long   0x3f0f4000
> +       .long   0x3f13d000
> +       .long   0x3f184000
> +       .long   0x3f1ca000
> +       .long   0x3f20f000
> +       .long   0x3f252000
> +       .long   0x3f295000
> +       .long   0x3f2d7000
> +       /* L2H = log(2)_high */
>         .align  64
> -       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000
> -       /* RcpBitMask */
> +       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> +       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> +       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> +       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> +       /* L2L = log(2)_low */
>         .align  64
> -       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> +       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> +       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> +       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> +       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
>         /* poly_coeff3 */
>         .align  64
> -       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> +       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> +       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> +       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> +       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
>         /* poly_coeff2 */
>         .align  64
> -       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> +       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> +       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> +       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> +       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
>         /* poly_coeff1 */
>         .align  64
> -       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> -       /* poly_coeff0 */
> -       .align  64
> -       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> -       /* Half */
> -       .align  64
> -       .long   0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
> -       /* L2H = log(2)_high */
> -       .align  64
> -       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> -       /* L2L = log(2)_low */
> -       .align  64
> -       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
>         .align  64
> +       .type   __svml_satanh_data_internal_avx512_al64, @object
> +       .size   __svml_satanh_data_internal_avx512_al64, .-__svml_satanh_data_internal_avx512_al64
>         .type   __svml_satanh_data_internal_avx512, @object
>         .size   __svml_satanh_data_internal_avx512, .-__svml_satanh_data_internal_avx512
> --
> 2.34.1
>


-- 
H.J.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v2 2/7] x86: Improvement svml_s_atanhf8_core_avx2.S
  2022-06-09  0:05   ` [PATCH v2 2/7] x86: Improvement svml_s_atanhf8_core_avx2.S Noah Goldstein
@ 2022-06-09 16:01     ` H.J. Lu
  2022-06-09 16:56       ` Noah Goldstein
  0 siblings, 1 reply; 48+ messages in thread
From: H.J. Lu @ 2022-06-09 16:01 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Jun 8, 2022 at 5:05 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Improvements are:
>     1. Reduce code size (-60 bytes).
>     2. Remove redundant move instructions.
>     3. Slightly improve instruction selection/scheduling where
>        possible.
>     4. Prefer registers which get short instruction encoding.
>     5. Shrink rodata usage (-32 bytes).
>
> The throughput improvement is not that significant (3-5%) as the
> port 0 bottleneck is unavoidable.
>
>        Function, New Time, Old Time, New / Old
> _ZGVdN8v_atanhf,    2.799,    2.923,     0.958
> ---
>  .../fpu/multiarch/svml_s_atanhf8_core_avx2.S  | 405 +++++++++---------
>  1 file changed, 202 insertions(+), 203 deletions(-)
>
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
> index c1ea1c3353..6113d366c2 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
> @@ -30,305 +30,304 @@
>   *
>   */
>
> -/* Offsets for data table __svml_satanh_data_internal
> - */
> +/* Offsets for data table __svml_satanh_data_internal. Ordered
> +   by use in the function. On cold-starts this might help the
> +   prefetcher. Possibly a better idea is to interleave start/end so
> +   that the prefetcher is less likely to detect a stream and pull
> +   irrelevant lines into cache.  */
>  #define SgnMask                                0
>  #define sOne                           32
> -#define sPoly                          64
> -#define iBrkValue                      320
> -#define iOffExpoMask                   352
> -#define sHalf                          384
> -#define sSign                          416
> -#define sTopMask12                     448
> -#define TinyRange                      480
> -#define sLn2                           512
> +#define sTopMask12                     64
> +#define TinyRange                      96
> +#define iBrkValue                      128
> +#define iOffExpoMask                   160
> +#define sPoly                          192
> +#define sLn2                           448
> +#define sHalf                          480
>
>  #include <sysdep.h>
> +#define ATANHF_DATA(x)                 ((x)+__svml_satanh_data_internal)
>
>         .section .text.avx2, "ax", @progbits
>  ENTRY(_ZGVdN8v_atanhf_avx2)
> -       pushq   %rbp
> -       cfi_def_cfa_offset(16)
> -       movq    %rsp, %rbp
> -       cfi_def_cfa(6, 16)
> -       cfi_offset(6, -16)
> -       andq    $-32, %rsp
> -       subq    $96, %rsp
> -
> +       /* Strip off the sign, so treat X as positive until right at the end */
> +       vmovaps ATANHF_DATA(SgnMask)(%rip), %ymm2
> +       vandps  %ymm2, %ymm0, %ymm3
>         /* Load constants including One = 1 */
> -       vmovups sOne+__svml_satanh_data_internal(%rip), %ymm5
> -       vmovups sTopMask12+__svml_satanh_data_internal(%rip), %ymm13
> -       vmovaps %ymm0, %ymm6
> +       vmovups ATANHF_DATA(sOne)(%rip), %ymm5
> +       vsubps  %ymm3, %ymm5, %ymm1
> +       vmovups ATANHF_DATA(sTopMask12)(%rip), %ymm4
>
> -       /* Strip off the sign, so treat X as positive until right at the end */
> -       vandps  SgnMask+__svml_satanh_data_internal(%rip), %ymm6, %ymm10
> -       vsubps  %ymm10, %ymm5, %ymm1
> +       vrcpps  %ymm1, %ymm7
> +       vsubps  %ymm1, %ymm5, %ymm9
> +       vandps  %ymm4, %ymm7, %ymm6
> +       vsubps  %ymm3, %ymm9, %ymm7
>
> -       /*
> -        * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
> -        * the upper part UHi being <= 12 bits long. Then we have
> -        * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
> -        */
> -       vaddps  %ymm10, %ymm10, %ymm14
> +       /* No need to split sU when FMA is available */
> +       vfnmadd213ps %ymm5, %ymm6, %ymm1
> +       vmovaps %ymm0, %ymm8
> +       vfmadd213ps %ymm0, %ymm0, %ymm0
> +       vfnmadd231ps %ymm6, %ymm7, %ymm1
>
>         /*
>          * Check whether |X| < 1, in which case we use the main function.
>          * Otherwise set the rangemask so that the callout will get used.
>          * Note that this will also use the callout for NaNs since not(NaN < 1).
>          */
> -       vcmpnlt_uqps %ymm5, %ymm10, %ymm7
> -       vsubps  %ymm1, %ymm5, %ymm9
> -       vcmplt_oqps TinyRange+__svml_satanh_data_internal(%rip), %ymm10, %ymm4
> -       vrcpps  %ymm1, %ymm11
> -       vsubps  %ymm10, %ymm9, %ymm12
> -       vandps  %ymm13, %ymm11, %ymm0
> +       vcmpnlt_uqps %ymm5, %ymm3, %ymm14
> +       vcmplt_oqps ATANHF_DATA(TinyRange)(%rip), %ymm3, %ymm15
>
> -       /* No need to split sU when FMA is available */
> -       vfnmadd213ps %ymm5, %ymm0, %ymm1
> -       vmovaps %ymm6, %ymm8
> -       vfmadd213ps %ymm6, %ymm6, %ymm8
> -       vfnmadd231ps %ymm0, %ymm12, %ymm1
> +       /*
> +        * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
> +        * the upper part UHi being <= 12 bits long. Then we have
> +        * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
> +        */
> +       vaddps  %ymm3, %ymm3, %ymm3
>
>         /*
>          * Split V as well into upper 12 bits and lower part, so that we can get
>          * a preliminary quotient estimate without rounding error.
>          */
> -       vandps  %ymm13, %ymm14, %ymm15
> -       vmovmskps %ymm7, %edx
> -       vsubps  %ymm15, %ymm14, %ymm7
> +       vandps  %ymm4, %ymm3, %ymm4
> +       vsubps  %ymm4, %ymm3, %ymm7
>
>         /* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
> -       vmulps  %ymm15, %ymm0, %ymm10
> +       vmulps  %ymm4, %ymm6, %ymm4
>
>         /* Compute D = E + E^2 */
>         vfmadd213ps %ymm1, %ymm1, %ymm1
>
> -       /* Record the sign for eventual reincorporation. */
> -       vandps  sSign+__svml_satanh_data_internal(%rip), %ymm6, %ymm3
> +       /* Record the sign for eventual reincorporation.  */
> +       vandnps %ymm8, %ymm2, %ymm3
>
>         /* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
> -       vorps   %ymm3, %ymm8, %ymm2
> -       vmulps  %ymm7, %ymm0, %ymm8
> +       vorps   %ymm3, %ymm0, %ymm13
> +       vmulps  %ymm7, %ymm6, %ymm2
>
>         /*
>          * Compute R * (VHi + VLo) * (1 + E + E^2)
>          * = R *  (VHi + VLo) * (1 + D)
>          * = QHi + (QHi * D + QLo + QLo * D)
>          */
> -       vmulps  %ymm1, %ymm10, %ymm9
> -       vfmadd213ps %ymm8, %ymm8, %ymm1
> -       vaddps  %ymm1, %ymm9, %ymm1
>
> -       /* reduction: compute r, n */
> -       vmovups iBrkValue+__svml_satanh_data_internal(%rip), %ymm9
> +       /*
> +        * If less precision is acceptable the `vmulps %ymm1, %ymm4, %ymm9;
> +        * vaddps %ymm1, %ymm9, %ymm1` can be replaced with
> +        * `vfmadd231ps %ymm1, %ymm4, %ymm4`.
> +        */
> +       vmulps  %ymm1, %ymm4, %ymm6
> +       vfmadd213ps %ymm2, %ymm2, %ymm1
> +       vaddps  %ymm1, %ymm6, %ymm1
>
>         /*
>          * Now finally accumulate the high and low parts of the
>          * argument to log1p, H + L, with a final compensated summation.
>          */
> -       vaddps  %ymm1, %ymm10, %ymm12
> -       vsubps  %ymm12, %ymm10, %ymm11
> +       vaddps  %ymm1, %ymm4, %ymm2
> +
> +       /* reduction: compute r, n */
> +       vmovups ATANHF_DATA(iBrkValue)(%rip), %ymm9
>
>         /*
>          * Now we feed into the log1p code, using H in place of _VARG1 and
>          * later incorporating L into the reduced argument.
>          * compute 1+x as high, low parts
>          */
> -       vmaxps  %ymm12, %ymm5, %ymm13
> -       vminps  %ymm12, %ymm5, %ymm14
> -       vaddps  %ymm11, %ymm1, %ymm0
> -       vaddps  %ymm14, %ymm13, %ymm1
> -       vpsubd  %ymm9, %ymm1, %ymm7
> -       vsubps  %ymm1, %ymm13, %ymm15
> -       vpsrad  $23, %ymm7, %ymm10
> -       vpand   iOffExpoMask+__svml_satanh_data_internal(%rip), %ymm7, %ymm8
> -       vaddps  %ymm15, %ymm14, %ymm13
> -       vpslld  $23, %ymm10, %ymm11
> -       vpaddd  %ymm9, %ymm8, %ymm15
> -       vaddps  %ymm13, %ymm0, %ymm14
> -       vcvtdq2ps %ymm10, %ymm0
> -       vpsubd  %ymm11, %ymm5, %ymm12
> +       vmaxps  %ymm2, %ymm5, %ymm0
> +       vminps  %ymm2, %ymm5, %ymm6
> +
> +       /* This is needed for rounding (see `vaddps %ymm1, %ymm4, %ymm2`).  */
> +       vsubps  %ymm2, %ymm4, %ymm2
> +       vaddps  %ymm6, %ymm0, %ymm4
> +       vpsubd  %ymm9, %ymm4, %ymm7
> +       vsubps  %ymm4, %ymm0, %ymm4
> +       vaddps  %ymm2, %ymm1, %ymm2
> +       vmovaps ATANHF_DATA(iOffExpoMask)(%rip), %ymm1
> +
> +       vandps  %ymm1, %ymm7, %ymm0
> +       vaddps  %ymm4, %ymm6, %ymm4
> +       vandnps %ymm7, %ymm1, %ymm6
> +       vmovups ATANHF_DATA(sPoly+0)(%rip), %ymm1
> +       vpaddd  %ymm9, %ymm0, %ymm0
> +       vaddps  %ymm4, %ymm2, %ymm4
> +       vpsubd  %ymm6, %ymm5, %ymm6
>
>         /* polynomial evaluation */
> -       vsubps  %ymm5, %ymm15, %ymm5
> -       vmulps  %ymm14, %ymm12, %ymm1
> -       vaddps  %ymm5, %ymm1, %ymm5
> -       vmovups sPoly+224+__svml_satanh_data_internal(%rip), %ymm1
> -       vfmadd213ps sPoly+192+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> -       vfmadd213ps sPoly+160+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> -       vfmadd213ps sPoly+128+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> -       vfmadd213ps sPoly+96+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> -       vfmadd213ps sPoly+64+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> -       vfmadd213ps sPoly+32+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> -       vfmadd213ps sPoly+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> -       vmulps  %ymm1, %ymm5, %ymm7
> -       vfmadd213ps %ymm5, %ymm5, %ymm7
> +       vsubps  %ymm5, %ymm0, %ymm2
> +       vfmadd231ps %ymm4, %ymm6, %ymm2
> +       vfmadd213ps ATANHF_DATA(sPoly+32)(%rip), %ymm2, %ymm1
> +       vfmadd213ps ATANHF_DATA(sPoly+64)(%rip), %ymm2, %ymm1
> +       vfmadd213ps ATANHF_DATA(sPoly+96)(%rip), %ymm2, %ymm1
> +       vfmadd213ps ATANHF_DATA(sPoly+128)(%rip), %ymm2, %ymm1
> +       vfmadd213ps ATANHF_DATA(sPoly+160)(%rip), %ymm2, %ymm1
> +       vfmadd213ps ATANHF_DATA(sPoly+192)(%rip), %ymm2, %ymm1
> +       vfmadd213ps ATANHF_DATA(sPoly+224)(%rip), %ymm2, %ymm1
> +
> +       vmulps  %ymm1, %ymm2, %ymm1
> +       vfmadd213ps %ymm2, %ymm2, %ymm1
>
>         /* final reconstruction */
> -       vfmadd132ps sLn2+__svml_satanh_data_internal(%rip), %ymm7, %ymm0
> +       vpsrad  $23, %ymm7, %ymm6
> +       vcvtdq2ps %ymm6, %ymm2
> +       vfmadd132ps ATANHF_DATA(sLn2)(%rip), %ymm1, %ymm2
>
>         /* Finally, halve the result and reincorporate the sign */
> -       vxorps  sHalf+__svml_satanh_data_internal(%rip), %ymm3, %ymm3
> -       vmulps  %ymm0, %ymm3, %ymm0
> -       vblendvps %ymm4, %ymm2, %ymm0, %ymm0
> +       vxorps  ATANHF_DATA(sHalf)(%rip), %ymm3, %ymm3
> +       vmulps  %ymm2, %ymm3, %ymm2
> +       vmovmskps %ymm14, %edx
>         testl   %edx, %edx
>
> +       vblendvps %ymm15, %ymm13, %ymm2, %ymm0
>         /* Go to special inputs processing branch */
>         jne     L(SPECIAL_VALUES_BRANCH)
> -       # LOE rbx r12 r13 r14 r15 edx ymm0 ymm6
> -
> -       /* Restore registers
> -        * and exit the function
> -        */
> -
> -L(EXIT):
> -       movq    %rbp, %rsp
> -       popq    %rbp
> -       cfi_def_cfa(7, 8)
> -       cfi_restore(6)
> +       # LOE rbx rdx r12 r13 r14 r15 ymm0
> +       /* No registers to restore on fast path.  */
>         ret
> -       cfi_def_cfa(6, 16)
> -       cfi_offset(6, -16)
>
> -       /* Branch to process
> -        * special inputs
> -        */
>
> +       /* Cold case. edx has 1s where there was a special value that
> +          needs to be handled by an atanhf call. Optimize for code size
> +          more so than speed here. */
>  L(SPECIAL_VALUES_BRANCH):
> -       vmovups %ymm6, 32(%rsp)
> -       vmovups %ymm0, 64(%rsp)
> -       # LOE rbx r12 r13 r14 r15 edx ymm0
> -
> -       xorl    %eax, %eax
> -       # LOE rbx r12 r13 r14 r15 eax edx
> -
> -       vzeroupper
> -       movq    %r12, 16(%rsp)
> -       /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
> -       movl    %eax, %r12d
> -       movq    %r13, 8(%rsp)
> -       /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
> -       movl    %edx, %r13d
> -       movq    %r14, (%rsp)
> -       /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
> -       # LOE rbx r15 r12d r13d
> -
> -       /* Range mask
> -        * bits check
> +       # LOE rbx rdx r12 r13 r14 r15 ymm0 ymm8
> +    /* Use r13 to save/restore the stack. This allows us to use rbp as
> +       callee save register saving code size. */
> +       pushq   %r13
> +       cfi_adjust_cfa_offset(8)
> +       cfi_offset(r13, -16)
> +       /* Need to callee save registers to preserve state across atanhf calls.
>          */
> +       pushq   %rbx
> +       cfi_adjust_cfa_offset(8)
> +       cfi_offset(rbx, -24)
> +       pushq   %rbp
> +       cfi_adjust_cfa_offset(8)
> +       cfi_offset(rbp, -32)
> +       movq    %rsp, %r13
> +       cfi_def_cfa_register(r13)
>
> -L(RANGEMASK_CHECK):
> -       btl     %r12d, %r13d
> +       /* Align stack and make room for 2x ymm vectors.  */
> +       andq    $-32, %rsp
> +       addq    $-64, %rsp
>
> -       /* Call scalar math function */
> -       jc      L(SCALAR_MATH_CALL)
> -       # LOE rbx r15 r12d r13d
> +       /* Save all already computed inputs.  */
> +       vmovups %ymm0, (%rsp)
> +       /* Save original input (ymm8 unchanged up to this point).  */
> +       vmovups %ymm8, 32(%rsp)
>
> -       /* Special inputs
> -        * processing loop
> -        */
> +       vzeroupper
>
> +       /* edx has 1s where there was a special value that needs to be handled
> +          by an atanhf call.  */
> +       movl    %edx, %ebx
>  L(SPECIAL_VALUES_LOOP):
> -       incl    %r12d
> -       cmpl    $8, %r12d
> -
> -       /* Check bits in range mask */
> -       jl      L(RANGEMASK_CHECK)
> -       # LOE rbx r15 r12d r13d
> -
> -       movq    16(%rsp), %r12
> -       cfi_restore(12)
> -       movq    8(%rsp), %r13
> -       cfi_restore(13)
> -       movq    (%rsp), %r14
> -       cfi_restore(14)
> -       vmovups 64(%rsp), %ymm0
> -
> -       /* Go to exit */
> -       jmp     L(EXIT)
> -       /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
> -       /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
> -       /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
> -       # LOE rbx r12 r13 r14 r15 ymm0
> -
> -       /* Scalar math fucntion call
> -        * to process special input
> -        */
> -
> -L(SCALAR_MATH_CALL):
> -       movl    %r12d, %r14d
> -       movss   32(%rsp, %r14, 4), %xmm0
> +       # LOE rbx rbp r12 r13 r14 r15
> +       /* use rbp as index for special value that is saved across calls to
> +          atanhf. We technically don't need a callee save register here as offset
> +          to rsp is always [0, 28] so we can restore rsp by realigning to 64.
> +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> +          in the loop. Realigning also costs more code size.  */
> +       xorl    %ebp, %ebp
> +       tzcntl  %ebx, %ebp
> +
> +       /* Scalar math function call to process special input.  */
> +       movss   32(%rsp, %rbp, 4), %xmm0
>         call    atanhf@PLT
> -       # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 64(%rsp, %r14, 4)
> +       /* No good way to avoid the store-forwarding fault this will cause on
> +          return. `lfence` avoids the SF fault but at greater cost as it
> +          serializes stack/callee save restoration.  */
> +       movss   %xmm0, (%rsp, %rbp, 4)
> +
> +       blsrl   %ebx, %ebx
> +       jnz     L(SPECIAL_VALUES_LOOP)
> +       # LOE r12 r13 r14 r15
> +
>
> -       /* Process special inputs in loop */
> -       jmp     L(SPECIAL_VALUES_LOOP)
> -       # LOE rbx r15 r12d r13d
> +       /* All results have been written to 32(%rsp).  */
Why 32 here?  Did you mean 32 bytes at %rsp?
> +       vmovups (%rsp), %ymm0
> +       /* Restore rsp.  */
> +       movq    %r13, %rsp
> +       cfi_def_cfa_register(rsp)
> +       /* Restore callee save registers.  */
> +       popq    %rbp
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_restore(rbp)
> +       popq    %rbx
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_restore(rbp)
> +       popq    %r13
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_restore(r13)
> +       ret
>  END(_ZGVdN8v_atanhf_avx2)
>
>         .section .rodata, "a"
>         .align  32
> -
>  #ifdef __svml_satanh_data_internal_typedef
>  typedef unsigned int VUINT32;
> -typedef struct {
> +typedef struct{
>         __declspec(align(32)) VUINT32 SgnMask[8][1];
>         __declspec(align(32)) VUINT32 sOne[8][1];
> -       __declspec(align(32)) VUINT32 sPoly[8][8][1];
> -       __declspec(align(32)) VUINT32 iBrkValue[8][1];
> -       __declspec(align(32)) VUINT32 iOffExpoMask[8][1];
> -       __declspec(align(32)) VUINT32 sHalf[8][1];
> -       __declspec(align(32)) VUINT32 sSign[8][1];
>         __declspec(align(32)) VUINT32 sTopMask12[8][1];
>         __declspec(align(32)) VUINT32 TinyRange[8][1];
> +       __declspec(align(32)) VUINT32 iBrkValue[8][1];
> +       __declspec(align(32)) VUINT32 iOffExpoMask[8][1];
> +       __declspec(align(32)) VUINT32 sPoly[8][8][1];
>         __declspec(align(32)) VUINT32 sLn2[8][1];
> +       __declspec(align(32)) VUINT32 sHalf[8][1];
>  } __svml_satanh_data_internal;
>  #endif
>  __svml_satanh_data_internal:
>         /* SgnMask */
> -       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> +       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> +       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
>         /* sOne = SP 1.0 */
>         .align  32
> -       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> -       /* sPoly[] = SP polynomial */
> +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> +       /* sTopMask12 */
> +       .align  32
> +       .long   0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
> +       .long   0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
> +       /* TinyRange */
>         .align  32
> -       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
> -       .long   0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
> -       .long   0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
> -       .long   0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
> -       .long   0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
> -       .long   0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
> -       .long   0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
> -       .long   0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
> +       .long   0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
> +       .long   0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
>         /* iBrkValue = SP 2/3 */
>         .align  32
> -       .long   0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
> +       .long   0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
> +       .long   0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
>         /* iOffExpoMask = SP significand mask */
>         .align  32
> -       .long   0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
> -       /* sHalf */
> -       .align  32
> -       .long   0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
> -       /* sSign */
> -       .align  32
> -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
> -       /* sTopMask12 */
> -       .align  32
> -       .long   0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
> -       /* TinyRange */
> +       .long   0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
> +       .long   0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
> +       /* sPoly[] = SP polynomial */
>         .align  32
> -       .long   0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
> +       .long   0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed
> +       .long   0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
> +       .long   0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3
> +       .long   0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
> +       .long   0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12
> +       .long   0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
> +       .long   0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37
> +       .long   0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
> +       .long   0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190
> +       .long   0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
> +       .long   0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e
> +       .long   0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
> +       .long   0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94
> +       .long   0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
> +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
>         /* sLn2 = SP ln(2) */
>         .align  32
> -       .long   0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
> +       .long   0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
> +       .long   0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
> +       /* sHalf */
> +       .align  32
> +       .long   0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
> +       .long   0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
>         .align  32
>         .type   __svml_satanh_data_internal, @object
>         .size   __svml_satanh_data_internal, .-__svml_satanh_data_internal
> --
> 2.34.1
>


-- 
H.J.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v2 3/7] x86: Improve svml_s_atanhf4_core_sse4.S
  2022-06-09  0:05   ` [PATCH v2 3/7] x86: Improve svml_s_atanhf4_core_sse4.S Noah Goldstein
@ 2022-06-09 16:03     ` H.J. Lu
  2022-06-09 16:56       ` Noah Goldstein
  0 siblings, 1 reply; 48+ messages in thread
From: H.J. Lu @ 2022-06-09 16:03 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Jun 8, 2022 at 5:05 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Improvements are:
>     1. Reduce code size (-62 bytes).
>     2. Remove redundant move instructions.
>     3. Slightly improve instruction selection/scheduling where
>        possible.
>     4. Prefer registers which get short instruction encoding.
>     5. Reduce rodata usage (-16 bytes).
>
> The throughput improvement is not significant as the port 0 bottleneck
> is unavoidable.
>
>        Function, New Time, Old Time, New / Old
> _ZGVbN4v_atanhf,    8.821,    8.903,     0.991
> ---
>  .../fpu/multiarch/svml_s_atanhf4_core_sse4.S  | 378 ++++++++----------
>  1 file changed, 169 insertions(+), 209 deletions(-)
>
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
> index 2d3ad2617f..e6683785fb 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
> @@ -30,96 +30,80 @@
>   *
>   */
>
> -/* Offsets for data table __svml_satanh_data_internal
> - */
> -#define SgnMask                                0
> -#define sOne                           16
> -#define sPoly                          32
> -#define iBrkValue                      160
> -#define iOffExpoMask                   176
> -#define sHalf                          192
> -#define sSign                          208
> -#define sTopMask12                     224
> -#define TinyRange                      240
> -#define sLn2                           256
> +/* Offsets for data table __svml_satanh_data_internal. Ordered
> +   by use in the function. On cold-starts this might help the
> +   prefetcher. Possibly a better idea is to interleave start/end so
> +   that the prefetcher is less likely to detect a stream and pull
> +   irrelevant lines into cache.  */
> +#define sOne                           0
> +#define SgnMask                                16
> +#define sTopMask12                     32
> +#define iBrkValue                      48
> +#define iOffExpoMask                   64
> +#define sPoly                          80
> +#define sLn2                           208
> +#define TinyRange                      224
>
>  #include <sysdep.h>
> +#define ATANHF_DATA(x)                 ((x)+__svml_satanh_data_internal)
>
>         .section .text.sse4, "ax", @progbits
>  ENTRY(_ZGVbN4v_atanhf_sse4)
> -       subq    $72, %rsp
> -       cfi_def_cfa_offset(80)
>         movaps  %xmm0, %xmm5
>
>         /* Load constants including One = 1 */
> -       movups  sOne+__svml_satanh_data_internal(%rip), %xmm4
> +       movups  ATANHF_DATA(sOne)(%rip), %xmm4
>         movaps  %xmm5, %xmm3
>
>         /* Strip off the sign, so treat X as positive until right at the end */
> -       movups  SgnMask+__svml_satanh_data_internal(%rip), %xmm7
> -       movaps  %xmm4, %xmm8
> -       andps   %xmm5, %xmm7
> +       movups  ATANHF_DATA(SgnMask)(%rip), %xmm1
> +       movaps  %xmm4, %xmm2
> +       andps   %xmm1, %xmm0
>         movaps  %xmm4, %xmm10
> -       movups  sTopMask12+__svml_satanh_data_internal(%rip), %xmm11
> +       movups  ATANHF_DATA(sTopMask12)(%rip), %xmm11
>         movaps  %xmm4, %xmm14
>         movaps  %xmm11, %xmm9
>
> +
>         /*
>          * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
>          * the upper part UHi being <= 12 bits long. Then we have
>          * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
>          */
> -       movaps  %xmm7, %xmm12
> +       movaps  %xmm0, %xmm6
> +       mulps   %xmm5, %xmm3
> +       subps   %xmm0, %xmm2
> +       addps   %xmm0, %xmm6
> +       subps   %xmm2, %xmm10
> +       addps   %xmm5, %xmm3
> +       subps   %xmm0, %xmm10
> +       andps   %xmm2, %xmm9
> +
>
>         /*
>          * Check whether |X| < 1, in which case we use the main function.
>          * Otherwise set the rangemask so that the callout will get used.
>          * Note that this will also use the callout for NaNs since not(NaN < 1).
>          */
> -       movaps  %xmm7, %xmm6
> -       movaps  %xmm7, %xmm2
> -       cmpnltps %xmm4, %xmm6
> -       cmpltps TinyRange+__svml_satanh_data_internal(%rip), %xmm2
> -       mulps   %xmm5, %xmm3
> -       subps   %xmm7, %xmm8
> -       addps   %xmm7, %xmm12
> -       movmskps %xmm6, %edx
> -       subps   %xmm8, %xmm10
> -       addps   %xmm5, %xmm3
> -       subps   %xmm7, %xmm10
> -       andps   %xmm8, %xmm9
> +       rcpps   %xmm9, %xmm7
> +       subps   %xmm9, %xmm2
> +       andps   %xmm11, %xmm7
>
> -       /*
> -        * Now we feed into the log1p code, using H in place of _VARG1 and
> -        * later incorporating L into the reduced argument.
> -        * compute 1+x as high, low parts
> -        */
> -       movaps  %xmm4, %xmm7
> -
> -       /*
> -        * Now compute R = 1/(UHi+ULo) * (1 - E) and the error term E
> -        * The first FMR is exact (we force R to 12 bits just in case it
> -        * isn't already, to make absolutely sure), and since E is ~ 2^-12,
> -        * the rounding error in the other one is acceptable.
> -        */
> -       rcpps   %xmm9, %xmm15
> -       subps   %xmm9, %xmm8
> -       andps   %xmm11, %xmm15
>
>         /*
>          * Split V as well into upper 12 bits and lower part, so that we can get
>          * a preliminary quotient estimate without rounding error.
>          */
> -       andps   %xmm12, %xmm11
> -       mulps   %xmm15, %xmm9
> -       addps   %xmm8, %xmm10
> -       subps   %xmm11, %xmm12
> +       andps   %xmm6, %xmm11
> +       mulps   %xmm7, %xmm9
> +       addps   %xmm2, %xmm10
> +       subps   %xmm11, %xmm6
>
>         /* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
> -       mulps   %xmm15, %xmm11
> -       mulps   %xmm15, %xmm10
> +       mulps   %xmm7, %xmm11
> +       mulps   %xmm7, %xmm10
>         subps   %xmm9, %xmm14
> -       mulps   %xmm12, %xmm15
> +       mulps   %xmm6, %xmm7
>         subps   %xmm10, %xmm14
>
>         /* Compute D = E + E^2 */
> @@ -127,8 +111,8 @@ ENTRY(_ZGVbN4v_atanhf_sse4)
>         movaps  %xmm4, %xmm8
>         mulps   %xmm14, %xmm13
>
> -       /* reduction: compute r, n */
> -       movdqu  iBrkValue+__svml_satanh_data_internal(%rip), %xmm9
> +       /* reduction: compute r,n */
> +       movdqu  ATANHF_DATA(iBrkValue)(%rip), %xmm9
>         addps   %xmm13, %xmm14
>
>         /*
> @@ -136,168 +120,149 @@ ENTRY(_ZGVbN4v_atanhf_sse4)
>          * = R *  (VHi + VLo) * (1 + D)
>          * = QHi + (QHi * D + QLo + QLo * D)
>          */
> -       movaps  %xmm14, %xmm0
> -       mulps   %xmm15, %xmm14
> -       mulps   %xmm11, %xmm0
> -       addps   %xmm14, %xmm15
> -       movdqu  iOffExpoMask+__svml_satanh_data_internal(%rip), %xmm12
> +       movaps  %xmm14, %xmm2
> +       mulps   %xmm7, %xmm14
> +       mulps   %xmm11, %xmm2
> +       addps   %xmm14, %xmm7
> +       movdqu  ATANHF_DATA(iOffExpoMask)(%rip), %xmm12
>         movaps  %xmm4, %xmm14
>
>         /* Record the sign for eventual reincorporation. */
> -       movups  sSign+__svml_satanh_data_internal(%rip), %xmm1
> -       addps   %xmm15, %xmm0
> +       addps   %xmm7, %xmm2
> +
>
>         /*
>          * Now finally accumulate the high and low parts of the
>          * argument to log1p, H + L, with a final compensated summation.
>          */
> -       movaps  %xmm0, %xmm6
> -       andps   %xmm5, %xmm1
> -
> +       movaps  %xmm2, %xmm6
> +       andnps  %xmm5, %xmm1
> +       movaps  %xmm4, %xmm7
>         /* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
> -       orps    %xmm1, %xmm3
>         addps   %xmm11, %xmm6
>         maxps   %xmm6, %xmm7
>         minps   %xmm6, %xmm8
>         subps   %xmm6, %xmm11
>         movaps  %xmm7, %xmm10
> -       andps   %xmm2, %xmm3
>         addps   %xmm8, %xmm10
> -       addps   %xmm11, %xmm0
> +       addps   %xmm11, %xmm2
>         subps   %xmm10, %xmm7
>         psubd   %xmm9, %xmm10
> -       addps   %xmm7, %xmm8
> +       addps   %xmm8, %xmm7
>         pand    %xmm10, %xmm12
>         psrad   $23, %xmm10
>         cvtdq2ps %xmm10, %xmm13
> -       addps   %xmm8, %xmm0
> +       addps   %xmm7, %xmm2
>
>         /* final reconstruction */
> -       mulps   sLn2+__svml_satanh_data_internal(%rip), %xmm13
>         pslld   $23, %xmm10
>         paddd   %xmm9, %xmm12
>         psubd   %xmm10, %xmm14
>
>         /* polynomial evaluation */
>         subps   %xmm4, %xmm12
> -       mulps   %xmm0, %xmm14
> -       movups  sPoly+112+__svml_satanh_data_internal(%rip), %xmm0
> -       addps   %xmm12, %xmm14
> -       mulps   %xmm14, %xmm0
> +       mulps   %xmm14, %xmm2
> +       movups  ATANHF_DATA(sPoly+0)(%rip), %xmm7
> +       addps   %xmm12, %xmm2
> +       mulps   %xmm2, %xmm7
> +
>
>         /* Finally, halve the result and reincorporate the sign */
> -       movups  sHalf+__svml_satanh_data_internal(%rip), %xmm4
> -       pxor    %xmm1, %xmm4
> -       addps   sPoly+96+__svml_satanh_data_internal(%rip), %xmm0
> -       mulps   %xmm14, %xmm0
> -       addps   sPoly+80+__svml_satanh_data_internal(%rip), %xmm0
> -       mulps   %xmm14, %xmm0
> -       addps   sPoly+64+__svml_satanh_data_internal(%rip), %xmm0
> -       mulps   %xmm14, %xmm0
> -       addps   sPoly+48+__svml_satanh_data_internal(%rip), %xmm0
> -       mulps   %xmm14, %xmm0
> -       addps   sPoly+32+__svml_satanh_data_internal(%rip), %xmm0
> -       mulps   %xmm14, %xmm0
> -       addps   sPoly+16+__svml_satanh_data_internal(%rip), %xmm0
> -       mulps   %xmm14, %xmm0
> -       addps   sPoly+__svml_satanh_data_internal(%rip), %xmm0
> -       mulps   %xmm14, %xmm0
> -       mulps   %xmm14, %xmm0
> -       addps   %xmm0, %xmm14
> -       movaps  %xmm2, %xmm0
> -       addps   %xmm13, %xmm14
> -       mulps   %xmm14, %xmm4
> -       andnps  %xmm4, %xmm0
> -       orps    %xmm3, %xmm0
> -       testl   %edx, %edx
> +       addps   ATANHF_DATA(sPoly+16)(%rip), %xmm7
> +       mulps   %xmm2, %xmm7
> +       addps   ATANHF_DATA(sPoly+32)(%rip), %xmm7
> +       mulps   %xmm2, %xmm7
> +       addps   ATANHF_DATA(sPoly+48)(%rip), %xmm7
> +       mulps   %xmm2, %xmm7
> +       addps   ATANHF_DATA(sPoly+64)(%rip), %xmm7
> +       mulps   %xmm2, %xmm7
> +       addps   ATANHF_DATA(sPoly+80)(%rip), %xmm7
> +       mulps   %xmm2, %xmm7
> +       addps   ATANHF_DATA(sPoly+96)(%rip), %xmm7
> +       mulps   %xmm2, %xmm7
> +       movaps  ATANHF_DATA(sPoly+112)(%rip), %xmm6
> +       addps   %xmm6, %xmm7
> +       mulps   %xmm2, %xmm7
> +       mulps   %xmm2, %xmm7
> +       mulps   ATANHF_DATA(sLn2)(%rip), %xmm13
> +       /* We can build `sHalf` with `sPoly & sOne`.  */
> +       andps   %xmm4, %xmm6
> +       orps    %xmm1, %xmm3
> +       xorps   %xmm6, %xmm1
>
> -       /* Go to special inputs processing branch */
> -       jne     L(SPECIAL_VALUES_BRANCH)
> -       # LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm5
> +       addps   %xmm2, %xmm7
> +       addps   %xmm13, %xmm7
> +       mulps   %xmm7, %xmm1
>
> -       /* Restore registers
> -        * and exit the function
> -        */
> +       /* Finish check of NaNs.  */
> +       cmpleps %xmm0, %xmm4
> +       movmskps %xmm4, %edx
> +       cmpltps ATANHF_DATA(TinyRange)(%rip), %xmm0
>
> -L(EXIT):
> -       addq    $72, %rsp
> -       cfi_def_cfa_offset(8)
> +       andps   %xmm0, %xmm3
> +       andnps  %xmm1, %xmm0
> +       orps    %xmm3, %xmm0
> +
> +       testl   %edx, %edx
> +       /* Go to special inputs processing branch.  */
> +       jne     L(SPECIAL_VALUES_BRANCH)
> +       # LOE rbx rbp r12 r13 r14 r15 xmm0
> +       /* No registers to restore on fast path.  */
>         ret
> -       cfi_def_cfa_offset(80)
>
> -       /* Branch to process
> -        * special inputs
> -        */
>
> +       /* Cold case. edx has 1s where there was a special value that
> +          needs to be handled by a atanhf call. Optimize for code size
> +          moreso than speed here. */
>  L(SPECIAL_VALUES_BRANCH):
> -       movups  %xmm5, 32(%rsp)
> -       movups  %xmm0, 48(%rsp)
> -       # LOE rbx rbp r12 r13 r14 r15 edx
> -
> -       xorl    %eax, %eax
> -       movq    %r12, 16(%rsp)
> -       cfi_offset(12, -64)
> -       movl    %eax, %r12d
> -       movq    %r13, 8(%rsp)
> -       cfi_offset(13, -72)
> -       movl    %edx, %r13d
> -       movq    %r14, (%rsp)
> -       cfi_offset(14, -80)
> -       # LOE rbx rbp r15 r12d r13d
> -
> -       /* Range mask
> -        * bits check
> -        */
> -
> -L(RANGEMASK_CHECK):
> -       btl     %r12d, %r13d
> -
> -       /* Call scalar math function */
> -       jc      L(SCALAR_MATH_CALL)
> -       # LOE rbx rbp r15 r12d r13d
> -
> -       /* Special inputs
> -        * processing loop
> -        */
> -
> +       # LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm5
> +       /* Stack coming in 16-byte aligned. Set 8-byte misaligned so on
> +       call entry will be 16-byte aligned. */
> +       subq    $56, %rsp
> +       cfi_def_cfa_offset(64)
> +       movups  %xmm0, 24(%rsp)
> +       movups  %xmm5, 40(%rsp)
> +
> +       /* Use rbx/rbp for callee save registers as they get short
> +       encoding for many instructions (as compared with r12/r13). */
> +       movq    %rbx, (%rsp)
> +       cfi_offset(rbx, -64)
> +       movq    %rbp, 8(%rsp)
> +       cfi_offset(rbp, -56)
> +       /* edx has 1s where there was a special value that needs to be handled
> +          by a tanhf call.  */
> +       movl    %edx, %ebx
>  L(SPECIAL_VALUES_LOOP):
> -       incl    %r12d
> -       cmpl    $4, %r12d
> -
> -       /* Check bits in range mask */
> -       jl      L(RANGEMASK_CHECK)
> -       # LOE rbx rbp r15 r12d r13d
> -
> -       movq    16(%rsp), %r12
> -       cfi_restore(12)
> -       movq    8(%rsp), %r13
> -       cfi_restore(13)
> -       movq    (%rsp), %r14
> -       cfi_restore(14)
> -       movups  48(%rsp), %xmm0
> -
> -       /* Go to exit */
> -       jmp     L(EXIT)
> -       cfi_offset(12, -64)
> -       cfi_offset(13, -72)
> -       cfi_offset(14, -80)
> -       # LOE rbx rbp r12 r13 r14 r15 xmm0
> -
> -       /* Scalar math fucntion call
> -        * to process special input
> -        */
> -
> -L(SCALAR_MATH_CALL):
> -       movl    %r12d, %r14d
> -       movss   32(%rsp, %r14, 4), %xmm0
> +       # LOE rbx rbp r12 r13 r14 r15
> +       /* use rbp as index for special value that is saved across calls to
> +          tanhf. We technically don't need a callee save register here as offset
> +          to rsp is always [0, 12] so we can restore rsp by realigning to 64.
> +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> +          in the loop.  */
> +       xorl    %ebp, %ebp
> +       bsfl    %ebx, %ebp
> +
> +       /* Scalar math fucntion call to process special input.  */
> +       movss   40(%rsp, %rbp, 4), %xmm0
>         call    atanhf@PLT
> -       # LOE rbx rbp r14 r15 r12d r13d xmm0
> -
> -       movss   %xmm0, 48(%rsp, %r14, 4)
> -
> -       /* Process special inputs in loop */
> -       jmp     L(SPECIAL_VALUES_LOOP)
> -       # LOE rbx rbp r15 r12d r13d
> +       /* No good way to avoid the store-forwarding fault this will cause on
> +          return. `lfence` avoids the SF fault but at greater cost as it
> +          serialized stack/callee save restoration.  */
> +       movss   %xmm0, 24(%rsp, %rbp, 4)
> +
> +       leal    -1(%rbx), %eax
> +       andl    %eax, %ebx
> +       jnz     L(SPECIAL_VALUES_LOOP)
> +       # LOE r12 r13 r14 r15
> +       /* All results have been written to 16(%rsp).  */

Where does 16 come from?  The stores in the loop and the reload below both use 24(%rsp).
> +       movups  24(%rsp), %xmm0
> +       movq    (%rsp), %rbx
> +       cfi_restore(rbx)
> +       movq    8(%rsp), %rbp
> +       cfi_restore(rbp)
> +       addq    $56, %rsp
> +       cfi_def_cfa_offset(8)
> +       ret
>  END(_ZGVbN4v_atanhf_sse4)
>
>         .section .rodata, "a"
> @@ -305,56 +270,51 @@ END(_ZGVbN4v_atanhf_sse4)
>
>  #ifdef __svml_satanh_data_internal_typedef
>  typedef unsigned int VUINT32;
> -typedef struct {
> -       __declspec(align(16)) VUINT32 SgnMask[4][1];
> +typedef struct{
>         __declspec(align(16)) VUINT32 sOne[4][1];
> -       __declspec(align(16)) VUINT32 sPoly[8][4][1];
> +       __declspec(align(16)) VUINT32 SgnMask[4][1];
> +       __declspec(align(16)) VUINT32 sTopMask12[4][1];
>         __declspec(align(16)) VUINT32 iBrkValue[4][1];
>         __declspec(align(16)) VUINT32 iOffExpoMask[4][1];
> -       __declspec(align(16)) VUINT32 sHalf[4][1];
> -       __declspec(align(16)) VUINT32 sSign[4][1];
> -       __declspec(align(16)) VUINT32 sTopMask12[4][1];
> -       __declspec(align(16)) VUINT32 TinyRange[4][1];
> +       __declspec(align(16)) VUINT32 sPoly[8][4][1];
>         __declspec(align(16)) VUINT32 sLn2[4][1];
> +       __declspec(align(16)) VUINT32 TinyRange[4][1];
>  } __svml_satanh_data_internal;
>  #endif
> +
>  __svml_satanh_data_internal:
> -       /* SgnMask */
> -       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
>         /* sOne = SP 1.0 */
>         .align  16
>         .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> -       /* sPoly[] = SP polynomial */
> +       /* SgnMask */
> +       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> +       /* sTopMask12 */
>         .align  16
> -       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
> -       .long   0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
> -       .long   0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
> -       .long   0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
> -       .long   0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
> -       .long   0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
> -       .long   0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
> -       .long   0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
> +       .long   0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
>         /* iBrkValue = SP 2/3 */
>         .align  16
>         .long   0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
> -       /* iOffExpoMask = SP significand mask */
> +       /* iOffExpoMask = SP significand mask ==*/
>         .align  16
>         .long   0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
> -       /* sHalf */
> -       .align  16
> -       .long   0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
> -       /* sSign */
> +
> +       /* sPoly[] = SP polynomial */
>         .align  16
> -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> -       /* sTopMask12 */
> +       .long   0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
> +       .long   0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
> +       .long   0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
> +       .long   0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
> +       .long   0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
> +       .long   0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
> +       .long   0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
> +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
> +
> +       /* sLn2 = SP ln(2) */
>         .align  16
> -       .long   0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
> +       .long   0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
>         /* TinyRange */
>         .align  16
>         .long   0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
> -       /* sLn2 = SP ln(2) */
> -       .align  16
> -       .long   0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
>         .align  16
>         .type   __svml_satanh_data_internal, @object
>         .size   __svml_satanh_data_internal, .-__svml_satanh_data_internal
> --
> 2.34.1
>


-- 
H.J.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v2 4/7] x86: Optimize svml_s_tanhf16_core_avx512.S
  2022-06-09  0:05   ` [PATCH v2 4/7] x86: Optimize svml_s_tanhf16_core_avx512.S Noah Goldstein
@ 2022-06-09 16:04     ` H.J. Lu
  2022-06-09 16:57       ` Noah Goldstein
  0 siblings, 1 reply; 48+ messages in thread
From: H.J. Lu @ 2022-06-09 16:04 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Jun 8, 2022 at 5:05 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Optimizations are:
>     1. Reduce code size (-67 bytes).
>     2. Remove redundant move instructions.
>     3. Slightly improve instruction selection/scheduling where
>        possible.
>     4. Reduce rodata usage (-448 bytes).
>
> Result is roughly a 14% speedup:
>
>        Function, New Time, Old Time, New / Old
> _ZGVeN16v_tanhf,    0.649,    0.752,     0.863
> ---
>  .../multiarch/svml_s_tanhf16_core_avx512.S    | 527 ++++++++++--------
>  1 file changed, 287 insertions(+), 240 deletions(-)
>
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> index 5b1f9f151c..d55798767c 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> @@ -70,310 +70,357 @@
>   *
>   */
>
> -/* Offsets for data table __svml_stanh_data_internal
> +/* Offsets for data table __svml_stanh_data_internal_avx512. Ordered
> +   by use in the function. On cold-starts this might help the
> +   prefetcher. Possibly a better idea is to interleave start/end so
> +   that the prefetcher is less likely to detect a stream and pull
> +   irrelivant lines into cache.  */
> +
> +/* Offsets for data table __svml_stanh_data_internal. 4 bytes each.
>   */
> -#define _sC                            0
> -#define _sP0                           128
> -#define _sP2                           256
> -#define _sP3                           384
> -#define _sP4                           512
> -#define _sP5                           640
> -#define _sP6                           768
> -#define _sP7                           896
> -#define _iExpMantMask_UISA             1024
> -#define _iMinIdxOfsMask_UISA           1088
> -#define _iMaxIdxMask_UISA              1152
> -#define _sSignMask                     1216
> -#define _sAbsMask                      1280
> -#define _iExpMantMask                  1344
> -#define _iExpMask                      1408
> -#define _iMinIdxOfsMask                        1472
> -#define _iMaxIdxMask                   1536
> +#define _iExpMantMask_UISA             0
> +#define _iMinIdxOfsMask_UISA           4
> +#define _iMaxIdxMask_UISA              8
> +#define _iExpMask                      12
> +
> +/* Offsets for data table __svml_stanh_data_internal_al64. 64 bytes
> +   each.  */
> +#define _sC_lo                         0
> +#define _sC_hi                         64
> +#define _sP7_lo                                128
> +#define _sP7_hi                                192
> +#define _sSignMask                     256
> +#define _sP6_lo                                320
> +#define _sP6_hi                                384
> +#define _sP5_lo                                448
> +#define _sP5_hi                                512
> +#define _sP4_lo                                576
> +#define _sP4_hi                                640
> +#define _sP3_lo                                704
> +#define _sP3_hi                                768
> +#define _sP2_lo                                832
> +#define _sP2_hi                                896
> +#define _sP0_lo                                960
> +#define _sP0_hi                                1024
>
>  #include <sysdep.h>
> +#define TANHF_DATA(x)                  ((x)+__svml_stanh_data_internal_al64)
> +#define TANHF_DATA_UNALIGNED(x)                ((x)+__svml_stanh_data_internal)
>
>         .section .text.exex512, "ax", @progbits
>  ENTRY(_ZGVeN16v_tanhf_skx)
> -       pushq   %rbp
> -       cfi_def_cfa_offset(16)
> -       movq    %rsp, %rbp
> -       cfi_def_cfa(6, 16)
> -       cfi_offset(6, -16)
> -       andq    $-64, %rsp
> -       subq    $192, %rsp
> -       vmovaps %zmm0, %zmm1
> -       vmovups __svml_stanh_data_internal(%rip), %zmm9
> -       vmovups _sP6+__svml_stanh_data_internal(%rip), %zmm11
> -       vmovups _sP5+__svml_stanh_data_internal(%rip), %zmm12
> -       vmovups _sP4+__svml_stanh_data_internal(%rip), %zmm13
> -       vmovups _sP3+__svml_stanh_data_internal(%rip), %zmm14
> -       vmovups _sP2+__svml_stanh_data_internal(%rip), %zmm15
> -       vpternlogd $255, %zmm2, %zmm2, %zmm2
> -       vandps  _sAbsMask+__svml_stanh_data_internal(%rip), %zmm1, %zmm8
> -       vandps  _sSignMask+__svml_stanh_data_internal(%rip), %zmm1, %zmm0
> -
>         /* Here huge arguments, INF and NaNs are filtered out to callout. */
> -       vpandd  _iExpMantMask_UISA+__svml_stanh_data_internal(%rip), %zmm1, %zmm3
> -       vpsubd  _iMinIdxOfsMask_UISA+__svml_stanh_data_internal(%rip), %zmm3, %zmm4
> -       vpcmpd  $2, _iExpMask+__svml_stanh_data_internal(%rip), %zmm3, %k1
> +       vpandd  TANHF_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
> +       vpsubd  TANHF_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
>
> -       /*
> -        *  small table specific variables *
> -        *  Constant loading
> -        */
> -       vpxord  %zmm5, %zmm5, %zmm5
> -
> -       /* if VMIN, VMAX is defined for I type */
> -       vpmaxsd %zmm5, %zmm4, %zmm6
> -       vpminsd _iMaxIdxMask_UISA+__svml_stanh_data_internal(%rip), %zmm6, %zmm7
> -       vpsrld  $21, %zmm7, %zmm10
> -       vmovups _sP7+__svml_stanh_data_internal(%rip), %zmm4
> -       vpermt2ps _sC+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm9
> -       vpermt2ps _sP6+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm11
> -       vpermt2ps _sP7+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm4
> -       vpermt2ps _sP5+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm12
> -       vpermt2ps _sP4+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm13
> -       vpermt2ps _sP3+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm14
> -       vpermt2ps _sP2+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm15
> -       vpandnd %zmm3, %zmm3, %zmm2{%k1}
> -       vptestmd %zmm2, %zmm2, %k0
> -       vmovups _sP0+__svml_stanh_data_internal(%rip), %zmm3
> -       vsubps  {rn-sae}, %zmm9, %zmm8, %zmm2
> -       kmovw   %k0, %edx
> -       vfmadd213ps {rn-sae}, %zmm11, %zmm2, %zmm4
> -       vpermt2ps _sP0+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm3
> -       vfmadd213ps {rn-sae}, %zmm12, %zmm2, %zmm4
> -       vfmadd213ps {rn-sae}, %zmm13, %zmm2, %zmm4
> -       vfmadd213ps {rn-sae}, %zmm14, %zmm2, %zmm4
> -       vfmadd213ps {rn-sae}, %zmm15, %zmm2, %zmm4
> -       vfmadd213ps {rn-sae}, %zmm3, %zmm2, %zmm4
> -       vorps   %zmm0, %zmm4, %zmm0
> -       testl   %edx, %edx
> +       /* Selection arguments between [0, 0x03e00000] into zmm3.  */
> +       vpxord  %zmm3, %zmm3, %zmm3
> +       vpmaxsd %zmm3, %zmm2, %zmm3
> +       vpminsd TANHF_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
>
> -       /* Go to special inputs processing branch */
> -       jne     L(SPECIAL_VALUES_BRANCH)
> -       # LOE rbx r12 r13 r14 r15 edx zmm0 zmm1
> +       /* Setup permute indices in zmm3.  */
> +       vpsrld  $21, %zmm3, %zmm3
>
> -       /* Restore registers
> -        * and exit the function
> -        */
> +       /* Store if there are any special cases in k1.  */
> +       vpcmpd  $6, TANHF_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
>
> -L(EXIT):
> -       movq    %rbp, %rsp
> -       popq    %rbp
> -       cfi_def_cfa(7, 8)
> -       cfi_restore(6)
> -       ret
> -       cfi_def_cfa(6, 16)
> -       cfi_offset(6, -16)
> +       vmovaps TANHF_DATA(_sC_lo)(%rip), %zmm5
> +       vpermt2ps TANHF_DATA(_sC_hi)(%rip), %zmm3, %zmm5
>
> -       /* Branch to process
> -        * special inputs
> -        */
> +       vmovaps TANHF_DATA(_sP7_lo)(%rip), %zmm2
> +       vpermt2ps TANHF_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
>
> -L(SPECIAL_VALUES_BRANCH):
> -       vmovups %zmm1, 64(%rsp)
> -       vmovups %zmm0, 128(%rsp)
> -       # LOE rbx r12 r13 r14 r15 edx zmm0
> +       /* Store absolute values of inputs in zmm1.  */
> +       vmovaps TANHF_DATA(_sSignMask)(%rip), %zmm4
> +       vandnps %zmm0, %zmm4, %zmm1
> +       vsubps  {rn-sae}, %zmm5, %zmm1, %zmm1
>
> -       xorl    %eax, %eax
> -       # LOE rbx r12 r13 r14 r15 eax edx
> +       vmovaps TANHF_DATA(_sP6_lo)(%rip), %zmm5
> +       vpermt2ps TANHF_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
>
> -       vzeroupper
> -       movq    %r12, 16(%rsp)
> -       /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> -       movl    %eax, %r12d
> -       movq    %r13, 8(%rsp)
> -       /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> -       movl    %edx, %r13d
> -       movq    %r14, (%rsp)
> -       /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> -       # LOE rbx r15 r12d r13d
> -
> -       /* Range mask
> -        * bits check
> -        */
> +       vmovaps TANHF_DATA(_sP5_lo)(%rip), %zmm6
> +       vpermt2ps TANHF_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> +
> +       vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm2
> +       vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm2
>
> -L(RANGEMASK_CHECK):
> -       btl     %r12d, %r13d
> +       vmovaps TANHF_DATA(_sP4_lo)(%rip), %zmm7
> +       vpermt2ps TANHF_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
>
> -       /* Call scalar math function */
> -       jc      L(SCALAR_MATH_CALL)
> -       # LOE rbx r15 r12d r13d
> +       vmovaps TANHF_DATA(_sP3_lo)(%rip), %zmm8
> +       vpermt2ps TANHF_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
>
> -       /* Special inputs
> -        * processing loop
> +       vfmadd213ps {rn-sae}, %zmm7, %zmm1, %zmm2
> +       vfmadd213ps {rn-sae}, %zmm8, %zmm1, %zmm2
> +
> +       vmovaps TANHF_DATA(_sP2_lo)(%rip), %zmm9
> +       vpermt2ps TANHF_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> +
> +       vmovaps TANHF_DATA(_sP0_lo)(%rip), %zmm10
> +       vpermt2ps TANHF_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> +
> +       vfmadd213ps {rn-sae}, %zmm9, %zmm1, %zmm2
> +       vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm2
> +
> +       kmovw   %k1, %edx
> +       testl   %edx, %edx
> +
> +       /* Go to special inputs processing branch.  */
> +       jne     L(SPECIAL_VALUES_BRANCH)
> +       # LOE rbx r12 r13 r14 r15 zmm0 zmm2 zmm4
> +       /* Wait until after branch of write over zmm0.  */
> +       vpternlogd $0xec, %zmm4, %zmm2, %zmm0
> +
> +       /* No stack restoration on the fastpath.  */
> +       ret
> +
> +       /* Cold case. edx has 1s where there was a special value that
> +          needs to be handled by a tanhf call. Optimize for code size
> +          moreso than speed here. */
> +L(SPECIAL_VALUES_BRANCH):
> +       # LOE rbx rdx r12 r13 r14 r15 zmm0 zmm2 zmm4
> +    /* Use r13 to save/restore the stack. This allows us to use rbp as
> +       callee save register saving code size. */
> +       pushq   %r13
> +       cfi_adjust_cfa_offset(8)
> +       cfi_offset(r13, -16)
> +       /* Need to callee save registers to preserve state across tanhf calls.
>          */
> +       pushq   %rbx
> +       cfi_adjust_cfa_offset(8)
> +       cfi_offset(rbx, -24)
> +       pushq   %rbp
> +       cfi_adjust_cfa_offset(8)
> +       cfi_offset(rbp, -32)
> +       movq    %rsp, %r13
> +       cfi_def_cfa_register(r13)
> +
> +       /* Align stack and make room for 2x zmm vectors.  */
> +       andq    $-64, %rsp
> +       addq    $-128, %rsp
> +
> +       /* Save origional input (zmm0 unchanged up to this point).  */
> +       vmovaps %zmm0, 64(%rsp)
> +       /* Save all already computed inputs.  */
> +       vpternlogd $0xec, %zmm4, %zmm2, %zmm0
> +       vmovaps %zmm0, (%rsp)
>
> +       vzeroupper
> +
> +       /* edx has 1s where there was a special value that needs to be handled
> +          by a tanhf call.  */
> +       movl    %edx, %ebx
>  L(SPECIAL_VALUES_LOOP):
> -       incl    %r12d
> -       cmpl    $16, %r12d
> -
> -       /* Check bits in range mask */
> -       jl      L(RANGEMASK_CHECK)
> -       # LOE rbx r15 r12d r13d
> -
> -       movq    16(%rsp), %r12
> -       cfi_restore(12)
> -       movq    8(%rsp), %r13
> -       cfi_restore(13)
> -       movq    (%rsp), %r14
> -       cfi_restore(14)
> -       vmovups 128(%rsp), %zmm0
> -
> -       /* Go to exit */
> -       jmp     L(EXIT)
> -       /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> -       /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> -       /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> -       # LOE rbx r12 r13 r14 r15 zmm0
> -
> -       /* Scalar math fucntion call
> -        * to process special input
> -        */
> +       # LOE rbx rbp r12 r13 r14 r15
> +       /* use rbp as index for special value that is saved across calls to
> +          tanhf. We technically don't need a callee save register here as offset
> +          to rsp is always [0, 56] so we can restore rsp by realigning to 64.
> +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> +          in the loop. Realigning also costs more code size.  */
> +       xorl    %ebp, %ebp
> +       tzcntl  %ebx, %ebp
>
> -L(SCALAR_MATH_CALL):
> -       movl    %r12d, %r14d
> -       movss   64(%rsp, %r14, 4), %xmm0
> +       /* Scalar math fucntion call to process special input.  */
> +       movss   64(%rsp, %rbp, 4), %xmm0
>         call    tanhf@PLT
> -       # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 128(%rsp, %r14, 4)
> +       /* No good way to avoid the store-forwarding fault this will cause on
> +          return. `lfence` avoids the SF fault but at greater cost as it
> +          serialized stack/callee save restoration.  */
> +       movss   %xmm0, (%rsp, %rbp, 4)
>
> -       /* Process special inputs in loop */
> -       jmp     L(SPECIAL_VALUES_LOOP)
> -       # LOE rbx r15 r12d r13d
> +       blsrl   %ebx, %ebx
> +       jnz     L(SPECIAL_VALUES_LOOP)
> +       # LOE r12 r13 r14 r15
> +
> +       /* All results have been written to 64(%rsp).  */
Should 64 be removed?  The results are written to (%rsp); 64(%rsp) holds the saved input.
> +       vmovaps (%rsp), %zmm0
> +       /* Restore rsp.  */
> +       movq    %r13, %rsp
> +       cfi_def_cfa_register(rsp)
> +       /* Restore callee save registers.  */
> +       popq    %rbp
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_restore(rbp)
> +       popq    %rbx
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_restore(rbp)
> +       popq    %r13
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_restore(r13)
> +       ret
>  END(_ZGVeN16v_tanhf_skx)
>
>         .section .rodata, "a"
> -       .align  64
> -
> +       .align  16
>  #ifdef __svml_stanh_data_internal_typedef
>  typedef unsigned int VUINT32;
> -typedef struct {
> -       __declspec(align(64)) VUINT32 _sC[32][1];
> -       __declspec(align(64)) VUINT32 _sP0[32][1];
> -       __declspec(align(64)) VUINT32 _sP2[32][1];
> -       __declspec(align(64)) VUINT32 _sP3[32][1];
> -       __declspec(align(64)) VUINT32 _sP4[32][1];
> -       __declspec(align(64)) VUINT32 _sP5[32][1];
> -       __declspec(align(64)) VUINT32 _sP6[32][1];
> -       __declspec(align(64)) VUINT32 _sP7[32][1];
> -       __declspec(align(64)) VUINT32 _iExpMantMask_UISA[16][1];
> -       __declspec(align(64)) VUINT32 _iMinIdxOfsMask_UISA[16][1];
> -       __declspec(align(64)) VUINT32 _iMaxIdxMask_UISA[16][1];
> +typedef struct
> +       {
> +       __declspec(align(4)) VUINT32 _iExpMantMask_UISA[1][1];
> +       __declspec(align(4)) VUINT32 _iMinIdxOfsMask_UISA[1][1];
> +       __declspec(align(4)) VUINT32 _iMaxIdxMask_UISA[1][1];
> +       __declspec(align(4)) VUINT32 _iExpMask[1][1];
> +       __declspec(align(64)) VUINT32 _sC_lo[16][1];
> +       __declspec(align(64)) VUINT32 _sC_hi[16][1];
> +       __declspec(align(64)) VUINT32 _sP7_lo[16][1];
> +       __declspec(align(64)) VUINT32 _sP7_hi[16][1];
>         __declspec(align(64)) VUINT32 _sSignMask[16][1];
> -       __declspec(align(64)) VUINT32 _sAbsMask[16][1];
> -       __declspec(align(64)) VUINT32 _iExpMantMask[16][1];
> -       __declspec(align(64)) VUINT32 _iExpMask[16][1];
> -       __declspec(align(64)) VUINT32 _iMinIdxOfsMask[16][1];
> -       __declspec(align(64)) VUINT32 _iMaxIdxMask[16][1];
> +       __declspec(align(64)) VUINT32 _sP6_lo[16][1];
> +       __declspec(align(64)) VUINT32 _sP6_hi[16][1];
> +       __declspec(align(64)) VUINT32 _sP5_lo[16][1];
> +       __declspec(align(64)) VUINT32 _sP5_hi[16][1];
> +       __declspec(align(64)) VUINT32 _sP4_lo[16][1];
> +       __declspec(align(64)) VUINT32 _sP4_hi[16][1];
> +       __declspec(align(64)) VUINT32 _sP3_lo[16][1];
> +       __declspec(align(64)) VUINT32 _sP3_hi[16][1];
> +       __declspec(align(64)) VUINT32 _sP2_lo[16][1];
> +       __declspec(align(64)) VUINT32 _sP2_hi[16][1];
> +       __declspec(align(64)) VUINT32 _sP0_lo[16][1];
> +       __declspec(align(64)) VUINT32 _sP0_hi[16][1];
>  } __svml_stanh_data_internal;
>  #endif
> +
>  __svml_stanh_data_internal:
> -       /* _sC */
> +       .align  4
> +       /* _iExpMantMask_UISA */
> +       .long   0x7fe00000
> +
> +       .align  4
> +       /* _iMinIdxOfsMask_UISA */
> +       .long   0x3d400000
> +
> +       .align  4
> +       /* _iMaxIdxMask_UISA */
> +       .long   0x03e00000
> +
> +       .align  4
> +       /* _iExpMask */
> +       .long   0x7f000000
> +
> +       .align  64
> +__svml_stanh_data_internal_al64:
> +       .align  64
> +       /* _sC_lo */
>         .long   0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
>         .long   0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
>         .long   0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
>         .long   0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
> +
> +       .align  64
> +       /* _sC_hi */
>         .long   0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
>         .long   0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
>         .long   0x40500000, 0x40700000, 0x40900000, 0x40b00000
>         .long   0x40d00000, 0x40f00000, 0x41100000, 0x00000000
> -       /* p0 */
> -       .align  64
> -       .long   0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
> -       .long   0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
> -       .long   0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
> -       .long   0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
> -       .long   0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
> -       .long   0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
> -       .long   0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
> -       .long   0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
> -       /* p2 */
> -       .align  64
> -       .long   0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
> -       .long   0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
> -       .long   0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
> -       .long   0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
> -       .long   0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
> -       .long   0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
> -       .long   0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
> -       .long   0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
> -       /* p3 */
> +
>         .align  64
> -       .long   0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
> -       .long   0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
> -       .long   0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
> -       .long   0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
> -       .long   0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
> -       .long   0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
> -       .long   0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
> -       .long   0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
> -       /* p4 */
> +       /* _sP7_lo */
> +       .long   0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
> +       .long   0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
> +       .long   0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
> +       .long   0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
> +
>         .align  64
> -       .long   0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
> -       .long   0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
> -       .long   0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
> -       .long   0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
> -       .long   0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
> -       .long   0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
> -       .long   0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
> -       .long   0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
> -       /* p5 */
> +       /* _sP7_hi */
> +       .long   0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
> +       .long   0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
> +       .long   0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
> +       .long   0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
> +
>         .align  64
> -       .long   0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
> -       .long   0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
> -       .long   0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
> -       .long   0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
> -       .long   0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
> -       .long   0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
> -       .long   0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
> -       .long   0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
> -       /* p6 */
> +       /* _sSignMask */
> +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> +
>         .align  64
> +       /* _sP6_lo */
>         .long   0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
>         .long   0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
>         .long   0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
>         .long   0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
> +
> +       .align  64
> +       /* _sP6_hi */
>         .long   0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
>         .long   0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
>         .long   0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
>         .long   0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
> -       /* p7 */
> +
>         .align  64
> -       .long   0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
> -       .long   0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
> -       .long   0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
> -       .long   0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
> -       .long   0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
> -       .long   0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
> -       .long   0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
> -       .long   0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
> +       /* _sP5_lo */
> +       .long   0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
> +       .long   0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
> +       .long   0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
> +       .long   0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
> +
>         .align  64
> -       .long   0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000 /* _iExpMantMask_UISA */
> +       /* _sP5_hi */
> +       .long   0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
> +       .long   0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
> +       .long   0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
> +       .long   0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
> +
>         .align  64
> -       .long   0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000 /* _iMinIdxOfsMask_UISA */
> +       /* _sP4_lo */
> +       .long   0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
> +       .long   0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
> +       .long   0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
> +       .long   0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
> +
>         .align  64
> -       .long   0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000 /* _iMaxIdxMask_UISA */
> +       /* _sP4_hi */
> +       .long   0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
> +       .long   0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
> +       .long   0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
> +       .long   0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
> +
>         .align  64
> -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSignMask */
> +       /* _sP3_lo */
> +       .long   0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
> +       .long   0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
> +       .long   0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
> +       .long   0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
> +
>         .align  64
> -       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _sAbsMask */
> +       /* _sP3_hi */
> +       .long   0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
> +       .long   0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
> +       .long   0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
> +       .long   0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
> +
>         .align  64
> -       .long   0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000 /* _iExpMantMask */
> +       /* _sP2_lo */
> +       .long   0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
> +       .long   0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
> +       .long   0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
> +       .long   0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
> +
>         .align  64
> -       .long   0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000 /* _iExpMask */
> +       /* _sP2_hi */
> +       .long   0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
> +       .long   0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
> +       .long   0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
> +       .long   0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
> +
>         .align  64
> -       .long   0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000 /* _iMinIdxOfsMask */
> +       /* _sP0_lo */
> +       .long   0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
> +       .long   0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
> +       .long   0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
> +       .long   0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
> +
>         .align  64
> -       .long   0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000 /* _iMaxIdxMask */
> +       /* _sP0_hi */
> +       .long   0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
> +       .long   0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
> +       .long   0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
> +       .long   0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
> +
>         .align  64
> +       .type   __svml_stanh_data_internal_al64, @object
> +       .size   __svml_stanh_data_internal_al64, .-__svml_stanh_data_internal_al64
>         .type   __svml_stanh_data_internal, @object
>         .size   __svml_stanh_data_internal, .-__svml_stanh_data_internal
> --
> 2.34.1
>


-- 
H.J.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v2 5/7] x86: Add data file that can be shared by tanhf-avx2 and tanhf-sse4
  2022-06-09  0:05   ` [PATCH v2 5/7] x86: Add data file that can be shared by tanhf-avx2 and tanhf-sse4 Noah Goldstein
@ 2022-06-09 16:05     ` H.J. Lu
  0 siblings, 0 replies; 48+ messages in thread
From: H.J. Lu @ 2022-06-09 16:05 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Jun 8, 2022 at 5:06 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> tanhf-avx2 and tanhf-sse4 use the same data tables, so we can save
> over 4kb by using a shared data table. This does increase the memory
> footprint of the sse4 version (as now all the targets are 32 bytes
> instead of 16), but generally it seems worth the code size savings.
>
> NB: This patch doesn't do anything by itself; it is groundwork for
> future patches.
> ---
>  .../fpu/multiarch/svml_s_tanhf_rodata.S       | 621 ++++++++++++++++++
>  1 file changed, 621 insertions(+)
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_tanhf_rodata.S
>
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf_rodata.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf_rodata.S
> new file mode 100644
> index 0000000000..904fe5f588
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf_rodata.S
> @@ -0,0 +1,621 @@
> +/* Data tables for tanhf AVX2 and tanhf SSE4.
> +   Copyright (C) 2021-2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   https://www.gnu.org/licenses/.  */
> +
> +/* Offsets are ordered by use in the function. On cold-starts this
> +   might help the prefetcher. If the streaming prefetchers kick in it
> +   will prefetch into the lookup table.  */
> +#define _iExpMantMask                  0
> +#define _iMinIdxOfsMask                        32
> +#define _iMaxIdxMask                   64
> +#define _sAbsMask                      96
> +#define _iExpMask                      128
> +#define _lookupTable                   160
> +
> +#define TANHF_DATA(offset)             ((offset)+__svml_stanh_data_internal_avx2)
> +#ifndef ONLY_DECL_OFFSET
> +       .section .rodata, "a"
> +       .align  32
> +
> +# ifdef __svml_stanh_data_internal_typedef
> +typedef unsigned int VUINT32;
> +typedef struct
> +       {
> +       __declspec(align(32)) VUINT32 _iExpMantMask[8][1];
> +       __declspec(align(32)) VUINT32 _iMinIdxOfsMask[8][1];
> +       __declspec(align(32)) VUINT32 _iMaxIdxMask[8][1];
> +       __declspec(align(32)) VUINT32 _sAbsMask[8][1];
> +       __declspec(align(32)) VUINT32 _iExpMask[8][1];
> +       __declspec(align(32)) VUINT32 _lookupTable[(134*4)][2];
> +} __svml_stanh_data_internal;
> +# endif
> +
> +
> +__svml_stanh_data_internal:
> +       .globl  __svml_stanh_data_internal_avx2
> +__svml_stanh_data_internal_avx2:
> +       .align  32
> +       /* _iExpMantMask.  */
> +       .long   0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000
> +       .long   0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000
> +
> +       .align  32
> +       /* _iMinIdxOfsMask.  */
> +       .long   0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000
> +       .long   0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000
> +
> +       .align  32
> +       /* _iMaxIdxMask.  */
> +       .long   0x04280000, 0x04280000, 0x04280000, 0x04280000
> +       .long   0x04280000, 0x04280000, 0x04280000, 0x04280000
> +
> +       .align  32
> +       /* _sAbsMask.  */
> +       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> +       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> +
> +       .align  32
> +       /* _iExpMask.  */
> +       .long   0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000
> +       .long   0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000
> +
> +       .align  32
> +       /* _lookupTable.  */
> +       /* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500].  */
> +       .quad   0x0000000000000000 /* A00 = +0.000000000000000000000e-01.  */
> +       .quad   0x3FF00000022C70EB /* A01 = +1.000000008097283510367e+00.  */
> +       .quad   0xBED00E878CFFA194 /* A02 = -3.828228912518614443549e-06.  */
> +       .quad   0xBFD551766D0607A9 /* A03 = -3.330970825846813476723e-01.  */
> +       .quad   0xBE53D60CE3E4C297 /* A00 = -1.847383956330407336230e-08.  */
> +       .quad   0x3FF000024177CF5C /* A01 = +1.000002151235967140508e+00.  */
> +       .quad   0xBF1758BC94A51A25 /* A02 = -8.906031613262943753568e-05.  */
> +       .quad   0xBFD53EAE67E0D4F0 /* A03 = -3.319507612644221339337e-01.  */
> +       .quad   0xBE5A9E47EF32D6FE /* A00 = -2.479020984039698285657e-08.  */
> +       .quad   0x3FF00002DA983057 /* A01 = +1.000002721676556793895e+00.  */
> +       .quad   0xBF1BD953509E94AA /* A02 = -1.062352277175377670507e-04.  */
> +       .quad   0xBFD53BDB562EEDD5 /* A03 = -3.317783681520414806876e-01.  */
> +       .quad   0xBE6191BBE496D294 /* A00 = -3.272532162914017685901e-08.  */
> +       .quad   0x3FF0000390492017 /* A01 = +1.000003398528866105366e+00.  */
> +       .quad   0xBF20727E814A57CE /* A02 = -1.254825043772153972919e-04.  */
> +       .quad   0xBFD538DE060A6F22 /* A03 = -3.315959033004550748913e-01.  */
> +       .quad   0xBE66DAFA2A893A25 /* A00 = -4.257146219278012568149e-08.  */
> +       .quad   0x3FF0000465E08CD1 /* A01 = +1.000004194219219266770e+00.  */
> +       .quad   0xBF2341C765EF91B6 /* A02 = -1.469188600530365522261e-04.  */
> +       .quad   0xBFD535B6841FAF9E /* A03 = -3.314033785124993469751e-01.  */
> +       .quad   0xBE6D5794E361E964 /* A00 = -5.465394929765249413434e-08.  */
> +       .quad   0x3FF000055EE2A0CB /* A01 = +1.000005121846742950353e+00.  */
> +       .quad   0xBF265E6C77E66C8B /* A02 = -1.706607253709506650304e-04.  */
> +       .quad   0xBFD53264DDCCEDA6 /* A03 = -3.312008062382240103361e-01.  */
> +       .quad   0xBE729C844D374A6E /* A00 = -6.933284462462096107184e-08.  */
> +       .quad   0x3FF000067F019093 /* A01 = +1.000006195180536350264e+00.  */
> +       .quad   0xBF29CC5348D6DCE5 /* A02 = -1.968242326435338705130e-04.  */
> +       .quad   0xBFD52EE92121ED35 /* A03 = -3.309881995734998416658e-01.  */
> +       .quad   0xBE775AEA17EAA872 /* A00 = -8.700465590574974405858e-08.  */
> +       .quad   0x3FF00007CA1D66B8 /* A01 = +1.000007428656699559610e+00.  */
> +       .quad   0xBF2D8F5EB98A2637 /* A02 = -2.255252009216044881395e-04.  */
> +       .quad   0xBFD52B435CDF9128 /* A03 = -3.307655722585587376727e-01.  */
> +       .quad   0xBE7D04DA28C343F0 /* A00 = -1.081040272327705484794e-07.  */
> +       .quad   0x3FF000094443CCF5 /* A01 = +1.000008837375216730337e+00.  */
> +       .quad   0xBF30D5B76C947AE5 /* A02 = -2.568791210978817814332e-04.  */
> +       .quad   0xBFD52773A0776FAD /* A03 = -3.305329386764651045105e-01.  */
> +       .quad   0xBE81DD77A12C51C7 /* A00 = -1.331054169875768625701e-07.  */
> +       .quad   0x3FF0000AF1AFD2DA /* A01 = +1.000010437096696680470e+00.  */
> +       .quad   0xBF331230624C1680 /* A02 = -2.910011410651516805537e-04.  */
> +       .quad   0xBFD52379FC0B61DF /* A03 = -3.302903138515186909352e-01.  */
> +       .quad   0xBE85D04EEEB3C435 /* A00 = -1.625247628488202841012e-07.  */
> +       .quad   0x3FF0000CD6C9B1F2 /* A01 = +1.000012244238970726684e+00.  */
> +       .quad   0xBF357F0742FADDD4 /* A02 = -3.280060509313874068243e-04.  */
> +       .quad   0xBFD51F56806D0E81 /* A03 = -3.300377134475880880338e-01.  */
> +       .quad   0xBE8A6E289B59681B /* A00 = -1.969211333326924655065e-07.  */
> +       .quad   0x3FF0000EF8268F72 /* A01 = +1.000014275873550406715e+00.  */
> +       .quad   0xBF381E277A1B747A /* A02 = -3.680082682942575423093e-04.  */
> +       .quad   0xBFD51B093F1D6FD4 /* A03 = -3.297751537663746734808e-01.  */
> +       .quad   0xBE8FCBC40EE9ABD5 /* A00 = -2.368983653301529373887e-07.  */
> +       .quad   0x3FF000115A883B6C /* A01 = +1.000016549721943981410e+00.  */
> +       .quad   0xBF3AF17AC974B3D9 /* A02 = -4.111218235774406434303e-04.  */
> +       .quad   0xBFD516924A4C549C /* A03 = -3.295026517456081105450e-01.  */
> +       .quad   0xBE92FFBC60A3F956 /* A00 = -2.831066871072026054144e-07.  */
> +       .quad   0x3FF0001402DCED8A /* A01 = +1.000019084151832604590e+00.  */
> +       .quad   0xBF3DFAE9390C4801 /* A02 = -4.574603454311488280083e-04.  */
> +       .quad   0xBFD511F1B4D7DC3A /* A03 = -3.292202249571719585575e-01.  */
> +       .quad   0xBE9690A22F96D5AD /* A00 = -3.362443262393081632612e-07.  */
> +       .quad   0x3FF00016F63EFF5D /* A01 = +1.000021898173108825247e+00.  */
> +       .quad   0xBF409E2C839605BB /* A02 = -5.071370461992499986334e-04.  */
> +       .quad   0xBFD50D27924BEE00 /* A03 = -3.289278916051614487515e-01.  */
> +       .quad   0xBE9AA56C65E72A73 /* A00 = -3.970591019557469835586e-07.  */
> +       .quad   0x3FF0001A39F4A43E /* A01 = +1.000025011433776978009e+00.  */
> +       .quad   0xBF425BD74C3D6667 /* A02 = -5.602647074553602319844e-04.  */
> +       .quad   0xBFD50833F6E1ABA2 /* A03 = -3.286256705238718156536e-01.  */
> +       .quad   0xBE9F4BD4FF1A83B0 /* A00 = -4.663500013744687071912e-07.  */
> +       .quad   0x3FF0001DD36F9EC2 /* A01 = +1.000028444215715683896e+00.  */
> +       .quad   0xBF44376634149405 /* A02 = -6.169556656102642569831e-04.  */
> +       .quad   0xBFD50316F77EDEE5 /* A03 = -3.283135811757190158922e-01.  */
> +       .quad   0xBEA3B625387BB079 /* A00 = -5.874486399249461304297e-07.  */
> +       .quad   0x3FF00023E14CFBA9 /* A01 = +1.000034217911642153709e+00.  */
> +       .quad   0xBF47392F923218D2 /* A02 = -7.087213783883111826306e-04.  */
> +       .quad   0xBFD4FB1FACDEB938 /* A03 = -3.278273761924483942209e-01.  */
> +       .quad   0xBEAA6E24F543500A /* A00 = -7.876828740601738750574e-07.  */
> +       .quad   0x3FF0002D5C6E8412 /* A01 = +1.000043259679163742959e+00.  */
> +       .quad   0xBF4BAF02BD7FDD70 /* A02 = -8.448375110664940040861e-04.  */
> +       .quad   0xBFD4EFEE6527A7DE /* A03 = -3.271442401734229177279e-01.  */
> +       .quad   0xBEB16E3EBE2157D0 /* A00 = -1.038947396133402500647e-06.  */
> +       .quad   0x3FF00038990FEE2F /* A01 = +1.000053975962952312884e+00.  */
> +       .quad   0xBF50569481C574CB /* A02 = -9.972048056490652716971e-04.  */
> +       .quad   0xBFD4E419278DA2B4 /* A03 = -3.264220129263251113372e-01.  */
> +       .quad   0xBEB6A7B6723165D4 /* A00 = -1.350350836279403750524e-06.  */
> +       .quad   0x3FF00045CAB4158E /* A01 = +1.000066558657042303793e+00.  */
> +       .quad   0xBF531D7C9C849108 /* A02 = -1.166698160951775212202e-03.  */
> +       .quad   0xBFD4D7A0BB33B152 /* A03 = -3.256608799117844954552e-01.  */
> +       .quad   0xBEBD0EE2A8654AFD /* A00 = -1.732000471561702711532e-06.  */
> +       .quad   0x3FF00055276F18D6 /* A01 = +1.000081209219890521211e+00.  */
> +       .quad   0xBF562FDBA3FB6C6C /* A02 = -1.354183666925102939860e-03.  */
> +       .quad   0xBFD4CA85F1B93DB2 /* A03 = -3.248610363561638125773e-01.  */
> +       .quad   0xBEC269D4036A207E /* A00 = -2.195047297096822741730e-06.  */
> +       .quad   0x3FF00066E7DA6E4E /* A01 = +1.000098138500919997540e+00.  */
> +       .quad   0xBF5991499FC36B3A /* A02 = -1.560518167983372759405e-03.  */
> +       .quad   0xBFD4BCC9A72283D6 /* A03 = -3.240226871658341556426e-01.  */
> +       .quad   0xBEC7154B6C09CFE1 /* A00 = -2.751729738565190291276e-06.  */
> +       .quad   0x3FF0007B47086B80 /* A01 = +1.000117566559055148900e+00.  */
> +       .quad   0xBF5D455433B4F8F4 /* A02 = -1.786548832412968197680e-03.  */
> +       .quad   0xBFD4AE6CC1BFE145 /* A03 = -3.231460468373550942722e-01.  */
> +       .quad   0xBECCA68CC64A0F8A /* A00 = -3.415415948561670285790e-06.  */
> +       .quad   0x3FF00092827742F7 /* A01 = +1.000139722473418535387e+00.  */
> +       .quad   0xBF60A7BF15A527AF /* A02 = -2.033112728132522705610e-03.  */
> +       .quad   0xBFD49F703214084C /* A03 = -3.222313393636155876010e-01.  */
> +       .quad   0xBED19E68676B241B /* A00 = -4.200644630977303616698e-06.  */
> +       .quad   0x3FF000ACDA037B26 /* A01 = +1.000164844146362863597e+00.  */
> +       .quad   0xBF62D99F836A02F8 /* A02 = -2.301036405072284102280e-03.  */
> +       .quad   0xBFD48FD4F2B91B28 /* A03 = -3.212787981359945810311e-01.  */
> +       .quad   0xBED57CF4B0C7AA54 /* A00 = -5.123164339408145209103e-06.  */
> +       .quad   0x3FF000CA8FD9E1A1 /* A01 = +1.000193178099017865534e+00.  */
> +       .quad   0xBF653A014548E686 /* A02 = -2.591135484433962181405e-03.  */
> +       .quad   0xBFD47F9C0844B38F /* A03 = -3.202886658426046806447e-01.  */
> +       .quad   0xBEDA012B1B1A41E2 /* A00 = -6.199971197454598722328e-06.  */
> +       .quad   0x3FF000EBE868FDF4 /* A01 = +1.000224979259539459520e+00.  */
> +       .quad   0xBF67CA9427E0A544 /* A02 = -2.904214255086275467410e-03.  */
> +       .quad   0xBFD46EC6812ADB37 /* A03 = -3.192611943626845749655e-01.  */
> +       .quad   0xBEDF3EAC5BF12194 /* A00 = -7.449344990702664567927e-06.  */
> +       .quad   0x3FF001112A520784 /* A01 = +1.000260510744255704196e+00.  */
> +       .quad   0xBF6A8D01ABDA4DC4 /* A02 = -3.241065277345108255891e-03.  */
> +       .quad   0xBFD45D55759FFA4A /* A03 = -3.181966446572103146551e-01.  */
> +       .quad   0xBEE2A541BC274267 /* A00 = -8.890883582164319970972e-06.  */
> +       .quad   0x3FF0013A9E5961F2 /* A01 = +1.000300043631906721231e+00.  */
> +       .quad   0xBF6D82ECD080C540 /* A02 = -3.602468994380686462264e-03.  */
> +       .quad   0xBFD44B4A0779C0AD /* A03 = -3.170952866557950611259e-01.  */
> +       .quad   0xBEE61D97609A27F4 /* A00 = -1.054553560499505625520e-05.  */
> +       .quad   0x3FF001688F56A3AF /* A01 = +1.000343856731187974773e+00.  */
> +       .quad   0xBF7056F8EFB683EC /* A02 = -3.989193351487490407647e-03.  */
> +       .quad   0xBFD438A5620F0F74 /* A03 = -3.159573991399533543500e-01.  */
> +       .quad   0xBEEA145429EDD370 /* A00 = -1.243563138839952927732e-05.  */
> +       .quad   0x3FF0019B4A242A67 /* A01 = +1.000392236341804297339e+00.  */
> +       .quad   0xBF7207D31CA78D9B /* A02 = -4.401993423445739288258e-03.  */
> +       .quad   0xBFD42568BA16E7CD /* A03 = -3.147832696228050619602e-01.  */
> +       .quad   0xBEEE96370D52680F /* A00 = -1.458491207477835326165e-05.  */
> +       .quad   0x3FF001D31D8E4115 /* A01 = +1.000445476009251821736e+00.  */
> +       .quad   0xBF73D4CC11EDC094 /* A02 = -4.841611050196221316400e-03.  */
> +       .quad   0xBFD411954D8664E7 /* A03 = -3.135731942252974469021e-01.  */
> +       .quad   0xBEF338C046215EF8 /* A00 = -1.833122622260562810219e-05.  */
> +       .quad   0x3FF00230C32C2EC1 /* A01 = +1.000534784691737621998e+00.  */
> +       .quad   0xBF76BD019BCC5DAF /* A02 = -5.551344188254799492943e-03.  */
> +       .quad   0xBFD3F2C7156DC21E /* A03 = -3.116929730668135389848e-01.  */
> +       .quad   0xBEF9B15EAE411EAE /* A00 = -2.450261207822986676092e-05.  */
> +       .quad   0x3FF002C2DF057A4D /* A01 = +1.000674124886830940184e+00.  */
> +       .quad   0xBF7B08CCD9AC1E30 /* A02 = -6.600189396301511801646e-03.  */
> +       .quad   0xBFD3C7A7A114FED8 /* A03 = -3.090609620157755976777e-01.  */
> +       .quad   0xBF00E36483C373B3 /* A00 = -3.221178528332122595812e-05.  */
> +       .quad   0x3FF0036F419480D7 /* A01 = +1.000838524028997644777e+00.  */
> +       .quad   0xBF7FD255D1777007 /* A02 = -7.768950679260206403087e-03.  */
> +       .quad   0xBFD39A453911D6CE /* A03 = -3.062909180947429588215e-01.  */
> +       .quad   0xBF05DFA04DD12059 /* A00 = -4.172046622180685472624e-05.  */
> +       .quad   0x3FF00438B2A03D8D /* A01 = +1.001030633695197069599e+00.  */
> +       .quad   0xBF828F8DBB4A9D10 /* A02 = -9.062869337255224921890e-03.  */
> +       .quad   0xBFD36AAB704697D9 /* A03 = -3.033856007044711255993e-01.  */
> +       .quad   0xBF0BF3E0C647DEFB /* A00 = -5.331544597092331081714e-05.  */
> +       .quad   0x3FF005221063D36D /* A01 = +1.001253189109060359741e+00.  */
> +       .quad   0xBF857A2CB3C96102 /* A02 = -1.048693584122917590862e-02.  */
> +       .quad   0xBFD338E65BBB4FEC /* A03 = -3.003478904549854444639e-01.  */
> +       .quad   0xBF11A506ED7C9D31 /* A00 = -6.730894835681591541979e-05.  */
> +       .quad   0x3FF0062E4D0EA92A /* A01 = +1.001508999829250345925e+00.  */
> +       .quad   0xBF88AB82C2761AF3 /* A02 = -1.204588085125866091241e-02.  */
> +       .quad   0xBFD305028D6BD206 /* A03 = -2.971807843271395688234e-01.  */
> +       .quad   0xBF1607C0922D9BF1 /* A00 = -8.403885708006799337092e-05.  */
> +       .quad   0x3FF007606C341961 /* A01 = +1.001800940198869449560e+00.  */
> +       .quad   0xBF8C25E6DA487BCF /* A02 = -1.374416688582682892494e-02.  */
> +       .quad   0xBFD2CF0D0EE8F7B5 /* A03 = -2.938873906713255768075e-01.  */
> +       .quad   0xBF1B3A8480A0A16D /* A00 = -1.038688061788578038307e-04.  */
> +       .quad   0x3FF008BB802D02D6 /* A01 = +1.002131939589323561535e+00.  */
> +       .quad   0xBF8FEB8AE99FD100 /* A02 = -1.558598065819483124983e-02.  */
> +       .quad   0xBFD297135BD0911B /* A03 = -2.904709240558688843059e-01.  */
> +       .quad   0xBF20ABB9BDB75C65 /* A00 = -1.271881327357976163798e-04.  */
> +       .quad   0x3FF00A42A76D8CD1 /* A01 = +1.002504972472525901495e+00.  */
> +       .quad   0xBF91FF3D752BB9E6 /* A02 = -1.757522609380570560722e-02.  */
> +       .quad   0xBFD25D235C1F88B4 /* A03 = -2.869346999779154305799e-01.  */
> +       .quad   0xBF243D3254425461 /* A00 = -1.544116913733432829448e-04.  */
> +       .quad   0x3FF00BF909D1795E /* A01 = +1.002923048355647051011e+00.  */
> +       .quad   0xBF94304E04D44942 /* A02 = -1.971551804042204897316e-02.  */
> +       .quad   0xBFD2214B5E61CFA6 /* A03 = -2.832821294498394371075e-01.  */
> +       .quad   0xBF286070011B61CE /* A00 = -1.859795307186510085994e-04.  */
> +       .quad   0x3FF00DE1D5E1627E /* A01 = +1.003389201612804537689e+00.  */
> +       .quad   0xBF9689D5F4163F59 /* A02 = -2.201017668045266231780e-02.  */
> +       .quad   0xBFD1E39A11C3B42C /* A03 = -2.795167134743816728104e-01.  */
> +       .quad   0xBF2D250B366A79E8 /* A00 = -2.223564326486314902259e-04.  */
> +       .quad   0x3FF010003E134001 /* A01 = +1.003906481248123094829e+00.  */
> +       .quad   0xBF990C9FF91F6F81 /* A02 = -2.446222265267250853271e-02.  */
> +       .quad   0xBFD1A41E80084CDC /* A03 = -2.756420374218586655246e-01.  */
> +       .quad   0xBF314DB5DDC2A30E /* A00 = -2.640313157465248123865e-04.  */
> +       .quad   0x3FF012577608921B /* A01 = +1.004477940624503018441e+00.  */
> +       .quad   0xBF9BB9626875B0C9 /* A02 = -2.707437288829409385849e-02.  */
> +       .quad   0xBFD162E80768A9D0 /* A03 = -2.716617653228725615122e-01.  */
> +       .quad   0xBF346A6133808864 /* A00 = -3.115165050094957730625e-04.  */
> +       .quad   0x3FF014EAAFCC88A3 /* A01 = +1.005106627192198898157e+00.  */
> +       .quad   0xBF9E90BEF9BF7419 /* A02 = -2.984903716411588595059e-02.  */
> +       .quad   0xBFD12006545F7FAD /* A03 = -2.675796340899932457269e-01.  */
> +       .quad   0xBF37F180DC3848EA /* A00 = -3.653468704395550778821e-04.  */
> +       .quad   0x3FF017BD19147861 /* A01 = +1.005795572250939295955e+00.  */
> +       .quad   0xBFA0C9A14C702E07 /* A02 = -3.278831537326359207851e-02.  */
> +       .quad   0xBFD0DB895B650092 /* A03 = -2.633994476818851682154e-01.  */
> +       .quad   0xBF3BEC6AAC6D7635 /* A00 = -4.260788377246944457107e-04.  */
> +       .quad   0x3FF01AD1D884E719 /* A01 = +1.006547780778822565040e+00.  */
> +       .quad   0xBFA260B2A1B1434A /* A02 = -3.589399551186163439542e-02.  */
> +       .quad   0xBFD09581529E93D6 /* A03 = -2.591250712233067465817e-01.  */
> +       .quad   0xBF4164E26167882B /* A00 = -5.308251737086202562063e-04.  */
> +       .quad   0x3FF01FEF14B62B81 /* A01 = +1.007796364693348545316e+00.  */
> +       .quad   0xBFA4EB014538AA42 /* A02 = -4.085544557559163403315e-02.  */
> +       .quad   0xBFD029D36FEAF41F /* A03 = -2.525528519580024222613e-01.  */
> +       .quad   0xBF46F6FFF4E53DC8 /* A00 = -7.008313930700277652464e-04.  */
> +       .quad   0x3FF027CBB51CBBA0 /* A01 = +1.009715754956893363214e+00.  */
> +       .quad   0xBFA89DEC9FEC112E /* A02 = -4.807986690687680864098e-02.  */
> +       .quad   0xBFCF2A99464D0DB4 /* A03 = -2.434875100390009317053e-01.  */
> +       .quad   0xBF4DCC9C4F66A4D9 /* A00 = -9.094012482836712945103e-04.  */
> +       .quad   0x3FF030E7CFCCD583 /* A01 = +1.011939822882909068014e+00.  */
> +       .quad   0xBFACAA3B95814081 /* A02 = -5.598627281199331645611e-02.  */
> +       .quad   0xBFCDF78F156BE7CF /* A03 = -2.341173987004467604844e-01.  */
> +       .quad   0xBF5308ED74E5C7A6 /* A00 = -1.161796466103906435435e-03.  */
> +       .quad   0x3FF03B5986412ECB /* A01 = +1.014489674026594512313e+00.  */
> +       .quad   0xBFB087EBA88DCC3F /* A02 = -6.457398285947223148806e-02.  */
> +       .quad   0xBFCCBB9BD134862F /* A03 = -2.244753619680052991736e-01.  */
> +       .quad   0xBF57FA23C00DF4B5 /* A00 = -1.463446533505758208674e-03.  */
> +       .quad   0x3FF0473558A1BCC0 /* A01 = +1.017384859292903342975e+00.  */
> +       .quad   0xBFB2E702BC6360EF /* A02 = -7.383744334527241048871e-02.  */
> +       .quad   0xBFCB77D546379288 /* A03 = -2.145945160729250122955e-01.  */
> +       .quad   0xBF5DD12971557F71 /* A00 = -1.819887610814388068450e-03.  */
> +       .quad   0x3FF0548DDF5000A8 /* A01 = +1.020643112482540360020e+00.  */
> +       .quad   0xBFB571B63DA186E1 /* A02 = -8.376635555898871710045e-02.  */
> +       .quad   0xBFCA2D5202605148 /* A03 = -2.045080672838912594358e-01.  */
> +       .quad   0xBF6252B1AD5D4F17 /* A00 = -2.236697221556737096709e-03.  */
> +       .quad   0x3FF063738A910BF7 /* A01 = +1.024280110622155737232e+00.  */
> +       .quad   0xBFB8270C8E6B601B /* A02 = -9.434584118878357184013e-02.  */
> +       .quad   0xBFC8DD27D950A07E /* A03 = -1.942491351230763441116e-01.  */
> +       .quad   0xBF66470C91730CFC /* A00 = -2.719425723258004842786e-03.  */
> +       .quad   0x3FF073F468FCF331 /* A01 = +1.028309259519300633556e+00.  */
> +       .quad   0xBFBB05C2952191E4 /* A02 = -1.055566419686964629854e-01.  */
> +       .quad   0xBFC7886A770DE2BD /* A03 = -1.838505822486435070662e-01.  */
> +       .quad   0xBF6AD114AC8E98EC /* A00 = -3.273525599485007861467e-03.  */
> +       .quad   0x3FF0861BF53E5226 /* A01 = +1.032741506559554434119e+00.  */
> +       .quad   0xBFBE0C4F9B461507 /* A02 = -1.173753503881763554650e-01.  */
> +       .quad   0xBFC6302A037CDE3A /* A03 = -1.733448521642786954722e-01.  */
> +       .quad   0xBF6FFBDE2A6C2AF8 /* A00 = -3.904279630096648551207e-03.  */
> +       .quad   0x3FF099F2EB8E7DA3 /* A01 = +1.037585182326304034106e+00.  */
> +       .quad   0xBFC09C74D192DDF0 /* A02 = -1.297746680554463516444e-01.  */
> +       .quad   0xBFC4D571D8E3079F /* A03 = -1.627638157861470424859e-01.  */
> +       .quad   0xBF72E8FDC0B952AA /* A00 = -4.616728994353872309042e-03.  */
> +       .quad   0x3FF0AF7F273C9533 /* A01 = +1.042845872181101141152e+00.  */
> +       .quad   0xBFC244C512736F10 /* A02 = -1.427236881344176033792e-01.  */
> +       .quad   0xBFC379474F58B902 /* A03 = -1.521386277613104298645e-01.  */
> +       .quad   0xBF762EABAF17395B /* A00 = -5.415602341101023557701e-03.  */
> +       .quad   0x3FF0C6C3886F63FB /* A01 = +1.048526318502125631582e+00.  */
> +       .quad   0xBFC3FDF9918EA12A /* A02 = -1.561881981590514389957e-01.  */
> +       .quad   0xBFC21CA89ECAB895 /* A03 = -1.414995932913753196036e-01.  */
> +       .quad   0xBF79D387CE5B2BAE /* A00 = -6.305246822828998107258e-03.  */
> +       .quad   0x3FF0DFBFE2346376 /* A01 = +1.054626353847394337748e+00.  */
> +       .quad   0xBFC5C6DA43602620 /* A02 = -1.701309994680721970894e-01.  */
> +       .quad   0xBFC0C08BD8DB6631 /* A03 = -1.308760460731704100557e-01.  */
> +       .quad   0xBF7DDBA8E8DA9060 /* A00 = -7.289562037531366334164e-03.  */
> +       .quad   0x3FF0FA70F0D1B464 /* A01 = +1.061142864894713433443e+00.  */
> +       .quad   0xBFC79E18D92BAA7C /* A02 = -1.845122394946264732241e-01.  */
> +       .quad   0xBFBECBBBF74C2669 /* A03 = -1.202962378266875381749e-01.  */
> +       .quad   0xBF81254E76EA25DA /* A00 = -8.371937755572145950511e-03.  */
> +       .quad   0x3FF116D05835EBD0 /* A01 = +1.068069786618014660462e+00.  */
> +       .quad   0xBFC982539E2ED224 /* A02 = -1.992897531869327609755e-01.  */
> +       .quad   0xBFBC1B043C350159 /* A03 = -1.097872397413132278254e-01.  */
> +       .quad   0xBF8391ACBA863403 /* A00 = -9.555196230190082448686e-03.  */
> +       .quad   0x3FF134D4AA477FE2 /* A01 = +1.075398125794884141015e+00.  */
> +       .quad   0xBFCB7218609FEAFB /* A02 = -2.144194099235717521079e-01.  */
> +       .quad   0xBFB970A16CB88329 /* A03 = -9.937485603633135211599e-02.  */
> +       .quad   0xBF87935088E48E8B /* A00 = -1.151144902957603431692e-02.  */
> +       .quad   0x3FF1649892AD7DD3 /* A01 = +1.087059567413110938716e+00.  */
> +       .quad   0xBFCE6971DDE75409 /* A02 = -2.375929196847723912089e-01.  */
> +       .quad   0xBFB58291E88CB251 /* A03 = -8.402358939628952472223e-02.  */
> +       .quad   0xBF8DB3A62C325325 /* A00 = -1.450280973794233242702e-02.  */
> +       .quad   0x3FF1A9C900C6DEEA /* A01 = +1.103951457056548068891e+00.  */
> +       .quad   0xBFD13DBC65B0E08E /* A02 = -2.693930619311765140012e-01.  */
> +       .quad   0xBFB06696F62696D1 /* A03 = -6.406539449252625362252e-02.  */
> +       .quad   0xBF92583699F2E27A /* A00 = -1.791463198307716858659e-02.  */
> +       .quad   0x3FF1F451B85AA9F0 /* A01 = +1.122148246892376022288e+00.  */
> +       .quad   0xBFD34FD5F8288180 /* A02 = -3.017477916164565954205e-01.  */
> +       .quad   0xBFA6FB692825B683 /* A03 = -4.488686194495718900788e-02.  */
> +       .quad   0xBF9641C26E673D6F /* A00 = -2.173522757385398448959e-02.  */
> +       .quad   0x3FF24364DA5E2B07 /* A01 = +1.141453602790251542487e+00.  */
> +       .quad   0xBFD564A5A5EF5890 /* A02 = -3.342680092295120530821e-01.  */
> +       .quad   0xBF9B43712011A982 /* A03 = -2.662445791467283467968e-02.  */
> +       .quad   0xBF9A901038EC2F39 /* A00 = -2.594018313816024226548e-02.  */
> +       .quad   0x3FF2961356DFFEBA /* A01 = +1.161639537196534011088e+00.  */
> +       .quad   0xBFD775EBB17198C7 /* A02 = -3.665723069046972759644e-01.  */
> +       .quad   0xBF833B1A926CD462 /* A03 = -9.390075295963199591975e-03.  */
> +       .quad   0xBF9F396A6A461B91 /* A00 = -3.049246095317987084727e-02.  */
> +       .quad   0x3FF2EB53BAEF534B /* A01 = +1.182452898229899629357e+00.  */
> +       .quad   0xBFD97DABF8AD8BBD /* A02 = -3.982953957076310058660e-01.  */
> +       .quad   0x3F7B8F6A3E0F8837 /* A03 = +6.728568086119371925713e-03.  */
> +       .quad   0xBFA21878590F8BAA /* A00 = -3.534294211546946951064e-02.  */
> +       .quad   0x3FF34209790236E1 /* A01 = +1.203622315111197105253e+00.  */
> +       .quad   0xBFDB764C0E71BECB /* A02 = -4.290952817018306997277e-01.  */
> +       .quad   0x3F962FE0C03F84C0 /* A03 = +2.166701482190513949888e-02.  */
> +       .quad   0xBFA4B36B9AD27ECC /* A00 = -4.043136849327097492868e-02.  */
> +       .quad   0x3FF3990C5B12FC16 /* A01 = +1.224865298994477935679e+00.  */
> +       .quad   0xBFDD5AABB0D01390 /* A02 = -4.586590983092770912322e-01.  */
> +       .quad   0x3FA21DAF5CA162DB /* A03 = +3.538272863142363083844e-02.  */
> +       .quad   0xBFA7645E4D7BF28B /* A00 = -4.568762489177399105378e-02.  */
> +       .quad   0x3FF3EF2FD51C0D9F /* A01 = +1.245895225962932562069e+00.  */
> +       .quad   0xBFDF26377E1B686E /* A02 = -4.867075664057044503963e-01.  */
> +       .quad   0x3FA8803E756EE812 /* A03 = +4.785342391501513914509e-02.  */
> +       .quad   0xBFAA210925C64413 /* A00 = -5.103329263796054643398e-02.  */
> +       .quad   0x3FF44349F897D8E7 /* A01 = +1.266427966181760345066e+00.  */
> +       .quad   0xBFE06A7B02C6D8E2 /* A02 = -5.129981092675530707226e-01.  */
> +       .quad   0x3FAE3F194734F5D0 /* A03 = +5.907515520309980505687e-02.  */
> +       .quad   0xBFACDE48F8A19BBB /* A00 = -5.638340029764018351832e-02.  */
> +       .quad   0x3FF49439D5466582 /* A01 = +1.286187966447272845727e+00.  */
> +       .quad   0xBFE131C7C1063DDC /* A02 = -5.373266954429101183166e-01.  */
> +       .quad   0x3FB1ADEEC36AD805 /* A03 = +6.906025191241844940482e-02.  */
> +       .quad   0xBFAF905D8F585680 /* A00 = -6.164829611604449866036e-02.  */
> +       .quad   0x3FF4E0ED1FD27F99 /* A01 = +1.304913639360142818546e+00.  */
> +       .quad   0xBFE1E7A859DC1D3D /* A02 = -5.595285182070380836095e-01.  */
> +       .quad   0x3FB3ED018E4642A1 /* A03 = +7.783517573831001679086e-02.  */
> +       .quad   0xBFB11595104160BA /* A00 = -6.673556944713512906198e-02.  */
> +       .quad   0x3FF528650340490B /* A01 = +1.322361958217302513319e+00.  */
> +       .quad   0xBFE28B14B40BC974 /* A02 = -5.794776455425521000109e-01.  */
> +       .quad   0x3FB5DF49F5BAF6D7 /* A03 = +8.543836831355676453281e-02.  */
> +       .quad   0xBFB2513A97344BA4 /* A00 = -7.155195418844911836587e-02.  */
> +       .quad   0x3FF569BA0DB5EE14 /* A01 = +1.338312200124055273420e+00.  */
> +       .quad   0xBFE31B53A8B67B20 /* A02 = -5.970857901737396389308e-01.  */
> +       .quad   0x3FB787F297BB0544 /* A03 = +9.191814617499455275507e-02.  */
> +       .quad   0xBFB37512E848FAFA /* A00 = -7.600515528700305112331e-02.  */
> +       .quad   0x3FF5A41F33B403C8 /* A01 = +1.352568819013173495591e+00.  */
> +       .quad   0xBFE397F6EA9A58A5 /* A02 = -6.123003561103997904880e-01.  */
> +       .quad   0x3FB8EAA9FF25CA06 /* A03 = +9.733068923177520814782e-02.  */
> +       .quad   0xBFB47B3E603AFC5D /* A00 = -8.000554894805263217439e-02.  */
> +       .quad   0x3FF5D6E3EDE40487 /* A01 = +1.364963464031718975988e+00.  */
> +       .quad   0xBFE400D5BCA6D631 /* A02 = -6.251019177058819709103e-01.  */
> +       .quad   0x3FBA0B830ED567FE /* A03 = +1.017381583418739132707e-01.  */
> +       .quad   0xBFB5BBFE8AC90496 /* A00 = -8.489981544791400103200e-02.  */
> +       .quad   0x3FF612BA70107E95 /* A01 = +1.379572332145390989311e+00.  */
> +       .quad   0xBFE477EAF1FA7693 /* A02 = -6.396383978023599814478e-01.  */
> +       .quad   0x3FBB4784B7C08A95 /* A03 = +1.065600346196709652391e-01.  */
> +       .quad   0xBFB6D5D940743939 /* A00 = -8.920057128509463473254e-02.  */
> +       .quad   0x3FF644A8748F70CE /* A01 = +1.391762214006166953340e+00.  */
> +       .quad   0xBFE4D646AB07EA37 /* A02 = -6.511567440459832267763e-01.  */
> +       .quad   0x3FBC354F4E1D5292 /* A03 = +1.101884427747086558913e-01.  */
> +       .quad   0xBFB7223D19E4F3D1 /* A00 = -9.036619074045339206069e-02.  */
> +       .quad   0x3FF6518FEB42B7FA /* A01 = +1.394912642466350494175e+00.  */
> +       .quad   0xBFE4ED86CB87498C /* A02 = -6.539949393430091184598e-01.  */
> +       .quad   0x3FBC6D29F28CCA9B /* A03 = +1.110407082713131127205e-01.  */
> +       .quad   0xBFB6878652FF6312 /* A00 = -8.800544287022329936754e-02.  */
> +       .quad   0x3FF63948C302D040 /* A01 = +1.388985406648330922508e+00.  */
> +       .quad   0xBFE4C4E2E7904E17 /* A02 = -6.490339777687407218920e-01.  */
> +       .quad   0x3FBC127356CA1ABE /* A03 = +1.096565329445224612481e-01.  */
> +       .quad   0xBFB4F5D18B0C91D6 /* A00 = -8.187589306596207427980e-02.  */
> +       .quad   0x3FF5FD27EB7DD0B8 /* A01 = +1.374305648697413673176e+00.  */
> +       .quad   0xBFE464E01A2B2FC6 /* A02 = -6.373138915164353601739e-01.  */
> +       .quad   0x3FBB460547674A30 /* A03 = +1.065371798825160976065e-01.  */
> +       .quad   0xBFB26642FA16A685 /* A00 = -7.187288861919156890412e-02.  */
> +       .quad   0x3FF59F9BEDE1C95A /* A01 = +1.351467065073470141812e+00.  */
> +       .quad   0xBFE3D67920C8FBEA /* A02 = -6.199308052381387046381e-01.  */
> +       .quad   0x3FBA24F6A8D3CBC1 /* A03 = +1.021265184570401413078e-01.  */
> +       .quad   0xBFADB5294794F097 /* A00 = -5.802277563859197656582e-02.  */
> +       .quad   0x3FF523EA7B9CF453 /* A01 = +1.321268542159732772845e+00.  */
> +       .quad   0xBFE322A8B55E35DB /* A02 = -5.979808370918208160205e-01.  */
> +       .quad   0x3FB8C8673B1B3E37 /* A03 = +9.680791085269722928697e-02.  */
> +       .quad   0xBFA4B7D661965C6A /* A00 = -4.046506825687219699450e-02.  */
> +       .quad   0x3FF48DE3E2CE3122 /* A01 = +1.284641157110919085227e+00.  */
> +       .quad   0xBFE251FED1A7F445 /* A02 = -5.725092024655472622285e-01.  */
> +       .quad   0x3FB745699FCABDB9 /* A03 = +9.090290213747821701507e-02.  */
> +       .quad   0xBF93E60456E4EE1D /* A00 = -1.943213253365004902773e-02.  */
> +       .quad   0x3FF3E1A14E628A59 /* A01 = +1.242585474196536532432e+00.  */
> +       .quad   0xBFE16C5AB660E876 /* A02 = -5.444768488007543094653e-01.  */
> +       .quad   0x3FB5AD33AA8C188F /* A03 = +8.467410005332197397987e-02.  */
> +       .quad   0x3F738C17C47C7961 /* A00 = +4.772274820224659853951e-03.  */
> +       .quad   0x3FF3234DDE3BD146 /* A01 = +1.196119182682268355933e+00.  */
> +       .quad   0xBFE078C0D77A9D3B /* A02 = -5.147403915952176722826e-01.  */
> +       .quad   0x3FB40D74B3E276B8 /* A03 = +7.833032027925923568290e-02.  */
> +       .quad   0x3FA0474BECC689C7 /* A00 = +3.179394975019849550746e-02.  */
> +       .quad   0x3FF256FB4FA7D18A /* A01 = +1.146235762743432307076e+00.  */
> +       .quad   0xBFDEFA8E3FB285E2 /* A02 = -4.840427038235174395098e-01.  */
> +       .quad   0x3FB270C007493D59 /* A03 = +7.203293016322244446403e-02.  */
> +       .quad   0x3FAF5BD51E479BDC /* A00 = +6.124750132203590768931e-02.  */
> +       .quad   0x3FF18081D0B53BC5 /* A01 = +1.093873801484492647162e+00.  */
> +       .quad   0xBFDCFE2439BD0C03 /* A02 = -4.530115665294831006626e-01.  */
> +       .quad   0x3FB0DEFE5A45AFDD /* A03 = +6.590261176978580437424e-02.  */
> +       .quad   0x3FB7BD5D2806EA26 /* A00 = +9.273321368429118805032e-02.  */
> +       .quad   0x3FF0A369E35B4440 /* A01 = +1.039895904647224256223e+00.  */
> +       .quad   0xBFDB04BC5C9951E7 /* A02 = -4.221640495573226181669e-01.  */
> +       .quad   0x3FAEBBBAA9D6DEEF /* A03 = +6.002600978120919278380e-02.  */
> +       .quad   0x3FC01BE411098DBC /* A00 = +1.258511622610124502941e-01.  */
> +       .quad   0x3FEF85BDABC031C1 /* A01 = +9.850757936961188621083e-01.  */
> +       .quad   0xBFD91521375097C2 /* A02 = -3.919146576102968682065e-01.  */
> +       .quad   0x3FABE26F0086D982 /* A03 = +5.446192628317005068883e-02.  */
> +       .quad   0x3FC481D7FF5776B9 /* A00 = +1.602125164781023347604e-01.  */
> +       .quad   0x3FEDC3506C1E7218 /* A01 = +9.300920592973538347792e-01.  */
> +       .quad   0xBFD7349A88DA7D4F /* A02 = -3.625856720409119104964e-01.  */
> +       .quad   0x3FA936E2DFF8E2AE /* A03 = +4.924687370334389358018e-02.  */
> +       .quad   0x3FC90471F96FA27A /* A00 = +1.954481571149420671141e-01.  */
> +       .quad   0x3FEC0451601987A2 /* A01 = +8.755270840595026360376e-01.  */
> +       .quad   0xBFD5671CD4B898DC /* A02 = -3.344184949259110251063e-01.  */
> +       .quad   0x3FA6BB9594603B67 /* A03 = +4.439990459660841243261e-02.  */
> +       .quad   0x3FCFD8ADB9ED944C /* A00 = +2.488000066615846384011e-01.  */
> +       .quad   0x3FE978C073F6809A /* A01 = +7.959902062321078108909e-01.  */
> +       .quad   0xBFD2DF7E00BCD5A9 /* A02 = -2.948908812716931060471e-01.  */
> +       .quad   0x3FA3614033D490B2 /* A03 = +3.785133965200894456959e-02.  */
> +       .quad   0x3FD4846A12AFE5A0 /* A00 = +3.205819303981005674586e-01.  */
> +       .quad   0x3FE63A1147D40472 /* A01 = +6.945883181471244061100e-01.  */
> +       .quad   0xBFCFA2268AD34450 /* A02 = -2.471359422548027318101e-01.  */
> +       .quad   0x3F9F150201D9FFE0 /* A03 = +3.035357605267552383310e-02.  */
> +       .quad   0x3FD9018641F82BEB /* A00 = +3.907180446846598154131e-01.  */
> +       .quad   0x3FE33B7C220FFBDC /* A01 = +6.010113396913498995389e-01.  */
> +       .quad   0xBFCA4E4187E29C86 /* A02 = -2.055131829740483584423e-01.  */
> +       .quad   0x3F98C30CED19F8F4 /* A03 = +2.418155858185229434287e-02.  */
> +       .quad   0x3FDD4B8255BEB078 /* A00 = +4.577337109901757905561e-01.  */
> +       .quad   0x3FE0858B19D3A49B /* A01 = +5.163016800335243905451e-01.  */
> +       .quad   0xBFC5BC929EACE564 /* A02 = -1.698172831327539045176e-01.  */
> +       .quad   0x3F93A083CE57DE2B /* A03 = +1.916700312537337677621e-02.  */
> +       .quad   0x3FE0A8E5E039295C /* A00 = +5.206174258576470315063e-01.  */
> +       .quad   0x3FDC35E1234583FE /* A01 = +4.407885403107342225937e-01.  */
> +       .quad   0xBFC1DE034E31AEB9 /* A02 = -1.395877963835710222629e-01.  */
> +       .quad   0x3F8EFDEBB3471BDC /* A03 = +1.513275280821162888101e-02.  */
> +       .quad   0x3FE2851B603CB2A5 /* A00 = +5.787484054213406503564e-01.  */
> +       .quad   0x3FD7F4A44ABBB286 /* A01 = +3.743067483726821853551e-01.  */
> +       .quad   0xBFBD3EEB67087DE7 /* A02 = -1.142413260026767657385e-01.  */
> +       .quad   0x3F8864F38329E8BD /* A03 = +1.191129917173260922836e-02.  */
> +       .quad   0x3FE437DBE3C34AC1 /* A00 = +6.318187187665317283702e-01.  */
> +       .quad   0x3FD43F6F789441B5 /* A01 = +3.163717916040938438194e-01.  */
> +       .quad   0xBFB7D92E7901B9A4 /* A02 = -9.315767721429907277653e-02.  */
> +       .quad   0x3F8327ED342308E1 /* A03 = +9.353497651663324544136e-03.  */
> +       .quad   0x3FE5C0977766D55C /* A00 = +6.797597248138731451661e-01.  */
> +       .quad   0x3FD10B42A764D8F9 /* A01 = +2.663122782427219115142e-01.  */
> +       .quad   0xBFB3633351D3D70F /* A02 = -7.573242900602060456716e-02.  */
> +       .quad   0x3F7E079E30FF899C /* A03 = +7.331483779099558922843e-03.  */
> +       .quad   0x3FE7202CE08A88C4 /* A00 = +7.226776490754436288455e-01.  */
> +       .quad   0x3FCC973EB5662B01 /* A01 = +2.233656297433626314319e-01.  */
> +       .quad   0xBFAF70A455F9920B /* A02 = -6.140626477716545211782e-02.  */
> +       .quad   0x3F77812411CE99B6 /* A03 = +5.738392731393584730859e-03.  */
> +       .quad   0x3FE85879424095B1 /* A00 = +7.608000082006382003286e-01.  */
> +       .quad   0x3FC7E73BD1674D84 /* A01 = +1.867441914060742336190e-01.  */
> +       .quad   0xBFA96F84E4BF333B /* A02 = -4.967894832916504993525e-02.  */
> +       .quad   0x3F72606DDCA6E117 /* A03 = +4.486493251924870105662e-03.  */
> +       .quad   0x3FE96BFE4957F4DD /* A00 = +7.944327766887472330737e-01.  */
> +       .quad   0x3FC3ED4780D25478 /* A01 = +1.556786898624158421711e-01.  */
> +       .quad   0xBFA489C5F9A56B58 /* A02 = -4.011362717093075458408e-02.  */
> +       .quad   0x3F6CB5DC17E9AD2A /* A03 = +3.504686231556104931972e-03.  */
> +       .quad   0x3FEA5D9CB2F41234 /* A00 = +8.239272589858672724006e-01.  */
> +       .quad   0x3FC091A758374DCF /* A01 = +1.294449978582705440555e-01.  */
> +       .quad   0xBFA08E436D4B5CE0 /* A02 = -3.233538350257858517978e-02.  */
> +       .quad   0x3F666997AD53E6B7 /* A03 = +2.735897297154145629133e-03.  */
> +       .quad   0x3FEB3060342CB850 /* A00 = +8.496552485501158713532e-01.  */
> +       .quad   0x3FBB7D30BBC7DC1B /* A01 = +1.073790033768634993860e-01.  */
> +       .quad   0xBF9AA6BA3443D9E3 /* A02 = -2.602663940430173170060e-02.  */
> +       .quad   0x3F617CA764B7850B /* A03 = +2.134634914668814050648e-03.  */
> +       .quad   0x3FEBE759A6A0C7B8 /* A00 = +8.719909910635044170135e-01.  */
> +       .quad   0x3FB6C10DE6A703FF /* A01 = +8.888327485239243264115e-02.  */
> +       .quad   0xBF956C566D8BE1F6 /* A02 = -2.092108768099084498138e-02.  */
> +       .quad   0x3F5B46D1A4A59CF8 /* A03 = +1.664833764687232917079e-03.  */
> +       .quad   0x3FEC858494887A04 /* A00 = +8.912985707318630268503e-01.  */
> +       .quad   0x3FB2CC31F543394D /* A01 = +7.342827070099140762682e-02.  */
> +       .quad   0xBF9133477FF69137 /* A02 = -1.679717749142747504343e-02.  */
> +       .quad   0x3F5544482FBB4DA5 /* A03 = +1.298017973501022466823e-03.  */
> +       .quad   0x3FED0DB59D0E32E9 /* A00 = +9.079235141267335551518e-01.  */
> +       .quad   0x3FAF006BAFFC6EF4 /* A01 = +6.055008433597022787787e-02.  */
> +       .quad   0xBF8B97146FA2B97A /* A02 = -1.347175565419144252499e-02.  */
> +       .quad   0x3F5093B01F4CDC69 /* A03 = +1.011774057770665211434e-03.  */
> +       .quad   0x3FEDB487C3EC457C /* A00 = +9.282873942012623835751e-01.  */
> +       .quad   0x3FA7390C09D0BD1D /* A01 = +4.535710925881118044112e-02.  */
> +       .quad   0xBF83D9F7C3181106 /* A02 = -9.693084374710735778846e-03.  */
> +       .quad   0x3F46E34A0A3C0E64 /* A03 = +6.984817050299072134500e-04.  */
> +       .quad   0x3FEE5FFCB4E6EB00 /* A00 = +9.492171796076434020506e-01.  */
> +       .quad   0x3F9F4913ED00AADF /* A01 = +3.055220731782070861526e-02.  */
> +       .quad   0xBF79670BD0E59B5C /* A02 = -6.201788097633133961528e-03.  */
> +       .quad   0x3F3BC998EBCAF96D /* A03 = +4.240034429975534616304e-04.  */
> +       .quad   0x3FEEDBA41E9542FE /* A00 = +9.643116566968215064293e-01.  */
> +       .quad   0x3F94F5DD18D9C24D /* A01 = +2.046914543319848858727e-02.  */
> +       .quad   0xBF7034896AA122B9 /* A02 = -3.956352980886528904192e-03.  */
> +       .quad   0x3F30DCCB47810B39 /* A03 = +2.573009765038273091199e-04.  */
> +       .quad   0x3FEF33F2882520ED /* A00 = +9.750912341196716903724e-01.  */
> +       .quad   0x3F8BF37F2CF553FF /* A01 = +1.364802699996836392315e-02.  */
> +       .quad   0xBF649F6F05A69619 /* A02 = -2.517430152880317534986e-03.  */
> +       .quad   0x3F247623C950AAC9 /* A03 = +1.561087307505231250044e-04.  */
> +       .quad   0x3FEF727757751741 /* A00 = +9.827229221489021115943e-01.  */
> +       .quad   0x3F828E67912C4400 /* A01 = +9.060677640748693306705e-03.  */
> +       .quad   0xBF5A2F51A806CC2C /* A02 = -1.598195784123355826789e-03.  */
> +       .quad   0x3F18D35D7687E613 /* A03 = +9.470231965016282719549e-05.  */
> +       .quad   0x3FEF9E6325C5942A /* A00 = +9.880843866091073568469e-01.  */
> +       .quad   0x3F788AB117618F76 /* A01 = +5.991641772286606867914e-03.  */
> +       .quad   0xBF5096EAB0B1EA89 /* A02 = -1.012543859160305046233e-03.  */
> +       .quad   0x3F0E1E50EC4435AB /* A03 = +5.744633156910412119652e-05.  */
> +       .quad   0x3FEFBD0784049369 /* A00 = +9.918248728250605994461e-01.  */
> +       .quad   0x3F702BBD8294035F /* A01 = +3.947963975634432264028e-03.  */
> +       .quad   0xBF44FB55E0F00593 /* A02 = -6.403130845457509273330e-04.  */
> +       .quad   0x3F0244DCD723230A /* A03 = +3.484534217219031730379e-05.  */
> +       .quad   0x3FEFD245E2366A43 /* A00 = +9.944180887426415926811e-01.  */
> +       .quad   0x3F653D82EC088433 /* A01 = +2.592807490387838333795e-03.  */
> +       .quad   0xBF3A7DF75E013CB8 /* A02 = -4.042366908878036561859e-04.  */
> +       .quad   0x3EF6298E69F991CD /* A03 = +2.113564425911141559972e-05.  */
> +       .quad   0x3FEFE0EAA508BC69 /* A00 = +9.962056372950317539861e-01.  */
> +       .quad   0x3F5BD0771AF3FDDA /* A01 = +1.697651208644282514598e-03.  */
> +       .quad   0xBF30B2E1254DE571 /* A02 = -2.548026725928887099328e-04.  */
> +       .quad   0x3EEAE28B70EC0256 /* A03 = +1.281973848454955042307e-05.  */
> +       .quad   0x3FEFEAF5303D7F96 /* A00 = +9.974313680831865536192e-01.  */
> +       .quad   0x3F5229111365657E /* A01 = +1.108423877289460134782e-03.  */
> +       .quad   0xBF250572D04DFE66 /* A02 = -1.603796628408704519168e-04.  */
> +       .quad   0x3EE04E89BB57C981 /* A03 = +7.775682983689149966743e-06.  */
> +       .quad   0x3FEFF1CF52F1CF44 /* A00 = +9.982678051005469122003e-01.  */
> +       .quad   0x3F47A71316147CEB /* A01 = +7.218211359577819110842e-04.  */
> +       .quad   0xBF1A6D7604055719 /* A02 = -1.008132248946049582547e-04.  */
> +       .quad   0x3ED3C8047586A85C /* A03 = +4.716233739913014633626e-06.  */
> +       .quad   0x3FEFF6770369EF69 /* A00 = +9.988360468555416149528e-01.  */
> +       .quad   0x3F3EBB261180FBF0 /* A01 = +4.689186039321105101130e-04.  */
> +       .quad   0xBF1097754FE19D7F /* A02 = -6.329206004950480057066e-05.  */
> +       .quad   0x3EC7FEFF83BCA0A7 /* A03 = +2.860556404988488738366e-06.  */
> +       .quad   0x3FEFF99D42371AC4 /* A00 = +9.992204945818561334647e-01.  */
> +       .quad   0x3F33EB2AEC271F59 /* A01 = +3.039340773764907474054e-04.  */
> +       .quad   0xBF04CF18E0FC0D79 /* A02 = -3.968996690952969588805e-05.  */
> +       .quad   0x3EBD1BDBD6019BE9 /* A03 = +1.735021065507727833886e-06.  */
> +       .quad   0x3FEFFBBCA32B0D91 /* A00 = +9.994795977476532700123e-01.  */
> +       .quad   0x3F29C41E1615110A /* A01 = +1.965796209707565346710e-04.  */
> +       .quad   0xBEFA11F93D9DCB5A /* A02 = -2.486248909101414873235e-05.  */
> +       .quad   0x3EB1A7CA4546F7A7 /* A03 = +1.052345642723709228769e-06.  */
> +       .quad   0x3FEFFD298B8E8DE2 /* A00 = +9.996535993308806045121e-01.  */
> +       .quad   0x3F20A1C42D523C5B /* A01 = +1.268913244172078754520e-04.  */
> +       .quad   0xBEF0507A364AFAE4 /* A02 = -1.555859070622834605755e-05.  */
> +       .quad   0x3EA56ACA17E7CDF4 /* A03 = +6.382806956848098872313e-07.  */
> +       .quad   0x3FEFFE1DC82BA5A3 /* A00 = +9.997700604991915929176e-01.  */
> +       .quad   0x3F156E73B90F1769 /* A01 = +8.175450626798714452801e-05.  */
> +       .quad   0xBEE4663579D0A09F /* A02 = -9.727122057226747625365e-06.  */
> +       .quad   0x3E99FAF6FEC5D4C1 /* A03 = +3.871371052824002996020e-07.  */
> +       .quad   0x3FEFFEF8D0BB5E81 /* A00 = +9.998745037837154514548e-01.  */
> +       .quad   0x3F06686DA18D39C3 /* A01 = +4.273972098777251447726e-05.  */
> +       .quad   0xBED46BC298073E90 /* A02 = -4.868731025855742842491e-06.  */
> +       .quad   0x3E88E42286B9D0FD /* A03 = +1.854535328530838170114e-07.  */
> +       .quad   0x3FEFFF8DBC68DDC7 /* A00 = +9.999455146670975791423e-01.  */
> +       .quad   0x3EF26B2953A80AF0 /* A01 = +1.756534514108903368909e-05.  */
> +       .quad   0xBEBFC4472D580F83 /* A02 = -1.893443529411295465239e-06.  */
> +       .quad   0x3E72505B4553D19F /* A03 = +6.822456673547912277047e-08.  */
> +       .quad   0x3FEFFFCED1276609 /* A00 = +9.999765477215883935358e-01.  */
> +       .quad   0x3EDE1A94C7CC58F5 /* A01 = +7.177313020153979672606e-06.  */
> +       .quad   0xBEA8A2C988744E57 /* A02 = -7.342066660497443762363e-07.  */
> +       .quad   0x3E5AF30036BBBAF4 /* A03 = +2.509841882843541084885e-08.  */
> +       .quad   0x3FEFFFEAFE70FCFC /* A00 = +9.999899835164849370983e-01.  */
> +       .quad   0x3EC879175E3549F5 /* A01 = +2.917410471128503564412e-06.  */
> +       .quad   0xBE930E36677D1813 /* A02 = -2.839493400307523115929e-07.  */
> +       .quad   0x3E43D4005B42D48F /* A03 = +9.233192745401904898013e-09.  */
> +       .quad   0x3ff0000000000000
> +       .quad   0x0000000000000000
> +       .quad   0x0000000000000000
> +       .quad   0x0000000000000000
> +
> +       .align  32
> +       .type   __svml_stanh_data_internal_avx2, @object
> +       .size   __svml_stanh_data_internal_avx2, .-__svml_stanh_data_internal_avx2
> +       .type   __svml_stanh_data_internal, @object
> +       .size   __svml_stanh_data_internal, .-__svml_stanh_data_internal
> +#endif
> --
> 2.34.1
>

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v2 6/7] x86: Optimize svml_s_tanhf8_core_avx2.S
  2022-06-09  0:05   ` [PATCH v2 6/7] x86: Optimize svml_s_tanhf8_core_avx2.S Noah Goldstein
@ 2022-06-09 16:10     ` H.J. Lu
  2022-06-09 16:58       ` Noah Goldstein
  0 siblings, 1 reply; 48+ messages in thread
From: H.J. Lu @ 2022-06-09 16:10 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Jun 8, 2022 at 5:06 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Optimizations are:
>     1. Reduce code size (-81 bytes).
>     2. Remove redundant move instructions.
>     3. Slightly improve instruction selection/scheduling where
>        possible.
>     4. Prefer registers which get short instruction encoding.
>     5. Reduce rodata size (-32 bytes).
>
> Result is roughly a 17-18% speedup:
>
>        Function, New Time, Old Time, New / Old
> _ZGVdN8v_tanhf,     1.977,    2.402,     0.823
> ---
>  .../fpu/multiarch/svml_s_tanhf8_core_avx2.S   | 912 ++++--------------
>  1 file changed, 171 insertions(+), 741 deletions(-)
>
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
> index c5c87bf5b0..a47ede0501 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
> @@ -70,773 +70,203 @@
>   *
>   */
>
> -/* Offsets for data table __svml_stanh_data_internal
> - */
> -#define _dbP                           0
> -#define _sSignMask                     4288
> -#define _sAbsMask                      4320
> -#define _iExpMantMask                  4352
> -#define _iExpMask                      4384
> -#define _iMinIdxOfsMask                        4416
> -#define _iMaxIdxMask                   4448
> -
>  #include <sysdep.h>
>
> +/* tanhf data tables for avx2 and sse4 implementations defined here.
> + */
> +#include "svml_s_tanhf_rodata.S"
> +
>         .section .text.avx2, "ax", @progbits
>  ENTRY(_ZGVdN8v_tanhf_avx2)
> -       pushq   %rbp
> -       cfi_def_cfa_offset(16)
> -       movq    %rsp, %rbp
> -       cfi_def_cfa(6, 16)
> -       cfi_offset(6, -16)
> -       andq    $-32, %rsp
> -       pushq   %r12
> -       subq    $120, %rsp
> -       lea     _dbP+16+__svml_stanh_data_internal(%rip), %r10
> -       vmovaps %ymm0, %ymm12
> -
>         /* Here huge arguments, INF and NaNs are filtered out to callout. */
> -       vpand   _iExpMantMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm14
> +       vpand   TANHF_DATA(_iExpMantMask)(%rip), %ymm0, %ymm4
> +       vpsubd  TANHF_DATA(_iMinIdxOfsMask)(%rip), %ymm4, %ymm2
> +
> +       /* Selection of arguments between [0, 0x04280000] into ymm2.  */
> +       vpxor   %ymm3, %ymm3, %ymm3
> +       vpmaxsd %ymm3, %ymm2, %ymm2
> +       vpminsd TANHF_DATA(_iMaxIdxMask)(%rip), %ymm2, %ymm2
>
>         /*
>          *  small table specific variables *
>          *  Constant loading
>          */
> -       vmovups _iMaxIdxMask+__svml_stanh_data_internal(%rip), %ymm8
> -       vpsubd  _iMinIdxOfsMask+__svml_stanh_data_internal(%rip), %ymm14, %ymm9
> -
> -       /* if VMIN, VMAX is defined for I type */
> -       vxorps  %ymm15, %ymm15, %ymm15
> -       vpcmpgtd %ymm15, %ymm9, %ymm0
> -       vpand   %ymm0, %ymm9, %ymm7
> -       vpcmpgtd %ymm8, %ymm9, %ymm6
> -       vblendvps %ymm6, %ymm8, %ymm7, %ymm3
> -       vpsrld  $14, %ymm3, %ymm1
> -       vpcmpgtd _iExpMask+__svml_stanh_data_internal(%rip), %ymm14, %ymm13
> -       vmovmskps %ymm13, %r11d
> -       vandps  _sAbsMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm10
> -       vandps  _sSignMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm11
> -       vextractf128 $1, %ymm1, %xmm2
> -       vmovd   %xmm1, %r9d
> -       vmovd   %xmm2, %ecx
> -       vpextrd $1, %xmm2, %edx
> -       vpextrd $1, %xmm1, %r8d
> -       movslq  %r9d, %r9
> -       movslq  %edx, %rdx
> -       movslq  %r8d, %r8
> -       vpextrd $2, %xmm1, %edi
> -       movslq  %ecx, %rcx
> -       /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
> -       vpextrd $3, %xmm2, %r12d
> -       vpextrd $3, %xmm1, %esi
> -       vpextrd $2, %xmm2, %eax
> -       movslq  %edi, %rdi
> -       movslq  %r12d, %r12
> -       movslq  %esi, %rsi
> -       movslq  %eax, %rax
> -       vmovupd -16(%r9, %r10), %xmm5
> -       vmovupd -16(%rdx, %r10), %xmm14
> -       vmovupd -16(%rcx, %r10), %xmm13
> -       vmovupd (%r9, %r10), %xmm1
> -       vmovupd (%r8, %r10), %xmm2
> -       vmovupd -16(%r8, %r10), %xmm4
> -       vinsertf128 $1, -16(%rdi, %r10), %ymm5, %ymm15
> -       vinsertf128 $1, -16(%r12, %r10), %ymm14, %ymm3
> -       vinsertf128 $1, -16(%rax, %r10), %ymm13, %ymm6
> -       vinsertf128 $1, (%rdi, %r10), %ymm1, %ymm5
> -       vinsertf128 $1, (%rsi, %r10), %ymm2, %ymm14
> -       vunpcklpd %ymm3, %ymm6, %ymm8
> +       vpsrld  $14, %ymm2, %ymm1
> +
> +       /* We are splitting xmm1 into 8 GPRs. This may be faster to do with
> +          store/load as we can take advantage of store-forwarding.  */
> +       vmovq   %xmm1, %r8
> +       /* We have eliminated all negative values for ymm1 so no need to sign
> +          extend.  */
> +       movl    %r8d, %r9d
> +       shrq    $32, %r8
> +
> +       /* Store base of lookup table in rax.  */
> +       leaq    TANHF_DATA(_lookupTable)(%rip), %rax
> +
> +       /* Instead of using cross-lane permutes on ymm vectors, use vinsertf128
> +          with memory operand. This helps alleviate bottleneck on p5.  */
> +       vmovupd 16(%r9, %rax), %xmm5
> +
> +       vpextrq $1, %xmm1, %rsi
> +       movl    %esi, %edi
> +       shrq    $32, %rsi
> +
> +       vinsertf128 $1, 16(%rdi, %rax), %ymm5, %ymm5
> +
> +       vextracti128 $1, %ymm1, %xmm2
> +       vmovq   %xmm2, %rdx
> +       movl    %edx, %ecx
> +       shrq    $32, %rdx
> +
> +       vmovupd (%rcx, %rax), %xmm6
> +
> +       vpextrq $1, %xmm2, %r10
> +       movl    %r10d, %r11d
> +       shrq    $32, %r10
> +
> +       vinsertf128 $1, (%r11, %rax), %ymm6, %ymm6
> +
> +       vmovupd 16(%r8, %rax), %xmm1
> +       vinsertf128 $1, 16(%rsi, %rax), %ymm1, %ymm1
> +       vmovupd (%rdx, %rax), %xmm3
> +       vinsertf128 $1, (%r10, %rax), %ymm3, %ymm3
> +
> +       vunpcklpd %ymm3, %ymm6, %ymm7
>         vunpckhpd %ymm3, %ymm6, %ymm6
> -       vunpcklpd %ymm14, %ymm5, %ymm3
> -       vunpckhpd %ymm14, %ymm5, %ymm2
> -       vmovupd (%rcx, %r10), %xmm13
> -       vcvtps2pd %xmm10, %ymm5
> -       vextractf128 $1, %ymm10, %xmm10
> -       vfmadd213pd %ymm3, %ymm5, %ymm2
> -       vinsertf128 $1, -16(%rsi, %r10), %ymm4, %ymm0
> -       vmovupd (%rdx, %r10), %xmm4
> -       vunpcklpd %ymm0, %ymm15, %ymm9
> -       vunpckhpd %ymm0, %ymm15, %ymm7
> -       vfmadd213pd %ymm7, %ymm5, %ymm2
> -       vfmadd213pd %ymm9, %ymm5, %ymm2
> -       vinsertf128 $1, (%r12, %r10), %ymm4, %ymm0
> -       vcvtps2pd %xmm10, %ymm4
> -       vinsertf128 $1, (%rax, %r10), %ymm13, %ymm15
> -       vunpcklpd %ymm0, %ymm15, %ymm1
> -       vunpckhpd %ymm0, %ymm15, %ymm0
> -       vfmadd213pd %ymm1, %ymm4, %ymm0
> -       vcvtpd2ps %ymm2, %xmm1
> -       vfmadd213pd %ymm6, %ymm4, %ymm0
> -       vfmadd213pd %ymm8, %ymm4, %ymm0
> -       vcvtpd2ps %ymm0, %xmm0
> -       vinsertf128 $1, %xmm0, %ymm1, %ymm2
> -       vorps   %ymm11, %ymm2, %ymm0
> -       testl   %r11d, %r11d
>
> -       /* Go to special inputs processing branch */
> -       jne     L(SPECIAL_VALUES_BRANCH)
> -       # LOE rbx r13 r14 r15 r11d ymm0 ymm12
> +       vunpcklpd %ymm1, %ymm5, %ymm3
> +       vunpckhpd %ymm1, %ymm5, %ymm1
>
> -       /* Restore registers
> -        * and exit the function
> -        */
> +       vmovaps TANHF_DATA(_sAbsMask)(%rip), %ymm11
> +       /* Store special cases in ymm15.  */
> +       vpcmpgtd TANHF_DATA(_iExpMask)(%rip), %ymm4, %ymm15
>
> -L(EXIT):
> -       addq    $120, %rsp
> -       cfi_restore(12)
> -       popq    %r12
> -       movq    %rbp, %rsp
> -       popq    %rbp
> -       cfi_def_cfa(7, 8)
> -       cfi_restore(6)
> -       ret
> -       cfi_def_cfa(6, 16)
> -       cfi_offset(6, -16)
> -       /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
> +       vandps  %ymm11, %ymm0, %ymm4
>
> -       /* Branch to process
> -        * special inputs
> -        */
> +       vcvtps2pd %xmm4, %ymm5
>
> -L(SPECIAL_VALUES_BRANCH):
> -       vmovups %ymm12, 32(%rsp)
> -       vmovups %ymm0, 64(%rsp)
> -       # LOE rbx r13 r14 r15 r11d ymm0
> +       vextractf128 $1, %ymm4, %xmm4
> +       vcvtps2pd %xmm4, %ymm4
>
> -       xorl    %r12d, %r12d
> -       # LOE rbx r13 r14 r15 r11d r12d
> +       vmovupd 16(%rcx, %rax), %xmm2
> +       vinsertf128 $1, 16(%r11, %rax), %ymm2, %ymm2
>
> -       vzeroupper
> -       movq    %r13, 8(%rsp)
> -       /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
> -       movl    %r11d, %r13d
> -       movq    %r14, (%rsp)
> -       /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
> -       # LOE rbx r15 r12d r13d
> -
> -       /* Range mask
> -        * bits check
> -        */
> +       vfmadd213pd %ymm3, %ymm5, %ymm1
> +
> +       vmovupd 16(%rdx, %rax), %xmm3
> +       vinsertf128 $1, 16(%r10, %rax), %ymm3, %ymm3
> +
> +       vunpcklpd %ymm3, %ymm2, %ymm10
> +       vunpckhpd %ymm3, %ymm2, %ymm2
> +
> +       vfmadd213pd %ymm10, %ymm4, %ymm2
> +       vfmadd213pd %ymm6, %ymm4, %ymm2
> +       vfmadd213pd %ymm7, %ymm4, %ymm2
> +       vcvtpd2ps %ymm2, %xmm2
> +
> +       vmovupd (%r9, %rax), %xmm7
> +       vinsertf128 $1, (%rdi, %rax), %ymm7, %ymm7
> +
> +       vmovupd (%r8, %rax), %xmm3
> +       vinsertf128 $1, (%rsi, %rax), %ymm3, %ymm3
> +
> +       vunpckhpd %ymm3, %ymm7, %ymm4
> +       vunpcklpd %ymm3, %ymm7, %ymm7
>
> -L(RANGEMASK_CHECK):
> -       btl     %r12d, %r13d
> +       vfmadd213pd %ymm4, %ymm5, %ymm1
> +       vfmadd213pd %ymm7, %ymm5, %ymm1
> +
> +
> +       vcvtpd2ps %ymm1, %xmm1
> +       vinsertf128 $1, %xmm2, %ymm1, %ymm1
> +
> +       vmovmskps %ymm15, %edx
> +       vandnps %ymm0, %ymm11, %ymm2
> +       testl   %edx, %edx
> +       /* Go to special inputs processing branch */
> +       jne     L(SPECIAL_VALUES_BRANCH)
> +       # LOE rbx r12 r13 r14 r15 ymm0 ymm1 ymm2
> +       /* Wait until after branch to write over ymm0.  */
> +       vorps   %ymm2, %ymm1, %ymm0
> +       /* No stack restoration on the fastpath.  */
> +       ret
>
> -       /* Call scalar math function */
> -       jc      L(SCALAR_MATH_CALL)
> -       # LOE rbx r15 r12d r13d
>
> -       /* Special inputs
> -        * processing loop
> +       /* Cold case. edx has 1s where there was a special value that
> +          needs to be handled by a tanhf call. Optimize for code size
> +          moreso than speed here. */
               more so
> +L(SPECIAL_VALUES_BRANCH):
> +       # LOE rbx rdx r12 r13 r14 r15 ymm0 ymm1 ymm2
> +    /* Use r13 to save/restore the stack. This allows us to use rbp as
> +       callee save register saving code size. */
> +       pushq   %r13
> +       cfi_adjust_cfa_offset(8)
> +       cfi_offset(r13, -16)
> +       /* Need to callee save registers to preserve state across tanhf calls.
>          */
> +       pushq   %rbx
> +       cfi_adjust_cfa_offset(8)
> +       cfi_offset(rbx, -24)
> +       pushq   %rbp
> +       cfi_adjust_cfa_offset(8)
> +       cfi_offset(rbp, -32)
> +       movq    %rsp, %r13
> +       cfi_def_cfa_register(r13)
> +
> +       /* Align stack and make room for 2x ymm vectors.  */
> +       andq    $-32, %rsp
> +       addq    $-64, %rsp
> +
> +       /* Save all already computed inputs.  */
> +       vorps   %ymm2, %ymm1, %ymm1
> +       vmovaps %ymm1, (%rsp)
> +       /* Save origional input (ymm0 unchanged up to this point).  */
                         original
> +       vmovaps %ymm0, 32(%rsp)
> +
> +       vzeroupper
>
> +       /* edx has 1s where there was a special value that needs to be handled
> +          by a tanhf call.  */
> +       movl    %edx, %ebx
>  L(SPECIAL_VALUES_LOOP):
> -       incl    %r12d
> -       cmpl    $8, %r12d
> -
> -       /* Check bits in range mask */
> -       jl      L(RANGEMASK_CHECK)
> -       # LOE rbx r15 r12d r13d
> -
> -       movq    8(%rsp), %r13
> -       cfi_restore(13)
> -       movq    (%rsp), %r14
> -       cfi_restore(14)
> -       vmovups 64(%rsp), %ymm0
> -
> -       /* Go to exit */
> -       jmp     L(EXIT)
> -       /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
> -       /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
> -       # LOE rbx r13 r14 r15 ymm0
> -
> -       /* Scalar math fucntion call
> -        * to process special input
> -        */
> +       # LOE rbx rbp r12 r13 r14 r15
> +       /* use rbp as index for special value that is saved across calls to
> +          tanhf. We technically don't need a callee save register here as offset
> +          to rsp is always [0, 28] so we can restore rsp by realigning to 64.
> +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> +          in the loop. Realigning also costs more code size.  */
> +       xorl    %ebp, %ebp
> +       tzcntl  %ebx, %ebp
>
> -L(SCALAR_MATH_CALL):
> -       movl    %r12d, %r14d
> -       movss   32(%rsp, %r14, 4), %xmm0
> +       /* Scalar math function call to process special input.  */
> +       movss   32(%rsp, %rbp, 4), %xmm0
>         call    tanhf@PLT
> -       # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 64(%rsp, %r14, 4)
> +       /* No good way to avoid the store-forwarding fault this will cause on
> +          return. `lfence` avoids the SF fault but at greater cost as it
> +          serializes stack/callee save restoration.  */
> +       movss   %xmm0, (%rsp, %rbp, 4)
> +
> +       blsrl   %ebx, %ebx
> +       jnz     L(SPECIAL_VALUES_LOOP)
> +       # LOE r12 r13 r14 r15
>
> -       /* Process special inputs in loop */
> -       jmp     L(SPECIAL_VALUES_LOOP)
> -       # LOE rbx r15 r12d r13d
> -END(_ZGVdN8v_tanhf_avx2)
>
> -       .section .rodata, "a"
> -       .align  32
> -
> -#ifdef __svml_stanh_data_internal_typedef
> -typedef unsigned int VUINT32;
> -typedef struct {
> -       __declspec(align(32)) VUINT32 _dbP[(134*4)][2];
> -       __declspec(align(32)) VUINT32 _sSignMask[8][1];
> -       __declspec(align(32)) VUINT32 _sAbsMask[8][1];
> -       __declspec(align(32)) VUINT32 _iExpMantMask[8][1];
> -       __declspec(align(32)) VUINT32 _iExpMask[8][1];
> -       __declspec(align(32)) VUINT32 _iMinIdxOfsMask[8][1];
> -       __declspec(align(32)) VUINT32 _iMaxIdxMask[8][1];
> -} __svml_stanh_data_internal;
> -#endif
> -__svml_stanh_data_internal:
> -       /* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500]. */
> -       .quad   0x0000000000000000 /* A00 = +0.000000000000000000000e-01 */
> -       .quad   0x3FF00000022C70EB /* A01 = +1.000000008097283510367e+00 */
> -       .quad   0xBED00E878CFFA194 /* A02 = -3.828228912518614443549e-06 */
> -       .quad   0xBFD551766D0607A9 /* A03 = -3.330970825846813476723e-01 */
> -       .quad   0xBE53D60CE3E4C297 /* A00 = -1.847383956330407336230e-08 */
> -       .quad   0x3FF000024177CF5C /* A01 = +1.000002151235967140508e+00 */
> -       .quad   0xBF1758BC94A51A25 /* A02 = -8.906031613262943753568e-05 */
> -       .quad   0xBFD53EAE67E0D4F0 /* A03 = -3.319507612644221339337e-01 */
> -       .quad   0xBE5A9E47EF32D6FE /* A00 = -2.479020984039698285657e-08 */
> -       .quad   0x3FF00002DA983057 /* A01 = +1.000002721676556793895e+00 */
> -       .quad   0xBF1BD953509E94AA /* A02 = -1.062352277175377670507e-04 */
> -       .quad   0xBFD53BDB562EEDD5 /* A03 = -3.317783681520414806876e-01 */
> -       .quad   0xBE6191BBE496D294 /* A00 = -3.272532162914017685901e-08 */
> -       .quad   0x3FF0000390492017 /* A01 = +1.000003398528866105366e+00 */
> -       .quad   0xBF20727E814A57CE /* A02 = -1.254825043772153972919e-04 */
> -       .quad   0xBFD538DE060A6F22 /* A03 = -3.315959033004550748913e-01 */
> -       .quad   0xBE66DAFA2A893A25 /* A00 = -4.257146219278012568149e-08 */
> -       .quad   0x3FF0000465E08CD1 /* A01 = +1.000004194219219266770e+00 */
> -       .quad   0xBF2341C765EF91B6 /* A02 = -1.469188600530365522261e-04 */
> -       .quad   0xBFD535B6841FAF9E /* A03 = -3.314033785124993469751e-01 */
> -       .quad   0xBE6D5794E361E964 /* A00 = -5.465394929765249413434e-08 */
> -       .quad   0x3FF000055EE2A0CB /* A01 = +1.000005121846742950353e+00 */
> -       .quad   0xBF265E6C77E66C8B /* A02 = -1.706607253709506650304e-04 */
> -       .quad   0xBFD53264DDCCEDA6 /* A03 = -3.312008062382240103361e-01 */
> -       .quad   0xBE729C844D374A6E /* A00 = -6.933284462462096107184e-08 */
> -       .quad   0x3FF000067F019093 /* A01 = +1.000006195180536350264e+00 */
> -       .quad   0xBF29CC5348D6DCE5 /* A02 = -1.968242326435338705130e-04 */
> -       .quad   0xBFD52EE92121ED35 /* A03 = -3.309881995734998416658e-01 */
> -       .quad   0xBE775AEA17EAA872 /* A00 = -8.700465590574974405858e-08 */
> -       .quad   0x3FF00007CA1D66B8 /* A01 = +1.000007428656699559610e+00 */
> -       .quad   0xBF2D8F5EB98A2637 /* A02 = -2.255252009216044881395e-04 */
> -       .quad   0xBFD52B435CDF9128 /* A03 = -3.307655722585587376727e-01 */
> -       .quad   0xBE7D04DA28C343F0 /* A00 = -1.081040272327705484794e-07 */
> -       .quad   0x3FF000094443CCF5 /* A01 = +1.000008837375216730337e+00 */
> -       .quad   0xBF30D5B76C947AE5 /* A02 = -2.568791210978817814332e-04 */
> -       .quad   0xBFD52773A0776FAD /* A03 = -3.305329386764651045105e-01 */
> -       .quad   0xBE81DD77A12C51C7 /* A00 = -1.331054169875768625701e-07 */
> -       .quad   0x3FF0000AF1AFD2DA /* A01 = +1.000010437096696680470e+00 */
> -       .quad   0xBF331230624C1680 /* A02 = -2.910011410651516805537e-04 */
> -       .quad   0xBFD52379FC0B61DF /* A03 = -3.302903138515186909352e-01 */
> -       .quad   0xBE85D04EEEB3C435 /* A00 = -1.625247628488202841012e-07 */
> -       .quad   0x3FF0000CD6C9B1F2 /* A01 = +1.000012244238970726684e+00 */
> -       .quad   0xBF357F0742FADDD4 /* A02 = -3.280060509313874068243e-04 */
> -       .quad   0xBFD51F56806D0E81 /* A03 = -3.300377134475880880338e-01 */
> -       .quad   0xBE8A6E289B59681B /* A00 = -1.969211333326924655065e-07 */
> -       .quad   0x3FF0000EF8268F72 /* A01 = +1.000014275873550406715e+00 */
> -       .quad   0xBF381E277A1B747A /* A02 = -3.680082682942575423093e-04 */
> -       .quad   0xBFD51B093F1D6FD4 /* A03 = -3.297751537663746734808e-01 */
> -       .quad   0xBE8FCBC40EE9ABD5 /* A00 = -2.368983653301529373887e-07 */
> -       .quad   0x3FF000115A883B6C /* A01 = +1.000016549721943981410e+00 */
> -       .quad   0xBF3AF17AC974B3D9 /* A02 = -4.111218235774406434303e-04 */
> -       .quad   0xBFD516924A4C549C /* A03 = -3.295026517456081105450e-01 */
> -       .quad   0xBE92FFBC60A3F956 /* A00 = -2.831066871072026054144e-07 */
> -       .quad   0x3FF0001402DCED8A /* A01 = +1.000019084151832604590e+00 */
> -       .quad   0xBF3DFAE9390C4801 /* A02 = -4.574603454311488280083e-04 */
> -       .quad   0xBFD511F1B4D7DC3A /* A03 = -3.292202249571719585575e-01 */
> -       .quad   0xBE9690A22F96D5AD /* A00 = -3.362443262393081632612e-07 */
> -       .quad   0x3FF00016F63EFF5D /* A01 = +1.000021898173108825247e+00 */
> -       .quad   0xBF409E2C839605BB /* A02 = -5.071370461992499986334e-04 */
> -       .quad   0xBFD50D27924BEE00 /* A03 = -3.289278916051614487515e-01 */
> -       .quad   0xBE9AA56C65E72A73 /* A00 = -3.970591019557469835586e-07 */
> -       .quad   0x3FF0001A39F4A43E /* A01 = +1.000025011433776978009e+00 */
> -       .quad   0xBF425BD74C3D6667 /* A02 = -5.602647074553602319844e-04 */
> -       .quad   0xBFD50833F6E1ABA2 /* A03 = -3.286256705238718156536e-01 */
> -       .quad   0xBE9F4BD4FF1A83B0 /* A00 = -4.663500013744687071912e-07 */
> -       .quad   0x3FF0001DD36F9EC2 /* A01 = +1.000028444215715683896e+00 */
> -       .quad   0xBF44376634149405 /* A02 = -6.169556656102642569831e-04 */
> -       .quad   0xBFD50316F77EDEE5 /* A03 = -3.283135811757190158922e-01 */
> -       .quad   0xBEA3B625387BB079 /* A00 = -5.874486399249461304297e-07 */
> -       .quad   0x3FF00023E14CFBA9 /* A01 = +1.000034217911642153709e+00 */
> -       .quad   0xBF47392F923218D2 /* A02 = -7.087213783883111826306e-04 */
> -       .quad   0xBFD4FB1FACDEB938 /* A03 = -3.278273761924483942209e-01 */
> -       .quad   0xBEAA6E24F543500A /* A00 = -7.876828740601738750574e-07 */
> -       .quad   0x3FF0002D5C6E8412 /* A01 = +1.000043259679163742959e+00 */
> -       .quad   0xBF4BAF02BD7FDD70 /* A02 = -8.448375110664940040861e-04 */
> -       .quad   0xBFD4EFEE6527A7DE /* A03 = -3.271442401734229177279e-01 */
> -       .quad   0xBEB16E3EBE2157D0 /* A00 = -1.038947396133402500647e-06 */
> -       .quad   0x3FF00038990FEE2F /* A01 = +1.000053975962952312884e+00 */
> -       .quad   0xBF50569481C574CB /* A02 = -9.972048056490652716971e-04 */
> -       .quad   0xBFD4E419278DA2B4 /* A03 = -3.264220129263251113372e-01 */
> -       .quad   0xBEB6A7B6723165D4 /* A00 = -1.350350836279403750524e-06 */
> -       .quad   0x3FF00045CAB4158E /* A01 = +1.000066558657042303793e+00 */
> -       .quad   0xBF531D7C9C849108 /* A02 = -1.166698160951775212202e-03 */
> -       .quad   0xBFD4D7A0BB33B152 /* A03 = -3.256608799117844954552e-01 */
> -       .quad   0xBEBD0EE2A8654AFD /* A00 = -1.732000471561702711532e-06 */
> -       .quad   0x3FF00055276F18D6 /* A01 = +1.000081209219890521211e+00 */
> -       .quad   0xBF562FDBA3FB6C6C /* A02 = -1.354183666925102939860e-03 */
> -       .quad   0xBFD4CA85F1B93DB2 /* A03 = -3.248610363561638125773e-01 */
> -       .quad   0xBEC269D4036A207E /* A00 = -2.195047297096822741730e-06 */
> -       .quad   0x3FF00066E7DA6E4E /* A01 = +1.000098138500919997540e+00 */
> -       .quad   0xBF5991499FC36B3A /* A02 = -1.560518167983372759405e-03 */
> -       .quad   0xBFD4BCC9A72283D6 /* A03 = -3.240226871658341556426e-01 */
> -       .quad   0xBEC7154B6C09CFE1 /* A00 = -2.751729738565190291276e-06 */
> -       .quad   0x3FF0007B47086B80 /* A01 = +1.000117566559055148900e+00 */
> -       .quad   0xBF5D455433B4F8F4 /* A02 = -1.786548832412968197680e-03 */
> -       .quad   0xBFD4AE6CC1BFE145 /* A03 = -3.231460468373550942722e-01 */
> -       .quad   0xBECCA68CC64A0F8A /* A00 = -3.415415948561670285790e-06 */
> -       .quad   0x3FF00092827742F7 /* A01 = +1.000139722473418535387e+00 */
> -       .quad   0xBF60A7BF15A527AF /* A02 = -2.033112728132522705610e-03 */
> -       .quad   0xBFD49F703214084C /* A03 = -3.222313393636155876010e-01 */
> -       .quad   0xBED19E68676B241B /* A00 = -4.200644630977303616698e-06 */
> -       .quad   0x3FF000ACDA037B26 /* A01 = +1.000164844146362863597e+00 */
> -       .quad   0xBF62D99F836A02F8 /* A02 = -2.301036405072284102280e-03 */
> -       .quad   0xBFD48FD4F2B91B28 /* A03 = -3.212787981359945810311e-01 */
> -       .quad   0xBED57CF4B0C7AA54 /* A00 = -5.123164339408145209103e-06 */
> -       .quad   0x3FF000CA8FD9E1A1 /* A01 = +1.000193178099017865534e+00 */
> -       .quad   0xBF653A014548E686 /* A02 = -2.591135484433962181405e-03 */
> -       .quad   0xBFD47F9C0844B38F /* A03 = -3.202886658426046806447e-01 */
> -       .quad   0xBEDA012B1B1A41E2 /* A00 = -6.199971197454598722328e-06 */
> -       .quad   0x3FF000EBE868FDF4 /* A01 = +1.000224979259539459520e+00 */
> -       .quad   0xBF67CA9427E0A544 /* A02 = -2.904214255086275467410e-03 */
> -       .quad   0xBFD46EC6812ADB37 /* A03 = -3.192611943626845749655e-01 */
> -       .quad   0xBEDF3EAC5BF12194 /* A00 = -7.449344990702664567927e-06 */
> -       .quad   0x3FF001112A520784 /* A01 = +1.000260510744255704196e+00 */
> -       .quad   0xBF6A8D01ABDA4DC4 /* A02 = -3.241065277345108255891e-03 */
> -       .quad   0xBFD45D55759FFA4A /* A03 = -3.181966446572103146551e-01 */
> -       .quad   0xBEE2A541BC274267 /* A00 = -8.890883582164319970972e-06 */
> -       .quad   0x3FF0013A9E5961F2 /* A01 = +1.000300043631906721231e+00 */
> -       .quad   0xBF6D82ECD080C540 /* A02 = -3.602468994380686462264e-03 */
> -       .quad   0xBFD44B4A0779C0AD /* A03 = -3.170952866557950611259e-01 */
> -       .quad   0xBEE61D97609A27F4 /* A00 = -1.054553560499505625520e-05 */
> -       .quad   0x3FF001688F56A3AF /* A01 = +1.000343856731187974773e+00 */
> -       .quad   0xBF7056F8EFB683EC /* A02 = -3.989193351487490407647e-03 */
> -       .quad   0xBFD438A5620F0F74 /* A03 = -3.159573991399533543500e-01 */
> -       .quad   0xBEEA145429EDD370 /* A00 = -1.243563138839952927732e-05 */
> -       .quad   0x3FF0019B4A242A67 /* A01 = +1.000392236341804297339e+00 */
> -       .quad   0xBF7207D31CA78D9B /* A02 = -4.401993423445739288258e-03 */
> -       .quad   0xBFD42568BA16E7CD /* A03 = -3.147832696228050619602e-01 */
> -       .quad   0xBEEE96370D52680F /* A00 = -1.458491207477835326165e-05 */
> -       .quad   0x3FF001D31D8E4115 /* A01 = +1.000445476009251821736e+00 */
> -       .quad   0xBF73D4CC11EDC094 /* A02 = -4.841611050196221316400e-03 */
> -       .quad   0xBFD411954D8664E7 /* A03 = -3.135731942252974469021e-01 */
> -       .quad   0xBEF338C046215EF8 /* A00 = -1.833122622260562810219e-05 */
> -       .quad   0x3FF00230C32C2EC1 /* A01 = +1.000534784691737621998e+00 */
> -       .quad   0xBF76BD019BCC5DAF /* A02 = -5.551344188254799492943e-03 */
> -       .quad   0xBFD3F2C7156DC21E /* A03 = -3.116929730668135389848e-01 */
> -       .quad   0xBEF9B15EAE411EAE /* A00 = -2.450261207822986676092e-05 */
> -       .quad   0x3FF002C2DF057A4D /* A01 = +1.000674124886830940184e+00 */
> -       .quad   0xBF7B08CCD9AC1E30 /* A02 = -6.600189396301511801646e-03 */
> -       .quad   0xBFD3C7A7A114FED8 /* A03 = -3.090609620157755976777e-01 */
> -       .quad   0xBF00E36483C373B3 /* A00 = -3.221178528332122595812e-05 */
> -       .quad   0x3FF0036F419480D7 /* A01 = +1.000838524028997644777e+00 */
> -       .quad   0xBF7FD255D1777007 /* A02 = -7.768950679260206403087e-03 */
> -       .quad   0xBFD39A453911D6CE /* A03 = -3.062909180947429588215e-01 */
> -       .quad   0xBF05DFA04DD12059 /* A00 = -4.172046622180685472624e-05 */
> -       .quad   0x3FF00438B2A03D8D /* A01 = +1.001030633695197069599e+00 */
> -       .quad   0xBF828F8DBB4A9D10 /* A02 = -9.062869337255224921890e-03 */
> -       .quad   0xBFD36AAB704697D9 /* A03 = -3.033856007044711255993e-01 */
> -       .quad   0xBF0BF3E0C647DEFB /* A00 = -5.331544597092331081714e-05 */
> -       .quad   0x3FF005221063D36D /* A01 = +1.001253189109060359741e+00 */
> -       .quad   0xBF857A2CB3C96102 /* A02 = -1.048693584122917590862e-02 */
> -       .quad   0xBFD338E65BBB4FEC /* A03 = -3.003478904549854444639e-01 */
> -       .quad   0xBF11A506ED7C9D31 /* A00 = -6.730894835681591541979e-05 */
> -       .quad   0x3FF0062E4D0EA92A /* A01 = +1.001508999829250345925e+00 */
> -       .quad   0xBF88AB82C2761AF3 /* A02 = -1.204588085125866091241e-02 */
> -       .quad   0xBFD305028D6BD206 /* A03 = -2.971807843271395688234e-01 */
> -       .quad   0xBF1607C0922D9BF1 /* A00 = -8.403885708006799337092e-05 */
> -       .quad   0x3FF007606C341961 /* A01 = +1.001800940198869449560e+00 */
> -       .quad   0xBF8C25E6DA487BCF /* A02 = -1.374416688582682892494e-02 */
> -       .quad   0xBFD2CF0D0EE8F7B5 /* A03 = -2.938873906713255768075e-01 */
> -       .quad   0xBF1B3A8480A0A16D /* A00 = -1.038688061788578038307e-04 */
> -       .quad   0x3FF008BB802D02D6 /* A01 = +1.002131939589323561535e+00 */
> -       .quad   0xBF8FEB8AE99FD100 /* A02 = -1.558598065819483124983e-02 */
> -       .quad   0xBFD297135BD0911B /* A03 = -2.904709240558688843059e-01 */
> -       .quad   0xBF20ABB9BDB75C65 /* A00 = -1.271881327357976163798e-04 */
> -       .quad   0x3FF00A42A76D8CD1 /* A01 = +1.002504972472525901495e+00 */
> -       .quad   0xBF91FF3D752BB9E6 /* A02 = -1.757522609380570560722e-02 */
> -       .quad   0xBFD25D235C1F88B4 /* A03 = -2.869346999779154305799e-01 */
> -       .quad   0xBF243D3254425461 /* A00 = -1.544116913733432829448e-04 */
> -       .quad   0x3FF00BF909D1795E /* A01 = +1.002923048355647051011e+00 */
> -       .quad   0xBF94304E04D44942 /* A02 = -1.971551804042204897316e-02 */
> -       .quad   0xBFD2214B5E61CFA6 /* A03 = -2.832821294498394371075e-01 */
> -       .quad   0xBF286070011B61CE /* A00 = -1.859795307186510085994e-04 */
> -       .quad   0x3FF00DE1D5E1627E /* A01 = +1.003389201612804537689e+00 */
> -       .quad   0xBF9689D5F4163F59 /* A02 = -2.201017668045266231780e-02 */
> -       .quad   0xBFD1E39A11C3B42C /* A03 = -2.795167134743816728104e-01 */
> -       .quad   0xBF2D250B366A79E8 /* A00 = -2.223564326486314902259e-04 */
> -       .quad   0x3FF010003E134001 /* A01 = +1.003906481248123094829e+00 */
> -       .quad   0xBF990C9FF91F6F81 /* A02 = -2.446222265267250853271e-02 */
> -       .quad   0xBFD1A41E80084CDC /* A03 = -2.756420374218586655246e-01 */
> -       .quad   0xBF314DB5DDC2A30E /* A00 = -2.640313157465248123865e-04 */
> -       .quad   0x3FF012577608921B /* A01 = +1.004477940624503018441e+00 */
> -       .quad   0xBF9BB9626875B0C9 /* A02 = -2.707437288829409385849e-02 */
> -       .quad   0xBFD162E80768A9D0 /* A03 = -2.716617653228725615122e-01 */
> -       .quad   0xBF346A6133808864 /* A00 = -3.115165050094957730625e-04 */
> -       .quad   0x3FF014EAAFCC88A3 /* A01 = +1.005106627192198898157e+00 */
> -       .quad   0xBF9E90BEF9BF7419 /* A02 = -2.984903716411588595059e-02 */
> -       .quad   0xBFD12006545F7FAD /* A03 = -2.675796340899932457269e-01 */
> -       .quad   0xBF37F180DC3848EA /* A00 = -3.653468704395550778821e-04 */
> -       .quad   0x3FF017BD19147861 /* A01 = +1.005795572250939295955e+00 */
> -       .quad   0xBFA0C9A14C702E07 /* A02 = -3.278831537326359207851e-02 */
> -       .quad   0xBFD0DB895B650092 /* A03 = -2.633994476818851682154e-01 */
> -       .quad   0xBF3BEC6AAC6D7635 /* A00 = -4.260788377246944457107e-04 */
> -       .quad   0x3FF01AD1D884E719 /* A01 = +1.006547780778822565040e+00 */
> -       .quad   0xBFA260B2A1B1434A /* A02 = -3.589399551186163439542e-02 */
> -       .quad   0xBFD09581529E93D6 /* A03 = -2.591250712233067465817e-01 */
> -       .quad   0xBF4164E26167882B /* A00 = -5.308251737086202562063e-04 */
> -       .quad   0x3FF01FEF14B62B81 /* A01 = +1.007796364693348545316e+00 */
> -       .quad   0xBFA4EB014538AA42 /* A02 = -4.085544557559163403315e-02 */
> -       .quad   0xBFD029D36FEAF41F /* A03 = -2.525528519580024222613e-01 */
> -       .quad   0xBF46F6FFF4E53DC8 /* A00 = -7.008313930700277652464e-04 */
> -       .quad   0x3FF027CBB51CBBA0 /* A01 = +1.009715754956893363214e+00 */
> -       .quad   0xBFA89DEC9FEC112E /* A02 = -4.807986690687680864098e-02 */
> -       .quad   0xBFCF2A99464D0DB4 /* A03 = -2.434875100390009317053e-01 */
> -       .quad   0xBF4DCC9C4F66A4D9 /* A00 = -9.094012482836712945103e-04 */
> -       .quad   0x3FF030E7CFCCD583 /* A01 = +1.011939822882909068014e+00 */
> -       .quad   0xBFACAA3B95814081 /* A02 = -5.598627281199331645611e-02 */
> -       .quad   0xBFCDF78F156BE7CF /* A03 = -2.341173987004467604844e-01 */
> -       .quad   0xBF5308ED74E5C7A6 /* A00 = -1.161796466103906435435e-03 */
> -       .quad   0x3FF03B5986412ECB /* A01 = +1.014489674026594512313e+00 */
> -       .quad   0xBFB087EBA88DCC3F /* A02 = -6.457398285947223148806e-02 */
> -       .quad   0xBFCCBB9BD134862F /* A03 = -2.244753619680052991736e-01 */
> -       .quad   0xBF57FA23C00DF4B5 /* A00 = -1.463446533505758208674e-03 */
> -       .quad   0x3FF0473558A1BCC0 /* A01 = +1.017384859292903342975e+00 */
> -       .quad   0xBFB2E702BC6360EF /* A02 = -7.383744334527241048871e-02 */
> -       .quad   0xBFCB77D546379288 /* A03 = -2.145945160729250122955e-01 */
> -       .quad   0xBF5DD12971557F71 /* A00 = -1.819887610814388068450e-03 */
> -       .quad   0x3FF0548DDF5000A8 /* A01 = +1.020643112482540360020e+00 */
> -       .quad   0xBFB571B63DA186E1 /* A02 = -8.376635555898871710045e-02 */
> -       .quad   0xBFCA2D5202605148 /* A03 = -2.045080672838912594358e-01 */
> -       .quad   0xBF6252B1AD5D4F17 /* A00 = -2.236697221556737096709e-03 */
> -       .quad   0x3FF063738A910BF7 /* A01 = +1.024280110622155737232e+00 */
> -       .quad   0xBFB8270C8E6B601B /* A02 = -9.434584118878357184013e-02 */
> -       .quad   0xBFC8DD27D950A07E /* A03 = -1.942491351230763441116e-01 */
> -       .quad   0xBF66470C91730CFC /* A00 = -2.719425723258004842786e-03 */
> -       .quad   0x3FF073F468FCF331 /* A01 = +1.028309259519300633556e+00 */
> -       .quad   0xBFBB05C2952191E4 /* A02 = -1.055566419686964629854e-01 */
> -       .quad   0xBFC7886A770DE2BD /* A03 = -1.838505822486435070662e-01 */
> -       .quad   0xBF6AD114AC8E98EC /* A00 = -3.273525599485007861467e-03 */
> -       .quad   0x3FF0861BF53E5226 /* A01 = +1.032741506559554434119e+00 */
> -       .quad   0xBFBE0C4F9B461507 /* A02 = -1.173753503881763554650e-01 */
> -       .quad   0xBFC6302A037CDE3A /* A03 = -1.733448521642786954722e-01 */
> -       .quad   0xBF6FFBDE2A6C2AF8 /* A00 = -3.904279630096648551207e-03 */
> -       .quad   0x3FF099F2EB8E7DA3 /* A01 = +1.037585182326304034106e+00 */
> -       .quad   0xBFC09C74D192DDF0 /* A02 = -1.297746680554463516444e-01 */
> -       .quad   0xBFC4D571D8E3079F /* A03 = -1.627638157861470424859e-01 */
> -       .quad   0xBF72E8FDC0B952AA /* A00 = -4.616728994353872309042e-03 */
> -       .quad   0x3FF0AF7F273C9533 /* A01 = +1.042845872181101141152e+00 */
> -       .quad   0xBFC244C512736F10 /* A02 = -1.427236881344176033792e-01 */
> -       .quad   0xBFC379474F58B902 /* A03 = -1.521386277613104298645e-01 */
> -       .quad   0xBF762EABAF17395B /* A00 = -5.415602341101023557701e-03 */
> -       .quad   0x3FF0C6C3886F63FB /* A01 = +1.048526318502125631582e+00 */
> -       .quad   0xBFC3FDF9918EA12A /* A02 = -1.561881981590514389957e-01 */
> -       .quad   0xBFC21CA89ECAB895 /* A03 = -1.414995932913753196036e-01 */
> -       .quad   0xBF79D387CE5B2BAE /* A00 = -6.305246822828998107258e-03 */
> -       .quad   0x3FF0DFBFE2346376 /* A01 = +1.054626353847394337748e+00 */
> -       .quad   0xBFC5C6DA43602620 /* A02 = -1.701309994680721970894e-01 */
> -       .quad   0xBFC0C08BD8DB6631 /* A03 = -1.308760460731704100557e-01 */
> -       .quad   0xBF7DDBA8E8DA9060 /* A00 = -7.289562037531366334164e-03 */
> -       .quad   0x3FF0FA70F0D1B464 /* A01 = +1.061142864894713433443e+00 */
> -       .quad   0xBFC79E18D92BAA7C /* A02 = -1.845122394946264732241e-01 */
> -       .quad   0xBFBECBBBF74C2669 /* A03 = -1.202962378266875381749e-01 */
> -       .quad   0xBF81254E76EA25DA /* A00 = -8.371937755572145950511e-03 */
> -       .quad   0x3FF116D05835EBD0 /* A01 = +1.068069786618014660462e+00 */
> -       .quad   0xBFC982539E2ED224 /* A02 = -1.992897531869327609755e-01 */
> -       .quad   0xBFBC1B043C350159 /* A03 = -1.097872397413132278254e-01 */
> -       .quad   0xBF8391ACBA863403 /* A00 = -9.555196230190082448686e-03 */
> -       .quad   0x3FF134D4AA477FE2 /* A01 = +1.075398125794884141015e+00 */
> -       .quad   0xBFCB7218609FEAFB /* A02 = -2.144194099235717521079e-01 */
> -       .quad   0xBFB970A16CB88329 /* A03 = -9.937485603633135211599e-02 */
> -       .quad   0xBF87935088E48E8B /* A00 = -1.151144902957603431692e-02 */
> -       .quad   0x3FF1649892AD7DD3 /* A01 = +1.087059567413110938716e+00 */
> -       .quad   0xBFCE6971DDE75409 /* A02 = -2.375929196847723912089e-01 */
> -       .quad   0xBFB58291E88CB251 /* A03 = -8.402358939628952472223e-02 */
> -       .quad   0xBF8DB3A62C325325 /* A00 = -1.450280973794233242702e-02 */
> -       .quad   0x3FF1A9C900C6DEEA /* A01 = +1.103951457056548068891e+00 */
> -       .quad   0xBFD13DBC65B0E08E /* A02 = -2.693930619311765140012e-01 */
> -       .quad   0xBFB06696F62696D1 /* A03 = -6.406539449252625362252e-02 */
> -       .quad   0xBF92583699F2E27A /* A00 = -1.791463198307716858659e-02 */
> -       .quad   0x3FF1F451B85AA9F0 /* A01 = +1.122148246892376022288e+00 */
> -       .quad   0xBFD34FD5F8288180 /* A02 = -3.017477916164565954205e-01 */
> -       .quad   0xBFA6FB692825B683 /* A03 = -4.488686194495718900788e-02 */
> -       .quad   0xBF9641C26E673D6F /* A00 = -2.173522757385398448959e-02 */
> -       .quad   0x3FF24364DA5E2B07 /* A01 = +1.141453602790251542487e+00 */
> -       .quad   0xBFD564A5A5EF5890 /* A02 = -3.342680092295120530821e-01 */
> -       .quad   0xBF9B43712011A982 /* A03 = -2.662445791467283467968e-02 */
> -       .quad   0xBF9A901038EC2F39 /* A00 = -2.594018313816024226548e-02 */
> -       .quad   0x3FF2961356DFFEBA /* A01 = +1.161639537196534011088e+00 */
> -       .quad   0xBFD775EBB17198C7 /* A02 = -3.665723069046972759644e-01 */
> -       .quad   0xBF833B1A926CD462 /* A03 = -9.390075295963199591975e-03 */
> -       .quad   0xBF9F396A6A461B91 /* A00 = -3.049246095317987084727e-02 */
> -       .quad   0x3FF2EB53BAEF534B /* A01 = +1.182452898229899629357e+00 */
> -       .quad   0xBFD97DABF8AD8BBD /* A02 = -3.982953957076310058660e-01 */
> -       .quad   0x3F7B8F6A3E0F8837 /* A03 = +6.728568086119371925713e-03 */
> -       .quad   0xBFA21878590F8BAA /* A00 = -3.534294211546946951064e-02 */
> -       .quad   0x3FF34209790236E1 /* A01 = +1.203622315111197105253e+00 */
> -       .quad   0xBFDB764C0E71BECB /* A02 = -4.290952817018306997277e-01 */
> -       .quad   0x3F962FE0C03F84C0 /* A03 = +2.166701482190513949888e-02 */
> -       .quad   0xBFA4B36B9AD27ECC /* A00 = -4.043136849327097492868e-02 */
> -       .quad   0x3FF3990C5B12FC16 /* A01 = +1.224865298994477935679e+00 */
> -       .quad   0xBFDD5AABB0D01390 /* A02 = -4.586590983092770912322e-01 */
> -       .quad   0x3FA21DAF5CA162DB /* A03 = +3.538272863142363083844e-02 */
> -       .quad   0xBFA7645E4D7BF28B /* A00 = -4.568762489177399105378e-02 */
> -       .quad   0x3FF3EF2FD51C0D9F /* A01 = +1.245895225962932562069e+00 */
> -       .quad   0xBFDF26377E1B686E /* A02 = -4.867075664057044503963e-01 */
> -       .quad   0x3FA8803E756EE812 /* A03 = +4.785342391501513914509e-02 */
> -       .quad   0xBFAA210925C64413 /* A00 = -5.103329263796054643398e-02 */
> -       .quad   0x3FF44349F897D8E7 /* A01 = +1.266427966181760345066e+00 */
> -       .quad   0xBFE06A7B02C6D8E2 /* A02 = -5.129981092675530707226e-01 */
> -       .quad   0x3FAE3F194734F5D0 /* A03 = +5.907515520309980505687e-02 */
> -       .quad   0xBFACDE48F8A19BBB /* A00 = -5.638340029764018351832e-02 */
> -       .quad   0x3FF49439D5466582 /* A01 = +1.286187966447272845727e+00 */
> -       .quad   0xBFE131C7C1063DDC /* A02 = -5.373266954429101183166e-01 */
> -       .quad   0x3FB1ADEEC36AD805 /* A03 = +6.906025191241844940482e-02 */
> -       .quad   0xBFAF905D8F585680 /* A00 = -6.164829611604449866036e-02 */
> -       .quad   0x3FF4E0ED1FD27F99 /* A01 = +1.304913639360142818546e+00 */
> -       .quad   0xBFE1E7A859DC1D3D /* A02 = -5.595285182070380836095e-01 */
> -       .quad   0x3FB3ED018E4642A1 /* A03 = +7.783517573831001679086e-02 */
> -       .quad   0xBFB11595104160BA /* A00 = -6.673556944713512906198e-02 */
> -       .quad   0x3FF528650340490B /* A01 = +1.322361958217302513319e+00 */
> -       .quad   0xBFE28B14B40BC974 /* A02 = -5.794776455425521000109e-01 */
> -       .quad   0x3FB5DF49F5BAF6D7 /* A03 = +8.543836831355676453281e-02 */
> -       .quad   0xBFB2513A97344BA4 /* A00 = -7.155195418844911836587e-02 */
> -       .quad   0x3FF569BA0DB5EE14 /* A01 = +1.338312200124055273420e+00 */
> -       .quad   0xBFE31B53A8B67B20 /* A02 = -5.970857901737396389308e-01 */
> -       .quad   0x3FB787F297BB0544 /* A03 = +9.191814617499455275507e-02 */
> -       .quad   0xBFB37512E848FAFA /* A00 = -7.600515528700305112331e-02 */
> -       .quad   0x3FF5A41F33B403C8 /* A01 = +1.352568819013173495591e+00 */
> -       .quad   0xBFE397F6EA9A58A5 /* A02 = -6.123003561103997904880e-01 */
> -       .quad   0x3FB8EAA9FF25CA06 /* A03 = +9.733068923177520814782e-02 */
> -       .quad   0xBFB47B3E603AFC5D /* A00 = -8.000554894805263217439e-02 */
> -       .quad   0x3FF5D6E3EDE40487 /* A01 = +1.364963464031718975988e+00 */
> -       .quad   0xBFE400D5BCA6D631 /* A02 = -6.251019177058819709103e-01 */
> -       .quad   0x3FBA0B830ED567FE /* A03 = +1.017381583418739132707e-01 */
> -       .quad   0xBFB5BBFE8AC90496 /* A00 = -8.489981544791400103200e-02 */
> -       .quad   0x3FF612BA70107E95 /* A01 = +1.379572332145390989311e+00 */
> -       .quad   0xBFE477EAF1FA7693 /* A02 = -6.396383978023599814478e-01 */
> -       .quad   0x3FBB4784B7C08A95 /* A03 = +1.065600346196709652391e-01 */
> -       .quad   0xBFB6D5D940743939 /* A00 = -8.920057128509463473254e-02 */
> -       .quad   0x3FF644A8748F70CE /* A01 = +1.391762214006166953340e+00 */
> -       .quad   0xBFE4D646AB07EA37 /* A02 = -6.511567440459832267763e-01 */
> -       .quad   0x3FBC354F4E1D5292 /* A03 = +1.101884427747086558913e-01 */
> -       .quad   0xBFB7223D19E4F3D1 /* A00 = -9.036619074045339206069e-02 */
> -       .quad   0x3FF6518FEB42B7FA /* A01 = +1.394912642466350494175e+00 */
> -       .quad   0xBFE4ED86CB87498C /* A02 = -6.539949393430091184598e-01 */
> -       .quad   0x3FBC6D29F28CCA9B /* A03 = +1.110407082713131127205e-01 */
> -       .quad   0xBFB6878652FF6312 /* A00 = -8.800544287022329936754e-02 */
> -       .quad   0x3FF63948C302D040 /* A01 = +1.388985406648330922508e+00 */
> -       .quad   0xBFE4C4E2E7904E17 /* A02 = -6.490339777687407218920e-01 */
> -       .quad   0x3FBC127356CA1ABE /* A03 = +1.096565329445224612481e-01 */
> -       .quad   0xBFB4F5D18B0C91D6 /* A00 = -8.187589306596207427980e-02 */
> -       .quad   0x3FF5FD27EB7DD0B8 /* A01 = +1.374305648697413673176e+00 */
> -       .quad   0xBFE464E01A2B2FC6 /* A02 = -6.373138915164353601739e-01 */
> -       .quad   0x3FBB460547674A30 /* A03 = +1.065371798825160976065e-01 */
> -       .quad   0xBFB26642FA16A685 /* A00 = -7.187288861919156890412e-02 */
> -       .quad   0x3FF59F9BEDE1C95A /* A01 = +1.351467065073470141812e+00 */
> -       .quad   0xBFE3D67920C8FBEA /* A02 = -6.199308052381387046381e-01 */
> -       .quad   0x3FBA24F6A8D3CBC1 /* A03 = +1.021265184570401413078e-01 */
> -       .quad   0xBFADB5294794F097 /* A00 = -5.802277563859197656582e-02 */
> -       .quad   0x3FF523EA7B9CF453 /* A01 = +1.321268542159732772845e+00 */
> -       .quad   0xBFE322A8B55E35DB /* A02 = -5.979808370918208160205e-01 */
> -       .quad   0x3FB8C8673B1B3E37 /* A03 = +9.680791085269722928697e-02 */
> -       .quad   0xBFA4B7D661965C6A /* A00 = -4.046506825687219699450e-02 */
> -       .quad   0x3FF48DE3E2CE3122 /* A01 = +1.284641157110919085227e+00 */
> -       .quad   0xBFE251FED1A7F445 /* A02 = -5.725092024655472622285e-01 */
> -       .quad   0x3FB745699FCABDB9 /* A03 = +9.090290213747821701507e-02 */
> -       .quad   0xBF93E60456E4EE1D /* A00 = -1.943213253365004902773e-02 */
> -       .quad   0x3FF3E1A14E628A59 /* A01 = +1.242585474196536532432e+00 */
> -       .quad   0xBFE16C5AB660E876 /* A02 = -5.444768488007543094653e-01 */
> -       .quad   0x3FB5AD33AA8C188F /* A03 = +8.467410005332197397987e-02 */
> -       .quad   0x3F738C17C47C7961 /* A00 = +4.772274820224659853951e-03 */
> -       .quad   0x3FF3234DDE3BD146 /* A01 = +1.196119182682268355933e+00 */
> -       .quad   0xBFE078C0D77A9D3B /* A02 = -5.147403915952176722826e-01 */
> -       .quad   0x3FB40D74B3E276B8 /* A03 = +7.833032027925923568290e-02 */
> -       .quad   0x3FA0474BECC689C7 /* A00 = +3.179394975019849550746e-02 */
> -       .quad   0x3FF256FB4FA7D18A /* A01 = +1.146235762743432307076e+00 */
> -       .quad   0xBFDEFA8E3FB285E2 /* A02 = -4.840427038235174395098e-01 */
> -       .quad   0x3FB270C007493D59 /* A03 = +7.203293016322244446403e-02 */
> -       .quad   0x3FAF5BD51E479BDC /* A00 = +6.124750132203590768931e-02 */
> -       .quad   0x3FF18081D0B53BC5 /* A01 = +1.093873801484492647162e+00 */
> -       .quad   0xBFDCFE2439BD0C03 /* A02 = -4.530115665294831006626e-01 */
> -       .quad   0x3FB0DEFE5A45AFDD /* A03 = +6.590261176978580437424e-02 */
> -       .quad   0x3FB7BD5D2806EA26 /* A00 = +9.273321368429118805032e-02 */
> -       .quad   0x3FF0A369E35B4440 /* A01 = +1.039895904647224256223e+00 */
> -       .quad   0xBFDB04BC5C9951E7 /* A02 = -4.221640495573226181669e-01 */
> -       .quad   0x3FAEBBBAA9D6DEEF /* A03 = +6.002600978120919278380e-02 */
> -       .quad   0x3FC01BE411098DBC /* A00 = +1.258511622610124502941e-01 */
> -       .quad   0x3FEF85BDABC031C1 /* A01 = +9.850757936961188621083e-01 */
> -       .quad   0xBFD91521375097C2 /* A02 = -3.919146576102968682065e-01 */
> -       .quad   0x3FABE26F0086D982 /* A03 = +5.446192628317005068883e-02 */
> -       .quad   0x3FC481D7FF5776B9 /* A00 = +1.602125164781023347604e-01 */
> -       .quad   0x3FEDC3506C1E7218 /* A01 = +9.300920592973538347792e-01 */
> -       .quad   0xBFD7349A88DA7D4F /* A02 = -3.625856720409119104964e-01 */
> -       .quad   0x3FA936E2DFF8E2AE /* A03 = +4.924687370334389358018e-02 */
> -       .quad   0x3FC90471F96FA27A /* A00 = +1.954481571149420671141e-01 */
> -       .quad   0x3FEC0451601987A2 /* A01 = +8.755270840595026360376e-01 */
> -       .quad   0xBFD5671CD4B898DC /* A02 = -3.344184949259110251063e-01 */
> -       .quad   0x3FA6BB9594603B67 /* A03 = +4.439990459660841243261e-02 */
> -       .quad   0x3FCFD8ADB9ED944C /* A00 = +2.488000066615846384011e-01 */
> -       .quad   0x3FE978C073F6809A /* A01 = +7.959902062321078108909e-01 */
> -       .quad   0xBFD2DF7E00BCD5A9 /* A02 = -2.948908812716931060471e-01 */
> -       .quad   0x3FA3614033D490B2 /* A03 = +3.785133965200894456959e-02 */
> -       .quad   0x3FD4846A12AFE5A0 /* A00 = +3.205819303981005674586e-01 */
> -       .quad   0x3FE63A1147D40472 /* A01 = +6.945883181471244061100e-01 */
> -       .quad   0xBFCFA2268AD34450 /* A02 = -2.471359422548027318101e-01 */
> -       .quad   0x3F9F150201D9FFE0 /* A03 = +3.035357605267552383310e-02 */
> -       .quad   0x3FD9018641F82BEB /* A00 = +3.907180446846598154131e-01 */
> -       .quad   0x3FE33B7C220FFBDC /* A01 = +6.010113396913498995389e-01 */
> -       .quad   0xBFCA4E4187E29C86 /* A02 = -2.055131829740483584423e-01 */
> -       .quad   0x3F98C30CED19F8F4 /* A03 = +2.418155858185229434287e-02 */
> -       .quad   0x3FDD4B8255BEB078 /* A00 = +4.577337109901757905561e-01 */
> -       .quad   0x3FE0858B19D3A49B /* A01 = +5.163016800335243905451e-01 */
> -       .quad   0xBFC5BC929EACE564 /* A02 = -1.698172831327539045176e-01 */
> -       .quad   0x3F93A083CE57DE2B /* A03 = +1.916700312537337677621e-02 */
> -       .quad   0x3FE0A8E5E039295C /* A00 = +5.206174258576470315063e-01 */
> -       .quad   0x3FDC35E1234583FE /* A01 = +4.407885403107342225937e-01 */
> -       .quad   0xBFC1DE034E31AEB9 /* A02 = -1.395877963835710222629e-01 */
> -       .quad   0x3F8EFDEBB3471BDC /* A03 = +1.513275280821162888101e-02 */
> -       .quad   0x3FE2851B603CB2A5 /* A00 = +5.787484054213406503564e-01 */
> -       .quad   0x3FD7F4A44ABBB286 /* A01 = +3.743067483726821853551e-01 */
> -       .quad   0xBFBD3EEB67087DE7 /* A02 = -1.142413260026767657385e-01 */
> -       .quad   0x3F8864F38329E8BD /* A03 = +1.191129917173260922836e-02 */
> -       .quad   0x3FE437DBE3C34AC1 /* A00 = +6.318187187665317283702e-01 */
> -       .quad   0x3FD43F6F789441B5 /* A01 = +3.163717916040938438194e-01 */
> -       .quad   0xBFB7D92E7901B9A4 /* A02 = -9.315767721429907277653e-02 */
> -       .quad   0x3F8327ED342308E1 /* A03 = +9.353497651663324544136e-03 */
> -       .quad   0x3FE5C0977766D55C /* A00 = +6.797597248138731451661e-01 */
> -       .quad   0x3FD10B42A764D8F9 /* A01 = +2.663122782427219115142e-01 */
> -       .quad   0xBFB3633351D3D70F /* A02 = -7.573242900602060456716e-02 */
> -       .quad   0x3F7E079E30FF899C /* A03 = +7.331483779099558922843e-03 */
> -       .quad   0x3FE7202CE08A88C4 /* A00 = +7.226776490754436288455e-01 */
> -       .quad   0x3FCC973EB5662B01 /* A01 = +2.233656297433626314319e-01 */
> -       .quad   0xBFAF70A455F9920B /* A02 = -6.140626477716545211782e-02 */
> -       .quad   0x3F77812411CE99B6 /* A03 = +5.738392731393584730859e-03 */
> -       .quad   0x3FE85879424095B1 /* A00 = +7.608000082006382003286e-01 */
> -       .quad   0x3FC7E73BD1674D84 /* A01 = +1.867441914060742336190e-01 */
> -       .quad   0xBFA96F84E4BF333B /* A02 = -4.967894832916504993525e-02 */
> -       .quad   0x3F72606DDCA6E117 /* A03 = +4.486493251924870105662e-03 */
> -       .quad   0x3FE96BFE4957F4DD /* A00 = +7.944327766887472330737e-01 */
> -       .quad   0x3FC3ED4780D25478 /* A01 = +1.556786898624158421711e-01 */
> -       .quad   0xBFA489C5F9A56B58 /* A02 = -4.011362717093075458408e-02 */
> -       .quad   0x3F6CB5DC17E9AD2A /* A03 = +3.504686231556104931972e-03 */
> -       .quad   0x3FEA5D9CB2F41234 /* A00 = +8.239272589858672724006e-01 */
> -       .quad   0x3FC091A758374DCF /* A01 = +1.294449978582705440555e-01 */
> -       .quad   0xBFA08E436D4B5CE0 /* A02 = -3.233538350257858517978e-02 */
> -       .quad   0x3F666997AD53E6B7 /* A03 = +2.735897297154145629133e-03 */
> -       .quad   0x3FEB3060342CB850 /* A00 = +8.496552485501158713532e-01 */
> -       .quad   0x3FBB7D30BBC7DC1B /* A01 = +1.073790033768634993860e-01 */
> -       .quad   0xBF9AA6BA3443D9E3 /* A02 = -2.602663940430173170060e-02 */
> -       .quad   0x3F617CA764B7850B /* A03 = +2.134634914668814050648e-03 */
> -       .quad   0x3FEBE759A6A0C7B8 /* A00 = +8.719909910635044170135e-01 */
> -       .quad   0x3FB6C10DE6A703FF /* A01 = +8.888327485239243264115e-02 */
> -       .quad   0xBF956C566D8BE1F6 /* A02 = -2.092108768099084498138e-02 */
> -       .quad   0x3F5B46D1A4A59CF8 /* A03 = +1.664833764687232917079e-03 */
> -       .quad   0x3FEC858494887A04 /* A00 = +8.912985707318630268503e-01 */
> -       .quad   0x3FB2CC31F543394D /* A01 = +7.342827070099140762682e-02 */
> -       .quad   0xBF9133477FF69137 /* A02 = -1.679717749142747504343e-02 */
> -       .quad   0x3F5544482FBB4DA5 /* A03 = +1.298017973501022466823e-03 */
> -       .quad   0x3FED0DB59D0E32E9 /* A00 = +9.079235141267335551518e-01 */
> -       .quad   0x3FAF006BAFFC6EF4 /* A01 = +6.055008433597022787787e-02 */
> -       .quad   0xBF8B97146FA2B97A /* A02 = -1.347175565419144252499e-02 */
> -       .quad   0x3F5093B01F4CDC69 /* A03 = +1.011774057770665211434e-03 */
> -       .quad   0x3FEDB487C3EC457C /* A00 = +9.282873942012623835751e-01 */
> -       .quad   0x3FA7390C09D0BD1D /* A01 = +4.535710925881118044112e-02 */
> -       .quad   0xBF83D9F7C3181106 /* A02 = -9.693084374710735778846e-03 */
> -       .quad   0x3F46E34A0A3C0E64 /* A03 = +6.984817050299072134500e-04 */
> -       .quad   0x3FEE5FFCB4E6EB00 /* A00 = +9.492171796076434020506e-01 */
> -       .quad   0x3F9F4913ED00AADF /* A01 = +3.055220731782070861526e-02 */
> -       .quad   0xBF79670BD0E59B5C /* A02 = -6.201788097633133961528e-03 */
> -       .quad   0x3F3BC998EBCAF96D /* A03 = +4.240034429975534616304e-04 */
> -       .quad   0x3FEEDBA41E9542FE /* A00 = +9.643116566968215064293e-01 */
> -       .quad   0x3F94F5DD18D9C24D /* A01 = +2.046914543319848858727e-02 */
> -       .quad   0xBF7034896AA122B9 /* A02 = -3.956352980886528904192e-03 */
> -       .quad   0x3F30DCCB47810B39 /* A03 = +2.573009765038273091199e-04 */
> -       .quad   0x3FEF33F2882520ED /* A00 = +9.750912341196716903724e-01 */
> -       .quad   0x3F8BF37F2CF553FF /* A01 = +1.364802699996836392315e-02 */
> -       .quad   0xBF649F6F05A69619 /* A02 = -2.517430152880317534986e-03 */
> -       .quad   0x3F247623C950AAC9 /* A03 = +1.561087307505231250044e-04 */
> -       .quad   0x3FEF727757751741 /* A00 = +9.827229221489021115943e-01 */
> -       .quad   0x3F828E67912C4400 /* A01 = +9.060677640748693306705e-03 */
> -       .quad   0xBF5A2F51A806CC2C /* A02 = -1.598195784123355826789e-03 */
> -       .quad   0x3F18D35D7687E613 /* A03 = +9.470231965016282719549e-05 */
> -       .quad   0x3FEF9E6325C5942A /* A00 = +9.880843866091073568469e-01 */
> -       .quad   0x3F788AB117618F76 /* A01 = +5.991641772286606867914e-03 */
> -       .quad   0xBF5096EAB0B1EA89 /* A02 = -1.012543859160305046233e-03 */
> -       .quad   0x3F0E1E50EC4435AB /* A03 = +5.744633156910412119652e-05 */
> -       .quad   0x3FEFBD0784049369 /* A00 = +9.918248728250605994461e-01 */
> -       .quad   0x3F702BBD8294035F /* A01 = +3.947963975634432264028e-03 */
> -       .quad   0xBF44FB55E0F00593 /* A02 = -6.403130845457509273330e-04 */
> -       .quad   0x3F0244DCD723230A /* A03 = +3.484534217219031730379e-05 */
> -       .quad   0x3FEFD245E2366A43 /* A00 = +9.944180887426415926811e-01 */
> -       .quad   0x3F653D82EC088433 /* A01 = +2.592807490387838333795e-03 */
> -       .quad   0xBF3A7DF75E013CB8 /* A02 = -4.042366908878036561859e-04 */
> -       .quad   0x3EF6298E69F991CD /* A03 = +2.113564425911141559972e-05 */
> -       .quad   0x3FEFE0EAA508BC69 /* A00 = +9.962056372950317539861e-01 */
> -       .quad   0x3F5BD0771AF3FDDA /* A01 = +1.697651208644282514598e-03 */
> -       .quad   0xBF30B2E1254DE571 /* A02 = -2.548026725928887099328e-04 */
> -       .quad   0x3EEAE28B70EC0256 /* A03 = +1.281973848454955042307e-05 */
> -       .quad   0x3FEFEAF5303D7F96 /* A00 = +9.974313680831865536192e-01 */
> -       .quad   0x3F5229111365657E /* A01 = +1.108423877289460134782e-03 */
> -       .quad   0xBF250572D04DFE66 /* A02 = -1.603796628408704519168e-04 */
> -       .quad   0x3EE04E89BB57C981 /* A03 = +7.775682983689149966743e-06 */
> -       .quad   0x3FEFF1CF52F1CF44 /* A00 = +9.982678051005469122003e-01 */
> -       .quad   0x3F47A71316147CEB /* A01 = +7.218211359577819110842e-04 */
> -       .quad   0xBF1A6D7604055719 /* A02 = -1.008132248946049582547e-04 */
> -       .quad   0x3ED3C8047586A85C /* A03 = +4.716233739913014633626e-06 */
> -       .quad   0x3FEFF6770369EF69 /* A00 = +9.988360468555416149528e-01 */
> -       .quad   0x3F3EBB261180FBF0 /* A01 = +4.689186039321105101130e-04 */
> -       .quad   0xBF1097754FE19D7F /* A02 = -6.329206004950480057066e-05 */
> -       .quad   0x3EC7FEFF83BCA0A7 /* A03 = +2.860556404988488738366e-06 */
> -       .quad   0x3FEFF99D42371AC4 /* A00 = +9.992204945818561334647e-01 */
> -       .quad   0x3F33EB2AEC271F59 /* A01 = +3.039340773764907474054e-04 */
> -       .quad   0xBF04CF18E0FC0D79 /* A02 = -3.968996690952969588805e-05 */
> -       .quad   0x3EBD1BDBD6019BE9 /* A03 = +1.735021065507727833886e-06 */
> -       .quad   0x3FEFFBBCA32B0D91 /* A00 = +9.994795977476532700123e-01 */
> -       .quad   0x3F29C41E1615110A /* A01 = +1.965796209707565346710e-04 */
> -       .quad   0xBEFA11F93D9DCB5A /* A02 = -2.486248909101414873235e-05 */
> -       .quad   0x3EB1A7CA4546F7A7 /* A03 = +1.052345642723709228769e-06 */
> -       .quad   0x3FEFFD298B8E8DE2 /* A00 = +9.996535993308806045121e-01 */
> -       .quad   0x3F20A1C42D523C5B /* A01 = +1.268913244172078754520e-04 */
> -       .quad   0xBEF0507A364AFAE4 /* A02 = -1.555859070622834605755e-05 */
> -       .quad   0x3EA56ACA17E7CDF4 /* A03 = +6.382806956848098872313e-07 */
> -       .quad   0x3FEFFE1DC82BA5A3 /* A00 = +9.997700604991915929176e-01 */
> -       .quad   0x3F156E73B90F1769 /* A01 = +8.175450626798714452801e-05 */
> -       .quad   0xBEE4663579D0A09F /* A02 = -9.727122057226747625365e-06 */
> -       .quad   0x3E99FAF6FEC5D4C1 /* A03 = +3.871371052824002996020e-07 */
> -       .quad   0x3FEFFEF8D0BB5E81 /* A00 = +9.998745037837154514548e-01 */
> -       .quad   0x3F06686DA18D39C3 /* A01 = +4.273972098777251447726e-05 */
> -       .quad   0xBED46BC298073E90 /* A02 = -4.868731025855742842491e-06 */
> -       .quad   0x3E88E42286B9D0FD /* A03 = +1.854535328530838170114e-07 */
> -       .quad   0x3FEFFF8DBC68DDC7 /* A00 = +9.999455146670975791423e-01 */
> -       .quad   0x3EF26B2953A80AF0 /* A01 = +1.756534514108903368909e-05 */
> -       .quad   0xBEBFC4472D580F83 /* A02 = -1.893443529411295465239e-06 */
> -       .quad   0x3E72505B4553D19F /* A03 = +6.822456673547912277047e-08 */
> -       .quad   0x3FEFFFCED1276609 /* A00 = +9.999765477215883935358e-01 */
> -       .quad   0x3EDE1A94C7CC58F5 /* A01 = +7.177313020153979672606e-06 */
> -       .quad   0xBEA8A2C988744E57 /* A02 = -7.342066660497443762363e-07 */
> -       .quad   0x3E5AF30036BBBAF4 /* A03 = +2.509841882843541084885e-08 */
> -       .quad   0x3FEFFFEAFE70FCFC /* A00 = +9.999899835164849370983e-01 */
> -       .quad   0x3EC879175E3549F5 /* A01 = +2.917410471128503564412e-06 */
> -       .quad   0xBE930E36677D1813 /* A02 = -2.839493400307523115929e-07 */
> -       .quad   0x3E43D4005B42D48F /* A03 = +9.233192745401904898013e-09 */
> -       .quad   0x3ff0000000000000
> -       .quad   0x0000000000000000
> -       .quad   0x0000000000000000
> -       .quad   0x0000000000000000
> -       .align  32
> -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSignMask */
> -       .align  32
> -       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _sAbsMask */
> -       .align  32
> -       .long   0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000 /* _iExpMantMask */
> -       .align  32
> -       .long   0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000 /* _iExpMask */
> -       .align  32
> -       .long   0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000 /* _iMinIdxOfsMask */
> -       .align  32
> -       .long   0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000 /* _iMaxIdxMask */
> -       .align  32
> -       .type   __svml_stanh_data_internal, @object
> -       .size   __svml_stanh_data_internal, .-__svml_stanh_data_internal
> +       /* All results have been written to 32(%rsp).  */
Should the "32" be removed?
> +       vmovups (%rsp), %ymm0
> +       /* Restore rsp.  */
> +       movq    %r13, %rsp
> +       cfi_def_cfa_register(rsp)
> +       /* Restore callee save registers.  */
> +       popq    %rbp
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_restore(rbp)
> +       popq    %rbx
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_restore(rbp)
> +       popq    %r13
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_restore(r13)
> +       ret
> +END(_ZGVdN8v_tanhf_avx2)
> --
> 2.34.1
>


-- 
H.J.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v2 1/7] x86: Improve svml_s_atanhf16_core_avx512.S
  2022-06-09 15:59   ` [PATCH v2 1/7] x86: Improve svml_s_atanhf16_core_avx512.S H.J. Lu
@ 2022-06-09 16:56     ` Noah Goldstein
  2022-06-09 16:57       ` H.J. Lu
  0 siblings, 1 reply; 48+ messages in thread
From: Noah Goldstein @ 2022-06-09 16:56 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Thu, Jun 9, 2022 at 9:00 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, Jun 8, 2022 at 5:05 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Improvements are:
> >     1. Reduce code size (-64 bytes).
> >     2. Remove redundant move instructions.
> >     3. Slightly improve instruction selection/scheduling where
> >        possible.
> >     4. Reduce rodata size ([-128, -188] bytes).
> >
> > The throughput improvement is not significant as the port 0 bottleneck
> > is unavoidable.
> >         Function, New Time, Old Time, New / Old
> > _ZGVeN16v_atanhf,     1.39,    1.408,     0.987
> > ---
> >  .../multiarch/svml_s_atanhf16_core_avx512.S   | 474 +++++++++---------
> >  1 file changed, 244 insertions(+), 230 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> > index a1cd920a0f..3d808ac2bd 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> > @@ -31,53 +31,50 @@
> >   *
> >   */
> >
> > -/* Offsets for data table __svml_satanh_data_internal_avx512
> > - */
> > -#define Log_tbl_H                      0
> > -#define Log_tbl_L                      128
> > -#define One                            256
> > -#define AbsMask                                320
> > -#define AddB5                          384
> > -#define RcpBitMask                     448
> > -#define poly_coeff3                    512
> > -#define poly_coeff2                    576
> > -#define poly_coeff1                    640
> > -#define poly_coeff0                    704
> > -#define Half                           768
> > -#define L2H                            832
> > -#define L2L                            896
> > +/* Offsets for data table __svml_satanh_data_internal_avx512 and
> > +   __svml_satanh_data_internal_avx512_al64. Ordered by use in the
> > +   function. On cold-starts this might help the prefetcher. Possibly
> > +   a better idea is to interleave start/end so that the prefetcher is
> > +   less likely to detect a stream and pull irrelevant lines into
> > +   cache.  */
> > +
> > +/* Offset into __svml_satanh_data_internal_avx512. 4-byte aligned as
> > +   the memory is broadcast to {1to16}.  */
> > +#define AbsMask                                0
> > +
> > +/* Offset into __svml_satanh_data_internal_avx512_al64. The full value
> > +   is used here.  */
> > +#define One                            0
> > +#define AddB5                          64
> > +#define RcpBitMask                     128
> > +#define Log_tbl_L_lo                   192
> > +#define Log_tbl_L_hi                   256
> > +#define Log_tbl_H_lo                   320
> > +#define Log_tbl_H_hi                   384
> > +#define L2H                            448
> > +#define L2L                            512
> > +#define poly_coeff3                    576
> > +#define poly_coeff2                    640
> > +#define poly_coeff1                    704
> >
> >  #include <sysdep.h>
> >
> > +#define ATANHF_DATA(x)                 ((x)+__svml_satanh_data_internal_avx512_al64)
> > +
> >         .section .text.exex512, "ax", @progbits
> >  ENTRY(_ZGVeN16v_atanhf_skx)
> > -       pushq   %rbp
> > -       cfi_def_cfa_offset(16)
> > -       movq    %rsp, %rbp
> > -       cfi_def_cfa(6, 16)
> > -       cfi_offset(6, -16)
> > -       andq    $-64, %rsp
> > -       subq    $192, %rsp
> > -       vmovups One+__svml_satanh_data_internal_avx512(%rip), %zmm4
> > -
> > -       /* round reciprocals to 1+5b mantissas */
> > -       vmovups AddB5+__svml_satanh_data_internal_avx512(%rip), %zmm14
> > -       vmovups RcpBitMask+__svml_satanh_data_internal_avx512(%rip), %zmm1
> > -       vmovaps %zmm0, %zmm11
> > -       vandps  AbsMask+__svml_satanh_data_internal_avx512(%rip), %zmm11, %zmm6
> > +       vandps  AbsMask+__svml_satanh_data_internal_avx512(%rip){1to16}, %zmm0, %zmm6
> > +       vmovups ATANHF_DATA(One)(%rip), %zmm4
> >
> >         /* 1+y */
> >         vaddps  {rn-sae}, %zmm4, %zmm6, %zmm9
> >
> >         /* 1-y */
> >         vsubps  {rn-sae}, %zmm6, %zmm4, %zmm8
> > -       vxorps  %zmm6, %zmm11, %zmm10
> > -
> > -       /* Yp_high */
> > -       vsubps  {rn-sae}, %zmm4, %zmm9, %zmm2
> >
> > -       /* -Ym_high */
> > -       vsubps  {rn-sae}, %zmm4, %zmm8, %zmm5
> > +       /* round reciprocals to 1+5b mantissas */
> > +       vmovups ATANHF_DATA(AddB5)(%rip), %zmm14
> > +       vmovups ATANHF_DATA(RcpBitMask)(%rip), %zmm1
> >
> >         /* RcpP ~ 1/Yp */
> >         vrcp14ps %zmm9, %zmm12
> > @@ -85,15 +82,21 @@ ENTRY(_ZGVeN16v_atanhf_skx)
> >         /* RcpM ~ 1/Ym */
> >         vrcp14ps %zmm8, %zmm13
> >
> > +       /* Yp_high */
> > +       vsubps  {rn-sae}, %zmm4, %zmm9, %zmm2
> > +
> > +       /* -Ym_high */
> > +       vsubps  {rn-sae}, %zmm4, %zmm8, %zmm5
> > +
> > +
> >         /* input outside (-1, 1) ? */
> > -       vcmpps  $21, {sae}, %zmm4, %zmm6, %k0
> >         vpaddd  %zmm14, %zmm12, %zmm15
> > -       vpaddd  %zmm14, %zmm13, %zmm0
> > +       vpaddd  %zmm14, %zmm13, %zmm12
> >
> >         /* Yp_low */
> >         vsubps  {rn-sae}, %zmm2, %zmm6, %zmm3
> >         vandps  %zmm1, %zmm15, %zmm7
> > -       vandps  %zmm1, %zmm0, %zmm12
> > +       vandps  %zmm1, %zmm12, %zmm12
> >
> >         /* Ym_low */
> >         vaddps  {rn-sae}, %zmm5, %zmm6, %zmm5
> > @@ -102,225 +105,199 @@ ENTRY(_ZGVeN16v_atanhf_skx)
> >         vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9
> >
> >         /* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low */
> > -       vfmsub231ps {rn-sae}, %zmm12, %zmm8, %zmm4
> > -       vmovups Log_tbl_L+__svml_satanh_data_internal_avx512(%rip), %zmm8
> > -       vmovups Log_tbl_L+64+__svml_satanh_data_internal_avx512(%rip), %zmm13
> > +       vfmsub213ps {rn-sae}, %zmm4, %zmm12, %zmm8
> > +
> > +       vmovups ATANHF_DATA(Log_tbl_L_lo)(%rip), %zmm10
> > +       vmovups ATANHF_DATA(Log_tbl_L_hi)(%rip), %zmm13
> >
> >         /* exponents */
> > -       vgetexpps {sae}, %zmm7, %zmm15
> >         vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9
> > +       vgetexpps {sae}, %zmm7, %zmm15
> > +
> >
> >         /* Table lookups */
> > -       vmovups __svml_satanh_data_internal_avx512(%rip), %zmm6
> > +       vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm8
> >         vgetexpps {sae}, %zmm12, %zmm14
> > -       vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm4
> > +
> >
> >         /* Prepare table index */
> >         vpsrld  $18, %zmm7, %zmm3
> >         vpsrld  $18, %zmm12, %zmm2
> > -       vmovups Log_tbl_H+64+__svml_satanh_data_internal_avx512(%rip), %zmm7
> > -       vmovups poly_coeff1+__svml_satanh_data_internal_avx512(%rip), %zmm12
> > -
> > +       vmovups ATANHF_DATA(Log_tbl_H_lo)(%rip), %zmm11
> > +       vmovups ATANHF_DATA(Log_tbl_H_hi)(%rip), %zmm7
> >         /* Km-Kp */
> > +
> > +       vmovaps %zmm3, %zmm5
> > +       vpermi2ps %zmm13, %zmm10, %zmm3
> > +       vpermt2ps %zmm13, %zmm2, %zmm10
> > +       vpermi2ps %zmm7, %zmm11, %zmm5
> > +       vpermt2ps %zmm7, %zmm2, %zmm11
> >         vsubps  {rn-sae}, %zmm15, %zmm14, %zmm1
> > -       kmovw   %k0, %edx
> > -       vmovaps %zmm3, %zmm0
> > -       vpermi2ps %zmm13, %zmm8, %zmm3
> > -       vpermt2ps %zmm13, %zmm2, %zmm8
> > -       vpermi2ps %zmm7, %zmm6, %zmm0
> > -       vpermt2ps %zmm7, %zmm2, %zmm6
> > -       vsubps  {rn-sae}, %zmm3, %zmm8, %zmm5
> > +       vsubps  {rn-sae}, %zmm3, %zmm10, %zmm7
> >
> >         /* K*L2H + Th */
> > -       vmovups L2H+__svml_satanh_data_internal_avx512(%rip), %zmm2
> > +       vmovups ATANHF_DATA(L2H)(%rip), %zmm2
> >
> >         /* K*L2L + Tl */
> > -       vmovups L2L+__svml_satanh_data_internal_avx512(%rip), %zmm3
> > -
> > -       /* polynomials */
> > -       vmovups poly_coeff3+__svml_satanh_data_internal_avx512(%rip), %zmm7
> > -       vmovups poly_coeff0+__svml_satanh_data_internal_avx512(%rip), %zmm13
> > +       vmovups ATANHF_DATA(L2L)(%rip), %zmm3
> >
> >         /* table values */
> > -       vsubps  {rn-sae}, %zmm0, %zmm6, %zmm0
> > -       vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm0
> > -       vfmadd213ps {rn-sae}, %zmm5, %zmm3, %zmm1
> > -       vmovups poly_coeff2+__svml_satanh_data_internal_avx512(%rip), %zmm3
> > -       vmovaps %zmm3, %zmm2
> > -       vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm2
> > -       vfmadd231ps {rn-sae}, %zmm4, %zmm7, %zmm3
> > -       vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm2
> > -       vfmadd213ps {rn-sae}, %zmm12, %zmm4, %zmm3
> > -       vfmadd213ps {rn-sae}, %zmm13, %zmm9, %zmm2
> > -       vfmadd213ps {rn-sae}, %zmm13, %zmm4, %zmm3
> > +       vsubps  {rn-sae}, %zmm5, %zmm11, %zmm5
> > +       vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm5
> > +       vfmadd213ps {rn-sae}, %zmm7, %zmm3, %zmm1
> > +       /* polynomials */
> > +       vmovups ATANHF_DATA(poly_coeff3)(%rip), %zmm7
> > +       vmovups ATANHF_DATA(poly_coeff2)(%rip), %zmm10
> > +       vmovaps %zmm10, %zmm14
> > +       vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm10
> > +       vfmadd231ps {rn-sae}, %zmm8, %zmm7, %zmm14
> > +       vmovups ATANHF_DATA(poly_coeff1)(%rip), %zmm12
> > +       vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm10
> > +       vfmadd213ps {rn-sae}, %zmm12, %zmm8, %zmm14
> > +       vfmadd213ps {rn-sae}, %zmm4, %zmm9, %zmm10
> > +       vfmadd213ps {rn-sae}, %zmm4, %zmm8, %zmm14
> >
> >         /* (K*L2L + Tl) + Rp*PolyP */
> > -       vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm2
> > -       vorps   Half+__svml_satanh_data_internal_avx512(%rip), %zmm10, %zmm9
> > +       vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm10
> > +
> > +       /* zmm12 = zmm12 & (zmm4 | zmm0).  */
> > +       vpternlogq $0xe0, %zmm0, %zmm4, %zmm12
> >
> >         /* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM */
> > -       vfnmadd213ps {rn-sae}, %zmm2, %zmm4, %zmm3
> > -       vaddps  {rn-sae}, %zmm3, %zmm0, %zmm4
> > -       vmulps  {rn-sae}, %zmm9, %zmm4, %zmm0
> > +       vfnmadd213ps {rn-sae}, %zmm5, %zmm8, %zmm14
> > +       vaddps  {rn-sae}, %zmm14, %zmm10, %zmm8
> > +
> > +       vcmpps  $21, {sae}, %zmm4, %zmm6, %k0
> > +       kmovw   %k0, %edx
> >         testl   %edx, %edx
> >
> >         /* Go to special inputs processing branch */
> >         jne     L(SPECIAL_VALUES_BRANCH)
> > -       # LOE rbx r12 r13 r14 r15 edx zmm0 zmm11
> > +       # LOE rbx r12 r13 r14 r15 zmm0 zmm8 zmm12
> > +       vmulps  {rn-sae}, %zmm12, %zmm8, %zmm0
> >
> > -       /* Restore registers
> > -        * and exit the function
> > -        */
> > -
> > -L(EXIT):
> > -       movq    %rbp, %rsp
> > -       popq    %rbp
> > -       cfi_def_cfa(7, 8)
> > -       cfi_restore(6)
> > +       /* No register to restore on fast path.  */
> >         ret
> > -       cfi_def_cfa(6, 16)
> > -       cfi_offset(6, -16)
> > -
> > -       /* Branch to process
> > -        * special inputs
> > -        */
> >
> > +       /* Cold case. edx has 1s where there was a special value that
> > +          needs to be handled by a atanhf call. Optimize for code size
> > +          moreso than speed here. */
> >  L(SPECIAL_VALUES_BRANCH):
> > -       vmovups %zmm11, 64(%rsp)
> > -       vmovups %zmm0, 128(%rsp)
> > -       # LOE rbx r12 r13 r14 r15 edx zmm0
> > -
> > -       xorl    %eax, %eax
> > -       # LOE rbx r12 r13 r14 r15 eax edx
> > -
> > -       vzeroupper
> > -       movq    %r12, 16(%rsp)
> > -       /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> > -       .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> > -       movl    %eax, %r12d
> > -       movq    %r13, 8(%rsp)
> > -       /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> > -       .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> > -       movl    %edx, %r13d
> > -       movq    %r14, (%rsp)
> > -       /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> > -       .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> > -       # LOE rbx r15 r12d r13d
> > -
> > -       /* Range mask
> > -        * bits check
> > +       # LOE rbx rdx r12 r13 r14 r15 zmm0 zmm8 zmm12
> > +    /* Use r13 to save/restore the stack. This allows us to use rbp as
> > +       callee save register saving code size. */
> > +       pushq   %r13
> > +       cfi_adjust_cfa_offset(8)
> > +       cfi_offset(r13, -16)
> > +       /* Need to callee save registers to preserve state across atanhf calls.
> >          */
> > +       pushq   %rbx
> > +       cfi_adjust_cfa_offset(8)
> > +       cfi_offset(rbx, -24)
> > +       pushq   %rbp
> > +       cfi_adjust_cfa_offset(8)
> > +       cfi_offset(rbp, -32)
> > +       movq    %rsp, %r13
> > +       cfi_def_cfa_register(r13)
> >
> > -L(RANGEMASK_CHECK):
> > -       btl     %r12d, %r13d
> > -
> > -       /* Call scalar math function */
> > -       jc      L(SCALAR_MATH_CALL)
> > -       # LOE rbx r15 r12d r13d
> > -
> > -       /* Special inputs
> > -        * processing loop
> > -        */
> > +       /* Align stack and make room for 2x zmm vectors.  */
> > +       andq    $-64, %rsp
> > +       addq    $-128, %rsp
> > +       vmulps  {rn-sae}, %zmm12, %zmm8, %zmm1
> > +       vmovaps %zmm1, (%rsp)
> > +       vmovaps %zmm0, 64(%rsp)
> > +       vzeroupper
> >
> > +       /* edx has 1s where there was a special value that needs to be handled
> > +          by a atanhf call.  */
> > +       movl    %edx, %ebx
> >  L(SPECIAL_VALUES_LOOP):
> > -       incl    %r12d
> > -       cmpl    $16, %r12d
> > -
> > -       /* Check bits in range mask */
> > -       jl      L(RANGEMASK_CHECK)
> > -       # LOE rbx r15 r12d r13d
> > -
> > -       movq    16(%rsp), %r12
> > -       cfi_restore(12)
> > -       movq    8(%rsp), %r13
> > -       cfi_restore(13)
> > -       movq    (%rsp), %r14
> > -       cfi_restore(14)
> > -       vmovups 128(%rsp), %zmm0
> > -
> > -       /* Go to exit */
> > -       jmp     L(EXIT)
> > -       /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> > -       .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> > -       /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> > -       .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> > -       /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> > -       .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> > -       # LOE rbx r12 r13 r14 r15 zmm0
> > -
> > -       /* Scalar math fucntion call
> > -        * to process special input
> > -        */
> > -
> > -L(SCALAR_MATH_CALL):
> > -       movl    %r12d, %r14d
> > -       movss   64(%rsp, %r14, 4), %xmm0
> > +       # LOE rbx rbp r12 r13 r14 r15
> > +       /* use rbp as index for special value that is saved across calls to
> > +          atanhf. We technically don't need a callee save register here as offset
> > +          to rsp is always [0, 56] so we can restore rsp by realigning to 64.
> > +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> > +          in the loop. Realigning also costs more code size.  */
> > +       xorl    %ebp, %ebp
> > +       tzcntl  %ebx, %ebp
> > +
> > +       /* Scalar math function call to process special input.  */
> > +       movss   64(%rsp, %rbp, 4), %xmm0
> >         call    atanhf@PLT
> > -       # LOE rbx r14 r15 r12d r13d xmm0
> > -
> > -       movss   %xmm0, 128(%rsp, %r14, 4)
> >
> > -       /* Process special inputs in loop */
> > -       jmp     L(SPECIAL_VALUES_LOOP)
> > -       # LOE rbx r15 r12d r13d
> > +       /* No good way to avoid the store-forwarding fault this will cause on
> > +          return. `lfence` avoids the SF fault but at greater cost as it
> > +          serialized stack/callee save restoration.  */
> > +       movss   %xmm0, (%rsp, %rbp, 4)
> > +
> > +       blsrl   %ebx, %ebx
> > +       jnz     L(SPECIAL_VALUES_LOOP)
> > +       # LOE r12 r13 r14 r15
> > +
> > +       /* All results have been written to 64(%rsp).  */
>
> The return value is loaded from (%rsp).   Should all results be
> written to (%rsp)?

Correct. Fixed in V2.
>
> > +       vmovaps (%rsp), %zmm0
> > +       /* Restore rsp.  */
> > +       movq    %r13, %rsp
> > +       cfi_def_cfa_register(rsp)
> > +       /* Restore callee save registers.  */
> > +       popq    %rbp
> > +       cfi_adjust_cfa_offset(-8)
> > +       cfi_restore(rbp)
> > +       popq    %rbx
> > +       cfi_adjust_cfa_offset(-8)
> > +       cfi_restore(rbp)
> > +       popq    %r13
> > +       cfi_adjust_cfa_offset(-8)
> > +       cfi_restore(r13)
> > +       ret
> >  END(_ZGVeN16v_atanhf_skx)
> >
> >         .section .rodata, "a"
> > -       .align  64
> > -
> > +       .align  4
> >  #ifdef __svml_satanh_data_internal_avx512_typedef
> >  typedef unsigned int VUINT32;
> > -typedef struct {
> > -       __declspec(align(64)) VUINT32 Log_tbl_H[32][1];
> > -       __declspec(align(64)) VUINT32 Log_tbl_L[32][1];
> > +typedef struct{
> > +       __declspec(align(4)) VUINT32 AbsMask[1][1];
> >         __declspec(align(64)) VUINT32 One[16][1];
> > -       __declspec(align(64)) VUINT32 AbsMask[16][1];
> >         __declspec(align(64)) VUINT32 AddB5[16][1];
> >         __declspec(align(64)) VUINT32 RcpBitMask[16][1];
> > +       __declspec(align(64)) VUINT32 Log_tbl_L_lo[16][1];
> > +       __declspec(align(64)) VUINT32 Log_tbl_L_hi[16][1];
> > +       __declspec(align(64)) VUINT32 Log_tbl_H_lo[16][1];
> > +       __declspec(align(64)) VUINT32 Log_tbl_H_hi[16][1];
> > +       __declspec(align(64)) VUINT32 L2H[16][1];
> > +       __declspec(align(64)) VUINT32 L2L[16][1];
> >         __declspec(align(64)) VUINT32 poly_coeff3[16][1];
> >         __declspec(align(64)) VUINT32 poly_coeff2[16][1];
> >         __declspec(align(64)) VUINT32 poly_coeff1[16][1];
> > -       __declspec(align(64)) VUINT32 poly_coeff0[16][1];
> > -       __declspec(align(64)) VUINT32 Half[16][1];
> > -       __declspec(align(64)) VUINT32 L2H[16][1];
> > -       __declspec(align(64)) VUINT32 L2L[16][1];
> >  } __svml_satanh_data_internal_avx512;
> >  #endif
> >  __svml_satanh_data_internal_avx512:
> > -       /* Log_tbl_H */
> > -       .long   0x00000000
> > -       .long   0x3cfc0000
> > -       .long   0x3d780000
> > -       .long   0x3db78000
> > -       .long   0x3df10000
> > -       .long   0x3e14c000
> > -       .long   0x3e300000
> > -       .long   0x3e4a8000
> > -       .long   0x3e648000
> > -       .long   0x3e7dc000
> > -       .long   0x3e8b4000
> > -       .long   0x3e974000
> > -       .long   0x3ea30000
> > -       .long   0x3eae8000
> > -       .long   0x3eb9c000
> > -       .long   0x3ec4e000
> > -       .long   0x3ecfa000
> > -       .long   0x3eda2000
> > -       .long   0x3ee48000
> > -       .long   0x3eeea000
> > -       .long   0x3ef8a000
> > -       .long   0x3f013000
> > -       .long   0x3f05f000
> > -       .long   0x3f0aa000
> > -       .long   0x3f0f4000
> > -       .long   0x3f13d000
> > -       .long   0x3f184000
> > -       .long   0x3f1ca000
> > -       .long   0x3f20f000
> > -       .long   0x3f252000
> > -       .long   0x3f295000
> > -       .long   0x3f2d7000
> > -       /* Log_tbl_L */
> > +       /* Leave this at front so we can potentially save space due to
> > +          smaller alignment constraint.  */
> > +       .align  4
> > +    /* AbsMask */
> > +       .long   0x7fffffff
> > +       .align  64
> > +__svml_satanh_data_internal_avx512_al64:
> > +       /* One */
> > +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> > +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> > +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> > +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> > +       /* AddB5 */
> > +       .align  64
> > +       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000
> > +       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000
> > +       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000
> > +       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000
> > +       /* RcpBitMask */
> > +       .align  64
> > +       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> > +       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> > +       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> > +       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> > +       /* Log_tbl_L_lo */
> >         .align  64
> >         .long   0x00000000
> >         .long   0x3726c39e
> > @@ -338,6 +315,8 @@ __svml_satanh_data_internal_avx512:
> >         .long   0x38dedfac
> >         .long   0x38ebfb5e
> >         .long   0xb8e63c9f
> > +       /* Log_tbl_L_hi */
> > +       .align  64
> >         .long   0xb85c1340
> >         .long   0x38777bcd
> >         .long   0xb6038656
> > @@ -354,39 +333,74 @@ __svml_satanh_data_internal_avx512:
> >         .long   0x38f85db0
> >         .long   0x37b4996f
> >         .long   0xb8bfb3ca
> > -       /* One */
> > +       /* Log_tbl_H_lo */
> >         .align  64
> > -       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> > -       /* AbsMask */
> > +       .long   0x00000000
> > +       .long   0x3cfc0000
> > +       .long   0x3d780000
> > +       .long   0x3db78000
> > +       .long   0x3df10000
> > +       .long   0x3e14c000
> > +       .long   0x3e300000
> > +       .long   0x3e4a8000
> > +       .long   0x3e648000
> > +       .long   0x3e7dc000
> > +       .long   0x3e8b4000
> > +       .long   0x3e974000
> > +       .long   0x3ea30000
> > +       .long   0x3eae8000
> > +       .long   0x3eb9c000
> > +       .long   0x3ec4e000
> > +       /* Log_tbl_H_hi */
> >         .align  64
> > -       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> > -       /* AddB5 */
> > +       .long   0x3ecfa000
> > +       .long   0x3eda2000
> > +       .long   0x3ee48000
> > +       .long   0x3eeea000
> > +       .long   0x3ef8a000
> > +       .long   0x3f013000
> > +       .long   0x3f05f000
> > +       .long   0x3f0aa000
> > +       .long   0x3f0f4000
> > +       .long   0x3f13d000
> > +       .long   0x3f184000
> > +       .long   0x3f1ca000
> > +       .long   0x3f20f000
> > +       .long   0x3f252000
> > +       .long   0x3f295000
> > +       .long   0x3f2d7000
> > +       /* L2H = log(2)_high */
> >         .align  64
> > -       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000
> > -       /* RcpBitMask */
> > +       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> > +       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> > +       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> > +       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> > +       /* L2L = log(2)_low */
> >         .align  64
> > -       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> > +       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> > +       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> > +       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> > +       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> >         /* poly_coeff3 */
> >         .align  64
> > -       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> > +       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> > +       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> > +       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> > +       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> >         /* poly_coeff2 */
> >         .align  64
> > -       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> > +       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> > +       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> > +       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> > +       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> >         /* poly_coeff1 */
> >         .align  64
> > -       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> > -       /* poly_coeff0 */
> > -       .align  64
> > -       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> > -       /* Half */
> > -       .align  64
> > -       .long   0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
> > -       /* L2H = log(2)_high */
> > -       .align  64
> > -       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> > -       /* L2L = log(2)_low */
> > -       .align  64
> > -       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> > +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> > +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> > +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> > +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> >         .align  64
> > +       .type   __svml_satanh_data_internal_avx512_al64, @object
> > +       .size   __svml_satanh_data_internal_avx512_al64, .-__svml_satanh_data_internal_avx512_al64
> >         .type   __svml_satanh_data_internal_avx512, @object
> >         .size   __svml_satanh_data_internal_avx512, .-__svml_satanh_data_internal_avx512
> > --
> > 2.34.1
> >
>
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v2 2/7] x86: Improvement svml_s_atanhf8_core_avx2.S
  2022-06-09 16:01     ` H.J. Lu
@ 2022-06-09 16:56       ` Noah Goldstein
  0 siblings, 0 replies; 48+ messages in thread
From: Noah Goldstein @ 2022-06-09 16:56 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Thu, Jun 9, 2022 at 9:01 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, Jun 8, 2022 at 5:05 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Improvements are:
> >     1. Reduce code size (-60 bytes).
> >     2. Remove redundant move instructions.
> >     3. Slightly improve instruction selection/scheduling where
> >        possible.
> >     4. Prefer registers which get short instruction encoding.
> >     5. Shrink rodata usage (-32 bytes).
> >
> > The throughput improvement is not that significant (3-5%) as the
> > port 0 bottleneck is unavoidable.
> >
> >        Function, New Time, Old Time, New / Old
> > _ZGVdN8v_atanhf,    2.799,    2.923,     0.958
> > ---
> >  .../fpu/multiarch/svml_s_atanhf8_core_avx2.S  | 405 +++++++++---------
> >  1 file changed, 202 insertions(+), 203 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
> > index c1ea1c3353..6113d366c2 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
> > @@ -30,305 +30,304 @@
> >   *
> >   */
> >
> > -/* Offsets for data table __svml_satanh_data_internal
> > - */
> > +/* Offsets for data table __svml_satanh_data_internal. Ordered by use
> > +   in the function. On cold-starts this might help the prefetcher.
> > +   Possibly a better idea is to interleave start/end so that the
> > +   prefetcher is less likely to detect a stream and pull irrelevant
> > +   lines into cache.  */
> >  #define SgnMask                                0
> >  #define sOne                           32
> > -#define sPoly                          64
> > -#define iBrkValue                      320
> > -#define iOffExpoMask                   352
> > -#define sHalf                          384
> > -#define sSign                          416
> > -#define sTopMask12                     448
> > -#define TinyRange                      480
> > -#define sLn2                           512
> > +#define sTopMask12                     64
> > +#define TinyRange                      96
> > +#define iBrkValue                      128
> > +#define iOffExpoMask                   160
> > +#define sPoly                          192
> > +#define sLn2                           448
> > +#define sHalf                          480
> >
> >  #include <sysdep.h>
> > +#define ATANHF_DATA(x)                 ((x)+__svml_satanh_data_internal)
> >
> >         .section .text.avx2, "ax", @progbits
> >  ENTRY(_ZGVdN8v_atanhf_avx2)
> > -       pushq   %rbp
> > -       cfi_def_cfa_offset(16)
> > -       movq    %rsp, %rbp
> > -       cfi_def_cfa(6, 16)
> > -       cfi_offset(6, -16)
> > -       andq    $-32, %rsp
> > -       subq    $96, %rsp
> > -
> > +       /* Strip off the sign, so treat X as positive until right at the end */
> > +       vmovaps ATANHF_DATA(SgnMask)(%rip), %ymm2
> > +       vandps  %ymm2, %ymm0, %ymm3
> >         /* Load constants including One = 1 */
> > -       vmovups sOne+__svml_satanh_data_internal(%rip), %ymm5
> > -       vmovups sTopMask12+__svml_satanh_data_internal(%rip), %ymm13
> > -       vmovaps %ymm0, %ymm6
> > +       vmovups ATANHF_DATA(sOne)(%rip), %ymm5
> > +       vsubps  %ymm3, %ymm5, %ymm1
> > +       vmovups ATANHF_DATA(sTopMask12)(%rip), %ymm4
> >
> > -       /* Strip off the sign, so treat X as positive until right at the end */
> > -       vandps  SgnMask+__svml_satanh_data_internal(%rip), %ymm6, %ymm10
> > -       vsubps  %ymm10, %ymm5, %ymm1
> > +       vrcpps  %ymm1, %ymm7
> > +       vsubps  %ymm1, %ymm5, %ymm9
> > +       vandps  %ymm4, %ymm7, %ymm6
> > +       vsubps  %ymm3, %ymm9, %ymm7
> >
> > -       /*
> > -        * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
> > -        * the upper part UHi being <= 12 bits long. Then we have
> > -        * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
> > -        */
> > -       vaddps  %ymm10, %ymm10, %ymm14
> > +       /* No need to split sU when FMA is available */
> > +       vfnmadd213ps %ymm5, %ymm6, %ymm1
> > +       vmovaps %ymm0, %ymm8
> > +       vfmadd213ps %ymm0, %ymm0, %ymm0
> > +       vfnmadd231ps %ymm6, %ymm7, %ymm1
> >
> >         /*
> >          * Check whether |X| < 1, in which case we use the main function.
> >          * Otherwise set the rangemask so that the callout will get used.
> >          * Note that this will also use the callout for NaNs since not(NaN < 1).
> >          */
> > -       vcmpnlt_uqps %ymm5, %ymm10, %ymm7
> > -       vsubps  %ymm1, %ymm5, %ymm9
> > -       vcmplt_oqps TinyRange+__svml_satanh_data_internal(%rip), %ymm10, %ymm4
> > -       vrcpps  %ymm1, %ymm11
> > -       vsubps  %ymm10, %ymm9, %ymm12
> > -       vandps  %ymm13, %ymm11, %ymm0
> > +       vcmpnlt_uqps %ymm5, %ymm3, %ymm14
> > +       vcmplt_oqps ATANHF_DATA(TinyRange)(%rip), %ymm3, %ymm15
> >
> > -       /* No need to split sU when FMA is available */
> > -       vfnmadd213ps %ymm5, %ymm0, %ymm1
> > -       vmovaps %ymm6, %ymm8
> > -       vfmadd213ps %ymm6, %ymm6, %ymm8
> > -       vfnmadd231ps %ymm0, %ymm12, %ymm1
> > +       /*
> > +        * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
> > +        * the upper part UHi being <= 12 bits long. Then we have
> > +        * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
> > +        */
> > +       vaddps  %ymm3, %ymm3, %ymm3
> >
> >         /*
> >          * Split V as well into upper 12 bits and lower part, so that we can get
> >          * a preliminary quotient estimate without rounding error.
> >          */
> > -       vandps  %ymm13, %ymm14, %ymm15
> > -       vmovmskps %ymm7, %edx
> > -       vsubps  %ymm15, %ymm14, %ymm7
> > +       vandps  %ymm4, %ymm3, %ymm4
> > +       vsubps  %ymm4, %ymm3, %ymm7
> >
> >         /* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
> > -       vmulps  %ymm15, %ymm0, %ymm10
> > +       vmulps  %ymm4, %ymm6, %ymm4
> >
> >         /* Compute D = E + E^2 */
> >         vfmadd213ps %ymm1, %ymm1, %ymm1
> >
> > -       /* Record the sign for eventual reincorporation. */
> > -       vandps  sSign+__svml_satanh_data_internal(%rip), %ymm6, %ymm3
> > +       /* Record the sign for eventual reincorporation.  */
> > +       vandnps %ymm8, %ymm2, %ymm3
> >
> >         /* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
> > -       vorps   %ymm3, %ymm8, %ymm2
> > -       vmulps  %ymm7, %ymm0, %ymm8
> > +       vorps   %ymm3, %ymm0, %ymm13
> > +       vmulps  %ymm7, %ymm6, %ymm2
> >
> >         /*
> >          * Compute R * (VHi + VLo) * (1 + E + E^2)
> >          * = R *  (VHi + VLo) * (1 + D)
> >          * = QHi + (QHi * D + QLo + QLo * D)
> >          */
> > -       vmulps  %ymm1, %ymm10, %ymm9
> > -       vfmadd213ps %ymm8, %ymm8, %ymm1
> > -       vaddps  %ymm1, %ymm9, %ymm1
> >
> > -       /* reduction: compute r, n */
> > -       vmovups iBrkValue+__svml_satanh_data_internal(%rip), %ymm9
> > +       /*
> > +        * If less precision is acceptable the `vmulps %ymm1, %ymm4, %ymm6;
> > +        * vaddps %ymm1, %ymm6, %ymm1` can be replaced with
> > +        * `vfmadd231ps %ymm1, %ymm4, %ymm4`.
> > +        */
> > +       vmulps  %ymm1, %ymm4, %ymm6
> > +       vfmadd213ps %ymm2, %ymm2, %ymm1
> > +       vaddps  %ymm1, %ymm6, %ymm1
> >
> >         /*
> >          * Now finally accumulate the high and low parts of the
> >          * argument to log1p, H + L, with a final compensated summation.
> >          */
> > -       vaddps  %ymm1, %ymm10, %ymm12
> > -       vsubps  %ymm12, %ymm10, %ymm11
> > +       vaddps  %ymm1, %ymm4, %ymm2
> > +
> > +       /* reduction: compute r, n */
> > +       vmovups ATANHF_DATA(iBrkValue)(%rip), %ymm9
> >
> >         /*
> >          * Now we feed into the log1p code, using H in place of _VARG1 and
> >          * later incorporating L into the reduced argument.
> >          * compute 1+x as high, low parts
> >          */
> > -       vmaxps  %ymm12, %ymm5, %ymm13
> > -       vminps  %ymm12, %ymm5, %ymm14
> > -       vaddps  %ymm11, %ymm1, %ymm0
> > -       vaddps  %ymm14, %ymm13, %ymm1
> > -       vpsubd  %ymm9, %ymm1, %ymm7
> > -       vsubps  %ymm1, %ymm13, %ymm15
> > -       vpsrad  $23, %ymm7, %ymm10
> > -       vpand   iOffExpoMask+__svml_satanh_data_internal(%rip), %ymm7, %ymm8
> > -       vaddps  %ymm15, %ymm14, %ymm13
> > -       vpslld  $23, %ymm10, %ymm11
> > -       vpaddd  %ymm9, %ymm8, %ymm15
> > -       vaddps  %ymm13, %ymm0, %ymm14
> > -       vcvtdq2ps %ymm10, %ymm0
> > -       vpsubd  %ymm11, %ymm5, %ymm12
> > +       vmaxps  %ymm2, %ymm5, %ymm0
> > +       vminps  %ymm2, %ymm5, %ymm6
> > +
> > +       /* This is needed for rounding (see `vaddps %ymm1, %ymm4, %ymm2`).  */
> > +       vsubps  %ymm2, %ymm4, %ymm2
> > +       vaddps  %ymm6, %ymm0, %ymm4
> > +       vpsubd  %ymm9, %ymm4, %ymm7
> > +       vsubps  %ymm4, %ymm0, %ymm4
> > +       vaddps  %ymm2, %ymm1, %ymm2
> > +       vmovaps ATANHF_DATA(iOffExpoMask)(%rip), %ymm1
> > +
> > +       vandps  %ymm1, %ymm7, %ymm0
> > +       vaddps  %ymm4, %ymm6, %ymm4
> > +       vandnps %ymm7, %ymm1, %ymm6
> > +       vmovups ATANHF_DATA(sPoly+0)(%rip), %ymm1
> > +       vpaddd  %ymm9, %ymm0, %ymm0
> > +       vaddps  %ymm4, %ymm2, %ymm4
> > +       vpsubd  %ymm6, %ymm5, %ymm6
> >
> >         /* polynomial evaluation */
> > -       vsubps  %ymm5, %ymm15, %ymm5
> > -       vmulps  %ymm14, %ymm12, %ymm1
> > -       vaddps  %ymm5, %ymm1, %ymm5
> > -       vmovups sPoly+224+__svml_satanh_data_internal(%rip), %ymm1
> > -       vfmadd213ps sPoly+192+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> > -       vfmadd213ps sPoly+160+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> > -       vfmadd213ps sPoly+128+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> > -       vfmadd213ps sPoly+96+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> > -       vfmadd213ps sPoly+64+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> > -       vfmadd213ps sPoly+32+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> > -       vfmadd213ps sPoly+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> > -       vmulps  %ymm1, %ymm5, %ymm7
> > -       vfmadd213ps %ymm5, %ymm5, %ymm7
> > +       vsubps  %ymm5, %ymm0, %ymm2
> > +       vfmadd231ps %ymm4, %ymm6, %ymm2
> > +       vfmadd213ps ATANHF_DATA(sPoly+32)(%rip), %ymm2, %ymm1
> > +       vfmadd213ps ATANHF_DATA(sPoly+64)(%rip), %ymm2, %ymm1
> > +       vfmadd213ps ATANHF_DATA(sPoly+96)(%rip), %ymm2, %ymm1
> > +       vfmadd213ps ATANHF_DATA(sPoly+128)(%rip), %ymm2, %ymm1
> > +       vfmadd213ps ATANHF_DATA(sPoly+160)(%rip), %ymm2, %ymm1
> > +       vfmadd213ps ATANHF_DATA(sPoly+192)(%rip), %ymm2, %ymm1
> > +       vfmadd213ps ATANHF_DATA(sPoly+224)(%rip), %ymm2, %ymm1
> > +
> > +       vmulps  %ymm1, %ymm2, %ymm1
> > +       vfmadd213ps %ymm2, %ymm2, %ymm1
> >
> >         /* final reconstruction */
> > -       vfmadd132ps sLn2+__svml_satanh_data_internal(%rip), %ymm7, %ymm0
> > +       vpsrad  $23, %ymm7, %ymm6
> > +       vcvtdq2ps %ymm6, %ymm2
> > +       vfmadd132ps ATANHF_DATA(sLn2)(%rip), %ymm1, %ymm2
> >
> >         /* Finally, halve the result and reincorporate the sign */
> > -       vxorps  sHalf+__svml_satanh_data_internal(%rip), %ymm3, %ymm3
> > -       vmulps  %ymm0, %ymm3, %ymm0
> > -       vblendvps %ymm4, %ymm2, %ymm0, %ymm0
> > +       vxorps  ATANHF_DATA(sHalf)(%rip), %ymm3, %ymm3
> > +       vmulps  %ymm2, %ymm3, %ymm2
> > +       vmovmskps %ymm14, %edx
> >         testl   %edx, %edx
> >
> > +       vblendvps %ymm15, %ymm13, %ymm2, %ymm0
> >         /* Go to special inputs processing branch */
> >         jne     L(SPECIAL_VALUES_BRANCH)
> > -       # LOE rbx r12 r13 r14 r15 edx ymm0 ymm6
> > -
> > -       /* Restore registers
> > -        * and exit the function
> > -        */
> > -
> > -L(EXIT):
> > -       movq    %rbp, %rsp
> > -       popq    %rbp
> > -       cfi_def_cfa(7, 8)
> > -       cfi_restore(6)
> > +       # LOE rbx rdx r12 r13 r14 r15 ymm0
> > +       /* No registers to restore on fast path.  */
> >         ret
> > -       cfi_def_cfa(6, 16)
> > -       cfi_offset(6, -16)
> >
> > -       /* Branch to process
> > -        * special inputs
> > -        */
> >
> > +       /* Cold case. edx has 1s where there was a special value that
> > +          needs to be handled by an atanhf call. Optimize for code size
> > +          more so than speed here. */
> >  L(SPECIAL_VALUES_BRANCH):
> > -       vmovups %ymm6, 32(%rsp)
> > -       vmovups %ymm0, 64(%rsp)
> > -       # LOE rbx r12 r13 r14 r15 edx ymm0
> > -
> > -       xorl    %eax, %eax
> > -       # LOE rbx r12 r13 r14 r15 eax edx
> > -
> > -       vzeroupper
> > -       movq    %r12, 16(%rsp)
> > -       /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
> > -       .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
> > -       movl    %eax, %r12d
> > -       movq    %r13, 8(%rsp)
> > -       /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
> > -       .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
> > -       movl    %edx, %r13d
> > -       movq    %r14, (%rsp)
> > -       /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
> > -       .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
> > -       # LOE rbx r15 r12d r13d
> > -
> > -       /* Range mask
> > -        * bits check
> > +       # LOE rbx rdx r12 r13 r14 r15 ymm0 ymm8
> > +    /* Use r13 to save/restore the stack. This allows us to use rbp as
> > +       callee save register saving code size. */
> > +       pushq   %r13
> > +       cfi_adjust_cfa_offset(8)
> > +       cfi_offset(r13, -16)
> > +       /* Need to callee save registers to preserve state across atanhf calls.
> >          */
> > +       pushq   %rbx
> > +       cfi_adjust_cfa_offset(8)
> > +       cfi_offset(rbx, -24)
> > +       pushq   %rbp
> > +       cfi_adjust_cfa_offset(8)
> > +       cfi_offset(rbp, -32)
> > +       movq    %rsp, %r13
> > +       cfi_def_cfa_register(r13)
> >
> > -L(RANGEMASK_CHECK):
> > -       btl     %r12d, %r13d
> > +       /* Align stack and make room for 2x ymm vectors.  */
> > +       andq    $-32, %rsp
> > +       addq    $-64, %rsp
> >
> > -       /* Call scalar math function */
> > -       jc      L(SCALAR_MATH_CALL)
> > -       # LOE rbx r15 r12d r13d
> > +       /* Save all already computed inputs.  */
> > +       vmovups %ymm0, (%rsp)
> > +       /* Save original input (ymm8 unchanged up to this point).  */
> > +       vmovups %ymm8, 32(%rsp)
> >
> > -       /* Special inputs
> > -        * processing loop
> > -        */
> > +       vzeroupper
> >
> > +       /* edx has 1s where there was a special value that needs to be handled
> > +          by an atanhf call.  */
> > +       movl    %edx, %ebx
> >  L(SPECIAL_VALUES_LOOP):
> > -       incl    %r12d
> > -       cmpl    $8, %r12d
> > -
> > -       /* Check bits in range mask */
> > -       jl      L(RANGEMASK_CHECK)
> > -       # LOE rbx r15 r12d r13d
> > -
> > -       movq    16(%rsp), %r12
> > -       cfi_restore(12)
> > -       movq    8(%rsp), %r13
> > -       cfi_restore(13)
> > -       movq    (%rsp), %r14
> > -       cfi_restore(14)
> > -       vmovups 64(%rsp), %ymm0
> > -
> > -       /* Go to exit */
> > -       jmp     L(EXIT)
> > -       /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
> > -       .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
> > -       /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
> > -       .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
> > -       /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
> > -       .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
> > -       # LOE rbx r12 r13 r14 r15 ymm0
> > -
> > -       /* Scalar math fucntion call
> > -        * to process special input
> > -        */
> > -
> > -L(SCALAR_MATH_CALL):
> > -       movl    %r12d, %r14d
> > -       movss   32(%rsp, %r14, 4), %xmm0
> > +       # LOE rbx rbp r12 r13 r14 r15
> > +       /* use rbp as index for special value that is saved across calls to
> > +          atanhf. We technically don't need a callee save register here as offset
> > +          to rsp is always [0, 28] so we can restore rsp by realigning to 64.
> > +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> > +          in the loop. Realigning also costs more code size.  */
> > +       xorl    %ebp, %ebp
> > +       tzcntl  %ebx, %ebp
> > +
> > +       /* Scalar math function call to process special input.  */
> > +       movss   32(%rsp, %rbp, 4), %xmm0
> >         call    atanhf@PLT
> > -       # LOE rbx r14 r15 r12d r13d xmm0
> >
> > -       movss   %xmm0, 64(%rsp, %r14, 4)
> > +       /* No good way to avoid the store-forwarding fault this will cause on
> > +          return. `lfence` avoids the SF fault but at greater cost as it
> > +          serializes stack/callee save restoration.  */
> > +       movss   %xmm0, (%rsp, %rbp, 4)
> > +
> > +       blsrl   %ebx, %ebx
> > +       jnz     L(SPECIAL_VALUES_LOOP)
> > +       # LOE r12 r13 r14 r15
> > +
> >
> > -       /* Process special inputs in loop */
> > -       jmp     L(SPECIAL_VALUES_LOOP)
> > -       # LOE rbx r15 r12d r13d
> > +       /* All results have been written to 32(%rsp).  */
>                                                                 Why 32
> here?  Did you mean 32 bytes at %rsp?


Correct. Fixed in V2.
> > +       vmovups (%rsp), %ymm0
> > +       /* Restore rsp.  */
> > +       movq    %r13, %rsp
> > +       cfi_def_cfa_register(rsp)
> > +       /* Restore callee save registers.  */
> > +       popq    %rbp
> > +       cfi_adjust_cfa_offset(-8)
> > +       cfi_restore(rbp)
> > +       popq    %rbx
> > +       cfi_adjust_cfa_offset(-8)
> > +       cfi_restore(rbp)
> > +       popq    %r13
> > +       cfi_adjust_cfa_offset(-8)
> > +       cfi_restore(r13)
> > +       ret
> >  END(_ZGVdN8v_atanhf_avx2)
> >
> >         .section .rodata, "a"
> >         .align  32
> > -
> >  #ifdef __svml_satanh_data_internal_typedef
> >  typedef unsigned int VUINT32;
> > -typedef struct {
> > +typedef struct{
> >         __declspec(align(32)) VUINT32 SgnMask[8][1];
> >         __declspec(align(32)) VUINT32 sOne[8][1];
> > -       __declspec(align(32)) VUINT32 sPoly[8][8][1];
> > -       __declspec(align(32)) VUINT32 iBrkValue[8][1];
> > -       __declspec(align(32)) VUINT32 iOffExpoMask[8][1];
> > -       __declspec(align(32)) VUINT32 sHalf[8][1];
> > -       __declspec(align(32)) VUINT32 sSign[8][1];
> >         __declspec(align(32)) VUINT32 sTopMask12[8][1];
> >         __declspec(align(32)) VUINT32 TinyRange[8][1];
> > +       __declspec(align(32)) VUINT32 iBrkValue[8][1];
> > +       __declspec(align(32)) VUINT32 iOffExpoMask[8][1];
> > +       __declspec(align(32)) VUINT32 sPoly[8][8][1];
> >         __declspec(align(32)) VUINT32 sLn2[8][1];
> > +       __declspec(align(32)) VUINT32 sHalf[8][1];
> >  } __svml_satanh_data_internal;
> >  #endif
> >  __svml_satanh_data_internal:
> >         /* SgnMask */
> > -       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> > +       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> > +       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> >         /* sOne = SP 1.0 */
> >         .align  32
> > -       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> > -       /* sPoly[] = SP polynomial */
> > +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> > +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> > +       /* sTopMask12 */
> > +       .align  32
> > +       .long   0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
> > +       .long   0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
> > +       /* TinyRange */
> >         .align  32
> > -       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
> > -       .long   0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
> > -       .long   0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
> > -       .long   0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
> > -       .long   0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
> > -       .long   0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
> > -       .long   0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
> > -       .long   0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
> > +       .long   0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
> > +       .long   0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
> >         /* iBrkValue = SP 2/3 */
> >         .align  32
> > -       .long   0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
> > +       .long   0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
> > +       .long   0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
> >         /* iOffExpoMask = SP significand mask */
> >         .align  32
> > -       .long   0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
> > -       /* sHalf */
> > -       .align  32
> > -       .long   0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
> > -       /* sSign */
> > -       .align  32
> > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
> > -       /* sTopMask12 */
> > -       .align  32
> > -       .long   0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
> > -       /* TinyRange */
> > +       .long   0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
> > +       .long   0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
> > +       /* sPoly[] = SP polynomial */
> >         .align  32
> > -       .long   0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
> > +       .long   0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed
> > +       .long   0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
> > +       .long   0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3
> > +       .long   0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
> > +       .long   0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12
> > +       .long   0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
> > +       .long   0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37
> > +       .long   0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
> > +       .long   0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190
> > +       .long   0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
> > +       .long   0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e
> > +       .long   0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
> > +       .long   0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94
> > +       .long   0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
> > +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> > +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
> >         /* sLn2 = SP ln(2) */
> >         .align  32
> > -       .long   0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
> > +       .long   0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
> > +       .long   0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
> > +       /* sHalf */
> > +       .align  32
> > +       .long   0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
> > +       .long   0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
> >         .align  32
> >         .type   __svml_satanh_data_internal, @object
> >         .size   __svml_satanh_data_internal, .-__svml_satanh_data_internal
> > --
> > 2.34.1
> >
>
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v2 3/7] x86: Improve svml_s_atanhf4_core_sse4.S
  2022-06-09 16:03     ` H.J. Lu
@ 2022-06-09 16:56       ` Noah Goldstein
  0 siblings, 0 replies; 48+ messages in thread
From: Noah Goldstein @ 2022-06-09 16:56 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Thu, Jun 9, 2022 at 9:03 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, Jun 8, 2022 at 5:05 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Improvements are:
> >     1. Reduce code size (-62 bytes).
> >     2. Remove redundant move instructions.
> >     3. Slightly improve instruction selection/scheduling where
> >        possible.
> >     4. Prefer registers which get short instruction encoding.
> >     5. Reduce rodata usage (-16 bytes).
> >
> > The throughput improvement is not significant as the port 0 bottleneck
> > is unavoidable.
> >
> >        Function, New Time, Old Time, New / Old
> > _ZGVbN4v_atanhf,    8.821,    8.903,     0.991
> > ---
> >  .../fpu/multiarch/svml_s_atanhf4_core_sse4.S  | 378 ++++++++----------
> >  1 file changed, 169 insertions(+), 209 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
> > index 2d3ad2617f..e6683785fb 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
> > @@ -30,96 +30,80 @@
> >   *
> >   */
> >
> > -/* Offsets for data table __svml_satanh_data_internal
> > - */
> > -#define SgnMask                                0
> > -#define sOne                           16
> > -#define sPoly                          32
> > -#define iBrkValue                      160
> > -#define iOffExpoMask                   176
> > -#define sHalf                          192
> > -#define sSign                          208
> > -#define sTopMask12                     224
> > -#define TinyRange                      240
> > -#define sLn2                           256
> > +/* Offsets for data table __svml_satanh_data_internal. Ordered by use
> > +   in the function. On cold-starts this might help the prefetcher.
> > +   Possibly a better idea is to interleave start/end so that the
> > +   prefetcher is less likely to detect a stream and pull irrelevant
> > +   lines into cache.  */
> > +#define sOne                           0
> > +#define SgnMask                                16
> > +#define sTopMask12                     32
> > +#define iBrkValue                      48
> > +#define iOffExpoMask                   64
> > +#define sPoly                          80
> > +#define sLn2                           208
> > +#define TinyRange                      224
> >
> >  #include <sysdep.h>
> > +#define ATANHF_DATA(x)                 ((x)+__svml_satanh_data_internal)
> >
> >         .section .text.sse4, "ax", @progbits
> >  ENTRY(_ZGVbN4v_atanhf_sse4)
> > -       subq    $72, %rsp
> > -       cfi_def_cfa_offset(80)
> >         movaps  %xmm0, %xmm5
> >
> >         /* Load constants including One = 1 */
> > -       movups  sOne+__svml_satanh_data_internal(%rip), %xmm4
> > +       movups  ATANHF_DATA(sOne)(%rip), %xmm4
> >         movaps  %xmm5, %xmm3
> >
> >         /* Strip off the sign, so treat X as positive until right at the end */
> > -       movups  SgnMask+__svml_satanh_data_internal(%rip), %xmm7
> > -       movaps  %xmm4, %xmm8
> > -       andps   %xmm5, %xmm7
> > +       movups  ATANHF_DATA(SgnMask)(%rip), %xmm1
> > +       movaps  %xmm4, %xmm2
> > +       andps   %xmm1, %xmm0
> >         movaps  %xmm4, %xmm10
> > -       movups  sTopMask12+__svml_satanh_data_internal(%rip), %xmm11
> > +       movups  ATANHF_DATA(sTopMask12)(%rip), %xmm11
> >         movaps  %xmm4, %xmm14
> >         movaps  %xmm11, %xmm9
> >
> > +
> >         /*
> >          * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
> >          * the upper part UHi being <= 12 bits long. Then we have
> >          * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
> >          */
> > -       movaps  %xmm7, %xmm12
> > +       movaps  %xmm0, %xmm6
> > +       mulps   %xmm5, %xmm3
> > +       subps   %xmm0, %xmm2
> > +       addps   %xmm0, %xmm6
> > +       subps   %xmm2, %xmm10
> > +       addps   %xmm5, %xmm3
> > +       subps   %xmm0, %xmm10
> > +       andps   %xmm2, %xmm9
> > +
> >
> >         /*
> >          * Check whether |X| < 1, in which case we use the main function.
> >          * Otherwise set the rangemask so that the callout will get used.
> >          * Note that this will also use the callout for NaNs since not(NaN < 1).
> >          */
> > -       movaps  %xmm7, %xmm6
> > -       movaps  %xmm7, %xmm2
> > -       cmpnltps %xmm4, %xmm6
> > -       cmpltps TinyRange+__svml_satanh_data_internal(%rip), %xmm2
> > -       mulps   %xmm5, %xmm3
> > -       subps   %xmm7, %xmm8
> > -       addps   %xmm7, %xmm12
> > -       movmskps %xmm6, %edx
> > -       subps   %xmm8, %xmm10
> > -       addps   %xmm5, %xmm3
> > -       subps   %xmm7, %xmm10
> > -       andps   %xmm8, %xmm9
> > +       rcpps   %xmm9, %xmm7
> > +       subps   %xmm9, %xmm2
> > +       andps   %xmm11, %xmm7
> >
> > -       /*
> > -        * Now we feed into the log1p code, using H in place of _VARG1 and
> > -        * later incorporating L into the reduced argument.
> > -        * compute 1+x as high, low parts
> > -        */
> > -       movaps  %xmm4, %xmm7
> > -
> > -       /*
> > -        * Now compute R = 1/(UHi+ULo) * (1 - E) and the error term E
> > -        * The first FMR is exact (we force R to 12 bits just in case it
> > -        * isn't already, to make absolutely sure), and since E is ~ 2^-12,
> > -        * the rounding error in the other one is acceptable.
> > -        */
> > -       rcpps   %xmm9, %xmm15
> > -       subps   %xmm9, %xmm8
> > -       andps   %xmm11, %xmm15
> >
> >         /*
> >          * Split V as well into upper 12 bits and lower part, so that we can get
> >          * a preliminary quotient estimate without rounding error.
> >          */
> > -       andps   %xmm12, %xmm11
> > -       mulps   %xmm15, %xmm9
> > -       addps   %xmm8, %xmm10
> > -       subps   %xmm11, %xmm12
> > +       andps   %xmm6, %xmm11
> > +       mulps   %xmm7, %xmm9
> > +       addps   %xmm2, %xmm10
> > +       subps   %xmm11, %xmm6
> >
> >         /* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
> > -       mulps   %xmm15, %xmm11
> > -       mulps   %xmm15, %xmm10
> > +       mulps   %xmm7, %xmm11
> > +       mulps   %xmm7, %xmm10
> >         subps   %xmm9, %xmm14
> > -       mulps   %xmm12, %xmm15
> > +       mulps   %xmm6, %xmm7
> >         subps   %xmm10, %xmm14
> >
> >         /* Compute D = E + E^2 */
> > @@ -127,8 +111,8 @@ ENTRY(_ZGVbN4v_atanhf_sse4)
> >         movaps  %xmm4, %xmm8
> >         mulps   %xmm14, %xmm13
> >
> > -       /* reduction: compute r, n */
> > -       movdqu  iBrkValue+__svml_satanh_data_internal(%rip), %xmm9
> > +       /* reduction: compute r,n */
> > +       movdqu  ATANHF_DATA(iBrkValue)(%rip), %xmm9
> >         addps   %xmm13, %xmm14
> >
> >         /*
> > @@ -136,168 +120,149 @@ ENTRY(_ZGVbN4v_atanhf_sse4)
> >          * = R *  (VHi + VLo) * (1 + D)
> >          * = QHi + (QHi * D + QLo + QLo * D)
> >          */
> > -       movaps  %xmm14, %xmm0
> > -       mulps   %xmm15, %xmm14
> > -       mulps   %xmm11, %xmm0
> > -       addps   %xmm14, %xmm15
> > -       movdqu  iOffExpoMask+__svml_satanh_data_internal(%rip), %xmm12
> > +       movaps  %xmm14, %xmm2
> > +       mulps   %xmm7, %xmm14
> > +       mulps   %xmm11, %xmm2
> > +       addps   %xmm14, %xmm7
> > +       movdqu  ATANHF_DATA(iOffExpoMask)(%rip), %xmm12
> >         movaps  %xmm4, %xmm14
> >
> >         /* Record the sign for eventual reincorporation. */
> > -       movups  sSign+__svml_satanh_data_internal(%rip), %xmm1
> > -       addps   %xmm15, %xmm0
> > +       addps   %xmm7, %xmm2
> > +
> >
> >         /*
> >          * Now finally accumulate the high and low parts of the
> >          * argument to log1p, H + L, with a final compensated summation.
> >          */
> > -       movaps  %xmm0, %xmm6
> > -       andps   %xmm5, %xmm1
> > -
> > +       movaps  %xmm2, %xmm6
> > +       andnps  %xmm5, %xmm1
> > +       movaps  %xmm4, %xmm7
> >         /* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
> > -       orps    %xmm1, %xmm3
> >         addps   %xmm11, %xmm6
> >         maxps   %xmm6, %xmm7
> >         minps   %xmm6, %xmm8
> >         subps   %xmm6, %xmm11
> >         movaps  %xmm7, %xmm10
> > -       andps   %xmm2, %xmm3
> >         addps   %xmm8, %xmm10
> > -       addps   %xmm11, %xmm0
> > +       addps   %xmm11, %xmm2
> >         subps   %xmm10, %xmm7
> >         psubd   %xmm9, %xmm10
> > -       addps   %xmm7, %xmm8
> > +       addps   %xmm8, %xmm7
> >         pand    %xmm10, %xmm12
> >         psrad   $23, %xmm10
> >         cvtdq2ps %xmm10, %xmm13
> > -       addps   %xmm8, %xmm0
> > +       addps   %xmm7, %xmm2
> >
> >         /* final reconstruction */
> > -       mulps   sLn2+__svml_satanh_data_internal(%rip), %xmm13
> >         pslld   $23, %xmm10
> >         paddd   %xmm9, %xmm12
> >         psubd   %xmm10, %xmm14
> >
> >         /* polynomial evaluation */
> >         subps   %xmm4, %xmm12
> > -       mulps   %xmm0, %xmm14
> > -       movups  sPoly+112+__svml_satanh_data_internal(%rip), %xmm0
> > -       addps   %xmm12, %xmm14
> > -       mulps   %xmm14, %xmm0
> > +       mulps   %xmm14, %xmm2
> > +       movups  ATANHF_DATA(sPoly+0)(%rip), %xmm7
> > +       addps   %xmm12, %xmm2
> > +       mulps   %xmm2, %xmm7
> > +
> >
> >         /* Finally, halve the result and reincorporate the sign */
> > -       movups  sHalf+__svml_satanh_data_internal(%rip), %xmm4
> > -       pxor    %xmm1, %xmm4
> > -       addps   sPoly+96+__svml_satanh_data_internal(%rip), %xmm0
> > -       mulps   %xmm14, %xmm0
> > -       addps   sPoly+80+__svml_satanh_data_internal(%rip), %xmm0
> > -       mulps   %xmm14, %xmm0
> > -       addps   sPoly+64+__svml_satanh_data_internal(%rip), %xmm0
> > -       mulps   %xmm14, %xmm0
> > -       addps   sPoly+48+__svml_satanh_data_internal(%rip), %xmm0
> > -       mulps   %xmm14, %xmm0
> > -       addps   sPoly+32+__svml_satanh_data_internal(%rip), %xmm0
> > -       mulps   %xmm14, %xmm0
> > -       addps   sPoly+16+__svml_satanh_data_internal(%rip), %xmm0
> > -       mulps   %xmm14, %xmm0
> > -       addps   sPoly+__svml_satanh_data_internal(%rip), %xmm0
> > -       mulps   %xmm14, %xmm0
> > -       mulps   %xmm14, %xmm0
> > -       addps   %xmm0, %xmm14
> > -       movaps  %xmm2, %xmm0
> > -       addps   %xmm13, %xmm14
> > -       mulps   %xmm14, %xmm4
> > -       andnps  %xmm4, %xmm0
> > -       orps    %xmm3, %xmm0
> > -       testl   %edx, %edx
> > +       addps   ATANHF_DATA(sPoly+16)(%rip), %xmm7
> > +       mulps   %xmm2, %xmm7
> > +       addps   ATANHF_DATA(sPoly+32)(%rip), %xmm7
> > +       mulps   %xmm2, %xmm7
> > +       addps   ATANHF_DATA(sPoly+48)(%rip), %xmm7
> > +       mulps   %xmm2, %xmm7
> > +       addps   ATANHF_DATA(sPoly+64)(%rip), %xmm7
> > +       mulps   %xmm2, %xmm7
> > +       addps   ATANHF_DATA(sPoly+80)(%rip), %xmm7
> > +       mulps   %xmm2, %xmm7
> > +       addps   ATANHF_DATA(sPoly+96)(%rip), %xmm7
> > +       mulps   %xmm2, %xmm7
> > +       movaps  ATANHF_DATA(sPoly+112)(%rip), %xmm6
> > +       addps   %xmm6, %xmm7
> > +       mulps   %xmm2, %xmm7
> > +       mulps   %xmm2, %xmm7
> > +       mulps   ATANHF_DATA(sLn2)(%rip), %xmm13
> > +       /* We can build `sHalf` with `sPoly & sOne`.  */
> > +       andps   %xmm4, %xmm6
> > +       orps    %xmm1, %xmm3
> > +       xorps   %xmm6, %xmm1
> >
> > -       /* Go to special inputs processing branch */
> > -       jne     L(SPECIAL_VALUES_BRANCH)
> > -       # LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm5
> > +       addps   %xmm2, %xmm7
> > +       addps   %xmm13, %xmm7
> > +       mulps   %xmm7, %xmm1
> >
> > -       /* Restore registers
> > -        * and exit the function
> > -        */
> > +       /* Finish check of NaNs.  */
> > +       cmpleps %xmm0, %xmm4
> > +       movmskps %xmm4, %edx
> > +       cmpltps ATANHF_DATA(TinyRange)(%rip), %xmm0
> >
> > -L(EXIT):
> > -       addq    $72, %rsp
> > -       cfi_def_cfa_offset(8)
> > +       andps   %xmm0, %xmm3
> > +       andnps  %xmm1, %xmm0
> > +       orps    %xmm3, %xmm0
> > +
> > +       testl   %edx, %edx
> > +       /* Go to special inputs processing branch.  */
> > +       jne     L(SPECIAL_VALUES_BRANCH)
> > +       # LOE rbx rbp r12 r13 r14 r15 xmm0
> > +       /* No registers to restore on fast path.  */
> >         ret
> > -       cfi_def_cfa_offset(80)
> >
> > -       /* Branch to process
> > -        * special inputs
> > -        */
> >
> > +       /* Cold case. edx has 1s where there was a special value that
> > +          needs to be handled by an atanhf call. Optimize for code size
> > +          more so than speed here. */
> >  L(SPECIAL_VALUES_BRANCH):
> > -       movups  %xmm5, 32(%rsp)
> > -       movups  %xmm0, 48(%rsp)
> > -       # LOE rbx rbp r12 r13 r14 r15 edx
> > -
> > -       xorl    %eax, %eax
> > -       movq    %r12, 16(%rsp)
> > -       cfi_offset(12, -64)
> > -       movl    %eax, %r12d
> > -       movq    %r13, 8(%rsp)
> > -       cfi_offset(13, -72)
> > -       movl    %edx, %r13d
> > -       movq    %r14, (%rsp)
> > -       cfi_offset(14, -80)
> > -       # LOE rbx rbp r15 r12d r13d
> > -
> > -       /* Range mask
> > -        * bits check
> > -        */
> > -
> > -L(RANGEMASK_CHECK):
> > -       btl     %r12d, %r13d
> > -
> > -       /* Call scalar math function */
> > -       jc      L(SCALAR_MATH_CALL)
> > -       # LOE rbx rbp r15 r12d r13d
> > -
> > -       /* Special inputs
> > -        * processing loop
> > -        */
> > -
> > +       # LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm5
> > +       /* Stack coming in 16-byte aligned. Set 8-byte misaligned so on
> > +       call entry will be 16-byte aligned. */
> > +       subq    $56, %rsp
> > +       cfi_def_cfa_offset(64)
> > +       movups  %xmm0, 24(%rsp)
> > +       movups  %xmm5, 40(%rsp)
> > +
> > +       /* Use rbx/rbp for callee save registers as they get short
> > +       encoding for many instructions (as compared with r12/r13). */
> > +       movq    %rbx, (%rsp)
> > +       cfi_offset(rbx, -64)
> > +       movq    %rbp, 8(%rsp)
> > +       cfi_offset(rbp, -56)
> > +       /* edx has 1s where there was a special value that needs to be handled
> > +          by an atanhf call.  */
> > +       movl    %edx, %ebx
> >  L(SPECIAL_VALUES_LOOP):
> > -       incl    %r12d
> > -       cmpl    $4, %r12d
> > -
> > -       /* Check bits in range mask */
> > -       jl      L(RANGEMASK_CHECK)
> > -       # LOE rbx rbp r15 r12d r13d
> > -
> > -       movq    16(%rsp), %r12
> > -       cfi_restore(12)
> > -       movq    8(%rsp), %r13
> > -       cfi_restore(13)
> > -       movq    (%rsp), %r14
> > -       cfi_restore(14)
> > -       movups  48(%rsp), %xmm0
> > -
> > -       /* Go to exit */
> > -       jmp     L(EXIT)
> > -       cfi_offset(12, -64)
> > -       cfi_offset(13, -72)
> > -       cfi_offset(14, -80)
> > -       # LOE rbx rbp r12 r13 r14 r15 xmm0
> > -
> > -       /* Scalar math fucntion call
> > -        * to process special input
> > -        */
> > -
> > -L(SCALAR_MATH_CALL):
> > -       movl    %r12d, %r14d
> > -       movss   32(%rsp, %r14, 4), %xmm0
> > +       # LOE rbx rbp r12 r13 r14 r15
> > +       /* Use rbp as index for special value that is saved across calls to
> > +          atanhf. We technically don't need a callee save register here as offset
> > +          to rsp is always [0, 12] so we can restore rsp by realigning to 64.
> > +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> > +          in the loop.  */
> > +       xorl    %ebp, %ebp
> > +       bsfl    %ebx, %ebp
> > +
> > +       /* Scalar math function call to process special input.  */
> > +       movss   40(%rsp, %rbp, 4), %xmm0
> >         call    atanhf@PLT
> > -       # LOE rbx rbp r14 r15 r12d r13d xmm0
> > -
> > -       movss   %xmm0, 48(%rsp, %r14, 4)
> > -
> > -       /* Process special inputs in loop */
> > -       jmp     L(SPECIAL_VALUES_LOOP)
> > -       # LOE rbx rbp r15 r12d r13d
> > +       /* No good way to avoid the store-forwarding fault this will cause on
> > +          return. `lfence` avoids the SF fault but at greater cost as it
> > +          serializes stack/callee save restoration.  */
> > +       movss   %xmm0, 24(%rsp, %rbp, 4)
> > +
> > +       leal    -1(%rbx), %eax
> > +       andl    %eax, %ebx
> > +       jnz     L(SPECIAL_VALUES_LOOP)
> > +       # LOE r12 r13 r14 r15
> > +       /* All results have been written to 16(%rsp).  */
>
> Where does 16 come from?

Incorrect from prior version. Fixed in V2.
> > +       movups  24(%rsp), %xmm0
> > +       movq    (%rsp), %rbx
> > +       cfi_restore(rbx)
> > +       movq    8(%rsp), %rbp
> > +       cfi_restore(rbp)
> > +       addq    $56, %rsp
> > +       cfi_def_cfa_offset(8)
> > +       ret
> >  END(_ZGVbN4v_atanhf_sse4)
> >
> >         .section .rodata, "a"
> > @@ -305,56 +270,51 @@ END(_ZGVbN4v_atanhf_sse4)
> >
> >  #ifdef __svml_satanh_data_internal_typedef
> >  typedef unsigned int VUINT32;
> > -typedef struct {
> > -       __declspec(align(16)) VUINT32 SgnMask[4][1];
> > +typedef struct{
> >         __declspec(align(16)) VUINT32 sOne[4][1];
> > -       __declspec(align(16)) VUINT32 sPoly[8][4][1];
> > +       __declspec(align(16)) VUINT32 SgnMask[4][1];
> > +       __declspec(align(16)) VUINT32 sTopMask12[4][1];
> >         __declspec(align(16)) VUINT32 iBrkValue[4][1];
> >         __declspec(align(16)) VUINT32 iOffExpoMask[4][1];
> > -       __declspec(align(16)) VUINT32 sHalf[4][1];
> > -       __declspec(align(16)) VUINT32 sSign[4][1];
> > -       __declspec(align(16)) VUINT32 sTopMask12[4][1];
> > -       __declspec(align(16)) VUINT32 TinyRange[4][1];
> > +       __declspec(align(16)) VUINT32 sPoly[8][4][1];
> >         __declspec(align(16)) VUINT32 sLn2[4][1];
> > +       __declspec(align(16)) VUINT32 TinyRange[4][1];
> >  } __svml_satanh_data_internal;
> >  #endif
> > +
> >  __svml_satanh_data_internal:
> > -       /* SgnMask */
> > -       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> >         /* sOne = SP 1.0 */
> >         .align  16
> >         .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> > -       /* sPoly[] = SP polynomial */
> > +       /* SgnMask */
> > +       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> > +       /* sTopMask12 */
> >         .align  16
> > -       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
> > -       .long   0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
> > -       .long   0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
> > -       .long   0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
> > -       .long   0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
> > -       .long   0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
> > -       .long   0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
> > -       .long   0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
> > +       .long   0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
> >         /* iBrkValue = SP 2/3 */
> >         .align  16
> >         .long   0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
> > -       /* iOffExpoMask = SP significand mask */
> > +       /* iOffExpoMask = SP significand mask */
> >         .align  16
> >         .long   0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
> > -       /* sHalf */
> > -       .align  16
> > -       .long   0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
> > -       /* sSign */
> > +
> > +       /* sPoly[] = SP polynomial */
> >         .align  16
> > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > -       /* sTopMask12 */
> > +       .long   0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
> > +       .long   0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
> > +       .long   0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
> > +       .long   0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
> > +       .long   0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
> > +       .long   0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
> > +       .long   0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
> > +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
> > +
> > +       /* sLn2 = SP ln(2) */
> >         .align  16
> > -       .long   0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
> > +       .long   0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
> >         /* TinyRange */
> >         .align  16
> >         .long   0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
> > -       /* sLn2 = SP ln(2) */
> > -       .align  16
> > -       .long   0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
> >         .align  16
> >         .type   __svml_satanh_data_internal, @object
> >         .size   __svml_satanh_data_internal, .-__svml_satanh_data_internal
> > --
> > 2.34.1
> >
>
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v2 4/7] x86: Optimize svml_s_tanhf16_core_avx512.S
  2022-06-09 16:04     ` H.J. Lu
@ 2022-06-09 16:57       ` Noah Goldstein
  0 siblings, 0 replies; 48+ messages in thread
From: Noah Goldstein @ 2022-06-09 16:57 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Thu, Jun 9, 2022 at 9:05 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, Jun 8, 2022 at 5:05 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Optimizations are:
> >     1. Reduce code size (-67 bytes).
> >     2. Remove redundant move instructions.
> >     3. Slightly improve instruction selection/scheduling where
> >        possible.
> >     4. Reduce rodata usage (-448 bytes).
> >
> > Result is roughly a 14% speedup:
> >
> >        Function, New Time, Old Time, New / Old
> > _ZGVeN16v_tanhf,    0.649,    0.752,     0.863
> > ---
> >  .../multiarch/svml_s_tanhf16_core_avx512.S    | 527 ++++++++++--------
> >  1 file changed, 287 insertions(+), 240 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > index 5b1f9f151c..d55798767c 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > @@ -70,310 +70,357 @@
> >   *
> >   */
> >
> > -/* Offsets for data table __svml_stanh_data_internal
> > +/* Offsets for data table __svml_stanh_data_internal_avx512. Ordered
> > +   by use in the function. On cold-starts this might help the
> > +   prefetcher. Possibly a better idea is to interleave start/end so
> > +   that the prefetcher is less likely to detect a stream and pull
> > +   irrelevant lines into cache.  */
> > +
> > +/* Offsets for data table __svml_stanh_data_internal. 4 bytes each.
> >   */
> > -#define _sC                            0
> > -#define _sP0                           128
> > -#define _sP2                           256
> > -#define _sP3                           384
> > -#define _sP4                           512
> > -#define _sP5                           640
> > -#define _sP6                           768
> > -#define _sP7                           896
> > -#define _iExpMantMask_UISA             1024
> > -#define _iMinIdxOfsMask_UISA           1088
> > -#define _iMaxIdxMask_UISA              1152
> > -#define _sSignMask                     1216
> > -#define _sAbsMask                      1280
> > -#define _iExpMantMask                  1344
> > -#define _iExpMask                      1408
> > -#define _iMinIdxOfsMask                        1472
> > -#define _iMaxIdxMask                   1536
> > +#define _iExpMantMask_UISA             0
> > +#define _iMinIdxOfsMask_UISA           4
> > +#define _iMaxIdxMask_UISA              8
> > +#define _iExpMask                      12
> > +
> > +/* Offsets for data table __svml_stanh_data_internal_al64. 64 bytes
> > +   each.  */
> > +#define _sC_lo                         0
> > +#define _sC_hi                         64
> > +#define _sP7_lo                                128
> > +#define _sP7_hi                                192
> > +#define _sSignMask                     256
> > +#define _sP6_lo                                320
> > +#define _sP6_hi                                384
> > +#define _sP5_lo                                448
> > +#define _sP5_hi                                512
> > +#define _sP4_lo                                576
> > +#define _sP4_hi                                640
> > +#define _sP3_lo                                704
> > +#define _sP3_hi                                768
> > +#define _sP2_lo                                832
> > +#define _sP2_hi                                896
> > +#define _sP0_lo                                960
> > +#define _sP0_hi                                1024
> >
> >  #include <sysdep.h>
> > +#define TANHF_DATA(x)                  ((x)+__svml_stanh_data_internal_al64)
> > +#define TANHF_DATA_UNALIGNED(x)                ((x)+__svml_stanh_data_internal)
> >
> >         .section .text.exex512, "ax", @progbits
> >  ENTRY(_ZGVeN16v_tanhf_skx)
> > -       pushq   %rbp
> > -       cfi_def_cfa_offset(16)
> > -       movq    %rsp, %rbp
> > -       cfi_def_cfa(6, 16)
> > -       cfi_offset(6, -16)
> > -       andq    $-64, %rsp
> > -       subq    $192, %rsp
> > -       vmovaps %zmm0, %zmm1
> > -       vmovups __svml_stanh_data_internal(%rip), %zmm9
> > -       vmovups _sP6+__svml_stanh_data_internal(%rip), %zmm11
> > -       vmovups _sP5+__svml_stanh_data_internal(%rip), %zmm12
> > -       vmovups _sP4+__svml_stanh_data_internal(%rip), %zmm13
> > -       vmovups _sP3+__svml_stanh_data_internal(%rip), %zmm14
> > -       vmovups _sP2+__svml_stanh_data_internal(%rip), %zmm15
> > -       vpternlogd $255, %zmm2, %zmm2, %zmm2
> > -       vandps  _sAbsMask+__svml_stanh_data_internal(%rip), %zmm1, %zmm8
> > -       vandps  _sSignMask+__svml_stanh_data_internal(%rip), %zmm1, %zmm0
> > -
> >         /* Here huge arguments, INF and NaNs are filtered out to callout. */
> > -       vpandd  _iExpMantMask_UISA+__svml_stanh_data_internal(%rip), %zmm1, %zmm3
> > -       vpsubd  _iMinIdxOfsMask_UISA+__svml_stanh_data_internal(%rip), %zmm3, %zmm4
> > -       vpcmpd  $2, _iExpMask+__svml_stanh_data_internal(%rip), %zmm3, %k1
> > +       vpandd  TANHF_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
> > +       vpsubd  TANHF_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
> >
> > -       /*
> > -        *  small table specific variables *
> > -        *  Constant loading
> > -        */
> > -       vpxord  %zmm5, %zmm5, %zmm5
> > -
> > -       /* if VMIN, VMAX is defined for I type */
> > -       vpmaxsd %zmm5, %zmm4, %zmm6
> > -       vpminsd _iMaxIdxMask_UISA+__svml_stanh_data_internal(%rip), %zmm6, %zmm7
> > -       vpsrld  $21, %zmm7, %zmm10
> > -       vmovups _sP7+__svml_stanh_data_internal(%rip), %zmm4
> > -       vpermt2ps _sC+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm9
> > -       vpermt2ps _sP6+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm11
> > -       vpermt2ps _sP7+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm4
> > -       vpermt2ps _sP5+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm12
> > -       vpermt2ps _sP4+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm13
> > -       vpermt2ps _sP3+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm14
> > -       vpermt2ps _sP2+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm15
> > -       vpandnd %zmm3, %zmm3, %zmm2{%k1}
> > -       vptestmd %zmm2, %zmm2, %k0
> > -       vmovups _sP0+__svml_stanh_data_internal(%rip), %zmm3
> > -       vsubps  {rn-sae}, %zmm9, %zmm8, %zmm2
> > -       kmovw   %k0, %edx
> > -       vfmadd213ps {rn-sae}, %zmm11, %zmm2, %zmm4
> > -       vpermt2ps _sP0+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm3
> > -       vfmadd213ps {rn-sae}, %zmm12, %zmm2, %zmm4
> > -       vfmadd213ps {rn-sae}, %zmm13, %zmm2, %zmm4
> > -       vfmadd213ps {rn-sae}, %zmm14, %zmm2, %zmm4
> > -       vfmadd213ps {rn-sae}, %zmm15, %zmm2, %zmm4
> > -       vfmadd213ps {rn-sae}, %zmm3, %zmm2, %zmm4
> > -       vorps   %zmm0, %zmm4, %zmm0
> > -       testl   %edx, %edx
> > +       /* Selection arguments between [0, 0x03e00000] into zmm3.  */
> > +       vpxord  %zmm3, %zmm3, %zmm3
> > +       vpmaxsd %zmm3, %zmm2, %zmm3
> > +       vpminsd TANHF_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
> >
> > -       /* Go to special inputs processing branch */
> > -       jne     L(SPECIAL_VALUES_BRANCH)
> > -       # LOE rbx r12 r13 r14 r15 edx zmm0 zmm1
> > +       /* Setup permute indices in zmm3.  */
> > +       vpsrld  $21, %zmm3, %zmm3
> >
> > -       /* Restore registers
> > -        * and exit the function
> > -        */
> > +       /* Store if there are any special cases in k1.  */
> > +       vpcmpd  $6, TANHF_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
> >
> > -L(EXIT):
> > -       movq    %rbp, %rsp
> > -       popq    %rbp
> > -       cfi_def_cfa(7, 8)
> > -       cfi_restore(6)
> > -       ret
> > -       cfi_def_cfa(6, 16)
> > -       cfi_offset(6, -16)
> > +       vmovaps TANHF_DATA(_sC_lo)(%rip), %zmm5
> > +       vpermt2ps TANHF_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> >
> > -       /* Branch to process
> > -        * special inputs
> > -        */
> > +       vmovaps TANHF_DATA(_sP7_lo)(%rip), %zmm2
> > +       vpermt2ps TANHF_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
> >
> > -L(SPECIAL_VALUES_BRANCH):
> > -       vmovups %zmm1, 64(%rsp)
> > -       vmovups %zmm0, 128(%rsp)
> > -       # LOE rbx r12 r13 r14 r15 edx zmm0
> > +       /* Store absolute values of inputs in zmm1.  */
> > +       vmovaps TANHF_DATA(_sSignMask)(%rip), %zmm4
> > +       vandnps %zmm0, %zmm4, %zmm1
> > +       vsubps  {rn-sae}, %zmm5, %zmm1, %zmm1
> >
> > -       xorl    %eax, %eax
> > -       # LOE rbx r12 r13 r14 r15 eax edx
> > +       vmovaps TANHF_DATA(_sP6_lo)(%rip), %zmm5
> > +       vpermt2ps TANHF_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> >
> > -       vzeroupper
> > -       movq    %r12, 16(%rsp)
> > -       /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> > -       .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> > -       movl    %eax, %r12d
> > -       movq    %r13, 8(%rsp)
> > -       /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> > -       .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> > -       movl    %edx, %r13d
> > -       movq    %r14, (%rsp)
> > -       /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> > -       .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> > -       # LOE rbx r15 r12d r13d
> > -
> > -       /* Range mask
> > -        * bits check
> > -        */
> > +       vmovaps TANHF_DATA(_sP5_lo)(%rip), %zmm6
> > +       vpermt2ps TANHF_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> > +
> > +       vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm2
> > +       vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm2
> >
> > -L(RANGEMASK_CHECK):
> > -       btl     %r12d, %r13d
> > +       vmovaps TANHF_DATA(_sP4_lo)(%rip), %zmm7
> > +       vpermt2ps TANHF_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> >
> > -       /* Call scalar math function */
> > -       jc      L(SCALAR_MATH_CALL)
> > -       # LOE rbx r15 r12d r13d
> > +       vmovaps TANHF_DATA(_sP3_lo)(%rip), %zmm8
> > +       vpermt2ps TANHF_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> >
> > -       /* Special inputs
> > -        * processing loop
> > +       vfmadd213ps {rn-sae}, %zmm7, %zmm1, %zmm2
> > +       vfmadd213ps {rn-sae}, %zmm8, %zmm1, %zmm2
> > +
> > +       vmovaps TANHF_DATA(_sP2_lo)(%rip), %zmm9
> > +       vpermt2ps TANHF_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> > +
> > +       vmovaps TANHF_DATA(_sP0_lo)(%rip), %zmm10
> > +       vpermt2ps TANHF_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> > +
> > +       vfmadd213ps {rn-sae}, %zmm9, %zmm1, %zmm2
> > +       vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm2
> > +
> > +       kmovw   %k1, %edx
> > +       testl   %edx, %edx
> > +
> > +       /* Go to special inputs processing branch.  */
> > +       jne     L(SPECIAL_VALUES_BRANCH)
> > +       # LOE rbx r12 r13 r14 r15 zmm0 zmm2 zmm4
> > +       /* Wait until after the branch to write over zmm0.  */
> > +       vpternlogd $0xec, %zmm4, %zmm2, %zmm0
> > +
> > +       /* No stack restoration on the fastpath.  */
> > +       ret
> > +
> > +       /* Cold case. edx has 1s where there was a special value that
> > +          needs to be handled by a tanhf call. Optimize for code size
> > +          more so than speed here. */
> > +L(SPECIAL_VALUES_BRANCH):
> > +       # LOE rbx rdx r12 r13 r14 r15 zmm0 zmm2 zmm4
> > +    /* Use r13 to save/restore the stack. This allows us to use rbp as
> > +       callee save register saving code size. */
> > +       pushq   %r13
> > +       cfi_adjust_cfa_offset(8)
> > +       cfi_offset(r13, -16)
> > +       /* Need callee-saved registers to preserve state across tanhf calls.
> >          */
> > +       pushq   %rbx
> > +       cfi_adjust_cfa_offset(8)
> > +       cfi_offset(rbx, -24)
> > +       pushq   %rbp
> > +       cfi_adjust_cfa_offset(8)
> > +       cfi_offset(rbp, -32)
> > +       movq    %rsp, %r13
> > +       cfi_def_cfa_register(r13)
> > +
> > +       /* Align stack and make room for 2x zmm vectors.  */
> > +       andq    $-64, %rsp
> > +       addq    $-128, %rsp
> > +
> > +       /* Save original input (zmm0 unchanged up to this point).  */
> > +       vmovaps %zmm0, 64(%rsp)
> > +       /* Save all already computed results.  */
> > +       vpternlogd $0xec, %zmm4, %zmm2, %zmm0
> > +       vmovaps %zmm0, (%rsp)
> >
> > +       vzeroupper
> > +
> > +       /* edx has 1s where there was a special value that needs to be handled
> > +          by a tanhf call.  */
> > +       movl    %edx, %ebx
> >  L(SPECIAL_VALUES_LOOP):
> > -       incl    %r12d
> > -       cmpl    $16, %r12d
> > -
> > -       /* Check bits in range mask */
> > -       jl      L(RANGEMASK_CHECK)
> > -       # LOE rbx r15 r12d r13d
> > -
> > -       movq    16(%rsp), %r12
> > -       cfi_restore(12)
> > -       movq    8(%rsp), %r13
> > -       cfi_restore(13)
> > -       movq    (%rsp), %r14
> > -       cfi_restore(14)
> > -       vmovups 128(%rsp), %zmm0
> > -
> > -       /* Go to exit */
> > -       jmp     L(EXIT)
> > -       /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> > -       .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> > -       /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> > -       .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> > -       /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> > -       .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> > -       # LOE rbx r12 r13 r14 r15 zmm0
> > -
> > -       /* Scalar math fucntion call
> > -        * to process special input
> > -        */
> > +       # LOE rbx rbp r12 r13 r14 r15
> > +       /* use rbp as index for special value that is saved across calls to
> > +          tanhf. We technically don't need a callee save register here as offset
> > +          to rsp is always [0, 56] so we can restore rsp by realigning to 64.
> > +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> > +          in the loop. Realigning also costs more code size.  */
> > +       xorl    %ebp, %ebp
> > +       tzcntl  %ebx, %ebp
> >
> > -L(SCALAR_MATH_CALL):
> > -       movl    %r12d, %r14d
> > -       movss   64(%rsp, %r14, 4), %xmm0
> > +       /* Scalar math function call to process special input.  */
> > +       movss   64(%rsp, %rbp, 4), %xmm0
> >         call    tanhf@PLT
> > -       # LOE rbx r14 r15 r12d r13d xmm0
> >
> > -       movss   %xmm0, 128(%rsp, %r14, 4)
> > +       /* No good way to avoid the store-forwarding fault this will cause on
> > +          return. `lfence` avoids the SF fault but at greater cost as it
> > +          serializes stack/callee save restoration.  */
> > +       movss   %xmm0, (%rsp, %rbp, 4)
> >
> > -       /* Process special inputs in loop */
> > -       jmp     L(SPECIAL_VALUES_LOOP)
> > -       # LOE rbx r15 r12d r13d
> > +       blsrl   %ebx, %ebx
> > +       jnz     L(SPECIAL_VALUES_LOOP)
> > +       # LOE r12 r13 r14 r15
> > +
> > +       /* All results have been written to 64(%rsp).  */
> Should 64 be removed?


Correct. Fixed in V2.
> > +       vmovaps (%rsp), %zmm0
> > +       /* Restore rsp.  */
> > +       movq    %r13, %rsp
> > +       cfi_def_cfa_register(rsp)
> > +       /* Restore callee save registers.  */
> > +       popq    %rbp
> > +       cfi_adjust_cfa_offset(-8)
> > +       cfi_restore(rbp)
> > +       popq    %rbx
> > +       cfi_adjust_cfa_offset(-8)
> > +       cfi_restore(rbp)
> > +       popq    %r13
> > +       cfi_adjust_cfa_offset(-8)
> > +       cfi_restore(r13)
> > +       ret
> >  END(_ZGVeN16v_tanhf_skx)
> >
> >         .section .rodata, "a"
> > -       .align  64
> > -
> > +       .align  16
> >  #ifdef __svml_stanh_data_internal_typedef
> >  typedef unsigned int VUINT32;
> > -typedef struct {
> > -       __declspec(align(64)) VUINT32 _sC[32][1];
> > -       __declspec(align(64)) VUINT32 _sP0[32][1];
> > -       __declspec(align(64)) VUINT32 _sP2[32][1];
> > -       __declspec(align(64)) VUINT32 _sP3[32][1];
> > -       __declspec(align(64)) VUINT32 _sP4[32][1];
> > -       __declspec(align(64)) VUINT32 _sP5[32][1];
> > -       __declspec(align(64)) VUINT32 _sP6[32][1];
> > -       __declspec(align(64)) VUINT32 _sP7[32][1];
> > -       __declspec(align(64)) VUINT32 _iExpMantMask_UISA[16][1];
> > -       __declspec(align(64)) VUINT32 _iMinIdxOfsMask_UISA[16][1];
> > -       __declspec(align(64)) VUINT32 _iMaxIdxMask_UISA[16][1];
> > +typedef struct
> > +       {
> > +       __declspec(align(4)) VUINT32 _iExpMantMask_UISA[1][1];
> > +       __declspec(align(4)) VUINT32 _iMinIdxOfsMask_UISA[1][1];
> > +       __declspec(align(4)) VUINT32 _iMaxIdxMask_UISA[1][1];
> > +       __declspec(align(4)) VUINT32 _iExpMask[1][1];
> > +       __declspec(align(64)) VUINT32 _sC_lo[16][1];
> > +       __declspec(align(64)) VUINT32 _sC_hi[16][1];
> > +       __declspec(align(64)) VUINT32 _sP7_lo[16][1];
> > +       __declspec(align(64)) VUINT32 _sP7_hi[16][1];
> >         __declspec(align(64)) VUINT32 _sSignMask[16][1];
> > -       __declspec(align(64)) VUINT32 _sAbsMask[16][1];
> > -       __declspec(align(64)) VUINT32 _iExpMantMask[16][1];
> > -       __declspec(align(64)) VUINT32 _iExpMask[16][1];
> > -       __declspec(align(64)) VUINT32 _iMinIdxOfsMask[16][1];
> > -       __declspec(align(64)) VUINT32 _iMaxIdxMask[16][1];
> > +       __declspec(align(64)) VUINT32 _sP6_lo[16][1];
> > +       __declspec(align(64)) VUINT32 _sP6_hi[16][1];
> > +       __declspec(align(64)) VUINT32 _sP5_lo[16][1];
> > +       __declspec(align(64)) VUINT32 _sP5_hi[16][1];
> > +       __declspec(align(64)) VUINT32 _sP4_lo[16][1];
> > +       __declspec(align(64)) VUINT32 _sP4_hi[16][1];
> > +       __declspec(align(64)) VUINT32 _sP3_lo[16][1];
> > +       __declspec(align(64)) VUINT32 _sP3_hi[16][1];
> > +       __declspec(align(64)) VUINT32 _sP2_lo[16][1];
> > +       __declspec(align(64)) VUINT32 _sP2_hi[16][1];
> > +       __declspec(align(64)) VUINT32 _sP0_lo[16][1];
> > +       __declspec(align(64)) VUINT32 _sP0_hi[16][1];
> >  } __svml_stanh_data_internal;
> >  #endif
> > +
> >  __svml_stanh_data_internal:
> > -       /* _sC */
> > +       .align  4
> > +       /* _iExpMantMask_UISA */
> > +       .long   0x7fe00000
> > +
> > +       .align  4
> > +       /* _iMinIdxOfsMask_UISA */
> > +       .long   0x3d400000
> > +
> > +       .align  4
> > +       /* _iMaxIdxMask_UISA */
> > +       .long   0x03e00000
> > +
> > +       .align  4
> > +       /* _iExpMask */
> > +       .long   0x7f000000
> > +
> > +       .align  64
> > +__svml_stanh_data_internal_al64:
> > +       .align  64
> > +       /* _sC_lo */
> >         .long   0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
> >         .long   0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
> >         .long   0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
> >         .long   0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
> > +
> > +       .align  64
> > +       /* _sC_hi */
> >         .long   0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
> >         .long   0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
> >         .long   0x40500000, 0x40700000, 0x40900000, 0x40b00000
> >         .long   0x40d00000, 0x40f00000, 0x41100000, 0x00000000
> > -       /* p0 */
> > -       .align  64
> > -       .long   0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
> > -       .long   0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
> > -       .long   0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
> > -       .long   0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
> > -       .long   0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
> > -       .long   0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
> > -       .long   0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
> > -       .long   0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
> > -       /* p2 */
> > -       .align  64
> > -       .long   0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
> > -       .long   0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
> > -       .long   0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
> > -       .long   0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
> > -       .long   0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
> > -       .long   0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
> > -       .long   0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
> > -       .long   0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
> > -       /* p3 */
> > +
> >         .align  64
> > -       .long   0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
> > -       .long   0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
> > -       .long   0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
> > -       .long   0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
> > -       .long   0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
> > -       .long   0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
> > -       .long   0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
> > -       .long   0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
> > -       /* p4 */
> > +       /* _sP7_lo */
> > +       .long   0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
> > +       .long   0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
> > +       .long   0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
> > +       .long   0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
> > +
> >         .align  64
> > -       .long   0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
> > -       .long   0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
> > -       .long   0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
> > -       .long   0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
> > -       .long   0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
> > -       .long   0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
> > -       .long   0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
> > -       .long   0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
> > -       /* p5 */
> > +       /* _sP7_hi */
> > +       .long   0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
> > +       .long   0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
> > +       .long   0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
> > +       .long   0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
> > +
> >         .align  64
> > -       .long   0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
> > -       .long   0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
> > -       .long   0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
> > -       .long   0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
> > -       .long   0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
> > -       .long   0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
> > -       .long   0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
> > -       .long   0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
> > -       /* p6 */
> > +       /* _sSignMask */
> > +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > +
> >         .align  64
> > +       /* _sP6_lo */
> >         .long   0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
> >         .long   0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
> >         .long   0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
> >         .long   0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
> > +
> > +       .align  64
> > +       /* _sP6_hi */
> >         .long   0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
> >         .long   0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
> >         .long   0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
> >         .long   0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
> > -       /* p7 */
> > +
> >         .align  64
> > -       .long   0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
> > -       .long   0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
> > -       .long   0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
> > -       .long   0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
> > -       .long   0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
> > -       .long   0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
> > -       .long   0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
> > -       .long   0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
> > +       /* _sP5_lo */
> > +       .long   0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
> > +       .long   0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
> > +       .long   0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
> > +       .long   0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
> > +
> >         .align  64
> > -       .long   0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000 /* _iExpMantMask_UISA */
> > +       /* _sP5_hi */
> > +       .long   0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
> > +       .long   0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
> > +       .long   0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
> > +       .long   0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
> > +
> >         .align  64
> > -       .long   0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000 /* _iMinIdxOfsMask_UISA */
> > +       /* _sP4_lo */
> > +       .long   0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
> > +       .long   0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
> > +       .long   0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
> > +       .long   0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
> > +
> >         .align  64
> > -       .long   0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000 /* _iMaxIdxMask_UISA */
> > +       /* _sP4_hi */
> > +       .long   0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
> > +       .long   0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
> > +       .long   0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
> > +       .long   0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
> > +
> >         .align  64
> > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSignMask */
> > +       /* _sP3_lo */
> > +       .long   0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
> > +       .long   0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
> > +       .long   0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
> > +       .long   0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
> > +
> >         .align  64
> > -       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _sAbsMask */
> > +       /* _sP3_hi */
> > +       .long   0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
> > +       .long   0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
> > +       .long   0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
> > +       .long   0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
> > +
> >         .align  64
> > -       .long   0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000 /* _iExpMantMask */
> > +       /* _sP2_lo */
> > +       .long   0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
> > +       .long   0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
> > +       .long   0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
> > +       .long   0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
> > +
> >         .align  64
> > -       .long   0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000 /* _iExpMask */
> > +       /* _sP2_hi */
> > +       .long   0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
> > +       .long   0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
> > +       .long   0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
> > +       .long   0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
> > +
> >         .align  64
> > -       .long   0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000 /* _iMinIdxOfsMask */
> > +       /* _sP0_lo */
> > +       .long   0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
> > +       .long   0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
> > +       .long   0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
> > +       .long   0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
> > +
> >         .align  64
> > -       .long   0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000 /* _iMaxIdxMask */
> > +       /* _sP0_hi */
> > +       .long   0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
> > +       .long   0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
> > +       .long   0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
> > +       .long   0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
> > +
> >         .align  64
> > +       .type   __svml_stanh_data_internal_al64, @object
> > +       .size   __svml_stanh_data_internal_al64, .-__svml_stanh_data_internal_al64
> >         .type   __svml_stanh_data_internal, @object
> >         .size   __svml_stanh_data_internal, .-__svml_stanh_data_internal
> > --
> > 2.34.1
> >
>
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v2 1/7] x86: Improve svml_s_atanhf16_core_avx512.S
  2022-06-09 16:56     ` Noah Goldstein
@ 2022-06-09 16:57       ` H.J. Lu
  0 siblings, 0 replies; 48+ messages in thread
From: H.J. Lu @ 2022-06-09 16:57 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Thu, Jun 9, 2022 at 9:56 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Thu, Jun 9, 2022 at 9:00 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Wed, Jun 8, 2022 at 5:05 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > Improvements are:
> > >     1. Reduce code size (-64 bytes).
> > >     2. Remove redundant move instructions.
> > >     3. Slightly improve instruction selection/scheduling where
> > >        possible.
> > >     4. Reduce rodata size ([-128, -188] bytes).
> > >
> > > The throughput improvement is not significant as the port 0 bottleneck
> > > is unavoidable.
> > >         Function, New Time, Old Time, New / Old
> > > _ZGVeN16v_atanhf,     1.39,    1.408,     0.987
> > > ---
> > >  .../multiarch/svml_s_atanhf16_core_avx512.S   | 474 +++++++++---------
> > >  1 file changed, 244 insertions(+), 230 deletions(-)
> > >
> > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> > > index a1cd920a0f..3d808ac2bd 100644
> > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> > > @@ -31,53 +31,50 @@
> > >   *
> > >   */
> > >
> > > -/* Offsets for data table __svml_satanh_data_internal_avx512
> > > - */
> > > -#define Log_tbl_H                      0
> > > -#define Log_tbl_L                      128
> > > -#define One                            256
> > > -#define AbsMask                                320
> > > -#define AddB5                          384
> > > -#define RcpBitMask                     448
> > > -#define poly_coeff3                    512
> > > -#define poly_coeff2                    576
> > > -#define poly_coeff1                    640
> > > -#define poly_coeff0                    704
> > > -#define Half                           768
> > > -#define L2H                            832
> > > -#define L2L                            896
> > > +/* Offsets for data table __svml_satanh_data_internal_avx512 and
> > > +   __svml_satanh_data_internal_avx512_al64. Ordered by use in the
> > > +   function. On cold-starts this might help the prefetcher. Possibly
> > > +   a better idea is to interleave start/end so that the prefetcher is
> > > +   less likely to detect a stream and pull irrelevant lines into
> > > +   cache.  */
> > > +
> > > +/* Offset into __svml_satanh_data_internal_avx512. 4-byte aligned as
> > > +   the memory is broadcast to {1to16}.  */
> > > +#define AbsMask                                0
> > > +
> > > +/* Offset into __svml_satanh_data_internal_avx512_al64. The full value
> > > +   is used here.  */
> > > +#define One                            0
> > > +#define AddB5                          64
> > > +#define RcpBitMask                     128
> > > +#define Log_tbl_L_lo                   192
> > > +#define Log_tbl_L_hi                   256
> > > +#define Log_tbl_H_lo                   320
> > > +#define Log_tbl_H_hi                   384
> > > +#define L2H                            448
> > > +#define L2L                            512
> > > +#define poly_coeff3                    576
> > > +#define poly_coeff2                    640
> > > +#define poly_coeff1                    704
> > >
> > >  #include <sysdep.h>
> > >
> > > +#define ATANHF_DATA(x)                 ((x)+__svml_satanh_data_internal_avx512_al64)
> > > +
> > >         .section .text.exex512, "ax", @progbits
> > >  ENTRY(_ZGVeN16v_atanhf_skx)
> > > -       pushq   %rbp
> > > -       cfi_def_cfa_offset(16)
> > > -       movq    %rsp, %rbp
> > > -       cfi_def_cfa(6, 16)
> > > -       cfi_offset(6, -16)
> > > -       andq    $-64, %rsp
> > > -       subq    $192, %rsp
> > > -       vmovups One+__svml_satanh_data_internal_avx512(%rip), %zmm4
> > > -
> > > -       /* round reciprocals to 1+5b mantissas */
> > > -       vmovups AddB5+__svml_satanh_data_internal_avx512(%rip), %zmm14
> > > -       vmovups RcpBitMask+__svml_satanh_data_internal_avx512(%rip), %zmm1
> > > -       vmovaps %zmm0, %zmm11
> > > -       vandps  AbsMask+__svml_satanh_data_internal_avx512(%rip), %zmm11, %zmm6
> > > +       vandps  AbsMask+__svml_satanh_data_internal_avx512(%rip){1to16}, %zmm0, %zmm6
> > > +       vmovups ATANHF_DATA(One)(%rip), %zmm4
> > >
> > >         /* 1+y */
> > >         vaddps  {rn-sae}, %zmm4, %zmm6, %zmm9
> > >
> > >         /* 1-y */
> > >         vsubps  {rn-sae}, %zmm6, %zmm4, %zmm8
> > > -       vxorps  %zmm6, %zmm11, %zmm10
> > > -
> > > -       /* Yp_high */
> > > -       vsubps  {rn-sae}, %zmm4, %zmm9, %zmm2
> > >
> > > -       /* -Ym_high */
> > > -       vsubps  {rn-sae}, %zmm4, %zmm8, %zmm5
> > > +       /* round reciprocals to 1+5b mantissas */
> > > +       vmovups ATANHF_DATA(AddB5)(%rip), %zmm14
> > > +       vmovups ATANHF_DATA(RcpBitMask)(%rip), %zmm1
> > >
> > >         /* RcpP ~ 1/Yp */
> > >         vrcp14ps %zmm9, %zmm12
> > > @@ -85,15 +82,21 @@ ENTRY(_ZGVeN16v_atanhf_skx)
> > >         /* RcpM ~ 1/Ym */
> > >         vrcp14ps %zmm8, %zmm13
> > >
> > > +       /* Yp_high */
> > > +       vsubps  {rn-sae}, %zmm4, %zmm9, %zmm2
> > > +
> > > +       /* -Ym_high */
> > > +       vsubps  {rn-sae}, %zmm4, %zmm8, %zmm5
> > > +
> > > +
> > >         /* input outside (-1, 1) ? */
> > > -       vcmpps  $21, {sae}, %zmm4, %zmm6, %k0
> > >         vpaddd  %zmm14, %zmm12, %zmm15
> > > -       vpaddd  %zmm14, %zmm13, %zmm0
> > > +       vpaddd  %zmm14, %zmm13, %zmm12
> > >
> > >         /* Yp_low */
> > >         vsubps  {rn-sae}, %zmm2, %zmm6, %zmm3
> > >         vandps  %zmm1, %zmm15, %zmm7
> > > -       vandps  %zmm1, %zmm0, %zmm12
> > > +       vandps  %zmm1, %zmm12, %zmm12
> > >
> > >         /* Ym_low */
> > >         vaddps  {rn-sae}, %zmm5, %zmm6, %zmm5
> > > @@ -102,225 +105,199 @@ ENTRY(_ZGVeN16v_atanhf_skx)
> > >         vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9
> > >
> > >         /* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low */
> > > -       vfmsub231ps {rn-sae}, %zmm12, %zmm8, %zmm4
> > > -       vmovups Log_tbl_L+__svml_satanh_data_internal_avx512(%rip), %zmm8
> > > -       vmovups Log_tbl_L+64+__svml_satanh_data_internal_avx512(%rip), %zmm13
> > > +       vfmsub213ps {rn-sae}, %zmm4, %zmm12, %zmm8
> > > +
> > > +       vmovups ATANHF_DATA(Log_tbl_L_lo)(%rip), %zmm10
> > > +       vmovups ATANHF_DATA(Log_tbl_L_hi)(%rip), %zmm13
> > >
> > >         /* exponents */
> > > -       vgetexpps {sae}, %zmm7, %zmm15
> > >         vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9
> > > +       vgetexpps {sae}, %zmm7, %zmm15
> > > +
> > >
> > >         /* Table lookups */
> > > -       vmovups __svml_satanh_data_internal_avx512(%rip), %zmm6
> > > +       vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm8
> > >         vgetexpps {sae}, %zmm12, %zmm14
> > > -       vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm4
> > > +
> > >
> > >         /* Prepare table index */
> > >         vpsrld  $18, %zmm7, %zmm3
> > >         vpsrld  $18, %zmm12, %zmm2
> > > -       vmovups Log_tbl_H+64+__svml_satanh_data_internal_avx512(%rip), %zmm7
> > > -       vmovups poly_coeff1+__svml_satanh_data_internal_avx512(%rip), %zmm12
> > > -
> > > +       vmovups ATANHF_DATA(Log_tbl_H_lo)(%rip), %zmm11
> > > +       vmovups ATANHF_DATA(Log_tbl_H_hi)(%rip), %zmm7
> > >         /* Km-Kp */
> > > +
> > > +       vmovaps %zmm3, %zmm5
> > > +       vpermi2ps %zmm13, %zmm10, %zmm3
> > > +       vpermt2ps %zmm13, %zmm2, %zmm10
> > > +       vpermi2ps %zmm7, %zmm11, %zmm5
> > > +       vpermt2ps %zmm7, %zmm2, %zmm11
> > >         vsubps  {rn-sae}, %zmm15, %zmm14, %zmm1
> > > -       kmovw   %k0, %edx
> > > -       vmovaps %zmm3, %zmm0
> > > -       vpermi2ps %zmm13, %zmm8, %zmm3
> > > -       vpermt2ps %zmm13, %zmm2, %zmm8
> > > -       vpermi2ps %zmm7, %zmm6, %zmm0
> > > -       vpermt2ps %zmm7, %zmm2, %zmm6
> > > -       vsubps  {rn-sae}, %zmm3, %zmm8, %zmm5
> > > +       vsubps  {rn-sae}, %zmm3, %zmm10, %zmm7
> > >
> > >         /* K*L2H + Th */
> > > -       vmovups L2H+__svml_satanh_data_internal_avx512(%rip), %zmm2
> > > +       vmovups ATANHF_DATA(L2H)(%rip), %zmm2
> > >
> > >         /* K*L2L + Tl */
> > > -       vmovups L2L+__svml_satanh_data_internal_avx512(%rip), %zmm3
> > > -
> > > -       /* polynomials */
> > > -       vmovups poly_coeff3+__svml_satanh_data_internal_avx512(%rip), %zmm7
> > > -       vmovups poly_coeff0+__svml_satanh_data_internal_avx512(%rip), %zmm13
> > > +       vmovups ATANHF_DATA(L2L)(%rip), %zmm3
> > >
> > >         /* table values */
> > > -       vsubps  {rn-sae}, %zmm0, %zmm6, %zmm0
> > > -       vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm0
> > > -       vfmadd213ps {rn-sae}, %zmm5, %zmm3, %zmm1
> > > -       vmovups poly_coeff2+__svml_satanh_data_internal_avx512(%rip), %zmm3
> > > -       vmovaps %zmm3, %zmm2
> > > -       vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm2
> > > -       vfmadd231ps {rn-sae}, %zmm4, %zmm7, %zmm3
> > > -       vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm2
> > > -       vfmadd213ps {rn-sae}, %zmm12, %zmm4, %zmm3
> > > -       vfmadd213ps {rn-sae}, %zmm13, %zmm9, %zmm2
> > > -       vfmadd213ps {rn-sae}, %zmm13, %zmm4, %zmm3
> > > +       vsubps  {rn-sae}, %zmm5, %zmm11, %zmm5
> > > +       vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm5
> > > +       vfmadd213ps {rn-sae}, %zmm7, %zmm3, %zmm1
> > > +       /* polynomials */
> > > +       vmovups ATANHF_DATA(poly_coeff3)(%rip), %zmm7
> > > +       vmovups ATANHF_DATA(poly_coeff2)(%rip), %zmm10
> > > +       vmovaps %zmm10, %zmm14
> > > +       vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm10
> > > +       vfmadd231ps {rn-sae}, %zmm8, %zmm7, %zmm14
> > > +       vmovups ATANHF_DATA(poly_coeff1)(%rip), %zmm12
> > > +       vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm10
> > > +       vfmadd213ps {rn-sae}, %zmm12, %zmm8, %zmm14
> > > +       vfmadd213ps {rn-sae}, %zmm4, %zmm9, %zmm10
> > > +       vfmadd213ps {rn-sae}, %zmm4, %zmm8, %zmm14
> > >
> > >         /* (K*L2L + Tl) + Rp*PolyP */
> > > -       vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm2
> > > -       vorps   Half+__svml_satanh_data_internal_avx512(%rip), %zmm10, %zmm9
> > > +       vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm10
> > > +
> > > +       /* zmm12 = zmm12 & (zmm4 | zmm0).  */
> > > +       vpternlogq $0xe0, %zmm0, %zmm4, %zmm12
> > >
> > >         /* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM */
> > > -       vfnmadd213ps {rn-sae}, %zmm2, %zmm4, %zmm3
> > > -       vaddps  {rn-sae}, %zmm3, %zmm0, %zmm4
> > > -       vmulps  {rn-sae}, %zmm9, %zmm4, %zmm0
> > > +       vfnmadd213ps {rn-sae}, %zmm5, %zmm8, %zmm14
> > > +       vaddps  {rn-sae}, %zmm14, %zmm10, %zmm8
> > > +
> > > +       vcmpps  $21, {sae}, %zmm4, %zmm6, %k0
> > > +       kmovw   %k0, %edx
> > >         testl   %edx, %edx
> > >
> > >         /* Go to special inputs processing branch */
> > >         jne     L(SPECIAL_VALUES_BRANCH)
> > > -       # LOE rbx r12 r13 r14 r15 edx zmm0 zmm11
> > > +       # LOE rbx r12 r13 r14 r15 zmm0 zmm8 zmm12
> > > +       vmulps  {rn-sae}, %zmm12, %zmm8, %zmm0
> > >
> > > -       /* Restore registers
> > > -        * and exit the function
> > > -        */
> > > -
> > > -L(EXIT):
> > > -       movq    %rbp, %rsp
> > > -       popq    %rbp
> > > -       cfi_def_cfa(7, 8)
> > > -       cfi_restore(6)
> > > +       /* No register to restore on fast path.  */
> > >         ret
> > > -       cfi_def_cfa(6, 16)
> > > -       cfi_offset(6, -16)
> > > -
> > > -       /* Branch to process
> > > -        * special inputs
> > > -        */
> > >
> > > +       /* Cold case. edx has 1s where there was a special value that
> > > +          needs to be handled by an atanhf call. Optimize for code size
> > > +          more so than speed here. */
> > >  L(SPECIAL_VALUES_BRANCH):
> > > -       vmovups %zmm11, 64(%rsp)
> > > -       vmovups %zmm0, 128(%rsp)
> > > -       # LOE rbx r12 r13 r14 r15 edx zmm0
> > > -
> > > -       xorl    %eax, %eax
> > > -       # LOE rbx r12 r13 r14 r15 eax edx
> > > -
> > > -       vzeroupper
> > > -       movq    %r12, 16(%rsp)
> > > -       /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> > > -       .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> > > -       movl    %eax, %r12d
> > > -       movq    %r13, 8(%rsp)
> > > -       /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> > > -       .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> > > -       movl    %edx, %r13d
> > > -       movq    %r14, (%rsp)
> > > -       /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> > > -       .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> > > -       # LOE rbx r15 r12d r13d
> > > -
> > > -       /* Range mask
> > > -        * bits check
> > > +       # LOE rbx rdx r12 r13 r14 r15 zmm0 zmm8 zmm12
> > > +    /* Use r13 to save/restore the stack. This allows us to use rbp as
> > > +       callee save register saving code size. */
> > > +       pushq   %r13
> > > +       cfi_adjust_cfa_offset(8)
> > > +       cfi_offset(r13, -16)
> > > +       /* Need callee-saved registers to preserve state across atanhf calls.
> > > +        */
> > > +       pushq   %rbx
> > > +       cfi_adjust_cfa_offset(8)
> > > +       cfi_offset(rbx, -24)
> > > +       pushq   %rbp
> > > +       cfi_adjust_cfa_offset(8)
> > > +       cfi_offset(rbp, -32)
> > > +       movq    %rsp, %r13
> > > +       cfi_def_cfa_register(r13)
> > >
> > > -L(RANGEMASK_CHECK):
> > > -       btl     %r12d, %r13d
> > > -
> > > -       /* Call scalar math function */
> > > -       jc      L(SCALAR_MATH_CALL)
> > > -       # LOE rbx r15 r12d r13d
> > > -
> > > -       /* Special inputs
> > > -        * processing loop
> > > -        */
> > > +       /* Align stack and make room for 2x zmm vectors.  */
> > > +       andq    $-64, %rsp
> > > +       addq    $-128, %rsp
> > > +       vmulps  {rn-sae}, %zmm12, %zmm8, %zmm1
> > > +       vmovaps %zmm1, (%rsp)
> > > +       vmovaps %zmm0, 64(%rsp)
> > > +       vzeroupper
> > >
> > > +       /* edx has 1s where there was a special value that needs to be handled
> > > +          by an atanhf call.  */
> > > +       movl    %edx, %ebx
> > >  L(SPECIAL_VALUES_LOOP):
> > > -       incl    %r12d
> > > -       cmpl    $16, %r12d
> > > -
> > > -       /* Check bits in range mask */
> > > -       jl      L(RANGEMASK_CHECK)
> > > -       # LOE rbx r15 r12d r13d
> > > -
> > > -       movq    16(%rsp), %r12
> > > -       cfi_restore(12)
> > > -       movq    8(%rsp), %r13
> > > -       cfi_restore(13)
> > > -       movq    (%rsp), %r14
> > > -       cfi_restore(14)
> > > -       vmovups 128(%rsp), %zmm0
> > > -
> > > -       /* Go to exit */
> > > -       jmp     L(EXIT)
> > > -       /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> > > -       .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> > > -       /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> > > -       .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> > > -       /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> > > -       .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> > > -       # LOE rbx r12 r13 r14 r15 zmm0
> > > -
> > > -       /* Scalar math fucntion call
> > > -        * to process special input
> > > -        */
> > > -
> > > -L(SCALAR_MATH_CALL):
> > > -       movl    %r12d, %r14d
> > > -       movss   64(%rsp, %r14, 4), %xmm0
> > > +       # LOE rbx rbp r12 r13 r14 r15
> > > +       /* use rbp as index for special value that is saved across calls to
> > > +          atanhf. We technically don't need a callee save register here as offset
> > > +          to rsp is always [0, 56] so we can restore rsp by realigning to 64.
> > > +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> > > +          in the loop. Realigning also costs more code size.  */
> > > +       xorl    %ebp, %ebp
> > > +       tzcntl  %ebx, %ebp
> > > +
> > > +       /* Scalar math function call to process special input.  */
> > > +       movss   64(%rsp, %rbp, 4), %xmm0
> > >         call    atanhf@PLT
> > > -       # LOE rbx r14 r15 r12d r13d xmm0
> > > -
> > > -       movss   %xmm0, 128(%rsp, %r14, 4)
> > >
> > > -       /* Process special inputs in loop */
> > > -       jmp     L(SPECIAL_VALUES_LOOP)
> > > -       # LOE rbx r15 r12d r13d
> > > +       /* No good way to avoid the store-forwarding fault this will cause on
> > > +          return. `lfence` avoids the SF fault but at greater cost as it
> > > +          serializes stack/callee save restoration.  */
> > > +       movss   %xmm0, (%rsp, %rbp, 4)
> > > +
> > > +       blsrl   %ebx, %ebx
> > > +       jnz     L(SPECIAL_VALUES_LOOP)
> > > +       # LOE r12 r13 r14 r15
> > > +
> > > +       /* All results have been written to 64(%rsp).  */
> >
> > The return value is loaded from (%rsp).   Should all results be
> > written to (%rsp)?
>
> Correct. Fixed in V2.

Did you mean V3?

> >
> > > +       vmovaps (%rsp), %zmm0
> > > +       /* Restore rsp.  */
> > > +       movq    %r13, %rsp
> > > +       cfi_def_cfa_register(rsp)
> > > +       /* Restore callee save registers.  */
> > > +       popq    %rbp
> > > +       cfi_adjust_cfa_offset(-8)
> > > +       cfi_restore(rbp)
> > > +       popq    %rbx
> > > +       cfi_adjust_cfa_offset(-8)
> > > +       cfi_restore(rbp)
> > > +       popq    %r13
> > > +       cfi_adjust_cfa_offset(-8)
> > > +       cfi_restore(r13)
> > > +       ret
> > >  END(_ZGVeN16v_atanhf_skx)
> > >
> > >         .section .rodata, "a"
> > > -       .align  64
> > > -
> > > +       .align  4
> > >  #ifdef __svml_satanh_data_internal_avx512_typedef
> > >  typedef unsigned int VUINT32;
> > > -typedef struct {
> > > -       __declspec(align(64)) VUINT32 Log_tbl_H[32][1];
> > > -       __declspec(align(64)) VUINT32 Log_tbl_L[32][1];
> > > +typedef struct{
> > > +       __declspec(align(4)) VUINT32 AbsMask[1][1];
> > >         __declspec(align(64)) VUINT32 One[16][1];
> > > -       __declspec(align(64)) VUINT32 AbsMask[16][1];
> > >         __declspec(align(64)) VUINT32 AddB5[16][1];
> > >         __declspec(align(64)) VUINT32 RcpBitMask[16][1];
> > > +       __declspec(align(64)) VUINT32 Log_tbl_L_lo[16][1];
> > > +       __declspec(align(64)) VUINT32 Log_tbl_L_hi[16][1];
> > > +       __declspec(align(64)) VUINT32 Log_tbl_H_lo[16][1];
> > > +       __declspec(align(64)) VUINT32 Log_tbl_H_hi[16][1];
> > > +       __declspec(align(64)) VUINT32 L2H[16][1];
> > > +       __declspec(align(64)) VUINT32 L2L[16][1];
> > >         __declspec(align(64)) VUINT32 poly_coeff3[16][1];
> > >         __declspec(align(64)) VUINT32 poly_coeff2[16][1];
> > >         __declspec(align(64)) VUINT32 poly_coeff1[16][1];
> > > -       __declspec(align(64)) VUINT32 poly_coeff0[16][1];
> > > -       __declspec(align(64)) VUINT32 Half[16][1];
> > > -       __declspec(align(64)) VUINT32 L2H[16][1];
> > > -       __declspec(align(64)) VUINT32 L2L[16][1];
> > >  } __svml_satanh_data_internal_avx512;
> > >  #endif
> > >  __svml_satanh_data_internal_avx512:
> > > -       /* Log_tbl_H */
> > > -       .long   0x00000000
> > > -       .long   0x3cfc0000
> > > -       .long   0x3d780000
> > > -       .long   0x3db78000
> > > -       .long   0x3df10000
> > > -       .long   0x3e14c000
> > > -       .long   0x3e300000
> > > -       .long   0x3e4a8000
> > > -       .long   0x3e648000
> > > -       .long   0x3e7dc000
> > > -       .long   0x3e8b4000
> > > -       .long   0x3e974000
> > > -       .long   0x3ea30000
> > > -       .long   0x3eae8000
> > > -       .long   0x3eb9c000
> > > -       .long   0x3ec4e000
> > > -       .long   0x3ecfa000
> > > -       .long   0x3eda2000
> > > -       .long   0x3ee48000
> > > -       .long   0x3eeea000
> > > -       .long   0x3ef8a000
> > > -       .long   0x3f013000
> > > -       .long   0x3f05f000
> > > -       .long   0x3f0aa000
> > > -       .long   0x3f0f4000
> > > -       .long   0x3f13d000
> > > -       .long   0x3f184000
> > > -       .long   0x3f1ca000
> > > -       .long   0x3f20f000
> > > -       .long   0x3f252000
> > > -       .long   0x3f295000
> > > -       .long   0x3f2d7000
> > > -       /* Log_tbl_L */
> > > +       /* Leave this at front so we can potentially save space due to
> > > +          smaller alignment constraint.  */
> > > +       .align  4
> > > +    /* AbsMask */
> > > +       .long   0x7fffffff
> > > +       .align  64
> > > +__svml_satanh_data_internal_avx512_al64:
> > > +       /* One */
> > > +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> > > +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> > > +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> > > +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> > > +       /* AddB5 */
> > > +       .align  64
> > > +       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000
> > > +       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000
> > > +       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000
> > > +       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000
> > > +       /* RcpBitMask */
> > > +       .align  64
> > > +       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> > > +       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> > > +       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> > > +       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> > > +       /* Log_tbl_L_lo */
> > >         .align  64
> > >         .long   0x00000000
> > >         .long   0x3726c39e
> > > @@ -338,6 +315,8 @@ __svml_satanh_data_internal_avx512:
> > >         .long   0x38dedfac
> > >         .long   0x38ebfb5e
> > >         .long   0xb8e63c9f
> > > +       /* Log_tbl_L_hi */
> > > +       .align  64
> > >         .long   0xb85c1340
> > >         .long   0x38777bcd
> > >         .long   0xb6038656
> > > @@ -354,39 +333,74 @@ __svml_satanh_data_internal_avx512:
> > >         .long   0x38f85db0
> > >         .long   0x37b4996f
> > >         .long   0xb8bfb3ca
> > > -       /* One */
> > > +       /* Log_tbl_H_lo */
> > >         .align  64
> > > -       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> > > -       /* AbsMask */
> > > +       .long   0x00000000
> > > +       .long   0x3cfc0000
> > > +       .long   0x3d780000
> > > +       .long   0x3db78000
> > > +       .long   0x3df10000
> > > +       .long   0x3e14c000
> > > +       .long   0x3e300000
> > > +       .long   0x3e4a8000
> > > +       .long   0x3e648000
> > > +       .long   0x3e7dc000
> > > +       .long   0x3e8b4000
> > > +       .long   0x3e974000
> > > +       .long   0x3ea30000
> > > +       .long   0x3eae8000
> > > +       .long   0x3eb9c000
> > > +       .long   0x3ec4e000
> > > +       /* Log_tbl_H_hi */
> > >         .align  64
> > > -       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> > > -       /* AddB5 */
> > > +       .long   0x3ecfa000
> > > +       .long   0x3eda2000
> > > +       .long   0x3ee48000
> > > +       .long   0x3eeea000
> > > +       .long   0x3ef8a000
> > > +       .long   0x3f013000
> > > +       .long   0x3f05f000
> > > +       .long   0x3f0aa000
> > > +       .long   0x3f0f4000
> > > +       .long   0x3f13d000
> > > +       .long   0x3f184000
> > > +       .long   0x3f1ca000
> > > +       .long   0x3f20f000
> > > +       .long   0x3f252000
> > > +       .long   0x3f295000
> > > +       .long   0x3f2d7000
> > > +       /* L2H = log(2)_high */
> > >         .align  64
> > > -       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000
> > > -       /* RcpBitMask */
> > > +       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> > > +       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> > > +       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> > > +       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> > > +       /* L2L = log(2)_low */
> > >         .align  64
> > > -       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> > > +       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> > > +       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> > > +       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> > > +       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> > >         /* poly_coeff3 */
> > >         .align  64
> > > -       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> > > +       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> > > +       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> > > +       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> > > +       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> > >         /* poly_coeff2 */
> > >         .align  64
> > > -       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> > > +       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> > > +       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> > > +       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> > > +       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> > >         /* poly_coeff1 */
> > >         .align  64
> > > -       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> > > -       /* poly_coeff0 */
> > > -       .align  64
> > > -       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> > > -       /* Half */
> > > -       .align  64
> > > -       .long   0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
> > > -       /* L2H = log(2)_high */
> > > -       .align  64
> > > -       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> > > -       /* L2L = log(2)_low */
> > > -       .align  64
> > > -       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> > > +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> > > +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> > > +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> > > +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> > >         .align  64
> > > +       .type   __svml_satanh_data_internal_avx512_al64, @object
> > > +       .size   __svml_satanh_data_internal_avx512_al64, .-__svml_satanh_data_internal_avx512_al64
> > >         .type   __svml_satanh_data_internal_avx512, @object
> > >         .size   __svml_satanh_data_internal_avx512, .-__svml_satanh_data_internal_avx512
> > > --
> > > 2.34.1
> > >
> >
> >
> > --
> > H.J.



-- 
H.J.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v2 6/7] x86: Optimize svml_s_tanhf8_core_avx2.S
  2022-06-09 16:10     ` H.J. Lu
@ 2022-06-09 16:58       ` Noah Goldstein
  0 siblings, 0 replies; 48+ messages in thread
From: Noah Goldstein @ 2022-06-09 16:58 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Thu, Jun 9, 2022 at 9:11 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, Jun 8, 2022 at 5:06 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Optimizations are:
> >     1. Reduce code size (-81 bytes).
> >     2. Remove redundant move instructions.
> >     3. Slightly improve instruction selection/scheduling where
> >        possible.
> >     4. Prefer registers which get short instruction encoding.
> >     5. Reduce rodata size (-32 bytes).
> >
> > Result is roughly a 17-18% speedup:
> >
> >        Function, New Time, Old Time, New / Old
> > _ZGVdN8v_tanhf,     1.977,    2.402,     0.823
> > ---
> >  .../fpu/multiarch/svml_s_tanhf8_core_avx2.S   | 912 ++++--------------
> >  1 file changed, 171 insertions(+), 741 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
> > index c5c87bf5b0..a47ede0501 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
> > @@ -70,773 +70,203 @@
> >   *
> >   */
> >
> > -/* Offsets for data table __svml_stanh_data_internal
> > - */
> > -#define _dbP                           0
> > -#define _sSignMask                     4288
> > -#define _sAbsMask                      4320
> > -#define _iExpMantMask                  4352
> > -#define _iExpMask                      4384
> > -#define _iMinIdxOfsMask                        4416
> > -#define _iMaxIdxMask                   4448
> > -
> >  #include <sysdep.h>
> >
> > +/* tanhf data tables for avx2 and sse4 implementations defined here.
> > + */
> > +#include "svml_s_tanhf_rodata.S"
> > +
> >         .section .text.avx2, "ax", @progbits
> >  ENTRY(_ZGVdN8v_tanhf_avx2)
> > -       pushq   %rbp
> > -       cfi_def_cfa_offset(16)
> > -       movq    %rsp, %rbp
> > -       cfi_def_cfa(6, 16)
> > -       cfi_offset(6, -16)
> > -       andq    $-32, %rsp
> > -       pushq   %r12
> > -       subq    $120, %rsp
> > -       lea     _dbP+16+__svml_stanh_data_internal(%rip), %r10
> > -       vmovaps %ymm0, %ymm12
> > -
> >         /* Here huge arguments, INF and NaNs are filtered out to callout. */
> > -       vpand   _iExpMantMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm14
> > +       vpand   TANHF_DATA(_iExpMantMask)(%rip), %ymm0, %ymm4
> > +       vpsubd  TANHF_DATA(_iMinIdxOfsMask)(%rip), %ymm4, %ymm2
> > +
> > +       /* Selection of arguments between [0, 0x04280000] into ymm2.  */
> > +       vpxor   %ymm3, %ymm3, %ymm3
> > +       vpmaxsd %ymm3, %ymm2, %ymm2
> > +       vpminsd TANHF_DATA(_iMaxIdxMask)(%rip), %ymm2, %ymm2
> >
> >         /*
> >          *  small table specific variables *
> >          *  Constant loading
> >          */
> > -       vmovups _iMaxIdxMask+__svml_stanh_data_internal(%rip), %ymm8
> > -       vpsubd  _iMinIdxOfsMask+__svml_stanh_data_internal(%rip), %ymm14, %ymm9
> > -
> > -       /* if VMIN, VMAX is defined for I type */
> > -       vxorps  %ymm15, %ymm15, %ymm15
> > -       vpcmpgtd %ymm15, %ymm9, %ymm0
> > -       vpand   %ymm0, %ymm9, %ymm7
> > -       vpcmpgtd %ymm8, %ymm9, %ymm6
> > -       vblendvps %ymm6, %ymm8, %ymm7, %ymm3
> > -       vpsrld  $14, %ymm3, %ymm1
> > -       vpcmpgtd _iExpMask+__svml_stanh_data_internal(%rip), %ymm14, %ymm13
> > -       vmovmskps %ymm13, %r11d
> > -       vandps  _sAbsMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm10
> > -       vandps  _sSignMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm11
> > -       vextractf128 $1, %ymm1, %xmm2
> > -       vmovd   %xmm1, %r9d
> > -       vmovd   %xmm2, %ecx
> > -       vpextrd $1, %xmm2, %edx
> > -       vpextrd $1, %xmm1, %r8d
> > -       movslq  %r9d, %r9
> > -       movslq  %edx, %rdx
> > -       movslq  %r8d, %r8
> > -       vpextrd $2, %xmm1, %edi
> > -       movslq  %ecx, %rcx
> > -       /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus)  */
> > -       .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
> > -       vpextrd $3, %xmm2, %r12d
> > -       vpextrd $3, %xmm1, %esi
> > -       vpextrd $2, %xmm2, %eax
> > -       movslq  %edi, %rdi
> > -       movslq  %r12d, %r12
> > -       movslq  %esi, %rsi
> > -       movslq  %eax, %rax
> > -       vmovupd -16(%r9, %r10), %xmm5
> > -       vmovupd -16(%rdx, %r10), %xmm14
> > -       vmovupd -16(%rcx, %r10), %xmm13
> > -       vmovupd (%r9, %r10), %xmm1
> > -       vmovupd (%r8, %r10), %xmm2
> > -       vmovupd -16(%r8, %r10), %xmm4
> > -       vinsertf128 $1, -16(%rdi, %r10), %ymm5, %ymm15
> > -       vinsertf128 $1, -16(%r12, %r10), %ymm14, %ymm3
> > -       vinsertf128 $1, -16(%rax, %r10), %ymm13, %ymm6
> > -       vinsertf128 $1, (%rdi, %r10), %ymm1, %ymm5
> > -       vinsertf128 $1, (%rsi, %r10), %ymm2, %ymm14
> > -       vunpcklpd %ymm3, %ymm6, %ymm8
> > +       vpsrld  $14, %ymm2, %ymm1
> > +
> > +       /* We are splitting ymm1 into 8 GPRs. This may be faster to do with
> > +          store/load as we can take advantage of store-forwarding.  */
> > +       vmovq   %xmm1, %r8
> > +       /* We have eliminated all negative values for ymm1 so no need to sign
> > +          extend.  */
> > +       movl    %r8d, %r9d
> > +       shrq    $32, %r8
> > +
> > +       /* Store base of lookup table in rax.  */
> > +       leaq    TANHF_DATA(_lookupTable)(%rip), %rax
> > +
> > +       /* Instead of using cross-lane permutes on ymm vectors, use vinsertf128
> > +          with memory operand. This helps alleviate bottleneck on p5.  */
> > +       vmovupd 16(%r9, %rax), %xmm5
> > +
> > +       vpextrq $1, %xmm1, %rsi
> > +       movl    %esi, %edi
> > +       shrq    $32, %rsi
> > +
> > +       vinsertf128 $1, 16(%rdi, %rax), %ymm5, %ymm5
> > +
> > +       vextracti128 $1, %ymm1, %xmm2
> > +       vmovq   %xmm2, %rdx
> > +       movl    %edx, %ecx
> > +       shrq    $32, %rdx
> > +
> > +       vmovupd (%rcx, %rax), %xmm6
> > +
> > +       vpextrq $1, %xmm2, %r10
> > +       movl    %r10d, %r11d
> > +       shrq    $32, %r10
> > +
> > +       vinsertf128 $1, (%r11, %rax), %ymm6, %ymm6
> > +
> > +       vmovupd 16(%r8, %rax), %xmm1
> > +       vinsertf128 $1, 16(%rsi, %rax), %ymm1, %ymm1
> > +       vmovupd (%rdx, %rax), %xmm3
> > +       vinsertf128 $1, (%r10, %rax), %ymm3, %ymm3
> > +
> > +       vunpcklpd %ymm3, %ymm6, %ymm7
> >         vunpckhpd %ymm3, %ymm6, %ymm6
> > -       vunpcklpd %ymm14, %ymm5, %ymm3
> > -       vunpckhpd %ymm14, %ymm5, %ymm2
> > -       vmovupd (%rcx, %r10), %xmm13
> > -       vcvtps2pd %xmm10, %ymm5
> > -       vextractf128 $1, %ymm10, %xmm10
> > -       vfmadd213pd %ymm3, %ymm5, %ymm2
> > -       vinsertf128 $1, -16(%rsi, %r10), %ymm4, %ymm0
> > -       vmovupd (%rdx, %r10), %xmm4
> > -       vunpcklpd %ymm0, %ymm15, %ymm9
> > -       vunpckhpd %ymm0, %ymm15, %ymm7
> > -       vfmadd213pd %ymm7, %ymm5, %ymm2
> > -       vfmadd213pd %ymm9, %ymm5, %ymm2
> > -       vinsertf128 $1, (%r12, %r10), %ymm4, %ymm0
> > -       vcvtps2pd %xmm10, %ymm4
> > -       vinsertf128 $1, (%rax, %r10), %ymm13, %ymm15
> > -       vunpcklpd %ymm0, %ymm15, %ymm1
> > -       vunpckhpd %ymm0, %ymm15, %ymm0
> > -       vfmadd213pd %ymm1, %ymm4, %ymm0
> > -       vcvtpd2ps %ymm2, %xmm1
> > -       vfmadd213pd %ymm6, %ymm4, %ymm0
> > -       vfmadd213pd %ymm8, %ymm4, %ymm0
> > -       vcvtpd2ps %ymm0, %xmm0
> > -       vinsertf128 $1, %xmm0, %ymm1, %ymm2
> > -       vorps   %ymm11, %ymm2, %ymm0
> > -       testl   %r11d, %r11d
> >
> > -       /* Go to special inputs processing branch */
> > -       jne     L(SPECIAL_VALUES_BRANCH)
> > -       # LOE rbx r13 r14 r15 r11d ymm0 ymm12
> > +       vunpcklpd %ymm1, %ymm5, %ymm3
> > +       vunpckhpd %ymm1, %ymm5, %ymm1
> >
> > -       /* Restore registers
> > -        * and exit the function
> > -        */
> > +       vmovaps TANHF_DATA(_sAbsMask)(%rip), %ymm11
> > +       /* Store special cases in ymm15.  */
> > +       vpcmpgtd TANHF_DATA(_iExpMask)(%rip), %ymm4, %ymm15
> >
> > -L(EXIT):
> > -       addq    $120, %rsp
> > -       cfi_restore(12)
> > -       popq    %r12
> > -       movq    %rbp, %rsp
> > -       popq    %rbp
> > -       cfi_def_cfa(7, 8)
> > -       cfi_restore(6)
> > -       ret
> > -       cfi_def_cfa(6, 16)
> > -       cfi_offset(6, -16)
> > -       /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus)  */
> > -       .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
> > +       vandps  %ymm11, %ymm0, %ymm4
> >
> > -       /* Branch to process
> > -        * special inputs
> > -        */
> > +       vcvtps2pd %xmm4, %ymm5
> >
> > -L(SPECIAL_VALUES_BRANCH):
> > -       vmovups %ymm12, 32(%rsp)
> > -       vmovups %ymm0, 64(%rsp)
> > -       # LOE rbx r13 r14 r15 r11d ymm0
> > +       vextractf128 $1, %ymm4, %xmm4
> > +       vcvtps2pd %xmm4, %ymm4
> >
> > -       xorl    %r12d, %r12d
> > -       # LOE rbx r13 r14 r15 r11d r12d
> > +       vmovupd 16(%rcx, %rax), %xmm2
> > +       vinsertf128 $1, 16(%r11, %rax), %ymm2, %ymm2
> >
> > -       vzeroupper
> > -       movq    %r13, 8(%rsp)
> > -       /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
> > -       .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
> > -       movl    %r11d, %r13d
> > -       movq    %r14, (%rsp)
> > -       /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
> > -       .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
> > -       # LOE rbx r15 r12d r13d
> > -
> > -       /* Range mask
> > -        * bits check
> > -        */
> > +       vfmadd213pd %ymm3, %ymm5, %ymm1
> > +
> > +       vmovupd 16(%rdx, %rax), %xmm3
> > +       vinsertf128 $1, 16(%r10, %rax), %ymm3, %ymm3
> > +
> > +       vunpcklpd %ymm3, %ymm2, %ymm10
> > +       vunpckhpd %ymm3, %ymm2, %ymm2
> > +
> > +       vfmadd213pd %ymm10, %ymm4, %ymm2
> > +       vfmadd213pd %ymm6, %ymm4, %ymm2
> > +       vfmadd213pd %ymm7, %ymm4, %ymm2
> > +       vcvtpd2ps %ymm2, %xmm2
> > +
> > +       vmovupd (%r9, %rax), %xmm7
> > +       vinsertf128 $1, (%rdi, %rax), %ymm7, %ymm7
> > +
> > +       vmovupd (%r8, %rax), %xmm3
> > +       vinsertf128 $1, (%rsi, %rax), %ymm3, %ymm3
> > +
> > +       vunpckhpd %ymm3, %ymm7, %ymm4
> > +       vunpcklpd %ymm3, %ymm7, %ymm7
> >
> > -L(RANGEMASK_CHECK):
> > -       btl     %r12d, %r13d
> > +       vfmadd213pd %ymm4, %ymm5, %ymm1
> > +       vfmadd213pd %ymm7, %ymm5, %ymm1
> > +
> > +
> > +       vcvtpd2ps %ymm1, %xmm1
> > +       vinsertf128 $1, %xmm2, %ymm1, %ymm1
> > +
> > +       vmovmskps %ymm15, %edx
> > +       vandnps %ymm0, %ymm11, %ymm2
> > +       testl   %edx, %edx
> > +       /* Go to special inputs processing branch */
> > +       jne     L(SPECIAL_VALUES_BRANCH)
> > +       # LOE rbx r12 r13 r14 r15 ymm0 ymm1 ymm2
> > +       /* Wait until after branch to write over ymm0.  */
> > +       vorps   %ymm2, %ymm1, %ymm0
> > +       /* No stack restoration on the fastpath.  */
> > +       ret
> >
> > -       /* Call scalar math function */
> > -       jc      L(SCALAR_MATH_CALL)
> > -       # LOE rbx r15 r12d r13d
> >
> > -       /* Special inputs
> > -        * processing loop
> > +       /* Cold case. edx has 1s where there was a special value that
> > +          needs to be handled by a tanhf call. Optimize for code size
> > +          moreso than speed here. */
>                more so

Fixed in v2 (along with other files)
> > +L(SPECIAL_VALUES_BRANCH):
> > +       # LOE rbx rdx r12 r13 r14 r15 ymm0 ymm1 ymm2
> > +    /* Use r13 to save/restore the stack. This allows us to use rbp as
> > +       callee save register saving code size. */
> > +       pushq   %r13
> > +       cfi_adjust_cfa_offset(8)
> > +       cfi_offset(r13, -16)
> > +       /* Need callee-saved registers to preserve state across tanhf calls.
> >          */
> > +       pushq   %rbx
> > +       cfi_adjust_cfa_offset(8)
> > +       cfi_offset(rbx, -24)
> > +       pushq   %rbp
> > +       cfi_adjust_cfa_offset(8)
> > +       cfi_offset(rbp, -32)
> > +       movq    %rsp, %r13
> > +       cfi_def_cfa_register(r13)
> > +
> > +       /* Align stack and make room for 2x ymm vectors.  */
> > +       andq    $-32, %rsp
> > +       addq    $-64, %rsp
> > +
> > +       /* Save all already computed inputs.  */
> > +       vorps   %ymm2, %ymm1, %ymm1
> > +       vmovaps %ymm1, (%rsp)
> > +       /* Save origional input (ymm0 unchanged up to this point).  */
>                          original

Fixed in V2 (along with other files)
> > +       vmovaps %ymm0, 32(%rsp)
> > +
> > +       vzeroupper
> >
> > +       /* edx has 1s where there was a special value that needs to be handled
> > +          by a tanhf call.  */
> > +       movl    %edx, %ebx
> >  L(SPECIAL_VALUES_LOOP):
> > -       incl    %r12d
> > -       cmpl    $8, %r12d
> > -
> > -       /* Check bits in range mask */
> > -       jl      L(RANGEMASK_CHECK)
> > -       # LOE rbx r15 r12d r13d
> > -
> > -       movq    8(%rsp), %r13
> > -       cfi_restore(13)
> > -       movq    (%rsp), %r14
> > -       cfi_restore(14)
> > -       vmovups 64(%rsp), %ymm0
> > -
> > -       /* Go to exit */
> > -       jmp     L(EXIT)
> > -       /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
> > -       .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
> > -       /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
> > -       .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
> > -       # LOE rbx r13 r14 r15 ymm0
> > -
> > -       /* Scalar math fucntion call
> > -        * to process special input
> > -        */
> > +       # LOE rbx rbp r12 r13 r14 r15
> > +       /* use rbp as index for special value that is saved across calls to
> > +          tanhf. We technically don't need a callee save register here as offset
> > +          to rsp is always [0, 28] so we can restore rsp by realigning to 32.
> > +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> > +          in the loop. Realigning also costs more code size.  */
> > +       xorl    %ebp, %ebp
> > +       tzcntl  %ebx, %ebp
> >
> > -L(SCALAR_MATH_CALL):
> > -       movl    %r12d, %r14d
> > -       movss   32(%rsp, %r14, 4), %xmm0
> > +       /* Scalar math function call to process special input.  */
> > +       movss   32(%rsp, %rbp, 4), %xmm0
> >         call    tanhf@PLT
> > -       # LOE rbx r14 r15 r12d r13d xmm0
> >
> > -       movss   %xmm0, 64(%rsp, %r14, 4)
> > +       /* No good way to avoid the store-forwarding fault this will cause on
> > +          return. `lfence` avoids the SF fault but at greater cost as it
> > +          serializes stack/callee save restoration.  */
> > +       movss   %xmm0, (%rsp, %rbp, 4)
> > +
> > +       blsrl   %ebx, %ebx
> > +       jnz     L(SPECIAL_VALUES_LOOP)
> > +       # LOE r12 r13 r14 r15
> >
> > -       /* Process special inputs in loop */
> > -       jmp     L(SPECIAL_VALUES_LOOP)
> > -       # LOE rbx r15 r12d r13d
> > -END(_ZGVdN8v_tanhf_avx2)
> >
> > -       .section .rodata, "a"
> > -       .align  32
> > -
> > -#ifdef __svml_stanh_data_internal_typedef
> > -typedef unsigned int VUINT32;
> > -typedef struct {
> > -       __declspec(align(32)) VUINT32 _dbP[(134*4)][2];
> > -       __declspec(align(32)) VUINT32 _sSignMask[8][1];
> > -       __declspec(align(32)) VUINT32 _sAbsMask[8][1];
> > -       __declspec(align(32)) VUINT32 _iExpMantMask[8][1];
> > -       __declspec(align(32)) VUINT32 _iExpMask[8][1];
> > -       __declspec(align(32)) VUINT32 _iMinIdxOfsMask[8][1];
> > -       __declspec(align(32)) VUINT32 _iMaxIdxMask[8][1];
> > -} __svml_stanh_data_internal;
> > -#endif
> > -__svml_stanh_data_internal:
> > -       /* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500]. */
> > -       .quad   0x0000000000000000 /* A00 = +0.000000000000000000000e-01 */
> > -       .quad   0x3FF00000022C70EB /* A01 = +1.000000008097283510367e+00 */
> > -       .quad   0xBED00E878CFFA194 /* A02 = -3.828228912518614443549e-06 */
> > -       .quad   0xBFD551766D0607A9 /* A03 = -3.330970825846813476723e-01 */
> > -       .quad   0xBE53D60CE3E4C297 /* A00 = -1.847383956330407336230e-08 */
> > -       .quad   0x3FF000024177CF5C /* A01 = +1.000002151235967140508e+00 */
> > -       .quad   0xBF1758BC94A51A25 /* A02 = -8.906031613262943753568e-05 */
> > -       .quad   0xBFD53EAE67E0D4F0 /* A03 = -3.319507612644221339337e-01 */
> > -       .quad   0xBE5A9E47EF32D6FE /* A00 = -2.479020984039698285657e-08 */
> > -       .quad   0x3FF00002DA983057 /* A01 = +1.000002721676556793895e+00 */
> > -       .quad   0xBF1BD953509E94AA /* A02 = -1.062352277175377670507e-04 */
> > -       .quad   0xBFD53BDB562EEDD5 /* A03 = -3.317783681520414806876e-01 */
> > -       .quad   0xBE6191BBE496D294 /* A00 = -3.272532162914017685901e-08 */
> > -       .quad   0x3FF0000390492017 /* A01 = +1.000003398528866105366e+00 */
> > -       .quad   0xBF20727E814A57CE /* A02 = -1.254825043772153972919e-04 */
> > -       .quad   0xBFD538DE060A6F22 /* A03 = -3.315959033004550748913e-01 */
> > -       .quad   0xBE66DAFA2A893A25 /* A00 = -4.257146219278012568149e-08 */
> > -       .quad   0x3FF0000465E08CD1 /* A01 = +1.000004194219219266770e+00 */
> > -       .quad   0xBF2341C765EF91B6 /* A02 = -1.469188600530365522261e-04 */
> > -       .quad   0xBFD535B6841FAF9E /* A03 = -3.314033785124993469751e-01 */
> > -       .quad   0xBE6D5794E361E964 /* A00 = -5.465394929765249413434e-08 */
> > -       .quad   0x3FF000055EE2A0CB /* A01 = +1.000005121846742950353e+00 */
> > -       .quad   0xBF265E6C77E66C8B /* A02 = -1.706607253709506650304e-04 */
> > -       .quad   0xBFD53264DDCCEDA6 /* A03 = -3.312008062382240103361e-01 */
> > -       .quad   0xBE729C844D374A6E /* A00 = -6.933284462462096107184e-08 */
> > -       .quad   0x3FF000067F019093 /* A01 = +1.000006195180536350264e+00 */
> > -       .quad   0xBF29CC5348D6DCE5 /* A02 = -1.968242326435338705130e-04 */
> > -       .quad   0xBFD52EE92121ED35 /* A03 = -3.309881995734998416658e-01 */
> > -       .quad   0xBE775AEA17EAA872 /* A00 = -8.700465590574974405858e-08 */
> > -       .quad   0x3FF00007CA1D66B8 /* A01 = +1.000007428656699559610e+00 */
> > -       .quad   0xBF2D8F5EB98A2637 /* A02 = -2.255252009216044881395e-04 */
> > -       .quad   0xBFD52B435CDF9128 /* A03 = -3.307655722585587376727e-01 */
> > -       .quad   0xBE7D04DA28C343F0 /* A00 = -1.081040272327705484794e-07 */
> > -       .quad   0x3FF000094443CCF5 /* A01 = +1.000008837375216730337e+00 */
> > -       .quad   0xBF30D5B76C947AE5 /* A02 = -2.568791210978817814332e-04 */
> > -       .quad   0xBFD52773A0776FAD /* A03 = -3.305329386764651045105e-01 */
> > -       .quad   0xBE81DD77A12C51C7 /* A00 = -1.331054169875768625701e-07 */
> > -       .quad   0x3FF0000AF1AFD2DA /* A01 = +1.000010437096696680470e+00 */
> > -       .quad   0xBF331230624C1680 /* A02 = -2.910011410651516805537e-04 */
> > -       .quad   0xBFD52379FC0B61DF /* A03 = -3.302903138515186909352e-01 */
> > -       .quad   0xBE85D04EEEB3C435 /* A00 = -1.625247628488202841012e-07 */
> > -       .quad   0x3FF0000CD6C9B1F2 /* A01 = +1.000012244238970726684e+00 */
> > -       .quad   0xBF357F0742FADDD4 /* A02 = -3.280060509313874068243e-04 */
> > -       .quad   0xBFD51F56806D0E81 /* A03 = -3.300377134475880880338e-01 */
> > -       .quad   0xBE8A6E289B59681B /* A00 = -1.969211333326924655065e-07 */
> > -       .quad   0x3FF0000EF8268F72 /* A01 = +1.000014275873550406715e+00 */
> > -       .quad   0xBF381E277A1B747A /* A02 = -3.680082682942575423093e-04 */
> > -       .quad   0xBFD51B093F1D6FD4 /* A03 = -3.297751537663746734808e-01 */
> > -       .quad   0xBE8FCBC40EE9ABD5 /* A00 = -2.368983653301529373887e-07 */
> > -       .quad   0x3FF000115A883B6C /* A01 = +1.000016549721943981410e+00 */
> > -       .quad   0xBF3AF17AC974B3D9 /* A02 = -4.111218235774406434303e-04 */
> > -       .quad   0xBFD516924A4C549C /* A03 = -3.295026517456081105450e-01 */
> > -       .quad   0xBE92FFBC60A3F956 /* A00 = -2.831066871072026054144e-07 */
> > -       .quad   0x3FF0001402DCED8A /* A01 = +1.000019084151832604590e+00 */
> > -       .quad   0xBF3DFAE9390C4801 /* A02 = -4.574603454311488280083e-04 */
> > -       .quad   0xBFD511F1B4D7DC3A /* A03 = -3.292202249571719585575e-01 */
> > -       .quad   0xBE9690A22F96D5AD /* A00 = -3.362443262393081632612e-07 */
> > -       .quad   0x3FF00016F63EFF5D /* A01 = +1.000021898173108825247e+00 */
> > -       .quad   0xBF409E2C839605BB /* A02 = -5.071370461992499986334e-04 */
> > -       .quad   0xBFD50D27924BEE00 /* A03 = -3.289278916051614487515e-01 */
> > -       .quad   0xBE9AA56C65E72A73 /* A00 = -3.970591019557469835586e-07 */
> > -       .quad   0x3FF0001A39F4A43E /* A01 = +1.000025011433776978009e+00 */
> > -       .quad   0xBF425BD74C3D6667 /* A02 = -5.602647074553602319844e-04 */
> > -       .quad   0xBFD50833F6E1ABA2 /* A03 = -3.286256705238718156536e-01 */
> > -       .quad   0xBE9F4BD4FF1A83B0 /* A00 = -4.663500013744687071912e-07 */
> > -       .quad   0x3FF0001DD36F9EC2 /* A01 = +1.000028444215715683896e+00 */
> > -       .quad   0xBF44376634149405 /* A02 = -6.169556656102642569831e-04 */
> > -       .quad   0xBFD50316F77EDEE5 /* A03 = -3.283135811757190158922e-01 */
> > -       .quad   0xBEA3B625387BB079 /* A00 = -5.874486399249461304297e-07 */
> > -       .quad   0x3FF00023E14CFBA9 /* A01 = +1.000034217911642153709e+00 */
> > -       .quad   0xBF47392F923218D2 /* A02 = -7.087213783883111826306e-04 */
> > -       .quad   0xBFD4FB1FACDEB938 /* A03 = -3.278273761924483942209e-01 */
> > -       .quad   0xBEAA6E24F543500A /* A00 = -7.876828740601738750574e-07 */
> > -       .quad   0x3FF0002D5C6E8412 /* A01 = +1.000043259679163742959e+00 */
> > -       .quad   0xBF4BAF02BD7FDD70 /* A02 = -8.448375110664940040861e-04 */
> > -       .quad   0xBFD4EFEE6527A7DE /* A03 = -3.271442401734229177279e-01 */
> > -       .quad   0xBEB16E3EBE2157D0 /* A00 = -1.038947396133402500647e-06 */
> > -       .quad   0x3FF00038990FEE2F /* A01 = +1.000053975962952312884e+00 */
> > -       .quad   0xBF50569481C574CB /* A02 = -9.972048056490652716971e-04 */
> > -       .quad   0xBFD4E419278DA2B4 /* A03 = -3.264220129263251113372e-01 */
> > -       .quad   0xBEB6A7B6723165D4 /* A00 = -1.350350836279403750524e-06 */
> > -       .quad   0x3FF00045CAB4158E /* A01 = +1.000066558657042303793e+00 */
> > -       .quad   0xBF531D7C9C849108 /* A02 = -1.166698160951775212202e-03 */
> > -       .quad   0xBFD4D7A0BB33B152 /* A03 = -3.256608799117844954552e-01 */
> > -       .quad   0xBEBD0EE2A8654AFD /* A00 = -1.732000471561702711532e-06 */
> > -       .quad   0x3FF00055276F18D6 /* A01 = +1.000081209219890521211e+00 */
> > -       .quad   0xBF562FDBA3FB6C6C /* A02 = -1.354183666925102939860e-03 */
> > -       .quad   0xBFD4CA85F1B93DB2 /* A03 = -3.248610363561638125773e-01 */
> > -       .quad   0xBEC269D4036A207E /* A00 = -2.195047297096822741730e-06 */
> > -       .quad   0x3FF00066E7DA6E4E /* A01 = +1.000098138500919997540e+00 */
> > -       .quad   0xBF5991499FC36B3A /* A02 = -1.560518167983372759405e-03 */
> > -       .quad   0xBFD4BCC9A72283D6 /* A03 = -3.240226871658341556426e-01 */
> > -       .quad   0xBEC7154B6C09CFE1 /* A00 = -2.751729738565190291276e-06 */
> > -       .quad   0x3FF0007B47086B80 /* A01 = +1.000117566559055148900e+00 */
> > -       .quad   0xBF5D455433B4F8F4 /* A02 = -1.786548832412968197680e-03 */
> > -       .quad   0xBFD4AE6CC1BFE145 /* A03 = -3.231460468373550942722e-01 */
> > -       .quad   0xBECCA68CC64A0F8A /* A00 = -3.415415948561670285790e-06 */
> > -       .quad   0x3FF00092827742F7 /* A01 = +1.000139722473418535387e+00 */
> > -       .quad   0xBF60A7BF15A527AF /* A02 = -2.033112728132522705610e-03 */
> > -       .quad   0xBFD49F703214084C /* A03 = -3.222313393636155876010e-01 */
> > -       .quad   0xBED19E68676B241B /* A00 = -4.200644630977303616698e-06 */
> > -       .quad   0x3FF000ACDA037B26 /* A01 = +1.000164844146362863597e+00 */
> > -       .quad   0xBF62D99F836A02F8 /* A02 = -2.301036405072284102280e-03 */
> > -       .quad   0xBFD48FD4F2B91B28 /* A03 = -3.212787981359945810311e-01 */
> > -       .quad   0xBED57CF4B0C7AA54 /* A00 = -5.123164339408145209103e-06 */
> > -       .quad   0x3FF000CA8FD9E1A1 /* A01 = +1.000193178099017865534e+00 */
> > -       .quad   0xBF653A014548E686 /* A02 = -2.591135484433962181405e-03 */
> > -       .quad   0xBFD47F9C0844B38F /* A03 = -3.202886658426046806447e-01 */
> > -       .quad   0xBEDA012B1B1A41E2 /* A00 = -6.199971197454598722328e-06 */
> > -       .quad   0x3FF000EBE868FDF4 /* A01 = +1.000224979259539459520e+00 */
> > -       .quad   0xBF67CA9427E0A544 /* A02 = -2.904214255086275467410e-03 */
> > -       .quad   0xBFD46EC6812ADB37 /* A03 = -3.192611943626845749655e-01 */
> > -       .quad   0xBEDF3EAC5BF12194 /* A00 = -7.449344990702664567927e-06 */
> > -       .quad   0x3FF001112A520784 /* A01 = +1.000260510744255704196e+00 */
> > -       .quad   0xBF6A8D01ABDA4DC4 /* A02 = -3.241065277345108255891e-03 */
> > -       .quad   0xBFD45D55759FFA4A /* A03 = -3.181966446572103146551e-01 */
> > -       .quad   0xBEE2A541BC274267 /* A00 = -8.890883582164319970972e-06 */
> > -       .quad   0x3FF0013A9E5961F2 /* A01 = +1.000300043631906721231e+00 */
> > -       .quad   0xBF6D82ECD080C540 /* A02 = -3.602468994380686462264e-03 */
> > -       .quad   0xBFD44B4A0779C0AD /* A03 = -3.170952866557950611259e-01 */
> > -       .quad   0xBEE61D97609A27F4 /* A00 = -1.054553560499505625520e-05 */
> > -       .quad   0x3FF001688F56A3AF /* A01 = +1.000343856731187974773e+00 */
> > -       .quad   0xBF7056F8EFB683EC /* A02 = -3.989193351487490407647e-03 */
> > -       .quad   0xBFD438A5620F0F74 /* A03 = -3.159573991399533543500e-01 */
> > -       .quad   0xBEEA145429EDD370 /* A00 = -1.243563138839952927732e-05 */
> > -       .quad   0x3FF0019B4A242A67 /* A01 = +1.000392236341804297339e+00 */
> > -       .quad   0xBF7207D31CA78D9B /* A02 = -4.401993423445739288258e-03 */
> > -       .quad   0xBFD42568BA16E7CD /* A03 = -3.147832696228050619602e-01 */
> > -       .quad   0xBEEE96370D52680F /* A00 = -1.458491207477835326165e-05 */
> > -       .quad   0x3FF001D31D8E4115 /* A01 = +1.000445476009251821736e+00 */
> > -       .quad   0xBF73D4CC11EDC094 /* A02 = -4.841611050196221316400e-03 */
> > -       .quad   0xBFD411954D8664E7 /* A03 = -3.135731942252974469021e-01 */
> > -       .quad   0xBEF338C046215EF8 /* A00 = -1.833122622260562810219e-05 */
> > -       .quad   0x3FF00230C32C2EC1 /* A01 = +1.000534784691737621998e+00 */
> > -       .quad   0xBF76BD019BCC5DAF /* A02 = -5.551344188254799492943e-03 */
> > -       .quad   0xBFD3F2C7156DC21E /* A03 = -3.116929730668135389848e-01 */
> > -       .quad   0xBEF9B15EAE411EAE /* A00 = -2.450261207822986676092e-05 */
> > -       .quad   0x3FF002C2DF057A4D /* A01 = +1.000674124886830940184e+00 */
> > -       .quad   0xBF7B08CCD9AC1E30 /* A02 = -6.600189396301511801646e-03 */
> > -       .quad   0xBFD3C7A7A114FED8 /* A03 = -3.090609620157755976777e-01 */
> > -       .quad   0xBF00E36483C373B3 /* A00 = -3.221178528332122595812e-05 */
> > -       .quad   0x3FF0036F419480D7 /* A01 = +1.000838524028997644777e+00 */
> > -       .quad   0xBF7FD255D1777007 /* A02 = -7.768950679260206403087e-03 */
> > -       .quad   0xBFD39A453911D6CE /* A03 = -3.062909180947429588215e-01 */
> > -       .quad   0xBF05DFA04DD12059 /* A00 = -4.172046622180685472624e-05 */
> > -       .quad   0x3FF00438B2A03D8D /* A01 = +1.001030633695197069599e+00 */
> > -       .quad   0xBF828F8DBB4A9D10 /* A02 = -9.062869337255224921890e-03 */
> > -       .quad   0xBFD36AAB704697D9 /* A03 = -3.033856007044711255993e-01 */
> > -       .quad   0xBF0BF3E0C647DEFB /* A00 = -5.331544597092331081714e-05 */
> > -       .quad   0x3FF005221063D36D /* A01 = +1.001253189109060359741e+00 */
> > -       .quad   0xBF857A2CB3C96102 /* A02 = -1.048693584122917590862e-02 */
> > -       .quad   0xBFD338E65BBB4FEC /* A03 = -3.003478904549854444639e-01 */
> > -       .quad   0xBF11A506ED7C9D31 /* A00 = -6.730894835681591541979e-05 */
> > -       .quad   0x3FF0062E4D0EA92A /* A01 = +1.001508999829250345925e+00 */
> > -       .quad   0xBF88AB82C2761AF3 /* A02 = -1.204588085125866091241e-02 */
> > -       .quad   0xBFD305028D6BD206 /* A03 = -2.971807843271395688234e-01 */
> > -       .quad   0xBF1607C0922D9BF1 /* A00 = -8.403885708006799337092e-05 */
> > -       .quad   0x3FF007606C341961 /* A01 = +1.001800940198869449560e+00 */
> > -       .quad   0xBF8C25E6DA487BCF /* A02 = -1.374416688582682892494e-02 */
> > -       .quad   0xBFD2CF0D0EE8F7B5 /* A03 = -2.938873906713255768075e-01 */
> > -       .quad   0xBF1B3A8480A0A16D /* A00 = -1.038688061788578038307e-04 */
> > -       .quad   0x3FF008BB802D02D6 /* A01 = +1.002131939589323561535e+00 */
> > -       .quad   0xBF8FEB8AE99FD100 /* A02 = -1.558598065819483124983e-02 */
> > -       .quad   0xBFD297135BD0911B /* A03 = -2.904709240558688843059e-01 */
> > -       .quad   0xBF20ABB9BDB75C65 /* A00 = -1.271881327357976163798e-04 */
> > -       .quad   0x3FF00A42A76D8CD1 /* A01 = +1.002504972472525901495e+00 */
> > -       .quad   0xBF91FF3D752BB9E6 /* A02 = -1.757522609380570560722e-02 */
> > -       .quad   0xBFD25D235C1F88B4 /* A03 = -2.869346999779154305799e-01 */
> > -       .quad   0xBF243D3254425461 /* A00 = -1.544116913733432829448e-04 */
> > -       .quad   0x3FF00BF909D1795E /* A01 = +1.002923048355647051011e+00 */
> > -       .quad   0xBF94304E04D44942 /* A02 = -1.971551804042204897316e-02 */
> > -       .quad   0xBFD2214B5E61CFA6 /* A03 = -2.832821294498394371075e-01 */
> > -       .quad   0xBF286070011B61CE /* A00 = -1.859795307186510085994e-04 */
> > -       .quad   0x3FF00DE1D5E1627E /* A01 = +1.003389201612804537689e+00 */
> > -       .quad   0xBF9689D5F4163F59 /* A02 = -2.201017668045266231780e-02 */
> > -       .quad   0xBFD1E39A11C3B42C /* A03 = -2.795167134743816728104e-01 */
> > -       .quad   0xBF2D250B366A79E8 /* A00 = -2.223564326486314902259e-04 */
> > -       .quad   0x3FF010003E134001 /* A01 = +1.003906481248123094829e+00 */
> > -       .quad   0xBF990C9FF91F6F81 /* A02 = -2.446222265267250853271e-02 */
> > -       .quad   0xBFD1A41E80084CDC /* A03 = -2.756420374218586655246e-01 */
> > -       .quad   0xBF314DB5DDC2A30E /* A00 = -2.640313157465248123865e-04 */
> > -       .quad   0x3FF012577608921B /* A01 = +1.004477940624503018441e+00 */
> > -       .quad   0xBF9BB9626875B0C9 /* A02 = -2.707437288829409385849e-02 */
> > -       .quad   0xBFD162E80768A9D0 /* A03 = -2.716617653228725615122e-01 */
> > -       .quad   0xBF346A6133808864 /* A00 = -3.115165050094957730625e-04 */
> > -       .quad   0x3FF014EAAFCC88A3 /* A01 = +1.005106627192198898157e+00 */
> > -       .quad   0xBF9E90BEF9BF7419 /* A02 = -2.984903716411588595059e-02 */
> > -       .quad   0xBFD12006545F7FAD /* A03 = -2.675796340899932457269e-01 */
> > -       .quad   0xBF37F180DC3848EA /* A00 = -3.653468704395550778821e-04 */
> > -       .quad   0x3FF017BD19147861 /* A01 = +1.005795572250939295955e+00 */
> > -       .quad   0xBFA0C9A14C702E07 /* A02 = -3.278831537326359207851e-02 */
> > -       .quad   0xBFD0DB895B650092 /* A03 = -2.633994476818851682154e-01 */
> > -       .quad   0xBF3BEC6AAC6D7635 /* A00 = -4.260788377246944457107e-04 */
> > -       .quad   0x3FF01AD1D884E719 /* A01 = +1.006547780778822565040e+00 */
> > -       .quad   0xBFA260B2A1B1434A /* A02 = -3.589399551186163439542e-02 */
> > -       .quad   0xBFD09581529E93D6 /* A03 = -2.591250712233067465817e-01 */
> > -       .quad   0xBF4164E26167882B /* A00 = -5.308251737086202562063e-04 */
> > -       .quad   0x3FF01FEF14B62B81 /* A01 = +1.007796364693348545316e+00 */
> > -       .quad   0xBFA4EB014538AA42 /* A02 = -4.085544557559163403315e-02 */
> > -       .quad   0xBFD029D36FEAF41F /* A03 = -2.525528519580024222613e-01 */
> > -       .quad   0xBF46F6FFF4E53DC8 /* A00 = -7.008313930700277652464e-04 */
> > -       .quad   0x3FF027CBB51CBBA0 /* A01 = +1.009715754956893363214e+00 */
> > -       .quad   0xBFA89DEC9FEC112E /* A02 = -4.807986690687680864098e-02 */
> > -       .quad   0xBFCF2A99464D0DB4 /* A03 = -2.434875100390009317053e-01 */
> > -       .quad   0xBF4DCC9C4F66A4D9 /* A00 = -9.094012482836712945103e-04 */
> > -       .quad   0x3FF030E7CFCCD583 /* A01 = +1.011939822882909068014e+00 */
> > -       .quad   0xBFACAA3B95814081 /* A02 = -5.598627281199331645611e-02 */
> > -       .quad   0xBFCDF78F156BE7CF /* A03 = -2.341173987004467604844e-01 */
> > -       .quad   0xBF5308ED74E5C7A6 /* A00 = -1.161796466103906435435e-03 */
> > -       .quad   0x3FF03B5986412ECB /* A01 = +1.014489674026594512313e+00 */
> > -       .quad   0xBFB087EBA88DCC3F /* A02 = -6.457398285947223148806e-02 */
> > -       .quad   0xBFCCBB9BD134862F /* A03 = -2.244753619680052991736e-01 */
> > -       .quad   0xBF57FA23C00DF4B5 /* A00 = -1.463446533505758208674e-03 */
> > -       .quad   0x3FF0473558A1BCC0 /* A01 = +1.017384859292903342975e+00 */
> > -       .quad   0xBFB2E702BC6360EF /* A02 = -7.383744334527241048871e-02 */
> > -       .quad   0xBFCB77D546379288 /* A03 = -2.145945160729250122955e-01 */
> > -       .quad   0xBF5DD12971557F71 /* A00 = -1.819887610814388068450e-03 */
> > -       .quad   0x3FF0548DDF5000A8 /* A01 = +1.020643112482540360020e+00 */
> > -       .quad   0xBFB571B63DA186E1 /* A02 = -8.376635555898871710045e-02 */
> > -       .quad   0xBFCA2D5202605148 /* A03 = -2.045080672838912594358e-01 */
> > -       .quad   0xBF6252B1AD5D4F17 /* A00 = -2.236697221556737096709e-03 */
> > -       .quad   0x3FF063738A910BF7 /* A01 = +1.024280110622155737232e+00 */
> > -       .quad   0xBFB8270C8E6B601B /* A02 = -9.434584118878357184013e-02 */
> > -       .quad   0xBFC8DD27D950A07E /* A03 = -1.942491351230763441116e-01 */
> > -       .quad   0xBF66470C91730CFC /* A00 = -2.719425723258004842786e-03 */
> > -       .quad   0x3FF073F468FCF331 /* A01 = +1.028309259519300633556e+00 */
> > -       .quad   0xBFBB05C2952191E4 /* A02 = -1.055566419686964629854e-01 */
> > -       .quad   0xBFC7886A770DE2BD /* A03 = -1.838505822486435070662e-01 */
> > -       .quad   0xBF6AD114AC8E98EC /* A00 = -3.273525599485007861467e-03 */
> > -       .quad   0x3FF0861BF53E5226 /* A01 = +1.032741506559554434119e+00 */
> > -       .quad   0xBFBE0C4F9B461507 /* A02 = -1.173753503881763554650e-01 */
> > -       .quad   0xBFC6302A037CDE3A /* A03 = -1.733448521642786954722e-01 */
> > -       .quad   0xBF6FFBDE2A6C2AF8 /* A00 = -3.904279630096648551207e-03 */
> > -       .quad   0x3FF099F2EB8E7DA3 /* A01 = +1.037585182326304034106e+00 */
> > -       .quad   0xBFC09C74D192DDF0 /* A02 = -1.297746680554463516444e-01 */
> > -       .quad   0xBFC4D571D8E3079F /* A03 = -1.627638157861470424859e-01 */
> > -       .quad   0xBF72E8FDC0B952AA /* A00 = -4.616728994353872309042e-03 */
> > -       .quad   0x3FF0AF7F273C9533 /* A01 = +1.042845872181101141152e+00 */
> > -       .quad   0xBFC244C512736F10 /* A02 = -1.427236881344176033792e-01 */
> > -       .quad   0xBFC379474F58B902 /* A03 = -1.521386277613104298645e-01 */
> > -       .quad   0xBF762EABAF17395B /* A00 = -5.415602341101023557701e-03 */
> > -       .quad   0x3FF0C6C3886F63FB /* A01 = +1.048526318502125631582e+00 */
> > -       .quad   0xBFC3FDF9918EA12A /* A02 = -1.561881981590514389957e-01 */
> > -       .quad   0xBFC21CA89ECAB895 /* A03 = -1.414995932913753196036e-01 */
> > -       .quad   0xBF79D387CE5B2BAE /* A00 = -6.305246822828998107258e-03 */
> > -       .quad   0x3FF0DFBFE2346376 /* A01 = +1.054626353847394337748e+00 */
> > -       .quad   0xBFC5C6DA43602620 /* A02 = -1.701309994680721970894e-01 */
> > -       .quad   0xBFC0C08BD8DB6631 /* A03 = -1.308760460731704100557e-01 */
> > -       .quad   0xBF7DDBA8E8DA9060 /* A00 = -7.289562037531366334164e-03 */
> > -       .quad   0x3FF0FA70F0D1B464 /* A01 = +1.061142864894713433443e+00 */
> > -       .quad   0xBFC79E18D92BAA7C /* A02 = -1.845122394946264732241e-01 */
> > -       .quad   0xBFBECBBBF74C2669 /* A03 = -1.202962378266875381749e-01 */
> > -       .quad   0xBF81254E76EA25DA /* A00 = -8.371937755572145950511e-03 */
> > -       .quad   0x3FF116D05835EBD0 /* A01 = +1.068069786618014660462e+00 */
> > -       .quad   0xBFC982539E2ED224 /* A02 = -1.992897531869327609755e-01 */
> > -       .quad   0xBFBC1B043C350159 /* A03 = -1.097872397413132278254e-01 */
> > -       .quad   0xBF8391ACBA863403 /* A00 = -9.555196230190082448686e-03 */
> > -       .quad   0x3FF134D4AA477FE2 /* A01 = +1.075398125794884141015e+00 */
> > -       .quad   0xBFCB7218609FEAFB /* A02 = -2.144194099235717521079e-01 */
> > -       .quad   0xBFB970A16CB88329 /* A03 = -9.937485603633135211599e-02 */
> > -       .quad   0xBF87935088E48E8B /* A00 = -1.151144902957603431692e-02 */
> > -       .quad   0x3FF1649892AD7DD3 /* A01 = +1.087059567413110938716e+00 */
> > -       .quad   0xBFCE6971DDE75409 /* A02 = -2.375929196847723912089e-01 */
> > -       .quad   0xBFB58291E88CB251 /* A03 = -8.402358939628952472223e-02 */
> > -       .quad   0xBF8DB3A62C325325 /* A00 = -1.450280973794233242702e-02 */
> > -       .quad   0x3FF1A9C900C6DEEA /* A01 = +1.103951457056548068891e+00 */
> > -       .quad   0xBFD13DBC65B0E08E /* A02 = -2.693930619311765140012e-01 */
> > -       .quad   0xBFB06696F62696D1 /* A03 = -6.406539449252625362252e-02 */
> > -       .quad   0xBF92583699F2E27A /* A00 = -1.791463198307716858659e-02 */
> > -       .quad   0x3FF1F451B85AA9F0 /* A01 = +1.122148246892376022288e+00 */
> > -       .quad   0xBFD34FD5F8288180 /* A02 = -3.017477916164565954205e-01 */
> > -       .quad   0xBFA6FB692825B683 /* A03 = -4.488686194495718900788e-02 */
> > -       .quad   0xBF9641C26E673D6F /* A00 = -2.173522757385398448959e-02 */
> > -       .quad   0x3FF24364DA5E2B07 /* A01 = +1.141453602790251542487e+00 */
> > -       .quad   0xBFD564A5A5EF5890 /* A02 = -3.342680092295120530821e-01 */
> > -       .quad   0xBF9B43712011A982 /* A03 = -2.662445791467283467968e-02 */
> > -       .quad   0xBF9A901038EC2F39 /* A00 = -2.594018313816024226548e-02 */
> > -       .quad   0x3FF2961356DFFEBA /* A01 = +1.161639537196534011088e+00 */
> > -       .quad   0xBFD775EBB17198C7 /* A02 = -3.665723069046972759644e-01 */
> > -       .quad   0xBF833B1A926CD462 /* A03 = -9.390075295963199591975e-03 */
> > -       .quad   0xBF9F396A6A461B91 /* A00 = -3.049246095317987084727e-02 */
> > -       .quad   0x3FF2EB53BAEF534B /* A01 = +1.182452898229899629357e+00 */
> > -       .quad   0xBFD97DABF8AD8BBD /* A02 = -3.982953957076310058660e-01 */
> > -       .quad   0x3F7B8F6A3E0F8837 /* A03 = +6.728568086119371925713e-03 */
> > -       .quad   0xBFA21878590F8BAA /* A00 = -3.534294211546946951064e-02 */
> > -       .quad   0x3FF34209790236E1 /* A01 = +1.203622315111197105253e+00 */
> > -       .quad   0xBFDB764C0E71BECB /* A02 = -4.290952817018306997277e-01 */
> > -       .quad   0x3F962FE0C03F84C0 /* A03 = +2.166701482190513949888e-02 */
> > -       .quad   0xBFA4B36B9AD27ECC /* A00 = -4.043136849327097492868e-02 */
> > -       .quad   0x3FF3990C5B12FC16 /* A01 = +1.224865298994477935679e+00 */
> > -       .quad   0xBFDD5AABB0D01390 /* A02 = -4.586590983092770912322e-01 */
> > -       .quad   0x3FA21DAF5CA162DB /* A03 = +3.538272863142363083844e-02 */
> > -       .quad   0xBFA7645E4D7BF28B /* A00 = -4.568762489177399105378e-02 */
> > -       .quad   0x3FF3EF2FD51C0D9F /* A01 = +1.245895225962932562069e+00 */
> > -       .quad   0xBFDF26377E1B686E /* A02 = -4.867075664057044503963e-01 */
> > -       .quad   0x3FA8803E756EE812 /* A03 = +4.785342391501513914509e-02 */
> > -       .quad   0xBFAA210925C64413 /* A00 = -5.103329263796054643398e-02 */
> > -       .quad   0x3FF44349F897D8E7 /* A01 = +1.266427966181760345066e+00 */
> > -       .quad   0xBFE06A7B02C6D8E2 /* A02 = -5.129981092675530707226e-01 */
> > -       .quad   0x3FAE3F194734F5D0 /* A03 = +5.907515520309980505687e-02 */
> > -       .quad   0xBFACDE48F8A19BBB /* A00 = -5.638340029764018351832e-02 */
> > -       .quad   0x3FF49439D5466582 /* A01 = +1.286187966447272845727e+00 */
> > -       .quad   0xBFE131C7C1063DDC /* A02 = -5.373266954429101183166e-01 */
> > -       .quad   0x3FB1ADEEC36AD805 /* A03 = +6.906025191241844940482e-02 */
> > -       .quad   0xBFAF905D8F585680 /* A00 = -6.164829611604449866036e-02 */
> > -       .quad   0x3FF4E0ED1FD27F99 /* A01 = +1.304913639360142818546e+00 */
> > -       .quad   0xBFE1E7A859DC1D3D /* A02 = -5.595285182070380836095e-01 */
> > -       .quad   0x3FB3ED018E4642A1 /* A03 = +7.783517573831001679086e-02 */
> > -       .quad   0xBFB11595104160BA /* A00 = -6.673556944713512906198e-02 */
> > -       .quad   0x3FF528650340490B /* A01 = +1.322361958217302513319e+00 */
> > -       .quad   0xBFE28B14B40BC974 /* A02 = -5.794776455425521000109e-01 */
> > -       .quad   0x3FB5DF49F5BAF6D7 /* A03 = +8.543836831355676453281e-02 */
> > -       .quad   0xBFB2513A97344BA4 /* A00 = -7.155195418844911836587e-02 */
> > -       .quad   0x3FF569BA0DB5EE14 /* A01 = +1.338312200124055273420e+00 */
> > -       .quad   0xBFE31B53A8B67B20 /* A02 = -5.970857901737396389308e-01 */
> > -       .quad   0x3FB787F297BB0544 /* A03 = +9.191814617499455275507e-02 */
> > -       .quad   0xBFB37512E848FAFA /* A00 = -7.600515528700305112331e-02 */
> > -       .quad   0x3FF5A41F33B403C8 /* A01 = +1.352568819013173495591e+00 */
> > -       .quad   0xBFE397F6EA9A58A5 /* A02 = -6.123003561103997904880e-01 */
> > -       .quad   0x3FB8EAA9FF25CA06 /* A03 = +9.733068923177520814782e-02 */
> > -       .quad   0xBFB47B3E603AFC5D /* A00 = -8.000554894805263217439e-02 */
> > -       .quad   0x3FF5D6E3EDE40487 /* A01 = +1.364963464031718975988e+00 */
> > -       .quad   0xBFE400D5BCA6D631 /* A02 = -6.251019177058819709103e-01 */
> > -       .quad   0x3FBA0B830ED567FE /* A03 = +1.017381583418739132707e-01 */
> > -       .quad   0xBFB5BBFE8AC90496 /* A00 = -8.489981544791400103200e-02 */
> > -       .quad   0x3FF612BA70107E95 /* A01 = +1.379572332145390989311e+00 */
> > -       .quad   0xBFE477EAF1FA7693 /* A02 = -6.396383978023599814478e-01 */
> > -       .quad   0x3FBB4784B7C08A95 /* A03 = +1.065600346196709652391e-01 */
> > -       .quad   0xBFB6D5D940743939 /* A00 = -8.920057128509463473254e-02 */
> > -       .quad   0x3FF644A8748F70CE /* A01 = +1.391762214006166953340e+00 */
> > -       .quad   0xBFE4D646AB07EA37 /* A02 = -6.511567440459832267763e-01 */
> > -       .quad   0x3FBC354F4E1D5292 /* A03 = +1.101884427747086558913e-01 */
> > -       .quad   0xBFB7223D19E4F3D1 /* A00 = -9.036619074045339206069e-02 */
> > -       .quad   0x3FF6518FEB42B7FA /* A01 = +1.394912642466350494175e+00 */
> > -       .quad   0xBFE4ED86CB87498C /* A02 = -6.539949393430091184598e-01 */
> > -       .quad   0x3FBC6D29F28CCA9B /* A03 = +1.110407082713131127205e-01 */
> > -       .quad   0xBFB6878652FF6312 /* A00 = -8.800544287022329936754e-02 */
> > -       .quad   0x3FF63948C302D040 /* A01 = +1.388985406648330922508e+00 */
> > -       .quad   0xBFE4C4E2E7904E17 /* A02 = -6.490339777687407218920e-01 */
> > -       .quad   0x3FBC127356CA1ABE /* A03 = +1.096565329445224612481e-01 */
> > -       .quad   0xBFB4F5D18B0C91D6 /* A00 = -8.187589306596207427980e-02 */
> > -       .quad   0x3FF5FD27EB7DD0B8 /* A01 = +1.374305648697413673176e+00 */
> > -       .quad   0xBFE464E01A2B2FC6 /* A02 = -6.373138915164353601739e-01 */
> > -       .quad   0x3FBB460547674A30 /* A03 = +1.065371798825160976065e-01 */
> > -       .quad   0xBFB26642FA16A685 /* A00 = -7.187288861919156890412e-02 */
> > -       .quad   0x3FF59F9BEDE1C95A /* A01 = +1.351467065073470141812e+00 */
> > -       .quad   0xBFE3D67920C8FBEA /* A02 = -6.199308052381387046381e-01 */
> > -       .quad   0x3FBA24F6A8D3CBC1 /* A03 = +1.021265184570401413078e-01 */
> > -       .quad   0xBFADB5294794F097 /* A00 = -5.802277563859197656582e-02 */
> > -       .quad   0x3FF523EA7B9CF453 /* A01 = +1.321268542159732772845e+00 */
> > -       .quad   0xBFE322A8B55E35DB /* A02 = -5.979808370918208160205e-01 */
> > -       .quad   0x3FB8C8673B1B3E37 /* A03 = +9.680791085269722928697e-02 */
> > -       .quad   0xBFA4B7D661965C6A /* A00 = -4.046506825687219699450e-02 */
> > -       .quad   0x3FF48DE3E2CE3122 /* A01 = +1.284641157110919085227e+00 */
> > -       .quad   0xBFE251FED1A7F445 /* A02 = -5.725092024655472622285e-01 */
> > -       .quad   0x3FB745699FCABDB9 /* A03 = +9.090290213747821701507e-02 */
> > -       .quad   0xBF93E60456E4EE1D /* A00 = -1.943213253365004902773e-02 */
> > -       .quad   0x3FF3E1A14E628A59 /* A01 = +1.242585474196536532432e+00 */
> > -       .quad   0xBFE16C5AB660E876 /* A02 = -5.444768488007543094653e-01 */
> > -       .quad   0x3FB5AD33AA8C188F /* A03 = +8.467410005332197397987e-02 */
> > -       .quad   0x3F738C17C47C7961 /* A00 = +4.772274820224659853951e-03 */
> > -       .quad   0x3FF3234DDE3BD146 /* A01 = +1.196119182682268355933e+00 */
> > -       .quad   0xBFE078C0D77A9D3B /* A02 = -5.147403915952176722826e-01 */
> > -       .quad   0x3FB40D74B3E276B8 /* A03 = +7.833032027925923568290e-02 */
> > -       .quad   0x3FA0474BECC689C7 /* A00 = +3.179394975019849550746e-02 */
> > -       .quad   0x3FF256FB4FA7D18A /* A01 = +1.146235762743432307076e+00 */
> > -       .quad   0xBFDEFA8E3FB285E2 /* A02 = -4.840427038235174395098e-01 */
> > -       .quad   0x3FB270C007493D59 /* A03 = +7.203293016322244446403e-02 */
> > -       .quad   0x3FAF5BD51E479BDC /* A00 = +6.124750132203590768931e-02 */
> > -       .quad   0x3FF18081D0B53BC5 /* A01 = +1.093873801484492647162e+00 */
> > -       .quad   0xBFDCFE2439BD0C03 /* A02 = -4.530115665294831006626e-01 */
> > -       .quad   0x3FB0DEFE5A45AFDD /* A03 = +6.590261176978580437424e-02 */
> > -       .quad   0x3FB7BD5D2806EA26 /* A00 = +9.273321368429118805032e-02 */
> > -       .quad   0x3FF0A369E35B4440 /* A01 = +1.039895904647224256223e+00 */
> > -       .quad   0xBFDB04BC5C9951E7 /* A02 = -4.221640495573226181669e-01 */
> > -       .quad   0x3FAEBBBAA9D6DEEF /* A03 = +6.002600978120919278380e-02 */
> > -       .quad   0x3FC01BE411098DBC /* A00 = +1.258511622610124502941e-01 */
> > -       .quad   0x3FEF85BDABC031C1 /* A01 = +9.850757936961188621083e-01 */
> > -       .quad   0xBFD91521375097C2 /* A02 = -3.919146576102968682065e-01 */
> > -       .quad   0x3FABE26F0086D982 /* A03 = +5.446192628317005068883e-02 */
> > -       .quad   0x3FC481D7FF5776B9 /* A00 = +1.602125164781023347604e-01 */
> > -       .quad   0x3FEDC3506C1E7218 /* A01 = +9.300920592973538347792e-01 */
> > -       .quad   0xBFD7349A88DA7D4F /* A02 = -3.625856720409119104964e-01 */
> > -       .quad   0x3FA936E2DFF8E2AE /* A03 = +4.924687370334389358018e-02 */
> > -       .quad   0x3FC90471F96FA27A /* A00 = +1.954481571149420671141e-01 */
> > -       .quad   0x3FEC0451601987A2 /* A01 = +8.755270840595026360376e-01 */
> > -       .quad   0xBFD5671CD4B898DC /* A02 = -3.344184949259110251063e-01 */
> > -       .quad   0x3FA6BB9594603B67 /* A03 = +4.439990459660841243261e-02 */
> > -       .quad   0x3FCFD8ADB9ED944C /* A00 = +2.488000066615846384011e-01 */
> > -       .quad   0x3FE978C073F6809A /* A01 = +7.959902062321078108909e-01 */
> > -       .quad   0xBFD2DF7E00BCD5A9 /* A02 = -2.948908812716931060471e-01 */
> > -       .quad   0x3FA3614033D490B2 /* A03 = +3.785133965200894456959e-02 */
> > -       .quad   0x3FD4846A12AFE5A0 /* A00 = +3.205819303981005674586e-01 */
> > -       .quad   0x3FE63A1147D40472 /* A01 = +6.945883181471244061100e-01 */
> > -       .quad   0xBFCFA2268AD34450 /* A02 = -2.471359422548027318101e-01 */
> > -       .quad   0x3F9F150201D9FFE0 /* A03 = +3.035357605267552383310e-02 */
> > -       .quad   0x3FD9018641F82BEB /* A00 = +3.907180446846598154131e-01 */
> > -       .quad   0x3FE33B7C220FFBDC /* A01 = +6.010113396913498995389e-01 */
> > -       .quad   0xBFCA4E4187E29C86 /* A02 = -2.055131829740483584423e-01 */
> > -       .quad   0x3F98C30CED19F8F4 /* A03 = +2.418155858185229434287e-02 */
> > -       .quad   0x3FDD4B8255BEB078 /* A00 = +4.577337109901757905561e-01 */
> > -       .quad   0x3FE0858B19D3A49B /* A01 = +5.163016800335243905451e-01 */
> > -       .quad   0xBFC5BC929EACE564 /* A02 = -1.698172831327539045176e-01 */
> > -       .quad   0x3F93A083CE57DE2B /* A03 = +1.916700312537337677621e-02 */
> > -       .quad   0x3FE0A8E5E039295C /* A00 = +5.206174258576470315063e-01 */
> > -       .quad   0x3FDC35E1234583FE /* A01 = +4.407885403107342225937e-01 */
> > -       .quad   0xBFC1DE034E31AEB9 /* A02 = -1.395877963835710222629e-01 */
> > -       .quad   0x3F8EFDEBB3471BDC /* A03 = +1.513275280821162888101e-02 */
> > -       .quad   0x3FE2851B603CB2A5 /* A00 = +5.787484054213406503564e-01 */
> > -       .quad   0x3FD7F4A44ABBB286 /* A01 = +3.743067483726821853551e-01 */
> > -       .quad   0xBFBD3EEB67087DE7 /* A02 = -1.142413260026767657385e-01 */
> > -       .quad   0x3F8864F38329E8BD /* A03 = +1.191129917173260922836e-02 */
> > -       .quad   0x3FE437DBE3C34AC1 /* A00 = +6.318187187665317283702e-01 */
> > -       .quad   0x3FD43F6F789441B5 /* A01 = +3.163717916040938438194e-01 */
> > -       .quad   0xBFB7D92E7901B9A4 /* A02 = -9.315767721429907277653e-02 */
> > -       .quad   0x3F8327ED342308E1 /* A03 = +9.353497651663324544136e-03 */
> > -       .quad   0x3FE5C0977766D55C /* A00 = +6.797597248138731451661e-01 */
> > -       .quad   0x3FD10B42A764D8F9 /* A01 = +2.663122782427219115142e-01 */
> > -       .quad   0xBFB3633351D3D70F /* A02 = -7.573242900602060456716e-02 */
> > -       .quad   0x3F7E079E30FF899C /* A03 = +7.331483779099558922843e-03 */
> > -       .quad   0x3FE7202CE08A88C4 /* A00 = +7.226776490754436288455e-01 */
> > -       .quad   0x3FCC973EB5662B01 /* A01 = +2.233656297433626314319e-01 */
> > -       .quad   0xBFAF70A455F9920B /* A02 = -6.140626477716545211782e-02 */
> > -       .quad   0x3F77812411CE99B6 /* A03 = +5.738392731393584730859e-03 */
> > -       .quad   0x3FE85879424095B1 /* A00 = +7.608000082006382003286e-01 */
> > -       .quad   0x3FC7E73BD1674D84 /* A01 = +1.867441914060742336190e-01 */
> > -       .quad   0xBFA96F84E4BF333B /* A02 = -4.967894832916504993525e-02 */
> > -       .quad   0x3F72606DDCA6E117 /* A03 = +4.486493251924870105662e-03 */
> > -       .quad   0x3FE96BFE4957F4DD /* A00 = +7.944327766887472330737e-01 */
> > -       .quad   0x3FC3ED4780D25478 /* A01 = +1.556786898624158421711e-01 */
> > -       .quad   0xBFA489C5F9A56B58 /* A02 = -4.011362717093075458408e-02 */
> > -       .quad   0x3F6CB5DC17E9AD2A /* A03 = +3.504686231556104931972e-03 */
> > -       .quad   0x3FEA5D9CB2F41234 /* A00 = +8.239272589858672724006e-01 */
> > -       .quad   0x3FC091A758374DCF /* A01 = +1.294449978582705440555e-01 */
> > -       .quad   0xBFA08E436D4B5CE0 /* A02 = -3.233538350257858517978e-02 */
> > -       .quad   0x3F666997AD53E6B7 /* A03 = +2.735897297154145629133e-03 */
> > -       .quad   0x3FEB3060342CB850 /* A00 = +8.496552485501158713532e-01 */
> > -       .quad   0x3FBB7D30BBC7DC1B /* A01 = +1.073790033768634993860e-01 */
> > -       .quad   0xBF9AA6BA3443D9E3 /* A02 = -2.602663940430173170060e-02 */
> > -       .quad   0x3F617CA764B7850B /* A03 = +2.134634914668814050648e-03 */
> > -       .quad   0x3FEBE759A6A0C7B8 /* A00 = +8.719909910635044170135e-01 */
> > -       .quad   0x3FB6C10DE6A703FF /* A01 = +8.888327485239243264115e-02 */
> > -       .quad   0xBF956C566D8BE1F6 /* A02 = -2.092108768099084498138e-02 */
> > -       .quad   0x3F5B46D1A4A59CF8 /* A03 = +1.664833764687232917079e-03 */
> > -       .quad   0x3FEC858494887A04 /* A00 = +8.912985707318630268503e-01 */
> > -       .quad   0x3FB2CC31F543394D /* A01 = +7.342827070099140762682e-02 */
> > -       .quad   0xBF9133477FF69137 /* A02 = -1.679717749142747504343e-02 */
> > -       .quad   0x3F5544482FBB4DA5 /* A03 = +1.298017973501022466823e-03 */
> > -       .quad   0x3FED0DB59D0E32E9 /* A00 = +9.079235141267335551518e-01 */
> > -       .quad   0x3FAF006BAFFC6EF4 /* A01 = +6.055008433597022787787e-02 */
> > -       .quad   0xBF8B97146FA2B97A /* A02 = -1.347175565419144252499e-02 */
> > -       .quad   0x3F5093B01F4CDC69 /* A03 = +1.011774057770665211434e-03 */
> > -       .quad   0x3FEDB487C3EC457C /* A00 = +9.282873942012623835751e-01 */
> > -       .quad   0x3FA7390C09D0BD1D /* A01 = +4.535710925881118044112e-02 */
> > -       .quad   0xBF83D9F7C3181106 /* A02 = -9.693084374710735778846e-03 */
> > -       .quad   0x3F46E34A0A3C0E64 /* A03 = +6.984817050299072134500e-04 */
> > -       .quad   0x3FEE5FFCB4E6EB00 /* A00 = +9.492171796076434020506e-01 */
> > -       .quad   0x3F9F4913ED00AADF /* A01 = +3.055220731782070861526e-02 */
> > -       .quad   0xBF79670BD0E59B5C /* A02 = -6.201788097633133961528e-03 */
> > -       .quad   0x3F3BC998EBCAF96D /* A03 = +4.240034429975534616304e-04 */
> > -       .quad   0x3FEEDBA41E9542FE /* A00 = +9.643116566968215064293e-01 */
> > -       .quad   0x3F94F5DD18D9C24D /* A01 = +2.046914543319848858727e-02 */
> > -       .quad   0xBF7034896AA122B9 /* A02 = -3.956352980886528904192e-03 */
> > -       .quad   0x3F30DCCB47810B39 /* A03 = +2.573009765038273091199e-04 */
> > -       .quad   0x3FEF33F2882520ED /* A00 = +9.750912341196716903724e-01 */
> > -       .quad   0x3F8BF37F2CF553FF /* A01 = +1.364802699996836392315e-02 */
> > -       .quad   0xBF649F6F05A69619 /* A02 = -2.517430152880317534986e-03 */
> > -       .quad   0x3F247623C950AAC9 /* A03 = +1.561087307505231250044e-04 */
> > -       .quad   0x3FEF727757751741 /* A00 = +9.827229221489021115943e-01 */
> > -       .quad   0x3F828E67912C4400 /* A01 = +9.060677640748693306705e-03 */
> > -       .quad   0xBF5A2F51A806CC2C /* A02 = -1.598195784123355826789e-03 */
> > -       .quad   0x3F18D35D7687E613 /* A03 = +9.470231965016282719549e-05 */
> > -       .quad   0x3FEF9E6325C5942A /* A00 = +9.880843866091073568469e-01 */
> > -       .quad   0x3F788AB117618F76 /* A01 = +5.991641772286606867914e-03 */
> > -       .quad   0xBF5096EAB0B1EA89 /* A02 = -1.012543859160305046233e-03 */
> > -       .quad   0x3F0E1E50EC4435AB /* A03 = +5.744633156910412119652e-05 */
> > -       .quad   0x3FEFBD0784049369 /* A00 = +9.918248728250605994461e-01 */
> > -       .quad   0x3F702BBD8294035F /* A01 = +3.947963975634432264028e-03 */
> > -       .quad   0xBF44FB55E0F00593 /* A02 = -6.403130845457509273330e-04 */
> > -       .quad   0x3F0244DCD723230A /* A03 = +3.484534217219031730379e-05 */
> > -       .quad   0x3FEFD245E2366A43 /* A00 = +9.944180887426415926811e-01 */
> > -       .quad   0x3F653D82EC088433 /* A01 = +2.592807490387838333795e-03 */
> > -       .quad   0xBF3A7DF75E013CB8 /* A02 = -4.042366908878036561859e-04 */
> > -       .quad   0x3EF6298E69F991CD /* A03 = +2.113564425911141559972e-05 */
> > -       .quad   0x3FEFE0EAA508BC69 /* A00 = +9.962056372950317539861e-01 */
> > -       .quad   0x3F5BD0771AF3FDDA /* A01 = +1.697651208644282514598e-03 */
> > -       .quad   0xBF30B2E1254DE571 /* A02 = -2.548026725928887099328e-04 */
> > -       .quad   0x3EEAE28B70EC0256 /* A03 = +1.281973848454955042307e-05 */
> > -       .quad   0x3FEFEAF5303D7F96 /* A00 = +9.974313680831865536192e-01 */
> > -       .quad   0x3F5229111365657E /* A01 = +1.108423877289460134782e-03 */
> > -       .quad   0xBF250572D04DFE66 /* A02 = -1.603796628408704519168e-04 */
> > -       .quad   0x3EE04E89BB57C981 /* A03 = +7.775682983689149966743e-06 */
> > -       .quad   0x3FEFF1CF52F1CF44 /* A00 = +9.982678051005469122003e-01 */
> > -       .quad   0x3F47A71316147CEB /* A01 = +7.218211359577819110842e-04 */
> > -       .quad   0xBF1A6D7604055719 /* A02 = -1.008132248946049582547e-04 */
> > -       .quad   0x3ED3C8047586A85C /* A03 = +4.716233739913014633626e-06 */
> > -       .quad   0x3FEFF6770369EF69 /* A00 = +9.988360468555416149528e-01 */
> > -       .quad   0x3F3EBB261180FBF0 /* A01 = +4.689186039321105101130e-04 */
> > -       .quad   0xBF1097754FE19D7F /* A02 = -6.329206004950480057066e-05 */
> > -       .quad   0x3EC7FEFF83BCA0A7 /* A03 = +2.860556404988488738366e-06 */
> > -       .quad   0x3FEFF99D42371AC4 /* A00 = +9.992204945818561334647e-01 */
> > -       .quad   0x3F33EB2AEC271F59 /* A01 = +3.039340773764907474054e-04 */
> > -       .quad   0xBF04CF18E0FC0D79 /* A02 = -3.968996690952969588805e-05 */
> > -       .quad   0x3EBD1BDBD6019BE9 /* A03 = +1.735021065507727833886e-06 */
> > -       .quad   0x3FEFFBBCA32B0D91 /* A00 = +9.994795977476532700123e-01 */
> > -       .quad   0x3F29C41E1615110A /* A01 = +1.965796209707565346710e-04 */
> > -       .quad   0xBEFA11F93D9DCB5A /* A02 = -2.486248909101414873235e-05 */
> > -       .quad   0x3EB1A7CA4546F7A7 /* A03 = +1.052345642723709228769e-06 */
> > -       .quad   0x3FEFFD298B8E8DE2 /* A00 = +9.996535993308806045121e-01 */
> > -       .quad   0x3F20A1C42D523C5B /* A01 = +1.268913244172078754520e-04 */
> > -       .quad   0xBEF0507A364AFAE4 /* A02 = -1.555859070622834605755e-05 */
> > -       .quad   0x3EA56ACA17E7CDF4 /* A03 = +6.382806956848098872313e-07 */
> > -       .quad   0x3FEFFE1DC82BA5A3 /* A00 = +9.997700604991915929176e-01 */
> > -       .quad   0x3F156E73B90F1769 /* A01 = +8.175450626798714452801e-05 */
> > -       .quad   0xBEE4663579D0A09F /* A02 = -9.727122057226747625365e-06 */
> > -       .quad   0x3E99FAF6FEC5D4C1 /* A03 = +3.871371052824002996020e-07 */
> > -       .quad   0x3FEFFEF8D0BB5E81 /* A00 = +9.998745037837154514548e-01 */
> > -       .quad   0x3F06686DA18D39C3 /* A01 = +4.273972098777251447726e-05 */
> > -       .quad   0xBED46BC298073E90 /* A02 = -4.868731025855742842491e-06 */
> > -       .quad   0x3E88E42286B9D0FD /* A03 = +1.854535328530838170114e-07 */
> > -       .quad   0x3FEFFF8DBC68DDC7 /* A00 = +9.999455146670975791423e-01 */
> > -       .quad   0x3EF26B2953A80AF0 /* A01 = +1.756534514108903368909e-05 */
> > -       .quad   0xBEBFC4472D580F83 /* A02 = -1.893443529411295465239e-06 */
> > -       .quad   0x3E72505B4553D19F /* A03 = +6.822456673547912277047e-08 */
> > -       .quad   0x3FEFFFCED1276609 /* A00 = +9.999765477215883935358e-01 */
> > -       .quad   0x3EDE1A94C7CC58F5 /* A01 = +7.177313020153979672606e-06 */
> > -       .quad   0xBEA8A2C988744E57 /* A02 = -7.342066660497443762363e-07 */
> > -       .quad   0x3E5AF30036BBBAF4 /* A03 = +2.509841882843541084885e-08 */
> > -       .quad   0x3FEFFFEAFE70FCFC /* A00 = +9.999899835164849370983e-01 */
> > -       .quad   0x3EC879175E3549F5 /* A01 = +2.917410471128503564412e-06 */
> > -       .quad   0xBE930E36677D1813 /* A02 = -2.839493400307523115929e-07 */
> > -       .quad   0x3E43D4005B42D48F /* A03 = +9.233192745401904898013e-09 */
> > -       .quad   0x3ff0000000000000
> > -       .quad   0x0000000000000000
> > -       .quad   0x0000000000000000
> > -       .quad   0x0000000000000000
> > -       .align  32
> > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSignMask */
> > -       .align  32
> > -       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _sAbsMask */
> > -       .align  32
> > -       .long   0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000 /* _iExpMantMask */
> > -       .align  32
> > -       .long   0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000 /* _iExpMask */
> > -       .align  32
> > -       .long   0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000 /* _iMinIdxOfsMask */
> > -       .align  32
> > -       .long   0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000 /* _iMaxIdxMask */
> > -       .align  32
> > -       .type   __svml_stanh_data_internal, @object
> > -       .size   __svml_stanh_data_internal, .-__svml_stanh_data_internal
> > +       /* All results have been written to 32(%rsp).  */
>                                                                 Should
> 32 be removed?


Correct. Fixed in V2.
> > +       vmovups (%rsp), %ymm0
> > +       /* Restore rsp.  */
> > +       movq    %r13, %rsp
> > +       cfi_def_cfa_register(rsp)
> > +       /* Restore callee save registers.  */
> > +       popq    %rbp
> > +       cfi_adjust_cfa_offset(-8)
> > +       cfi_restore(rbp)
> > +       popq    %rbx
> > +       cfi_adjust_cfa_offset(-8)
> > +       cfi_restore(rbp)
> > +       popq    %r13
> > +       cfi_adjust_cfa_offset(-8)
> > +       cfi_restore(r13)
> > +       ret
> > +END(_ZGVdN8v_tanhf_avx2)
> > --
> > 2.34.1
> >
>
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH v3 1/7] x86: Improve svml_s_atanhf16_core_avx512.S
  2022-06-07 20:06 [PATCH v1 1/7] x86: Improve svml_s_atanhf16_core_avx512.S Noah Goldstein
                   ` (6 preceding siblings ...)
  2022-06-09  0:05 ` [PATCH v2 1/7] x86: Improve svml_s_atanhf16_core_avx512.S Noah Goldstein
@ 2022-06-09 16:58 ` Noah Goldstein
  2022-06-09 16:58   ` [PATCH v3 2/7] x86: Improvement svml_s_atanhf8_core_avx2.S Noah Goldstein
                     ` (6 more replies)
  2022-06-09 18:16 ` [PATCH v4 " Noah Goldstein
  8 siblings, 7 replies; 48+ messages in thread
From: Noah Goldstein @ 2022-06-09 16:58 UTC (permalink / raw)
  To: libc-alpha

Improvements are:
    1. Reduce code size (-64 bytes).
    2. Remove redundant move instructions.
    3. Slightly improve instruction selection/scheduling where
       possible.
    4. Reduce rodata size ([-128, -188] bytes).

The throughput improvement is not significant as the port 0 bottleneck
is unavoidable.

        Function, New Time, Old Time, New / Old
_ZGVeN16v_atanhf,     1.39,    1.408,     0.987
---
 .../multiarch/svml_s_atanhf16_core_avx512.S   | 474 +++++++++---------
 1 file changed, 244 insertions(+), 230 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
index a1cd920a0f..f42462c581 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
@@ -31,53 +31,50 @@
  *
  */
 
-/* Offsets for data table __svml_satanh_data_internal_avx512
- */
-#define Log_tbl_H			0
-#define Log_tbl_L			128
-#define One				256
-#define AbsMask				320
-#define AddB5				384
-#define RcpBitMask			448
-#define poly_coeff3			512
-#define poly_coeff2			576
-#define poly_coeff1			640
-#define poly_coeff0			704
-#define Half				768
-#define L2H				832
-#define L2L				896
+/* Offsets for data table __svml_satanh_data_internal_avx512 and
+   __svml_satanh_data_internal_avx512_al64. Ordered by use in the
+   function. On cold-starts this might help the prefetcher. Possibly
+   a better idea is to interleave start/end so that the prefetcher is
+   less likely to detect a stream and pull irrelevant lines into
+   cache.  */
+
+/* Offset into __svml_satanh_data_internal_avx512. 4-byte aligned as
+   the memory is broadcast to {1to16}.  */
+#define AbsMask				0
+
+/* Offset into __svml_satanh_data_internal_avx512_al64. The full value
+   is used here.  */
+#define One				0
+#define AddB5				64
+#define RcpBitMask			128
+#define Log_tbl_L_lo			192
+#define Log_tbl_L_hi			256
+#define Log_tbl_H_lo			320
+#define Log_tbl_H_hi			384
+#define L2H				448
+#define L2L				512
+#define poly_coeff3			576
+#define poly_coeff2			640
+#define poly_coeff1			704
 
 #include <sysdep.h>
 
+#define ATANHF_DATA(x)			((x)+__svml_satanh_data_internal_avx512_al64)
+
 	.section .text.exex512, "ax", @progbits
 ENTRY(_ZGVeN16v_atanhf_skx)
-	pushq	%rbp
-	cfi_def_cfa_offset(16)
-	movq	%rsp, %rbp
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
-	andq	$-64, %rsp
-	subq	$192, %rsp
-	vmovups	One+__svml_satanh_data_internal_avx512(%rip), %zmm4
-
-	/* round reciprocals to 1+5b mantissas */
-	vmovups	AddB5+__svml_satanh_data_internal_avx512(%rip), %zmm14
-	vmovups	RcpBitMask+__svml_satanh_data_internal_avx512(%rip), %zmm1
-	vmovaps	%zmm0, %zmm11
-	vandps	AbsMask+__svml_satanh_data_internal_avx512(%rip), %zmm11, %zmm6
+	vandps	AbsMask+__svml_satanh_data_internal_avx512(%rip){1to16}, %zmm0, %zmm6
+	vmovups	ATANHF_DATA(One)(%rip), %zmm4
 
 	/* 1+y */
 	vaddps	{rn-sae}, %zmm4, %zmm6, %zmm9
 
 	/* 1-y */
 	vsubps	{rn-sae}, %zmm6, %zmm4, %zmm8
-	vxorps	%zmm6, %zmm11, %zmm10
-
-	/* Yp_high */
-	vsubps	{rn-sae}, %zmm4, %zmm9, %zmm2
 
-	/* -Ym_high */
-	vsubps	{rn-sae}, %zmm4, %zmm8, %zmm5
+	/* round reciprocals to 1+5b mantissas */
+	vmovups	ATANHF_DATA(AddB5)(%rip), %zmm14
+	vmovups	ATANHF_DATA(RcpBitMask)(%rip), %zmm1
 
 	/* RcpP ~ 1/Yp */
 	vrcp14ps %zmm9, %zmm12
@@ -85,15 +82,21 @@ ENTRY(_ZGVeN16v_atanhf_skx)
 	/* RcpM ~ 1/Ym */
 	vrcp14ps %zmm8, %zmm13
 
+	/* Yp_high */
+	vsubps	{rn-sae}, %zmm4, %zmm9, %zmm2
+
+	/* -Ym_high */
+	vsubps	{rn-sae}, %zmm4, %zmm8, %zmm5
+
+
 	/* input outside (-1, 1) ? */
-	vcmpps	$21, {sae}, %zmm4, %zmm6, %k0
 	vpaddd	%zmm14, %zmm12, %zmm15
-	vpaddd	%zmm14, %zmm13, %zmm0
+	vpaddd	%zmm14, %zmm13, %zmm12
 
 	/* Yp_low */
 	vsubps	{rn-sae}, %zmm2, %zmm6, %zmm3
 	vandps	%zmm1, %zmm15, %zmm7
-	vandps	%zmm1, %zmm0, %zmm12
+	vandps	%zmm1, %zmm12, %zmm12
 
 	/* Ym_low */
 	vaddps	{rn-sae}, %zmm5, %zmm6, %zmm5
@@ -102,225 +105,199 @@ ENTRY(_ZGVeN16v_atanhf_skx)
 	vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9
 
 	/* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low */
-	vfmsub231ps {rn-sae}, %zmm12, %zmm8, %zmm4
-	vmovups	Log_tbl_L+__svml_satanh_data_internal_avx512(%rip), %zmm8
-	vmovups	Log_tbl_L+64+__svml_satanh_data_internal_avx512(%rip), %zmm13
+	vfmsub213ps {rn-sae}, %zmm4, %zmm12, %zmm8
+
+	vmovups	ATANHF_DATA(Log_tbl_L_lo)(%rip), %zmm10
+	vmovups	ATANHF_DATA(Log_tbl_L_hi)(%rip), %zmm13
 
 	/* exponents */
-	vgetexpps {sae}, %zmm7, %zmm15
 	vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9
+	vgetexpps {sae}, %zmm7, %zmm15
+
 
 	/* Table lookups */
-	vmovups	__svml_satanh_data_internal_avx512(%rip), %zmm6
+	vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm8
 	vgetexpps {sae}, %zmm12, %zmm14
-	vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm4
+
 
 	/* Prepare table index */
 	vpsrld	$18, %zmm7, %zmm3
 	vpsrld	$18, %zmm12, %zmm2
-	vmovups	Log_tbl_H+64+__svml_satanh_data_internal_avx512(%rip), %zmm7
-	vmovups	poly_coeff1+__svml_satanh_data_internal_avx512(%rip), %zmm12
-
+	vmovups	ATANHF_DATA(Log_tbl_H_lo)(%rip), %zmm11
+	vmovups	ATANHF_DATA(Log_tbl_H_hi)(%rip), %zmm7
 	/* Km-Kp */
+
+	vmovaps	%zmm3, %zmm5
+	vpermi2ps %zmm13, %zmm10, %zmm3
+	vpermt2ps %zmm13, %zmm2, %zmm10
+	vpermi2ps %zmm7, %zmm11, %zmm5
+	vpermt2ps %zmm7, %zmm2, %zmm11
 	vsubps	{rn-sae}, %zmm15, %zmm14, %zmm1
-	kmovw	%k0, %edx
-	vmovaps	%zmm3, %zmm0
-	vpermi2ps %zmm13, %zmm8, %zmm3
-	vpermt2ps %zmm13, %zmm2, %zmm8
-	vpermi2ps %zmm7, %zmm6, %zmm0
-	vpermt2ps %zmm7, %zmm2, %zmm6
-	vsubps	{rn-sae}, %zmm3, %zmm8, %zmm5
+	vsubps	{rn-sae}, %zmm3, %zmm10, %zmm7
 
 	/* K*L2H + Th */
-	vmovups	L2H+__svml_satanh_data_internal_avx512(%rip), %zmm2
+	vmovups	ATANHF_DATA(L2H)(%rip), %zmm2
 
 	/* K*L2L + Tl */
-	vmovups	L2L+__svml_satanh_data_internal_avx512(%rip), %zmm3
-
-	/* polynomials */
-	vmovups	poly_coeff3+__svml_satanh_data_internal_avx512(%rip), %zmm7
-	vmovups	poly_coeff0+__svml_satanh_data_internal_avx512(%rip), %zmm13
+	vmovups	ATANHF_DATA(L2L)(%rip), %zmm3
 
 	/* table values */
-	vsubps	{rn-sae}, %zmm0, %zmm6, %zmm0
-	vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm0
-	vfmadd213ps {rn-sae}, %zmm5, %zmm3, %zmm1
-	vmovups	poly_coeff2+__svml_satanh_data_internal_avx512(%rip), %zmm3
-	vmovaps	%zmm3, %zmm2
-	vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm2
-	vfmadd231ps {rn-sae}, %zmm4, %zmm7, %zmm3
-	vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm2
-	vfmadd213ps {rn-sae}, %zmm12, %zmm4, %zmm3
-	vfmadd213ps {rn-sae}, %zmm13, %zmm9, %zmm2
-	vfmadd213ps {rn-sae}, %zmm13, %zmm4, %zmm3
+	vsubps	{rn-sae}, %zmm5, %zmm11, %zmm5
+	vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm5
+	vfmadd213ps {rn-sae}, %zmm7, %zmm3, %zmm1
+	/* polynomials */
+	vmovups	ATANHF_DATA(poly_coeff3)(%rip), %zmm7
+	vmovups	ATANHF_DATA(poly_coeff2)(%rip), %zmm10
+	vmovaps	%zmm10, %zmm14
+	vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm10
+	vfmadd231ps {rn-sae}, %zmm8, %zmm7, %zmm14
+	vmovups	ATANHF_DATA(poly_coeff1)(%rip), %zmm12
+	vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm10
+	vfmadd213ps {rn-sae}, %zmm12, %zmm8, %zmm14
+	vfmadd213ps {rn-sae}, %zmm4, %zmm9, %zmm10
+	vfmadd213ps {rn-sae}, %zmm4, %zmm8, %zmm14
 
 	/* (K*L2L + Tl) + Rp*PolyP */
-	vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm2
-	vorps	Half+__svml_satanh_data_internal_avx512(%rip), %zmm10, %zmm9
+	vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm10
+
+	/* zmm12 = zmm12 & (zmm4 | zmm0).  */
+	vpternlogq $0xe0, %zmm0, %zmm4, %zmm12
 
 	/* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM */
-	vfnmadd213ps {rn-sae}, %zmm2, %zmm4, %zmm3
-	vaddps	{rn-sae}, %zmm3, %zmm0, %zmm4
-	vmulps	{rn-sae}, %zmm9, %zmm4, %zmm0
+	vfnmadd213ps {rn-sae}, %zmm5, %zmm8, %zmm14
+	vaddps	{rn-sae}, %zmm14, %zmm10, %zmm8
+
+	vcmpps	$21, {sae}, %zmm4, %zmm6, %k0
+	kmovw	%k0, %edx
 	testl	%edx, %edx
 
 	/* Go to special inputs processing branch */
 	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm11
+	# LOE rbx r12 r13 r14 r15 zmm0 zmm8 zmm12
+	vmulps	{rn-sae}, %zmm12, %zmm8, %zmm0
 
-	/* Restore registers
-	 * and exit the function
-	 */
-
-L(EXIT):
-	movq	%rbp, %rsp
-	popq	%rbp
-	cfi_def_cfa(7, 8)
-	cfi_restore(6)
+	/* No register to restore on fast path.  */
 	ret
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
-
-	/* Branch to process
-	 * special inputs
-	 */
 
+	/* Cold case. edx has 1s where there was a special value that
+	   needs to be handled by an atanhf call. Optimize for code size
+	   more so than speed here. */
 L(SPECIAL_VALUES_BRANCH):
-	vmovups	%zmm11, 64(%rsp)
-	vmovups	%zmm0, 128(%rsp)
-	# LOE rbx r12 r13 r14 r15 edx zmm0
-
-	xorl	%eax, %eax
-	# LOE rbx r12 r13 r14 r15 eax edx
-
-	vzeroupper
-	movq	%r12, 16(%rsp)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
-	movl	%eax, %r12d
-	movq	%r13, 8(%rsp)
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-	movl	%edx, %r13d
-	movq	%r14, (%rsp)
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r15 r12d r13d
-
-	/* Range mask
-	 * bits check
+	# LOE rbx rdx r12 r13 r14 r15 zmm0 zmm8 zmm12
+    /* Use r13 to save/restore the stack. This allows us to use rbp as
+       callee save register saving code size. */
+	pushq	%r13
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(r13, -16)
+	/* Need callee-saved registers to preserve state across atanhf calls.
 	 */
+	pushq	%rbx
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(rbx, -24)
+	pushq	%rbp
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(rbp, -32)
+	movq	%rsp, %r13
+	cfi_def_cfa_register(r13)
 
-L(RANGEMASK_CHECK):
-	btl	%r12d, %r13d
-
-	/* Call scalar math function */
-	jc	L(SCALAR_MATH_CALL)
-	# LOE rbx r15 r12d r13d
-
-	/* Special inputs
-	 * processing loop
-	 */
+	/* Align stack and make room for 2x zmm vectors.  */
+	andq	$-64, %rsp
+	addq	$-128, %rsp
+	vmulps	{rn-sae}, %zmm12, %zmm8, %zmm1
+	vmovaps	%zmm1, (%rsp)
+	vmovaps	%zmm0, 64(%rsp)
+	vzeroupper
 
+	/* edx has 1s where there was a special value that needs to be handled
+	   by an atanhf call.  */
+	movl	%edx, %ebx
 L(SPECIAL_VALUES_LOOP):
-	incl	%r12d
-	cmpl	$16, %r12d
-
-	/* Check bits in range mask */
-	jl	L(RANGEMASK_CHECK)
-	# LOE rbx r15 r12d r13d
-
-	movq	16(%rsp), %r12
-	cfi_restore(12)
-	movq	8(%rsp), %r13
-	cfi_restore(13)
-	movq	(%rsp), %r14
-	cfi_restore(14)
-	vmovups	128(%rsp), %zmm0
-
-	/* Go to exit */
-	jmp	L(EXIT)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r12 r13 r14 r15 zmm0
-
-	/* Scalar math fucntion call
-	 * to process special input
-	 */
-
-L(SCALAR_MATH_CALL):
-	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	# LOE rbx rbp r12 r13 r14 r15
+	/* use rbp as index for special value that is saved across calls to
+	   atanhf. We technically don't need a callee save register here as offset
+	   to rsp is always [0, 56] so we can restore rsp by realigning to 64.
+	   Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
+	   in the loop. Realigning also costs more code size.  */
+	xorl	%ebp, %ebp
+	tzcntl	%ebx, %ebp
+
+	/* Scalar math function call to process special input.  */
+	movss	64(%rsp, %rbp, 4), %xmm0
 	call	atanhf@PLT
-	# LOE rbx r14 r15 r12d r13d xmm0
-
-	movss	%xmm0, 128(%rsp, %r14, 4)
 
-	/* Process special inputs in loop */
-	jmp	L(SPECIAL_VALUES_LOOP)
-	# LOE rbx r15 r12d r13d
+	/* No good way to avoid the store-forwarding fault this will cause on
+	   return. `lfence` avoids the SF fault but at greater cost as it
+	   serializes stack/callee save restoration.  */
+	movss	%xmm0, (%rsp, %rbp, 4)
+
+	blsrl   %ebx, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+	# LOE r12 r13 r14 r15
+
+	/* All results have been written to (%rsp).  */
+	vmovaps	(%rsp), %zmm0
+	/* Restore rsp.  */
+	movq	%r13, %rsp
+	cfi_def_cfa_register(rsp)
+	/* Restore callee save registers (reverse of push order: rbp, rbx, r13).  */
+	popq	%rbp
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(rbp)
+	popq	%rbx
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(rbx)
+	popq	%r13
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(r13)
+	ret
 END(_ZGVeN16v_atanhf_skx)
 
 	.section .rodata, "a"
-	.align	64
-
+	.align	4
 #ifdef __svml_satanh_data_internal_avx512_typedef
 typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(64)) VUINT32 Log_tbl_H[32][1];
-	__declspec(align(64)) VUINT32 Log_tbl_L[32][1];
+typedef struct{
+	__declspec(align(4)) VUINT32 AbsMask[1][1];
 	__declspec(align(64)) VUINT32 One[16][1];
-	__declspec(align(64)) VUINT32 AbsMask[16][1];
 	__declspec(align(64)) VUINT32 AddB5[16][1];
 	__declspec(align(64)) VUINT32 RcpBitMask[16][1];
+	__declspec(align(64)) VUINT32 Log_tbl_L_lo[16][1];
+	__declspec(align(64)) VUINT32 Log_tbl_L_hi[16][1];
+	__declspec(align(64)) VUINT32 Log_tbl_H_lo[16][1];
+	__declspec(align(64)) VUINT32 Log_tbl_H_hi[16][1];
+	__declspec(align(64)) VUINT32 L2H[16][1];
+	__declspec(align(64)) VUINT32 L2L[16][1];
 	__declspec(align(64)) VUINT32 poly_coeff3[16][1];
 	__declspec(align(64)) VUINT32 poly_coeff2[16][1];
 	__declspec(align(64)) VUINT32 poly_coeff1[16][1];
-	__declspec(align(64)) VUINT32 poly_coeff0[16][1];
-	__declspec(align(64)) VUINT32 Half[16][1];
-	__declspec(align(64)) VUINT32 L2H[16][1];
-	__declspec(align(64)) VUINT32 L2L[16][1];
 } __svml_satanh_data_internal_avx512;
 #endif
 __svml_satanh_data_internal_avx512:
-	/* Log_tbl_H */
-	.long	0x00000000
-	.long	0x3cfc0000
-	.long	0x3d780000
-	.long	0x3db78000
-	.long	0x3df10000
-	.long	0x3e14c000
-	.long	0x3e300000
-	.long	0x3e4a8000
-	.long	0x3e648000
-	.long	0x3e7dc000
-	.long	0x3e8b4000
-	.long	0x3e974000
-	.long	0x3ea30000
-	.long	0x3eae8000
-	.long	0x3eb9c000
-	.long	0x3ec4e000
-	.long	0x3ecfa000
-	.long	0x3eda2000
-	.long	0x3ee48000
-	.long	0x3eeea000
-	.long	0x3ef8a000
-	.long	0x3f013000
-	.long	0x3f05f000
-	.long	0x3f0aa000
-	.long	0x3f0f4000
-	.long	0x3f13d000
-	.long	0x3f184000
-	.long	0x3f1ca000
-	.long	0x3f20f000
-	.long	0x3f252000
-	.long	0x3f295000
-	.long	0x3f2d7000
-	/* Log_tbl_L */
+	/* Leave this at front so we can potentially save space due to
+	   smaller alignment constraint.  */
+	.align	4
+    /* AbsMask */
+	.long	0x7fffffff
+	.align	64
+__svml_satanh_data_internal_avx512_al64:
+	/* One */
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	/* AddB5 */
+	.align	64
+	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
+	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
+	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
+	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
+	/* RcpBitMask */
+	.align	64
+	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
+	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
+	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
+	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
+	/* Log_tbl_L_lo */
 	.align	64
 	.long	0x00000000
 	.long	0x3726c39e
@@ -338,6 +315,8 @@ __svml_satanh_data_internal_avx512:
 	.long	0x38dedfac
 	.long	0x38ebfb5e
 	.long	0xb8e63c9f
+	/* Log_tbl_L_hi */
+	.align	64
 	.long	0xb85c1340
 	.long	0x38777bcd
 	.long	0xb6038656
@@ -354,39 +333,74 @@ __svml_satanh_data_internal_avx512:
 	.long	0x38f85db0
 	.long	0x37b4996f
 	.long	0xb8bfb3ca
-	/* One */
+	/* Log_tbl_H_lo */
 	.align	64
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	/* AbsMask */
+	.long	0x00000000
+	.long	0x3cfc0000
+	.long	0x3d780000
+	.long	0x3db78000
+	.long	0x3df10000
+	.long	0x3e14c000
+	.long	0x3e300000
+	.long	0x3e4a8000
+	.long	0x3e648000
+	.long	0x3e7dc000
+	.long	0x3e8b4000
+	.long	0x3e974000
+	.long	0x3ea30000
+	.long	0x3eae8000
+	.long	0x3eb9c000
+	.long	0x3ec4e000
+	/* Log_tbl_H_hi */
 	.align	64
-	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
-	/* AddB5 */
+	.long	0x3ecfa000
+	.long	0x3eda2000
+	.long	0x3ee48000
+	.long	0x3eeea000
+	.long	0x3ef8a000
+	.long	0x3f013000
+	.long	0x3f05f000
+	.long	0x3f0aa000
+	.long	0x3f0f4000
+	.long	0x3f13d000
+	.long	0x3f184000
+	.long	0x3f1ca000
+	.long	0x3f20f000
+	.long	0x3f252000
+	.long	0x3f295000
+	.long	0x3f2d7000
+	/* L2H = log(2)_high */
 	.align	64
-	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000
-	/* RcpBitMask */
+	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
+	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
+	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
+	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
+	/* L2L = log(2)_low */
 	.align	64
-	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
+	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
+	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
+	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
+	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
 	/* poly_coeff3 */
 	.align	64
-	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
+	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
+	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
+	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
+	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
 	/* poly_coeff2 */
 	.align	64
-	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
+	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
+	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
+	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
+	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
 	/* poly_coeff1 */
 	.align	64
-	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
-	/* poly_coeff0 */
-	.align	64
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	/* Half */
-	.align	64
-	.long	0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
-	/* L2H = log(2)_high */
-	.align	64
-	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
-	/* L2L = log(2)_low */
-	.align	64
-	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
 	.align	64
+	.type	__svml_satanh_data_internal_avx512_al64, @object
+	.size	__svml_satanh_data_internal_avx512_al64, .-__svml_satanh_data_internal_avx512_al64
 	.type	__svml_satanh_data_internal_avx512, @object
 	.size	__svml_satanh_data_internal_avx512, .-__svml_satanh_data_internal_avx512
-- 
2.34.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH v3 2/7] x86: Improvement svml_s_atanhf8_core_avx2.S
  2022-06-09 16:58 ` [PATCH v3 " Noah Goldstein
@ 2022-06-09 16:58   ` Noah Goldstein
  2022-06-09 17:05     ` H.J. Lu
  2022-06-09 16:58   ` [PATCH v3 3/7] x86: Improve svml_s_atanhf4_core_sse4.S Noah Goldstein
                     ` (5 subsequent siblings)
  6 siblings, 1 reply; 48+ messages in thread
From: Noah Goldstein @ 2022-06-09 16:58 UTC (permalink / raw)
  To: libc-alpha

Improvements are:
    1. Reduce code size (-60 bytes).
    2. Remove redundant move instructions.
    3. Slightly improve instruction selection/scheduling where
       possible.
    4. Prefer registers which get short instruction encoding.
    5. Shrink rodata usage (-32 bytes).

The throughput improvement is not that significant (3-5%) as the
port 0 bottleneck is unavoidable.

       Function, New Time, Old Time, New / Old
_ZGVdN8v_atanhf,    2.799,    2.923,     0.958
---
 .../fpu/multiarch/svml_s_atanhf8_core_avx2.S  | 405 +++++++++---------
 1 file changed, 202 insertions(+), 203 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
index c1ea1c3353..43eb423831 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
@@ -30,305 +30,304 @@
  *
  */
 
-/* Offsets for data table __svml_satanh_data_internal
- */
+/* Offsets for data table __svml_satanh_data_internal. Ordered
+   by use in the function. On cold-starts this might help the
+   prefetcher. Possibly a better idea is to interleave start/end so
+   that the prefetcher is less likely to detect a stream and pull
+   irrelevant lines into cache.  */
 #define SgnMask				0
 #define sOne				32
-#define sPoly				64
-#define iBrkValue			320
-#define iOffExpoMask			352
-#define sHalf				384
-#define sSign				416
-#define sTopMask12			448
-#define TinyRange			480
-#define sLn2				512
+#define sTopMask12			64
+#define TinyRange			96
+#define iBrkValue			128
+#define iOffExpoMask			160
+#define sPoly				192
+#define sLn2				448
+#define sHalf				480
 
 #include <sysdep.h>
+#define ATANHF_DATA(x)			((x)+__svml_satanh_data_internal)
 
 	.section .text.avx2, "ax", @progbits
 ENTRY(_ZGVdN8v_atanhf_avx2)
-	pushq	%rbp
-	cfi_def_cfa_offset(16)
-	movq	%rsp, %rbp
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
-	andq	$-32, %rsp
-	subq	$96, %rsp
-
+	/* Strip off the sign, so treat X as positive until right at the end */
+	vmovaps	ATANHF_DATA(SgnMask)(%rip), %ymm2
+	vandps	%ymm2, %ymm0, %ymm3
 	/* Load constants including One = 1 */
-	vmovups	sOne+__svml_satanh_data_internal(%rip), %ymm5
-	vmovups	sTopMask12+__svml_satanh_data_internal(%rip), %ymm13
-	vmovaps	%ymm0, %ymm6
+	vmovups	ATANHF_DATA(sOne)(%rip), %ymm5
+	vsubps	%ymm3, %ymm5, %ymm1
+	vmovups	ATANHF_DATA(sTopMask12)(%rip), %ymm4
 
-	/* Strip off the sign, so treat X as positive until right at the end */
-	vandps	SgnMask+__svml_satanh_data_internal(%rip), %ymm6, %ymm10
-	vsubps	%ymm10, %ymm5, %ymm1
+	vrcpps	%ymm1, %ymm7
+	vsubps	%ymm1, %ymm5, %ymm9
+	vandps	%ymm4, %ymm7, %ymm6
+	vsubps	%ymm3, %ymm9, %ymm7
 
-	/*
-	 * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
-	 * the upper part UHi being <= 12 bits long. Then we have
-	 * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
-	 */
-	vaddps	%ymm10, %ymm10, %ymm14
+	/* No need to split sU when FMA is available */
+	vfnmadd213ps %ymm5, %ymm6, %ymm1
+	vmovaps	%ymm0, %ymm8
+	vfmadd213ps %ymm0, %ymm0, %ymm0
+	vfnmadd231ps %ymm6, %ymm7, %ymm1
 
 	/*
 	 * Check whether |X| < 1, in which case we use the main function.
 	 * Otherwise set the rangemask so that the callout will get used.
 	 * Note that this will also use the callout for NaNs since not(NaN < 1).
 	 */
-	vcmpnlt_uqps %ymm5, %ymm10, %ymm7
-	vsubps	%ymm1, %ymm5, %ymm9
-	vcmplt_oqps TinyRange+__svml_satanh_data_internal(%rip), %ymm10, %ymm4
-	vrcpps	%ymm1, %ymm11
-	vsubps	%ymm10, %ymm9, %ymm12
-	vandps	%ymm13, %ymm11, %ymm0
+	vcmpnlt_uqps %ymm5, %ymm3, %ymm14
+	vcmplt_oqps ATANHF_DATA(TinyRange)(%rip), %ymm3, %ymm15
 
-	/* No need to split sU when FMA is available */
-	vfnmadd213ps %ymm5, %ymm0, %ymm1
-	vmovaps	%ymm6, %ymm8
-	vfmadd213ps %ymm6, %ymm6, %ymm8
-	vfnmadd231ps %ymm0, %ymm12, %ymm1
+	/*
+	 * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
+	 * the upper part UHi being <= 12 bits long. Then we have
+	 * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
+	 */
+	vaddps	%ymm3, %ymm3, %ymm3
 
 	/*
 	 * Split V as well into upper 12 bits and lower part, so that we can get
 	 * a preliminary quotient estimate without rounding error.
 	 */
-	vandps	%ymm13, %ymm14, %ymm15
-	vmovmskps %ymm7, %edx
-	vsubps	%ymm15, %ymm14, %ymm7
+	vandps	%ymm4, %ymm3, %ymm4
+	vsubps	%ymm4, %ymm3, %ymm7
 
 	/* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
-	vmulps	%ymm15, %ymm0, %ymm10
+	vmulps	%ymm4, %ymm6, %ymm4
 
 	/* Compute D = E + E^2 */
 	vfmadd213ps %ymm1, %ymm1, %ymm1
 
-	/* Record the sign for eventual reincorporation. */
-	vandps	sSign+__svml_satanh_data_internal(%rip), %ymm6, %ymm3
+	/* Record the sign for eventual reincorporation.  */
+	vandnps	%ymm8, %ymm2, %ymm3
 
 	/* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
-	vorps	%ymm3, %ymm8, %ymm2
-	vmulps	%ymm7, %ymm0, %ymm8
+	vorps	%ymm3, %ymm0, %ymm13
+	vmulps	%ymm7, %ymm6, %ymm2
 
 	/*
 	 * Compute R * (VHi + VLo) * (1 + E + E^2)
 	 * = R *  (VHi + VLo) * (1 + D)
 	 * = QHi + (QHi * D + QLo + QLo * D)
 	 */
-	vmulps	%ymm1, %ymm10, %ymm9
-	vfmadd213ps %ymm8, %ymm8, %ymm1
-	vaddps	%ymm1, %ymm9, %ymm1
 
-	/* reduction: compute r, n */
-	vmovups	iBrkValue+__svml_satanh_data_internal(%rip), %ymm9
+	/*
+	 * If less precision is acceptable the `vmulps %ymm1, %ymm4, %ymm9;
+	 * vaddps %ymm1, %ymm9, %ymm1` can be replaced with
+	 * `vfmadd231ps %ymm1, %ymm4, %ymm4`.
+	 */
+	vmulps	%ymm1, %ymm4, %ymm6
+	vfmadd213ps %ymm2, %ymm2, %ymm1
+	vaddps	%ymm1, %ymm6, %ymm1
 
 	/*
 	 * Now finally accumulate the high and low parts of the
 	 * argument to log1p, H + L, with a final compensated summation.
 	 */
-	vaddps	%ymm1, %ymm10, %ymm12
-	vsubps	%ymm12, %ymm10, %ymm11
+	vaddps	%ymm1, %ymm4, %ymm2
+
+	/* reduction: compute r, n */
+	vmovups	ATANHF_DATA(iBrkValue)(%rip), %ymm9
 
 	/*
 	 * Now we feed into the log1p code, using H in place of _VARG1 and
 	 * later incorporating L into the reduced argument.
 	 * compute 1+x as high, low parts
 	 */
-	vmaxps	%ymm12, %ymm5, %ymm13
-	vminps	%ymm12, %ymm5, %ymm14
-	vaddps	%ymm11, %ymm1, %ymm0
-	vaddps	%ymm14, %ymm13, %ymm1
-	vpsubd	%ymm9, %ymm1, %ymm7
-	vsubps	%ymm1, %ymm13, %ymm15
-	vpsrad	$23, %ymm7, %ymm10
-	vpand	iOffExpoMask+__svml_satanh_data_internal(%rip), %ymm7, %ymm8
-	vaddps	%ymm15, %ymm14, %ymm13
-	vpslld	$23, %ymm10, %ymm11
-	vpaddd	%ymm9, %ymm8, %ymm15
-	vaddps	%ymm13, %ymm0, %ymm14
-	vcvtdq2ps %ymm10, %ymm0
-	vpsubd	%ymm11, %ymm5, %ymm12
+	vmaxps	%ymm2, %ymm5, %ymm0
+	vminps	%ymm2, %ymm5, %ymm6
+
+	/* This is needed for rounding (see `vaddps %ymm1, %ymm4, %ymm2`).  */
+	vsubps	%ymm2, %ymm4, %ymm2
+	vaddps	%ymm6, %ymm0, %ymm4
+	vpsubd	%ymm9, %ymm4, %ymm7
+	vsubps	%ymm4, %ymm0, %ymm4
+	vaddps	%ymm2, %ymm1, %ymm2
+	vmovaps	ATANHF_DATA(iOffExpoMask)(%rip), %ymm1
+
+	vandps	%ymm1, %ymm7, %ymm0
+	vaddps	%ymm4, %ymm6, %ymm4
+	vandnps	%ymm7, %ymm1, %ymm6
+	vmovups	ATANHF_DATA(sPoly+0)(%rip), %ymm1
+	vpaddd	%ymm9, %ymm0, %ymm0
+	vaddps	%ymm4, %ymm2, %ymm4
+	vpsubd	%ymm6, %ymm5, %ymm6
 
 	/* polynomial evaluation */
-	vsubps	%ymm5, %ymm15, %ymm5
-	vmulps	%ymm14, %ymm12, %ymm1
-	vaddps	%ymm5, %ymm1, %ymm5
-	vmovups	sPoly+224+__svml_satanh_data_internal(%rip), %ymm1
-	vfmadd213ps sPoly+192+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-	vfmadd213ps sPoly+160+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-	vfmadd213ps sPoly+128+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-	vfmadd213ps sPoly+96+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-	vfmadd213ps sPoly+64+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-	vfmadd213ps sPoly+32+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-	vfmadd213ps sPoly+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-	vmulps	%ymm1, %ymm5, %ymm7
-	vfmadd213ps %ymm5, %ymm5, %ymm7
+	vsubps	%ymm5, %ymm0, %ymm2
+	vfmadd231ps %ymm4, %ymm6, %ymm2
+	vfmadd213ps ATANHF_DATA(sPoly+32)(%rip), %ymm2, %ymm1
+	vfmadd213ps ATANHF_DATA(sPoly+64)(%rip), %ymm2, %ymm1
+	vfmadd213ps ATANHF_DATA(sPoly+96)(%rip), %ymm2, %ymm1
+	vfmadd213ps ATANHF_DATA(sPoly+128)(%rip), %ymm2, %ymm1
+	vfmadd213ps ATANHF_DATA(sPoly+160)(%rip), %ymm2, %ymm1
+	vfmadd213ps ATANHF_DATA(sPoly+192)(%rip), %ymm2, %ymm1
+	vfmadd213ps ATANHF_DATA(sPoly+224)(%rip), %ymm2, %ymm1
+
+	vmulps	%ymm1, %ymm2, %ymm1
+	vfmadd213ps %ymm2, %ymm2, %ymm1
 
 	/* final reconstruction */
-	vfmadd132ps sLn2+__svml_satanh_data_internal(%rip), %ymm7, %ymm0
+	vpsrad	$23, %ymm7, %ymm6
+	vcvtdq2ps %ymm6, %ymm2
+	vfmadd132ps ATANHF_DATA(sLn2)(%rip), %ymm1, %ymm2
 
 	/* Finally, halve the result and reincorporate the sign */
-	vxorps	sHalf+__svml_satanh_data_internal(%rip), %ymm3, %ymm3
-	vmulps	%ymm0, %ymm3, %ymm0
-	vblendvps %ymm4, %ymm2, %ymm0, %ymm0
+	vxorps	ATANHF_DATA(sHalf)(%rip), %ymm3, %ymm3
+	vmulps	%ymm2, %ymm3, %ymm2
+	vmovmskps %ymm14, %edx
 	testl	%edx, %edx
 
+	vblendvps %ymm15, %ymm13, %ymm2, %ymm0
 	/* Go to special inputs processing branch */
 	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx r12 r13 r14 r15 edx ymm0 ymm6
-
-	/* Restore registers
-	 * and exit the function
-	 */
-
-L(EXIT):
-	movq	%rbp, %rsp
-	popq	%rbp
-	cfi_def_cfa(7, 8)
-	cfi_restore(6)
+	# LOE rbx rdx r12 r13 r14 r15 ymm0
+	/* No registers to restore on fast path.  */
 	ret
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
 
-	/* Branch to process
-	 * special inputs
-	 */
 
+	/* Cold case. edx has 1s where there was a special value that
+	   needs to be handled by a atanhf call. Optimize for code size
+	   more so than speed here. */
 L(SPECIAL_VALUES_BRANCH):
-	vmovups	%ymm6, 32(%rsp)
-	vmovups	%ymm0, 64(%rsp)
-	# LOE rbx r12 r13 r14 r15 edx ymm0
-
-	xorl	%eax, %eax
-	# LOE rbx r12 r13 r14 r15 eax edx
-
-	vzeroupper
-	movq	%r12, 16(%rsp)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
-	movl	%eax, %r12d
-	movq	%r13, 8(%rsp)
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
-	movl	%edx, %r13d
-	movq	%r14, (%rsp)
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r15 r12d r13d
-
-	/* Range mask
-	 * bits check
+	# LOE rbx rdx r12 r13 r14 r15 ymm0 ymm8
+    /* Use r13 to save/restore the stack. This allows us to use rbp as
+       callee save register saving code size. */
+	pushq	%r13
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(r13, -16)
+	/* Need to callee save registers to preserve state across atanhf calls.
 	 */
+	pushq	%rbx
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(rbx, -24)
+	pushq	%rbp
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(rbp, -32)
+	movq	%rsp, %r13
+	cfi_def_cfa_register(r13)
 
-L(RANGEMASK_CHECK):
-	btl	%r12d, %r13d
+	/* Align stack and make room for 2x ymm vectors.  */
+	andq	$-32, %rsp
+	addq	$-64, %rsp
 
-	/* Call scalar math function */
-	jc	L(SCALAR_MATH_CALL)
-	# LOE rbx r15 r12d r13d
+	/* Save all already computed inputs.  */
+	vmovups	%ymm0, (%rsp)
+	/* Save original input (ymm8 unchanged up to this point).  */
+	vmovups	%ymm8, 32(%rsp)
 
-	/* Special inputs
-	 * processing loop
-	 */
+	vzeroupper
 
+	/* edx has 1s where there was a special value that needs to be handled
+	   by a atanhf call.  */
+	movl	%edx, %ebx
 L(SPECIAL_VALUES_LOOP):
-	incl	%r12d
-	cmpl	$8, %r12d
-
-	/* Check bits in range mask */
-	jl	L(RANGEMASK_CHECK)
-	# LOE rbx r15 r12d r13d
-
-	movq	16(%rsp), %r12
-	cfi_restore(12)
-	movq	8(%rsp), %r13
-	cfi_restore(13)
-	movq	(%rsp), %r14
-	cfi_restore(14)
-	vmovups	64(%rsp), %ymm0
-
-	/* Go to exit */
-	jmp	L(EXIT)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r12 r13 r14 r15 ymm0
-
-	/* Scalar math fucntion call
-	 * to process special input
-	 */
-
-L(SCALAR_MATH_CALL):
-	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	# LOE rbx rbp r12 r13 r14 r15
+	/* use rbp as index for special value that is saved across calls to
+	   atanhf. We technically don't need a callee save register here as offset
+	   to rsp is always [0, 28] so we can restore rsp by realigning to 64.
+	   Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
+	   in the loop. Realigning also costs more code size.  */
+	xorl	%ebp, %ebp
+	tzcntl	%ebx, %ebp
+
+	/* Scalar math function call to process special input.  */
+	movss	32(%rsp, %rbp, 4), %xmm0
 	call	atanhf@PLT
-	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	/* No good way to avoid the store-forwarding fault this will cause on
+	   return. `lfence` avoids the SF fault but at greater cost as it
+	   serializes stack/callee save restoration.  */
+	movss	%xmm0, (%rsp, %rbp, 4)
+
+	blsrl   %ebx, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+	# LOE r12 r13 r14 r15
+
 
-	/* Process special inputs in loop */
-	jmp	L(SPECIAL_VALUES_LOOP)
-	# LOE rbx r15 r12d r13d
+	/* Reload the result vector; every lane has been written to (%rsp).  */
+	vmovups	(%rsp), %ymm0
+	/* Restore rsp from r13 (saved before the stack was realigned).  */
+	movq	%r13, %rsp
+	cfi_def_cfa_register(rsp)
+	/* Pop callee save registers in reverse push order (rbp, rbx, r13).  */
+	popq	%rbp
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(rbp)
+	popq	%rbx
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(rbx)
+	popq	%r13
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(r13)
+	ret
 END(_ZGVdN8v_atanhf_avx2)
 
 	.section .rodata, "a"
 	.align	32
-
 #ifdef __svml_satanh_data_internal_typedef
 typedef unsigned int VUINT32;
-typedef struct {
+typedef struct{
 	__declspec(align(32)) VUINT32 SgnMask[8][1];
 	__declspec(align(32)) VUINT32 sOne[8][1];
-	__declspec(align(32)) VUINT32 sPoly[8][8][1];
-	__declspec(align(32)) VUINT32 iBrkValue[8][1];
-	__declspec(align(32)) VUINT32 iOffExpoMask[8][1];
-	__declspec(align(32)) VUINT32 sHalf[8][1];
-	__declspec(align(32)) VUINT32 sSign[8][1];
 	__declspec(align(32)) VUINT32 sTopMask12[8][1];
 	__declspec(align(32)) VUINT32 TinyRange[8][1];
+	__declspec(align(32)) VUINT32 iBrkValue[8][1];
+	__declspec(align(32)) VUINT32 iOffExpoMask[8][1];
+	__declspec(align(32)) VUINT32 sPoly[8][8][1];
 	__declspec(align(32)) VUINT32 sLn2[8][1];
+	__declspec(align(32)) VUINT32 sHalf[8][1];
 } __svml_satanh_data_internal;
 #endif
 __svml_satanh_data_internal:
 	/* SgnMask */
-	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
 	/* sOne = SP 1.0 */
 	.align	32
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	/* sPoly[] = SP polynomial */
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	/* sTopMask12 */
+	.align	32
+	.long	0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
+	.long	0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
+	/* TinyRange */
 	.align	32
-	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
-	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
-	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
-	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
-	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
-	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
-	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
-	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
+	.long	0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
+	.long	0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
 	/* iBrkValue = SP 2/3 */
 	.align	32
-	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
+	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
+	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
 	/* iOffExpoMask = SP significand mask */
 	.align	32
-	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
-	/* sHalf */
-	.align	32
-	.long	0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
-	/* sSign */
-	.align	32
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
-	/* sTopMask12 */
-	.align	32
-	.long	0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
-	/* TinyRange */
+	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
+	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
+	/* sPoly[] = SP polynomial */
 	.align	32
-	.long	0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
+	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed
+	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
+	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3
+	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
+	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12
+	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
+	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37
+	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
+	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190
+	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
+	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e
+	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
+	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94
+	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
 	/* sLn2 = SP ln(2) */
 	.align	32
-	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
+	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
+	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
+	/* sHalf */
+	.align	32
+	.long	0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
+	.long	0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
 	.align	32
 	.type	__svml_satanh_data_internal, @object
 	.size	__svml_satanh_data_internal, .-__svml_satanh_data_internal
-- 
2.34.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH v3 3/7] x86: Improve svml_s_atanhf4_core_sse4.S
  2022-06-09 16:58 ` [PATCH v3 " Noah Goldstein
  2022-06-09 16:58   ` [PATCH v3 2/7] x86: Improvement svml_s_atanhf8_core_avx2.S Noah Goldstein
@ 2022-06-09 16:58   ` Noah Goldstein
  2022-06-09 17:07     ` H.J. Lu
  2022-06-09 16:58   ` [PATCH v3 4/7] x86: Optimize svml_s_tanhf16_core_avx512.S Noah Goldstein
                     ` (4 subsequent siblings)
  6 siblings, 1 reply; 48+ messages in thread
From: Noah Goldstein @ 2022-06-09 16:58 UTC (permalink / raw)
  To: libc-alpha

Improvements are:
    1. Reduce code size (-62 bytes).
    2. Remove redundant move instructions.
    3. Slightly improve instruction selection/scheduling where
       possible.
    4. Prefer registers which get short instruction encoding.
    5. Reduce rodata usage (-16 bytes).

The throughput improvement is not significant as the port 0 bottleneck
is unavoidable.

       Function, New Time, Old Time, New / Old
_ZGVbN4v_atanhf,    8.821,    8.903,     0.991
---
 .../fpu/multiarch/svml_s_atanhf4_core_sse4.S  | 378 ++++++++----------
 1 file changed, 169 insertions(+), 209 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
index 2d3ad2617f..37200b3601 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
@@ -30,96 +30,80 @@
  *
  */
 
-/* Offsets for data table __svml_satanh_data_internal
- */
-#define SgnMask				0
-#define sOne				16
-#define sPoly				32
-#define iBrkValue			160
-#define iOffExpoMask			176
-#define sHalf				192
-#define sSign				208
-#define sTopMask12			224
-#define TinyRange			240
-#define sLn2				256
+/* Offsets for data table __svml_satanh_data_internal. Ordered
+   by use in the function. On cold-starts this might help the
+   prefetcher. Possibly a better idea is to interleave start/end so
+   that the prefetcher is less likely to detect a stream and pull
+   irrelevant lines into cache.  */
+#define sOne				0
+#define SgnMask				16
+#define sTopMask12			32
+#define iBrkValue			48
+#define iOffExpoMask			64
+#define sPoly				80
+#define sLn2				208
+#define TinyRange			224
 
 #include <sysdep.h>
+#define ATANHF_DATA(x)			((x)+__svml_satanh_data_internal)
 
 	.section .text.sse4, "ax", @progbits
 ENTRY(_ZGVbN4v_atanhf_sse4)
-	subq	$72, %rsp
-	cfi_def_cfa_offset(80)
 	movaps	%xmm0, %xmm5
 
 	/* Load constants including One = 1 */
-	movups	sOne+__svml_satanh_data_internal(%rip), %xmm4
+	movups	ATANHF_DATA(sOne)(%rip), %xmm4
 	movaps	%xmm5, %xmm3
 
 	/* Strip off the sign, so treat X as positive until right at the end */
-	movups	SgnMask+__svml_satanh_data_internal(%rip), %xmm7
-	movaps	%xmm4, %xmm8
-	andps	%xmm5, %xmm7
+	movups	ATANHF_DATA(SgnMask)(%rip), %xmm1
+	movaps	%xmm4, %xmm2
+	andps	%xmm1, %xmm0
 	movaps	%xmm4, %xmm10
-	movups	sTopMask12+__svml_satanh_data_internal(%rip), %xmm11
+	movups	ATANHF_DATA(sTopMask12)(%rip), %xmm11
 	movaps	%xmm4, %xmm14
 	movaps	%xmm11, %xmm9
 
+
 	/*
 	 * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
 	 * the upper part UHi being <= 12 bits long. Then we have
 	 * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
 	 */
-	movaps	%xmm7, %xmm12
+	movaps	%xmm0, %xmm6
+	mulps	%xmm5, %xmm3
+	subps	%xmm0, %xmm2
+	addps	%xmm0, %xmm6
+	subps	%xmm2, %xmm10
+	addps	%xmm5, %xmm3
+	subps	%xmm0, %xmm10
+	andps	%xmm2, %xmm9
+
 
 	/*
 	 * Check whether |X| < 1, in which case we use the main function.
 	 * Otherwise set the rangemask so that the callout will get used.
 	 * Note that this will also use the callout for NaNs since not(NaN < 1).
 	 */
-	movaps	%xmm7, %xmm6
-	movaps	%xmm7, %xmm2
-	cmpnltps %xmm4, %xmm6
-	cmpltps	TinyRange+__svml_satanh_data_internal(%rip), %xmm2
-	mulps	%xmm5, %xmm3
-	subps	%xmm7, %xmm8
-	addps	%xmm7, %xmm12
-	movmskps %xmm6, %edx
-	subps	%xmm8, %xmm10
-	addps	%xmm5, %xmm3
-	subps	%xmm7, %xmm10
-	andps	%xmm8, %xmm9
+	rcpps	%xmm9, %xmm7
+	subps	%xmm9, %xmm2
+	andps	%xmm11, %xmm7
 
-	/*
-	 * Now we feed into the log1p code, using H in place of _VARG1 and
-	 * later incorporating L into the reduced argument.
-	 * compute 1+x as high, low parts
-	 */
-	movaps	%xmm4, %xmm7
-
-	/*
-	 * Now compute R = 1/(UHi+ULo) * (1 - E) and the error term E
-	 * The first FMR is exact (we force R to 12 bits just in case it
-	 * isn't already, to make absolutely sure), and since E is ~ 2^-12,
-	 * the rounding error in the other one is acceptable.
-	 */
-	rcpps	%xmm9, %xmm15
-	subps	%xmm9, %xmm8
-	andps	%xmm11, %xmm15
 
 	/*
 	 * Split V as well into upper 12 bits and lower part, so that we can get
 	 * a preliminary quotient estimate without rounding error.
 	 */
-	andps	%xmm12, %xmm11
-	mulps	%xmm15, %xmm9
-	addps	%xmm8, %xmm10
-	subps	%xmm11, %xmm12
+	andps	%xmm6, %xmm11
+	mulps	%xmm7, %xmm9
+	addps	%xmm2, %xmm10
+	subps	%xmm11, %xmm6
 
 	/* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
-	mulps	%xmm15, %xmm11
-	mulps	%xmm15, %xmm10
+	mulps	%xmm7, %xmm11
+	mulps	%xmm7, %xmm10
 	subps	%xmm9, %xmm14
-	mulps	%xmm12, %xmm15
+	mulps	%xmm6, %xmm7
 	subps	%xmm10, %xmm14
 
 	/* Compute D = E + E^2 */
@@ -127,8 +111,8 @@ ENTRY(_ZGVbN4v_atanhf_sse4)
 	movaps	%xmm4, %xmm8
 	mulps	%xmm14, %xmm13
 
-	/* reduction: compute r, n */
-	movdqu	iBrkValue+__svml_satanh_data_internal(%rip), %xmm9
+	/* reduction: compute r,n */
+	movdqu	ATANHF_DATA(iBrkValue)(%rip), %xmm9
 	addps	%xmm13, %xmm14
 
 	/*
@@ -136,168 +120,149 @@ ENTRY(_ZGVbN4v_atanhf_sse4)
 	 * = R *  (VHi + VLo) * (1 + D)
 	 * = QHi + (QHi * D + QLo + QLo * D)
 	 */
-	movaps	%xmm14, %xmm0
-	mulps	%xmm15, %xmm14
-	mulps	%xmm11, %xmm0
-	addps	%xmm14, %xmm15
-	movdqu	iOffExpoMask+__svml_satanh_data_internal(%rip), %xmm12
+	movaps	%xmm14, %xmm2
+	mulps	%xmm7, %xmm14
+	mulps	%xmm11, %xmm2
+	addps	%xmm14, %xmm7
+	movdqu	ATANHF_DATA(iOffExpoMask)(%rip), %xmm12
 	movaps	%xmm4, %xmm14
 
 	/* Record the sign for eventual reincorporation. */
-	movups	sSign+__svml_satanh_data_internal(%rip), %xmm1
-	addps	%xmm15, %xmm0
+	addps	%xmm7, %xmm2
+
 
 	/*
 	 * Now finally accumulate the high and low parts of the
 	 * argument to log1p, H + L, with a final compensated summation.
 	 */
-	movaps	%xmm0, %xmm6
-	andps	%xmm5, %xmm1
-
+	movaps	%xmm2, %xmm6
+	andnps	%xmm5, %xmm1
+	movaps	%xmm4, %xmm7
 	/* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
-	orps	%xmm1, %xmm3
 	addps	%xmm11, %xmm6
 	maxps	%xmm6, %xmm7
 	minps	%xmm6, %xmm8
 	subps	%xmm6, %xmm11
 	movaps	%xmm7, %xmm10
-	andps	%xmm2, %xmm3
 	addps	%xmm8, %xmm10
-	addps	%xmm11, %xmm0
+	addps	%xmm11, %xmm2
 	subps	%xmm10, %xmm7
 	psubd	%xmm9, %xmm10
-	addps	%xmm7, %xmm8
+	addps	%xmm8, %xmm7
 	pand	%xmm10, %xmm12
 	psrad	$23, %xmm10
 	cvtdq2ps %xmm10, %xmm13
-	addps	%xmm8, %xmm0
+	addps	%xmm7, %xmm2
 
 	/* final reconstruction */
-	mulps	sLn2+__svml_satanh_data_internal(%rip), %xmm13
 	pslld	$23, %xmm10
 	paddd	%xmm9, %xmm12
 	psubd	%xmm10, %xmm14
 
 	/* polynomial evaluation */
 	subps	%xmm4, %xmm12
-	mulps	%xmm0, %xmm14
-	movups	sPoly+112+__svml_satanh_data_internal(%rip), %xmm0
-	addps	%xmm12, %xmm14
-	mulps	%xmm14, %xmm0
+	mulps	%xmm14, %xmm2
+	movups	ATANHF_DATA(sPoly+0)(%rip), %xmm7
+	addps	%xmm12, %xmm2
+	mulps	%xmm2, %xmm7
+
 
 	/* Finally, halve the result and reincorporate the sign */
-	movups	sHalf+__svml_satanh_data_internal(%rip), %xmm4
-	pxor	%xmm1, %xmm4
-	addps	sPoly+96+__svml_satanh_data_internal(%rip), %xmm0
-	mulps	%xmm14, %xmm0
-	addps	sPoly+80+__svml_satanh_data_internal(%rip), %xmm0
-	mulps	%xmm14, %xmm0
-	addps	sPoly+64+__svml_satanh_data_internal(%rip), %xmm0
-	mulps	%xmm14, %xmm0
-	addps	sPoly+48+__svml_satanh_data_internal(%rip), %xmm0
-	mulps	%xmm14, %xmm0
-	addps	sPoly+32+__svml_satanh_data_internal(%rip), %xmm0
-	mulps	%xmm14, %xmm0
-	addps	sPoly+16+__svml_satanh_data_internal(%rip), %xmm0
-	mulps	%xmm14, %xmm0
-	addps	sPoly+__svml_satanh_data_internal(%rip), %xmm0
-	mulps	%xmm14, %xmm0
-	mulps	%xmm14, %xmm0
-	addps	%xmm0, %xmm14
-	movaps	%xmm2, %xmm0
-	addps	%xmm13, %xmm14
-	mulps	%xmm14, %xmm4
-	andnps	%xmm4, %xmm0
-	orps	%xmm3, %xmm0
-	testl	%edx, %edx
+	addps	ATANHF_DATA(sPoly+16)(%rip), %xmm7
+	mulps	%xmm2, %xmm7
+	addps	ATANHF_DATA(sPoly+32)(%rip), %xmm7
+	mulps	%xmm2, %xmm7
+	addps	ATANHF_DATA(sPoly+48)(%rip), %xmm7
+	mulps	%xmm2, %xmm7
+	addps	ATANHF_DATA(sPoly+64)(%rip), %xmm7
+	mulps	%xmm2, %xmm7
+	addps	ATANHF_DATA(sPoly+80)(%rip), %xmm7
+	mulps	%xmm2, %xmm7
+	addps	ATANHF_DATA(sPoly+96)(%rip), %xmm7
+	mulps	%xmm2, %xmm7
+	movaps	ATANHF_DATA(sPoly+112)(%rip), %xmm6
+	addps	%xmm6, %xmm7
+	mulps	%xmm2, %xmm7
+	mulps	%xmm2, %xmm7
+	mulps	ATANHF_DATA(sLn2)(%rip), %xmm13
+	/* We can build `sHalf` with `sPoly & sOne`.  */
+	andps	%xmm4, %xmm6
+	orps	%xmm1, %xmm3
+	xorps	%xmm6, %xmm1
 
-	/* Go to special inputs processing branch */
-	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm5
+	addps	%xmm2, %xmm7
+	addps	%xmm13, %xmm7
+	mulps	%xmm7, %xmm1
 
-	/* Restore registers
-	 * and exit the function
-	 */
+	/* Finish check of NaNs.  */
+	cmpleps	%xmm0, %xmm4
+	movmskps %xmm4, %edx
+	cmpltps	ATANHF_DATA(TinyRange)(%rip), %xmm0
 
-L(EXIT):
-	addq	$72, %rsp
-	cfi_def_cfa_offset(8)
+	andps	%xmm0, %xmm3
+	andnps	%xmm1, %xmm0
+	orps	%xmm3, %xmm0
+
+	testl	%edx, %edx
+	/* Go to special inputs processing branch.  */
+	jne	L(SPECIAL_VALUES_BRANCH)
+	# LOE rbx rbp r12 r13 r14 r15 xmm0
+	/* No registers to restore on fast path.  */
 	ret
-	cfi_def_cfa_offset(80)
 
-	/* Branch to process
-	 * special inputs
-	 */
 
+	/* Cold case. edx has 1s where there was a special value that
+	   needs to be handled by a atanhf call. Optimize for code size
+	   more so than speed here. */
 L(SPECIAL_VALUES_BRANCH):
-	movups	%xmm5, 32(%rsp)
-	movups	%xmm0, 48(%rsp)
-	# LOE rbx rbp r12 r13 r14 r15 edx
-
-	xorl	%eax, %eax
-	movq	%r12, 16(%rsp)
-	cfi_offset(12, -64)
-	movl	%eax, %r12d
-	movq	%r13, 8(%rsp)
-	cfi_offset(13, -72)
-	movl	%edx, %r13d
-	movq	%r14, (%rsp)
-	cfi_offset(14, -80)
-	# LOE rbx rbp r15 r12d r13d
-
-	/* Range mask
-	 * bits check
-	 */
-
-L(RANGEMASK_CHECK):
-	btl	%r12d, %r13d
-
-	/* Call scalar math function */
-	jc	L(SCALAR_MATH_CALL)
-	# LOE rbx rbp r15 r12d r13d
-
-	/* Special inputs
-	 * processing loop
-	 */
-
+	# LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm5
+	/* Stack coming in 16-byte aligned. Set 8-byte misaligned so on
+       call entry will be 16-byte aligned. */
+	subq	$56, %rsp
+	cfi_def_cfa_offset(64)
+	movups	%xmm0, 24(%rsp)
+	movups	%xmm5, 40(%rsp)
+
+	/* Use rbx/rbp for callee save registers as they get short
+       encoding for many instructions (as compared with r12/r13). */
+	movq	%rbx, (%rsp)
+	cfi_offset(rbx, -64)
+	movq	%rbp, 8(%rsp)
+	cfi_offset(rbp, -56)
+	/* edx has 1s where there was a special value that needs to be handled
+	   by an atanhf call.  */
+	movl	%edx, %ebx
 L(SPECIAL_VALUES_LOOP):
-	incl	%r12d
-	cmpl	$4, %r12d
-
-	/* Check bits in range mask */
-	jl	L(RANGEMASK_CHECK)
-	# LOE rbx rbp r15 r12d r13d
-
-	movq	16(%rsp), %r12
-	cfi_restore(12)
-	movq	8(%rsp), %r13
-	cfi_restore(13)
-	movq	(%rsp), %r14
-	cfi_restore(14)
-	movups	48(%rsp), %xmm0
-
-	/* Go to exit */
-	jmp	L(EXIT)
-	cfi_offset(12, -64)
-	cfi_offset(13, -72)
-	cfi_offset(14, -80)
-	# LOE rbx rbp r12 r13 r14 r15 xmm0
-
-	/* Scalar math fucntion call
-	 * to process special input
-	 */
-
-L(SCALAR_MATH_CALL):
-	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	# LOE rbx rbp r12 r13 r14 r15
+	/* Use rbp as index for special value that is saved across calls to
+	   atanhf. We technically don't need a callee save register here as offset
+	   to rsp is always [0, 12] so we can restore rsp by realigning to 64.
+	   Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
+	   in the loop.  */
+	xorl	%ebp, %ebp
+	bsfl	%ebx, %ebp
+
+	/* Scalar math function call to process special input.  */
+	movss	40(%rsp, %rbp, 4), %xmm0
 	call	atanhf@PLT
-	# LOE rbx rbp r14 r15 r12d r13d xmm0
-
-	movss	%xmm0, 48(%rsp, %r14, 4)
-
-	/* Process special inputs in loop */
-	jmp	L(SPECIAL_VALUES_LOOP)
-	# LOE rbx rbp r15 r12d r13d
+	/* No good way to avoid the store-forwarding fault this will cause on
+	   return. `lfence` avoids the SF fault but at greater cost as it
+	   serializes stack/callee save restoration.  */
+	movss	%xmm0, 24(%rsp, %rbp, 4)
+
+	leal	-1(%rbx), %eax
+	andl	%eax, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+	# LOE r12 r13 r14 r15
+	/* All results have been written to 24(%rsp).  */
+	movups	24(%rsp), %xmm0
+	movq	(%rsp), %rbx
+	cfi_restore(rbx)
+	movq	8(%rsp), %rbp
+	cfi_restore(rbp)
+	addq	$56, %rsp
+	cfi_def_cfa_offset(8)
+	ret
 END(_ZGVbN4v_atanhf_sse4)
 
 	.section .rodata, "a"
@@ -305,56 +270,51 @@ END(_ZGVbN4v_atanhf_sse4)
 
 #ifdef __svml_satanh_data_internal_typedef
 typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(16)) VUINT32 SgnMask[4][1];
+typedef struct{
 	__declspec(align(16)) VUINT32 sOne[4][1];
-	__declspec(align(16)) VUINT32 sPoly[8][4][1];
+	__declspec(align(16)) VUINT32 SgnMask[4][1];
+	__declspec(align(16)) VUINT32 sTopMask12[4][1];
 	__declspec(align(16)) VUINT32 iBrkValue[4][1];
 	__declspec(align(16)) VUINT32 iOffExpoMask[4][1];
-	__declspec(align(16)) VUINT32 sHalf[4][1];
-	__declspec(align(16)) VUINT32 sSign[4][1];
-	__declspec(align(16)) VUINT32 sTopMask12[4][1];
-	__declspec(align(16)) VUINT32 TinyRange[4][1];
+	__declspec(align(16)) VUINT32 sPoly[8][4][1];
 	__declspec(align(16)) VUINT32 sLn2[4][1];
+	__declspec(align(16)) VUINT32 TinyRange[4][1];
 } __svml_satanh_data_internal;
 #endif
+
 __svml_satanh_data_internal:
-	/* SgnMask */
-	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
 	/* sOne = SP 1.0 */
 	.align	16
 	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	/* sPoly[] = SP polynomial */
+	/* SgnMask */
+	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+	/* sTopMask12 */
 	.align	16
-	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
-	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
-	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
-	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
-	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
-	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
-	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
-	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
+	.long	0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
 	/* iBrkValue = SP 2/3 */
 	.align	16
 	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
-	/* iOffExpoMask = SP significand mask */
+	/* iOffExpoMask = SP significand mask */
 	.align	16
 	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
-	/* sHalf */
-	.align	16
-	.long	0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
-	/* sSign */
+
+	/* sPoly[] = SP polynomial */
 	.align	16
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000
-	/* sTopMask12 */
+	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
+	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
+	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
+	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
+	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
+	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
+	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
+
+	/* sLn2 = SP ln(2) */
 	.align	16
-	.long	0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
+	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
 	/* TinyRange */
 	.align	16
 	.long	0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
-	/* sLn2 = SP ln(2) */
-	.align	16
-	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
 	.align	16
 	.type	__svml_satanh_data_internal, @object
 	.size	__svml_satanh_data_internal, .-__svml_satanh_data_internal
-- 
2.34.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH v3 4/7] x86: Optimize svml_s_tanhf16_core_avx512.S
  2022-06-09 16:58 ` [PATCH v3 " Noah Goldstein
  2022-06-09 16:58   ` [PATCH v3 2/7] x86: Improvement svml_s_atanhf8_core_avx2.S Noah Goldstein
  2022-06-09 16:58   ` [PATCH v3 3/7] x86: Improve svml_s_atanhf4_core_sse4.S Noah Goldstein
@ 2022-06-09 16:58   ` Noah Goldstein
  2022-06-09 17:07     ` H.J. Lu
  2022-06-09 16:58   ` [PATCH v3 5/7] x86: Add data file that can be shared by tanhf-avx2 and tanhf-sse4 Noah Goldstein
                     ` (3 subsequent siblings)
  6 siblings, 1 reply; 48+ messages in thread
From: Noah Goldstein @ 2022-06-09 16:58 UTC (permalink / raw)
  To: libc-alpha

Optimizations are:
    1. Reduce code size (-67 bytes).
    2. Remove redundant move instructions.
    3. Slightly improve instruction selection/scheduling where
       possible.
    4. Reduce rodata usage (-448 bytes).

Result is roughly a 14% speedup:

       Function, New Time, Old Time, New / Old
_ZGVeN16v_tanhf,    0.649,    0.752,     0.863
---
 .../multiarch/svml_s_tanhf16_core_avx512.S    | 527 ++++++++++--------
 1 file changed, 287 insertions(+), 240 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
index 5b1f9f151c..7edc74a116 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
@@ -70,310 +70,357 @@
  *
  */
 
-/* Offsets for data table __svml_stanh_data_internal
+/* Offsets for data table __svml_stanh_data_internal_avx512. Ordered
+   by use in the function. On cold-starts this might help the
+   prefetcher. Possibly a better idea is to interleave start/end so
+   that the prefetcher is less likely to detect a stream and pull
+   irrelevant lines into cache.  */
+
+/* Offsets for data table __svml_stanh_data_internal. 4 bytes each.
  */
-#define _sC				0
-#define _sP0				128
-#define _sP2				256
-#define _sP3				384
-#define _sP4				512
-#define _sP5				640
-#define _sP6				768
-#define _sP7				896
-#define _iExpMantMask_UISA		1024
-#define _iMinIdxOfsMask_UISA		1088
-#define _iMaxIdxMask_UISA		1152
-#define _sSignMask			1216
-#define _sAbsMask			1280
-#define _iExpMantMask			1344
-#define _iExpMask			1408
-#define _iMinIdxOfsMask			1472
-#define _iMaxIdxMask			1536
+#define _iExpMantMask_UISA		0
+#define _iMinIdxOfsMask_UISA		4
+#define _iMaxIdxMask_UISA		8
+#define _iExpMask			12
+
+/* Offsets for data table __svml_stanh_data_internal_al64. 64 bytes
+   each.  */
+#define _sC_lo				0
+#define _sC_hi				64
+#define _sP7_lo				128
+#define _sP7_hi				192
+#define _sSignMask			256
+#define _sP6_lo				320
+#define _sP6_hi				384
+#define _sP5_lo				448
+#define _sP5_hi				512
+#define _sP4_lo				576
+#define _sP4_hi				640
+#define _sP3_lo				704
+#define _sP3_hi				768
+#define _sP2_lo				832
+#define _sP2_hi				896
+#define _sP0_lo				960
+#define _sP0_hi				1024
 
 #include <sysdep.h>
+#define TANHF_DATA(x)			((x)+__svml_stanh_data_internal_al64)
+#define TANHF_DATA_UNALIGNED(x)		((x)+__svml_stanh_data_internal)
 
 	.section .text.exex512, "ax", @progbits
 ENTRY(_ZGVeN16v_tanhf_skx)
-	pushq	%rbp
-	cfi_def_cfa_offset(16)
-	movq	%rsp, %rbp
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
-	andq	$-64, %rsp
-	subq	$192, %rsp
-	vmovaps	%zmm0, %zmm1
-	vmovups	__svml_stanh_data_internal(%rip), %zmm9
-	vmovups	_sP6+__svml_stanh_data_internal(%rip), %zmm11
-	vmovups	_sP5+__svml_stanh_data_internal(%rip), %zmm12
-	vmovups	_sP4+__svml_stanh_data_internal(%rip), %zmm13
-	vmovups	_sP3+__svml_stanh_data_internal(%rip), %zmm14
-	vmovups	_sP2+__svml_stanh_data_internal(%rip), %zmm15
-	vpternlogd $255, %zmm2, %zmm2, %zmm2
-	vandps	_sAbsMask+__svml_stanh_data_internal(%rip), %zmm1, %zmm8
-	vandps	_sSignMask+__svml_stanh_data_internal(%rip), %zmm1, %zmm0
-
 	/* Here huge arguments, INF and NaNs are filtered out to callout. */
-	vpandd	_iExpMantMask_UISA+__svml_stanh_data_internal(%rip), %zmm1, %zmm3
-	vpsubd	_iMinIdxOfsMask_UISA+__svml_stanh_data_internal(%rip), %zmm3, %zmm4
-	vpcmpd	$2, _iExpMask+__svml_stanh_data_internal(%rip), %zmm3, %k1
+	vpandd	TANHF_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
+	vpsubd	TANHF_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
 
-	/*
-	 *  small table specific variables *
-	 *  Constant loading
-	 */
-	vpxord	%zmm5, %zmm5, %zmm5
-
-	/* if VMIN, VMAX is defined for I type */
-	vpmaxsd	%zmm5, %zmm4, %zmm6
-	vpminsd	_iMaxIdxMask_UISA+__svml_stanh_data_internal(%rip), %zmm6, %zmm7
-	vpsrld	$21, %zmm7, %zmm10
-	vmovups	_sP7+__svml_stanh_data_internal(%rip), %zmm4
-	vpermt2ps _sC+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm9
-	vpermt2ps _sP6+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm11
-	vpermt2ps _sP7+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm4
-	vpermt2ps _sP5+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm12
-	vpermt2ps _sP4+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm13
-	vpermt2ps _sP3+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm14
-	vpermt2ps _sP2+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm15
-	vpandnd	%zmm3, %zmm3, %zmm2{%k1}
-	vptestmd %zmm2, %zmm2, %k0
-	vmovups	_sP0+__svml_stanh_data_internal(%rip), %zmm3
-	vsubps	{rn-sae}, %zmm9, %zmm8, %zmm2
-	kmovw	%k0, %edx
-	vfmadd213ps {rn-sae}, %zmm11, %zmm2, %zmm4
-	vpermt2ps _sP0+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm3
-	vfmadd213ps {rn-sae}, %zmm12, %zmm2, %zmm4
-	vfmadd213ps {rn-sae}, %zmm13, %zmm2, %zmm4
-	vfmadd213ps {rn-sae}, %zmm14, %zmm2, %zmm4
-	vfmadd213ps {rn-sae}, %zmm15, %zmm2, %zmm4
-	vfmadd213ps {rn-sae}, %zmm3, %zmm2, %zmm4
-	vorps	%zmm0, %zmm4, %zmm0
-	testl	%edx, %edx
+	/* Clamp arguments to the range [0, 0x03e00000] in zmm3.  */
+	vpxord	%zmm3, %zmm3, %zmm3
+	vpmaxsd	%zmm3, %zmm2, %zmm3
+	vpminsd	TANHF_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
 
-	/* Go to special inputs processing branch */
-	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm1
+	/* Setup permute indices in zmm3.  */
+	vpsrld	$21, %zmm3, %zmm3
 
-	/* Restore registers
-	 * and exit the function
-	 */
+	/* Set the k1 bit of every lane that is a special case.  */
+	vpcmpd	$6, TANHF_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
 
-L(EXIT):
-	movq	%rbp, %rsp
-	popq	%rbp
-	cfi_def_cfa(7, 8)
-	cfi_restore(6)
-	ret
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
+	vmovaps	TANHF_DATA(_sC_lo)(%rip), %zmm5
+	vpermt2ps TANHF_DATA(_sC_hi)(%rip), %zmm3, %zmm5
 
-	/* Branch to process
-	 * special inputs
-	 */
+	vmovaps	TANHF_DATA(_sP7_lo)(%rip), %zmm2
+	vpermt2ps TANHF_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
 
-L(SPECIAL_VALUES_BRANCH):
-	vmovups	%zmm1, 64(%rsp)
-	vmovups	%zmm0, 128(%rsp)
-	# LOE rbx r12 r13 r14 r15 edx zmm0
+	/* Store absolute values of inputs in zmm1.  */
+	vmovaps	TANHF_DATA(_sSignMask)(%rip), %zmm4
+	vandnps	%zmm0, %zmm4, %zmm1
+	vsubps	{rn-sae}, %zmm5, %zmm1, %zmm1
 
-	xorl	%eax, %eax
-	# LOE rbx r12 r13 r14 r15 eax edx
+	vmovaps	TANHF_DATA(_sP6_lo)(%rip), %zmm5
+	vpermt2ps TANHF_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
 
-	vzeroupper
-	movq	%r12, 16(%rsp)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
-	movl	%eax, %r12d
-	movq	%r13, 8(%rsp)
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-	movl	%edx, %r13d
-	movq	%r14, (%rsp)
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r15 r12d r13d
-
-	/* Range mask
-	 * bits check
-	 */
+	vmovaps	TANHF_DATA(_sP5_lo)(%rip), %zmm6
+	vpermt2ps TANHF_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
+
+	vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm2
+	vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm2
 
-L(RANGEMASK_CHECK):
-	btl	%r12d, %r13d
+	vmovaps	TANHF_DATA(_sP4_lo)(%rip), %zmm7
+	vpermt2ps TANHF_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
 
-	/* Call scalar math function */
-	jc	L(SCALAR_MATH_CALL)
-	# LOE rbx r15 r12d r13d
+	vmovaps	TANHF_DATA(_sP3_lo)(%rip), %zmm8
+	vpermt2ps TANHF_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
 
-	/* Special inputs
-	 * processing loop
+	vfmadd213ps {rn-sae}, %zmm7, %zmm1, %zmm2
+	vfmadd213ps {rn-sae}, %zmm8, %zmm1, %zmm2
+
+	vmovaps	TANHF_DATA(_sP2_lo)(%rip), %zmm9
+	vpermt2ps TANHF_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
+
+	vmovaps	TANHF_DATA(_sP0_lo)(%rip), %zmm10
+	vpermt2ps TANHF_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
+
+	vfmadd213ps {rn-sae}, %zmm9, %zmm1, %zmm2
+	vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm2
+
+	kmovw	%k1, %edx
+	testl	%edx, %edx
+
+	/* Go to special inputs processing branch.  */
+	jne	L(SPECIAL_VALUES_BRANCH)
+	# LOE rbx r12 r13 r14 r15 zmm0 zmm2 zmm4
+	/* Wait until after the branch to write over zmm0.  */
+	vpternlogd $0xec, %zmm4, %zmm2, %zmm0
+
+	/* No stack restoration on the fastpath.  */
+	ret
+
+	/* Cold case. edx has 1s where there was a special value that
+	   needs to be handled by a tanhf call. Optimize for code size
+	   more so than speed here. */
+L(SPECIAL_VALUES_BRANCH):
+	# LOE rbx rdx r12 r13 r14 r15 zmm0 zmm2 zmm4
+	/* Use r13 to save/restore the stack. This allows us to use rbp as
+	   a callee-saved register, saving code size.  */
+	pushq	%r13
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(r13, -16)
+	/* Need callee-saved registers to preserve state across tanhf calls.
 	 */
+	pushq	%rbx
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(rbx, -24)
+	pushq	%rbp
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(rbp, -32)
+	movq	%rsp, %r13
+	cfi_def_cfa_register(r13)
+
+	/* Align stack and make room for 2x zmm vectors.  */
+	andq	$-64, %rsp
+	addq	$-128, %rsp
+
+	/* Save original input (zmm0 unchanged up to this point).  */
+	vmovaps	%zmm0, 64(%rsp)
+	/* Save all already computed results.  */
+	vpternlogd $0xec, %zmm4, %zmm2, %zmm0
+	vmovaps	%zmm0, (%rsp)
 
+	vzeroupper
+
+	/* edx has 1s where there was a special value that needs to be handled
+	   by a tanhf call.  */
+	movl	%edx, %ebx
 L(SPECIAL_VALUES_LOOP):
-	incl	%r12d
-	cmpl	$16, %r12d
-
-	/* Check bits in range mask */
-	jl	L(RANGEMASK_CHECK)
-	# LOE rbx r15 r12d r13d
-
-	movq	16(%rsp), %r12
-	cfi_restore(12)
-	movq	8(%rsp), %r13
-	cfi_restore(13)
-	movq	(%rsp), %r14
-	cfi_restore(14)
-	vmovups	128(%rsp), %zmm0
-
-	/* Go to exit */
-	jmp	L(EXIT)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r12 r13 r14 r15 zmm0
-
-	/* Scalar math fucntion call
-	 * to process special input
-	 */
+	# LOE rbx rbp r12 r13 r14 r15
+	/* Use rbp as the index of the special value; it is preserved across
+	   calls to tanhf. We technically don't need a callee save register here
+	   as offset to rsp is always [0, 60] so we can restore rsp by realigning
+	   to 64. Essentially the tradeoff is 1 extra save/restore vs 2 extra
+	   instructions in the loop. Realigning also costs more code size.  */
+	xorl	%ebp, %ebp
+	tzcntl	%ebx, %ebp
 
-L(SCALAR_MATH_CALL):
-	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	/* Scalar math function call to process special input.  */
+	movss	64(%rsp, %rbp, 4), %xmm0
 	call	tanhf@PLT
-	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	/* No good way to avoid the store-forwarding fault this will cause on
+	   return. `lfence` avoids the SF fault but at greater cost as it
+	   serializes stack/callee save restoration.  */
+	movss	%xmm0, (%rsp, %rbp, 4)
 
-	/* Process special inputs in loop */
-	jmp	L(SPECIAL_VALUES_LOOP)
-	# LOE rbx r15 r12d r13d
+	blsrl   %ebx, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+	# LOE r12 r13 r14 r15
+
+	/* All results have been written to (%rsp); reload them as the return.  */
+	vmovaps	(%rsp), %zmm0
+	/* Restore rsp from r13 (saved before the stack was realigned).  */
+	movq	%r13, %rsp
+	cfi_def_cfa_register(rsp)
+	/* Restore callee save registers in reverse order of the pushes.  */
+	popq	%rbp
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(rbp)
+	popq	%rbx
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(rbx)
+	popq	%r13
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(r13)
+	ret
 END(_ZGVeN16v_tanhf_skx)
 
 	.section .rodata, "a"
-	.align	64
-
+	.align	16
 #ifdef __svml_stanh_data_internal_typedef
 typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(64)) VUINT32 _sC[32][1];
-	__declspec(align(64)) VUINT32 _sP0[32][1];
-	__declspec(align(64)) VUINT32 _sP2[32][1];
-	__declspec(align(64)) VUINT32 _sP3[32][1];
-	__declspec(align(64)) VUINT32 _sP4[32][1];
-	__declspec(align(64)) VUINT32 _sP5[32][1];
-	__declspec(align(64)) VUINT32 _sP6[32][1];
-	__declspec(align(64)) VUINT32 _sP7[32][1];
-	__declspec(align(64)) VUINT32 _iExpMantMask_UISA[16][1];
-	__declspec(align(64)) VUINT32 _iMinIdxOfsMask_UISA[16][1];
-	__declspec(align(64)) VUINT32 _iMaxIdxMask_UISA[16][1];
+typedef struct
+	{
+	__declspec(align(4)) VUINT32 _iExpMantMask_UISA[1][1];
+	__declspec(align(4)) VUINT32 _iMinIdxOfsMask_UISA[1][1];
+	__declspec(align(4)) VUINT32 _iMaxIdxMask_UISA[1][1];
+	__declspec(align(4)) VUINT32 _iExpMask[1][1];
+	__declspec(align(64)) VUINT32 _sC_lo[16][1];
+	__declspec(align(64)) VUINT32 _sC_hi[16][1];
+	__declspec(align(64)) VUINT32 _sP7_lo[16][1];
+	__declspec(align(64)) VUINT32 _sP7_hi[16][1];
 	__declspec(align(64)) VUINT32 _sSignMask[16][1];
-	__declspec(align(64)) VUINT32 _sAbsMask[16][1];
-	__declspec(align(64)) VUINT32 _iExpMantMask[16][1];
-	__declspec(align(64)) VUINT32 _iExpMask[16][1];
-	__declspec(align(64)) VUINT32 _iMinIdxOfsMask[16][1];
-	__declspec(align(64)) VUINT32 _iMaxIdxMask[16][1];
+	__declspec(align(64)) VUINT32 _sP6_lo[16][1];
+	__declspec(align(64)) VUINT32 _sP6_hi[16][1];
+	__declspec(align(64)) VUINT32 _sP5_lo[16][1];
+	__declspec(align(64)) VUINT32 _sP5_hi[16][1];
+	__declspec(align(64)) VUINT32 _sP4_lo[16][1];
+	__declspec(align(64)) VUINT32 _sP4_hi[16][1];
+	__declspec(align(64)) VUINT32 _sP3_lo[16][1];
+	__declspec(align(64)) VUINT32 _sP3_hi[16][1];
+	__declspec(align(64)) VUINT32 _sP2_lo[16][1];
+	__declspec(align(64)) VUINT32 _sP2_hi[16][1];
+	__declspec(align(64)) VUINT32 _sP0_lo[16][1];
+	__declspec(align(64)) VUINT32 _sP0_hi[16][1];
 } __svml_stanh_data_internal;
 #endif
+
 __svml_stanh_data_internal:
-	/* _sC */
+	.align	4
+	/* _iExpMantMask_UISA */
+	.long	0x7fe00000
+
+	.align	4
+	/* _iMinIdxOfsMask_UISA */
+	.long	0x3d400000
+
+	.align	4
+	/* _iMaxIdxMask_UISA */
+	.long	0x03e00000
+
+	.align	4
+	/* _iExpMask */
+	.long	0x7f000000
+
+	.align	64
+__svml_stanh_data_internal_al64:
+	.align	64
+	/* _sC_lo */
 	.long	0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
 	.long	0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
 	.long	0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
 	.long	0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
+
+	.align	64
+	/* _sC_hi */
 	.long	0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
 	.long	0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
 	.long	0x40500000, 0x40700000, 0x40900000, 0x40b00000
 	.long	0x40d00000, 0x40f00000, 0x41100000, 0x00000000
-	/* p0 */
-	.align	64
-	.long	0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
-	.long	0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
-	.long	0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
-	.long	0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
-	.long	0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
-	.long	0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
-	.long	0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
-	.long	0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
-	/* p2 */
-	.align	64
-	.long	0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
-	.long	0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
-	.long	0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
-	.long	0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
-	.long	0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
-	.long	0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
-	.long	0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
-	.long	0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
-	/* p3 */
+
 	.align	64
-	.long	0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
-	.long	0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
-	.long	0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
-	.long	0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
-	.long	0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
-	.long	0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
-	.long	0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
-	.long	0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
-	/* p4 */
+	/* _sP7_lo */
+	.long	0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
+	.long	0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
+	.long	0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
+	.long	0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
+
 	.align	64
-	.long	0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
-	.long	0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
-	.long	0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
-	.long	0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
-	.long	0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
-	.long	0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
-	.long	0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
-	.long	0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
-	/* p5 */
+	/* _sP7_hi */
+	.long	0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
+	.long	0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
+	.long	0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
+	.long	0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
+
 	.align	64
-	.long	0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
-	.long	0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
-	.long	0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
-	.long	0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
-	.long	0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
-	.long	0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
-	.long	0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
-	.long	0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
-	/* p6 */
+	/* _sSignMask */
+	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000
+	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000
+	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000
+	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000
+
 	.align	64
+	/* _sP6_lo */
 	.long	0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
 	.long	0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
 	.long	0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
 	.long	0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
+
+	.align	64
+	/* _sP6_hi */
 	.long	0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
 	.long	0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
 	.long	0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
 	.long	0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
-	/* p7 */
+
 	.align	64
-	.long	0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
-	.long	0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
-	.long	0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
-	.long	0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
-	.long	0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
-	.long	0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
-	.long	0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
-	.long	0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
+	/* _sP5_lo */
+	.long	0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
+	.long	0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
+	.long	0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
+	.long	0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
+
 	.align	64
-	.long	0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000 /* _iExpMantMask_UISA */
+	/* _sP5_hi */
+	.long	0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
+	.long	0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
+	.long	0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
+	.long	0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
+
 	.align	64
-	.long	0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000 /* _iMinIdxOfsMask_UISA */
+	/* _sP4_lo */
+	.long	0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
+	.long	0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
+	.long	0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
+	.long	0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
+
 	.align	64
-	.long	0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000 /* _iMaxIdxMask_UISA */
+	/* _sP4_hi */
+	.long	0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
+	.long	0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
+	.long	0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
+	.long	0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
+
 	.align	64
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSignMask */
+	/* _sP3_lo */
+	.long	0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
+	.long	0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
+	.long	0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
+	.long	0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
+
 	.align	64
-	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _sAbsMask */
+	/* _sP3_hi */
+	.long	0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
+	.long	0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
+	.long	0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
+	.long	0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
+
 	.align	64
-	.long	0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000 /* _iExpMantMask */
+	/* _sP2_lo */
+	.long	0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
+	.long	0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
+	.long	0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
+	.long	0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
+
 	.align	64
-	.long	0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000 /* _iExpMask */
+	/* _sP2_hi */
+	.long	0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
+	.long	0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
+	.long	0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
+	.long	0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
+
 	.align	64
-	.long	0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000 /* _iMinIdxOfsMask */
+	/* _sP0_lo */
+	.long	0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
+	.long	0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
+	.long	0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
+	.long	0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
+
 	.align	64
-	.long	0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000 /* _iMaxIdxMask */
+	/* _sP0_hi */
+	.long	0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
+	.long	0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
+	.long	0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
+	.long	0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
+
 	.align	64
+	.type	__svml_stanh_data_internal_al64, @object
+	.size	__svml_stanh_data_internal_al64, .-__svml_stanh_data_internal_al64
 	.type	__svml_stanh_data_internal, @object
 	.size	__svml_stanh_data_internal, .-__svml_stanh_data_internal
-- 
2.34.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH v3 5/7] x86: Add data file that can be shared by tanhf-avx2 and tanhf-sse4
  2022-06-09 16:58 ` [PATCH v3 " Noah Goldstein
                     ` (2 preceding siblings ...)
  2022-06-09 16:58   ` [PATCH v3 4/7] x86: Optimize svml_s_tanhf16_core_avx512.S Noah Goldstein
@ 2022-06-09 16:58   ` Noah Goldstein
  2022-06-09 17:11     ` H.J. Lu
  2022-06-09 16:58   ` [PATCH v3 6/7] x86: Optimize svml_s_tanhf8_core_avx2.S Noah Goldstein
                     ` (2 subsequent siblings)
  6 siblings, 1 reply; 48+ messages in thread
From: Noah Goldstein @ 2022-06-09 16:58 UTC (permalink / raw)
  To: libc-alpha

tanhf-avx2 and tanhf-sse4 use the same data tables, so we can save
over 4kb by using a shared data table. This does increase the memory
footprint of the sse4 version (as all the targets are now 32 bytes
instead of 16), but generally it seems worth the code size saving.

NB: This patch doesn't do anything itself, it is setup for future
patches.
---
 .../fpu/multiarch/svml_s_tanhf_rodata.S       | 621 ++++++++++++++++++
 1 file changed, 621 insertions(+)
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_tanhf_rodata.S

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf_rodata.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf_rodata.S
new file mode 100644
index 0000000000..904fe5f588
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf_rodata.S
@@ -0,0 +1,621 @@
+/* Data tables for tanhf AVX2 and tanhf SSE4.
+   Copyright (C) 2021-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   https://www.gnu.org/licenses/.  */
+
+/* Byte offsets of each table entry, ordered by use in the function.
+   On cold-starts this might help the prefetcher. If the streaming
+   prefetchers kick in it will prefetch into the lookup table.  */
+#define _iExpMantMask			0
+#define _iMinIdxOfsMask			32
+#define _iMaxIdxMask			64
+#define _sAbsMask			96
+#define _iExpMask			128
+#define _lookupTable			160
+
+#define TANHF_DATA(offset)		((offset)+__svml_stanh_data_internal_avx2)
+#ifndef ONLY_DECL_OFFSET
+	.section .rodata, "a"
+	.align	32
+
+# ifdef __svml_stanh_data_internal_typedef	/* C view of the table layout.  */
+typedef unsigned int VUINT32;
+typedef struct
+	{
+	__declspec(align(32)) VUINT32 _iExpMantMask[8][1];
+	__declspec(align(32)) VUINT32 _iMinIdxOfsMask[8][1];
+	__declspec(align(32)) VUINT32 _iMaxIdxMask[8][1];
+	__declspec(align(32)) VUINT32 _sAbsMask[8][1];
+	__declspec(align(32)) VUINT32 _iExpMask[8][1];
+	__declspec(align(32)) VUINT32 _lookupTable[(134*4)][2];	/* Two VUINT32 per row; presumably one .quad each.  */
+} __svml_stanh_data_internal;
+# endif /* __svml_stanh_data_internal_typedef */
+
+
+__svml_stanh_data_internal:
+	.globl	__svml_stanh_data_internal_avx2
+__svml_stanh_data_internal_avx2:
+	.align	32
+	/* _iExpMantMask.  */
+	.long	0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000
+	.long	0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000
+
+	.align	32
+	/* _iMinIdxOfsMask.  */
+	.long	0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000
+	.long	0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000
+
+	.align	32
+	/* _iMaxIdxMask.  */
+	.long	0x04280000, 0x04280000, 0x04280000, 0x04280000
+	.long	0x04280000, 0x04280000, 0x04280000, 0x04280000
+
+	.align	32
+	/* _sAbsMask.  */
+	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+
+	.align	32
+	/* _iExpMask.  */
+	.long	0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000
+	.long	0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000
+
+	.align	32
+	/* _lookupTable.  */
+	/* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500].  */
+	.quad	0x0000000000000000 /* A00 = +0.000000000000000000000e-01.  */
+	.quad	0x3FF00000022C70EB /* A01 = +1.000000008097283510367e+00.  */
+	.quad	0xBED00E878CFFA194 /* A02 = -3.828228912518614443549e-06.  */
+	.quad	0xBFD551766D0607A9 /* A03 = -3.330970825846813476723e-01.  */
+	.quad	0xBE53D60CE3E4C297 /* A00 = -1.847383956330407336230e-08.  */
+	.quad	0x3FF000024177CF5C /* A01 = +1.000002151235967140508e+00.  */
+	.quad	0xBF1758BC94A51A25 /* A02 = -8.906031613262943753568e-05.  */
+	.quad	0xBFD53EAE67E0D4F0 /* A03 = -3.319507612644221339337e-01.  */
+	.quad	0xBE5A9E47EF32D6FE /* A00 = -2.479020984039698285657e-08.  */
+	.quad	0x3FF00002DA983057 /* A01 = +1.000002721676556793895e+00.  */
+	.quad	0xBF1BD953509E94AA /* A02 = -1.062352277175377670507e-04.  */
+	.quad	0xBFD53BDB562EEDD5 /* A03 = -3.317783681520414806876e-01.  */
+	.quad	0xBE6191BBE496D294 /* A00 = -3.272532162914017685901e-08.  */
+	.quad	0x3FF0000390492017 /* A01 = +1.000003398528866105366e+00.  */
+	.quad	0xBF20727E814A57CE /* A02 = -1.254825043772153972919e-04.  */
+	.quad	0xBFD538DE060A6F22 /* A03 = -3.315959033004550748913e-01.  */
+	.quad	0xBE66DAFA2A893A25 /* A00 = -4.257146219278012568149e-08.  */
+	.quad	0x3FF0000465E08CD1 /* A01 = +1.000004194219219266770e+00.  */
+	.quad	0xBF2341C765EF91B6 /* A02 = -1.469188600530365522261e-04.  */
+	.quad	0xBFD535B6841FAF9E /* A03 = -3.314033785124993469751e-01.  */
+	.quad	0xBE6D5794E361E964 /* A00 = -5.465394929765249413434e-08.  */
+	.quad	0x3FF000055EE2A0CB /* A01 = +1.000005121846742950353e+00.  */
+	.quad	0xBF265E6C77E66C8B /* A02 = -1.706607253709506650304e-04.  */
+	.quad	0xBFD53264DDCCEDA6 /* A03 = -3.312008062382240103361e-01.  */
+	.quad	0xBE729C844D374A6E /* A00 = -6.933284462462096107184e-08.  */
+	.quad	0x3FF000067F019093 /* A01 = +1.000006195180536350264e+00.  */
+	.quad	0xBF29CC5348D6DCE5 /* A02 = -1.968242326435338705130e-04.  */
+	.quad	0xBFD52EE92121ED35 /* A03 = -3.309881995734998416658e-01.  */
+	.quad	0xBE775AEA17EAA872 /* A00 = -8.700465590574974405858e-08.  */
+	.quad	0x3FF00007CA1D66B8 /* A01 = +1.000007428656699559610e+00.  */
+	.quad	0xBF2D8F5EB98A2637 /* A02 = -2.255252009216044881395e-04.  */
+	.quad	0xBFD52B435CDF9128 /* A03 = -3.307655722585587376727e-01.  */
+	.quad	0xBE7D04DA28C343F0 /* A00 = -1.081040272327705484794e-07.  */
+	.quad	0x3FF000094443CCF5 /* A01 = +1.000008837375216730337e+00.  */
+	.quad	0xBF30D5B76C947AE5 /* A02 = -2.568791210978817814332e-04.  */
+	.quad	0xBFD52773A0776FAD /* A03 = -3.305329386764651045105e-01.  */
+	.quad	0xBE81DD77A12C51C7 /* A00 = -1.331054169875768625701e-07.  */
+	.quad	0x3FF0000AF1AFD2DA /* A01 = +1.000010437096696680470e+00.  */
+	.quad	0xBF331230624C1680 /* A02 = -2.910011410651516805537e-04.  */
+	.quad	0xBFD52379FC0B61DF /* A03 = -3.302903138515186909352e-01.  */
+	.quad	0xBE85D04EEEB3C435 /* A00 = -1.625247628488202841012e-07.  */
+	.quad	0x3FF0000CD6C9B1F2 /* A01 = +1.000012244238970726684e+00.  */
+	.quad	0xBF357F0742FADDD4 /* A02 = -3.280060509313874068243e-04.  */
+	.quad	0xBFD51F56806D0E81 /* A03 = -3.300377134475880880338e-01.  */
+	.quad	0xBE8A6E289B59681B /* A00 = -1.969211333326924655065e-07.  */
+	.quad	0x3FF0000EF8268F72 /* A01 = +1.000014275873550406715e+00.  */
+	.quad	0xBF381E277A1B747A /* A02 = -3.680082682942575423093e-04.  */
+	.quad	0xBFD51B093F1D6FD4 /* A03 = -3.297751537663746734808e-01.  */
+	.quad	0xBE8FCBC40EE9ABD5 /* A00 = -2.368983653301529373887e-07.  */
+	.quad	0x3FF000115A883B6C /* A01 = +1.000016549721943981410e+00.  */
+	.quad	0xBF3AF17AC974B3D9 /* A02 = -4.111218235774406434303e-04.  */
+	.quad	0xBFD516924A4C549C /* A03 = -3.295026517456081105450e-01.  */
+	.quad	0xBE92FFBC60A3F956 /* A00 = -2.831066871072026054144e-07.  */
+	.quad	0x3FF0001402DCED8A /* A01 = +1.000019084151832604590e+00.  */
+	.quad	0xBF3DFAE9390C4801 /* A02 = -4.574603454311488280083e-04.  */
+	.quad	0xBFD511F1B4D7DC3A /* A03 = -3.292202249571719585575e-01.  */
+	.quad	0xBE9690A22F96D5AD /* A00 = -3.362443262393081632612e-07.  */
+	.quad	0x3FF00016F63EFF5D /* A01 = +1.000021898173108825247e+00.  */
+	.quad	0xBF409E2C839605BB /* A02 = -5.071370461992499986334e-04.  */
+	.quad	0xBFD50D27924BEE00 /* A03 = -3.289278916051614487515e-01.  */
+	.quad	0xBE9AA56C65E72A73 /* A00 = -3.970591019557469835586e-07.  */
+	.quad	0x3FF0001A39F4A43E /* A01 = +1.000025011433776978009e+00.  */
+	.quad	0xBF425BD74C3D6667 /* A02 = -5.602647074553602319844e-04.  */
+	.quad	0xBFD50833F6E1ABA2 /* A03 = -3.286256705238718156536e-01.  */
+	.quad	0xBE9F4BD4FF1A83B0 /* A00 = -4.663500013744687071912e-07.  */
+	.quad	0x3FF0001DD36F9EC2 /* A01 = +1.000028444215715683896e+00.  */
+	.quad	0xBF44376634149405 /* A02 = -6.169556656102642569831e-04.  */
+	.quad	0xBFD50316F77EDEE5 /* A03 = -3.283135811757190158922e-01.  */
+	.quad	0xBEA3B625387BB079 /* A00 = -5.874486399249461304297e-07.  */
+	.quad	0x3FF00023E14CFBA9 /* A01 = +1.000034217911642153709e+00.  */
+	.quad	0xBF47392F923218D2 /* A02 = -7.087213783883111826306e-04.  */
+	.quad	0xBFD4FB1FACDEB938 /* A03 = -3.278273761924483942209e-01.  */
+	.quad	0xBEAA6E24F543500A /* A00 = -7.876828740601738750574e-07.  */
+	.quad	0x3FF0002D5C6E8412 /* A01 = +1.000043259679163742959e+00.  */
+	.quad	0xBF4BAF02BD7FDD70 /* A02 = -8.448375110664940040861e-04.  */
+	.quad	0xBFD4EFEE6527A7DE /* A03 = -3.271442401734229177279e-01.  */
+	.quad	0xBEB16E3EBE2157D0 /* A00 = -1.038947396133402500647e-06.  */
+	.quad	0x3FF00038990FEE2F /* A01 = +1.000053975962952312884e+00.  */
+	.quad	0xBF50569481C574CB /* A02 = -9.972048056490652716971e-04.  */
+	.quad	0xBFD4E419278DA2B4 /* A03 = -3.264220129263251113372e-01.  */
+	.quad	0xBEB6A7B6723165D4 /* A00 = -1.350350836279403750524e-06.  */
+	.quad	0x3FF00045CAB4158E /* A01 = +1.000066558657042303793e+00.  */
+	.quad	0xBF531D7C9C849108 /* A02 = -1.166698160951775212202e-03.  */
+	.quad	0xBFD4D7A0BB33B152 /* A03 = -3.256608799117844954552e-01.  */
+	.quad	0xBEBD0EE2A8654AFD /* A00 = -1.732000471561702711532e-06.  */
+	.quad	0x3FF00055276F18D6 /* A01 = +1.000081209219890521211e+00.  */
+	.quad	0xBF562FDBA3FB6C6C /* A02 = -1.354183666925102939860e-03.  */
+	.quad	0xBFD4CA85F1B93DB2 /* A03 = -3.248610363561638125773e-01.  */
+	.quad	0xBEC269D4036A207E /* A00 = -2.195047297096822741730e-06.  */
+	.quad	0x3FF00066E7DA6E4E /* A01 = +1.000098138500919997540e+00.  */
+	.quad	0xBF5991499FC36B3A /* A02 = -1.560518167983372759405e-03.  */
+	.quad	0xBFD4BCC9A72283D6 /* A03 = -3.240226871658341556426e-01.  */
+	.quad	0xBEC7154B6C09CFE1 /* A00 = -2.751729738565190291276e-06.  */
+	.quad	0x3FF0007B47086B80 /* A01 = +1.000117566559055148900e+00.  */
+	.quad	0xBF5D455433B4F8F4 /* A02 = -1.786548832412968197680e-03.  */
+	.quad	0xBFD4AE6CC1BFE145 /* A03 = -3.231460468373550942722e-01.  */
+	.quad	0xBECCA68CC64A0F8A /* A00 = -3.415415948561670285790e-06.  */
+	.quad	0x3FF00092827742F7 /* A01 = +1.000139722473418535387e+00.  */
+	.quad	0xBF60A7BF15A527AF /* A02 = -2.033112728132522705610e-03.  */
+	.quad	0xBFD49F703214084C /* A03 = -3.222313393636155876010e-01.  */
+	.quad	0xBED19E68676B241B /* A00 = -4.200644630977303616698e-06.  */
+	.quad	0x3FF000ACDA037B26 /* A01 = +1.000164844146362863597e+00.  */
+	.quad	0xBF62D99F836A02F8 /* A02 = -2.301036405072284102280e-03.  */
+	.quad	0xBFD48FD4F2B91B28 /* A03 = -3.212787981359945810311e-01.  */
+	.quad	0xBED57CF4B0C7AA54 /* A00 = -5.123164339408145209103e-06.  */
+	.quad	0x3FF000CA8FD9E1A1 /* A01 = +1.000193178099017865534e+00.  */
+	.quad	0xBF653A014548E686 /* A02 = -2.591135484433962181405e-03.  */
+	.quad	0xBFD47F9C0844B38F /* A03 = -3.202886658426046806447e-01.  */
+	.quad	0xBEDA012B1B1A41E2 /* A00 = -6.199971197454598722328e-06.  */
+	.quad	0x3FF000EBE868FDF4 /* A01 = +1.000224979259539459520e+00.  */
+	.quad	0xBF67CA9427E0A544 /* A02 = -2.904214255086275467410e-03.  */
+	.quad	0xBFD46EC6812ADB37 /* A03 = -3.192611943626845749655e-01.  */
+	.quad	0xBEDF3EAC5BF12194 /* A00 = -7.449344990702664567927e-06.  */
+	.quad	0x3FF001112A520784 /* A01 = +1.000260510744255704196e+00.  */
+	.quad	0xBF6A8D01ABDA4DC4 /* A02 = -3.241065277345108255891e-03.  */
+	.quad	0xBFD45D55759FFA4A /* A03 = -3.181966446572103146551e-01.  */
+	.quad	0xBEE2A541BC274267 /* A00 = -8.890883582164319970972e-06.  */
+	.quad	0x3FF0013A9E5961F2 /* A01 = +1.000300043631906721231e+00.  */
+	.quad	0xBF6D82ECD080C540 /* A02 = -3.602468994380686462264e-03.  */
+	.quad	0xBFD44B4A0779C0AD /* A03 = -3.170952866557950611259e-01.  */
+	.quad	0xBEE61D97609A27F4 /* A00 = -1.054553560499505625520e-05.  */
+	.quad	0x3FF001688F56A3AF /* A01 = +1.000343856731187974773e+00.  */
+	.quad	0xBF7056F8EFB683EC /* A02 = -3.989193351487490407647e-03.  */
+	.quad	0xBFD438A5620F0F74 /* A03 = -3.159573991399533543500e-01.  */
+	.quad	0xBEEA145429EDD370 /* A00 = -1.243563138839952927732e-05.  */
+	.quad	0x3FF0019B4A242A67 /* A01 = +1.000392236341804297339e+00.  */
+	.quad	0xBF7207D31CA78D9B /* A02 = -4.401993423445739288258e-03.  */
+	.quad	0xBFD42568BA16E7CD /* A03 = -3.147832696228050619602e-01.  */
+	.quad	0xBEEE96370D52680F /* A00 = -1.458491207477835326165e-05.  */
+	.quad	0x3FF001D31D8E4115 /* A01 = +1.000445476009251821736e+00.  */
+	.quad	0xBF73D4CC11EDC094 /* A02 = -4.841611050196221316400e-03.  */
+	.quad	0xBFD411954D8664E7 /* A03 = -3.135731942252974469021e-01.  */
+	.quad	0xBEF338C046215EF8 /* A00 = -1.833122622260562810219e-05.  */
+	.quad	0x3FF00230C32C2EC1 /* A01 = +1.000534784691737621998e+00.  */
+	.quad	0xBF76BD019BCC5DAF /* A02 = -5.551344188254799492943e-03.  */
+	.quad	0xBFD3F2C7156DC21E /* A03 = -3.116929730668135389848e-01.  */
+	.quad	0xBEF9B15EAE411EAE /* A00 = -2.450261207822986676092e-05.  */
+	.quad	0x3FF002C2DF057A4D /* A01 = +1.000674124886830940184e+00.  */
+	.quad	0xBF7B08CCD9AC1E30 /* A02 = -6.600189396301511801646e-03.  */
+	.quad	0xBFD3C7A7A114FED8 /* A03 = -3.090609620157755976777e-01.  */
+	.quad	0xBF00E36483C373B3 /* A00 = -3.221178528332122595812e-05.  */
+	.quad	0x3FF0036F419480D7 /* A01 = +1.000838524028997644777e+00.  */
+	.quad	0xBF7FD255D1777007 /* A02 = -7.768950679260206403087e-03.  */
+	.quad	0xBFD39A453911D6CE /* A03 = -3.062909180947429588215e-01.  */
+	.quad	0xBF05DFA04DD12059 /* A00 = -4.172046622180685472624e-05.  */
+	.quad	0x3FF00438B2A03D8D /* A01 = +1.001030633695197069599e+00.  */
+	.quad	0xBF828F8DBB4A9D10 /* A02 = -9.062869337255224921890e-03.  */
+	.quad	0xBFD36AAB704697D9 /* A03 = -3.033856007044711255993e-01.  */
+	.quad	0xBF0BF3E0C647DEFB /* A00 = -5.331544597092331081714e-05.  */
+	.quad	0x3FF005221063D36D /* A01 = +1.001253189109060359741e+00.  */
+	.quad	0xBF857A2CB3C96102 /* A02 = -1.048693584122917590862e-02.  */
+	.quad	0xBFD338E65BBB4FEC /* A03 = -3.003478904549854444639e-01.  */
+	.quad	0xBF11A506ED7C9D31 /* A00 = -6.730894835681591541979e-05.  */
+	.quad	0x3FF0062E4D0EA92A /* A01 = +1.001508999829250345925e+00.  */
+	.quad	0xBF88AB82C2761AF3 /* A02 = -1.204588085125866091241e-02.  */
+	.quad	0xBFD305028D6BD206 /* A03 = -2.971807843271395688234e-01.  */
+	.quad	0xBF1607C0922D9BF1 /* A00 = -8.403885708006799337092e-05.  */
+	.quad	0x3FF007606C341961 /* A01 = +1.001800940198869449560e+00.  */
+	.quad	0xBF8C25E6DA487BCF /* A02 = -1.374416688582682892494e-02.  */
+	.quad	0xBFD2CF0D0EE8F7B5 /* A03 = -2.938873906713255768075e-01.  */
+	.quad	0xBF1B3A8480A0A16D /* A00 = -1.038688061788578038307e-04.  */
+	.quad	0x3FF008BB802D02D6 /* A01 = +1.002131939589323561535e+00.  */
+	.quad	0xBF8FEB8AE99FD100 /* A02 = -1.558598065819483124983e-02.  */
+	.quad	0xBFD297135BD0911B /* A03 = -2.904709240558688843059e-01.  */
+	.quad	0xBF20ABB9BDB75C65 /* A00 = -1.271881327357976163798e-04.  */
+	.quad	0x3FF00A42A76D8CD1 /* A01 = +1.002504972472525901495e+00.  */
+	.quad	0xBF91FF3D752BB9E6 /* A02 = -1.757522609380570560722e-02.  */
+	.quad	0xBFD25D235C1F88B4 /* A03 = -2.869346999779154305799e-01.  */
+	.quad	0xBF243D3254425461 /* A00 = -1.544116913733432829448e-04.  */
+	.quad	0x3FF00BF909D1795E /* A01 = +1.002923048355647051011e+00.  */
+	.quad	0xBF94304E04D44942 /* A02 = -1.971551804042204897316e-02.  */
+	.quad	0xBFD2214B5E61CFA6 /* A03 = -2.832821294498394371075e-01.  */
+	.quad	0xBF286070011B61CE /* A00 = -1.859795307186510085994e-04.  */
+	.quad	0x3FF00DE1D5E1627E /* A01 = +1.003389201612804537689e+00.  */
+	.quad	0xBF9689D5F4163F59 /* A02 = -2.201017668045266231780e-02.  */
+	.quad	0xBFD1E39A11C3B42C /* A03 = -2.795167134743816728104e-01.  */
+	.quad	0xBF2D250B366A79E8 /* A00 = -2.223564326486314902259e-04.  */
+	.quad	0x3FF010003E134001 /* A01 = +1.003906481248123094829e+00.  */
+	.quad	0xBF990C9FF91F6F81 /* A02 = -2.446222265267250853271e-02.  */
+	.quad	0xBFD1A41E80084CDC /* A03 = -2.756420374218586655246e-01.  */
+	.quad	0xBF314DB5DDC2A30E /* A00 = -2.640313157465248123865e-04.  */
+	.quad	0x3FF012577608921B /* A01 = +1.004477940624503018441e+00.  */
+	.quad	0xBF9BB9626875B0C9 /* A02 = -2.707437288829409385849e-02.  */
+	.quad	0xBFD162E80768A9D0 /* A03 = -2.716617653228725615122e-01.  */
+	.quad	0xBF346A6133808864 /* A00 = -3.115165050094957730625e-04.  */
+	.quad	0x3FF014EAAFCC88A3 /* A01 = +1.005106627192198898157e+00.  */
+	.quad	0xBF9E90BEF9BF7419 /* A02 = -2.984903716411588595059e-02.  */
+	.quad	0xBFD12006545F7FAD /* A03 = -2.675796340899932457269e-01.  */
+	.quad	0xBF37F180DC3848EA /* A00 = -3.653468704395550778821e-04.  */
+	.quad	0x3FF017BD19147861 /* A01 = +1.005795572250939295955e+00.  */
+	.quad	0xBFA0C9A14C702E07 /* A02 = -3.278831537326359207851e-02.  */
+	.quad	0xBFD0DB895B650092 /* A03 = -2.633994476818851682154e-01.  */
+	.quad	0xBF3BEC6AAC6D7635 /* A00 = -4.260788377246944457107e-04.  */
+	.quad	0x3FF01AD1D884E719 /* A01 = +1.006547780778822565040e+00.  */
+	.quad	0xBFA260B2A1B1434A /* A02 = -3.589399551186163439542e-02.  */
+	.quad	0xBFD09581529E93D6 /* A03 = -2.591250712233067465817e-01.  */
+	.quad	0xBF4164E26167882B /* A00 = -5.308251737086202562063e-04.  */
+	.quad	0x3FF01FEF14B62B81 /* A01 = +1.007796364693348545316e+00.  */
+	.quad	0xBFA4EB014538AA42 /* A02 = -4.085544557559163403315e-02.  */
+	.quad	0xBFD029D36FEAF41F /* A03 = -2.525528519580024222613e-01.  */
+	.quad	0xBF46F6FFF4E53DC8 /* A00 = -7.008313930700277652464e-04.  */
+	.quad	0x3FF027CBB51CBBA0 /* A01 = +1.009715754956893363214e+00.  */
+	.quad	0xBFA89DEC9FEC112E /* A02 = -4.807986690687680864098e-02.  */
+	.quad	0xBFCF2A99464D0DB4 /* A03 = -2.434875100390009317053e-01.  */
+	.quad	0xBF4DCC9C4F66A4D9 /* A00 = -9.094012482836712945103e-04.  */
+	.quad	0x3FF030E7CFCCD583 /* A01 = +1.011939822882909068014e+00.  */
+	.quad	0xBFACAA3B95814081 /* A02 = -5.598627281199331645611e-02.  */
+	.quad	0xBFCDF78F156BE7CF /* A03 = -2.341173987004467604844e-01.  */
+	.quad	0xBF5308ED74E5C7A6 /* A00 = -1.161796466103906435435e-03.  */
+	.quad	0x3FF03B5986412ECB /* A01 = +1.014489674026594512313e+00.  */
+	.quad	0xBFB087EBA88DCC3F /* A02 = -6.457398285947223148806e-02.  */
+	.quad	0xBFCCBB9BD134862F /* A03 = -2.244753619680052991736e-01.  */
+	.quad	0xBF57FA23C00DF4B5 /* A00 = -1.463446533505758208674e-03.  */
+	.quad	0x3FF0473558A1BCC0 /* A01 = +1.017384859292903342975e+00.  */
+	.quad	0xBFB2E702BC6360EF /* A02 = -7.383744334527241048871e-02.  */
+	.quad	0xBFCB77D546379288 /* A03 = -2.145945160729250122955e-01.  */
+	.quad	0xBF5DD12971557F71 /* A00 = -1.819887610814388068450e-03.  */
+	.quad	0x3FF0548DDF5000A8 /* A01 = +1.020643112482540360020e+00.  */
+	.quad	0xBFB571B63DA186E1 /* A02 = -8.376635555898871710045e-02.  */
+	.quad	0xBFCA2D5202605148 /* A03 = -2.045080672838912594358e-01.  */
+	.quad	0xBF6252B1AD5D4F17 /* A00 = -2.236697221556737096709e-03.  */
+	.quad	0x3FF063738A910BF7 /* A01 = +1.024280110622155737232e+00.  */
+	.quad	0xBFB8270C8E6B601B /* A02 = -9.434584118878357184013e-02.  */
+	.quad	0xBFC8DD27D950A07E /* A03 = -1.942491351230763441116e-01.  */
+	.quad	0xBF66470C91730CFC /* A00 = -2.719425723258004842786e-03.  */
+	.quad	0x3FF073F468FCF331 /* A01 = +1.028309259519300633556e+00.  */
+	.quad	0xBFBB05C2952191E4 /* A02 = -1.055566419686964629854e-01.  */
+	.quad	0xBFC7886A770DE2BD /* A03 = -1.838505822486435070662e-01.  */
+	.quad	0xBF6AD114AC8E98EC /* A00 = -3.273525599485007861467e-03.  */
+	.quad	0x3FF0861BF53E5226 /* A01 = +1.032741506559554434119e+00.  */
+	.quad	0xBFBE0C4F9B461507 /* A02 = -1.173753503881763554650e-01.  */
+	.quad	0xBFC6302A037CDE3A /* A03 = -1.733448521642786954722e-01.  */
+	.quad	0xBF6FFBDE2A6C2AF8 /* A00 = -3.904279630096648551207e-03.  */
+	.quad	0x3FF099F2EB8E7DA3 /* A01 = +1.037585182326304034106e+00.  */
+	.quad	0xBFC09C74D192DDF0 /* A02 = -1.297746680554463516444e-01.  */
+	.quad	0xBFC4D571D8E3079F /* A03 = -1.627638157861470424859e-01.  */
+	.quad	0xBF72E8FDC0B952AA /* A00 = -4.616728994353872309042e-03.  */
+	.quad	0x3FF0AF7F273C9533 /* A01 = +1.042845872181101141152e+00.  */
+	.quad	0xBFC244C512736F10 /* A02 = -1.427236881344176033792e-01.  */
+	.quad	0xBFC379474F58B902 /* A03 = -1.521386277613104298645e-01.  */
+	.quad	0xBF762EABAF17395B /* A00 = -5.415602341101023557701e-03.  */
+	.quad	0x3FF0C6C3886F63FB /* A01 = +1.048526318502125631582e+00.  */
+	.quad	0xBFC3FDF9918EA12A /* A02 = -1.561881981590514389957e-01.  */
+	.quad	0xBFC21CA89ECAB895 /* A03 = -1.414995932913753196036e-01.  */
+	.quad	0xBF79D387CE5B2BAE /* A00 = -6.305246822828998107258e-03.  */
+	.quad	0x3FF0DFBFE2346376 /* A01 = +1.054626353847394337748e+00.  */
+	.quad	0xBFC5C6DA43602620 /* A02 = -1.701309994680721970894e-01.  */
+	.quad	0xBFC0C08BD8DB6631 /* A03 = -1.308760460731704100557e-01.  */
+	.quad	0xBF7DDBA8E8DA9060 /* A00 = -7.289562037531366334164e-03.  */
+	.quad	0x3FF0FA70F0D1B464 /* A01 = +1.061142864894713433443e+00.  */
+	.quad	0xBFC79E18D92BAA7C /* A02 = -1.845122394946264732241e-01.  */
+	.quad	0xBFBECBBBF74C2669 /* A03 = -1.202962378266875381749e-01.  */
+	.quad	0xBF81254E76EA25DA /* A00 = -8.371937755572145950511e-03.  */
+	.quad	0x3FF116D05835EBD0 /* A01 = +1.068069786618014660462e+00.  */
+	.quad	0xBFC982539E2ED224 /* A02 = -1.992897531869327609755e-01.  */
+	.quad	0xBFBC1B043C350159 /* A03 = -1.097872397413132278254e-01.  */
+	.quad	0xBF8391ACBA863403 /* A00 = -9.555196230190082448686e-03.  */
+	.quad	0x3FF134D4AA477FE2 /* A01 = +1.075398125794884141015e+00.  */
+	.quad	0xBFCB7218609FEAFB /* A02 = -2.144194099235717521079e-01.  */
+	.quad	0xBFB970A16CB88329 /* A03 = -9.937485603633135211599e-02.  */
+	.quad	0xBF87935088E48E8B /* A00 = -1.151144902957603431692e-02.  */
+	.quad	0x3FF1649892AD7DD3 /* A01 = +1.087059567413110938716e+00.  */
+	.quad	0xBFCE6971DDE75409 /* A02 = -2.375929196847723912089e-01.  */
+	.quad	0xBFB58291E88CB251 /* A03 = -8.402358939628952472223e-02.  */
+	.quad	0xBF8DB3A62C325325 /* A00 = -1.450280973794233242702e-02.  */
+	.quad	0x3FF1A9C900C6DEEA /* A01 = +1.103951457056548068891e+00.  */
+	.quad	0xBFD13DBC65B0E08E /* A02 = -2.693930619311765140012e-01.  */
+	.quad	0xBFB06696F62696D1 /* A03 = -6.406539449252625362252e-02.  */
+	.quad	0xBF92583699F2E27A /* A00 = -1.791463198307716858659e-02.  */
+	.quad	0x3FF1F451B85AA9F0 /* A01 = +1.122148246892376022288e+00.  */
+	.quad	0xBFD34FD5F8288180 /* A02 = -3.017477916164565954205e-01.  */
+	.quad	0xBFA6FB692825B683 /* A03 = -4.488686194495718900788e-02.  */
+	.quad	0xBF9641C26E673D6F /* A00 = -2.173522757385398448959e-02.  */
+	.quad	0x3FF24364DA5E2B07 /* A01 = +1.141453602790251542487e+00.  */
+	.quad	0xBFD564A5A5EF5890 /* A02 = -3.342680092295120530821e-01.  */
+	.quad	0xBF9B43712011A982 /* A03 = -2.662445791467283467968e-02.  */
+	.quad	0xBF9A901038EC2F39 /* A00 = -2.594018313816024226548e-02.  */
+	.quad	0x3FF2961356DFFEBA /* A01 = +1.161639537196534011088e+00.  */
+	.quad	0xBFD775EBB17198C7 /* A02 = -3.665723069046972759644e-01.  */
+	.quad	0xBF833B1A926CD462 /* A03 = -9.390075295963199591975e-03.  */
+	.quad	0xBF9F396A6A461B91 /* A00 = -3.049246095317987084727e-02.  */
+	.quad	0x3FF2EB53BAEF534B /* A01 = +1.182452898229899629357e+00.  */
+	.quad	0xBFD97DABF8AD8BBD /* A02 = -3.982953957076310058660e-01.  */
+	.quad	0x3F7B8F6A3E0F8837 /* A03 = +6.728568086119371925713e-03.  */
+	.quad	0xBFA21878590F8BAA /* A00 = -3.534294211546946951064e-02.  */
+	.quad	0x3FF34209790236E1 /* A01 = +1.203622315111197105253e+00.  */
+	.quad	0xBFDB764C0E71BECB /* A02 = -4.290952817018306997277e-01.  */
+	.quad	0x3F962FE0C03F84C0 /* A03 = +2.166701482190513949888e-02.  */
+	.quad	0xBFA4B36B9AD27ECC /* A00 = -4.043136849327097492868e-02.  */
+	.quad	0x3FF3990C5B12FC16 /* A01 = +1.224865298994477935679e+00.  */
+	.quad	0xBFDD5AABB0D01390 /* A02 = -4.586590983092770912322e-01.  */
+	.quad	0x3FA21DAF5CA162DB /* A03 = +3.538272863142363083844e-02.  */
+	.quad	0xBFA7645E4D7BF28B /* A00 = -4.568762489177399105378e-02.  */
+	.quad	0x3FF3EF2FD51C0D9F /* A01 = +1.245895225962932562069e+00.  */
+	.quad	0xBFDF26377E1B686E /* A02 = -4.867075664057044503963e-01.  */
+	.quad	0x3FA8803E756EE812 /* A03 = +4.785342391501513914509e-02.  */
+	.quad	0xBFAA210925C64413 /* A00 = -5.103329263796054643398e-02.  */
+	.quad	0x3FF44349F897D8E7 /* A01 = +1.266427966181760345066e+00.  */
+	.quad	0xBFE06A7B02C6D8E2 /* A02 = -5.129981092675530707226e-01.  */
+	.quad	0x3FAE3F194734F5D0 /* A03 = +5.907515520309980505687e-02.  */
+	.quad	0xBFACDE48F8A19BBB /* A00 = -5.638340029764018351832e-02.  */
+	.quad	0x3FF49439D5466582 /* A01 = +1.286187966447272845727e+00.  */
+	.quad	0xBFE131C7C1063DDC /* A02 = -5.373266954429101183166e-01.  */
+	.quad	0x3FB1ADEEC36AD805 /* A03 = +6.906025191241844940482e-02.  */
+	.quad	0xBFAF905D8F585680 /* A00 = -6.164829611604449866036e-02.  */
+	.quad	0x3FF4E0ED1FD27F99 /* A01 = +1.304913639360142818546e+00.  */
+	.quad	0xBFE1E7A859DC1D3D /* A02 = -5.595285182070380836095e-01.  */
+	.quad	0x3FB3ED018E4642A1 /* A03 = +7.783517573831001679086e-02.  */
+	.quad	0xBFB11595104160BA /* A00 = -6.673556944713512906198e-02.  */
+	.quad	0x3FF528650340490B /* A01 = +1.322361958217302513319e+00.  */
+	.quad	0xBFE28B14B40BC974 /* A02 = -5.794776455425521000109e-01.  */
+	.quad	0x3FB5DF49F5BAF6D7 /* A03 = +8.543836831355676453281e-02.  */
+	.quad	0xBFB2513A97344BA4 /* A00 = -7.155195418844911836587e-02.  */
+	.quad	0x3FF569BA0DB5EE14 /* A01 = +1.338312200124055273420e+00.  */
+	.quad	0xBFE31B53A8B67B20 /* A02 = -5.970857901737396389308e-01.  */
+	.quad	0x3FB787F297BB0544 /* A03 = +9.191814617499455275507e-02.  */
+	.quad	0xBFB37512E848FAFA /* A00 = -7.600515528700305112331e-02.  */
+	.quad	0x3FF5A41F33B403C8 /* A01 = +1.352568819013173495591e+00.  */
+	.quad	0xBFE397F6EA9A58A5 /* A02 = -6.123003561103997904880e-01.  */
+	.quad	0x3FB8EAA9FF25CA06 /* A03 = +9.733068923177520814782e-02.  */
+	.quad	0xBFB47B3E603AFC5D /* A00 = -8.000554894805263217439e-02.  */
+	.quad	0x3FF5D6E3EDE40487 /* A01 = +1.364963464031718975988e+00.  */
+	.quad	0xBFE400D5BCA6D631 /* A02 = -6.251019177058819709103e-01.  */
+	.quad	0x3FBA0B830ED567FE /* A03 = +1.017381583418739132707e-01.  */
+	.quad	0xBFB5BBFE8AC90496 /* A00 = -8.489981544791400103200e-02.  */
+	.quad	0x3FF612BA70107E95 /* A01 = +1.379572332145390989311e+00.  */
+	.quad	0xBFE477EAF1FA7693 /* A02 = -6.396383978023599814478e-01.  */
+	.quad	0x3FBB4784B7C08A95 /* A03 = +1.065600346196709652391e-01.  */
+	.quad	0xBFB6D5D940743939 /* A00 = -8.920057128509463473254e-02.  */
+	.quad	0x3FF644A8748F70CE /* A01 = +1.391762214006166953340e+00.  */
+	.quad	0xBFE4D646AB07EA37 /* A02 = -6.511567440459832267763e-01.  */
+	.quad	0x3FBC354F4E1D5292 /* A03 = +1.101884427747086558913e-01.  */
+	.quad	0xBFB7223D19E4F3D1 /* A00 = -9.036619074045339206069e-02.  */
+	.quad	0x3FF6518FEB42B7FA /* A01 = +1.394912642466350494175e+00.  */
+	.quad	0xBFE4ED86CB87498C /* A02 = -6.539949393430091184598e-01.  */
+	.quad	0x3FBC6D29F28CCA9B /* A03 = +1.110407082713131127205e-01.  */
+	.quad	0xBFB6878652FF6312 /* A00 = -8.800544287022329936754e-02.  */
+	.quad	0x3FF63948C302D040 /* A01 = +1.388985406648330922508e+00.  */
+	.quad	0xBFE4C4E2E7904E17 /* A02 = -6.490339777687407218920e-01.  */
+	.quad	0x3FBC127356CA1ABE /* A03 = +1.096565329445224612481e-01.  */
+	.quad	0xBFB4F5D18B0C91D6 /* A00 = -8.187589306596207427980e-02.  */
+	.quad	0x3FF5FD27EB7DD0B8 /* A01 = +1.374305648697413673176e+00.  */
+	.quad	0xBFE464E01A2B2FC6 /* A02 = -6.373138915164353601739e-01.  */
+	.quad	0x3FBB460547674A30 /* A03 = +1.065371798825160976065e-01.  */
+	.quad	0xBFB26642FA16A685 /* A00 = -7.187288861919156890412e-02.  */
+	.quad	0x3FF59F9BEDE1C95A /* A01 = +1.351467065073470141812e+00.  */
+	.quad	0xBFE3D67920C8FBEA /* A02 = -6.199308052381387046381e-01.  */
+	.quad	0x3FBA24F6A8D3CBC1 /* A03 = +1.021265184570401413078e-01.  */
+	.quad	0xBFADB5294794F097 /* A00 = -5.802277563859197656582e-02.  */
+	.quad	0x3FF523EA7B9CF453 /* A01 = +1.321268542159732772845e+00.  */
+	.quad	0xBFE322A8B55E35DB /* A02 = -5.979808370918208160205e-01.  */
+	.quad	0x3FB8C8673B1B3E37 /* A03 = +9.680791085269722928697e-02.  */
+	.quad	0xBFA4B7D661965C6A /* A00 = -4.046506825687219699450e-02.  */
+	.quad	0x3FF48DE3E2CE3122 /* A01 = +1.284641157110919085227e+00.  */
+	.quad	0xBFE251FED1A7F445 /* A02 = -5.725092024655472622285e-01.  */
+	.quad	0x3FB745699FCABDB9 /* A03 = +9.090290213747821701507e-02.  */
+	.quad	0xBF93E60456E4EE1D /* A00 = -1.943213253365004902773e-02.  */
+	.quad	0x3FF3E1A14E628A59 /* A01 = +1.242585474196536532432e+00.  */
+	.quad	0xBFE16C5AB660E876 /* A02 = -5.444768488007543094653e-01.  */
+	.quad	0x3FB5AD33AA8C188F /* A03 = +8.467410005332197397987e-02.  */
+	.quad	0x3F738C17C47C7961 /* A00 = +4.772274820224659853951e-03.  */
+	.quad	0x3FF3234DDE3BD146 /* A01 = +1.196119182682268355933e+00.  */
+	.quad	0xBFE078C0D77A9D3B /* A02 = -5.147403915952176722826e-01.  */
+	.quad	0x3FB40D74B3E276B8 /* A03 = +7.833032027925923568290e-02.  */
+	.quad	0x3FA0474BECC689C7 /* A00 = +3.179394975019849550746e-02.  */
+	.quad	0x3FF256FB4FA7D18A /* A01 = +1.146235762743432307076e+00.  */
+	.quad	0xBFDEFA8E3FB285E2 /* A02 = -4.840427038235174395098e-01.  */
+	.quad	0x3FB270C007493D59 /* A03 = +7.203293016322244446403e-02.  */
+	.quad	0x3FAF5BD51E479BDC /* A00 = +6.124750132203590768931e-02.  */
+	.quad	0x3FF18081D0B53BC5 /* A01 = +1.093873801484492647162e+00.  */
+	.quad	0xBFDCFE2439BD0C03 /* A02 = -4.530115665294831006626e-01.  */
+	.quad	0x3FB0DEFE5A45AFDD /* A03 = +6.590261176978580437424e-02.  */
+	.quad	0x3FB7BD5D2806EA26 /* A00 = +9.273321368429118805032e-02.  */
+	.quad	0x3FF0A369E35B4440 /* A01 = +1.039895904647224256223e+00.  */
+	.quad	0xBFDB04BC5C9951E7 /* A02 = -4.221640495573226181669e-01.  */
+	.quad	0x3FAEBBBAA9D6DEEF /* A03 = +6.002600978120919278380e-02.  */
+	.quad	0x3FC01BE411098DBC /* A00 = +1.258511622610124502941e-01.  */
+	.quad	0x3FEF85BDABC031C1 /* A01 = +9.850757936961188621083e-01.  */
+	.quad	0xBFD91521375097C2 /* A02 = -3.919146576102968682065e-01.  */
+	.quad	0x3FABE26F0086D982 /* A03 = +5.446192628317005068883e-02.  */
+	.quad	0x3FC481D7FF5776B9 /* A00 = +1.602125164781023347604e-01.  */
+	.quad	0x3FEDC3506C1E7218 /* A01 = +9.300920592973538347792e-01.  */
+	.quad	0xBFD7349A88DA7D4F /* A02 = -3.625856720409119104964e-01.  */
+	.quad	0x3FA936E2DFF8E2AE /* A03 = +4.924687370334389358018e-02.  */
+	.quad	0x3FC90471F96FA27A /* A00 = +1.954481571149420671141e-01.  */
+	.quad	0x3FEC0451601987A2 /* A01 = +8.755270840595026360376e-01.  */
+	.quad	0xBFD5671CD4B898DC /* A02 = -3.344184949259110251063e-01.  */
+	.quad	0x3FA6BB9594603B67 /* A03 = +4.439990459660841243261e-02.  */
+	.quad	0x3FCFD8ADB9ED944C /* A00 = +2.488000066615846384011e-01.  */
+	.quad	0x3FE978C073F6809A /* A01 = +7.959902062321078108909e-01.  */
+	.quad	0xBFD2DF7E00BCD5A9 /* A02 = -2.948908812716931060471e-01.  */
+	.quad	0x3FA3614033D490B2 /* A03 = +3.785133965200894456959e-02.  */
+	.quad	0x3FD4846A12AFE5A0 /* A00 = +3.205819303981005674586e-01.  */
+	.quad	0x3FE63A1147D40472 /* A01 = +6.945883181471244061100e-01.  */
+	.quad	0xBFCFA2268AD34450 /* A02 = -2.471359422548027318101e-01.  */
+	.quad	0x3F9F150201D9FFE0 /* A03 = +3.035357605267552383310e-02.  */
+	.quad	0x3FD9018641F82BEB /* A00 = +3.907180446846598154131e-01.  */
+	.quad	0x3FE33B7C220FFBDC /* A01 = +6.010113396913498995389e-01.  */
+	.quad	0xBFCA4E4187E29C86 /* A02 = -2.055131829740483584423e-01.  */
+	.quad	0x3F98C30CED19F8F4 /* A03 = +2.418155858185229434287e-02.  */
+	.quad	0x3FDD4B8255BEB078 /* A00 = +4.577337109901757905561e-01.  */
+	.quad	0x3FE0858B19D3A49B /* A01 = +5.163016800335243905451e-01.  */
+	.quad	0xBFC5BC929EACE564 /* A02 = -1.698172831327539045176e-01.  */
+	.quad	0x3F93A083CE57DE2B /* A03 = +1.916700312537337677621e-02.  */
+	.quad	0x3FE0A8E5E039295C /* A00 = +5.206174258576470315063e-01.  */
+	.quad	0x3FDC35E1234583FE /* A01 = +4.407885403107342225937e-01.  */
+	.quad	0xBFC1DE034E31AEB9 /* A02 = -1.395877963835710222629e-01.  */
+	.quad	0x3F8EFDEBB3471BDC /* A03 = +1.513275280821162888101e-02.  */
+	.quad	0x3FE2851B603CB2A5 /* A00 = +5.787484054213406503564e-01.  */
+	.quad	0x3FD7F4A44ABBB286 /* A01 = +3.743067483726821853551e-01.  */
+	.quad	0xBFBD3EEB67087DE7 /* A02 = -1.142413260026767657385e-01.  */
+	.quad	0x3F8864F38329E8BD /* A03 = +1.191129917173260922836e-02.  */
+	.quad	0x3FE437DBE3C34AC1 /* A00 = +6.318187187665317283702e-01.  */
+	.quad	0x3FD43F6F789441B5 /* A01 = +3.163717916040938438194e-01.  */
+	.quad	0xBFB7D92E7901B9A4 /* A02 = -9.315767721429907277653e-02.  */
+	.quad	0x3F8327ED342308E1 /* A03 = +9.353497651663324544136e-03.  */
+	.quad	0x3FE5C0977766D55C /* A00 = +6.797597248138731451661e-01.  */
+	.quad	0x3FD10B42A764D8F9 /* A01 = +2.663122782427219115142e-01.  */
+	.quad	0xBFB3633351D3D70F /* A02 = -7.573242900602060456716e-02.  */
+	.quad	0x3F7E079E30FF899C /* A03 = +7.331483779099558922843e-03.  */
+	.quad	0x3FE7202CE08A88C4 /* A00 = +7.226776490754436288455e-01.  */
+	.quad	0x3FCC973EB5662B01 /* A01 = +2.233656297433626314319e-01.  */
+	.quad	0xBFAF70A455F9920B /* A02 = -6.140626477716545211782e-02.  */
+	.quad	0x3F77812411CE99B6 /* A03 = +5.738392731393584730859e-03.  */
+	.quad	0x3FE85879424095B1 /* A00 = +7.608000082006382003286e-01.  */
+	.quad	0x3FC7E73BD1674D84 /* A01 = +1.867441914060742336190e-01.  */
+	.quad	0xBFA96F84E4BF333B /* A02 = -4.967894832916504993525e-02.  */
+	.quad	0x3F72606DDCA6E117 /* A03 = +4.486493251924870105662e-03.  */
+	.quad	0x3FE96BFE4957F4DD /* A00 = +7.944327766887472330737e-01.  */
+	.quad	0x3FC3ED4780D25478 /* A01 = +1.556786898624158421711e-01.  */
+	.quad	0xBFA489C5F9A56B58 /* A02 = -4.011362717093075458408e-02.  */
+	.quad	0x3F6CB5DC17E9AD2A /* A03 = +3.504686231556104931972e-03.  */
+	.quad	0x3FEA5D9CB2F41234 /* A00 = +8.239272589858672724006e-01.  */
+	.quad	0x3FC091A758374DCF /* A01 = +1.294449978582705440555e-01.  */
+	.quad	0xBFA08E436D4B5CE0 /* A02 = -3.233538350257858517978e-02.  */
+	.quad	0x3F666997AD53E6B7 /* A03 = +2.735897297154145629133e-03.  */
+	.quad	0x3FEB3060342CB850 /* A00 = +8.496552485501158713532e-01.  */
+	.quad	0x3FBB7D30BBC7DC1B /* A01 = +1.073790033768634993860e-01.  */
+	.quad	0xBF9AA6BA3443D9E3 /* A02 = -2.602663940430173170060e-02.  */
+	.quad	0x3F617CA764B7850B /* A03 = +2.134634914668814050648e-03.  */
+	.quad	0x3FEBE759A6A0C7B8 /* A00 = +8.719909910635044170135e-01.  */
+	.quad	0x3FB6C10DE6A703FF /* A01 = +8.888327485239243264115e-02.  */
+	.quad	0xBF956C566D8BE1F6 /* A02 = -2.092108768099084498138e-02.  */
+	.quad	0x3F5B46D1A4A59CF8 /* A03 = +1.664833764687232917079e-03.  */
+	.quad	0x3FEC858494887A04 /* A00 = +8.912985707318630268503e-01.  */
+	.quad	0x3FB2CC31F543394D /* A01 = +7.342827070099140762682e-02.  */
+	.quad	0xBF9133477FF69137 /* A02 = -1.679717749142747504343e-02.  */
+	.quad	0x3F5544482FBB4DA5 /* A03 = +1.298017973501022466823e-03.  */
+	.quad	0x3FED0DB59D0E32E9 /* A00 = +9.079235141267335551518e-01.  */
+	.quad	0x3FAF006BAFFC6EF4 /* A01 = +6.055008433597022787787e-02.  */
+	.quad	0xBF8B97146FA2B97A /* A02 = -1.347175565419144252499e-02.  */
+	.quad	0x3F5093B01F4CDC69 /* A03 = +1.011774057770665211434e-03.  */
+	.quad	0x3FEDB487C3EC457C /* A00 = +9.282873942012623835751e-01.  */
+	.quad	0x3FA7390C09D0BD1D /* A01 = +4.535710925881118044112e-02.  */
+	.quad	0xBF83D9F7C3181106 /* A02 = -9.693084374710735778846e-03.  */
+	.quad	0x3F46E34A0A3C0E64 /* A03 = +6.984817050299072134500e-04.  */
+	.quad	0x3FEE5FFCB4E6EB00 /* A00 = +9.492171796076434020506e-01.  */
+	.quad	0x3F9F4913ED00AADF /* A01 = +3.055220731782070861526e-02.  */
+	.quad	0xBF79670BD0E59B5C /* A02 = -6.201788097633133961528e-03.  */
+	.quad	0x3F3BC998EBCAF96D /* A03 = +4.240034429975534616304e-04.  */
+	.quad	0x3FEEDBA41E9542FE /* A00 = +9.643116566968215064293e-01.  */
+	.quad	0x3F94F5DD18D9C24D /* A01 = +2.046914543319848858727e-02.  */
+	.quad	0xBF7034896AA122B9 /* A02 = -3.956352980886528904192e-03.  */
+	.quad	0x3F30DCCB47810B39 /* A03 = +2.573009765038273091199e-04.  */
+	.quad	0x3FEF33F2882520ED /* A00 = +9.750912341196716903724e-01.  */
+	.quad	0x3F8BF37F2CF553FF /* A01 = +1.364802699996836392315e-02.  */
+	.quad	0xBF649F6F05A69619 /* A02 = -2.517430152880317534986e-03.  */
+	.quad	0x3F247623C950AAC9 /* A03 = +1.561087307505231250044e-04.  */
+	.quad	0x3FEF727757751741 /* A00 = +9.827229221489021115943e-01.  */
+	.quad	0x3F828E67912C4400 /* A01 = +9.060677640748693306705e-03.  */
+	.quad	0xBF5A2F51A806CC2C /* A02 = -1.598195784123355826789e-03.  */
+	.quad	0x3F18D35D7687E613 /* A03 = +9.470231965016282719549e-05.  */
+	.quad	0x3FEF9E6325C5942A /* A00 = +9.880843866091073568469e-01.  */
+	.quad	0x3F788AB117618F76 /* A01 = +5.991641772286606867914e-03.  */
+	.quad	0xBF5096EAB0B1EA89 /* A02 = -1.012543859160305046233e-03.  */
+	.quad	0x3F0E1E50EC4435AB /* A03 = +5.744633156910412119652e-05.  */
+	.quad	0x3FEFBD0784049369 /* A00 = +9.918248728250605994461e-01.  */
+	.quad	0x3F702BBD8294035F /* A01 = +3.947963975634432264028e-03.  */
+	.quad	0xBF44FB55E0F00593 /* A02 = -6.403130845457509273330e-04.  */
+	.quad	0x3F0244DCD723230A /* A03 = +3.484534217219031730379e-05.  */
+	.quad	0x3FEFD245E2366A43 /* A00 = +9.944180887426415926811e-01.  */
+	.quad	0x3F653D82EC088433 /* A01 = +2.592807490387838333795e-03.  */
+	.quad	0xBF3A7DF75E013CB8 /* A02 = -4.042366908878036561859e-04.  */
+	.quad	0x3EF6298E69F991CD /* A03 = +2.113564425911141559972e-05.  */
+	.quad	0x3FEFE0EAA508BC69 /* A00 = +9.962056372950317539861e-01.  */
+	.quad	0x3F5BD0771AF3FDDA /* A01 = +1.697651208644282514598e-03.  */
+	.quad	0xBF30B2E1254DE571 /* A02 = -2.548026725928887099328e-04.  */
+	.quad	0x3EEAE28B70EC0256 /* A03 = +1.281973848454955042307e-05.  */
+	.quad	0x3FEFEAF5303D7F96 /* A00 = +9.974313680831865536192e-01.  */
+	.quad	0x3F5229111365657E /* A01 = +1.108423877289460134782e-03.  */
+	.quad	0xBF250572D04DFE66 /* A02 = -1.603796628408704519168e-04.  */
+	.quad	0x3EE04E89BB57C981 /* A03 = +7.775682983689149966743e-06.  */
+	.quad	0x3FEFF1CF52F1CF44 /* A00 = +9.982678051005469122003e-01.  */
+	.quad	0x3F47A71316147CEB /* A01 = +7.218211359577819110842e-04.  */
+	.quad	0xBF1A6D7604055719 /* A02 = -1.008132248946049582547e-04.  */
+	.quad	0x3ED3C8047586A85C /* A03 = +4.716233739913014633626e-06.  */
+	.quad	0x3FEFF6770369EF69 /* A00 = +9.988360468555416149528e-01.  */
+	.quad	0x3F3EBB261180FBF0 /* A01 = +4.689186039321105101130e-04.  */
+	.quad	0xBF1097754FE19D7F /* A02 = -6.329206004950480057066e-05.  */
+	.quad	0x3EC7FEFF83BCA0A7 /* A03 = +2.860556404988488738366e-06.  */
+	.quad	0x3FEFF99D42371AC4 /* A00 = +9.992204945818561334647e-01.  */
+	.quad	0x3F33EB2AEC271F59 /* A01 = +3.039340773764907474054e-04.  */
+	.quad	0xBF04CF18E0FC0D79 /* A02 = -3.968996690952969588805e-05.  */
+	.quad	0x3EBD1BDBD6019BE9 /* A03 = +1.735021065507727833886e-06.  */
+	.quad	0x3FEFFBBCA32B0D91 /* A00 = +9.994795977476532700123e-01.  */
+	.quad	0x3F29C41E1615110A /* A01 = +1.965796209707565346710e-04.  */
+	.quad	0xBEFA11F93D9DCB5A /* A02 = -2.486248909101414873235e-05.  */
+	.quad	0x3EB1A7CA4546F7A7 /* A03 = +1.052345642723709228769e-06.  */
+	.quad	0x3FEFFD298B8E8DE2 /* A00 = +9.996535993308806045121e-01.  */
+	.quad	0x3F20A1C42D523C5B /* A01 = +1.268913244172078754520e-04.  */
+	.quad	0xBEF0507A364AFAE4 /* A02 = -1.555859070622834605755e-05.  */
+	.quad	0x3EA56ACA17E7CDF4 /* A03 = +6.382806956848098872313e-07.  */
+	.quad	0x3FEFFE1DC82BA5A3 /* A00 = +9.997700604991915929176e-01.  */
+	.quad	0x3F156E73B90F1769 /* A01 = +8.175450626798714452801e-05.  */
+	.quad	0xBEE4663579D0A09F /* A02 = -9.727122057226747625365e-06.  */
+	.quad	0x3E99FAF6FEC5D4C1 /* A03 = +3.871371052824002996020e-07.  */
+	.quad	0x3FEFFEF8D0BB5E81 /* A00 = +9.998745037837154514548e-01.  */
+	.quad	0x3F06686DA18D39C3 /* A01 = +4.273972098777251447726e-05.  */
+	.quad	0xBED46BC298073E90 /* A02 = -4.868731025855742842491e-06.  */
+	.quad	0x3E88E42286B9D0FD /* A03 = +1.854535328530838170114e-07.  */
+	.quad	0x3FEFFF8DBC68DDC7 /* A00 = +9.999455146670975791423e-01.  */
+	.quad	0x3EF26B2953A80AF0 /* A01 = +1.756534514108903368909e-05.  */
+	.quad	0xBEBFC4472D580F83 /* A02 = -1.893443529411295465239e-06.  */
+	.quad	0x3E72505B4553D19F /* A03 = +6.822456673547912277047e-08.  */
+	.quad	0x3FEFFFCED1276609 /* A00 = +9.999765477215883935358e-01.  */
+	.quad	0x3EDE1A94C7CC58F5 /* A01 = +7.177313020153979672606e-06.  */
+	.quad	0xBEA8A2C988744E57 /* A02 = -7.342066660497443762363e-07.  */
+	.quad	0x3E5AF30036BBBAF4 /* A03 = +2.509841882843541084885e-08.  */
+	.quad	0x3FEFFFEAFE70FCFC /* A00 = +9.999899835164849370983e-01.  */
+	.quad	0x3EC879175E3549F5 /* A01 = +2.917410471128503564412e-06.  */
+	.quad	0xBE930E36677D1813 /* A02 = -2.839493400307523115929e-07.  */
+	.quad	0x3E43D4005B42D48F /* A03 = +9.233192745401904898013e-09.  */
+	.quad	0x3ff0000000000000
+	.quad	0x0000000000000000
+	.quad	0x0000000000000000
+	.quad	0x0000000000000000
+
+	.align	32
+	.type	__svml_stanh_data_internal_avx2, @object
+	.size	__svml_stanh_data_internal_avx2, .-__svml_stanh_data_internal_avx2
+	.type	__svml_stanh_data_internal, @object
+	.size	__svml_stanh_data_internal, .-__svml_stanh_data_internal
+#endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH v3 6/7] x86: Optimize svml_s_tanhf8_core_avx2.S
  2022-06-09 16:58 ` [PATCH v3 " Noah Goldstein
                     ` (3 preceding siblings ...)
  2022-06-09 16:58   ` [PATCH v3 5/7] x86: Add data file that can be shared by tanhf-avx2 and tanhf-sse4 Noah Goldstein
@ 2022-06-09 16:58   ` Noah Goldstein
  2022-06-09 17:09     ` H.J. Lu
  2022-06-09 16:58   ` [PATCH v3 7/7] x86: Optimize svml_s_tanhf4_core_sse4.S Noah Goldstein
  2022-06-09 17:04   ` [PATCH v3 1/7] x86: Improve svml_s_atanhf16_core_avx512.S H.J. Lu
  6 siblings, 1 reply; 48+ messages in thread
From: Noah Goldstein @ 2022-06-09 16:58 UTC (permalink / raw)
  To: libc-alpha

Optimizations are:
    1. Reduce code size (-81 bytes).
    2. Remove redundant move instructions.
    3. Slightly improve instruction selection/scheduling where
       possible.
    4. Prefer registers which get short instruction encoding.
    5. Reduce rodata size (-32 bytes).

Result is roughly a 17-18% speedup:

       Function, New Time, Old Time, New / Old
_ZGVdN8v_tanhf,     1.977,    2.402,     0.823
---
 .../fpu/multiarch/svml_s_tanhf8_core_avx2.S   | 912 ++++--------------
 1 file changed, 171 insertions(+), 741 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
index c5c87bf5b0..c40cd57691 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
@@ -70,773 +70,203 @@
  *
  */
 
-/* Offsets for data table __svml_stanh_data_internal
- */
-#define _dbP				0
-#define _sSignMask			4288
-#define _sAbsMask			4320
-#define _iExpMantMask			4352
-#define _iExpMask			4384
-#define _iMinIdxOfsMask			4416
-#define _iMaxIdxMask			4448
-
 #include <sysdep.h>
 
+/* tanhf data tables for avx2 and sse4 implementations defined here.
+ */
+#include "svml_s_tanhf_rodata.S"
+
 	.section .text.avx2, "ax", @progbits
 ENTRY(_ZGVdN8v_tanhf_avx2)
-	pushq	%rbp
-	cfi_def_cfa_offset(16)
-	movq	%rsp, %rbp
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
-	andq	$-32, %rsp
-	pushq	%r12
-	subq	$120, %rsp
-	lea	_dbP+16+__svml_stanh_data_internal(%rip), %r10
-	vmovaps	%ymm0, %ymm12
-
 	/* Here huge arguments, INF and NaNs are filtered out to callout. */
-	vpand	_iExpMantMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm14
+	vpand	TANHF_DATA(_iExpMantMask)(%rip), %ymm0, %ymm4
+	vpsubd	TANHF_DATA(_iMinIdxOfsMask)(%rip), %ymm4, %ymm2
+
+	/* Selection of arguments between [0, 0x04280000] into ymm2.  */
+	vpxor	%ymm3, %ymm3, %ymm3
+	vpmaxsd	%ymm3, %ymm2, %ymm2
+	vpminsd	TANHF_DATA(_iMaxIdxMask)(%rip), %ymm2, %ymm2
 
 	/*
 	 *  small table specific variables *
 	 *  Constant loading
 	 */
-	vmovups	_iMaxIdxMask+__svml_stanh_data_internal(%rip), %ymm8
-	vpsubd	_iMinIdxOfsMask+__svml_stanh_data_internal(%rip), %ymm14, %ymm9
-
-	/* if VMIN, VMAX is defined for I type */
-	vxorps	%ymm15, %ymm15, %ymm15
-	vpcmpgtd %ymm15, %ymm9, %ymm0
-	vpand	%ymm0, %ymm9, %ymm7
-	vpcmpgtd %ymm8, %ymm9, %ymm6
-	vblendvps %ymm6, %ymm8, %ymm7, %ymm3
-	vpsrld	$14, %ymm3, %ymm1
-	vpcmpgtd _iExpMask+__svml_stanh_data_internal(%rip), %ymm14, %ymm13
-	vmovmskps %ymm13, %r11d
-	vandps	_sAbsMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm10
-	vandps	_sSignMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm11
-	vextractf128 $1, %ymm1, %xmm2
-	vmovd	%xmm1, %r9d
-	vmovd	%xmm2, %ecx
-	vpextrd	$1, %xmm2, %edx
-	vpextrd	$1, %xmm1, %r8d
-	movslq	%r9d, %r9
-	movslq	%edx, %rdx
-	movslq	%r8d, %r8
-	vpextrd	$2, %xmm1, %edi
-	movslq	%ecx, %rcx
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
-	vpextrd	$3, %xmm2, %r12d
-	vpextrd	$3, %xmm1, %esi
-	vpextrd	$2, %xmm2, %eax
-	movslq	%edi, %rdi
-	movslq	%r12d, %r12
-	movslq	%esi, %rsi
-	movslq	%eax, %rax
-	vmovupd	-16(%r9, %r10), %xmm5
-	vmovupd	-16(%rdx, %r10), %xmm14
-	vmovupd	-16(%rcx, %r10), %xmm13
-	vmovupd	(%r9, %r10), %xmm1
-	vmovupd	(%r8, %r10), %xmm2
-	vmovupd	-16(%r8, %r10), %xmm4
-	vinsertf128 $1, -16(%rdi, %r10), %ymm5, %ymm15
-	vinsertf128 $1, -16(%r12, %r10), %ymm14, %ymm3
-	vinsertf128 $1, -16(%rax, %r10), %ymm13, %ymm6
-	vinsertf128 $1, (%rdi, %r10), %ymm1, %ymm5
-	vinsertf128 $1, (%rsi, %r10), %ymm2, %ymm14
-	vunpcklpd %ymm3, %ymm6, %ymm8
+	vpsrld	$14, %ymm2, %ymm1
+
+	/* We are splitting ymm1 into 8 GPRs. This may be faster to do with
+	   store/load as we can take advantage of store-forwarding.  */
+	vmovq	%xmm1, %r8
+	/* We have eliminated all negative values for ymm1 so no need to sign
+	   extend.  */
+	movl	%r8d, %r9d
+	shrq	$32, %r8
+
+	/* Store base of lookup table in rax.  */
+	leaq	TANHF_DATA(_lookupTable)(%rip), %rax
+
+	/* Instead of using cross-lane permutes on ymm vectors, use vinsertf128
+	   with memory operand. This helps alleviate bottleneck on p5.  */
+	vmovupd	16(%r9, %rax), %xmm5
+
+	vpextrq	$1, %xmm1, %rsi
+	movl	%esi, %edi
+	shrq	$32, %rsi
+
+	vinsertf128 $1, 16(%rdi, %rax), %ymm5, %ymm5
+
+	vextracti128 $1, %ymm1, %xmm2
+	vmovq	%xmm2, %rdx
+	movl	%edx, %ecx
+	shrq	$32, %rdx
+
+	vmovupd	(%rcx, %rax), %xmm6
+
+	vpextrq	$1, %xmm2, %r10
+	movl	%r10d, %r11d
+	shrq	$32, %r10
+
+	vinsertf128 $1, (%r11, %rax), %ymm6, %ymm6
+
+	vmovupd	16(%r8, %rax), %xmm1
+	vinsertf128 $1, 16(%rsi, %rax), %ymm1, %ymm1
+	vmovupd	(%rdx, %rax), %xmm3
+	vinsertf128 $1, (%r10, %rax), %ymm3, %ymm3
+
+	vunpcklpd %ymm3, %ymm6, %ymm7
 	vunpckhpd %ymm3, %ymm6, %ymm6
-	vunpcklpd %ymm14, %ymm5, %ymm3
-	vunpckhpd %ymm14, %ymm5, %ymm2
-	vmovupd	(%rcx, %r10), %xmm13
-	vcvtps2pd %xmm10, %ymm5
-	vextractf128 $1, %ymm10, %xmm10
-	vfmadd213pd %ymm3, %ymm5, %ymm2
-	vinsertf128 $1, -16(%rsi, %r10), %ymm4, %ymm0
-	vmovupd	(%rdx, %r10), %xmm4
-	vunpcklpd %ymm0, %ymm15, %ymm9
-	vunpckhpd %ymm0, %ymm15, %ymm7
-	vfmadd213pd %ymm7, %ymm5, %ymm2
-	vfmadd213pd %ymm9, %ymm5, %ymm2
-	vinsertf128 $1, (%r12, %r10), %ymm4, %ymm0
-	vcvtps2pd %xmm10, %ymm4
-	vinsertf128 $1, (%rax, %r10), %ymm13, %ymm15
-	vunpcklpd %ymm0, %ymm15, %ymm1
-	vunpckhpd %ymm0, %ymm15, %ymm0
-	vfmadd213pd %ymm1, %ymm4, %ymm0
-	vcvtpd2ps %ymm2, %xmm1
-	vfmadd213pd %ymm6, %ymm4, %ymm0
-	vfmadd213pd %ymm8, %ymm4, %ymm0
-	vcvtpd2ps %ymm0, %xmm0
-	vinsertf128 $1, %xmm0, %ymm1, %ymm2
-	vorps	%ymm11, %ymm2, %ymm0
-	testl	%r11d, %r11d
 
-	/* Go to special inputs processing branch */
-	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx r13 r14 r15 r11d ymm0 ymm12
+	vunpcklpd %ymm1, %ymm5, %ymm3
+	vunpckhpd %ymm1, %ymm5, %ymm1
 
-	/* Restore registers
-	 * and exit the function
-	 */
+	vmovaps	TANHF_DATA(_sAbsMask)(%rip), %ymm11
+	/* Store special cases in ymm15.  */
+	vpcmpgtd TANHF_DATA(_iExpMask)(%rip), %ymm4, %ymm15
 
-L(EXIT):
-	addq	$120, %rsp
-	cfi_restore(12)
-	popq	%r12
-	movq	%rbp, %rsp
-	popq	%rbp
-	cfi_def_cfa(7, 8)
-	cfi_restore(6)
-	ret
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
+	vandps	%ymm11, %ymm0, %ymm4
 
-	/* Branch to process
-	 * special inputs
-	 */
+	vcvtps2pd %xmm4, %ymm5
 
-L(SPECIAL_VALUES_BRANCH):
-	vmovups	%ymm12, 32(%rsp)
-	vmovups	%ymm0, 64(%rsp)
-	# LOE rbx r13 r14 r15 r11d ymm0
+	vextractf128 $1, %ymm4, %xmm4
+	vcvtps2pd %xmm4, %ymm4
 
-	xorl	%r12d, %r12d
-	# LOE rbx r13 r14 r15 r11d r12d
+	vmovupd	16(%rcx, %rax), %xmm2
+	vinsertf128 $1, 16(%r11, %rax), %ymm2, %ymm2
 
-	vzeroupper
-	movq	%r13, 8(%rsp)
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
-	movl	%r11d, %r13d
-	movq	%r14, (%rsp)
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r15 r12d r13d
-
-	/* Range mask
-	 * bits check
-	 */
+	vfmadd213pd %ymm3, %ymm5, %ymm1
+
+	vmovupd	16(%rdx, %rax), %xmm3
+	vinsertf128 $1, 16(%r10, %rax), %ymm3, %ymm3
+
+	vunpcklpd %ymm3, %ymm2, %ymm10
+	vunpckhpd %ymm3, %ymm2, %ymm2
+
+	vfmadd213pd %ymm10, %ymm4, %ymm2
+	vfmadd213pd %ymm6, %ymm4, %ymm2
+	vfmadd213pd %ymm7, %ymm4, %ymm2
+	vcvtpd2ps %ymm2, %xmm2
+
+	vmovupd	(%r9, %rax), %xmm7
+	vinsertf128 $1, (%rdi, %rax), %ymm7, %ymm7
+
+	vmovupd	(%r8, %rax), %xmm3
+	vinsertf128 $1, (%rsi, %rax), %ymm3, %ymm3
+
+	vunpckhpd %ymm3, %ymm7, %ymm4
+	vunpcklpd %ymm3, %ymm7, %ymm7
 
-L(RANGEMASK_CHECK):
-	btl	%r12d, %r13d
+	vfmadd213pd %ymm4, %ymm5, %ymm1
+	vfmadd213pd %ymm7, %ymm5, %ymm1
+
+
+	vcvtpd2ps %ymm1, %xmm1
+	vinsertf128 $1, %xmm2, %ymm1, %ymm1
+
+	vmovmskps %ymm15, %edx
+	vandnps	%ymm0, %ymm11, %ymm2
+	testl	%edx, %edx
+	/* Go to special inputs processing branch */
+	jne	L(SPECIAL_VALUES_BRANCH)
+	# LOE rbx r12 r13 r14 r15 ymm0 ymm1 ymm2
+	/* Wait until after branch to write over ymm0.  */
+	vorps	%ymm2, %ymm1, %ymm0
+	/* No stack restoration on the fastpath.  */
+	ret
 
-	/* Call scalar math function */
-	jc	L(SCALAR_MATH_CALL)
-	# LOE rbx r15 r12d r13d
 
-	/* Special inputs
-	 * processing loop
+	/* Cold case. edx has 1s where there was a special value that
+	   needs to be handled by a tanhf call. Optimize for code size
+	   more so than speed here. */
+L(SPECIAL_VALUES_BRANCH):
+	# LOE rbx rdx r12 r13 r14 r15 ymm0 ymm1 ymm2
+    /* Use r13 to save/restore the stack. This allows us to use rbp as a
+       callee-saved register, saving code size. */
+	pushq	%r13
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(r13, -16)
+	/* Need callee-saved registers to preserve state across tanhf calls.
 	 */
+	pushq	%rbx
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(rbx, -24)
+	pushq	%rbp
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(rbp, -32)
+	movq	%rsp, %r13
+	cfi_def_cfa_register(r13)
+
+	/* Align stack and make room for 2x ymm vectors.  */
+	andq	$-32, %rsp
+	addq	$-64, %rsp
+
+	/* Save all already computed inputs.  */
+	vorps	%ymm2, %ymm1, %ymm1
+	vmovaps	%ymm1, (%rsp)
+	/* Save original input (ymm0 unchanged up to this point).  */
+	vmovaps	%ymm0, 32(%rsp)
+
+	vzeroupper
 
+	/* edx has 1s where there was a special value that needs to be handled
+	   by a tanhf call.  */
+	movl	%edx, %ebx
 L(SPECIAL_VALUES_LOOP):
-	incl	%r12d
-	cmpl	$8, %r12d
-
-	/* Check bits in range mask */
-	jl	L(RANGEMASK_CHECK)
-	# LOE rbx r15 r12d r13d
-
-	movq	8(%rsp), %r13
-	cfi_restore(13)
-	movq	(%rsp), %r14
-	cfi_restore(14)
-	vmovups	64(%rsp), %ymm0
-
-	/* Go to exit */
-	jmp	L(EXIT)
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r13 r14 r15 ymm0
-
-	/* Scalar math fucntion call
-	 * to process special input
-	 */
+	# LOE rbx rbp r12 r13 r14 r15
+	/* use rbp as index for special value that is saved across calls to
+	   tanhf. We technically don't need a callee save register here as offset
+	   to rsp is always [0, 28] so we can restore rsp by realigning to 64.
+	   Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
+	   in the loop. Realigning also costs more code size.  */
+	xorl	%ebp, %ebp
+	tzcntl	%ebx, %ebp
 
-L(SCALAR_MATH_CALL):
-	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	/* Scalar math function call to process special input.  */
+	movss	32(%rsp, %rbp, 4), %xmm0
 	call	tanhf@PLT
-	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	/* No good way to avoid the store-forwarding fault this will cause on
+	   return. `lfence` avoids the SF fault but at greater cost as it
+	   serializes stack/callee save restoration.  */
+	movss	%xmm0, (%rsp, %rbp, 4)
+
+	blsrl   %ebx, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+	# LOE r12 r13 r14 r15
 
-	/* Process special inputs in loop */
-	jmp	L(SPECIAL_VALUES_LOOP)
-	# LOE rbx r15 r12d r13d
-END(_ZGVdN8v_tanhf_avx2)
 
-	.section .rodata, "a"
-	.align	32
-
-#ifdef __svml_stanh_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(32)) VUINT32 _dbP[(134*4)][2];
-	__declspec(align(32)) VUINT32 _sSignMask[8][1];
-	__declspec(align(32)) VUINT32 _sAbsMask[8][1];
-	__declspec(align(32)) VUINT32 _iExpMantMask[8][1];
-	__declspec(align(32)) VUINT32 _iExpMask[8][1];
-	__declspec(align(32)) VUINT32 _iMinIdxOfsMask[8][1];
-	__declspec(align(32)) VUINT32 _iMaxIdxMask[8][1];
-} __svml_stanh_data_internal;
-#endif
-__svml_stanh_data_internal:
-	/* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500]. */
-	.quad	0x0000000000000000 /* A00 = +0.000000000000000000000e-01 */
-	.quad	0x3FF00000022C70EB /* A01 = +1.000000008097283510367e+00 */
-	.quad	0xBED00E878CFFA194 /* A02 = -3.828228912518614443549e-06 */
-	.quad	0xBFD551766D0607A9 /* A03 = -3.330970825846813476723e-01 */
-	.quad	0xBE53D60CE3E4C297 /* A00 = -1.847383956330407336230e-08 */
-	.quad	0x3FF000024177CF5C /* A01 = +1.000002151235967140508e+00 */
-	.quad	0xBF1758BC94A51A25 /* A02 = -8.906031613262943753568e-05 */
-	.quad	0xBFD53EAE67E0D4F0 /* A03 = -3.319507612644221339337e-01 */
-	.quad	0xBE5A9E47EF32D6FE /* A00 = -2.479020984039698285657e-08 */
-	.quad	0x3FF00002DA983057 /* A01 = +1.000002721676556793895e+00 */
-	.quad	0xBF1BD953509E94AA /* A02 = -1.062352277175377670507e-04 */
-	.quad	0xBFD53BDB562EEDD5 /* A03 = -3.317783681520414806876e-01 */
-	.quad	0xBE6191BBE496D294 /* A00 = -3.272532162914017685901e-08 */
-	.quad	0x3FF0000390492017 /* A01 = +1.000003398528866105366e+00 */
-	.quad	0xBF20727E814A57CE /* A02 = -1.254825043772153972919e-04 */
-	.quad	0xBFD538DE060A6F22 /* A03 = -3.315959033004550748913e-01 */
-	.quad	0xBE66DAFA2A893A25 /* A00 = -4.257146219278012568149e-08 */
-	.quad	0x3FF0000465E08CD1 /* A01 = +1.000004194219219266770e+00 */
-	.quad	0xBF2341C765EF91B6 /* A02 = -1.469188600530365522261e-04 */
-	.quad	0xBFD535B6841FAF9E /* A03 = -3.314033785124993469751e-01 */
-	.quad	0xBE6D5794E361E964 /* A00 = -5.465394929765249413434e-08 */
-	.quad	0x3FF000055EE2A0CB /* A01 = +1.000005121846742950353e+00 */
-	.quad	0xBF265E6C77E66C8B /* A02 = -1.706607253709506650304e-04 */
-	.quad	0xBFD53264DDCCEDA6 /* A03 = -3.312008062382240103361e-01 */
-	.quad	0xBE729C844D374A6E /* A00 = -6.933284462462096107184e-08 */
-	.quad	0x3FF000067F019093 /* A01 = +1.000006195180536350264e+00 */
-	.quad	0xBF29CC5348D6DCE5 /* A02 = -1.968242326435338705130e-04 */
-	.quad	0xBFD52EE92121ED35 /* A03 = -3.309881995734998416658e-01 */
-	.quad	0xBE775AEA17EAA872 /* A00 = -8.700465590574974405858e-08 */
-	.quad	0x3FF00007CA1D66B8 /* A01 = +1.000007428656699559610e+00 */
-	.quad	0xBF2D8F5EB98A2637 /* A02 = -2.255252009216044881395e-04 */
-	.quad	0xBFD52B435CDF9128 /* A03 = -3.307655722585587376727e-01 */
-	.quad	0xBE7D04DA28C343F0 /* A00 = -1.081040272327705484794e-07 */
-	.quad	0x3FF000094443CCF5 /* A01 = +1.000008837375216730337e+00 */
-	.quad	0xBF30D5B76C947AE5 /* A02 = -2.568791210978817814332e-04 */
-	.quad	0xBFD52773A0776FAD /* A03 = -3.305329386764651045105e-01 */
-	.quad	0xBE81DD77A12C51C7 /* A00 = -1.331054169875768625701e-07 */
-	.quad	0x3FF0000AF1AFD2DA /* A01 = +1.000010437096696680470e+00 */
-	.quad	0xBF331230624C1680 /* A02 = -2.910011410651516805537e-04 */
-	.quad	0xBFD52379FC0B61DF /* A03 = -3.302903138515186909352e-01 */
-	.quad	0xBE85D04EEEB3C435 /* A00 = -1.625247628488202841012e-07 */
-	.quad	0x3FF0000CD6C9B1F2 /* A01 = +1.000012244238970726684e+00 */
-	.quad	0xBF357F0742FADDD4 /* A02 = -3.280060509313874068243e-04 */
-	.quad	0xBFD51F56806D0E81 /* A03 = -3.300377134475880880338e-01 */
-	.quad	0xBE8A6E289B59681B /* A00 = -1.969211333326924655065e-07 */
-	.quad	0x3FF0000EF8268F72 /* A01 = +1.000014275873550406715e+00 */
-	.quad	0xBF381E277A1B747A /* A02 = -3.680082682942575423093e-04 */
-	.quad	0xBFD51B093F1D6FD4 /* A03 = -3.297751537663746734808e-01 */
-	.quad	0xBE8FCBC40EE9ABD5 /* A00 = -2.368983653301529373887e-07 */
-	.quad	0x3FF000115A883B6C /* A01 = +1.000016549721943981410e+00 */
-	.quad	0xBF3AF17AC974B3D9 /* A02 = -4.111218235774406434303e-04 */
-	.quad	0xBFD516924A4C549C /* A03 = -3.295026517456081105450e-01 */
-	.quad	0xBE92FFBC60A3F956 /* A00 = -2.831066871072026054144e-07 */
-	.quad	0x3FF0001402DCED8A /* A01 = +1.000019084151832604590e+00 */
-	.quad	0xBF3DFAE9390C4801 /* A02 = -4.574603454311488280083e-04 */
-	.quad	0xBFD511F1B4D7DC3A /* A03 = -3.292202249571719585575e-01 */
-	.quad	0xBE9690A22F96D5AD /* A00 = -3.362443262393081632612e-07 */
-	.quad	0x3FF00016F63EFF5D /* A01 = +1.000021898173108825247e+00 */
-	.quad	0xBF409E2C839605BB /* A02 = -5.071370461992499986334e-04 */
-	.quad	0xBFD50D27924BEE00 /* A03 = -3.289278916051614487515e-01 */
-	.quad	0xBE9AA56C65E72A73 /* A00 = -3.970591019557469835586e-07 */
-	.quad	0x3FF0001A39F4A43E /* A01 = +1.000025011433776978009e+00 */
-	.quad	0xBF425BD74C3D6667 /* A02 = -5.602647074553602319844e-04 */
-	.quad	0xBFD50833F6E1ABA2 /* A03 = -3.286256705238718156536e-01 */
-	.quad	0xBE9F4BD4FF1A83B0 /* A00 = -4.663500013744687071912e-07 */
-	.quad	0x3FF0001DD36F9EC2 /* A01 = +1.000028444215715683896e+00 */
-	.quad	0xBF44376634149405 /* A02 = -6.169556656102642569831e-04 */
-	.quad	0xBFD50316F77EDEE5 /* A03 = -3.283135811757190158922e-01 */
-	.quad	0xBEA3B625387BB079 /* A00 = -5.874486399249461304297e-07 */
-	.quad	0x3FF00023E14CFBA9 /* A01 = +1.000034217911642153709e+00 */
-	.quad	0xBF47392F923218D2 /* A02 = -7.087213783883111826306e-04 */
-	.quad	0xBFD4FB1FACDEB938 /* A03 = -3.278273761924483942209e-01 */
-	.quad	0xBEAA6E24F543500A /* A00 = -7.876828740601738750574e-07 */
-	.quad	0x3FF0002D5C6E8412 /* A01 = +1.000043259679163742959e+00 */
-	.quad	0xBF4BAF02BD7FDD70 /* A02 = -8.448375110664940040861e-04 */
-	.quad	0xBFD4EFEE6527A7DE /* A03 = -3.271442401734229177279e-01 */
-	.quad	0xBEB16E3EBE2157D0 /* A00 = -1.038947396133402500647e-06 */
-	.quad	0x3FF00038990FEE2F /* A01 = +1.000053975962952312884e+00 */
-	.quad	0xBF50569481C574CB /* A02 = -9.972048056490652716971e-04 */
-	.quad	0xBFD4E419278DA2B4 /* A03 = -3.264220129263251113372e-01 */
-	.quad	0xBEB6A7B6723165D4 /* A00 = -1.350350836279403750524e-06 */
-	.quad	0x3FF00045CAB4158E /* A01 = +1.000066558657042303793e+00 */
-	.quad	0xBF531D7C9C849108 /* A02 = -1.166698160951775212202e-03 */
-	.quad	0xBFD4D7A0BB33B152 /* A03 = -3.256608799117844954552e-01 */
-	.quad	0xBEBD0EE2A8654AFD /* A00 = -1.732000471561702711532e-06 */
-	.quad	0x3FF00055276F18D6 /* A01 = +1.000081209219890521211e+00 */
-	.quad	0xBF562FDBA3FB6C6C /* A02 = -1.354183666925102939860e-03 */
-	.quad	0xBFD4CA85F1B93DB2 /* A03 = -3.248610363561638125773e-01 */
-	.quad	0xBEC269D4036A207E /* A00 = -2.195047297096822741730e-06 */
-	.quad	0x3FF00066E7DA6E4E /* A01 = +1.000098138500919997540e+00 */
-	.quad	0xBF5991499FC36B3A /* A02 = -1.560518167983372759405e-03 */
-	.quad	0xBFD4BCC9A72283D6 /* A03 = -3.240226871658341556426e-01 */
-	.quad	0xBEC7154B6C09CFE1 /* A00 = -2.751729738565190291276e-06 */
-	.quad	0x3FF0007B47086B80 /* A01 = +1.000117566559055148900e+00 */
-	.quad	0xBF5D455433B4F8F4 /* A02 = -1.786548832412968197680e-03 */
-	.quad	0xBFD4AE6CC1BFE145 /* A03 = -3.231460468373550942722e-01 */
-	.quad	0xBECCA68CC64A0F8A /* A00 = -3.415415948561670285790e-06 */
-	.quad	0x3FF00092827742F7 /* A01 = +1.000139722473418535387e+00 */
-	.quad	0xBF60A7BF15A527AF /* A02 = -2.033112728132522705610e-03 */
-	.quad	0xBFD49F703214084C /* A03 = -3.222313393636155876010e-01 */
-	.quad	0xBED19E68676B241B /* A00 = -4.200644630977303616698e-06 */
-	.quad	0x3FF000ACDA037B26 /* A01 = +1.000164844146362863597e+00 */
-	.quad	0xBF62D99F836A02F8 /* A02 = -2.301036405072284102280e-03 */
-	.quad	0xBFD48FD4F2B91B28 /* A03 = -3.212787981359945810311e-01 */
-	.quad	0xBED57CF4B0C7AA54 /* A00 = -5.123164339408145209103e-06 */
-	.quad	0x3FF000CA8FD9E1A1 /* A01 = +1.000193178099017865534e+00 */
-	.quad	0xBF653A014548E686 /* A02 = -2.591135484433962181405e-03 */
-	.quad	0xBFD47F9C0844B38F /* A03 = -3.202886658426046806447e-01 */
-	.quad	0xBEDA012B1B1A41E2 /* A00 = -6.199971197454598722328e-06 */
-	.quad	0x3FF000EBE868FDF4 /* A01 = +1.000224979259539459520e+00 */
-	.quad	0xBF67CA9427E0A544 /* A02 = -2.904214255086275467410e-03 */
-	.quad	0xBFD46EC6812ADB37 /* A03 = -3.192611943626845749655e-01 */
-	.quad	0xBEDF3EAC5BF12194 /* A00 = -7.449344990702664567927e-06 */
-	.quad	0x3FF001112A520784 /* A01 = +1.000260510744255704196e+00 */
-	.quad	0xBF6A8D01ABDA4DC4 /* A02 = -3.241065277345108255891e-03 */
-	.quad	0xBFD45D55759FFA4A /* A03 = -3.181966446572103146551e-01 */
-	.quad	0xBEE2A541BC274267 /* A00 = -8.890883582164319970972e-06 */
-	.quad	0x3FF0013A9E5961F2 /* A01 = +1.000300043631906721231e+00 */
-	.quad	0xBF6D82ECD080C540 /* A02 = -3.602468994380686462264e-03 */
-	.quad	0xBFD44B4A0779C0AD /* A03 = -3.170952866557950611259e-01 */
-	.quad	0xBEE61D97609A27F4 /* A00 = -1.054553560499505625520e-05 */
-	.quad	0x3FF001688F56A3AF /* A01 = +1.000343856731187974773e+00 */
-	.quad	0xBF7056F8EFB683EC /* A02 = -3.989193351487490407647e-03 */
-	.quad	0xBFD438A5620F0F74 /* A03 = -3.159573991399533543500e-01 */
-	.quad	0xBEEA145429EDD370 /* A00 = -1.243563138839952927732e-05 */
-	.quad	0x3FF0019B4A242A67 /* A01 = +1.000392236341804297339e+00 */
-	.quad	0xBF7207D31CA78D9B /* A02 = -4.401993423445739288258e-03 */
-	.quad	0xBFD42568BA16E7CD /* A03 = -3.147832696228050619602e-01 */
-	.quad	0xBEEE96370D52680F /* A00 = -1.458491207477835326165e-05 */
-	.quad	0x3FF001D31D8E4115 /* A01 = +1.000445476009251821736e+00 */
-	.quad	0xBF73D4CC11EDC094 /* A02 = -4.841611050196221316400e-03 */
-	.quad	0xBFD411954D8664E7 /* A03 = -3.135731942252974469021e-01 */
-	.quad	0xBEF338C046215EF8 /* A00 = -1.833122622260562810219e-05 */
-	.quad	0x3FF00230C32C2EC1 /* A01 = +1.000534784691737621998e+00 */
-	.quad	0xBF76BD019BCC5DAF /* A02 = -5.551344188254799492943e-03 */
-	.quad	0xBFD3F2C7156DC21E /* A03 = -3.116929730668135389848e-01 */
-	.quad	0xBEF9B15EAE411EAE /* A00 = -2.450261207822986676092e-05 */
-	.quad	0x3FF002C2DF057A4D /* A01 = +1.000674124886830940184e+00 */
-	.quad	0xBF7B08CCD9AC1E30 /* A02 = -6.600189396301511801646e-03 */
-	.quad	0xBFD3C7A7A114FED8 /* A03 = -3.090609620157755976777e-01 */
-	.quad	0xBF00E36483C373B3 /* A00 = -3.221178528332122595812e-05 */
-	.quad	0x3FF0036F419480D7 /* A01 = +1.000838524028997644777e+00 */
-	.quad	0xBF7FD255D1777007 /* A02 = -7.768950679260206403087e-03 */
-	.quad	0xBFD39A453911D6CE /* A03 = -3.062909180947429588215e-01 */
-	.quad	0xBF05DFA04DD12059 /* A00 = -4.172046622180685472624e-05 */
-	.quad	0x3FF00438B2A03D8D /* A01 = +1.001030633695197069599e+00 */
-	.quad	0xBF828F8DBB4A9D10 /* A02 = -9.062869337255224921890e-03 */
-	.quad	0xBFD36AAB704697D9 /* A03 = -3.033856007044711255993e-01 */
-	.quad	0xBF0BF3E0C647DEFB /* A00 = -5.331544597092331081714e-05 */
-	.quad	0x3FF005221063D36D /* A01 = +1.001253189109060359741e+00 */
-	.quad	0xBF857A2CB3C96102 /* A02 = -1.048693584122917590862e-02 */
-	.quad	0xBFD338E65BBB4FEC /* A03 = -3.003478904549854444639e-01 */
-	.quad	0xBF11A506ED7C9D31 /* A00 = -6.730894835681591541979e-05 */
-	.quad	0x3FF0062E4D0EA92A /* A01 = +1.001508999829250345925e+00 */
-	.quad	0xBF88AB82C2761AF3 /* A02 = -1.204588085125866091241e-02 */
-	.quad	0xBFD305028D6BD206 /* A03 = -2.971807843271395688234e-01 */
-	.quad	0xBF1607C0922D9BF1 /* A00 = -8.403885708006799337092e-05 */
-	.quad	0x3FF007606C341961 /* A01 = +1.001800940198869449560e+00 */
-	.quad	0xBF8C25E6DA487BCF /* A02 = -1.374416688582682892494e-02 */
-	.quad	0xBFD2CF0D0EE8F7B5 /* A03 = -2.938873906713255768075e-01 */
-	.quad	0xBF1B3A8480A0A16D /* A00 = -1.038688061788578038307e-04 */
-	.quad	0x3FF008BB802D02D6 /* A01 = +1.002131939589323561535e+00 */
-	.quad	0xBF8FEB8AE99FD100 /* A02 = -1.558598065819483124983e-02 */
-	.quad	0xBFD297135BD0911B /* A03 = -2.904709240558688843059e-01 */
-	.quad	0xBF20ABB9BDB75C65 /* A00 = -1.271881327357976163798e-04 */
-	.quad	0x3FF00A42A76D8CD1 /* A01 = +1.002504972472525901495e+00 */
-	.quad	0xBF91FF3D752BB9E6 /* A02 = -1.757522609380570560722e-02 */
-	.quad	0xBFD25D235C1F88B4 /* A03 = -2.869346999779154305799e-01 */
-	.quad	0xBF243D3254425461 /* A00 = -1.544116913733432829448e-04 */
-	.quad	0x3FF00BF909D1795E /* A01 = +1.002923048355647051011e+00 */
-	.quad	0xBF94304E04D44942 /* A02 = -1.971551804042204897316e-02 */
-	.quad	0xBFD2214B5E61CFA6 /* A03 = -2.832821294498394371075e-01 */
-	.quad	0xBF286070011B61CE /* A00 = -1.859795307186510085994e-04 */
-	.quad	0x3FF00DE1D5E1627E /* A01 = +1.003389201612804537689e+00 */
-	.quad	0xBF9689D5F4163F59 /* A02 = -2.201017668045266231780e-02 */
-	.quad	0xBFD1E39A11C3B42C /* A03 = -2.795167134743816728104e-01 */
-	.quad	0xBF2D250B366A79E8 /* A00 = -2.223564326486314902259e-04 */
-	.quad	0x3FF010003E134001 /* A01 = +1.003906481248123094829e+00 */
-	.quad	0xBF990C9FF91F6F81 /* A02 = -2.446222265267250853271e-02 */
-	.quad	0xBFD1A41E80084CDC /* A03 = -2.756420374218586655246e-01 */
-	.quad	0xBF314DB5DDC2A30E /* A00 = -2.640313157465248123865e-04 */
-	.quad	0x3FF012577608921B /* A01 = +1.004477940624503018441e+00 */
-	.quad	0xBF9BB9626875B0C9 /* A02 = -2.707437288829409385849e-02 */
-	.quad	0xBFD162E80768A9D0 /* A03 = -2.716617653228725615122e-01 */
-	.quad	0xBF346A6133808864 /* A00 = -3.115165050094957730625e-04 */
-	.quad	0x3FF014EAAFCC88A3 /* A01 = +1.005106627192198898157e+00 */
-	.quad	0xBF9E90BEF9BF7419 /* A02 = -2.984903716411588595059e-02 */
-	.quad	0xBFD12006545F7FAD /* A03 = -2.675796340899932457269e-01 */
-	.quad	0xBF37F180DC3848EA /* A00 = -3.653468704395550778821e-04 */
-	.quad	0x3FF017BD19147861 /* A01 = +1.005795572250939295955e+00 */
-	.quad	0xBFA0C9A14C702E07 /* A02 = -3.278831537326359207851e-02 */
-	.quad	0xBFD0DB895B650092 /* A03 = -2.633994476818851682154e-01 */
-	.quad	0xBF3BEC6AAC6D7635 /* A00 = -4.260788377246944457107e-04 */
-	.quad	0x3FF01AD1D884E719 /* A01 = +1.006547780778822565040e+00 */
-	.quad	0xBFA260B2A1B1434A /* A02 = -3.589399551186163439542e-02 */
-	.quad	0xBFD09581529E93D6 /* A03 = -2.591250712233067465817e-01 */
-	.quad	0xBF4164E26167882B /* A00 = -5.308251737086202562063e-04 */
-	.quad	0x3FF01FEF14B62B81 /* A01 = +1.007796364693348545316e+00 */
-	.quad	0xBFA4EB014538AA42 /* A02 = -4.085544557559163403315e-02 */
-	.quad	0xBFD029D36FEAF41F /* A03 = -2.525528519580024222613e-01 */
-	.quad	0xBF46F6FFF4E53DC8 /* A00 = -7.008313930700277652464e-04 */
-	.quad	0x3FF027CBB51CBBA0 /* A01 = +1.009715754956893363214e+00 */
-	.quad	0xBFA89DEC9FEC112E /* A02 = -4.807986690687680864098e-02 */
-	.quad	0xBFCF2A99464D0DB4 /* A03 = -2.434875100390009317053e-01 */
-	.quad	0xBF4DCC9C4F66A4D9 /* A00 = -9.094012482836712945103e-04 */
-	.quad	0x3FF030E7CFCCD583 /* A01 = +1.011939822882909068014e+00 */
-	.quad	0xBFACAA3B95814081 /* A02 = -5.598627281199331645611e-02 */
-	.quad	0xBFCDF78F156BE7CF /* A03 = -2.341173987004467604844e-01 */
-	.quad	0xBF5308ED74E5C7A6 /* A00 = -1.161796466103906435435e-03 */
-	.quad	0x3FF03B5986412ECB /* A01 = +1.014489674026594512313e+00 */
-	.quad	0xBFB087EBA88DCC3F /* A02 = -6.457398285947223148806e-02 */
-	.quad	0xBFCCBB9BD134862F /* A03 = -2.244753619680052991736e-01 */
-	.quad	0xBF57FA23C00DF4B5 /* A00 = -1.463446533505758208674e-03 */
-	.quad	0x3FF0473558A1BCC0 /* A01 = +1.017384859292903342975e+00 */
-	.quad	0xBFB2E702BC6360EF /* A02 = -7.383744334527241048871e-02 */
-	.quad	0xBFCB77D546379288 /* A03 = -2.145945160729250122955e-01 */
-	.quad	0xBF5DD12971557F71 /* A00 = -1.819887610814388068450e-03 */
-	.quad	0x3FF0548DDF5000A8 /* A01 = +1.020643112482540360020e+00 */
-	.quad	0xBFB571B63DA186E1 /* A02 = -8.376635555898871710045e-02 */
-	.quad	0xBFCA2D5202605148 /* A03 = -2.045080672838912594358e-01 */
-	.quad	0xBF6252B1AD5D4F17 /* A00 = -2.236697221556737096709e-03 */
-	.quad	0x3FF063738A910BF7 /* A01 = +1.024280110622155737232e+00 */
-	.quad	0xBFB8270C8E6B601B /* A02 = -9.434584118878357184013e-02 */
-	.quad	0xBFC8DD27D950A07E /* A03 = -1.942491351230763441116e-01 */
-	.quad	0xBF66470C91730CFC /* A00 = -2.719425723258004842786e-03 */
-	.quad	0x3FF073F468FCF331 /* A01 = +1.028309259519300633556e+00 */
-	.quad	0xBFBB05C2952191E4 /* A02 = -1.055566419686964629854e-01 */
-	.quad	0xBFC7886A770DE2BD /* A03 = -1.838505822486435070662e-01 */
-	.quad	0xBF6AD114AC8E98EC /* A00 = -3.273525599485007861467e-03 */
-	.quad	0x3FF0861BF53E5226 /* A01 = +1.032741506559554434119e+00 */
-	.quad	0xBFBE0C4F9B461507 /* A02 = -1.173753503881763554650e-01 */
-	.quad	0xBFC6302A037CDE3A /* A03 = -1.733448521642786954722e-01 */
-	.quad	0xBF6FFBDE2A6C2AF8 /* A00 = -3.904279630096648551207e-03 */
-	.quad	0x3FF099F2EB8E7DA3 /* A01 = +1.037585182326304034106e+00 */
-	.quad	0xBFC09C74D192DDF0 /* A02 = -1.297746680554463516444e-01 */
-	.quad	0xBFC4D571D8E3079F /* A03 = -1.627638157861470424859e-01 */
-	.quad	0xBF72E8FDC0B952AA /* A00 = -4.616728994353872309042e-03 */
-	.quad	0x3FF0AF7F273C9533 /* A01 = +1.042845872181101141152e+00 */
-	.quad	0xBFC244C512736F10 /* A02 = -1.427236881344176033792e-01 */
-	.quad	0xBFC379474F58B902 /* A03 = -1.521386277613104298645e-01 */
-	.quad	0xBF762EABAF17395B /* A00 = -5.415602341101023557701e-03 */
-	.quad	0x3FF0C6C3886F63FB /* A01 = +1.048526318502125631582e+00 */
-	.quad	0xBFC3FDF9918EA12A /* A02 = -1.561881981590514389957e-01 */
-	.quad	0xBFC21CA89ECAB895 /* A03 = -1.414995932913753196036e-01 */
-	.quad	0xBF79D387CE5B2BAE /* A00 = -6.305246822828998107258e-03 */
-	.quad	0x3FF0DFBFE2346376 /* A01 = +1.054626353847394337748e+00 */
-	.quad	0xBFC5C6DA43602620 /* A02 = -1.701309994680721970894e-01 */
-	.quad	0xBFC0C08BD8DB6631 /* A03 = -1.308760460731704100557e-01 */
-	.quad	0xBF7DDBA8E8DA9060 /* A00 = -7.289562037531366334164e-03 */
-	.quad	0x3FF0FA70F0D1B464 /* A01 = +1.061142864894713433443e+00 */
-	.quad	0xBFC79E18D92BAA7C /* A02 = -1.845122394946264732241e-01 */
-	.quad	0xBFBECBBBF74C2669 /* A03 = -1.202962378266875381749e-01 */
-	.quad	0xBF81254E76EA25DA /* A00 = -8.371937755572145950511e-03 */
-	.quad	0x3FF116D05835EBD0 /* A01 = +1.068069786618014660462e+00 */
-	.quad	0xBFC982539E2ED224 /* A02 = -1.992897531869327609755e-01 */
-	.quad	0xBFBC1B043C350159 /* A03 = -1.097872397413132278254e-01 */
-	.quad	0xBF8391ACBA863403 /* A00 = -9.555196230190082448686e-03 */
-	.quad	0x3FF134D4AA477FE2 /* A01 = +1.075398125794884141015e+00 */
-	.quad	0xBFCB7218609FEAFB /* A02 = -2.144194099235717521079e-01 */
-	.quad	0xBFB970A16CB88329 /* A03 = -9.937485603633135211599e-02 */
-	.quad	0xBF87935088E48E8B /* A00 = -1.151144902957603431692e-02 */
-	.quad	0x3FF1649892AD7DD3 /* A01 = +1.087059567413110938716e+00 */
-	.quad	0xBFCE6971DDE75409 /* A02 = -2.375929196847723912089e-01 */
-	.quad	0xBFB58291E88CB251 /* A03 = -8.402358939628952472223e-02 */
-	.quad	0xBF8DB3A62C325325 /* A00 = -1.450280973794233242702e-02 */
-	.quad	0x3FF1A9C900C6DEEA /* A01 = +1.103951457056548068891e+00 */
-	.quad	0xBFD13DBC65B0E08E /* A02 = -2.693930619311765140012e-01 */
-	.quad	0xBFB06696F62696D1 /* A03 = -6.406539449252625362252e-02 */
-	.quad	0xBF92583699F2E27A /* A00 = -1.791463198307716858659e-02 */
-	.quad	0x3FF1F451B85AA9F0 /* A01 = +1.122148246892376022288e+00 */
-	.quad	0xBFD34FD5F8288180 /* A02 = -3.017477916164565954205e-01 */
-	.quad	0xBFA6FB692825B683 /* A03 = -4.488686194495718900788e-02 */
-	.quad	0xBF9641C26E673D6F /* A00 = -2.173522757385398448959e-02 */
-	.quad	0x3FF24364DA5E2B07 /* A01 = +1.141453602790251542487e+00 */
-	.quad	0xBFD564A5A5EF5890 /* A02 = -3.342680092295120530821e-01 */
-	.quad	0xBF9B43712011A982 /* A03 = -2.662445791467283467968e-02 */
-	.quad	0xBF9A901038EC2F39 /* A00 = -2.594018313816024226548e-02 */
-	.quad	0x3FF2961356DFFEBA /* A01 = +1.161639537196534011088e+00 */
-	.quad	0xBFD775EBB17198C7 /* A02 = -3.665723069046972759644e-01 */
-	.quad	0xBF833B1A926CD462 /* A03 = -9.390075295963199591975e-03 */
-	.quad	0xBF9F396A6A461B91 /* A00 = -3.049246095317987084727e-02 */
-	.quad	0x3FF2EB53BAEF534B /* A01 = +1.182452898229899629357e+00 */
-	.quad	0xBFD97DABF8AD8BBD /* A02 = -3.982953957076310058660e-01 */
-	.quad	0x3F7B8F6A3E0F8837 /* A03 = +6.728568086119371925713e-03 */
-	.quad	0xBFA21878590F8BAA /* A00 = -3.534294211546946951064e-02 */
-	.quad	0x3FF34209790236E1 /* A01 = +1.203622315111197105253e+00 */
-	.quad	0xBFDB764C0E71BECB /* A02 = -4.290952817018306997277e-01 */
-	.quad	0x3F962FE0C03F84C0 /* A03 = +2.166701482190513949888e-02 */
-	.quad	0xBFA4B36B9AD27ECC /* A00 = -4.043136849327097492868e-02 */
-	.quad	0x3FF3990C5B12FC16 /* A01 = +1.224865298994477935679e+00 */
-	.quad	0xBFDD5AABB0D01390 /* A02 = -4.586590983092770912322e-01 */
-	.quad	0x3FA21DAF5CA162DB /* A03 = +3.538272863142363083844e-02 */
-	.quad	0xBFA7645E4D7BF28B /* A00 = -4.568762489177399105378e-02 */
-	.quad	0x3FF3EF2FD51C0D9F /* A01 = +1.245895225962932562069e+00 */
-	.quad	0xBFDF26377E1B686E /* A02 = -4.867075664057044503963e-01 */
-	.quad	0x3FA8803E756EE812 /* A03 = +4.785342391501513914509e-02 */
-	.quad	0xBFAA210925C64413 /* A00 = -5.103329263796054643398e-02 */
-	.quad	0x3FF44349F897D8E7 /* A01 = +1.266427966181760345066e+00 */
-	.quad	0xBFE06A7B02C6D8E2 /* A02 = -5.129981092675530707226e-01 */
-	.quad	0x3FAE3F194734F5D0 /* A03 = +5.907515520309980505687e-02 */
-	.quad	0xBFACDE48F8A19BBB /* A00 = -5.638340029764018351832e-02 */
-	.quad	0x3FF49439D5466582 /* A01 = +1.286187966447272845727e+00 */
-	.quad	0xBFE131C7C1063DDC /* A02 = -5.373266954429101183166e-01 */
-	.quad	0x3FB1ADEEC36AD805 /* A03 = +6.906025191241844940482e-02 */
-	.quad	0xBFAF905D8F585680 /* A00 = -6.164829611604449866036e-02 */
-	.quad	0x3FF4E0ED1FD27F99 /* A01 = +1.304913639360142818546e+00 */
-	.quad	0xBFE1E7A859DC1D3D /* A02 = -5.595285182070380836095e-01 */
-	.quad	0x3FB3ED018E4642A1 /* A03 = +7.783517573831001679086e-02 */
-	.quad	0xBFB11595104160BA /* A00 = -6.673556944713512906198e-02 */
-	.quad	0x3FF528650340490B /* A01 = +1.322361958217302513319e+00 */
-	.quad	0xBFE28B14B40BC974 /* A02 = -5.794776455425521000109e-01 */
-	.quad	0x3FB5DF49F5BAF6D7 /* A03 = +8.543836831355676453281e-02 */
-	.quad	0xBFB2513A97344BA4 /* A00 = -7.155195418844911836587e-02 */
-	.quad	0x3FF569BA0DB5EE14 /* A01 = +1.338312200124055273420e+00 */
-	.quad	0xBFE31B53A8B67B20 /* A02 = -5.970857901737396389308e-01 */
-	.quad	0x3FB787F297BB0544 /* A03 = +9.191814617499455275507e-02 */
-	.quad	0xBFB37512E848FAFA /* A00 = -7.600515528700305112331e-02 */
-	.quad	0x3FF5A41F33B403C8 /* A01 = +1.352568819013173495591e+00 */
-	.quad	0xBFE397F6EA9A58A5 /* A02 = -6.123003561103997904880e-01 */
-	.quad	0x3FB8EAA9FF25CA06 /* A03 = +9.733068923177520814782e-02 */
-	.quad	0xBFB47B3E603AFC5D /* A00 = -8.000554894805263217439e-02 */
-	.quad	0x3FF5D6E3EDE40487 /* A01 = +1.364963464031718975988e+00 */
-	.quad	0xBFE400D5BCA6D631 /* A02 = -6.251019177058819709103e-01 */
-	.quad	0x3FBA0B830ED567FE /* A03 = +1.017381583418739132707e-01 */
-	.quad	0xBFB5BBFE8AC90496 /* A00 = -8.489981544791400103200e-02 */
-	.quad	0x3FF612BA70107E95 /* A01 = +1.379572332145390989311e+00 */
-	.quad	0xBFE477EAF1FA7693 /* A02 = -6.396383978023599814478e-01 */
-	.quad	0x3FBB4784B7C08A95 /* A03 = +1.065600346196709652391e-01 */
-	.quad	0xBFB6D5D940743939 /* A00 = -8.920057128509463473254e-02 */
-	.quad	0x3FF644A8748F70CE /* A01 = +1.391762214006166953340e+00 */
-	.quad	0xBFE4D646AB07EA37 /* A02 = -6.511567440459832267763e-01 */
-	.quad	0x3FBC354F4E1D5292 /* A03 = +1.101884427747086558913e-01 */
-	.quad	0xBFB7223D19E4F3D1 /* A00 = -9.036619074045339206069e-02 */
-	.quad	0x3FF6518FEB42B7FA /* A01 = +1.394912642466350494175e+00 */
-	.quad	0xBFE4ED86CB87498C /* A02 = -6.539949393430091184598e-01 */
-	.quad	0x3FBC6D29F28CCA9B /* A03 = +1.110407082713131127205e-01 */
-	.quad	0xBFB6878652FF6312 /* A00 = -8.800544287022329936754e-02 */
-	.quad	0x3FF63948C302D040 /* A01 = +1.388985406648330922508e+00 */
-	.quad	0xBFE4C4E2E7904E17 /* A02 = -6.490339777687407218920e-01 */
-	.quad	0x3FBC127356CA1ABE /* A03 = +1.096565329445224612481e-01 */
-	.quad	0xBFB4F5D18B0C91D6 /* A00 = -8.187589306596207427980e-02 */
-	.quad	0x3FF5FD27EB7DD0B8 /* A01 = +1.374305648697413673176e+00 */
-	.quad	0xBFE464E01A2B2FC6 /* A02 = -6.373138915164353601739e-01 */
-	.quad	0x3FBB460547674A30 /* A03 = +1.065371798825160976065e-01 */
-	.quad	0xBFB26642FA16A685 /* A00 = -7.187288861919156890412e-02 */
-	.quad	0x3FF59F9BEDE1C95A /* A01 = +1.351467065073470141812e+00 */
-	.quad	0xBFE3D67920C8FBEA /* A02 = -6.199308052381387046381e-01 */
-	.quad	0x3FBA24F6A8D3CBC1 /* A03 = +1.021265184570401413078e-01 */
-	.quad	0xBFADB5294794F097 /* A00 = -5.802277563859197656582e-02 */
-	.quad	0x3FF523EA7B9CF453 /* A01 = +1.321268542159732772845e+00 */
-	.quad	0xBFE322A8B55E35DB /* A02 = -5.979808370918208160205e-01 */
-	.quad	0x3FB8C8673B1B3E37 /* A03 = +9.680791085269722928697e-02 */
-	.quad	0xBFA4B7D661965C6A /* A00 = -4.046506825687219699450e-02 */
-	.quad	0x3FF48DE3E2CE3122 /* A01 = +1.284641157110919085227e+00 */
-	.quad	0xBFE251FED1A7F445 /* A02 = -5.725092024655472622285e-01 */
-	.quad	0x3FB745699FCABDB9 /* A03 = +9.090290213747821701507e-02 */
-	.quad	0xBF93E60456E4EE1D /* A00 = -1.943213253365004902773e-02 */
-	.quad	0x3FF3E1A14E628A59 /* A01 = +1.242585474196536532432e+00 */
-	.quad	0xBFE16C5AB660E876 /* A02 = -5.444768488007543094653e-01 */
-	.quad	0x3FB5AD33AA8C188F /* A03 = +8.467410005332197397987e-02 */
-	.quad	0x3F738C17C47C7961 /* A00 = +4.772274820224659853951e-03 */
-	.quad	0x3FF3234DDE3BD146 /* A01 = +1.196119182682268355933e+00 */
-	.quad	0xBFE078C0D77A9D3B /* A02 = -5.147403915952176722826e-01 */
-	.quad	0x3FB40D74B3E276B8 /* A03 = +7.833032027925923568290e-02 */
-	.quad	0x3FA0474BECC689C7 /* A00 = +3.179394975019849550746e-02 */
-	.quad	0x3FF256FB4FA7D18A /* A01 = +1.146235762743432307076e+00 */
-	.quad	0xBFDEFA8E3FB285E2 /* A02 = -4.840427038235174395098e-01 */
-	.quad	0x3FB270C007493D59 /* A03 = +7.203293016322244446403e-02 */
-	.quad	0x3FAF5BD51E479BDC /* A00 = +6.124750132203590768931e-02 */
-	.quad	0x3FF18081D0B53BC5 /* A01 = +1.093873801484492647162e+00 */
-	.quad	0xBFDCFE2439BD0C03 /* A02 = -4.530115665294831006626e-01 */
-	.quad	0x3FB0DEFE5A45AFDD /* A03 = +6.590261176978580437424e-02 */
-	.quad	0x3FB7BD5D2806EA26 /* A00 = +9.273321368429118805032e-02 */
-	.quad	0x3FF0A369E35B4440 /* A01 = +1.039895904647224256223e+00 */
-	.quad	0xBFDB04BC5C9951E7 /* A02 = -4.221640495573226181669e-01 */
-	.quad	0x3FAEBBBAA9D6DEEF /* A03 = +6.002600978120919278380e-02 */
-	.quad	0x3FC01BE411098DBC /* A00 = +1.258511622610124502941e-01 */
-	.quad	0x3FEF85BDABC031C1 /* A01 = +9.850757936961188621083e-01 */
-	.quad	0xBFD91521375097C2 /* A02 = -3.919146576102968682065e-01 */
-	.quad	0x3FABE26F0086D982 /* A03 = +5.446192628317005068883e-02 */
-	.quad	0x3FC481D7FF5776B9 /* A00 = +1.602125164781023347604e-01 */
-	.quad	0x3FEDC3506C1E7218 /* A01 = +9.300920592973538347792e-01 */
-	.quad	0xBFD7349A88DA7D4F /* A02 = -3.625856720409119104964e-01 */
-	.quad	0x3FA936E2DFF8E2AE /* A03 = +4.924687370334389358018e-02 */
-	.quad	0x3FC90471F96FA27A /* A00 = +1.954481571149420671141e-01 */
-	.quad	0x3FEC0451601987A2 /* A01 = +8.755270840595026360376e-01 */
-	.quad	0xBFD5671CD4B898DC /* A02 = -3.344184949259110251063e-01 */
-	.quad	0x3FA6BB9594603B67 /* A03 = +4.439990459660841243261e-02 */
-	.quad	0x3FCFD8ADB9ED944C /* A00 = +2.488000066615846384011e-01 */
-	.quad	0x3FE978C073F6809A /* A01 = +7.959902062321078108909e-01 */
-	.quad	0xBFD2DF7E00BCD5A9 /* A02 = -2.948908812716931060471e-01 */
-	.quad	0x3FA3614033D490B2 /* A03 = +3.785133965200894456959e-02 */
-	.quad	0x3FD4846A12AFE5A0 /* A00 = +3.205819303981005674586e-01 */
-	.quad	0x3FE63A1147D40472 /* A01 = +6.945883181471244061100e-01 */
-	.quad	0xBFCFA2268AD34450 /* A02 = -2.471359422548027318101e-01 */
-	.quad	0x3F9F150201D9FFE0 /* A03 = +3.035357605267552383310e-02 */
-	.quad	0x3FD9018641F82BEB /* A00 = +3.907180446846598154131e-01 */
-	.quad	0x3FE33B7C220FFBDC /* A01 = +6.010113396913498995389e-01 */
-	.quad	0xBFCA4E4187E29C86 /* A02 = -2.055131829740483584423e-01 */
-	.quad	0x3F98C30CED19F8F4 /* A03 = +2.418155858185229434287e-02 */
-	.quad	0x3FDD4B8255BEB078 /* A00 = +4.577337109901757905561e-01 */
-	.quad	0x3FE0858B19D3A49B /* A01 = +5.163016800335243905451e-01 */
-	.quad	0xBFC5BC929EACE564 /* A02 = -1.698172831327539045176e-01 */
-	.quad	0x3F93A083CE57DE2B /* A03 = +1.916700312537337677621e-02 */
-	.quad	0x3FE0A8E5E039295C /* A00 = +5.206174258576470315063e-01 */
-	.quad	0x3FDC35E1234583FE /* A01 = +4.407885403107342225937e-01 */
-	.quad	0xBFC1DE034E31AEB9 /* A02 = -1.395877963835710222629e-01 */
-	.quad	0x3F8EFDEBB3471BDC /* A03 = +1.513275280821162888101e-02 */
-	.quad	0x3FE2851B603CB2A5 /* A00 = +5.787484054213406503564e-01 */
-	.quad	0x3FD7F4A44ABBB286 /* A01 = +3.743067483726821853551e-01 */
-	.quad	0xBFBD3EEB67087DE7 /* A02 = -1.142413260026767657385e-01 */
-	.quad	0x3F8864F38329E8BD /* A03 = +1.191129917173260922836e-02 */
-	.quad	0x3FE437DBE3C34AC1 /* A00 = +6.318187187665317283702e-01 */
-	.quad	0x3FD43F6F789441B5 /* A01 = +3.163717916040938438194e-01 */
-	.quad	0xBFB7D92E7901B9A4 /* A02 = -9.315767721429907277653e-02 */
-	.quad	0x3F8327ED342308E1 /* A03 = +9.353497651663324544136e-03 */
-	.quad	0x3FE5C0977766D55C /* A00 = +6.797597248138731451661e-01 */
-	.quad	0x3FD10B42A764D8F9 /* A01 = +2.663122782427219115142e-01 */
-	.quad	0xBFB3633351D3D70F /* A02 = -7.573242900602060456716e-02 */
-	.quad	0x3F7E079E30FF899C /* A03 = +7.331483779099558922843e-03 */
-	.quad	0x3FE7202CE08A88C4 /* A00 = +7.226776490754436288455e-01 */
-	.quad	0x3FCC973EB5662B01 /* A01 = +2.233656297433626314319e-01 */
-	.quad	0xBFAF70A455F9920B /* A02 = -6.140626477716545211782e-02 */
-	.quad	0x3F77812411CE99B6 /* A03 = +5.738392731393584730859e-03 */
-	.quad	0x3FE85879424095B1 /* A00 = +7.608000082006382003286e-01 */
-	.quad	0x3FC7E73BD1674D84 /* A01 = +1.867441914060742336190e-01 */
-	.quad	0xBFA96F84E4BF333B /* A02 = -4.967894832916504993525e-02 */
-	.quad	0x3F72606DDCA6E117 /* A03 = +4.486493251924870105662e-03 */
-	.quad	0x3FE96BFE4957F4DD /* A00 = +7.944327766887472330737e-01 */
-	.quad	0x3FC3ED4780D25478 /* A01 = +1.556786898624158421711e-01 */
-	.quad	0xBFA489C5F9A56B58 /* A02 = -4.011362717093075458408e-02 */
-	.quad	0x3F6CB5DC17E9AD2A /* A03 = +3.504686231556104931972e-03 */
-	.quad	0x3FEA5D9CB2F41234 /* A00 = +8.239272589858672724006e-01 */
-	.quad	0x3FC091A758374DCF /* A01 = +1.294449978582705440555e-01 */
-	.quad	0xBFA08E436D4B5CE0 /* A02 = -3.233538350257858517978e-02 */
-	.quad	0x3F666997AD53E6B7 /* A03 = +2.735897297154145629133e-03 */
-	.quad	0x3FEB3060342CB850 /* A00 = +8.496552485501158713532e-01 */
-	.quad	0x3FBB7D30BBC7DC1B /* A01 = +1.073790033768634993860e-01 */
-	.quad	0xBF9AA6BA3443D9E3 /* A02 = -2.602663940430173170060e-02 */
-	.quad	0x3F617CA764B7850B /* A03 = +2.134634914668814050648e-03 */
-	.quad	0x3FEBE759A6A0C7B8 /* A00 = +8.719909910635044170135e-01 */
-	.quad	0x3FB6C10DE6A703FF /* A01 = +8.888327485239243264115e-02 */
-	.quad	0xBF956C566D8BE1F6 /* A02 = -2.092108768099084498138e-02 */
-	.quad	0x3F5B46D1A4A59CF8 /* A03 = +1.664833764687232917079e-03 */
-	.quad	0x3FEC858494887A04 /* A00 = +8.912985707318630268503e-01 */
-	.quad	0x3FB2CC31F543394D /* A01 = +7.342827070099140762682e-02 */
-	.quad	0xBF9133477FF69137 /* A02 = -1.679717749142747504343e-02 */
-	.quad	0x3F5544482FBB4DA5 /* A03 = +1.298017973501022466823e-03 */
-	.quad	0x3FED0DB59D0E32E9 /* A00 = +9.079235141267335551518e-01 */
-	.quad	0x3FAF006BAFFC6EF4 /* A01 = +6.055008433597022787787e-02 */
-	.quad	0xBF8B97146FA2B97A /* A02 = -1.347175565419144252499e-02 */
-	.quad	0x3F5093B01F4CDC69 /* A03 = +1.011774057770665211434e-03 */
-	.quad	0x3FEDB487C3EC457C /* A00 = +9.282873942012623835751e-01 */
-	.quad	0x3FA7390C09D0BD1D /* A01 = +4.535710925881118044112e-02 */
-	.quad	0xBF83D9F7C3181106 /* A02 = -9.693084374710735778846e-03 */
-	.quad	0x3F46E34A0A3C0E64 /* A03 = +6.984817050299072134500e-04 */
-	.quad	0x3FEE5FFCB4E6EB00 /* A00 = +9.492171796076434020506e-01 */
-	.quad	0x3F9F4913ED00AADF /* A01 = +3.055220731782070861526e-02 */
-	.quad	0xBF79670BD0E59B5C /* A02 = -6.201788097633133961528e-03 */
-	.quad	0x3F3BC998EBCAF96D /* A03 = +4.240034429975534616304e-04 */
-	.quad	0x3FEEDBA41E9542FE /* A00 = +9.643116566968215064293e-01 */
-	.quad	0x3F94F5DD18D9C24D /* A01 = +2.046914543319848858727e-02 */
-	.quad	0xBF7034896AA122B9 /* A02 = -3.956352980886528904192e-03 */
-	.quad	0x3F30DCCB47810B39 /* A03 = +2.573009765038273091199e-04 */
-	.quad	0x3FEF33F2882520ED /* A00 = +9.750912341196716903724e-01 */
-	.quad	0x3F8BF37F2CF553FF /* A01 = +1.364802699996836392315e-02 */
-	.quad	0xBF649F6F05A69619 /* A02 = -2.517430152880317534986e-03 */
-	.quad	0x3F247623C950AAC9 /* A03 = +1.561087307505231250044e-04 */
-	.quad	0x3FEF727757751741 /* A00 = +9.827229221489021115943e-01 */
-	.quad	0x3F828E67912C4400 /* A01 = +9.060677640748693306705e-03 */
-	.quad	0xBF5A2F51A806CC2C /* A02 = -1.598195784123355826789e-03 */
-	.quad	0x3F18D35D7687E613 /* A03 = +9.470231965016282719549e-05 */
-	.quad	0x3FEF9E6325C5942A /* A00 = +9.880843866091073568469e-01 */
-	.quad	0x3F788AB117618F76 /* A01 = +5.991641772286606867914e-03 */
-	.quad	0xBF5096EAB0B1EA89 /* A02 = -1.012543859160305046233e-03 */
-	.quad	0x3F0E1E50EC4435AB /* A03 = +5.744633156910412119652e-05 */
-	.quad	0x3FEFBD0784049369 /* A00 = +9.918248728250605994461e-01 */
-	.quad	0x3F702BBD8294035F /* A01 = +3.947963975634432264028e-03 */
-	.quad	0xBF44FB55E0F00593 /* A02 = -6.403130845457509273330e-04 */
-	.quad	0x3F0244DCD723230A /* A03 = +3.484534217219031730379e-05 */
-	.quad	0x3FEFD245E2366A43 /* A00 = +9.944180887426415926811e-01 */
-	.quad	0x3F653D82EC088433 /* A01 = +2.592807490387838333795e-03 */
-	.quad	0xBF3A7DF75E013CB8 /* A02 = -4.042366908878036561859e-04 */
-	.quad	0x3EF6298E69F991CD /* A03 = +2.113564425911141559972e-05 */
-	.quad	0x3FEFE0EAA508BC69 /* A00 = +9.962056372950317539861e-01 */
-	.quad	0x3F5BD0771AF3FDDA /* A01 = +1.697651208644282514598e-03 */
-	.quad	0xBF30B2E1254DE571 /* A02 = -2.548026725928887099328e-04 */
-	.quad	0x3EEAE28B70EC0256 /* A03 = +1.281973848454955042307e-05 */
-	.quad	0x3FEFEAF5303D7F96 /* A00 = +9.974313680831865536192e-01 */
-	.quad	0x3F5229111365657E /* A01 = +1.108423877289460134782e-03 */
-	.quad	0xBF250572D04DFE66 /* A02 = -1.603796628408704519168e-04 */
-	.quad	0x3EE04E89BB57C981 /* A03 = +7.775682983689149966743e-06 */
-	.quad	0x3FEFF1CF52F1CF44 /* A00 = +9.982678051005469122003e-01 */
-	.quad	0x3F47A71316147CEB /* A01 = +7.218211359577819110842e-04 */
-	.quad	0xBF1A6D7604055719 /* A02 = -1.008132248946049582547e-04 */
-	.quad	0x3ED3C8047586A85C /* A03 = +4.716233739913014633626e-06 */
-	.quad	0x3FEFF6770369EF69 /* A00 = +9.988360468555416149528e-01 */
-	.quad	0x3F3EBB261180FBF0 /* A01 = +4.689186039321105101130e-04 */
-	.quad	0xBF1097754FE19D7F /* A02 = -6.329206004950480057066e-05 */
-	.quad	0x3EC7FEFF83BCA0A7 /* A03 = +2.860556404988488738366e-06 */
-	.quad	0x3FEFF99D42371AC4 /* A00 = +9.992204945818561334647e-01 */
-	.quad	0x3F33EB2AEC271F59 /* A01 = +3.039340773764907474054e-04 */
-	.quad	0xBF04CF18E0FC0D79 /* A02 = -3.968996690952969588805e-05 */
-	.quad	0x3EBD1BDBD6019BE9 /* A03 = +1.735021065507727833886e-06 */
-	.quad	0x3FEFFBBCA32B0D91 /* A00 = +9.994795977476532700123e-01 */
-	.quad	0x3F29C41E1615110A /* A01 = +1.965796209707565346710e-04 */
-	.quad	0xBEFA11F93D9DCB5A /* A02 = -2.486248909101414873235e-05 */
-	.quad	0x3EB1A7CA4546F7A7 /* A03 = +1.052345642723709228769e-06 */
-	.quad	0x3FEFFD298B8E8DE2 /* A00 = +9.996535993308806045121e-01 */
-	.quad	0x3F20A1C42D523C5B /* A01 = +1.268913244172078754520e-04 */
-	.quad	0xBEF0507A364AFAE4 /* A02 = -1.555859070622834605755e-05 */
-	.quad	0x3EA56ACA17E7CDF4 /* A03 = +6.382806956848098872313e-07 */
-	.quad	0x3FEFFE1DC82BA5A3 /* A00 = +9.997700604991915929176e-01 */
-	.quad	0x3F156E73B90F1769 /* A01 = +8.175450626798714452801e-05 */
-	.quad	0xBEE4663579D0A09F /* A02 = -9.727122057226747625365e-06 */
-	.quad	0x3E99FAF6FEC5D4C1 /* A03 = +3.871371052824002996020e-07 */
-	.quad	0x3FEFFEF8D0BB5E81 /* A00 = +9.998745037837154514548e-01 */
-	.quad	0x3F06686DA18D39C3 /* A01 = +4.273972098777251447726e-05 */
-	.quad	0xBED46BC298073E90 /* A02 = -4.868731025855742842491e-06 */
-	.quad	0x3E88E42286B9D0FD /* A03 = +1.854535328530838170114e-07 */
-	.quad	0x3FEFFF8DBC68DDC7 /* A00 = +9.999455146670975791423e-01 */
-	.quad	0x3EF26B2953A80AF0 /* A01 = +1.756534514108903368909e-05 */
-	.quad	0xBEBFC4472D580F83 /* A02 = -1.893443529411295465239e-06 */
-	.quad	0x3E72505B4553D19F /* A03 = +6.822456673547912277047e-08 */
-	.quad	0x3FEFFFCED1276609 /* A00 = +9.999765477215883935358e-01 */
-	.quad	0x3EDE1A94C7CC58F5 /* A01 = +7.177313020153979672606e-06 */
-	.quad	0xBEA8A2C988744E57 /* A02 = -7.342066660497443762363e-07 */
-	.quad	0x3E5AF30036BBBAF4 /* A03 = +2.509841882843541084885e-08 */
-	.quad	0x3FEFFFEAFE70FCFC /* A00 = +9.999899835164849370983e-01 */
-	.quad	0x3EC879175E3549F5 /* A01 = +2.917410471128503564412e-06 */
-	.quad	0xBE930E36677D1813 /* A02 = -2.839493400307523115929e-07 */
-	.quad	0x3E43D4005B42D48F /* A03 = +9.233192745401904898013e-09 */
-	.quad	0x3ff0000000000000
-	.quad	0x0000000000000000
-	.quad	0x0000000000000000
-	.quad	0x0000000000000000
-	.align	32
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSignMask */
-	.align	32
-	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _sAbsMask */
-	.align	32
-	.long	0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000 /* _iExpMantMask */
-	.align	32
-	.long	0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000 /* _iExpMask */
-	.align	32
-	.long	0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000 /* _iMinIdxOfsMask */
-	.align	32
-	.long	0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000 /* _iMaxIdxMask */
-	.align	32
-	.type	__svml_stanh_data_internal, @object
-	.size	__svml_stanh_data_internal, .-__svml_stanh_data_internal
+	/* All results have been written to 32(%rsp).  */
+	vmovups	(%rsp), %ymm0
+	/* Restore rsp.  */
+	movq	%r13, %rsp
+	cfi_def_cfa_register(rsp)
+	/* Restore callee save registers.  */
+	popq	%rbp
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(rbp)
+	popq	%rbx
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(rbp)
+	popq	%r13
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(r13)
+	ret
+END(_ZGVdN8v_tanhf_avx2)
-- 
2.34.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH v3 7/7] x86: Optimize svml_s_tanhf4_core_sse4.S
  2022-06-09 16:58 ` [PATCH v3 " Noah Goldstein
                     ` (4 preceding siblings ...)
  2022-06-09 16:58   ` [PATCH v3 6/7] x86: Optimize svml_s_tanhf8_core_avx2.S Noah Goldstein
@ 2022-06-09 16:58   ` Noah Goldstein
  2022-06-09 17:10     ` H.J. Lu
  2022-06-09 17:04   ` [PATCH v3 1/7] x86: Improve svml_s_atanhf16_core_avx512.S H.J. Lu
  6 siblings, 1 reply; 48+ messages in thread
From: Noah Goldstein @ 2022-06-09 16:58 UTC (permalink / raw)
  To: libc-alpha

Optimizations are:
    1. Reduce code size (-112 bytes).
    2. Remove redundant move instructions.
    3. Slightly improve instruction selection/scheduling where
       possible.
    4. Prefer registers which get short instruction encoding.
    5. Reduce rodata size (-4k+; rodata is shared with avx2).

Result is roughly a 15-16% speedup:

       Function, New Time, Old Time, New / Old
 _ZGVbN4v_tanhf,    3.158,    3.749,     0.842
---
 .../fpu/multiarch/svml_s_tanhf4_core_sse4.S   | 865 +++---------------
 1 file changed, 138 insertions(+), 727 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
index 532ebbac65..bf7687d8ba 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
@@ -70,761 +70,172 @@
  *
  */
 
-/* Offsets for data table __svml_stanh_data_internal
- */
-#define _dbP				0
-#define _sSignMask			4288
-#define _sAbsMask			4304
-#define _iExpMantMask			4320
-#define _iExpMask			4336
-#define _iMinIdxOfsMask			4352
-#define _iMaxIdxMask			4368
 
 #include <sysdep.h>
 
+/* tanhf data tables for avx2 and sse4 implementations defined here.
+ */
+#define ONLY_DECL_OFFSET
+#include "svml_s_tanhf_rodata.S"
+
 	.section .text.sse4, "ax", @progbits
 ENTRY(_ZGVbN4v_tanhf_sse4)
-	subq	$72, %rsp
-	cfi_def_cfa_offset(80)
-	movaps	%xmm0, %xmm5
+	/* Save copy of input in xmm12.  */
+	movaps	%xmm0, %xmm12
 
 	/* Here huge arguments, INF and NaNs are filtered out to callout. */
-	movdqu	_iExpMantMask+__svml_stanh_data_internal(%rip), %xmm9
-	lea	_dbP+16+__svml_stanh_data_internal(%rip), %r8
-	pand	%xmm5, %xmm9
+	movdqu	TANHF_DATA(_iExpMantMask)(%rip), %xmm3
+	pand	%xmm0, %xmm3
 
-	/* if VMIN, VMAX is defined for I type */
+
+	/* Selection of arguments between [0, 0x04280000] into xmm3.  */
 	pxor	%xmm7, %xmm7
-	movdqa	%xmm9, %xmm6
-	psubd	_iMinIdxOfsMask+__svml_stanh_data_internal(%rip), %xmm9
+	/* Save xmm3 for special values check at end.  */
+	movdqa	%xmm3, %xmm8
+	psubd	TANHF_DATA(_iMinIdxOfsMask)(%rip), %xmm3
+	pmaxsd	%xmm7, %xmm3
+	pminsd	TANHF_DATA(_iMaxIdxMask)(%rip), %xmm3
+	psrld	$14, %xmm3
+
+	movq	%xmm3, %rcx
+	movl	%ecx, %edx
+	shrq	$32, %rcx
+
+	pshufd	$0x0e, %xmm3, %xmm3
+	movq	%xmm3, %rdi
+	movl	%edi, %esi
+	shrq	$32, %rdi
+
+	movaps	TANHF_DATA(_sAbsMask)(%rip), %xmm1
+	andps	%xmm1, %xmm0
+
+	leaq	TANHF_DATA(_lookupTable)(%rip), %rax
+	movups	(%rdx, %rax), %xmm2
+	movups	(%rcx, %rax), %xmm6
 
 	/*
 	 *  small table specific variables *
 	 *  Constant loading
 	 */
-	movdqu	_iMaxIdxMask+__svml_stanh_data_internal(%rip), %xmm10
-	movdqa	%xmm9, %xmm11
-	movdqa	%xmm9, %xmm8
-	pcmpgtd	%xmm10, %xmm11
-	pcmpgtd	%xmm7, %xmm8
-	movdqa	%xmm11, %xmm14
-	pand	%xmm8, %xmm9
-	andps	%xmm11, %xmm10
-	andnps	%xmm9, %xmm14
-	orps	%xmm10, %xmm14
-	psrld	$14, %xmm14
-	movd	%xmm14, %edx
-	pshufd	$1, %xmm14, %xmm12
-	pshufd	$2, %xmm14, %xmm13
-	movd	%xmm12, %ecx
-	pshufd	$3, %xmm14, %xmm15
-	movups	_sAbsMask+__svml_stanh_data_internal(%rip), %xmm3
-	movslq	%edx, %rdx
-	andps	%xmm5, %xmm3
-	movslq	%ecx, %rcx
-	pcmpgtd	_iExpMask+__svml_stanh_data_internal(%rip), %xmm6
-	movd	%xmm13, %esi
-	movups	-16(%rdx, %r8), %xmm2
-	movaps	%xmm2, %xmm0
-	movd	%xmm15, %edi
-	movmskps %xmm6, %eax
-	movups	-16(%rcx, %r8), %xmm6
-	unpcklpd %xmm6, %xmm0
+	movaps	%xmm2, %xmm4
+	movlhps	%xmm6, %xmm4
 	unpckhpd %xmm6, %xmm2
-	cvtps2pd %xmm3, %xmm6
-	movhlps	%xmm3, %xmm3
-	cvtps2pd %xmm3, %xmm3
-	movslq	%esi, %rsi
-	movslq	%edi, %rdi
-	movups	(%rcx, %r8), %xmm8
-	movups	(%rdx, %r8), %xmm12
-	movups	(%rsi, %r8), %xmm13
-	movaps	%xmm12, %xmm10
-	movups	(%rdi, %r8), %xmm9
+
+	cvtps2pd %xmm0, %xmm6
+	movhlps	%xmm0, %xmm0
+	cvtps2pd %xmm0, %xmm0
+
+	movups	16(%rdx, %rax), %xmm5
+	movups	16(%rsi, %rax), %xmm13
+
+	movaps	%xmm5, %xmm10
 	movaps	%xmm13, %xmm11
-	unpckhpd %xmm8, %xmm12
-	unpckhpd %xmm9, %xmm13
-	mulpd	%xmm6, %xmm12
-	mulpd	%xmm3, %xmm13
-	unpcklpd %xmm8, %xmm10
-	unpcklpd %xmm9, %xmm11
-	addpd	%xmm10, %xmm12
+
+	movups	16(%rcx, %rax), %xmm7
+	movups	16(%rdi, %rax), %xmm3
+
+	unpckhpd %xmm7, %xmm5
+	unpckhpd %xmm3, %xmm13
+
+	mulpd	%xmm6, %xmm5
+	mulpd	%xmm0, %xmm13
+
+	movlhps	%xmm7, %xmm10
+	movlhps	%xmm3, %xmm11
+
+	addpd	%xmm10, %xmm5
 	addpd	%xmm11, %xmm13
-	mulpd	%xmm6, %xmm12
-	mulpd	%xmm3, %xmm13
-	addpd	%xmm2, %xmm12
-	movups	-16(%rsi, %r8), %xmm1
-	movups	-16(%rdi, %r8), %xmm7
-	movaps	%xmm1, %xmm14
-	unpckhpd %xmm7, %xmm1
-	addpd	%xmm1, %xmm13
-	mulpd	%xmm12, %xmm6
-	mulpd	%xmm13, %xmm3
-	addpd	%xmm0, %xmm6
-	unpcklpd %xmm7, %xmm14
-	addpd	%xmm14, %xmm3
-	cvtpd2ps %xmm6, %xmm0
-	cvtpd2ps %xmm3, %xmm1
-	movups	_sSignMask+__svml_stanh_data_internal(%rip), %xmm4
-	movlhps	%xmm1, %xmm0
-	andps	%xmm5, %xmm4
-	orps	%xmm4, %xmm0
-	testl	%eax, %eax
 
-	/* Go to special inputs processing branch */
-	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm5
+	mulpd	%xmm6, %xmm5
+	mulpd	%xmm0, %xmm13
 
-	/* Restore registers
-	 * and exit the function
-	 */
+	addpd	%xmm2, %xmm5
 
-L(EXIT):
-	addq	$72, %rsp
-	cfi_def_cfa_offset(8)
-	ret
-	cfi_def_cfa_offset(80)
+	movups	(%rsi, %rax), %xmm2
+	movups	(%rdi, %rax), %xmm7
 
-	/* Branch to process
-	 * special inputs
-	 */
+	movaps	%xmm2, %xmm3
 
-L(SPECIAL_VALUES_BRANCH):
-	movups	%xmm5, 32(%rsp)
-	movups	%xmm0, 48(%rsp)
-	# LOE rbx rbp r12 r13 r14 r15 eax
-
-	xorl	%edx, %edx
-	movq	%r12, 16(%rsp)
-	cfi_offset(12, -64)
-	movl	%edx, %r12d
-	movq	%r13, 8(%rsp)
-	cfi_offset(13, -72)
-	movl	%eax, %r13d
-	movq	%r14, (%rsp)
-	cfi_offset(14, -80)
-	# LOE rbx rbp r15 r12d r13d
-
-	/* Range mask
-	 * bits check
-	 */
+	unpckhpd %xmm7, %xmm2
+	movlhps	%xmm7, %xmm3
 
-L(RANGEMASK_CHECK):
-	btl	%r12d, %r13d
+	addpd	%xmm13, %xmm2
 
-	/* Call scalar math function */
-	jc	L(SCALAR_MATH_CALL)
-	# LOE rbx rbp r15 r12d r13d
+	mulpd	%xmm5, %xmm6
+	addpd	%xmm4, %xmm6
 
-	/* Special inputs
-	 * processing loop
-	 */
+	mulpd	%xmm2, %xmm0
+	addpd	%xmm3, %xmm0
 
-L(SPECIAL_VALUES_LOOP):
-	incl	%r12d
-	cmpl	$4, %r12d
-
-	/* Check bits in range mask */
-	jl	L(RANGEMASK_CHECK)
-	# LOE rbx rbp r15 r12d r13d
-
-	movq	16(%rsp), %r12
-	cfi_restore(12)
-	movq	8(%rsp), %r13
-	cfi_restore(13)
-	movq	(%rsp), %r14
-	cfi_restore(14)
-	movups	48(%rsp), %xmm0
-
-	/* Go to exit */
-	jmp	L(EXIT)
-	cfi_offset(12, -64)
-	cfi_offset(13, -72)
-	cfi_offset(14, -80)
-	# LOE rbx rbp r12 r13 r14 r15 xmm0
+	cvtpd2ps %xmm0, %xmm2
+	cvtpd2ps %xmm6, %xmm0
 
-	/* Scalar math fucntion call
-	 * to process special input
-	 */
+	movlhps	%xmm2, %xmm0
+	andnps	%xmm12, %xmm1
+	orps	%xmm1, %xmm0
 
-L(SCALAR_MATH_CALL):
-	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
-	call	tanhf@PLT
-	# LOE rbx rbp r14 r15 r12d r13d xmm0
+	/* xmm8 contains mask of special values.  */
+	pcmpgtd	TANHF_DATA(_iExpMask)(%rip), %xmm8
 
-	movss	%xmm0, 48(%rsp, %r14, 4)
+	movmskps %xmm8, %edx
+	testl	%edx, %edx
 
-	/* Process special inputs in loop */
-	jmp	L(SPECIAL_VALUES_LOOP)
-	# LOE rbx rbp r15 r12d r13d
-END(_ZGVbN4v_tanhf_sse4)
+	/* Go to special inputs processing branch */
+	jne	L(SPECIAL_VALUES_BRANCH)
+	# LOE rbx rbp r12 r13 r14 r15 xmm0
+	/* No stack restoration on the fastpath.  */
+	ret
 
-	.section .rodata, "a"
-	.align	16
-
-#ifdef __svml_stanh_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(16)) VUINT32 _dbP[(134*4)][2];
-	__declspec(align(16)) VUINT32 _sSignMask[4][1];
-	__declspec(align(16)) VUINT32 _sAbsMask[4][1];
-	__declspec(align(16)) VUINT32 _iExpMantMask[4][1];
-	__declspec(align(16)) VUINT32 _iExpMask[4][1];
-	__declspec(align(16)) VUINT32 _iMinIdxOfsMask[4][1];
-	__declspec(align(16)) VUINT32 _iMaxIdxMask[4][1];
-} __svml_stanh_data_internal;
-#endif
-__svml_stanh_data_internal:
-	/* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500]. */
-	.quad	0x0000000000000000 /* A00 = +0.000000000000000000000e-01 */
-	.quad	0x3FF00000022C70EB /* A01 = +1.000000008097283510367e+00 */
-	.quad	0xBED00E878CFFA194 /* A02 = -3.828228912518614443549e-06 */
-	.quad	0xBFD551766D0607A9 /* A03 = -3.330970825846813476723e-01 */
-	.quad	0xBE53D60CE3E4C297 /* A00 = -1.847383956330407336230e-08 */
-	.quad	0x3FF000024177CF5C /* A01 = +1.000002151235967140508e+00 */
-	.quad	0xBF1758BC94A51A25 /* A02 = -8.906031613262943753568e-05 */
-	.quad	0xBFD53EAE67E0D4F0 /* A03 = -3.319507612644221339337e-01 */
-	.quad	0xBE5A9E47EF32D6FE /* A00 = -2.479020984039698285657e-08 */
-	.quad	0x3FF00002DA983057 /* A01 = +1.000002721676556793895e+00 */
-	.quad	0xBF1BD953509E94AA /* A02 = -1.062352277175377670507e-04 */
-	.quad	0xBFD53BDB562EEDD5 /* A03 = -3.317783681520414806876e-01 */
-	.quad	0xBE6191BBE496D294 /* A00 = -3.272532162914017685901e-08 */
-	.quad	0x3FF0000390492017 /* A01 = +1.000003398528866105366e+00 */
-	.quad	0xBF20727E814A57CE /* A02 = -1.254825043772153972919e-04 */
-	.quad	0xBFD538DE060A6F22 /* A03 = -3.315959033004550748913e-01 */
-	.quad	0xBE66DAFA2A893A25 /* A00 = -4.257146219278012568149e-08 */
-	.quad	0x3FF0000465E08CD1 /* A01 = +1.000004194219219266770e+00 */
-	.quad	0xBF2341C765EF91B6 /* A02 = -1.469188600530365522261e-04 */
-	.quad	0xBFD535B6841FAF9E /* A03 = -3.314033785124993469751e-01 */
-	.quad	0xBE6D5794E361E964 /* A00 = -5.465394929765249413434e-08 */
-	.quad	0x3FF000055EE2A0CB /* A01 = +1.000005121846742950353e+00 */
-	.quad	0xBF265E6C77E66C8B /* A02 = -1.706607253709506650304e-04 */
-	.quad	0xBFD53264DDCCEDA6 /* A03 = -3.312008062382240103361e-01 */
-	.quad	0xBE729C844D374A6E /* A00 = -6.933284462462096107184e-08 */
-	.quad	0x3FF000067F019093 /* A01 = +1.000006195180536350264e+00 */
-	.quad	0xBF29CC5348D6DCE5 /* A02 = -1.968242326435338705130e-04 */
-	.quad	0xBFD52EE92121ED35 /* A03 = -3.309881995734998416658e-01 */
-	.quad	0xBE775AEA17EAA872 /* A00 = -8.700465590574974405858e-08 */
-	.quad	0x3FF00007CA1D66B8 /* A01 = +1.000007428656699559610e+00 */
-	.quad	0xBF2D8F5EB98A2637 /* A02 = -2.255252009216044881395e-04 */
-	.quad	0xBFD52B435CDF9128 /* A03 = -3.307655722585587376727e-01 */
-	.quad	0xBE7D04DA28C343F0 /* A00 = -1.081040272327705484794e-07 */
-	.quad	0x3FF000094443CCF5 /* A01 = +1.000008837375216730337e+00 */
-	.quad	0xBF30D5B76C947AE5 /* A02 = -2.568791210978817814332e-04 */
-	.quad	0xBFD52773A0776FAD /* A03 = -3.305329386764651045105e-01 */
-	.quad	0xBE81DD77A12C51C7 /* A00 = -1.331054169875768625701e-07 */
-	.quad	0x3FF0000AF1AFD2DA /* A01 = +1.000010437096696680470e+00 */
-	.quad	0xBF331230624C1680 /* A02 = -2.910011410651516805537e-04 */
-	.quad	0xBFD52379FC0B61DF /* A03 = -3.302903138515186909352e-01 */
-	.quad	0xBE85D04EEEB3C435 /* A00 = -1.625247628488202841012e-07 */
-	.quad	0x3FF0000CD6C9B1F2 /* A01 = +1.000012244238970726684e+00 */
-	.quad	0xBF357F0742FADDD4 /* A02 = -3.280060509313874068243e-04 */
-	.quad	0xBFD51F56806D0E81 /* A03 = -3.300377134475880880338e-01 */
-	.quad	0xBE8A6E289B59681B /* A00 = -1.969211333326924655065e-07 */
-	.quad	0x3FF0000EF8268F72 /* A01 = +1.000014275873550406715e+00 */
-	.quad	0xBF381E277A1B747A /* A02 = -3.680082682942575423093e-04 */
-	.quad	0xBFD51B093F1D6FD4 /* A03 = -3.297751537663746734808e-01 */
-	.quad	0xBE8FCBC40EE9ABD5 /* A00 = -2.368983653301529373887e-07 */
-	.quad	0x3FF000115A883B6C /* A01 = +1.000016549721943981410e+00 */
-	.quad	0xBF3AF17AC974B3D9 /* A02 = -4.111218235774406434303e-04 */
-	.quad	0xBFD516924A4C549C /* A03 = -3.295026517456081105450e-01 */
-	.quad	0xBE92FFBC60A3F956 /* A00 = -2.831066871072026054144e-07 */
-	.quad	0x3FF0001402DCED8A /* A01 = +1.000019084151832604590e+00 */
-	.quad	0xBF3DFAE9390C4801 /* A02 = -4.574603454311488280083e-04 */
-	.quad	0xBFD511F1B4D7DC3A /* A03 = -3.292202249571719585575e-01 */
-	.quad	0xBE9690A22F96D5AD /* A00 = -3.362443262393081632612e-07 */
-	.quad	0x3FF00016F63EFF5D /* A01 = +1.000021898173108825247e+00 */
-	.quad	0xBF409E2C839605BB /* A02 = -5.071370461992499986334e-04 */
-	.quad	0xBFD50D27924BEE00 /* A03 = -3.289278916051614487515e-01 */
-	.quad	0xBE9AA56C65E72A73 /* A00 = -3.970591019557469835586e-07 */
-	.quad	0x3FF0001A39F4A43E /* A01 = +1.000025011433776978009e+00 */
-	.quad	0xBF425BD74C3D6667 /* A02 = -5.602647074553602319844e-04 */
-	.quad	0xBFD50833F6E1ABA2 /* A03 = -3.286256705238718156536e-01 */
-	.quad	0xBE9F4BD4FF1A83B0 /* A00 = -4.663500013744687071912e-07 */
-	.quad	0x3FF0001DD36F9EC2 /* A01 = +1.000028444215715683896e+00 */
-	.quad	0xBF44376634149405 /* A02 = -6.169556656102642569831e-04 */
-	.quad	0xBFD50316F77EDEE5 /* A03 = -3.283135811757190158922e-01 */
-	.quad	0xBEA3B625387BB079 /* A00 = -5.874486399249461304297e-07 */
-	.quad	0x3FF00023E14CFBA9 /* A01 = +1.000034217911642153709e+00 */
-	.quad	0xBF47392F923218D2 /* A02 = -7.087213783883111826306e-04 */
-	.quad	0xBFD4FB1FACDEB938 /* A03 = -3.278273761924483942209e-01 */
-	.quad	0xBEAA6E24F543500A /* A00 = -7.876828740601738750574e-07 */
-	.quad	0x3FF0002D5C6E8412 /* A01 = +1.000043259679163742959e+00 */
-	.quad	0xBF4BAF02BD7FDD70 /* A02 = -8.448375110664940040861e-04 */
-	.quad	0xBFD4EFEE6527A7DE /* A03 = -3.271442401734229177279e-01 */
-	.quad	0xBEB16E3EBE2157D0 /* A00 = -1.038947396133402500647e-06 */
-	.quad	0x3FF00038990FEE2F /* A01 = +1.000053975962952312884e+00 */
-	.quad	0xBF50569481C574CB /* A02 = -9.972048056490652716971e-04 */
-	.quad	0xBFD4E419278DA2B4 /* A03 = -3.264220129263251113372e-01 */
-	.quad	0xBEB6A7B6723165D4 /* A00 = -1.350350836279403750524e-06 */
-	.quad	0x3FF00045CAB4158E /* A01 = +1.000066558657042303793e+00 */
-	.quad	0xBF531D7C9C849108 /* A02 = -1.166698160951775212202e-03 */
-	.quad	0xBFD4D7A0BB33B152 /* A03 = -3.256608799117844954552e-01 */
-	.quad	0xBEBD0EE2A8654AFD /* A00 = -1.732000471561702711532e-06 */
-	.quad	0x3FF00055276F18D6 /* A01 = +1.000081209219890521211e+00 */
-	.quad	0xBF562FDBA3FB6C6C /* A02 = -1.354183666925102939860e-03 */
-	.quad	0xBFD4CA85F1B93DB2 /* A03 = -3.248610363561638125773e-01 */
-	.quad	0xBEC269D4036A207E /* A00 = -2.195047297096822741730e-06 */
-	.quad	0x3FF00066E7DA6E4E /* A01 = +1.000098138500919997540e+00 */
-	.quad	0xBF5991499FC36B3A /* A02 = -1.560518167983372759405e-03 */
-	.quad	0xBFD4BCC9A72283D6 /* A03 = -3.240226871658341556426e-01 */
-	.quad	0xBEC7154B6C09CFE1 /* A00 = -2.751729738565190291276e-06 */
-	.quad	0x3FF0007B47086B80 /* A01 = +1.000117566559055148900e+00 */
-	.quad	0xBF5D455433B4F8F4 /* A02 = -1.786548832412968197680e-03 */
-	.quad	0xBFD4AE6CC1BFE145 /* A03 = -3.231460468373550942722e-01 */
-	.quad	0xBECCA68CC64A0F8A /* A00 = -3.415415948561670285790e-06 */
-	.quad	0x3FF00092827742F7 /* A01 = +1.000139722473418535387e+00 */
-	.quad	0xBF60A7BF15A527AF /* A02 = -2.033112728132522705610e-03 */
-	.quad	0xBFD49F703214084C /* A03 = -3.222313393636155876010e-01 */
-	.quad	0xBED19E68676B241B /* A00 = -4.200644630977303616698e-06 */
-	.quad	0x3FF000ACDA037B26 /* A01 = +1.000164844146362863597e+00 */
-	.quad	0xBF62D99F836A02F8 /* A02 = -2.301036405072284102280e-03 */
-	.quad	0xBFD48FD4F2B91B28 /* A03 = -3.212787981359945810311e-01 */
-	.quad	0xBED57CF4B0C7AA54 /* A00 = -5.123164339408145209103e-06 */
-	.quad	0x3FF000CA8FD9E1A1 /* A01 = +1.000193178099017865534e+00 */
-	.quad	0xBF653A014548E686 /* A02 = -2.591135484433962181405e-03 */
-	.quad	0xBFD47F9C0844B38F /* A03 = -3.202886658426046806447e-01 */
-	.quad	0xBEDA012B1B1A41E2 /* A00 = -6.199971197454598722328e-06 */
-	.quad	0x3FF000EBE868FDF4 /* A01 = +1.000224979259539459520e+00 */
-	.quad	0xBF67CA9427E0A544 /* A02 = -2.904214255086275467410e-03 */
-	.quad	0xBFD46EC6812ADB37 /* A03 = -3.192611943626845749655e-01 */
-	.quad	0xBEDF3EAC5BF12194 /* A00 = -7.449344990702664567927e-06 */
-	.quad	0x3FF001112A520784 /* A01 = +1.000260510744255704196e+00 */
-	.quad	0xBF6A8D01ABDA4DC4 /* A02 = -3.241065277345108255891e-03 */
-	.quad	0xBFD45D55759FFA4A /* A03 = -3.181966446572103146551e-01 */
-	.quad	0xBEE2A541BC274267 /* A00 = -8.890883582164319970972e-06 */
-	.quad	0x3FF0013A9E5961F2 /* A01 = +1.000300043631906721231e+00 */
-	.quad	0xBF6D82ECD080C540 /* A02 = -3.602468994380686462264e-03 */
-	.quad	0xBFD44B4A0779C0AD /* A03 = -3.170952866557950611259e-01 */
-	.quad	0xBEE61D97609A27F4 /* A00 = -1.054553560499505625520e-05 */
-	.quad	0x3FF001688F56A3AF /* A01 = +1.000343856731187974773e+00 */
-	.quad	0xBF7056F8EFB683EC /* A02 = -3.989193351487490407647e-03 */
-	.quad	0xBFD438A5620F0F74 /* A03 = -3.159573991399533543500e-01 */
-	.quad	0xBEEA145429EDD370 /* A00 = -1.243563138839952927732e-05 */
-	.quad	0x3FF0019B4A242A67 /* A01 = +1.000392236341804297339e+00 */
-	.quad	0xBF7207D31CA78D9B /* A02 = -4.401993423445739288258e-03 */
-	.quad	0xBFD42568BA16E7CD /* A03 = -3.147832696228050619602e-01 */
-	.quad	0xBEEE96370D52680F /* A00 = -1.458491207477835326165e-05 */
-	.quad	0x3FF001D31D8E4115 /* A01 = +1.000445476009251821736e+00 */
-	.quad	0xBF73D4CC11EDC094 /* A02 = -4.841611050196221316400e-03 */
-	.quad	0xBFD411954D8664E7 /* A03 = -3.135731942252974469021e-01 */
-	.quad	0xBEF338C046215EF8 /* A00 = -1.833122622260562810219e-05 */
-	.quad	0x3FF00230C32C2EC1 /* A01 = +1.000534784691737621998e+00 */
-	.quad	0xBF76BD019BCC5DAF /* A02 = -5.551344188254799492943e-03 */
-	.quad	0xBFD3F2C7156DC21E /* A03 = -3.116929730668135389848e-01 */
-	.quad	0xBEF9B15EAE411EAE /* A00 = -2.450261207822986676092e-05 */
-	.quad	0x3FF002C2DF057A4D /* A01 = +1.000674124886830940184e+00 */
-	.quad	0xBF7B08CCD9AC1E30 /* A02 = -6.600189396301511801646e-03 */
-	.quad	0xBFD3C7A7A114FED8 /* A03 = -3.090609620157755976777e-01 */
-	.quad	0xBF00E36483C373B3 /* A00 = -3.221178528332122595812e-05 */
-	.quad	0x3FF0036F419480D7 /* A01 = +1.000838524028997644777e+00 */
-	.quad	0xBF7FD255D1777007 /* A02 = -7.768950679260206403087e-03 */
-	.quad	0xBFD39A453911D6CE /* A03 = -3.062909180947429588215e-01 */
-	.quad	0xBF05DFA04DD12059 /* A00 = -4.172046622180685472624e-05 */
-	.quad	0x3FF00438B2A03D8D /* A01 = +1.001030633695197069599e+00 */
-	.quad	0xBF828F8DBB4A9D10 /* A02 = -9.062869337255224921890e-03 */
-	.quad	0xBFD36AAB704697D9 /* A03 = -3.033856007044711255993e-01 */
-	.quad	0xBF0BF3E0C647DEFB /* A00 = -5.331544597092331081714e-05 */
-	.quad	0x3FF005221063D36D /* A01 = +1.001253189109060359741e+00 */
-	.quad	0xBF857A2CB3C96102 /* A02 = -1.048693584122917590862e-02 */
-	.quad	0xBFD338E65BBB4FEC /* A03 = -3.003478904549854444639e-01 */
-	.quad	0xBF11A506ED7C9D31 /* A00 = -6.730894835681591541979e-05 */
-	.quad	0x3FF0062E4D0EA92A /* A01 = +1.001508999829250345925e+00 */
-	.quad	0xBF88AB82C2761AF3 /* A02 = -1.204588085125866091241e-02 */
-	.quad	0xBFD305028D6BD206 /* A03 = -2.971807843271395688234e-01 */
-	.quad	0xBF1607C0922D9BF1 /* A00 = -8.403885708006799337092e-05 */
-	.quad	0x3FF007606C341961 /* A01 = +1.001800940198869449560e+00 */
-	.quad	0xBF8C25E6DA487BCF /* A02 = -1.374416688582682892494e-02 */
-	.quad	0xBFD2CF0D0EE8F7B5 /* A03 = -2.938873906713255768075e-01 */
-	.quad	0xBF1B3A8480A0A16D /* A00 = -1.038688061788578038307e-04 */
-	.quad	0x3FF008BB802D02D6 /* A01 = +1.002131939589323561535e+00 */
-	.quad	0xBF8FEB8AE99FD100 /* A02 = -1.558598065819483124983e-02 */
-	.quad	0xBFD297135BD0911B /* A03 = -2.904709240558688843059e-01 */
-	.quad	0xBF20ABB9BDB75C65 /* A00 = -1.271881327357976163798e-04 */
-	.quad	0x3FF00A42A76D8CD1 /* A01 = +1.002504972472525901495e+00 */
-	.quad	0xBF91FF3D752BB9E6 /* A02 = -1.757522609380570560722e-02 */
-	.quad	0xBFD25D235C1F88B4 /* A03 = -2.869346999779154305799e-01 */
-	.quad	0xBF243D3254425461 /* A00 = -1.544116913733432829448e-04 */
-	.quad	0x3FF00BF909D1795E /* A01 = +1.002923048355647051011e+00 */
-	.quad	0xBF94304E04D44942 /* A02 = -1.971551804042204897316e-02 */
-	.quad	0xBFD2214B5E61CFA6 /* A03 = -2.832821294498394371075e-01 */
-	.quad	0xBF286070011B61CE /* A00 = -1.859795307186510085994e-04 */
-	.quad	0x3FF00DE1D5E1627E /* A01 = +1.003389201612804537689e+00 */
-	.quad	0xBF9689D5F4163F59 /* A02 = -2.201017668045266231780e-02 */
-	.quad	0xBFD1E39A11C3B42C /* A03 = -2.795167134743816728104e-01 */
-	.quad	0xBF2D250B366A79E8 /* A00 = -2.223564326486314902259e-04 */
-	.quad	0x3FF010003E134001 /* A01 = +1.003906481248123094829e+00 */
-	.quad	0xBF990C9FF91F6F81 /* A02 = -2.446222265267250853271e-02 */
-	.quad	0xBFD1A41E80084CDC /* A03 = -2.756420374218586655246e-01 */
-	.quad	0xBF314DB5DDC2A30E /* A00 = -2.640313157465248123865e-04 */
-	.quad	0x3FF012577608921B /* A01 = +1.004477940624503018441e+00 */
-	.quad	0xBF9BB9626875B0C9 /* A02 = -2.707437288829409385849e-02 */
-	.quad	0xBFD162E80768A9D0 /* A03 = -2.716617653228725615122e-01 */
-	.quad	0xBF346A6133808864 /* A00 = -3.115165050094957730625e-04 */
-	.quad	0x3FF014EAAFCC88A3 /* A01 = +1.005106627192198898157e+00 */
-	.quad	0xBF9E90BEF9BF7419 /* A02 = -2.984903716411588595059e-02 */
-	.quad	0xBFD12006545F7FAD /* A03 = -2.675796340899932457269e-01 */
-	.quad	0xBF37F180DC3848EA /* A00 = -3.653468704395550778821e-04 */
-	.quad	0x3FF017BD19147861 /* A01 = +1.005795572250939295955e+00 */
-	.quad	0xBFA0C9A14C702E07 /* A02 = -3.278831537326359207851e-02 */
-	.quad	0xBFD0DB895B650092 /* A03 = -2.633994476818851682154e-01 */
-	.quad	0xBF3BEC6AAC6D7635 /* A00 = -4.260788377246944457107e-04 */
-	.quad	0x3FF01AD1D884E719 /* A01 = +1.006547780778822565040e+00 */
-	.quad	0xBFA260B2A1B1434A /* A02 = -3.589399551186163439542e-02 */
-	.quad	0xBFD09581529E93D6 /* A03 = -2.591250712233067465817e-01 */
-	.quad	0xBF4164E26167882B /* A00 = -5.308251737086202562063e-04 */
-	.quad	0x3FF01FEF14B62B81 /* A01 = +1.007796364693348545316e+00 */
-	.quad	0xBFA4EB014538AA42 /* A02 = -4.085544557559163403315e-02 */
-	.quad	0xBFD029D36FEAF41F /* A03 = -2.525528519580024222613e-01 */
-	.quad	0xBF46F6FFF4E53DC8 /* A00 = -7.008313930700277652464e-04 */
-	.quad	0x3FF027CBB51CBBA0 /* A01 = +1.009715754956893363214e+00 */
-	.quad	0xBFA89DEC9FEC112E /* A02 = -4.807986690687680864098e-02 */
-	.quad	0xBFCF2A99464D0DB4 /* A03 = -2.434875100390009317053e-01 */
-	.quad	0xBF4DCC9C4F66A4D9 /* A00 = -9.094012482836712945103e-04 */
-	.quad	0x3FF030E7CFCCD583 /* A01 = +1.011939822882909068014e+00 */
-	.quad	0xBFACAA3B95814081 /* A02 = -5.598627281199331645611e-02 */
-	.quad	0xBFCDF78F156BE7CF /* A03 = -2.341173987004467604844e-01 */
-	.quad	0xBF5308ED74E5C7A6 /* A00 = -1.161796466103906435435e-03 */
-	.quad	0x3FF03B5986412ECB /* A01 = +1.014489674026594512313e+00 */
-	.quad	0xBFB087EBA88DCC3F /* A02 = -6.457398285947223148806e-02 */
-	.quad	0xBFCCBB9BD134862F /* A03 = -2.244753619680052991736e-01 */
-	.quad	0xBF57FA23C00DF4B5 /* A00 = -1.463446533505758208674e-03 */
-	.quad	0x3FF0473558A1BCC0 /* A01 = +1.017384859292903342975e+00 */
-	.quad	0xBFB2E702BC6360EF /* A02 = -7.383744334527241048871e-02 */
-	.quad	0xBFCB77D546379288 /* A03 = -2.145945160729250122955e-01 */
-	.quad	0xBF5DD12971557F71 /* A00 = -1.819887610814388068450e-03 */
-	.quad	0x3FF0548DDF5000A8 /* A01 = +1.020643112482540360020e+00 */
-	.quad	0xBFB571B63DA186E1 /* A02 = -8.376635555898871710045e-02 */
-	.quad	0xBFCA2D5202605148 /* A03 = -2.045080672838912594358e-01 */
-	.quad	0xBF6252B1AD5D4F17 /* A00 = -2.236697221556737096709e-03 */
-	.quad	0x3FF063738A910BF7 /* A01 = +1.024280110622155737232e+00 */
-	.quad	0xBFB8270C8E6B601B /* A02 = -9.434584118878357184013e-02 */
-	.quad	0xBFC8DD27D950A07E /* A03 = -1.942491351230763441116e-01 */
-	.quad	0xBF66470C91730CFC /* A00 = -2.719425723258004842786e-03 */
-	.quad	0x3FF073F468FCF331 /* A01 = +1.028309259519300633556e+00 */
-	.quad	0xBFBB05C2952191E4 /* A02 = -1.055566419686964629854e-01 */
-	.quad	0xBFC7886A770DE2BD /* A03 = -1.838505822486435070662e-01 */
-	.quad	0xBF6AD114AC8E98EC /* A00 = -3.273525599485007861467e-03 */
-	.quad	0x3FF0861BF53E5226 /* A01 = +1.032741506559554434119e+00 */
-	.quad	0xBFBE0C4F9B461507 /* A02 = -1.173753503881763554650e-01 */
-	.quad	0xBFC6302A037CDE3A /* A03 = -1.733448521642786954722e-01 */
-	.quad	0xBF6FFBDE2A6C2AF8 /* A00 = -3.904279630096648551207e-03 */
-	.quad	0x3FF099F2EB8E7DA3 /* A01 = +1.037585182326304034106e+00 */
-	.quad	0xBFC09C74D192DDF0 /* A02 = -1.297746680554463516444e-01 */
-	.quad	0xBFC4D571D8E3079F /* A03 = -1.627638157861470424859e-01 */
-	.quad	0xBF72E8FDC0B952AA /* A00 = -4.616728994353872309042e-03 */
-	.quad	0x3FF0AF7F273C9533 /* A01 = +1.042845872181101141152e+00 */
-	.quad	0xBFC244C512736F10 /* A02 = -1.427236881344176033792e-01 */
-	.quad	0xBFC379474F58B902 /* A03 = -1.521386277613104298645e-01 */
-	.quad	0xBF762EABAF17395B /* A00 = -5.415602341101023557701e-03 */
-	.quad	0x3FF0C6C3886F63FB /* A01 = +1.048526318502125631582e+00 */
-	.quad	0xBFC3FDF9918EA12A /* A02 = -1.561881981590514389957e-01 */
-	.quad	0xBFC21CA89ECAB895 /* A03 = -1.414995932913753196036e-01 */
-	.quad	0xBF79D387CE5B2BAE /* A00 = -6.305246822828998107258e-03 */
-	.quad	0x3FF0DFBFE2346376 /* A01 = +1.054626353847394337748e+00 */
-	.quad	0xBFC5C6DA43602620 /* A02 = -1.701309994680721970894e-01 */
-	.quad	0xBFC0C08BD8DB6631 /* A03 = -1.308760460731704100557e-01 */
-	.quad	0xBF7DDBA8E8DA9060 /* A00 = -7.289562037531366334164e-03 */
-	.quad	0x3FF0FA70F0D1B464 /* A01 = +1.061142864894713433443e+00 */
-	.quad	0xBFC79E18D92BAA7C /* A02 = -1.845122394946264732241e-01 */
-	.quad	0xBFBECBBBF74C2669 /* A03 = -1.202962378266875381749e-01 */
-	.quad	0xBF81254E76EA25DA /* A00 = -8.371937755572145950511e-03 */
-	.quad	0x3FF116D05835EBD0 /* A01 = +1.068069786618014660462e+00 */
-	.quad	0xBFC982539E2ED224 /* A02 = -1.992897531869327609755e-01 */
-	.quad	0xBFBC1B043C350159 /* A03 = -1.097872397413132278254e-01 */
-	.quad	0xBF8391ACBA863403 /* A00 = -9.555196230190082448686e-03 */
-	.quad	0x3FF134D4AA477FE2 /* A01 = +1.075398125794884141015e+00 */
-	.quad	0xBFCB7218609FEAFB /* A02 = -2.144194099235717521079e-01 */
-	.quad	0xBFB970A16CB88329 /* A03 = -9.937485603633135211599e-02 */
-	.quad	0xBF87935088E48E8B /* A00 = -1.151144902957603431692e-02 */
-	.quad	0x3FF1649892AD7DD3 /* A01 = +1.087059567413110938716e+00 */
-	.quad	0xBFCE6971DDE75409 /* A02 = -2.375929196847723912089e-01 */
-	.quad	0xBFB58291E88CB251 /* A03 = -8.402358939628952472223e-02 */
-	.quad	0xBF8DB3A62C325325 /* A00 = -1.450280973794233242702e-02 */
-	.quad	0x3FF1A9C900C6DEEA /* A01 = +1.103951457056548068891e+00 */
-	.quad	0xBFD13DBC65B0E08E /* A02 = -2.693930619311765140012e-01 */
-	.quad	0xBFB06696F62696D1 /* A03 = -6.406539449252625362252e-02 */
-	.quad	0xBF92583699F2E27A /* A00 = -1.791463198307716858659e-02 */
-	.quad	0x3FF1F451B85AA9F0 /* A01 = +1.122148246892376022288e+00 */
-	.quad	0xBFD34FD5F8288180 /* A02 = -3.017477916164565954205e-01 */
-	.quad	0xBFA6FB692825B683 /* A03 = -4.488686194495718900788e-02 */
-	.quad	0xBF9641C26E673D6F /* A00 = -2.173522757385398448959e-02 */
-	.quad	0x3FF24364DA5E2B07 /* A01 = +1.141453602790251542487e+00 */
-	.quad	0xBFD564A5A5EF5890 /* A02 = -3.342680092295120530821e-01 */
-	.quad	0xBF9B43712011A982 /* A03 = -2.662445791467283467968e-02 */
-	.quad	0xBF9A901038EC2F39 /* A00 = -2.594018313816024226548e-02 */
-	.quad	0x3FF2961356DFFEBA /* A01 = +1.161639537196534011088e+00 */
-	.quad	0xBFD775EBB17198C7 /* A02 = -3.665723069046972759644e-01 */
-	.quad	0xBF833B1A926CD462 /* A03 = -9.390075295963199591975e-03 */
-	.quad	0xBF9F396A6A461B91 /* A00 = -3.049246095317987084727e-02 */
-	.quad	0x3FF2EB53BAEF534B /* A01 = +1.182452898229899629357e+00 */
-	.quad	0xBFD97DABF8AD8BBD /* A02 = -3.982953957076310058660e-01 */
-	.quad	0x3F7B8F6A3E0F8837 /* A03 = +6.728568086119371925713e-03 */
-	.quad	0xBFA21878590F8BAA /* A00 = -3.534294211546946951064e-02 */
-	.quad	0x3FF34209790236E1 /* A01 = +1.203622315111197105253e+00 */
-	.quad	0xBFDB764C0E71BECB /* A02 = -4.290952817018306997277e-01 */
-	.quad	0x3F962FE0C03F84C0 /* A03 = +2.166701482190513949888e-02 */
-	.quad	0xBFA4B36B9AD27ECC /* A00 = -4.043136849327097492868e-02 */
-	.quad	0x3FF3990C5B12FC16 /* A01 = +1.224865298994477935679e+00 */
-	.quad	0xBFDD5AABB0D01390 /* A02 = -4.586590983092770912322e-01 */
-	.quad	0x3FA21DAF5CA162DB /* A03 = +3.538272863142363083844e-02 */
-	.quad	0xBFA7645E4D7BF28B /* A00 = -4.568762489177399105378e-02 */
-	.quad	0x3FF3EF2FD51C0D9F /* A01 = +1.245895225962932562069e+00 */
-	.quad	0xBFDF26377E1B686E /* A02 = -4.867075664057044503963e-01 */
-	.quad	0x3FA8803E756EE812 /* A03 = +4.785342391501513914509e-02 */
-	.quad	0xBFAA210925C64413 /* A00 = -5.103329263796054643398e-02 */
-	.quad	0x3FF44349F897D8E7 /* A01 = +1.266427966181760345066e+00 */
-	.quad	0xBFE06A7B02C6D8E2 /* A02 = -5.129981092675530707226e-01 */
-	.quad	0x3FAE3F194734F5D0 /* A03 = +5.907515520309980505687e-02 */
-	.quad	0xBFACDE48F8A19BBB /* A00 = -5.638340029764018351832e-02 */
-	.quad	0x3FF49439D5466582 /* A01 = +1.286187966447272845727e+00 */
-	.quad	0xBFE131C7C1063DDC /* A02 = -5.373266954429101183166e-01 */
-	.quad	0x3FB1ADEEC36AD805 /* A03 = +6.906025191241844940482e-02 */
-	.quad	0xBFAF905D8F585680 /* A00 = -6.164829611604449866036e-02 */
-	.quad	0x3FF4E0ED1FD27F99 /* A01 = +1.304913639360142818546e+00 */
-	.quad	0xBFE1E7A859DC1D3D /* A02 = -5.595285182070380836095e-01 */
-	.quad	0x3FB3ED018E4642A1 /* A03 = +7.783517573831001679086e-02 */
-	.quad	0xBFB11595104160BA /* A00 = -6.673556944713512906198e-02 */
-	.quad	0x3FF528650340490B /* A01 = +1.322361958217302513319e+00 */
-	.quad	0xBFE28B14B40BC974 /* A02 = -5.794776455425521000109e-01 */
-	.quad	0x3FB5DF49F5BAF6D7 /* A03 = +8.543836831355676453281e-02 */
-	.quad	0xBFB2513A97344BA4 /* A00 = -7.155195418844911836587e-02 */
-	.quad	0x3FF569BA0DB5EE14 /* A01 = +1.338312200124055273420e+00 */
-	.quad	0xBFE31B53A8B67B20 /* A02 = -5.970857901737396389308e-01 */
-	.quad	0x3FB787F297BB0544 /* A03 = +9.191814617499455275507e-02 */
-	.quad	0xBFB37512E848FAFA /* A00 = -7.600515528700305112331e-02 */
-	.quad	0x3FF5A41F33B403C8 /* A01 = +1.352568819013173495591e+00 */
-	.quad	0xBFE397F6EA9A58A5 /* A02 = -6.123003561103997904880e-01 */
-	.quad	0x3FB8EAA9FF25CA06 /* A03 = +9.733068923177520814782e-02 */
-	.quad	0xBFB47B3E603AFC5D /* A00 = -8.000554894805263217439e-02 */
-	.quad	0x3FF5D6E3EDE40487 /* A01 = +1.364963464031718975988e+00 */
-	.quad	0xBFE400D5BCA6D631 /* A02 = -6.251019177058819709103e-01 */
-	.quad	0x3FBA0B830ED567FE /* A03 = +1.017381583418739132707e-01 */
-	.quad	0xBFB5BBFE8AC90496 /* A00 = -8.489981544791400103200e-02 */
-	.quad	0x3FF612BA70107E95 /* A01 = +1.379572332145390989311e+00 */
-	.quad	0xBFE477EAF1FA7693 /* A02 = -6.396383978023599814478e-01 */
-	.quad	0x3FBB4784B7C08A95 /* A03 = +1.065600346196709652391e-01 */
-	.quad	0xBFB6D5D940743939 /* A00 = -8.920057128509463473254e-02 */
-	.quad	0x3FF644A8748F70CE /* A01 = +1.391762214006166953340e+00 */
-	.quad	0xBFE4D646AB07EA37 /* A02 = -6.511567440459832267763e-01 */
-	.quad	0x3FBC354F4E1D5292 /* A03 = +1.101884427747086558913e-01 */
-	.quad	0xBFB7223D19E4F3D1 /* A00 = -9.036619074045339206069e-02 */
-	.quad	0x3FF6518FEB42B7FA /* A01 = +1.394912642466350494175e+00 */
-	.quad	0xBFE4ED86CB87498C /* A02 = -6.539949393430091184598e-01 */
-	.quad	0x3FBC6D29F28CCA9B /* A03 = +1.110407082713131127205e-01 */
-	.quad	0xBFB6878652FF6312 /* A00 = -8.800544287022329936754e-02 */
-	.quad	0x3FF63948C302D040 /* A01 = +1.388985406648330922508e+00 */
-	.quad	0xBFE4C4E2E7904E17 /* A02 = -6.490339777687407218920e-01 */
-	.quad	0x3FBC127356CA1ABE /* A03 = +1.096565329445224612481e-01 */
-	.quad	0xBFB4F5D18B0C91D6 /* A00 = -8.187589306596207427980e-02 */
-	.quad	0x3FF5FD27EB7DD0B8 /* A01 = +1.374305648697413673176e+00 */
-	.quad	0xBFE464E01A2B2FC6 /* A02 = -6.373138915164353601739e-01 */
-	.quad	0x3FBB460547674A30 /* A03 = +1.065371798825160976065e-01 */
-	.quad	0xBFB26642FA16A685 /* A00 = -7.187288861919156890412e-02 */
-	.quad	0x3FF59F9BEDE1C95A /* A01 = +1.351467065073470141812e+00 */
-	.quad	0xBFE3D67920C8FBEA /* A02 = -6.199308052381387046381e-01 */
-	.quad	0x3FBA24F6A8D3CBC1 /* A03 = +1.021265184570401413078e-01 */
-	.quad	0xBFADB5294794F097 /* A00 = -5.802277563859197656582e-02 */
-	.quad	0x3FF523EA7B9CF453 /* A01 = +1.321268542159732772845e+00 */
-	.quad	0xBFE322A8B55E35DB /* A02 = -5.979808370918208160205e-01 */
-	.quad	0x3FB8C8673B1B3E37 /* A03 = +9.680791085269722928697e-02 */
-	.quad	0xBFA4B7D661965C6A /* A00 = -4.046506825687219699450e-02 */
-	.quad	0x3FF48DE3E2CE3122 /* A01 = +1.284641157110919085227e+00 */
-	.quad	0xBFE251FED1A7F445 /* A02 = -5.725092024655472622285e-01 */
-	.quad	0x3FB745699FCABDB9 /* A03 = +9.090290213747821701507e-02 */
-	.quad	0xBF93E60456E4EE1D /* A00 = -1.943213253365004902773e-02 */
-	.quad	0x3FF3E1A14E628A59 /* A01 = +1.242585474196536532432e+00 */
-	.quad	0xBFE16C5AB660E876 /* A02 = -5.444768488007543094653e-01 */
-	.quad	0x3FB5AD33AA8C188F /* A03 = +8.467410005332197397987e-02 */
-	.quad	0x3F738C17C47C7961 /* A00 = +4.772274820224659853951e-03 */
-	.quad	0x3FF3234DDE3BD146 /* A01 = +1.196119182682268355933e+00 */
-	.quad	0xBFE078C0D77A9D3B /* A02 = -5.147403915952176722826e-01 */
-	.quad	0x3FB40D74B3E276B8 /* A03 = +7.833032027925923568290e-02 */
-	.quad	0x3FA0474BECC689C7 /* A00 = +3.179394975019849550746e-02 */
-	.quad	0x3FF256FB4FA7D18A /* A01 = +1.146235762743432307076e+00 */
-	.quad	0xBFDEFA8E3FB285E2 /* A02 = -4.840427038235174395098e-01 */
-	.quad	0x3FB270C007493D59 /* A03 = +7.203293016322244446403e-02 */
-	.quad	0x3FAF5BD51E479BDC /* A00 = +6.124750132203590768931e-02 */
-	.quad	0x3FF18081D0B53BC5 /* A01 = +1.093873801484492647162e+00 */
-	.quad	0xBFDCFE2439BD0C03 /* A02 = -4.530115665294831006626e-01 */
-	.quad	0x3FB0DEFE5A45AFDD /* A03 = +6.590261176978580437424e-02 */
-	.quad	0x3FB7BD5D2806EA26 /* A00 = +9.273321368429118805032e-02 */
-	.quad	0x3FF0A369E35B4440 /* A01 = +1.039895904647224256223e+00 */
-	.quad	0xBFDB04BC5C9951E7 /* A02 = -4.221640495573226181669e-01 */
-	.quad	0x3FAEBBBAA9D6DEEF /* A03 = +6.002600978120919278380e-02 */
-	.quad	0x3FC01BE411098DBC /* A00 = +1.258511622610124502941e-01 */
-	.quad	0x3FEF85BDABC031C1 /* A01 = +9.850757936961188621083e-01 */
-	.quad	0xBFD91521375097C2 /* A02 = -3.919146576102968682065e-01 */
-	.quad	0x3FABE26F0086D982 /* A03 = +5.446192628317005068883e-02 */
-	.quad	0x3FC481D7FF5776B9 /* A00 = +1.602125164781023347604e-01 */
-	.quad	0x3FEDC3506C1E7218 /* A01 = +9.300920592973538347792e-01 */
-	.quad	0xBFD7349A88DA7D4F /* A02 = -3.625856720409119104964e-01 */
-	.quad	0x3FA936E2DFF8E2AE /* A03 = +4.924687370334389358018e-02 */
-	.quad	0x3FC90471F96FA27A /* A00 = +1.954481571149420671141e-01 */
-	.quad	0x3FEC0451601987A2 /* A01 = +8.755270840595026360376e-01 */
-	.quad	0xBFD5671CD4B898DC /* A02 = -3.344184949259110251063e-01 */
-	.quad	0x3FA6BB9594603B67 /* A03 = +4.439990459660841243261e-02 */
-	.quad	0x3FCFD8ADB9ED944C /* A00 = +2.488000066615846384011e-01 */
-	.quad	0x3FE978C073F6809A /* A01 = +7.959902062321078108909e-01 */
-	.quad	0xBFD2DF7E00BCD5A9 /* A02 = -2.948908812716931060471e-01 */
-	.quad	0x3FA3614033D490B2 /* A03 = +3.785133965200894456959e-02 */
-	.quad	0x3FD4846A12AFE5A0 /* A00 = +3.205819303981005674586e-01 */
-	.quad	0x3FE63A1147D40472 /* A01 = +6.945883181471244061100e-01 */
-	.quad	0xBFCFA2268AD34450 /* A02 = -2.471359422548027318101e-01 */
-	.quad	0x3F9F150201D9FFE0 /* A03 = +3.035357605267552383310e-02 */
-	.quad	0x3FD9018641F82BEB /* A00 = +3.907180446846598154131e-01 */
-	.quad	0x3FE33B7C220FFBDC /* A01 = +6.010113396913498995389e-01 */
-	.quad	0xBFCA4E4187E29C86 /* A02 = -2.055131829740483584423e-01 */
-	.quad	0x3F98C30CED19F8F4 /* A03 = +2.418155858185229434287e-02 */
-	.quad	0x3FDD4B8255BEB078 /* A00 = +4.577337109901757905561e-01 */
-	.quad	0x3FE0858B19D3A49B /* A01 = +5.163016800335243905451e-01 */
-	.quad	0xBFC5BC929EACE564 /* A02 = -1.698172831327539045176e-01 */
-	.quad	0x3F93A083CE57DE2B /* A03 = +1.916700312537337677621e-02 */
-	.quad	0x3FE0A8E5E039295C /* A00 = +5.206174258576470315063e-01 */
-	.quad	0x3FDC35E1234583FE /* A01 = +4.407885403107342225937e-01 */
-	.quad	0xBFC1DE034E31AEB9 /* A02 = -1.395877963835710222629e-01 */
-	.quad	0x3F8EFDEBB3471BDC /* A03 = +1.513275280821162888101e-02 */
-	.quad	0x3FE2851B603CB2A5 /* A00 = +5.787484054213406503564e-01 */
-	.quad	0x3FD7F4A44ABBB286 /* A01 = +3.743067483726821853551e-01 */
-	.quad	0xBFBD3EEB67087DE7 /* A02 = -1.142413260026767657385e-01 */
-	.quad	0x3F8864F38329E8BD /* A03 = +1.191129917173260922836e-02 */
-	.quad	0x3FE437DBE3C34AC1 /* A00 = +6.318187187665317283702e-01 */
-	.quad	0x3FD43F6F789441B5 /* A01 = +3.163717916040938438194e-01 */
-	.quad	0xBFB7D92E7901B9A4 /* A02 = -9.315767721429907277653e-02 */
-	.quad	0x3F8327ED342308E1 /* A03 = +9.353497651663324544136e-03 */
-	.quad	0x3FE5C0977766D55C /* A00 = +6.797597248138731451661e-01 */
-	.quad	0x3FD10B42A764D8F9 /* A01 = +2.663122782427219115142e-01 */
-	.quad	0xBFB3633351D3D70F /* A02 = -7.573242900602060456716e-02 */
-	.quad	0x3F7E079E30FF899C /* A03 = +7.331483779099558922843e-03 */
-	.quad	0x3FE7202CE08A88C4 /* A00 = +7.226776490754436288455e-01 */
-	.quad	0x3FCC973EB5662B01 /* A01 = +2.233656297433626314319e-01 */
-	.quad	0xBFAF70A455F9920B /* A02 = -6.140626477716545211782e-02 */
-	.quad	0x3F77812411CE99B6 /* A03 = +5.738392731393584730859e-03 */
-	.quad	0x3FE85879424095B1 /* A00 = +7.608000082006382003286e-01 */
-	.quad	0x3FC7E73BD1674D84 /* A01 = +1.867441914060742336190e-01 */
-	.quad	0xBFA96F84E4BF333B /* A02 = -4.967894832916504993525e-02 */
-	.quad	0x3F72606DDCA6E117 /* A03 = +4.486493251924870105662e-03 */
-	.quad	0x3FE96BFE4957F4DD /* A00 = +7.944327766887472330737e-01 */
-	.quad	0x3FC3ED4780D25478 /* A01 = +1.556786898624158421711e-01 */
-	.quad	0xBFA489C5F9A56B58 /* A02 = -4.011362717093075458408e-02 */
-	.quad	0x3F6CB5DC17E9AD2A /* A03 = +3.504686231556104931972e-03 */
-	.quad	0x3FEA5D9CB2F41234 /* A00 = +8.239272589858672724006e-01 */
-	.quad	0x3FC091A758374DCF /* A01 = +1.294449978582705440555e-01 */
-	.quad	0xBFA08E436D4B5CE0 /* A02 = -3.233538350257858517978e-02 */
-	.quad	0x3F666997AD53E6B7 /* A03 = +2.735897297154145629133e-03 */
-	.quad	0x3FEB3060342CB850 /* A00 = +8.496552485501158713532e-01 */
-	.quad	0x3FBB7D30BBC7DC1B /* A01 = +1.073790033768634993860e-01 */
-	.quad	0xBF9AA6BA3443D9E3 /* A02 = -2.602663940430173170060e-02 */
-	.quad	0x3F617CA764B7850B /* A03 = +2.134634914668814050648e-03 */
-	.quad	0x3FEBE759A6A0C7B8 /* A00 = +8.719909910635044170135e-01 */
-	.quad	0x3FB6C10DE6A703FF /* A01 = +8.888327485239243264115e-02 */
-	.quad	0xBF956C566D8BE1F6 /* A02 = -2.092108768099084498138e-02 */
-	.quad	0x3F5B46D1A4A59CF8 /* A03 = +1.664833764687232917079e-03 */
-	.quad	0x3FEC858494887A04 /* A00 = +8.912985707318630268503e-01 */
-	.quad	0x3FB2CC31F543394D /* A01 = +7.342827070099140762682e-02 */
-	.quad	0xBF9133477FF69137 /* A02 = -1.679717749142747504343e-02 */
-	.quad	0x3F5544482FBB4DA5 /* A03 = +1.298017973501022466823e-03 */
-	.quad	0x3FED0DB59D0E32E9 /* A00 = +9.079235141267335551518e-01 */
-	.quad	0x3FAF006BAFFC6EF4 /* A01 = +6.055008433597022787787e-02 */
-	.quad	0xBF8B97146FA2B97A /* A02 = -1.347175565419144252499e-02 */
-	.quad	0x3F5093B01F4CDC69 /* A03 = +1.011774057770665211434e-03 */
-	.quad	0x3FEDB487C3EC457C /* A00 = +9.282873942012623835751e-01 */
-	.quad	0x3FA7390C09D0BD1D /* A01 = +4.535710925881118044112e-02 */
-	.quad	0xBF83D9F7C3181106 /* A02 = -9.693084374710735778846e-03 */
-	.quad	0x3F46E34A0A3C0E64 /* A03 = +6.984817050299072134500e-04 */
-	.quad	0x3FEE5FFCB4E6EB00 /* A00 = +9.492171796076434020506e-01 */
-	.quad	0x3F9F4913ED00AADF /* A01 = +3.055220731782070861526e-02 */
-	.quad	0xBF79670BD0E59B5C /* A02 = -6.201788097633133961528e-03 */
-	.quad	0x3F3BC998EBCAF96D /* A03 = +4.240034429975534616304e-04 */
-	.quad	0x3FEEDBA41E9542FE /* A00 = +9.643116566968215064293e-01 */
-	.quad	0x3F94F5DD18D9C24D /* A01 = +2.046914543319848858727e-02 */
-	.quad	0xBF7034896AA122B9 /* A02 = -3.956352980886528904192e-03 */
-	.quad	0x3F30DCCB47810B39 /* A03 = +2.573009765038273091199e-04 */
-	.quad	0x3FEF33F2882520ED /* A00 = +9.750912341196716903724e-01 */
-	.quad	0x3F8BF37F2CF553FF /* A01 = +1.364802699996836392315e-02 */
-	.quad	0xBF649F6F05A69619 /* A02 = -2.517430152880317534986e-03 */
-	.quad	0x3F247623C950AAC9 /* A03 = +1.561087307505231250044e-04 */
-	.quad	0x3FEF727757751741 /* A00 = +9.827229221489021115943e-01 */
-	.quad	0x3F828E67912C4400 /* A01 = +9.060677640748693306705e-03 */
-	.quad	0xBF5A2F51A806CC2C /* A02 = -1.598195784123355826789e-03 */
-	.quad	0x3F18D35D7687E613 /* A03 = +9.470231965016282719549e-05 */
-	.quad	0x3FEF9E6325C5942A /* A00 = +9.880843866091073568469e-01 */
-	.quad	0x3F788AB117618F76 /* A01 = +5.991641772286606867914e-03 */
-	.quad	0xBF5096EAB0B1EA89 /* A02 = -1.012543859160305046233e-03 */
-	.quad	0x3F0E1E50EC4435AB /* A03 = +5.744633156910412119652e-05 */
-	.quad	0x3FEFBD0784049369 /* A00 = +9.918248728250605994461e-01 */
-	.quad	0x3F702BBD8294035F /* A01 = +3.947963975634432264028e-03 */
-	.quad	0xBF44FB55E0F00593 /* A02 = -6.403130845457509273330e-04 */
-	.quad	0x3F0244DCD723230A /* A03 = +3.484534217219031730379e-05 */
-	.quad	0x3FEFD245E2366A43 /* A00 = +9.944180887426415926811e-01 */
-	.quad	0x3F653D82EC088433 /* A01 = +2.592807490387838333795e-03 */
-	.quad	0xBF3A7DF75E013CB8 /* A02 = -4.042366908878036561859e-04 */
-	.quad	0x3EF6298E69F991CD /* A03 = +2.113564425911141559972e-05 */
-	.quad	0x3FEFE0EAA508BC69 /* A00 = +9.962056372950317539861e-01 */
-	.quad	0x3F5BD0771AF3FDDA /* A01 = +1.697651208644282514598e-03 */
-	.quad	0xBF30B2E1254DE571 /* A02 = -2.548026725928887099328e-04 */
-	.quad	0x3EEAE28B70EC0256 /* A03 = +1.281973848454955042307e-05 */
-	.quad	0x3FEFEAF5303D7F96 /* A00 = +9.974313680831865536192e-01 */
-	.quad	0x3F5229111365657E /* A01 = +1.108423877289460134782e-03 */
-	.quad	0xBF250572D04DFE66 /* A02 = -1.603796628408704519168e-04 */
-	.quad	0x3EE04E89BB57C981 /* A03 = +7.775682983689149966743e-06 */
-	.quad	0x3FEFF1CF52F1CF44 /* A00 = +9.982678051005469122003e-01 */
-	.quad	0x3F47A71316147CEB /* A01 = +7.218211359577819110842e-04 */
-	.quad	0xBF1A6D7604055719 /* A02 = -1.008132248946049582547e-04 */
-	.quad	0x3ED3C8047586A85C /* A03 = +4.716233739913014633626e-06 */
-	.quad	0x3FEFF6770369EF69 /* A00 = +9.988360468555416149528e-01 */
-	.quad	0x3F3EBB261180FBF0 /* A01 = +4.689186039321105101130e-04 */
-	.quad	0xBF1097754FE19D7F /* A02 = -6.329206004950480057066e-05 */
-	.quad	0x3EC7FEFF83BCA0A7 /* A03 = +2.860556404988488738366e-06 */
-	.quad	0x3FEFF99D42371AC4 /* A00 = +9.992204945818561334647e-01 */
-	.quad	0x3F33EB2AEC271F59 /* A01 = +3.039340773764907474054e-04 */
-	.quad	0xBF04CF18E0FC0D79 /* A02 = -3.968996690952969588805e-05 */
-	.quad	0x3EBD1BDBD6019BE9 /* A03 = +1.735021065507727833886e-06 */
-	.quad	0x3FEFFBBCA32B0D91 /* A00 = +9.994795977476532700123e-01 */
-	.quad	0x3F29C41E1615110A /* A01 = +1.965796209707565346710e-04 */
-	.quad	0xBEFA11F93D9DCB5A /* A02 = -2.486248909101414873235e-05 */
-	.quad	0x3EB1A7CA4546F7A7 /* A03 = +1.052345642723709228769e-06 */
-	.quad	0x3FEFFD298B8E8DE2 /* A00 = +9.996535993308806045121e-01 */
-	.quad	0x3F20A1C42D523C5B /* A01 = +1.268913244172078754520e-04 */
-	.quad	0xBEF0507A364AFAE4 /* A02 = -1.555859070622834605755e-05 */
-	.quad	0x3EA56ACA17E7CDF4 /* A03 = +6.382806956848098872313e-07 */
-	.quad	0x3FEFFE1DC82BA5A3 /* A00 = +9.997700604991915929176e-01 */
-	.quad	0x3F156E73B90F1769 /* A01 = +8.175450626798714452801e-05 */
-	.quad	0xBEE4663579D0A09F /* A02 = -9.727122057226747625365e-06 */
-	.quad	0x3E99FAF6FEC5D4C1 /* A03 = +3.871371052824002996020e-07 */
-	.quad	0x3FEFFEF8D0BB5E81 /* A00 = +9.998745037837154514548e-01 */
-	.quad	0x3F06686DA18D39C3 /* A01 = +4.273972098777251447726e-05 */
-	.quad	0xBED46BC298073E90 /* A02 = -4.868731025855742842491e-06 */
-	.quad	0x3E88E42286B9D0FD /* A03 = +1.854535328530838170114e-07 */
-	.quad	0x3FEFFF8DBC68DDC7 /* A00 = +9.999455146670975791423e-01 */
-	.quad	0x3EF26B2953A80AF0 /* A01 = +1.756534514108903368909e-05 */
-	.quad	0xBEBFC4472D580F83 /* A02 = -1.893443529411295465239e-06 */
-	.quad	0x3E72505B4553D19F /* A03 = +6.822456673547912277047e-08 */
-	.quad	0x3FEFFFCED1276609 /* A00 = +9.999765477215883935358e-01 */
-	.quad	0x3EDE1A94C7CC58F5 /* A01 = +7.177313020153979672606e-06 */
-	.quad	0xBEA8A2C988744E57 /* A02 = -7.342066660497443762363e-07 */
-	.quad	0x3E5AF30036BBBAF4 /* A03 = +2.509841882843541084885e-08 */
-	.quad	0x3FEFFFEAFE70FCFC /* A00 = +9.999899835164849370983e-01 */
-	.quad	0x3EC879175E3549F5 /* A01 = +2.917410471128503564412e-06 */
-	.quad	0xBE930E36677D1813 /* A02 = -2.839493400307523115929e-07 */
-	.quad	0x3E43D4005B42D48F /* A03 = +9.233192745401904898013e-09 */
-	.quad	0x3ff0000000000000
-	.quad	0x0000000000000000
-	.quad	0x0000000000000000
-	.quad	0x0000000000000000
-	.align	16
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSignMask */
-	.align	16
-	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _sAbsMask */
-	.align	16
-	.long	0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000 /* _iExpMantMask */
-	.align	16
-	.long	0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000 /* _iExpMask */
-	.align	16
-	.long	0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000 /* _iMinIdxOfsMask */
-	.align	16
-	.long	0x04280000, 0x04280000, 0x04280000, 0x04280000 /* _iMaxIdxMask */
-	.align	16
-	.type	__svml_stanh_data_internal, @object
-	.size	__svml_stanh_data_internal, .-__svml_stanh_data_internal
+	/* Cold case. edx has 1s where there was a special value that
+	   needs to be handled by a tanhf call. Optimize for code size
+	   more so than speed here. */
+L(SPECIAL_VALUES_BRANCH):
+	# LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm12
+	/* On entry rsp is 8 bytes off 16-byte alignment (return address
+       pushed). Subtracting 56 makes rsp 16-byte aligned at the call. */
+	subq	$56, %rsp
+	cfi_def_cfa_offset(64)
+	movups	%xmm0, 24(%rsp)
+	movups	%xmm12, 40(%rsp)
+
+	/* Use rbx/rbp for callee save registers as they get short
+       encoding for many instructions (as compared with r12/r13). */
+	movq	%rbx, (%rsp)
+	cfi_offset(rbx, -64)
+	movq	%rbp, 8(%rsp)
+	cfi_offset(rbp, -56)
+	/* edx has 1s where there was a special value that needs to be handled
+	   by a tanhf call.  */
+	movl	%edx, %ebx
+L(SPECIAL_VALUES_LOOP):
+	# LOE rbx rbp r12 r13 r14 r15
+	/* use rbp as index for special value that is saved across calls to
+	   tanhf. We technically don't need a callee save register here as offset
+	   to rsp is always [0, 12] so we can restore rsp by realigning to 64.
+	   Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
+	   in the loop.  */
+	xorl	%ebp, %ebp
+	bsfl	%ebx, %ebp
+
+	/* Scalar math function call to process special input.  */
+	movss	40(%rsp, %rbp, 4), %xmm0
+	call	tanhf@PLT
+	/* No good way to avoid the store-forwarding fault this will cause on
+	   return. `lfence` avoids the SF fault but at greater cost as it
+	   serializes stack/callee save restoration.  */
+	movss	%xmm0, 24(%rsp, %rbp, 4)
+
+	leal	-1(%rbx), %eax
+	andl	%eax, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+	# LOE r12 r13 r14 r15
+	/* All results have been written to 24(%rsp).  */
+	movups	24(%rsp), %xmm0
+	movq	(%rsp), %rbx
+	cfi_restore(rbx)
+	movq	8(%rsp), %rbp
+	cfi_restore(rbp)
+	addq	$56, %rsp
+	cfi_def_cfa_offset(8)
+	ret
+END(_ZGVbN4v_tanhf_sse4)
-- 
2.34.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v3 1/7] x86: Improve svml_s_atanhf16_core_avx512.S
  2022-06-09 16:58 ` [PATCH v3 " Noah Goldstein
                     ` (5 preceding siblings ...)
  2022-06-09 16:58   ` [PATCH v3 7/7] x86: Optimize svml_s_tanhf4_core_sse4.S Noah Goldstein
@ 2022-06-09 17:04   ` H.J. Lu
  6 siblings, 0 replies; 48+ messages in thread
From: H.J. Lu @ 2022-06-09 17:04 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Thu, Jun 9, 2022 at 9:58 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Improvementss are:

Improvements.

>     1. Reduce code size (-64 bytes).
>     2. Remove redundant move instructions.
>     3. Slightly improve instruction selection/scheduling where
>        possible.
>     4. Reduce rodata size ([-128, -188] bytes).
>
> The throughput improvement is not significant as the port 0 bottleneck
> is unavoidable.
>
>         Function, New Time, Old Time, New / Old
> _ZGVeN16v_atanhf,     1.39,    1.408,     0.987
> ---
>  .../multiarch/svml_s_atanhf16_core_avx512.S   | 474 +++++++++---------
>  1 file changed, 244 insertions(+), 230 deletions(-)
>
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> index a1cd920a0f..f42462c581 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> @@ -31,53 +31,50 @@
>   *
>   */
>
> -/* Offsets for data table __svml_satanh_data_internal_avx512
> - */
> -#define Log_tbl_H                      0
> -#define Log_tbl_L                      128
> -#define One                            256
> -#define AbsMask                                320
> -#define AddB5                          384
> -#define RcpBitMask                     448
> -#define poly_coeff3                    512
> -#define poly_coeff2                    576
> -#define poly_coeff1                    640
> -#define poly_coeff0                    704
> -#define Half                           768
> -#define L2H                            832
> -#define L2L                            896
> +/* Offsets for data table __svml_satanh_data_internal_avx512 and
> +   __svml_satanh_data_internal_avx512_al64. Ordered by use in the
> +   function. On cold-starts this might help the prefetcher. Possibly
> +   a better idea is to interleave start/end so that the prefetcher is
> +   less likely to detect a stream and pull irrelevant lines into
> +   cache.  */
> +
> +/* Offset into __svml_satanh_data_internal_avx512. 4-byte aligned as
> +   the memory is broadcast to {1to16}.  */
> +#define AbsMask                                0
> +
> +/* Offset into __svml_satanh_data_internal_avx512_al64. The full value
> +   is used here.  */
> +#define One                            0
> +#define AddB5                          64
> +#define RcpBitMask                     128
> +#define Log_tbl_L_lo                   192
> +#define Log_tbl_L_hi                   256
> +#define Log_tbl_H_lo                   320
> +#define Log_tbl_H_hi                   384
> +#define L2H                            448
> +#define L2L                            512
> +#define poly_coeff3                    576
> +#define poly_coeff2                    640
> +#define poly_coeff1                    704
>
>  #include <sysdep.h>
>
> +#define ATANHF_DATA(x)                 ((x)+__svml_satanh_data_internal_avx512_al64)
> +
>         .section .text.exex512, "ax", @progbits
>  ENTRY(_ZGVeN16v_atanhf_skx)
> -       pushq   %rbp
> -       cfi_def_cfa_offset(16)
> -       movq    %rsp, %rbp
> -       cfi_def_cfa(6, 16)
> -       cfi_offset(6, -16)
> -       andq    $-64, %rsp
> -       subq    $192, %rsp
> -       vmovups One+__svml_satanh_data_internal_avx512(%rip), %zmm4
> -
> -       /* round reciprocals to 1+5b mantissas */
> -       vmovups AddB5+__svml_satanh_data_internal_avx512(%rip), %zmm14
> -       vmovups RcpBitMask+__svml_satanh_data_internal_avx512(%rip), %zmm1
> -       vmovaps %zmm0, %zmm11
> -       vandps  AbsMask+__svml_satanh_data_internal_avx512(%rip), %zmm11, %zmm6
> +       vandps  AbsMask+__svml_satanh_data_internal_avx512(%rip){1to16}, %zmm0, %zmm6
> +       vmovups ATANHF_DATA(One)(%rip), %zmm4
>
>         /* 1+y */
>         vaddps  {rn-sae}, %zmm4, %zmm6, %zmm9
>
>         /* 1-y */
>         vsubps  {rn-sae}, %zmm6, %zmm4, %zmm8
> -       vxorps  %zmm6, %zmm11, %zmm10
> -
> -       /* Yp_high */
> -       vsubps  {rn-sae}, %zmm4, %zmm9, %zmm2
>
> -       /* -Ym_high */
> -       vsubps  {rn-sae}, %zmm4, %zmm8, %zmm5
> +       /* round reciprocals to 1+5b mantissas */
> +       vmovups ATANHF_DATA(AddB5)(%rip), %zmm14
> +       vmovups ATANHF_DATA(RcpBitMask)(%rip), %zmm1
>
>         /* RcpP ~ 1/Yp */
>         vrcp14ps %zmm9, %zmm12
> @@ -85,15 +82,21 @@ ENTRY(_ZGVeN16v_atanhf_skx)
>         /* RcpM ~ 1/Ym */
>         vrcp14ps %zmm8, %zmm13
>
> +       /* Yp_high */
> +       vsubps  {rn-sae}, %zmm4, %zmm9, %zmm2
> +
> +       /* -Ym_high */
> +       vsubps  {rn-sae}, %zmm4, %zmm8, %zmm5
> +
> +
>         /* input outside (-1, 1) ? */
> -       vcmpps  $21, {sae}, %zmm4, %zmm6, %k0
>         vpaddd  %zmm14, %zmm12, %zmm15
> -       vpaddd  %zmm14, %zmm13, %zmm0
> +       vpaddd  %zmm14, %zmm13, %zmm12
>
>         /* Yp_low */
>         vsubps  {rn-sae}, %zmm2, %zmm6, %zmm3
>         vandps  %zmm1, %zmm15, %zmm7
> -       vandps  %zmm1, %zmm0, %zmm12
> +       vandps  %zmm1, %zmm12, %zmm12
>
>         /* Ym_low */
>         vaddps  {rn-sae}, %zmm5, %zmm6, %zmm5
> @@ -102,225 +105,199 @@ ENTRY(_ZGVeN16v_atanhf_skx)
>         vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9
>
>         /* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low */
> -       vfmsub231ps {rn-sae}, %zmm12, %zmm8, %zmm4
> -       vmovups Log_tbl_L+__svml_satanh_data_internal_avx512(%rip), %zmm8
> -       vmovups Log_tbl_L+64+__svml_satanh_data_internal_avx512(%rip), %zmm13
> +       vfmsub213ps {rn-sae}, %zmm4, %zmm12, %zmm8
> +
> +       vmovups ATANHF_DATA(Log_tbl_L_lo)(%rip), %zmm10
> +       vmovups ATANHF_DATA(Log_tbl_L_hi)(%rip), %zmm13
>
>         /* exponents */
> -       vgetexpps {sae}, %zmm7, %zmm15
>         vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9
> +       vgetexpps {sae}, %zmm7, %zmm15
> +
>
>         /* Table lookups */
> -       vmovups __svml_satanh_data_internal_avx512(%rip), %zmm6
> +       vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm8
>         vgetexpps {sae}, %zmm12, %zmm14
> -       vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm4
> +
>
>         /* Prepare table index */
>         vpsrld  $18, %zmm7, %zmm3
>         vpsrld  $18, %zmm12, %zmm2
> -       vmovups Log_tbl_H+64+__svml_satanh_data_internal_avx512(%rip), %zmm7
> -       vmovups poly_coeff1+__svml_satanh_data_internal_avx512(%rip), %zmm12
> -
> +       vmovups ATANHF_DATA(Log_tbl_H_lo)(%rip), %zmm11
> +       vmovups ATANHF_DATA(Log_tbl_H_hi)(%rip), %zmm7
>         /* Km-Kp */
> +
> +       vmovaps %zmm3, %zmm5
> +       vpermi2ps %zmm13, %zmm10, %zmm3
> +       vpermt2ps %zmm13, %zmm2, %zmm10
> +       vpermi2ps %zmm7, %zmm11, %zmm5
> +       vpermt2ps %zmm7, %zmm2, %zmm11
>         vsubps  {rn-sae}, %zmm15, %zmm14, %zmm1
> -       kmovw   %k0, %edx
> -       vmovaps %zmm3, %zmm0
> -       vpermi2ps %zmm13, %zmm8, %zmm3
> -       vpermt2ps %zmm13, %zmm2, %zmm8
> -       vpermi2ps %zmm7, %zmm6, %zmm0
> -       vpermt2ps %zmm7, %zmm2, %zmm6
> -       vsubps  {rn-sae}, %zmm3, %zmm8, %zmm5
> +       vsubps  {rn-sae}, %zmm3, %zmm10, %zmm7
>
>         /* K*L2H + Th */
> -       vmovups L2H+__svml_satanh_data_internal_avx512(%rip), %zmm2
> +       vmovups ATANHF_DATA(L2H)(%rip), %zmm2
>
>         /* K*L2L + Tl */
> -       vmovups L2L+__svml_satanh_data_internal_avx512(%rip), %zmm3
> -
> -       /* polynomials */
> -       vmovups poly_coeff3+__svml_satanh_data_internal_avx512(%rip), %zmm7
> -       vmovups poly_coeff0+__svml_satanh_data_internal_avx512(%rip), %zmm13
> +       vmovups ATANHF_DATA(L2L)(%rip), %zmm3
>
>         /* table values */
> -       vsubps  {rn-sae}, %zmm0, %zmm6, %zmm0
> -       vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm0
> -       vfmadd213ps {rn-sae}, %zmm5, %zmm3, %zmm1
> -       vmovups poly_coeff2+__svml_satanh_data_internal_avx512(%rip), %zmm3
> -       vmovaps %zmm3, %zmm2
> -       vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm2
> -       vfmadd231ps {rn-sae}, %zmm4, %zmm7, %zmm3
> -       vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm2
> -       vfmadd213ps {rn-sae}, %zmm12, %zmm4, %zmm3
> -       vfmadd213ps {rn-sae}, %zmm13, %zmm9, %zmm2
> -       vfmadd213ps {rn-sae}, %zmm13, %zmm4, %zmm3
> +       vsubps  {rn-sae}, %zmm5, %zmm11, %zmm5
> +       vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm5
> +       vfmadd213ps {rn-sae}, %zmm7, %zmm3, %zmm1
> +       /* polynomials */
> +       vmovups ATANHF_DATA(poly_coeff3)(%rip), %zmm7
> +       vmovups ATANHF_DATA(poly_coeff2)(%rip), %zmm10
> +       vmovaps %zmm10, %zmm14
> +       vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm10
> +       vfmadd231ps {rn-sae}, %zmm8, %zmm7, %zmm14
> +       vmovups ATANHF_DATA(poly_coeff1)(%rip), %zmm12
> +       vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm10
> +       vfmadd213ps {rn-sae}, %zmm12, %zmm8, %zmm14
> +       vfmadd213ps {rn-sae}, %zmm4, %zmm9, %zmm10
> +       vfmadd213ps {rn-sae}, %zmm4, %zmm8, %zmm14
>
>         /* (K*L2L + Tl) + Rp*PolyP */
> -       vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm2
> -       vorps   Half+__svml_satanh_data_internal_avx512(%rip), %zmm10, %zmm9
> +       vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm10
> +
> +       /* zmm12 = zmm12 & (zmm4 | zmm0).  */
> +       vpternlogq $0xe0, %zmm0, %zmm4, %zmm12
>
>         /* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM */
> -       vfnmadd213ps {rn-sae}, %zmm2, %zmm4, %zmm3
> -       vaddps  {rn-sae}, %zmm3, %zmm0, %zmm4
> -       vmulps  {rn-sae}, %zmm9, %zmm4, %zmm0
> +       vfnmadd213ps {rn-sae}, %zmm5, %zmm8, %zmm14
> +       vaddps  {rn-sae}, %zmm14, %zmm10, %zmm8
> +
> +       vcmpps  $21, {sae}, %zmm4, %zmm6, %k0
> +       kmovw   %k0, %edx
>         testl   %edx, %edx
>
>         /* Go to special inputs processing branch */
>         jne     L(SPECIAL_VALUES_BRANCH)
> -       # LOE rbx r12 r13 r14 r15 edx zmm0 zmm11
> +       # LOE rbx r12 r13 r14 r15 zmm0 zmm8 zmm12
> +       vmulps  {rn-sae}, %zmm12, %zmm8, %zmm0
>
> -       /* Restore registers
> -        * and exit the function
> -        */
> -
> -L(EXIT):
> -       movq    %rbp, %rsp
> -       popq    %rbp
> -       cfi_def_cfa(7, 8)
> -       cfi_restore(6)
> +       /* No register to restore on fast path.  */
>         ret
> -       cfi_def_cfa(6, 16)
> -       cfi_offset(6, -16)
> -
> -       /* Branch to process
> -        * special inputs
> -        */
>
> +       /* Cold case. edx has 1s where there was a special value that
> +          needs to be handled by an atanhf call. Optimize for code size
> +          more so than speed here. */
>  L(SPECIAL_VALUES_BRANCH):
> -       vmovups %zmm11, 64(%rsp)
> -       vmovups %zmm0, 128(%rsp)
> -       # LOE rbx r12 r13 r14 r15 edx zmm0
> -
> -       xorl    %eax, %eax
> -       # LOE rbx r12 r13 r14 r15 eax edx
> -
> -       vzeroupper
> -       movq    %r12, 16(%rsp)
> -       /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> -       movl    %eax, %r12d
> -       movq    %r13, 8(%rsp)
> -       /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> -       movl    %edx, %r13d
> -       movq    %r14, (%rsp)
> -       /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> -       # LOE rbx r15 r12d r13d
> -
> -       /* Range mask
> -        * bits check
> +       # LOE rbx rdx r12 r13 r14 r15 zmm0 zmm8 zmm12
> +    /* Use r13 to save/restore the stack. This allows us to use rbp as
> +       callee save register saving code size. */
> +       pushq   %r13
> +       cfi_adjust_cfa_offset(8)
> +       cfi_offset(r13, -16)
> +       /* Need callee-saved registers to preserve state across atanhf calls.
>          */
> +       pushq   %rbx
> +       cfi_adjust_cfa_offset(8)
> +       cfi_offset(rbx, -24)
> +       pushq   %rbp
> +       cfi_adjust_cfa_offset(8)
> +       cfi_offset(rbp, -32)
> +       movq    %rsp, %r13
> +       cfi_def_cfa_register(r13)
>
> -L(RANGEMASK_CHECK):
> -       btl     %r12d, %r13d
> -
> -       /* Call scalar math function */
> -       jc      L(SCALAR_MATH_CALL)
> -       # LOE rbx r15 r12d r13d
> -
> -       /* Special inputs
> -        * processing loop
> -        */
> +       /* Align stack and make room for 2x zmm vectors.  */
> +       andq    $-64, %rsp
> +       addq    $-128, %rsp
> +       vmulps  {rn-sae}, %zmm12, %zmm8, %zmm1
> +       vmovaps %zmm1, (%rsp)
> +       vmovaps %zmm0, 64(%rsp)
> +       vzeroupper
>
> +       /* edx has 1s where there was a special value that needs to be handled
> +          by an atanhf call.  */
> +       movl    %edx, %ebx
>  L(SPECIAL_VALUES_LOOP):
> -       incl    %r12d
> -       cmpl    $16, %r12d
> -
> -       /* Check bits in range mask */
> -       jl      L(RANGEMASK_CHECK)
> -       # LOE rbx r15 r12d r13d
> -
> -       movq    16(%rsp), %r12
> -       cfi_restore(12)
> -       movq    8(%rsp), %r13
> -       cfi_restore(13)
> -       movq    (%rsp), %r14
> -       cfi_restore(14)
> -       vmovups 128(%rsp), %zmm0
> -
> -       /* Go to exit */
> -       jmp     L(EXIT)
> -       /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> -       /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> -       /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> -       # LOE rbx r12 r13 r14 r15 zmm0
> -
> -       /* Scalar math fucntion call
> -        * to process special input
> -        */
> -
> -L(SCALAR_MATH_CALL):
> -       movl    %r12d, %r14d
> -       movss   64(%rsp, %r14, 4), %xmm0
> +       # LOE rbx rbp r12 r13 r14 r15
> +       /* use rbp as index for special value that is saved across calls to
> +          atanhf. We technically don't need a callee save register here as offset
> +          to rsp is always [0, 56] so we can restore rsp by realigning to 64.
> +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> +          in the loop. Realigning also costs more code size.  */
> +       xorl    %ebp, %ebp
> +       tzcntl  %ebx, %ebp
> +
> +       /* Scalar math function call to process special input.  */
> +       movss   64(%rsp, %rbp, 4), %xmm0
>         call    atanhf@PLT
> -       # LOE rbx r14 r15 r12d r13d xmm0
> -
> -       movss   %xmm0, 128(%rsp, %r14, 4)
>
> -       /* Process special inputs in loop */
> -       jmp     L(SPECIAL_VALUES_LOOP)
> -       # LOE rbx r15 r12d r13d
> +       /* No good way to avoid the store-forwarding fault this will cause on
> +          return. `lfence` avoids the SF fault but at greater cost as it
> +          serializes stack/callee save restoration.  */
> +       movss   %xmm0, (%rsp, %rbp, 4)
> +
> +       blsrl   %ebx, %ebx
> +       jnz     L(SPECIAL_VALUES_LOOP)
> +       # LOE r12 r13 r14 r15
> +
> +       /* All results have been written to (%rsp).  */
> +       vmovaps (%rsp), %zmm0
> +       /* Restore rsp.  */
> +       movq    %r13, %rsp
> +       cfi_def_cfa_register(rsp)
> +       /* Restore callee save registers.  */
> +       popq    %rbp
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_restore(rbp)
> +       popq    %rbx
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_restore(rbp)
> +       popq    %r13
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_restore(r13)
> +       ret
>  END(_ZGVeN16v_atanhf_skx)
>
>         .section .rodata, "a"
> -       .align  64
> -
> +       .align  4
>  #ifdef __svml_satanh_data_internal_avx512_typedef
>  typedef unsigned int VUINT32;
> -typedef struct {
> -       __declspec(align(64)) VUINT32 Log_tbl_H[32][1];
> -       __declspec(align(64)) VUINT32 Log_tbl_L[32][1];
> +typedef struct{
> +       __declspec(align(4)) VUINT32 AbsMask[1][1];
>         __declspec(align(64)) VUINT32 One[16][1];
> -       __declspec(align(64)) VUINT32 AbsMask[16][1];
>         __declspec(align(64)) VUINT32 AddB5[16][1];
>         __declspec(align(64)) VUINT32 RcpBitMask[16][1];
> +       __declspec(align(64)) VUINT32 Log_tbl_L_lo[16][1];
> +       __declspec(align(64)) VUINT32 Log_tbl_L_hi[16][1];
> +       __declspec(align(64)) VUINT32 Log_tbl_H_lo[16][1];
> +       __declspec(align(64)) VUINT32 Log_tbl_H_hi[16][1];
> +       __declspec(align(64)) VUINT32 L2H[16][1];
> +       __declspec(align(64)) VUINT32 L2L[16][1];
>         __declspec(align(64)) VUINT32 poly_coeff3[16][1];
>         __declspec(align(64)) VUINT32 poly_coeff2[16][1];
>         __declspec(align(64)) VUINT32 poly_coeff1[16][1];
> -       __declspec(align(64)) VUINT32 poly_coeff0[16][1];
> -       __declspec(align(64)) VUINT32 Half[16][1];
> -       __declspec(align(64)) VUINT32 L2H[16][1];
> -       __declspec(align(64)) VUINT32 L2L[16][1];
>  } __svml_satanh_data_internal_avx512;
>  #endif
>  __svml_satanh_data_internal_avx512:
> -       /* Log_tbl_H */
> -       .long   0x00000000
> -       .long   0x3cfc0000
> -       .long   0x3d780000
> -       .long   0x3db78000
> -       .long   0x3df10000
> -       .long   0x3e14c000
> -       .long   0x3e300000
> -       .long   0x3e4a8000
> -       .long   0x3e648000
> -       .long   0x3e7dc000
> -       .long   0x3e8b4000
> -       .long   0x3e974000
> -       .long   0x3ea30000
> -       .long   0x3eae8000
> -       .long   0x3eb9c000
> -       .long   0x3ec4e000
> -       .long   0x3ecfa000
> -       .long   0x3eda2000
> -       .long   0x3ee48000
> -       .long   0x3eeea000
> -       .long   0x3ef8a000
> -       .long   0x3f013000
> -       .long   0x3f05f000
> -       .long   0x3f0aa000
> -       .long   0x3f0f4000
> -       .long   0x3f13d000
> -       .long   0x3f184000
> -       .long   0x3f1ca000
> -       .long   0x3f20f000
> -       .long   0x3f252000
> -       .long   0x3f295000
> -       .long   0x3f2d7000
> -       /* Log_tbl_L */
> +       /* Leave this at front so we can potentially save space due to
> +          smaller alignment constraint.  */
> +       .align  4
> +    /* AbsMask */
> +       .long   0x7fffffff
> +       .align  64
> +__svml_satanh_data_internal_avx512_al64:
> +       /* One */
> +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> +       /* AddB5 */
> +       .align  64
> +       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000
> +       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000
> +       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000
> +       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000
> +       /* RcpBitMask */
> +       .align  64
> +       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> +       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> +       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> +       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> +       /* Log_tbl_L_lo */
>         .align  64
>         .long   0x00000000
>         .long   0x3726c39e
> @@ -338,6 +315,8 @@ __svml_satanh_data_internal_avx512:
>         .long   0x38dedfac
>         .long   0x38ebfb5e
>         .long   0xb8e63c9f
> +       /* Log_tbl_L_hi */
> +       .align  64
>         .long   0xb85c1340
>         .long   0x38777bcd
>         .long   0xb6038656
> @@ -354,39 +333,74 @@ __svml_satanh_data_internal_avx512:
>         .long   0x38f85db0
>         .long   0x37b4996f
>         .long   0xb8bfb3ca
> -       /* One */
> +       /* Log_tbl_H_lo */
>         .align  64
> -       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> -       /* AbsMask */
> +       .long   0x00000000
> +       .long   0x3cfc0000
> +       .long   0x3d780000
> +       .long   0x3db78000
> +       .long   0x3df10000
> +       .long   0x3e14c000
> +       .long   0x3e300000
> +       .long   0x3e4a8000
> +       .long   0x3e648000
> +       .long   0x3e7dc000
> +       .long   0x3e8b4000
> +       .long   0x3e974000
> +       .long   0x3ea30000
> +       .long   0x3eae8000
> +       .long   0x3eb9c000
> +       .long   0x3ec4e000
> +       /* Log_tbl_H_hi */
>         .align  64
> -       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> -       /* AddB5 */
> +       .long   0x3ecfa000
> +       .long   0x3eda2000
> +       .long   0x3ee48000
> +       .long   0x3eeea000
> +       .long   0x3ef8a000
> +       .long   0x3f013000
> +       .long   0x3f05f000
> +       .long   0x3f0aa000
> +       .long   0x3f0f4000
> +       .long   0x3f13d000
> +       .long   0x3f184000
> +       .long   0x3f1ca000
> +       .long   0x3f20f000
> +       .long   0x3f252000
> +       .long   0x3f295000
> +       .long   0x3f2d7000
> +       /* L2H = log(2)_high */
>         .align  64
> -       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000
> -       /* RcpBitMask */
> +       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> +       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> +       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> +       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> +       /* L2L = log(2)_low */
>         .align  64
> -       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> +       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> +       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> +       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> +       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
>         /* poly_coeff3 */
>         .align  64
> -       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> +       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> +       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> +       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> +       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
>         /* poly_coeff2 */
>         .align  64
> -       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> +       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> +       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> +       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> +       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
>         /* poly_coeff1 */
>         .align  64
> -       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> -       /* poly_coeff0 */
> -       .align  64
> -       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> -       /* Half */
> -       .align  64
> -       .long   0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
> -       /* L2H = log(2)_high */
> -       .align  64
> -       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> -       /* L2L = log(2)_low */
> -       .align  64
> -       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
>         .align  64
> +       .type   __svml_satanh_data_internal_avx512_al64, @object
> +       .size   __svml_satanh_data_internal_avx512_al64, .-__svml_satanh_data_internal_avx512_al64
>         .type   __svml_satanh_data_internal_avx512, @object
>         .size   __svml_satanh_data_internal_avx512, .-__svml_satanh_data_internal_avx512
> --
> 2.34.1
>

OK with commit log typo fix.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v3 2/7] x86: Improvement svml_s_atanhf8_core_avx2.S
  2022-06-09 16:58   ` [PATCH v3 2/7] x86: Improvement svml_s_atanhf8_core_avx2.S Noah Goldstein
@ 2022-06-09 17:05     ` H.J. Lu
  0 siblings, 0 replies; 48+ messages in thread
From: H.J. Lu @ 2022-06-09 17:05 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Thu, Jun 9, 2022 at 9:58 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Improvements are:
>     1. Reduce code size (-60 bytes).
>     2. Remove redundant move instructions.
>     3. Slightly improve instruction selection/scheduling where
>        possible.
>     4. Prefer registers which get short instruction encoding.
>     5. Shrink rodata usage (-32 bytes).
>
> The throughput improvement is not that significant (3-5%) as the
> port 0 bottleneck is unavoidable.
>
>        Function, New Time, Old Time, New / Old
> _ZGVdN8v_atanhf,    2.799,    2.923,     0.958
> ---
>  .../fpu/multiarch/svml_s_atanhf8_core_avx2.S  | 405 +++++++++---------
>  1 file changed, 202 insertions(+), 203 deletions(-)
>
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
> index c1ea1c3353..43eb423831 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
> @@ -30,305 +30,304 @@
>   *
>   */
>
> -/* Offsets for data table __svml_satanh_data_internal
> - */
> +/* Offsets for data table __svml_satanh_data_internal. Ordered
> +   by use in the function. On cold-starts this might help the
> +   prefetcher. Possibly a better idea is to interleave start/end so
> +   that the prefetcher is less likely to detect a stream and pull
> +   irrelevant lines into cache.  */
>  #define SgnMask                                0
>  #define sOne                           32
> -#define sPoly                          64
> -#define iBrkValue                      320
> -#define iOffExpoMask                   352
> -#define sHalf                          384
> -#define sSign                          416
> -#define sTopMask12                     448
> -#define TinyRange                      480
> -#define sLn2                           512
> +#define sTopMask12                     64
> +#define TinyRange                      96
> +#define iBrkValue                      128
> +#define iOffExpoMask                   160
> +#define sPoly                          192
> +#define sLn2                           448
> +#define sHalf                          480
>
>  #include <sysdep.h>
> +#define ATANHF_DATA(x)                 ((x)+__svml_satanh_data_internal)
>
>         .section .text.avx2, "ax", @progbits
>  ENTRY(_ZGVdN8v_atanhf_avx2)
> -       pushq   %rbp
> -       cfi_def_cfa_offset(16)
> -       movq    %rsp, %rbp
> -       cfi_def_cfa(6, 16)
> -       cfi_offset(6, -16)
> -       andq    $-32, %rsp
> -       subq    $96, %rsp
> -
> +       /* Strip off the sign, so treat X as positive until right at the end */
> +       vmovaps ATANHF_DATA(SgnMask)(%rip), %ymm2
> +       vandps  %ymm2, %ymm0, %ymm3
>         /* Load constants including One = 1 */
> -       vmovups sOne+__svml_satanh_data_internal(%rip), %ymm5
> -       vmovups sTopMask12+__svml_satanh_data_internal(%rip), %ymm13
> -       vmovaps %ymm0, %ymm6
> +       vmovups ATANHF_DATA(sOne)(%rip), %ymm5
> +       vsubps  %ymm3, %ymm5, %ymm1
> +       vmovups ATANHF_DATA(sTopMask12)(%rip), %ymm4
>
> -       /* Strip off the sign, so treat X as positive until right at the end */
> -       vandps  SgnMask+__svml_satanh_data_internal(%rip), %ymm6, %ymm10
> -       vsubps  %ymm10, %ymm5, %ymm1
> +       vrcpps  %ymm1, %ymm7
> +       vsubps  %ymm1, %ymm5, %ymm9
> +       vandps  %ymm4, %ymm7, %ymm6
> +       vsubps  %ymm3, %ymm9, %ymm7
>
> -       /*
> -        * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
> -        * the upper part UHi being <= 12 bits long. Then we have
> -        * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
> -        */
> -       vaddps  %ymm10, %ymm10, %ymm14
> +       /* No need to split sU when FMA is available */
> +       vfnmadd213ps %ymm5, %ymm6, %ymm1
> +       vmovaps %ymm0, %ymm8
> +       vfmadd213ps %ymm0, %ymm0, %ymm0
> +       vfnmadd231ps %ymm6, %ymm7, %ymm1
>
>         /*
>          * Check whether |X| < 1, in which case we use the main function.
>          * Otherwise set the rangemask so that the callout will get used.
>          * Note that this will also use the callout for NaNs since not(NaN < 1).
>          */
> -       vcmpnlt_uqps %ymm5, %ymm10, %ymm7
> -       vsubps  %ymm1, %ymm5, %ymm9
> -       vcmplt_oqps TinyRange+__svml_satanh_data_internal(%rip), %ymm10, %ymm4
> -       vrcpps  %ymm1, %ymm11
> -       vsubps  %ymm10, %ymm9, %ymm12
> -       vandps  %ymm13, %ymm11, %ymm0
> +       vcmpnlt_uqps %ymm5, %ymm3, %ymm14
> +       vcmplt_oqps ATANHF_DATA(TinyRange)(%rip), %ymm3, %ymm15
>
> -       /* No need to split sU when FMA is available */
> -       vfnmadd213ps %ymm5, %ymm0, %ymm1
> -       vmovaps %ymm6, %ymm8
> -       vfmadd213ps %ymm6, %ymm6, %ymm8
> -       vfnmadd231ps %ymm0, %ymm12, %ymm1
> +       /*
> +        * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
> +        * the upper part UHi being <= 12 bits long. Then we have
> +        * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
> +        */
> +       vaddps  %ymm3, %ymm3, %ymm3
>
>         /*
>          * Split V as well into upper 12 bits and lower part, so that we can get
>          * a preliminary quotient estimate without rounding error.
>          */
> -       vandps  %ymm13, %ymm14, %ymm15
> -       vmovmskps %ymm7, %edx
> -       vsubps  %ymm15, %ymm14, %ymm7
> +       vandps  %ymm4, %ymm3, %ymm4
> +       vsubps  %ymm4, %ymm3, %ymm7
>
>         /* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
> -       vmulps  %ymm15, %ymm0, %ymm10
> +       vmulps  %ymm4, %ymm6, %ymm4
>
>         /* Compute D = E + E^2 */
>         vfmadd213ps %ymm1, %ymm1, %ymm1
>
> -       /* Record the sign for eventual reincorporation. */
> -       vandps  sSign+__svml_satanh_data_internal(%rip), %ymm6, %ymm3
> +       /* Record the sign for eventual reincorporation.  */
> +       vandnps %ymm8, %ymm2, %ymm3
>
>         /* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
> -       vorps   %ymm3, %ymm8, %ymm2
> -       vmulps  %ymm7, %ymm0, %ymm8
> +       vorps   %ymm3, %ymm0, %ymm13
> +       vmulps  %ymm7, %ymm6, %ymm2
>
>         /*
>          * Compute R * (VHi + VLo) * (1 + E + E^2)
>          * = R *  (VHi + VLo) * (1 + D)
>          * = QHi + (QHi * D + QLo + QLo * D)
>          */
> -       vmulps  %ymm1, %ymm10, %ymm9
> -       vfmadd213ps %ymm8, %ymm8, %ymm1
> -       vaddps  %ymm1, %ymm9, %ymm1
>
> -       /* reduction: compute r, n */
> -       vmovups iBrkValue+__svml_satanh_data_internal(%rip), %ymm9
> +       /*
> +        * If less precision is acceptable the `vmulps %ymm1, %ymm4, %ymm9;
> +        * vaddps %ymm1, %ymm9, %ymm1` can be replaced with
> +        * `vfmadd231ps %ymm1, %ymm4, %ymm4`.
> +        */
> +       vmulps  %ymm1, %ymm4, %ymm6
> +       vfmadd213ps %ymm2, %ymm2, %ymm1
> +       vaddps  %ymm1, %ymm6, %ymm1
>
>         /*
>          * Now finally accumulate the high and low parts of the
>          * argument to log1p, H + L, with a final compensated summation.
>          */
> -       vaddps  %ymm1, %ymm10, %ymm12
> -       vsubps  %ymm12, %ymm10, %ymm11
> +       vaddps  %ymm1, %ymm4, %ymm2
> +
> +       /* reduction: compute r, n */
> +       vmovups ATANHF_DATA(iBrkValue)(%rip), %ymm9
>
>         /*
>          * Now we feed into the log1p code, using H in place of _VARG1 and
>          * later incorporating L into the reduced argument.
>          * compute 1+x as high, low parts
>          */
> -       vmaxps  %ymm12, %ymm5, %ymm13
> -       vminps  %ymm12, %ymm5, %ymm14
> -       vaddps  %ymm11, %ymm1, %ymm0
> -       vaddps  %ymm14, %ymm13, %ymm1
> -       vpsubd  %ymm9, %ymm1, %ymm7
> -       vsubps  %ymm1, %ymm13, %ymm15
> -       vpsrad  $23, %ymm7, %ymm10
> -       vpand   iOffExpoMask+__svml_satanh_data_internal(%rip), %ymm7, %ymm8
> -       vaddps  %ymm15, %ymm14, %ymm13
> -       vpslld  $23, %ymm10, %ymm11
> -       vpaddd  %ymm9, %ymm8, %ymm15
> -       vaddps  %ymm13, %ymm0, %ymm14
> -       vcvtdq2ps %ymm10, %ymm0
> -       vpsubd  %ymm11, %ymm5, %ymm12
> +       vmaxps  %ymm2, %ymm5, %ymm0
> +       vminps  %ymm2, %ymm5, %ymm6
> +
> +       /* This is needed for rounding (see `vaddps %ymm1, %ymm4, %ymm2`).  */
> +       vsubps  %ymm2, %ymm4, %ymm2
> +       vaddps  %ymm6, %ymm0, %ymm4
> +       vpsubd  %ymm9, %ymm4, %ymm7
> +       vsubps  %ymm4, %ymm0, %ymm4
> +       vaddps  %ymm2, %ymm1, %ymm2
> +       vmovaps ATANHF_DATA(iOffExpoMask)(%rip), %ymm1
> +
> +       vandps  %ymm1, %ymm7, %ymm0
> +       vaddps  %ymm4, %ymm6, %ymm4
> +       vandnps %ymm7, %ymm1, %ymm6
> +       vmovups ATANHF_DATA(sPoly+0)(%rip), %ymm1
> +       vpaddd  %ymm9, %ymm0, %ymm0
> +       vaddps  %ymm4, %ymm2, %ymm4
> +       vpsubd  %ymm6, %ymm5, %ymm6
>
>         /* polynomial evaluation */
> -       vsubps  %ymm5, %ymm15, %ymm5
> -       vmulps  %ymm14, %ymm12, %ymm1
> -       vaddps  %ymm5, %ymm1, %ymm5
> -       vmovups sPoly+224+__svml_satanh_data_internal(%rip), %ymm1
> -       vfmadd213ps sPoly+192+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> -       vfmadd213ps sPoly+160+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> -       vfmadd213ps sPoly+128+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> -       vfmadd213ps sPoly+96+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> -       vfmadd213ps sPoly+64+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> -       vfmadd213ps sPoly+32+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> -       vfmadd213ps sPoly+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> -       vmulps  %ymm1, %ymm5, %ymm7
> -       vfmadd213ps %ymm5, %ymm5, %ymm7
> +       vsubps  %ymm5, %ymm0, %ymm2
> +       vfmadd231ps %ymm4, %ymm6, %ymm2
> +       vfmadd213ps ATANHF_DATA(sPoly+32)(%rip), %ymm2, %ymm1
> +       vfmadd213ps ATANHF_DATA(sPoly+64)(%rip), %ymm2, %ymm1
> +       vfmadd213ps ATANHF_DATA(sPoly+96)(%rip), %ymm2, %ymm1
> +       vfmadd213ps ATANHF_DATA(sPoly+128)(%rip), %ymm2, %ymm1
> +       vfmadd213ps ATANHF_DATA(sPoly+160)(%rip), %ymm2, %ymm1
> +       vfmadd213ps ATANHF_DATA(sPoly+192)(%rip), %ymm2, %ymm1
> +       vfmadd213ps ATANHF_DATA(sPoly+224)(%rip), %ymm2, %ymm1
> +
> +       vmulps  %ymm1, %ymm2, %ymm1
> +       vfmadd213ps %ymm2, %ymm2, %ymm1
>
>         /* final reconstruction */
> -       vfmadd132ps sLn2+__svml_satanh_data_internal(%rip), %ymm7, %ymm0
> +       vpsrad  $23, %ymm7, %ymm6
> +       vcvtdq2ps %ymm6, %ymm2
> +       vfmadd132ps ATANHF_DATA(sLn2)(%rip), %ymm1, %ymm2
>
>         /* Finally, halve the result and reincorporate the sign */
> -       vxorps  sHalf+__svml_satanh_data_internal(%rip), %ymm3, %ymm3
> -       vmulps  %ymm0, %ymm3, %ymm0
> -       vblendvps %ymm4, %ymm2, %ymm0, %ymm0
> +       vxorps  ATANHF_DATA(sHalf)(%rip), %ymm3, %ymm3
> +       vmulps  %ymm2, %ymm3, %ymm2
> +       vmovmskps %ymm14, %edx
>         testl   %edx, %edx
>
> +       vblendvps %ymm15, %ymm13, %ymm2, %ymm0
>         /* Go to special inputs processing branch */
>         jne     L(SPECIAL_VALUES_BRANCH)
> -       # LOE rbx r12 r13 r14 r15 edx ymm0 ymm6
> -
> -       /* Restore registers
> -        * and exit the function
> -        */
> -
> -L(EXIT):
> -       movq    %rbp, %rsp
> -       popq    %rbp
> -       cfi_def_cfa(7, 8)
> -       cfi_restore(6)
> +       # LOE rbx rdx r12 r13 r14 r15 ymm0
> +       /* No registers to restore on fast path.  */
>         ret
> -       cfi_def_cfa(6, 16)
> -       cfi_offset(6, -16)
>
> -       /* Branch to process
> -        * special inputs
> -        */
>
> +       /* Cold case. edx has 1s where there was a special value that
> +          needs to be handled by an atanhf call. Optimize for code size
> +          more so than speed here. */
>  L(SPECIAL_VALUES_BRANCH):
> -       vmovups %ymm6, 32(%rsp)
> -       vmovups %ymm0, 64(%rsp)
> -       # LOE rbx r12 r13 r14 r15 edx ymm0
> -
> -       xorl    %eax, %eax
> -       # LOE rbx r12 r13 r14 r15 eax edx
> -
> -       vzeroupper
> -       movq    %r12, 16(%rsp)
> -       /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
> -       movl    %eax, %r12d
> -       movq    %r13, 8(%rsp)
> -       /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
> -       movl    %edx, %r13d
> -       movq    %r14, (%rsp)
> -       /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
> -       # LOE rbx r15 r12d r13d
> -
> -       /* Range mask
> -        * bits check
> +       # LOE rbx rdx r12 r13 r14 r15 ymm0 ymm8
> +    /* Use r13 to save/restore the stack. This allows us to use rbp as
> +       callee save register saving code size. */
> +       pushq   %r13
> +       cfi_adjust_cfa_offset(8)
> +       cfi_offset(r13, -16)
> +       /* Need to callee save registers to preserve state across atanhf calls.
>          */
> +       pushq   %rbx
> +       cfi_adjust_cfa_offset(8)
> +       cfi_offset(rbx, -24)
> +       pushq   %rbp
> +       cfi_adjust_cfa_offset(8)
> +       cfi_offset(rbp, -32)
> +       movq    %rsp, %r13
> +       cfi_def_cfa_register(r13)
>
> -L(RANGEMASK_CHECK):
> -       btl     %r12d, %r13d
> +       /* Align stack and make room for 2x ymm vectors.  */
> +       andq    $-32, %rsp
> +       addq    $-64, %rsp
>
> -       /* Call scalar math function */
> -       jc      L(SCALAR_MATH_CALL)
> -       # LOE rbx r15 r12d r13d
> +       /* Save all already computed inputs.  */
> +       vmovups %ymm0, (%rsp)
> +       /* Save original input (ymm8 unchanged up to this point).  */
> +       vmovups %ymm8, 32(%rsp)
>
> -       /* Special inputs
> -        * processing loop
> -        */
> +       vzeroupper
>
> +       /* edx has 1s where there was a special value that needs to be handled
> +          by an atanhf call.  */
> +       movl    %edx, %ebx
>  L(SPECIAL_VALUES_LOOP):
> -       incl    %r12d
> -       cmpl    $8, %r12d
> -
> -       /* Check bits in range mask */
> -       jl      L(RANGEMASK_CHECK)
> -       # LOE rbx r15 r12d r13d
> -
> -       movq    16(%rsp), %r12
> -       cfi_restore(12)
> -       movq    8(%rsp), %r13
> -       cfi_restore(13)
> -       movq    (%rsp), %r14
> -       cfi_restore(14)
> -       vmovups 64(%rsp), %ymm0
> -
> -       /* Go to exit */
> -       jmp     L(EXIT)
> -       /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
> -       /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
> -       /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
> -       # LOE rbx r12 r13 r14 r15 ymm0
> -
> -       /* Scalar math fucntion call
> -        * to process special input
> -        */
> -
> -L(SCALAR_MATH_CALL):
> -       movl    %r12d, %r14d
> -       movss   32(%rsp, %r14, 4), %xmm0
> +       # LOE rbx rbp r12 r13 r14 r15
> +       /* use rbp as index for special value that is saved across calls to
> +          atanhf. We technically don't need a callee save register here as offset
> +          to rsp is always [0, 28] so we can restore rsp by realigning to 64.
> +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> +          in the loop. Realigning also costs more code size.  */
> +       xorl    %ebp, %ebp
> +       tzcntl  %ebx, %ebp
> +
> +       /* Scalar math function call to process special input.  */
> +       movss   32(%rsp, %rbp, 4), %xmm0
>         call    atanhf@PLT
> -       # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 64(%rsp, %r14, 4)
> +       /* No good way to avoid the store-forwarding fault this will cause on
> +          return. `lfence` avoids the SF fault but at greater cost as it
> +          serializes stack/callee save restoration.  */
> +       movss   %xmm0, (%rsp, %rbp, 4)
> +
> +       blsrl   %ebx, %ebx
> +       jnz     L(SPECIAL_VALUES_LOOP)
> +       # LOE r12 r13 r14 r15
> +
>
> -       /* Process special inputs in loop */
> -       jmp     L(SPECIAL_VALUES_LOOP)
> -       # LOE rbx r15 r12d r13d
> +       /* All results have been written to (%rsp).  */
> +       vmovups (%rsp), %ymm0
> +       /* Restore rsp.  */
> +       movq    %r13, %rsp
> +       cfi_def_cfa_register(rsp)
> +       /* Restore callee save registers.  */
> +       popq    %rbp
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_restore(rbp)
> +       popq    %rbx
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_restore(rbp)
> +       popq    %r13
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_restore(r13)
> +       ret
>  END(_ZGVdN8v_atanhf_avx2)
>
>         .section .rodata, "a"
>         .align  32
> -
>  #ifdef __svml_satanh_data_internal_typedef
>  typedef unsigned int VUINT32;
> -typedef struct {
> +typedef struct{
>         __declspec(align(32)) VUINT32 SgnMask[8][1];
>         __declspec(align(32)) VUINT32 sOne[8][1];
> -       __declspec(align(32)) VUINT32 sPoly[8][8][1];
> -       __declspec(align(32)) VUINT32 iBrkValue[8][1];
> -       __declspec(align(32)) VUINT32 iOffExpoMask[8][1];
> -       __declspec(align(32)) VUINT32 sHalf[8][1];
> -       __declspec(align(32)) VUINT32 sSign[8][1];
>         __declspec(align(32)) VUINT32 sTopMask12[8][1];
>         __declspec(align(32)) VUINT32 TinyRange[8][1];
> +       __declspec(align(32)) VUINT32 iBrkValue[8][1];
> +       __declspec(align(32)) VUINT32 iOffExpoMask[8][1];
> +       __declspec(align(32)) VUINT32 sPoly[8][8][1];
>         __declspec(align(32)) VUINT32 sLn2[8][1];
> +       __declspec(align(32)) VUINT32 sHalf[8][1];
>  } __svml_satanh_data_internal;
>  #endif
>  __svml_satanh_data_internal:
>         /* SgnMask */
> -       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> +       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> +       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
>         /* sOne = SP 1.0 */
>         .align  32
> -       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> -       /* sPoly[] = SP polynomial */
> +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> +       /* sTopMask12 */
> +       .align  32
> +       .long   0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
> +       .long   0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
> +       /* TinyRange */
>         .align  32
> -       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
> -       .long   0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
> -       .long   0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
> -       .long   0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
> -       .long   0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
> -       .long   0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
> -       .long   0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
> -       .long   0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
> +       .long   0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
> +       .long   0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
>         /* iBrkValue = SP 2/3 */
>         .align  32
> -       .long   0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
> +       .long   0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
> +       .long   0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
>         /* iOffExpoMask = SP significand mask */
>         .align  32
> -       .long   0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
> -       /* sHalf */
> -       .align  32
> -       .long   0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
> -       /* sSign */
> -       .align  32
> -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
> -       /* sTopMask12 */
> -       .align  32
> -       .long   0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
> -       /* TinyRange */
> +       .long   0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
> +       .long   0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
> +       /* sPoly[] = SP polynomial */
>         .align  32
> -       .long   0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
> +       .long   0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed
> +       .long   0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
> +       .long   0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3
> +       .long   0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
> +       .long   0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12
> +       .long   0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
> +       .long   0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37
> +       .long   0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
> +       .long   0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190
> +       .long   0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
> +       .long   0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e
> +       .long   0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
> +       .long   0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94
> +       .long   0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
> +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
>         /* sLn2 = SP ln(2) */
>         .align  32
> -       .long   0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
> +       .long   0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
> +       .long   0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
> +       /* sHalf */
> +       .align  32
> +       .long   0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
> +       .long   0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
>         .align  32
>         .type   __svml_satanh_data_internal, @object
>         .size   __svml_satanh_data_internal, .-__svml_satanh_data_internal
> --
> 2.34.1
>

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v3 3/7] x86: Improve svml_s_atanhf4_core_sse4.S
  2022-06-09 16:58   ` [PATCH v3 3/7] x86: Improve svml_s_atanhf4_core_sse4.S Noah Goldstein
@ 2022-06-09 17:07     ` H.J. Lu
  0 siblings, 0 replies; 48+ messages in thread
From: H.J. Lu @ 2022-06-09 17:07 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Thu, Jun 9, 2022 at 9:58 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Improvements are:
>     1. Reduce code size (-62 bytes).
>     2. Remove redundant move instructions.
>     3. Slightly improve instruction selection/scheduling where
>        possible.
>     4. Prefer registers which get short instruction encoding.
>     5. Reduce rodata usage (-16 bytes).
>
> The throughput improvement is not significant as the port 0 bottleneck
> is unavoidable.
>
>        Function, New Time, Old Time, New / Old
> _ZGVbN4v_atanhf,    8.821,    8.903,     0.991
> ---
>  .../fpu/multiarch/svml_s_atanhf4_core_sse4.S  | 378 ++++++++----------
>  1 file changed, 169 insertions(+), 209 deletions(-)
>
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
> index 2d3ad2617f..37200b3601 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
> @@ -30,96 +30,80 @@
>   *
>   */
>
> -/* Offsets for data table __svml_satanh_data_internal
> - */
> -#define SgnMask                                0
> -#define sOne                           16
> -#define sPoly                          32
> -#define iBrkValue                      160
> -#define iOffExpoMask                   176
> -#define sHalf                          192
> -#define sSign                          208
> -#define sTopMask12                     224
> -#define TinyRange                      240
> -#define sLn2                           256
> +/* Offsets for data table __svml_satanh_data_internal. Ordered
> +   by use in the function. On cold starts this might help the
> +   prefetcher. Possibly a better idea is to interleave start/end so
> +   that the prefetcher is less likely to detect a stream and pull
> +   irrelevant lines into cache.  */
> +#define sOne                           0
> +#define SgnMask                                16
> +#define sTopMask12                     32
> +#define iBrkValue                      48
> +#define iOffExpoMask                   64
> +#define sPoly                          80
> +#define sLn2                           208
> +#define TinyRange                      224
>
>  #include <sysdep.h>
> +#define ATANHF_DATA(x)                 ((x)+__svml_satanh_data_internal)
>
>         .section .text.sse4, "ax", @progbits
>  ENTRY(_ZGVbN4v_atanhf_sse4)
> -       subq    $72, %rsp
> -       cfi_def_cfa_offset(80)
>         movaps  %xmm0, %xmm5
>
>         /* Load constants including One = 1 */
> -       movups  sOne+__svml_satanh_data_internal(%rip), %xmm4
> +       movups  ATANHF_DATA(sOne)(%rip), %xmm4
>         movaps  %xmm5, %xmm3
>
>         /* Strip off the sign, so treat X as positive until right at the end */
> -       movups  SgnMask+__svml_satanh_data_internal(%rip), %xmm7
> -       movaps  %xmm4, %xmm8
> -       andps   %xmm5, %xmm7
> +       movups  ATANHF_DATA(SgnMask)(%rip), %xmm1
> +       movaps  %xmm4, %xmm2
> +       andps   %xmm1, %xmm0
>         movaps  %xmm4, %xmm10
> -       movups  sTopMask12+__svml_satanh_data_internal(%rip), %xmm11
> +       movups  ATANHF_DATA(sTopMask12)(%rip), %xmm11
>         movaps  %xmm4, %xmm14
>         movaps  %xmm11, %xmm9
>
> +
>         /*
>          * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
>          * the upper part UHi being <= 12 bits long. Then we have
>          * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
>          */
> -       movaps  %xmm7, %xmm12
> +       movaps  %xmm0, %xmm6
> +       mulps   %xmm5, %xmm3
> +       subps   %xmm0, %xmm2
> +       addps   %xmm0, %xmm6
> +       subps   %xmm2, %xmm10
> +       addps   %xmm5, %xmm3
> +       subps   %xmm0, %xmm10
> +       andps   %xmm2, %xmm9
> +
>
>         /*
>          * Check whether |X| < 1, in which case we use the main function.
>          * Otherwise set the rangemask so that the callout will get used.
>          * Note that this will also use the callout for NaNs since not(NaN < 1).
>          */
> -       movaps  %xmm7, %xmm6
> -       movaps  %xmm7, %xmm2
> -       cmpnltps %xmm4, %xmm6
> -       cmpltps TinyRange+__svml_satanh_data_internal(%rip), %xmm2
> -       mulps   %xmm5, %xmm3
> -       subps   %xmm7, %xmm8
> -       addps   %xmm7, %xmm12
> -       movmskps %xmm6, %edx
> -       subps   %xmm8, %xmm10
> -       addps   %xmm5, %xmm3
> -       subps   %xmm7, %xmm10
> -       andps   %xmm8, %xmm9
> +       rcpps   %xmm9, %xmm7
> +       subps   %xmm9, %xmm2
> +       andps   %xmm11, %xmm7
>
> -       /*
> -        * Now we feed into the log1p code, using H in place of _VARG1 and
> -        * later incorporating L into the reduced argument.
> -        * compute 1+x as high, low parts
> -        */
> -       movaps  %xmm4, %xmm7
> -
> -       /*
> -        * Now compute R = 1/(UHi+ULo) * (1 - E) and the error term E
> -        * The first FMR is exact (we force R to 12 bits just in case it
> -        * isn't already, to make absolutely sure), and since E is ~ 2^-12,
> -        * the rounding error in the other one is acceptable.
> -        */
> -       rcpps   %xmm9, %xmm15
> -       subps   %xmm9, %xmm8
> -       andps   %xmm11, %xmm15
>
>         /*
>          * Split V as well into upper 12 bits and lower part, so that we can get
>          * a preliminary quotient estimate without rounding error.
>          */
> -       andps   %xmm12, %xmm11
> -       mulps   %xmm15, %xmm9
> -       addps   %xmm8, %xmm10
> -       subps   %xmm11, %xmm12
> +       andps   %xmm6, %xmm11
> +       mulps   %xmm7, %xmm9
> +       addps   %xmm2, %xmm10
> +       subps   %xmm11, %xmm6
>
>         /* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
> -       mulps   %xmm15, %xmm11
> -       mulps   %xmm15, %xmm10
> +       mulps   %xmm7, %xmm11
> +       mulps   %xmm7, %xmm10
>         subps   %xmm9, %xmm14
> -       mulps   %xmm12, %xmm15
> +       mulps   %xmm6, %xmm7
>         subps   %xmm10, %xmm14
>
>         /* Compute D = E + E^2 */
> @@ -127,8 +111,8 @@ ENTRY(_ZGVbN4v_atanhf_sse4)
>         movaps  %xmm4, %xmm8
>         mulps   %xmm14, %xmm13
>
> -       /* reduction: compute r, n */
> -       movdqu  iBrkValue+__svml_satanh_data_internal(%rip), %xmm9
> +       /* reduction: compute r,n */
> +       movdqu  ATANHF_DATA(iBrkValue)(%rip), %xmm9
>         addps   %xmm13, %xmm14
>
>         /*
> @@ -136,168 +120,149 @@ ENTRY(_ZGVbN4v_atanhf_sse4)
>          * = R *  (VHi + VLo) * (1 + D)
>          * = QHi + (QHi * D + QLo + QLo * D)
>          */
> -       movaps  %xmm14, %xmm0
> -       mulps   %xmm15, %xmm14
> -       mulps   %xmm11, %xmm0
> -       addps   %xmm14, %xmm15
> -       movdqu  iOffExpoMask+__svml_satanh_data_internal(%rip), %xmm12
> +       movaps  %xmm14, %xmm2
> +       mulps   %xmm7, %xmm14
> +       mulps   %xmm11, %xmm2
> +       addps   %xmm14, %xmm7
> +       movdqu  ATANHF_DATA(iOffExpoMask)(%rip), %xmm12
>         movaps  %xmm4, %xmm14
>
>         /* Record the sign for eventual reincorporation. */
> -       movups  sSign+__svml_satanh_data_internal(%rip), %xmm1
> -       addps   %xmm15, %xmm0
> +       addps   %xmm7, %xmm2
> +
>
>         /*
>          * Now finally accumulate the high and low parts of the
>          * argument to log1p, H + L, with a final compensated summation.
>          */
> -       movaps  %xmm0, %xmm6
> -       andps   %xmm5, %xmm1
> -
> +       movaps  %xmm2, %xmm6
> +       andnps  %xmm5, %xmm1
> +       movaps  %xmm4, %xmm7
>         /* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
> -       orps    %xmm1, %xmm3
>         addps   %xmm11, %xmm6
>         maxps   %xmm6, %xmm7
>         minps   %xmm6, %xmm8
>         subps   %xmm6, %xmm11
>         movaps  %xmm7, %xmm10
> -       andps   %xmm2, %xmm3
>         addps   %xmm8, %xmm10
> -       addps   %xmm11, %xmm0
> +       addps   %xmm11, %xmm2
>         subps   %xmm10, %xmm7
>         psubd   %xmm9, %xmm10
> -       addps   %xmm7, %xmm8
> +       addps   %xmm8, %xmm7
>         pand    %xmm10, %xmm12
>         psrad   $23, %xmm10
>         cvtdq2ps %xmm10, %xmm13
> -       addps   %xmm8, %xmm0
> +       addps   %xmm7, %xmm2
>
>         /* final reconstruction */
> -       mulps   sLn2+__svml_satanh_data_internal(%rip), %xmm13
>         pslld   $23, %xmm10
>         paddd   %xmm9, %xmm12
>         psubd   %xmm10, %xmm14
>
>         /* polynomial evaluation */
>         subps   %xmm4, %xmm12
> -       mulps   %xmm0, %xmm14
> -       movups  sPoly+112+__svml_satanh_data_internal(%rip), %xmm0
> -       addps   %xmm12, %xmm14
> -       mulps   %xmm14, %xmm0
> +       mulps   %xmm14, %xmm2
> +       movups  ATANHF_DATA(sPoly+0)(%rip), %xmm7
> +       addps   %xmm12, %xmm2
> +       mulps   %xmm2, %xmm7
> +
>
>         /* Finally, halve the result and reincorporate the sign */
> -       movups  sHalf+__svml_satanh_data_internal(%rip), %xmm4
> -       pxor    %xmm1, %xmm4
> -       addps   sPoly+96+__svml_satanh_data_internal(%rip), %xmm0
> -       mulps   %xmm14, %xmm0
> -       addps   sPoly+80+__svml_satanh_data_internal(%rip), %xmm0
> -       mulps   %xmm14, %xmm0
> -       addps   sPoly+64+__svml_satanh_data_internal(%rip), %xmm0
> -       mulps   %xmm14, %xmm0
> -       addps   sPoly+48+__svml_satanh_data_internal(%rip), %xmm0
> -       mulps   %xmm14, %xmm0
> -       addps   sPoly+32+__svml_satanh_data_internal(%rip), %xmm0
> -       mulps   %xmm14, %xmm0
> -       addps   sPoly+16+__svml_satanh_data_internal(%rip), %xmm0
> -       mulps   %xmm14, %xmm0
> -       addps   sPoly+__svml_satanh_data_internal(%rip), %xmm0
> -       mulps   %xmm14, %xmm0
> -       mulps   %xmm14, %xmm0
> -       addps   %xmm0, %xmm14
> -       movaps  %xmm2, %xmm0
> -       addps   %xmm13, %xmm14
> -       mulps   %xmm14, %xmm4
> -       andnps  %xmm4, %xmm0
> -       orps    %xmm3, %xmm0
> -       testl   %edx, %edx
> +       addps   ATANHF_DATA(sPoly+16)(%rip), %xmm7
> +       mulps   %xmm2, %xmm7
> +       addps   ATANHF_DATA(sPoly+32)(%rip), %xmm7
> +       mulps   %xmm2, %xmm7
> +       addps   ATANHF_DATA(sPoly+48)(%rip), %xmm7
> +       mulps   %xmm2, %xmm7
> +       addps   ATANHF_DATA(sPoly+64)(%rip), %xmm7
> +       mulps   %xmm2, %xmm7
> +       addps   ATANHF_DATA(sPoly+80)(%rip), %xmm7
> +       mulps   %xmm2, %xmm7
> +       addps   ATANHF_DATA(sPoly+96)(%rip), %xmm7
> +       mulps   %xmm2, %xmm7
> +       movaps  ATANHF_DATA(sPoly+112)(%rip), %xmm6
> +       addps   %xmm6, %xmm7
> +       mulps   %xmm2, %xmm7
> +       mulps   %xmm2, %xmm7
> +       mulps   ATANHF_DATA(sLn2)(%rip), %xmm13
> +       /* We can build `sHalf` with `sPoly & sOne`.  */
> +       andps   %xmm4, %xmm6
> +       orps    %xmm1, %xmm3
> +       xorps   %xmm6, %xmm1
>
> -       /* Go to special inputs processing branch */
> -       jne     L(SPECIAL_VALUES_BRANCH)
> -       # LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm5
> +       addps   %xmm2, %xmm7
> +       addps   %xmm13, %xmm7
> +       mulps   %xmm7, %xmm1
>
> -       /* Restore registers
> -        * and exit the function
> -        */
> +       /* Finish check of NaNs.  */
> +       cmpleps %xmm0, %xmm4
> +       movmskps %xmm4, %edx
> +       cmpltps ATANHF_DATA(TinyRange)(%rip), %xmm0
>
> -L(EXIT):
> -       addq    $72, %rsp
> -       cfi_def_cfa_offset(8)
> +       andps   %xmm0, %xmm3
> +       andnps  %xmm1, %xmm0
> +       orps    %xmm3, %xmm0
> +
> +       testl   %edx, %edx
> +       /* Go to special inputs processing branch.  */
> +       jne     L(SPECIAL_VALUES_BRANCH)
> +       # LOE rbx rbp r12 r13 r14 r15 xmm0
> +       /* No registers to restore on fast path.  */
>         ret
> -       cfi_def_cfa_offset(80)
>
> -       /* Branch to process
> -        * special inputs
> -        */
>
> +       /* Cold case. edx has 1s where there was a special value that
> +          needs to be handled by an atanhf call. Optimize for code size
> +          more so than speed here. */
>  L(SPECIAL_VALUES_BRANCH):
> -       movups  %xmm5, 32(%rsp)
> -       movups  %xmm0, 48(%rsp)
> -       # LOE rbx rbp r12 r13 r14 r15 edx
> -
> -       xorl    %eax, %eax
> -       movq    %r12, 16(%rsp)
> -       cfi_offset(12, -64)
> -       movl    %eax, %r12d
> -       movq    %r13, 8(%rsp)
> -       cfi_offset(13, -72)
> -       movl    %edx, %r13d
> -       movq    %r14, (%rsp)
> -       cfi_offset(14, -80)
> -       # LOE rbx rbp r15 r12d r13d
> -
> -       /* Range mask
> -        * bits check
> -        */
> -
> -L(RANGEMASK_CHECK):
> -       btl     %r12d, %r13d
> -
> -       /* Call scalar math function */
> -       jc      L(SCALAR_MATH_CALL)
> -       # LOE rbx rbp r15 r12d r13d
> -
> -       /* Special inputs
> -        * processing loop
> -        */
> -
> +       # LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm5
> +       /* On entry rsp is 8 bytes off 16-byte alignment (return address).
> +       Subtracting 56 makes rsp 16-byte aligned at the atanhf call. */
> +       subq    $56, %rsp
> +       cfi_def_cfa_offset(64)
> +       movups  %xmm0, 24(%rsp)
> +       movups  %xmm5, 40(%rsp)
> +
> +       /* Use rbx/rbp for callee save registers as they get short
> +       encoding for many instructions (as compared with r12/r13). */
> +       movq    %rbx, (%rsp)
> +       cfi_offset(rbx, -64)
> +       movq    %rbp, 8(%rsp)
> +       cfi_offset(rbp, -56)
> +       /* edx has 1s where there was a special value that needs to be handled
> +          by an atanhf call.  */
> +       movl    %edx, %ebx
>  L(SPECIAL_VALUES_LOOP):
> -       incl    %r12d
> -       cmpl    $4, %r12d
> -
> -       /* Check bits in range mask */
> -       jl      L(RANGEMASK_CHECK)
> -       # LOE rbx rbp r15 r12d r13d
> -
> -       movq    16(%rsp), %r12
> -       cfi_restore(12)
> -       movq    8(%rsp), %r13
> -       cfi_restore(13)
> -       movq    (%rsp), %r14
> -       cfi_restore(14)
> -       movups  48(%rsp), %xmm0
> -
> -       /* Go to exit */
> -       jmp     L(EXIT)
> -       cfi_offset(12, -64)
> -       cfi_offset(13, -72)
> -       cfi_offset(14, -80)
> -       # LOE rbx rbp r12 r13 r14 r15 xmm0
> -
> -       /* Scalar math fucntion call
> -        * to process special input
> -        */
> -
> -L(SCALAR_MATH_CALL):
> -       movl    %r12d, %r14d
> -       movss   32(%rsp, %r14, 4), %xmm0
> +       # LOE rbx rbp r12 r13 r14 r15
> +       /* Use rbp as index for special value that is saved across calls to
> +          atanhf. We technically don't need a callee save register here as offset
> +          to rsp is always [0, 12] so we can restore rsp by realigning to 64.
> +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> +          in the loop.  */
> +       xorl    %ebp, %ebp
> +       bsfl    %ebx, %ebp
> +
> +       /* Scalar math function call to process special input.  */
> +       movss   40(%rsp, %rbp, 4), %xmm0
>         call    atanhf@PLT
> -       # LOE rbx rbp r14 r15 r12d r13d xmm0
> -
> -       movss   %xmm0, 48(%rsp, %r14, 4)
> -
> -       /* Process special inputs in loop */
> -       jmp     L(SPECIAL_VALUES_LOOP)
> -       # LOE rbx rbp r15 r12d r13d
> +       /* No good way to avoid the store-forwarding fault this will cause on
> +          return. `lfence` avoids the SF fault but at greater cost as it
> +          serializes stack/callee save restoration.  */
> +       movss   %xmm0, 24(%rsp, %rbp, 4)
> +
> +       leal    -1(%rbx), %eax
> +       andl    %eax, %ebx
> +       jnz     L(SPECIAL_VALUES_LOOP)
> +       # LOE r12 r13 r14 r15
> +       /* All results have been written to 24(%rsp).  */
> +       movups  24(%rsp), %xmm0
> +       movq    (%rsp), %rbx
> +       cfi_restore(rbx)
> +       movq    8(%rsp), %rbp
> +       cfi_restore(rbp)
> +       addq    $56, %rsp
> +       cfi_def_cfa_offset(8)
> +       ret
>  END(_ZGVbN4v_atanhf_sse4)
>
>         .section .rodata, "a"
> @@ -305,56 +270,51 @@ END(_ZGVbN4v_atanhf_sse4)
>
>  #ifdef __svml_satanh_data_internal_typedef
>  typedef unsigned int VUINT32;
> -typedef struct {
> -       __declspec(align(16)) VUINT32 SgnMask[4][1];
> +typedef struct{
>         __declspec(align(16)) VUINT32 sOne[4][1];
> -       __declspec(align(16)) VUINT32 sPoly[8][4][1];
> +       __declspec(align(16)) VUINT32 SgnMask[4][1];
> +       __declspec(align(16)) VUINT32 sTopMask12[4][1];
>         __declspec(align(16)) VUINT32 iBrkValue[4][1];
>         __declspec(align(16)) VUINT32 iOffExpoMask[4][1];
> -       __declspec(align(16)) VUINT32 sHalf[4][1];
> -       __declspec(align(16)) VUINT32 sSign[4][1];
> -       __declspec(align(16)) VUINT32 sTopMask12[4][1];
> -       __declspec(align(16)) VUINT32 TinyRange[4][1];
> +       __declspec(align(16)) VUINT32 sPoly[8][4][1];
>         __declspec(align(16)) VUINT32 sLn2[4][1];
> +       __declspec(align(16)) VUINT32 TinyRange[4][1];
>  } __svml_satanh_data_internal;
>  #endif
> +
>  __svml_satanh_data_internal:
> -       /* SgnMask */
> -       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
>         /* sOne = SP 1.0 */
>         .align  16
>         .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> -       /* sPoly[] = SP polynomial */
> +       /* SgnMask */
> +       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> +       /* sTopMask12 */
>         .align  16
> -       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
> -       .long   0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
> -       .long   0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
> -       .long   0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
> -       .long   0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
> -       .long   0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
> -       .long   0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
> -       .long   0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
> +       .long   0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
>         /* iBrkValue = SP 2/3 */
>         .align  16
>         .long   0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
> -       /* iOffExpoMask = SP significand mask */
> +       /* iOffExpoMask = SP significand mask */
>         .align  16
>         .long   0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
> -       /* sHalf */
> -       .align  16
> -       .long   0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
> -       /* sSign */
> +
> +       /* sPoly[] = SP polynomial */
>         .align  16
> -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> -       /* sTopMask12 */
> +       .long   0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
> +       .long   0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
> +       .long   0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
> +       .long   0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
> +       .long   0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
> +       .long   0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
> +       .long   0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
> +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
> +
> +       /* sLn2 = SP ln(2) */
>         .align  16
> -       .long   0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
> +       .long   0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
>         /* TinyRange */
>         .align  16
>         .long   0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
> -       /* sLn2 = SP ln(2) */
> -       .align  16
> -       .long   0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
>         .align  16
>         .type   __svml_satanh_data_internal, @object
>         .size   __svml_satanh_data_internal, .-__svml_satanh_data_internal
> --
> 2.34.1
>

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v3 4/7] x86: Optimize svml_s_tanhf16_core_avx512.S
  2022-06-09 16:58   ` [PATCH v3 4/7] x86: Optimize svml_s_tanhf16_core_avx512.S Noah Goldstein
@ 2022-06-09 17:07     ` H.J. Lu
  0 siblings, 0 replies; 48+ messages in thread
From: H.J. Lu @ 2022-06-09 17:07 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Thu, Jun 9, 2022 at 9:58 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Optimizations are:
>     1. Reduce code size (-67 bytes).
>     2. Remove redundant move instructions.
>     3. Slightly improve instruction selection/scheduling where
>        possible.
>     4. Reduce rodata usage (-448 bytes).
>
> Result is roughly a 14% speedup:
>
>        Function, New Time, Old Time, New / Old
> _ZGVeN16v_tanhf,    0.649,    0.752,     0.863
> ---
>  .../multiarch/svml_s_tanhf16_core_avx512.S    | 527 ++++++++++--------
>  1 file changed, 287 insertions(+), 240 deletions(-)
>
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> index 5b1f9f151c..7edc74a116 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> @@ -70,310 +70,357 @@
>   *
>   */
>
> -/* Offsets for data table __svml_stanh_data_internal
> +/* Offsets for data table __svml_stanh_data_internal_avx512. Ordered
> +   by use in the function. On cold-starts this might help the
> +   prefetcher. Possibly a better idea is to interleave start/end so
> +   that the prefetcher is less likely to detect a stream and pull
> +   irrelevant lines into cache.  */
> +
> +/* Offsets for data table __svml_stanh_data_internal. 4 bytes each.
>   */
> -#define _sC                            0
> -#define _sP0                           128
> -#define _sP2                           256
> -#define _sP3                           384
> -#define _sP4                           512
> -#define _sP5                           640
> -#define _sP6                           768
> -#define _sP7                           896
> -#define _iExpMantMask_UISA             1024
> -#define _iMinIdxOfsMask_UISA           1088
> -#define _iMaxIdxMask_UISA              1152
> -#define _sSignMask                     1216
> -#define _sAbsMask                      1280
> -#define _iExpMantMask                  1344
> -#define _iExpMask                      1408
> -#define _iMinIdxOfsMask                        1472
> -#define _iMaxIdxMask                   1536
> +#define _iExpMantMask_UISA             0
> +#define _iMinIdxOfsMask_UISA           4
> +#define _iMaxIdxMask_UISA              8
> +#define _iExpMask                      12
> +
> +/* Offsets for data table __svml_stanh_data_internal_al64. 64 bytes
> +   each.  */
> +#define _sC_lo                         0
> +#define _sC_hi                         64
> +#define _sP7_lo                                128
> +#define _sP7_hi                                192
> +#define _sSignMask                     256
> +#define _sP6_lo                                320
> +#define _sP6_hi                                384
> +#define _sP5_lo                                448
> +#define _sP5_hi                                512
> +#define _sP4_lo                                576
> +#define _sP4_hi                                640
> +#define _sP3_lo                                704
> +#define _sP3_hi                                768
> +#define _sP2_lo                                832
> +#define _sP2_hi                                896
> +#define _sP0_lo                                960
> +#define _sP0_hi                                1024
>
>  #include <sysdep.h>
> +#define TANHF_DATA(x)                  ((x)+__svml_stanh_data_internal_al64)
> +#define TANHF_DATA_UNALIGNED(x)                ((x)+__svml_stanh_data_internal)
>
>         .section .text.exex512, "ax", @progbits
>  ENTRY(_ZGVeN16v_tanhf_skx)
> -       pushq   %rbp
> -       cfi_def_cfa_offset(16)
> -       movq    %rsp, %rbp
> -       cfi_def_cfa(6, 16)
> -       cfi_offset(6, -16)
> -       andq    $-64, %rsp
> -       subq    $192, %rsp
> -       vmovaps %zmm0, %zmm1
> -       vmovups __svml_stanh_data_internal(%rip), %zmm9
> -       vmovups _sP6+__svml_stanh_data_internal(%rip), %zmm11
> -       vmovups _sP5+__svml_stanh_data_internal(%rip), %zmm12
> -       vmovups _sP4+__svml_stanh_data_internal(%rip), %zmm13
> -       vmovups _sP3+__svml_stanh_data_internal(%rip), %zmm14
> -       vmovups _sP2+__svml_stanh_data_internal(%rip), %zmm15
> -       vpternlogd $255, %zmm2, %zmm2, %zmm2
> -       vandps  _sAbsMask+__svml_stanh_data_internal(%rip), %zmm1, %zmm8
> -       vandps  _sSignMask+__svml_stanh_data_internal(%rip), %zmm1, %zmm0
> -
>         /* Here huge arguments, INF and NaNs are filtered out to callout. */
> -       vpandd  _iExpMantMask_UISA+__svml_stanh_data_internal(%rip), %zmm1, %zmm3
> -       vpsubd  _iMinIdxOfsMask_UISA+__svml_stanh_data_internal(%rip), %zmm3, %zmm4
> -       vpcmpd  $2, _iExpMask+__svml_stanh_data_internal(%rip), %zmm3, %k1
> +       vpandd  TANHF_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
> +       vpsubd  TANHF_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
>
> -       /*
> -        *  small table specific variables *
> -        *  Constant loading
> -        */
> -       vpxord  %zmm5, %zmm5, %zmm5
> -
> -       /* if VMIN, VMAX is defined for I type */
> -       vpmaxsd %zmm5, %zmm4, %zmm6
> -       vpminsd _iMaxIdxMask_UISA+__svml_stanh_data_internal(%rip), %zmm6, %zmm7
> -       vpsrld  $21, %zmm7, %zmm10
> -       vmovups _sP7+__svml_stanh_data_internal(%rip), %zmm4
> -       vpermt2ps _sC+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm9
> -       vpermt2ps _sP6+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm11
> -       vpermt2ps _sP7+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm4
> -       vpermt2ps _sP5+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm12
> -       vpermt2ps _sP4+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm13
> -       vpermt2ps _sP3+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm14
> -       vpermt2ps _sP2+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm15
> -       vpandnd %zmm3, %zmm3, %zmm2{%k1}
> -       vptestmd %zmm2, %zmm2, %k0
> -       vmovups _sP0+__svml_stanh_data_internal(%rip), %zmm3
> -       vsubps  {rn-sae}, %zmm9, %zmm8, %zmm2
> -       kmovw   %k0, %edx
> -       vfmadd213ps {rn-sae}, %zmm11, %zmm2, %zmm4
> -       vpermt2ps _sP0+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm3
> -       vfmadd213ps {rn-sae}, %zmm12, %zmm2, %zmm4
> -       vfmadd213ps {rn-sae}, %zmm13, %zmm2, %zmm4
> -       vfmadd213ps {rn-sae}, %zmm14, %zmm2, %zmm4
> -       vfmadd213ps {rn-sae}, %zmm15, %zmm2, %zmm4
> -       vfmadd213ps {rn-sae}, %zmm3, %zmm2, %zmm4
> -       vorps   %zmm0, %zmm4, %zmm0
> -       testl   %edx, %edx
> +       /* Selection arguments between [0, 0x03e00000] into zmm3.  */
> +       vpxord  %zmm3, %zmm3, %zmm3
> +       vpmaxsd %zmm3, %zmm2, %zmm3
> +       vpminsd TANHF_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
>
> -       /* Go to special inputs processing branch */
> -       jne     L(SPECIAL_VALUES_BRANCH)
> -       # LOE rbx r12 r13 r14 r15 edx zmm0 zmm1
> +       /* Setup permute indices in zmm3.  */
> +       vpsrld  $21, %zmm3, %zmm3
>
> -       /* Restore registers
> -        * and exit the function
> -        */
> +       /* Store if there are any special cases in k1.  */
> +       vpcmpd  $6, TANHF_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
>
> -L(EXIT):
> -       movq    %rbp, %rsp
> -       popq    %rbp
> -       cfi_def_cfa(7, 8)
> -       cfi_restore(6)
> -       ret
> -       cfi_def_cfa(6, 16)
> -       cfi_offset(6, -16)
> +       vmovaps TANHF_DATA(_sC_lo)(%rip), %zmm5
> +       vpermt2ps TANHF_DATA(_sC_hi)(%rip), %zmm3, %zmm5
>
> -       /* Branch to process
> -        * special inputs
> -        */
> +       vmovaps TANHF_DATA(_sP7_lo)(%rip), %zmm2
> +       vpermt2ps TANHF_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
>
> -L(SPECIAL_VALUES_BRANCH):
> -       vmovups %zmm1, 64(%rsp)
> -       vmovups %zmm0, 128(%rsp)
> -       # LOE rbx r12 r13 r14 r15 edx zmm0
> +       /* Store absolute values of inputs in zmm1.  */
> +       vmovaps TANHF_DATA(_sSignMask)(%rip), %zmm4
> +       vandnps %zmm0, %zmm4, %zmm1
> +       vsubps  {rn-sae}, %zmm5, %zmm1, %zmm1
>
> -       xorl    %eax, %eax
> -       # LOE rbx r12 r13 r14 r15 eax edx
> +       vmovaps TANHF_DATA(_sP6_lo)(%rip), %zmm5
> +       vpermt2ps TANHF_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
>
> -       vzeroupper
> -       movq    %r12, 16(%rsp)
> -       /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> -       movl    %eax, %r12d
> -       movq    %r13, 8(%rsp)
> -       /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> -       movl    %edx, %r13d
> -       movq    %r14, (%rsp)
> -       /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> -       # LOE rbx r15 r12d r13d
> -
> -       /* Range mask
> -        * bits check
> -        */
> +       vmovaps TANHF_DATA(_sP5_lo)(%rip), %zmm6
> +       vpermt2ps TANHF_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> +
> +       vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm2
> +       vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm2
>
> -L(RANGEMASK_CHECK):
> -       btl     %r12d, %r13d
> +       vmovaps TANHF_DATA(_sP4_lo)(%rip), %zmm7
> +       vpermt2ps TANHF_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
>
> -       /* Call scalar math function */
> -       jc      L(SCALAR_MATH_CALL)
> -       # LOE rbx r15 r12d r13d
> +       vmovaps TANHF_DATA(_sP3_lo)(%rip), %zmm8
> +       vpermt2ps TANHF_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
>
> -       /* Special inputs
> -        * processing loop
> +       vfmadd213ps {rn-sae}, %zmm7, %zmm1, %zmm2
> +       vfmadd213ps {rn-sae}, %zmm8, %zmm1, %zmm2
> +
> +       vmovaps TANHF_DATA(_sP2_lo)(%rip), %zmm9
> +       vpermt2ps TANHF_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> +
> +       vmovaps TANHF_DATA(_sP0_lo)(%rip), %zmm10
> +       vpermt2ps TANHF_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> +
> +       vfmadd213ps {rn-sae}, %zmm9, %zmm1, %zmm2
> +       vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm2
> +
> +       kmovw   %k1, %edx
> +       testl   %edx, %edx
> +
> +       /* Go to special inputs processing branch.  */
> +       jne     L(SPECIAL_VALUES_BRANCH)
> +       # LOE rbx r12 r13 r14 r15 zmm0 zmm2 zmm4
> +       /* Wait until after the branch to write over zmm0.  */
> +       vpternlogd $0xec, %zmm4, %zmm2, %zmm0
> +
> +       /* No stack restoration on the fastpath.  */
> +       ret
> +
> +       /* Cold case. edx has 1s where there was a special value that
> +          needs to be handled by a tanhf call. Optimize for code size
> +          more so than speed here. */
> +L(SPECIAL_VALUES_BRANCH):
> +       # LOE rbx rdx r12 r13 r14 r15 zmm0 zmm2 zmm4
> +    /* Use r13 to save/restore the stack. This allows us to use rbp as
> +       callee save register saving code size. */
> +       pushq   %r13
> +       cfi_adjust_cfa_offset(8)
> +       cfi_offset(r13, -16)
> +       /* Need to callee save registers to preserve state across tanhf calls.
>          */
> +       pushq   %rbx
> +       cfi_adjust_cfa_offset(8)
> +       cfi_offset(rbx, -24)
> +       pushq   %rbp
> +       cfi_adjust_cfa_offset(8)
> +       cfi_offset(rbp, -32)
> +       movq    %rsp, %r13
> +       cfi_def_cfa_register(r13)
> +
> +       /* Align stack and make room for 2x zmm vectors.  */
> +       andq    $-64, %rsp
> +       addq    $-128, %rsp
> +
> +       /* Save original input (zmm0 unchanged up to this point).  */
> +       vmovaps %zmm0, 64(%rsp)
> +       /* Save all already computed results.  */
> +       vpternlogd $0xec, %zmm4, %zmm2, %zmm0
> +       vmovaps %zmm0, (%rsp)
>
> +       vzeroupper
> +
> +       /* edx has 1s where there was a special value that needs to be handled
> +          by a tanhf call.  */
> +       movl    %edx, %ebx
>  L(SPECIAL_VALUES_LOOP):
> -       incl    %r12d
> -       cmpl    $16, %r12d
> -
> -       /* Check bits in range mask */
> -       jl      L(RANGEMASK_CHECK)
> -       # LOE rbx r15 r12d r13d
> -
> -       movq    16(%rsp), %r12
> -       cfi_restore(12)
> -       movq    8(%rsp), %r13
> -       cfi_restore(13)
> -       movq    (%rsp), %r14
> -       cfi_restore(14)
> -       vmovups 128(%rsp), %zmm0
> -
> -       /* Go to exit */
> -       jmp     L(EXIT)
> -       /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> -       /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> -       /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> -       # LOE rbx r12 r13 r14 r15 zmm0
> -
> -       /* Scalar math fucntion call
> -        * to process special input
> -        */
> +       # LOE rbx rbp r12 r13 r14 r15
> +       /* use rbp as index for special value that is saved across calls to
> +          tanhf. We technically don't need a callee save register here as offset
> +          to rsp is always [0, 56] so we can restore rsp by realigning to 64.
> +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> +          in the loop. Realigning also costs more code size.  */
> +       xorl    %ebp, %ebp
> +       tzcntl  %ebx, %ebp
>
> -L(SCALAR_MATH_CALL):
> -       movl    %r12d, %r14d
> -       movss   64(%rsp, %r14, 4), %xmm0
> +       /* Scalar math function call to process special input.  */
> +       movss   64(%rsp, %rbp, 4), %xmm0
>         call    tanhf@PLT
> -       # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 128(%rsp, %r14, 4)
> +       /* No good way to avoid the store-forwarding fault this will cause on
> +          return. `lfence` avoids the SF fault but at greater cost as it
> +          serializes stack/callee save restoration.  */
> +       movss   %xmm0, (%rsp, %rbp, 4)
>
> -       /* Process special inputs in loop */
> -       jmp     L(SPECIAL_VALUES_LOOP)
> -       # LOE rbx r15 r12d r13d
> +       blsrl   %ebx, %ebx
> +       jnz     L(SPECIAL_VALUES_LOOP)
> +       # LOE r12 r13 r14 r15
> +
> +       /* All results have been written to (%rsp).  */
> +       vmovaps (%rsp), %zmm0
> +       /* Restore rsp.  */
> +       movq    %r13, %rsp
> +       cfi_def_cfa_register(rsp)
> +       /* Restore callee save registers.  */
> +       popq    %rbp
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_restore(rbp)
> +       popq    %rbx
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_restore(rbp)
> +       popq    %r13
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_restore(r13)
> +       ret
>  END(_ZGVeN16v_tanhf_skx)
>
>         .section .rodata, "a"
> -       .align  64
> -
> +       .align  16
>  #ifdef __svml_stanh_data_internal_typedef
>  typedef unsigned int VUINT32;
> -typedef struct {
> -       __declspec(align(64)) VUINT32 _sC[32][1];
> -       __declspec(align(64)) VUINT32 _sP0[32][1];
> -       __declspec(align(64)) VUINT32 _sP2[32][1];
> -       __declspec(align(64)) VUINT32 _sP3[32][1];
> -       __declspec(align(64)) VUINT32 _sP4[32][1];
> -       __declspec(align(64)) VUINT32 _sP5[32][1];
> -       __declspec(align(64)) VUINT32 _sP6[32][1];
> -       __declspec(align(64)) VUINT32 _sP7[32][1];
> -       __declspec(align(64)) VUINT32 _iExpMantMask_UISA[16][1];
> -       __declspec(align(64)) VUINT32 _iMinIdxOfsMask_UISA[16][1];
> -       __declspec(align(64)) VUINT32 _iMaxIdxMask_UISA[16][1];
> +typedef struct
> +       {
> +       __declspec(align(4)) VUINT32 _iExpMantMask_UISA[1][1];
> +       __declspec(align(4)) VUINT32 _iMinIdxOfsMask_UISA[1][1];
> +       __declspec(align(4)) VUINT32 _iMaxIdxMask_UISA[1][1];
> +       __declspec(align(4)) VUINT32 _iExpMask[1][1];
> +       __declspec(align(64)) VUINT32 _sC_lo[16][1];
> +       __declspec(align(64)) VUINT32 _sC_hi[16][1];
> +       __declspec(align(64)) VUINT32 _sP7_lo[16][1];
> +       __declspec(align(64)) VUINT32 _sP7_hi[16][1];
>         __declspec(align(64)) VUINT32 _sSignMask[16][1];
> -       __declspec(align(64)) VUINT32 _sAbsMask[16][1];
> -       __declspec(align(64)) VUINT32 _iExpMantMask[16][1];
> -       __declspec(align(64)) VUINT32 _iExpMask[16][1];
> -       __declspec(align(64)) VUINT32 _iMinIdxOfsMask[16][1];
> -       __declspec(align(64)) VUINT32 _iMaxIdxMask[16][1];
> +       __declspec(align(64)) VUINT32 _sP6_lo[16][1];
> +       __declspec(align(64)) VUINT32 _sP6_hi[16][1];
> +       __declspec(align(64)) VUINT32 _sP5_lo[16][1];
> +       __declspec(align(64)) VUINT32 _sP5_hi[16][1];
> +       __declspec(align(64)) VUINT32 _sP4_lo[16][1];
> +       __declspec(align(64)) VUINT32 _sP4_hi[16][1];
> +       __declspec(align(64)) VUINT32 _sP3_lo[16][1];
> +       __declspec(align(64)) VUINT32 _sP3_hi[16][1];
> +       __declspec(align(64)) VUINT32 _sP2_lo[16][1];
> +       __declspec(align(64)) VUINT32 _sP2_hi[16][1];
> +       __declspec(align(64)) VUINT32 _sP0_lo[16][1];
> +       __declspec(align(64)) VUINT32 _sP0_hi[16][1];
>  } __svml_stanh_data_internal;
>  #endif
> +
>  __svml_stanh_data_internal:
> -       /* _sC */
> +       .align  4
> +       /* _iExpMantMask_UISA */
> +       .long   0x7fe00000
> +
> +       .align  4
> +       /* _iMinIdxOfsMask_UISA */
> +       .long   0x3d400000
> +
> +       .align  4
> +       /* _iMaxIdxMask_UISA */
> +       .long   0x03e00000
> +
> +       .align  4
> +       /* _iExpMask */
> +       .long   0x7f000000
> +
> +       .align  64
> +__svml_stanh_data_internal_al64:
> +       .align  64
> +       /* _sC_lo */
>         .long   0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
>         .long   0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
>         .long   0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
>         .long   0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
> +
> +       .align  64
> +       /* _sC_hi */
>         .long   0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
>         .long   0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
>         .long   0x40500000, 0x40700000, 0x40900000, 0x40b00000
>         .long   0x40d00000, 0x40f00000, 0x41100000, 0x00000000
> -       /* p0 */
> -       .align  64
> -       .long   0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
> -       .long   0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
> -       .long   0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
> -       .long   0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
> -       .long   0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
> -       .long   0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
> -       .long   0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
> -       .long   0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
> -       /* p2 */
> -       .align  64
> -       .long   0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
> -       .long   0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
> -       .long   0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
> -       .long   0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
> -       .long   0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
> -       .long   0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
> -       .long   0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
> -       .long   0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
> -       /* p3 */
> +
>         .align  64
> -       .long   0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
> -       .long   0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
> -       .long   0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
> -       .long   0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
> -       .long   0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
> -       .long   0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
> -       .long   0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
> -       .long   0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
> -       /* p4 */
> +       /* _sP7_lo */
> +       .long   0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
> +       .long   0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
> +       .long   0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
> +       .long   0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
> +
>         .align  64
> -       .long   0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
> -       .long   0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
> -       .long   0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
> -       .long   0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
> -       .long   0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
> -       .long   0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
> -       .long   0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
> -       .long   0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
> -       /* p5 */
> +       /* _sP7_hi */
> +       .long   0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
> +       .long   0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
> +       .long   0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
> +       .long   0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
> +
>         .align  64
> -       .long   0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
> -       .long   0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
> -       .long   0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
> -       .long   0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
> -       .long   0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
> -       .long   0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
> -       .long   0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
> -       .long   0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
> -       /* p6 */
> +       /* _sSignMask */
> +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> +
>         .align  64
> +       /* _sP6_lo */
>         .long   0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
>         .long   0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
>         .long   0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
>         .long   0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
> +
> +       .align  64
> +       /* _sP6_hi */
>         .long   0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
>         .long   0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
>         .long   0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
>         .long   0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
> -       /* p7 */
> +
>         .align  64
> -       .long   0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
> -       .long   0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
> -       .long   0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
> -       .long   0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
> -       .long   0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
> -       .long   0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
> -       .long   0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
> -       .long   0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
> +       /* _sP5_lo */
> +       .long   0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
> +       .long   0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
> +       .long   0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
> +       .long   0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
> +
>         .align  64
> -       .long   0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000 /* _iExpMantMask_UISA */
> +       /* _sP5_hi */
> +       .long   0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
> +       .long   0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
> +       .long   0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
> +       .long   0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
> +
>         .align  64
> -       .long   0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000 /* _iMinIdxOfsMask_UISA */
> +       /* _sP4_lo */
> +       .long   0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
> +       .long   0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
> +       .long   0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
> +       .long   0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
> +
>         .align  64
> -       .long   0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000 /* _iMaxIdxMask_UISA */
> +       /* _sP4_hi */
> +       .long   0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
> +       .long   0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
> +       .long   0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
> +       .long   0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
> +
>         .align  64
> -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSignMask */
> +       /* _sP3_lo */
> +       .long   0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
> +       .long   0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
> +       .long   0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
> +       .long   0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
> +
>         .align  64
> -       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _sAbsMask */
> +       /* _sP3_hi */
> +       .long   0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
> +       .long   0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
> +       .long   0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
> +       .long   0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
> +
>         .align  64
> -       .long   0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000 /* _iExpMantMask */
> +       /* _sP2_lo */
> +       .long   0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
> +       .long   0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
> +       .long   0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
> +       .long   0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
> +
>         .align  64
> -       .long   0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000 /* _iExpMask */
> +       /* _sP2_hi */
> +       .long   0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
> +       .long   0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
> +       .long   0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
> +       .long   0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
> +
>         .align  64
> -       .long   0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000 /* _iMinIdxOfsMask */
> +       /* _sP0_lo */
> +       .long   0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
> +       .long   0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
> +       .long   0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
> +       .long   0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
> +
>         .align  64
> -       .long   0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000 /* _iMaxIdxMask */
> +       /* _sP0_hi */
> +       .long   0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
> +       .long   0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
> +       .long   0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
> +       .long   0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
> +
>         .align  64
> +       .type   __svml_stanh_data_internal_al64, @object
> +       .size   __svml_stanh_data_internal_al64, .-__svml_stanh_data_internal_al64
>         .type   __svml_stanh_data_internal, @object
>         .size   __svml_stanh_data_internal, .-__svml_stanh_data_internal
> --
> 2.34.1
>

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v3 6/7] x86: Optimize svml_s_tanhf8_core_avx2.S
  2022-06-09 16:58   ` [PATCH v3 6/7] x86: Optimize svml_s_tanhf8_core_avx2.S Noah Goldstein
@ 2022-06-09 17:09     ` H.J. Lu
  0 siblings, 0 replies; 48+ messages in thread
From: H.J. Lu @ 2022-06-09 17:09 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Thu, Jun 9, 2022 at 9:59 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Optimizations are:
>     1. Reduce code size (-81 bytes).
>     2. Remove redundant move instructions.
>     3. Slightly improve instruction selection/scheduling where
>        possible.
>     4. Prefer registers which get short instruction encoding.
>     5. Reduce rodata size (-32 bytes).
>
> Result is roughly a 17-18% speedup:
>
>        Function, New Time, Old Time, New / Old
> _ZGVdN8v_tanhf,     1.977,    2.402,     0.823
> ---
>  .../fpu/multiarch/svml_s_tanhf8_core_avx2.S   | 912 ++++--------------
>  1 file changed, 171 insertions(+), 741 deletions(-)
>
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
> index c5c87bf5b0..c40cd57691 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
> @@ -70,773 +70,203 @@
>   *
>   */
>
> -/* Offsets for data table __svml_stanh_data_internal
> - */
> -#define _dbP                           0
> -#define _sSignMask                     4288
> -#define _sAbsMask                      4320
> -#define _iExpMantMask                  4352
> -#define _iExpMask                      4384
> -#define _iMinIdxOfsMask                        4416
> -#define _iMaxIdxMask                   4448
> -
>  #include <sysdep.h>
>
> +/* tanhf data tables for avx2 and sse4 implementations defined here.
> + */
> +#include "svml_s_tanhf_rodata.S"
> +
>         .section .text.avx2, "ax", @progbits
>  ENTRY(_ZGVdN8v_tanhf_avx2)
> -       pushq   %rbp
> -       cfi_def_cfa_offset(16)
> -       movq    %rsp, %rbp
> -       cfi_def_cfa(6, 16)
> -       cfi_offset(6, -16)
> -       andq    $-32, %rsp
> -       pushq   %r12
> -       subq    $120, %rsp
> -       lea     _dbP+16+__svml_stanh_data_internal(%rip), %r10
> -       vmovaps %ymm0, %ymm12
> -
>         /* Here huge arguments, INF and NaNs are filtered out to callout. */
> -       vpand   _iExpMantMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm14
> +       vpand   TANHF_DATA(_iExpMantMask)(%rip), %ymm0, %ymm4
> +       vpsubd  TANHF_DATA(_iMinIdxOfsMask)(%rip), %ymm4, %ymm2
> +
> +       /* Selection of arguments between [0, 0x04280000] into ymm2.  */
> +       vpxor   %ymm3, %ymm3, %ymm3
> +       vpmaxsd %ymm3, %ymm2, %ymm2
> +       vpminsd TANHF_DATA(_iMaxIdxMask)(%rip), %ymm2, %ymm2
>
>         /*
>          *  small table specific variables *
>          *  Constant loading
>          */
> -       vmovups _iMaxIdxMask+__svml_stanh_data_internal(%rip), %ymm8
> -       vpsubd  _iMinIdxOfsMask+__svml_stanh_data_internal(%rip), %ymm14, %ymm9
> -
> -       /* if VMIN, VMAX is defined for I type */
> -       vxorps  %ymm15, %ymm15, %ymm15
> -       vpcmpgtd %ymm15, %ymm9, %ymm0
> -       vpand   %ymm0, %ymm9, %ymm7
> -       vpcmpgtd %ymm8, %ymm9, %ymm6
> -       vblendvps %ymm6, %ymm8, %ymm7, %ymm3
> -       vpsrld  $14, %ymm3, %ymm1
> -       vpcmpgtd _iExpMask+__svml_stanh_data_internal(%rip), %ymm14, %ymm13
> -       vmovmskps %ymm13, %r11d
> -       vandps  _sAbsMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm10
> -       vandps  _sSignMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm11
> -       vextractf128 $1, %ymm1, %xmm2
> -       vmovd   %xmm1, %r9d
> -       vmovd   %xmm2, %ecx
> -       vpextrd $1, %xmm2, %edx
> -       vpextrd $1, %xmm1, %r8d
> -       movslq  %r9d, %r9
> -       movslq  %edx, %rdx
> -       movslq  %r8d, %r8
> -       vpextrd $2, %xmm1, %edi
> -       movslq  %ecx, %rcx
> -       /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
> -       vpextrd $3, %xmm2, %r12d
> -       vpextrd $3, %xmm1, %esi
> -       vpextrd $2, %xmm2, %eax
> -       movslq  %edi, %rdi
> -       movslq  %r12d, %r12
> -       movslq  %esi, %rsi
> -       movslq  %eax, %rax
> -       vmovupd -16(%r9, %r10), %xmm5
> -       vmovupd -16(%rdx, %r10), %xmm14
> -       vmovupd -16(%rcx, %r10), %xmm13
> -       vmovupd (%r9, %r10), %xmm1
> -       vmovupd (%r8, %r10), %xmm2
> -       vmovupd -16(%r8, %r10), %xmm4
> -       vinsertf128 $1, -16(%rdi, %r10), %ymm5, %ymm15
> -       vinsertf128 $1, -16(%r12, %r10), %ymm14, %ymm3
> -       vinsertf128 $1, -16(%rax, %r10), %ymm13, %ymm6
> -       vinsertf128 $1, (%rdi, %r10), %ymm1, %ymm5
> -       vinsertf128 $1, (%rsi, %r10), %ymm2, %ymm14
> -       vunpcklpd %ymm3, %ymm6, %ymm8
> +       vpsrld  $14, %ymm2, %ymm1
> +
> +       /* We are splitting xmm1 into 8 GPRs. This may be faster to do with
> +          store/load as we can take advantage of store-forwarding.  */
> +       vmovq   %xmm1, %r8
> +       /* We have eliminated all negative values for ymm1 so no need to sign
> +          extend.  */
> +       movl    %r8d, %r9d
> +       shrq    $32, %r8
> +
> +       /* Store base of lookup table in rax.  */
> +       leaq    TANHF_DATA(_lookupTable)(%rip), %rax
> +
> +       /* Instead of using cross-lane permutes on ymm vectors, use vpinsertf128
> +          with memory operand. This helps alleviate bottleneck on p5.  */
> +       vmovupd 16(%r9, %rax), %xmm5
> +
> +       vpextrq $1, %xmm1, %rsi
> +       movl    %esi, %edi
> +       shrq    $32, %rsi
> +
> +       vinsertf128 $1, 16(%rdi, %rax), %ymm5, %ymm5
> +
> +       vextracti128 $1, %ymm1, %xmm2
> +       vmovq   %xmm2, %rdx
> +       movl    %edx, %ecx
> +       shrq    $32, %rdx
> +
> +       vmovupd (%rcx, %rax), %xmm6
> +
> +       vpextrq $1, %xmm2, %r10
> +       movl    %r10d, %r11d
> +       shrq    $32, %r10
> +
> +       vinsertf128 $1, (%r11, %rax), %ymm6, %ymm6
> +
> +       vmovupd 16(%r8, %rax), %xmm1
> +       vinsertf128 $1, 16(%rsi, %rax), %ymm1, %ymm1
> +       vmovupd (%rdx, %rax), %xmm3
> +       vinsertf128 $1, (%r10, %rax), %ymm3, %ymm3
> +
> +       vunpcklpd %ymm3, %ymm6, %ymm7
>         vunpckhpd %ymm3, %ymm6, %ymm6
> -       vunpcklpd %ymm14, %ymm5, %ymm3
> -       vunpckhpd %ymm14, %ymm5, %ymm2
> -       vmovupd (%rcx, %r10), %xmm13
> -       vcvtps2pd %xmm10, %ymm5
> -       vextractf128 $1, %ymm10, %xmm10
> -       vfmadd213pd %ymm3, %ymm5, %ymm2
> -       vinsertf128 $1, -16(%rsi, %r10), %ymm4, %ymm0
> -       vmovupd (%rdx, %r10), %xmm4
> -       vunpcklpd %ymm0, %ymm15, %ymm9
> -       vunpckhpd %ymm0, %ymm15, %ymm7
> -       vfmadd213pd %ymm7, %ymm5, %ymm2
> -       vfmadd213pd %ymm9, %ymm5, %ymm2
> -       vinsertf128 $1, (%r12, %r10), %ymm4, %ymm0
> -       vcvtps2pd %xmm10, %ymm4
> -       vinsertf128 $1, (%rax, %r10), %ymm13, %ymm15
> -       vunpcklpd %ymm0, %ymm15, %ymm1
> -       vunpckhpd %ymm0, %ymm15, %ymm0
> -       vfmadd213pd %ymm1, %ymm4, %ymm0
> -       vcvtpd2ps %ymm2, %xmm1
> -       vfmadd213pd %ymm6, %ymm4, %ymm0
> -       vfmadd213pd %ymm8, %ymm4, %ymm0
> -       vcvtpd2ps %ymm0, %xmm0
> -       vinsertf128 $1, %xmm0, %ymm1, %ymm2
> -       vorps   %ymm11, %ymm2, %ymm0
> -       testl   %r11d, %r11d
>
> -       /* Go to special inputs processing branch */
> -       jne     L(SPECIAL_VALUES_BRANCH)
> -       # LOE rbx r13 r14 r15 r11d ymm0 ymm12
> +       vunpcklpd %ymm1, %ymm5, %ymm3
> +       vunpckhpd %ymm1, %ymm5, %ymm1
>
> -       /* Restore registers
> -        * and exit the function
> -        */
> +       vmovaps TANHF_DATA(_sAbsMask)(%rip), %ymm11
> +       /* Store special cases in ymm15.  */
> +       vpcmpgtd TANHF_DATA(_iExpMask)(%rip), %ymm4, %ymm15
>
> -L(EXIT):
> -       addq    $120, %rsp
> -       cfi_restore(12)
> -       popq    %r12
> -       movq    %rbp, %rsp
> -       popq    %rbp
> -       cfi_def_cfa(7, 8)
> -       cfi_restore(6)
> -       ret
> -       cfi_def_cfa(6, 16)
> -       cfi_offset(6, -16)
> -       /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
> +       vandps  %ymm11, %ymm0, %ymm4
>
> -       /* Branch to process
> -        * special inputs
> -        */
> +       vcvtps2pd %xmm4, %ymm5
>
> -L(SPECIAL_VALUES_BRANCH):
> -       vmovups %ymm12, 32(%rsp)
> -       vmovups %ymm0, 64(%rsp)
> -       # LOE rbx r13 r14 r15 r11d ymm0
> +       vextractf128 $1, %ymm4, %xmm4
> +       vcvtps2pd %xmm4, %ymm4
>
> -       xorl    %r12d, %r12d
> -       # LOE rbx r13 r14 r15 r11d r12d
> +       vmovupd 16(%rcx, %rax), %xmm2
> +       vinsertf128 $1, 16(%r11, %rax), %ymm2, %ymm2
>
> -       vzeroupper
> -       movq    %r13, 8(%rsp)
> -       /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
> -       movl    %r11d, %r13d
> -       movq    %r14, (%rsp)
> -       /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
> -       # LOE rbx r15 r12d r13d
> -
> -       /* Range mask
> -        * bits check
> -        */
> +       vfmadd213pd %ymm3, %ymm5, %ymm1
> +
> +       vmovupd 16(%rdx, %rax), %xmm3
> +       vinsertf128 $1, 16(%r10, %rax), %ymm3, %ymm3
> +
> +       vunpcklpd %ymm3, %ymm2, %ymm10
> +       vunpckhpd %ymm3, %ymm2, %ymm2
> +
> +       vfmadd213pd %ymm10, %ymm4, %ymm2
> +       vfmadd213pd %ymm6, %ymm4, %ymm2
> +       vfmadd213pd %ymm7, %ymm4, %ymm2
> +       vcvtpd2ps %ymm2, %xmm2
> +
> +       vmovupd (%r9, %rax), %xmm7
> +       vinsertf128 $1, (%rdi, %rax), %ymm7, %ymm7
> +
> +       vmovupd (%r8, %rax), %xmm3
> +       vinsertf128 $1, (%rsi, %rax), %ymm3, %ymm3
> +
> +       vunpckhpd %ymm3, %ymm7, %ymm4
> +       vunpcklpd %ymm3, %ymm7, %ymm7
>
> -L(RANGEMASK_CHECK):
> -       btl     %r12d, %r13d
> +       vfmadd213pd %ymm4, %ymm5, %ymm1
> +       vfmadd213pd %ymm7, %ymm5, %ymm1
> +
> +
> +       vcvtpd2ps %ymm1, %xmm1
> +       vinsertf128 $1, %xmm2, %ymm1, %ymm1
> +
> +       vmovmskps %ymm15, %edx
> +       vandnps %ymm0, %ymm11, %ymm2
> +       testl   %edx, %edx
> +       /* Go to special inputs processing branch */
> +       jne     L(SPECIAL_VALUES_BRANCH)
> +       # LOE rbx r12 r13 r14 r15 ymm0 ymm1 ymm2
> +       /* Wait until after the branch to write over ymm0.  */
> +       vorps   %ymm2, %ymm1, %ymm0
> +       /* No stack restoration on the fastpath.  */
> +       ret
>
> -       /* Call scalar math function */
> -       jc      L(SCALAR_MATH_CALL)
> -       # LOE rbx r15 r12d r13d
>
> -       /* Special inputs
> -        * processing loop
> +       /* Cold case. edx has 1s where there was a special value that
> +          needs to be handled by a tanhf call. Optimize for code size
> +          more so than speed here. */
> +L(SPECIAL_VALUES_BRANCH):
> +       # LOE rbx rdx r12 r13 r14 r15 ymm0 ymm1 ymm2
> +    /* Use r13 to save/restore the stack. This allows us to use rbp as
> +       callee save register saving code size. */
> +       pushq   %r13
> +       cfi_adjust_cfa_offset(8)
> +       cfi_offset(r13, -16)
> +       /* Need to callee save registers to preserve state across tanhf calls.
>          */
> +       pushq   %rbx
> +       cfi_adjust_cfa_offset(8)
> +       cfi_offset(rbx, -24)
> +       pushq   %rbp
> +       cfi_adjust_cfa_offset(8)
> +       cfi_offset(rbp, -32)
> +       movq    %rsp, %r13
> +       cfi_def_cfa_register(r13)
> +
> +       /* Align stack and make room for 2x ymm vectors.  */
> +       andq    $-32, %rsp
> +       addq    $-64, %rsp
> +
> +       /* Save all already computed results.  */
> +       vorps   %ymm2, %ymm1, %ymm1
> +       vmovaps %ymm1, (%rsp)
> +       /* Save original input (ymm0 unchanged up to this point).  */
> +       vmovaps %ymm0, 32(%rsp)
> +
> +       vzeroupper
>
> +       /* edx has 1s where there was a special value that needs to be handled
> +          by a tanhf call.  */
> +       movl    %edx, %ebx
>  L(SPECIAL_VALUES_LOOP):
> -       incl    %r12d
> -       cmpl    $8, %r12d
> -
> -       /* Check bits in range mask */
> -       jl      L(RANGEMASK_CHECK)
> -       # LOE rbx r15 r12d r13d
> -
> -       movq    8(%rsp), %r13
> -       cfi_restore(13)
> -       movq    (%rsp), %r14
> -       cfi_restore(14)
> -       vmovups 64(%rsp), %ymm0
> -
> -       /* Go to exit */
> -       jmp     L(EXIT)
> -       /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
> -       /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
> -       # LOE rbx r13 r14 r15 ymm0
> -
> -       /* Scalar math fucntion call
> -        * to process special input
> -        */
> +       # LOE rbx rbp r12 r13 r14 r15
> +       /* Use rbp as index for special value that is saved across calls to
> +          tanhf. We technically don't need a callee save register here as offset
> +          to rsp is always [0, 28] so we can restore rsp by realigning to 32.
> +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> +          in the loop. Realigning also costs more code size.  */
> +       xorl    %ebp, %ebp
> +       tzcntl  %ebx, %ebp
>
> -L(SCALAR_MATH_CALL):
> -       movl    %r12d, %r14d
> -       movss   32(%rsp, %r14, 4), %xmm0
> +       /* Scalar math fucntion call to process special input.  */
                                    function
> +       movss   32(%rsp, %rbp, 4), %xmm0
>         call    tanhf@PLT
> -       # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 64(%rsp, %r14, 4)
> +       /* No good way to avoid the store-forwarding fault this will cause on
> +          return. `lfence` avoids the SF fault but at greater cost as it
> +          serializes stack/callee save restoration.  */
> +       movss   %xmm0, (%rsp, %rbp, 4)
> +
> +       blsrl   %ebx, %ebx
> +       jnz     L(SPECIAL_VALUES_LOOP)
> +       # LOE r12 r13 r14 r15
>
> -       /* Process special inputs in loop */
> -       jmp     L(SPECIAL_VALUES_LOOP)
> -       # LOE rbx r15 r12d r13d
> -END(_ZGVdN8v_tanhf_avx2)
>
> -       .section .rodata, "a"
> -       .align  32
> -
> -#ifdef __svml_stanh_data_internal_typedef
> -typedef unsigned int VUINT32;
> -typedef struct {
> -       __declspec(align(32)) VUINT32 _dbP[(134*4)][2];
> -       __declspec(align(32)) VUINT32 _sSignMask[8][1];
> -       __declspec(align(32)) VUINT32 _sAbsMask[8][1];
> -       __declspec(align(32)) VUINT32 _iExpMantMask[8][1];
> -       __declspec(align(32)) VUINT32 _iExpMask[8][1];
> -       __declspec(align(32)) VUINT32 _iMinIdxOfsMask[8][1];
> -       __declspec(align(32)) VUINT32 _iMaxIdxMask[8][1];
> -} __svml_stanh_data_internal;
> -#endif
> -__svml_stanh_data_internal:
> -       /* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500]. */
> -       .quad   0x0000000000000000 /* A00 = +0.000000000000000000000e-01 */
> -       .quad   0x3FF00000022C70EB /* A01 = +1.000000008097283510367e+00 */
> -       .quad   0xBED00E878CFFA194 /* A02 = -3.828228912518614443549e-06 */
> -       .quad   0xBFD551766D0607A9 /* A03 = -3.330970825846813476723e-01 */
> -       .quad   0xBE53D60CE3E4C297 /* A00 = -1.847383956330407336230e-08 */
> -       .quad   0x3FF000024177CF5C /* A01 = +1.000002151235967140508e+00 */
> -       .quad   0xBF1758BC94A51A25 /* A02 = -8.906031613262943753568e-05 */
> -       .quad   0xBFD53EAE67E0D4F0 /* A03 = -3.319507612644221339337e-01 */
> -       .quad   0xBE5A9E47EF32D6FE /* A00 = -2.479020984039698285657e-08 */
> -       .quad   0x3FF00002DA983057 /* A01 = +1.000002721676556793895e+00 */
> -       .quad   0xBF1BD953509E94AA /* A02 = -1.062352277175377670507e-04 */
> -       .quad   0xBFD53BDB562EEDD5 /* A03 = -3.317783681520414806876e-01 */
> -       .quad   0xBE6191BBE496D294 /* A00 = -3.272532162914017685901e-08 */
> -       .quad   0x3FF0000390492017 /* A01 = +1.000003398528866105366e+00 */
> -       .quad   0xBF20727E814A57CE /* A02 = -1.254825043772153972919e-04 */
> -       .quad   0xBFD538DE060A6F22 /* A03 = -3.315959033004550748913e-01 */
> -       .quad   0xBE66DAFA2A893A25 /* A00 = -4.257146219278012568149e-08 */
> -       .quad   0x3FF0000465E08CD1 /* A01 = +1.000004194219219266770e+00 */
> -       .quad   0xBF2341C765EF91B6 /* A02 = -1.469188600530365522261e-04 */
> -       .quad   0xBFD535B6841FAF9E /* A03 = -3.314033785124993469751e-01 */
> -       .quad   0xBE6D5794E361E964 /* A00 = -5.465394929765249413434e-08 */
> -       .quad   0x3FF000055EE2A0CB /* A01 = +1.000005121846742950353e+00 */
> -       .quad   0xBF265E6C77E66C8B /* A02 = -1.706607253709506650304e-04 */
> -       .quad   0xBFD53264DDCCEDA6 /* A03 = -3.312008062382240103361e-01 */
> -       .quad   0xBE729C844D374A6E /* A00 = -6.933284462462096107184e-08 */
> -       .quad   0x3FF000067F019093 /* A01 = +1.000006195180536350264e+00 */
> -       .quad   0xBF29CC5348D6DCE5 /* A02 = -1.968242326435338705130e-04 */
> -       .quad   0xBFD52EE92121ED35 /* A03 = -3.309881995734998416658e-01 */
> -       .quad   0xBE775AEA17EAA872 /* A00 = -8.700465590574974405858e-08 */
> -       .quad   0x3FF00007CA1D66B8 /* A01 = +1.000007428656699559610e+00 */
> -       .quad   0xBF2D8F5EB98A2637 /* A02 = -2.255252009216044881395e-04 */
> -       .quad   0xBFD52B435CDF9128 /* A03 = -3.307655722585587376727e-01 */
> -       .quad   0xBE7D04DA28C343F0 /* A00 = -1.081040272327705484794e-07 */
> -       .quad   0x3FF000094443CCF5 /* A01 = +1.000008837375216730337e+00 */
> -       .quad   0xBF30D5B76C947AE5 /* A02 = -2.568791210978817814332e-04 */
> -       .quad   0xBFD52773A0776FAD /* A03 = -3.305329386764651045105e-01 */
> -       .quad   0xBE81DD77A12C51C7 /* A00 = -1.331054169875768625701e-07 */
> -       .quad   0x3FF0000AF1AFD2DA /* A01 = +1.000010437096696680470e+00 */
> -       .quad   0xBF331230624C1680 /* A02 = -2.910011410651516805537e-04 */
> -       .quad   0xBFD52379FC0B61DF /* A03 = -3.302903138515186909352e-01 */
> -       .quad   0xBE85D04EEEB3C435 /* A00 = -1.625247628488202841012e-07 */
> -       .quad   0x3FF0000CD6C9B1F2 /* A01 = +1.000012244238970726684e+00 */
> -       .quad   0xBF357F0742FADDD4 /* A02 = -3.280060509313874068243e-04 */
> -       .quad   0xBFD51F56806D0E81 /* A03 = -3.300377134475880880338e-01 */
> -       .quad   0xBE8A6E289B59681B /* A00 = -1.969211333326924655065e-07 */
> -       .quad   0x3FF0000EF8268F72 /* A01 = +1.000014275873550406715e+00 */
> -       .quad   0xBF381E277A1B747A /* A02 = -3.680082682942575423093e-04 */
> -       .quad   0xBFD51B093F1D6FD4 /* A03 = -3.297751537663746734808e-01 */
> -       .quad   0xBE8FCBC40EE9ABD5 /* A00 = -2.368983653301529373887e-07 */
> -       .quad   0x3FF000115A883B6C /* A01 = +1.000016549721943981410e+00 */
> -       .quad   0xBF3AF17AC974B3D9 /* A02 = -4.111218235774406434303e-04 */
> -       .quad   0xBFD516924A4C549C /* A03 = -3.295026517456081105450e-01 */
> -       .quad   0xBE92FFBC60A3F956 /* A00 = -2.831066871072026054144e-07 */
> -       .quad   0x3FF0001402DCED8A /* A01 = +1.000019084151832604590e+00 */
> -       .quad   0xBF3DFAE9390C4801 /* A02 = -4.574603454311488280083e-04 */
> -       .quad   0xBFD511F1B4D7DC3A /* A03 = -3.292202249571719585575e-01 */
> -       .quad   0xBE9690A22F96D5AD /* A00 = -3.362443262393081632612e-07 */
> -       .quad   0x3FF00016F63EFF5D /* A01 = +1.000021898173108825247e+00 */
> -       .quad   0xBF409E2C839605BB /* A02 = -5.071370461992499986334e-04 */
> -       .quad   0xBFD50D27924BEE00 /* A03 = -3.289278916051614487515e-01 */
> -       .quad   0xBE9AA56C65E72A73 /* A00 = -3.970591019557469835586e-07 */
> -       .quad   0x3FF0001A39F4A43E /* A01 = +1.000025011433776978009e+00 */
> -       .quad   0xBF425BD74C3D6667 /* A02 = -5.602647074553602319844e-04 */
> -       .quad   0xBFD50833F6E1ABA2 /* A03 = -3.286256705238718156536e-01 */
> -       .quad   0xBE9F4BD4FF1A83B0 /* A00 = -4.663500013744687071912e-07 */
> -       .quad   0x3FF0001DD36F9EC2 /* A01 = +1.000028444215715683896e+00 */
> -       .quad   0xBF44376634149405 /* A02 = -6.169556656102642569831e-04 */
> -       .quad   0xBFD50316F77EDEE5 /* A03 = -3.283135811757190158922e-01 */
> -       .quad   0xBEA3B625387BB079 /* A00 = -5.874486399249461304297e-07 */
> -       .quad   0x3FF00023E14CFBA9 /* A01 = +1.000034217911642153709e+00 */
> -       .quad   0xBF47392F923218D2 /* A02 = -7.087213783883111826306e-04 */
> -       .quad   0xBFD4FB1FACDEB938 /* A03 = -3.278273761924483942209e-01 */
> -       .quad   0xBEAA6E24F543500A /* A00 = -7.876828740601738750574e-07 */
> -       .quad   0x3FF0002D5C6E8412 /* A01 = +1.000043259679163742959e+00 */
> -       .quad   0xBF4BAF02BD7FDD70 /* A02 = -8.448375110664940040861e-04 */
> -       .quad   0xBFD4EFEE6527A7DE /* A03 = -3.271442401734229177279e-01 */
> -       .quad   0xBEB16E3EBE2157D0 /* A00 = -1.038947396133402500647e-06 */
> -       .quad   0x3FF00038990FEE2F /* A01 = +1.000053975962952312884e+00 */
> -       .quad   0xBF50569481C574CB /* A02 = -9.972048056490652716971e-04 */
> -       .quad   0xBFD4E419278DA2B4 /* A03 = -3.264220129263251113372e-01 */
> -       .quad   0xBEB6A7B6723165D4 /* A00 = -1.350350836279403750524e-06 */
> -       .quad   0x3FF00045CAB4158E /* A01 = +1.000066558657042303793e+00 */
> -       .quad   0xBF531D7C9C849108 /* A02 = -1.166698160951775212202e-03 */
> -       .quad   0xBFD4D7A0BB33B152 /* A03 = -3.256608799117844954552e-01 */
> -       .quad   0xBEBD0EE2A8654AFD /* A00 = -1.732000471561702711532e-06 */
> -       .quad   0x3FF00055276F18D6 /* A01 = +1.000081209219890521211e+00 */
> -       .quad   0xBF562FDBA3FB6C6C /* A02 = -1.354183666925102939860e-03 */
> -       .quad   0xBFD4CA85F1B93DB2 /* A03 = -3.248610363561638125773e-01 */
> -       .quad   0xBEC269D4036A207E /* A00 = -2.195047297096822741730e-06 */
> -       .quad   0x3FF00066E7DA6E4E /* A01 = +1.000098138500919997540e+00 */
> -       .quad   0xBF5991499FC36B3A /* A02 = -1.560518167983372759405e-03 */
> -       .quad   0xBFD4BCC9A72283D6 /* A03 = -3.240226871658341556426e-01 */
> -       .quad   0xBEC7154B6C09CFE1 /* A00 = -2.751729738565190291276e-06 */
> -       .quad   0x3FF0007B47086B80 /* A01 = +1.000117566559055148900e+00 */
> -       .quad   0xBF5D455433B4F8F4 /* A02 = -1.786548832412968197680e-03 */
> -       .quad   0xBFD4AE6CC1BFE145 /* A03 = -3.231460468373550942722e-01 */
> -       .quad   0xBECCA68CC64A0F8A /* A00 = -3.415415948561670285790e-06 */
> -       .quad   0x3FF00092827742F7 /* A01 = +1.000139722473418535387e+00 */
> -       .quad   0xBF60A7BF15A527AF /* A02 = -2.033112728132522705610e-03 */
> -       .quad   0xBFD49F703214084C /* A03 = -3.222313393636155876010e-01 */
> -       .quad   0xBED19E68676B241B /* A00 = -4.200644630977303616698e-06 */
> -       .quad   0x3FF000ACDA037B26 /* A01 = +1.000164844146362863597e+00 */
> -       .quad   0xBF62D99F836A02F8 /* A02 = -2.301036405072284102280e-03 */
> -       .quad   0xBFD48FD4F2B91B28 /* A03 = -3.212787981359945810311e-01 */
> -       .quad   0xBED57CF4B0C7AA54 /* A00 = -5.123164339408145209103e-06 */
> -       .quad   0x3FF000CA8FD9E1A1 /* A01 = +1.000193178099017865534e+00 */
> -       .quad   0xBF653A014548E686 /* A02 = -2.591135484433962181405e-03 */
> -       .quad   0xBFD47F9C0844B38F /* A03 = -3.202886658426046806447e-01 */
> -       .quad   0xBEDA012B1B1A41E2 /* A00 = -6.199971197454598722328e-06 */
> -       .quad   0x3FF000EBE868FDF4 /* A01 = +1.000224979259539459520e+00 */
> -       .quad   0xBF67CA9427E0A544 /* A02 = -2.904214255086275467410e-03 */
> -       .quad   0xBFD46EC6812ADB37 /* A03 = -3.192611943626845749655e-01 */
> -       .quad   0xBEDF3EAC5BF12194 /* A00 = -7.449344990702664567927e-06 */
> -       .quad   0x3FF001112A520784 /* A01 = +1.000260510744255704196e+00 */
> -       .quad   0xBF6A8D01ABDA4DC4 /* A02 = -3.241065277345108255891e-03 */
> -       .quad   0xBFD45D55759FFA4A /* A03 = -3.181966446572103146551e-01 */
> -       .quad   0xBEE2A541BC274267 /* A00 = -8.890883582164319970972e-06 */
> -       .quad   0x3FF0013A9E5961F2 /* A01 = +1.000300043631906721231e+00 */
> -       .quad   0xBF6D82ECD080C540 /* A02 = -3.602468994380686462264e-03 */
> -       .quad   0xBFD44B4A0779C0AD /* A03 = -3.170952866557950611259e-01 */
> -       .quad   0xBEE61D97609A27F4 /* A00 = -1.054553560499505625520e-05 */
> -       .quad   0x3FF001688F56A3AF /* A01 = +1.000343856731187974773e+00 */
> -       .quad   0xBF7056F8EFB683EC /* A02 = -3.989193351487490407647e-03 */
> -       .quad   0xBFD438A5620F0F74 /* A03 = -3.159573991399533543500e-01 */
> -       .quad   0xBEEA145429EDD370 /* A00 = -1.243563138839952927732e-05 */
> -       .quad   0x3FF0019B4A242A67 /* A01 = +1.000392236341804297339e+00 */
> -       .quad   0xBF7207D31CA78D9B /* A02 = -4.401993423445739288258e-03 */
> -       .quad   0xBFD42568BA16E7CD /* A03 = -3.147832696228050619602e-01 */
> -       .quad   0xBEEE96370D52680F /* A00 = -1.458491207477835326165e-05 */
> -       .quad   0x3FF001D31D8E4115 /* A01 = +1.000445476009251821736e+00 */
> -       .quad   0xBF73D4CC11EDC094 /* A02 = -4.841611050196221316400e-03 */
> -       .quad   0xBFD411954D8664E7 /* A03 = -3.135731942252974469021e-01 */
> -       .quad   0xBEF338C046215EF8 /* A00 = -1.833122622260562810219e-05 */
> -       .quad   0x3FF00230C32C2EC1 /* A01 = +1.000534784691737621998e+00 */
> -       .quad   0xBF76BD019BCC5DAF /* A02 = -5.551344188254799492943e-03 */
> -       .quad   0xBFD3F2C7156DC21E /* A03 = -3.116929730668135389848e-01 */
> -       .quad   0xBEF9B15EAE411EAE /* A00 = -2.450261207822986676092e-05 */
> -       .quad   0x3FF002C2DF057A4D /* A01 = +1.000674124886830940184e+00 */
> -       .quad   0xBF7B08CCD9AC1E30 /* A02 = -6.600189396301511801646e-03 */
> -       .quad   0xBFD3C7A7A114FED8 /* A03 = -3.090609620157755976777e-01 */
> -       .quad   0xBF00E36483C373B3 /* A00 = -3.221178528332122595812e-05 */
> -       .quad   0x3FF0036F419480D7 /* A01 = +1.000838524028997644777e+00 */
> -       .quad   0xBF7FD255D1777007 /* A02 = -7.768950679260206403087e-03 */
> -       .quad   0xBFD39A453911D6CE /* A03 = -3.062909180947429588215e-01 */
> -       .quad   0xBF05DFA04DD12059 /* A00 = -4.172046622180685472624e-05 */
> -       .quad   0x3FF00438B2A03D8D /* A01 = +1.001030633695197069599e+00 */
> -       .quad   0xBF828F8DBB4A9D10 /* A02 = -9.062869337255224921890e-03 */
> -       .quad   0xBFD36AAB704697D9 /* A03 = -3.033856007044711255993e-01 */
> -       .quad   0xBF0BF3E0C647DEFB /* A00 = -5.331544597092331081714e-05 */
> -       .quad   0x3FF005221063D36D /* A01 = +1.001253189109060359741e+00 */
> -       .quad   0xBF857A2CB3C96102 /* A02 = -1.048693584122917590862e-02 */
> -       .quad   0xBFD338E65BBB4FEC /* A03 = -3.003478904549854444639e-01 */
> -       .quad   0xBF11A506ED7C9D31 /* A00 = -6.730894835681591541979e-05 */
> -       .quad   0x3FF0062E4D0EA92A /* A01 = +1.001508999829250345925e+00 */
> -       .quad   0xBF88AB82C2761AF3 /* A02 = -1.204588085125866091241e-02 */
> -       .quad   0xBFD305028D6BD206 /* A03 = -2.971807843271395688234e-01 */
> -       .quad   0xBF1607C0922D9BF1 /* A00 = -8.403885708006799337092e-05 */
> -       .quad   0x3FF007606C341961 /* A01 = +1.001800940198869449560e+00 */
> -       .quad   0xBF8C25E6DA487BCF /* A02 = -1.374416688582682892494e-02 */
> -       .quad   0xBFD2CF0D0EE8F7B5 /* A03 = -2.938873906713255768075e-01 */
> -       .quad   0xBF1B3A8480A0A16D /* A00 = -1.038688061788578038307e-04 */
> -       .quad   0x3FF008BB802D02D6 /* A01 = +1.002131939589323561535e+00 */
> -       .quad   0xBF8FEB8AE99FD100 /* A02 = -1.558598065819483124983e-02 */
> -       .quad   0xBFD297135BD0911B /* A03 = -2.904709240558688843059e-01 */
> -       .quad   0xBF20ABB9BDB75C65 /* A00 = -1.271881327357976163798e-04 */
> -       .quad   0x3FF00A42A76D8CD1 /* A01 = +1.002504972472525901495e+00 */
> -       .quad   0xBF91FF3D752BB9E6 /* A02 = -1.757522609380570560722e-02 */
> -       .quad   0xBFD25D235C1F88B4 /* A03 = -2.869346999779154305799e-01 */
> -       .quad   0xBF243D3254425461 /* A00 = -1.544116913733432829448e-04 */
> -       .quad   0x3FF00BF909D1795E /* A01 = +1.002923048355647051011e+00 */
> -       .quad   0xBF94304E04D44942 /* A02 = -1.971551804042204897316e-02 */
> -       .quad   0xBFD2214B5E61CFA6 /* A03 = -2.832821294498394371075e-01 */
> -       .quad   0xBF286070011B61CE /* A00 = -1.859795307186510085994e-04 */
> -       .quad   0x3FF00DE1D5E1627E /* A01 = +1.003389201612804537689e+00 */
> -       .quad   0xBF9689D5F4163F59 /* A02 = -2.201017668045266231780e-02 */
> -       .quad   0xBFD1E39A11C3B42C /* A03 = -2.795167134743816728104e-01 */
> -       .quad   0xBF2D250B366A79E8 /* A00 = -2.223564326486314902259e-04 */
> -       .quad   0x3FF010003E134001 /* A01 = +1.003906481248123094829e+00 */
> -       .quad   0xBF990C9FF91F6F81 /* A02 = -2.446222265267250853271e-02 */
> -       .quad   0xBFD1A41E80084CDC /* A03 = -2.756420374218586655246e-01 */
> -       .quad   0xBF314DB5DDC2A30E /* A00 = -2.640313157465248123865e-04 */
> -       .quad   0x3FF012577608921B /* A01 = +1.004477940624503018441e+00 */
> -       .quad   0xBF9BB9626875B0C9 /* A02 = -2.707437288829409385849e-02 */
> -       .quad   0xBFD162E80768A9D0 /* A03 = -2.716617653228725615122e-01 */
> -       .quad   0xBF346A6133808864 /* A00 = -3.115165050094957730625e-04 */
> -       .quad   0x3FF014EAAFCC88A3 /* A01 = +1.005106627192198898157e+00 */
> -       .quad   0xBF9E90BEF9BF7419 /* A02 = -2.984903716411588595059e-02 */
> -       .quad   0xBFD12006545F7FAD /* A03 = -2.675796340899932457269e-01 */
> -       .quad   0xBF37F180DC3848EA /* A00 = -3.653468704395550778821e-04 */
> -       .quad   0x3FF017BD19147861 /* A01 = +1.005795572250939295955e+00 */
> -       .quad   0xBFA0C9A14C702E07 /* A02 = -3.278831537326359207851e-02 */
> -       .quad   0xBFD0DB895B650092 /* A03 = -2.633994476818851682154e-01 */
> -       .quad   0xBF3BEC6AAC6D7635 /* A00 = -4.260788377246944457107e-04 */
> -       .quad   0x3FF01AD1D884E719 /* A01 = +1.006547780778822565040e+00 */
> -       .quad   0xBFA260B2A1B1434A /* A02 = -3.589399551186163439542e-02 */
> -       .quad   0xBFD09581529E93D6 /* A03 = -2.591250712233067465817e-01 */
> -       .quad   0xBF4164E26167882B /* A00 = -5.308251737086202562063e-04 */
> -       .quad   0x3FF01FEF14B62B81 /* A01 = +1.007796364693348545316e+00 */
> -       .quad   0xBFA4EB014538AA42 /* A02 = -4.085544557559163403315e-02 */
> -       .quad   0xBFD029D36FEAF41F /* A03 = -2.525528519580024222613e-01 */
> -       .quad   0xBF46F6FFF4E53DC8 /* A00 = -7.008313930700277652464e-04 */
> -       .quad   0x3FF027CBB51CBBA0 /* A01 = +1.009715754956893363214e+00 */
> -       .quad   0xBFA89DEC9FEC112E /* A02 = -4.807986690687680864098e-02 */
> -       .quad   0xBFCF2A99464D0DB4 /* A03 = -2.434875100390009317053e-01 */
> -       .quad   0xBF4DCC9C4F66A4D9 /* A00 = -9.094012482836712945103e-04 */
> -       .quad   0x3FF030E7CFCCD583 /* A01 = +1.011939822882909068014e+00 */
> -       .quad   0xBFACAA3B95814081 /* A02 = -5.598627281199331645611e-02 */
> -       .quad   0xBFCDF78F156BE7CF /* A03 = -2.341173987004467604844e-01 */
> -       .quad   0xBF5308ED74E5C7A6 /* A00 = -1.161796466103906435435e-03 */
> -       .quad   0x3FF03B5986412ECB /* A01 = +1.014489674026594512313e+00 */
> -       .quad   0xBFB087EBA88DCC3F /* A02 = -6.457398285947223148806e-02 */
> -       .quad   0xBFCCBB9BD134862F /* A03 = -2.244753619680052991736e-01 */
> -       .quad   0xBF57FA23C00DF4B5 /* A00 = -1.463446533505758208674e-03 */
> -       .quad   0x3FF0473558A1BCC0 /* A01 = +1.017384859292903342975e+00 */
> -       .quad   0xBFB2E702BC6360EF /* A02 = -7.383744334527241048871e-02 */
> -       .quad   0xBFCB77D546379288 /* A03 = -2.145945160729250122955e-01 */
> -       .quad   0xBF5DD12971557F71 /* A00 = -1.819887610814388068450e-03 */
> -       .quad   0x3FF0548DDF5000A8 /* A01 = +1.020643112482540360020e+00 */
> -       .quad   0xBFB571B63DA186E1 /* A02 = -8.376635555898871710045e-02 */
> -       .quad   0xBFCA2D5202605148 /* A03 = -2.045080672838912594358e-01 */
> -       .quad   0xBF6252B1AD5D4F17 /* A00 = -2.236697221556737096709e-03 */
> -       .quad   0x3FF063738A910BF7 /* A01 = +1.024280110622155737232e+00 */
> -       .quad   0xBFB8270C8E6B601B /* A02 = -9.434584118878357184013e-02 */
> -       .quad   0xBFC8DD27D950A07E /* A03 = -1.942491351230763441116e-01 */
> -       .quad   0xBF66470C91730CFC /* A00 = -2.719425723258004842786e-03 */
> -       .quad   0x3FF073F468FCF331 /* A01 = +1.028309259519300633556e+00 */
> -       .quad   0xBFBB05C2952191E4 /* A02 = -1.055566419686964629854e-01 */
> -       .quad   0xBFC7886A770DE2BD /* A03 = -1.838505822486435070662e-01 */
> -       .quad   0xBF6AD114AC8E98EC /* A00 = -3.273525599485007861467e-03 */
> -       .quad   0x3FF0861BF53E5226 /* A01 = +1.032741506559554434119e+00 */
> -       .quad   0xBFBE0C4F9B461507 /* A02 = -1.173753503881763554650e-01 */
> -       .quad   0xBFC6302A037CDE3A /* A03 = -1.733448521642786954722e-01 */
> -       .quad   0xBF6FFBDE2A6C2AF8 /* A00 = -3.904279630096648551207e-03 */
> -       .quad   0x3FF099F2EB8E7DA3 /* A01 = +1.037585182326304034106e+00 */
> -       .quad   0xBFC09C74D192DDF0 /* A02 = -1.297746680554463516444e-01 */
> -       .quad   0xBFC4D571D8E3079F /* A03 = -1.627638157861470424859e-01 */
> -       .quad   0xBF72E8FDC0B952AA /* A00 = -4.616728994353872309042e-03 */
> -       .quad   0x3FF0AF7F273C9533 /* A01 = +1.042845872181101141152e+00 */
> -       .quad   0xBFC244C512736F10 /* A02 = -1.427236881344176033792e-01 */
> -       .quad   0xBFC379474F58B902 /* A03 = -1.521386277613104298645e-01 */
> -       .quad   0xBF762EABAF17395B /* A00 = -5.415602341101023557701e-03 */
> -       .quad   0x3FF0C6C3886F63FB /* A01 = +1.048526318502125631582e+00 */
> -       .quad   0xBFC3FDF9918EA12A /* A02 = -1.561881981590514389957e-01 */
> -       .quad   0xBFC21CA89ECAB895 /* A03 = -1.414995932913753196036e-01 */
> -       .quad   0xBF79D387CE5B2BAE /* A00 = -6.305246822828998107258e-03 */
> -       .quad   0x3FF0DFBFE2346376 /* A01 = +1.054626353847394337748e+00 */
> -       .quad   0xBFC5C6DA43602620 /* A02 = -1.701309994680721970894e-01 */
> -       .quad   0xBFC0C08BD8DB6631 /* A03 = -1.308760460731704100557e-01 */
> -       .quad   0xBF7DDBA8E8DA9060 /* A00 = -7.289562037531366334164e-03 */
> -       .quad   0x3FF0FA70F0D1B464 /* A01 = +1.061142864894713433443e+00 */
> -       .quad   0xBFC79E18D92BAA7C /* A02 = -1.845122394946264732241e-01 */
> -       .quad   0xBFBECBBBF74C2669 /* A03 = -1.202962378266875381749e-01 */
> -       .quad   0xBF81254E76EA25DA /* A00 = -8.371937755572145950511e-03 */
> -       .quad   0x3FF116D05835EBD0 /* A01 = +1.068069786618014660462e+00 */
> -       .quad   0xBFC982539E2ED224 /* A02 = -1.992897531869327609755e-01 */
> -       .quad   0xBFBC1B043C350159 /* A03 = -1.097872397413132278254e-01 */
> -       .quad   0xBF8391ACBA863403 /* A00 = -9.555196230190082448686e-03 */
> -       .quad   0x3FF134D4AA477FE2 /* A01 = +1.075398125794884141015e+00 */
> -       .quad   0xBFCB7218609FEAFB /* A02 = -2.144194099235717521079e-01 */
> -       .quad   0xBFB970A16CB88329 /* A03 = -9.937485603633135211599e-02 */
> -       .quad   0xBF87935088E48E8B /* A00 = -1.151144902957603431692e-02 */
> -       .quad   0x3FF1649892AD7DD3 /* A01 = +1.087059567413110938716e+00 */
> -       .quad   0xBFCE6971DDE75409 /* A02 = -2.375929196847723912089e-01 */
> -       .quad   0xBFB58291E88CB251 /* A03 = -8.402358939628952472223e-02 */
> -       .quad   0xBF8DB3A62C325325 /* A00 = -1.450280973794233242702e-02 */
> -       .quad   0x3FF1A9C900C6DEEA /* A01 = +1.103951457056548068891e+00 */
> -       .quad   0xBFD13DBC65B0E08E /* A02 = -2.693930619311765140012e-01 */
> -       .quad   0xBFB06696F62696D1 /* A03 = -6.406539449252625362252e-02 */
> -       .quad   0xBF92583699F2E27A /* A00 = -1.791463198307716858659e-02 */
> -       .quad   0x3FF1F451B85AA9F0 /* A01 = +1.122148246892376022288e+00 */
> -       .quad   0xBFD34FD5F8288180 /* A02 = -3.017477916164565954205e-01 */
> -       .quad   0xBFA6FB692825B683 /* A03 = -4.488686194495718900788e-02 */
> -       .quad   0xBF9641C26E673D6F /* A00 = -2.173522757385398448959e-02 */
> -       .quad   0x3FF24364DA5E2B07 /* A01 = +1.141453602790251542487e+00 */
> -       .quad   0xBFD564A5A5EF5890 /* A02 = -3.342680092295120530821e-01 */
> -       .quad   0xBF9B43712011A982 /* A03 = -2.662445791467283467968e-02 */
> -       .quad   0xBF9A901038EC2F39 /* A00 = -2.594018313816024226548e-02 */
> -       .quad   0x3FF2961356DFFEBA /* A01 = +1.161639537196534011088e+00 */
> -       .quad   0xBFD775EBB17198C7 /* A02 = -3.665723069046972759644e-01 */
> -       .quad   0xBF833B1A926CD462 /* A03 = -9.390075295963199591975e-03 */
> -       .quad   0xBF9F396A6A461B91 /* A00 = -3.049246095317987084727e-02 */
> -       .quad   0x3FF2EB53BAEF534B /* A01 = +1.182452898229899629357e+00 */
> -       .quad   0xBFD97DABF8AD8BBD /* A02 = -3.982953957076310058660e-01 */
> -       .quad   0x3F7B8F6A3E0F8837 /* A03 = +6.728568086119371925713e-03 */
> -       .quad   0xBFA21878590F8BAA /* A00 = -3.534294211546946951064e-02 */
> -       .quad   0x3FF34209790236E1 /* A01 = +1.203622315111197105253e+00 */
> -       .quad   0xBFDB764C0E71BECB /* A02 = -4.290952817018306997277e-01 */
> -       .quad   0x3F962FE0C03F84C0 /* A03 = +2.166701482190513949888e-02 */
> -       .quad   0xBFA4B36B9AD27ECC /* A00 = -4.043136849327097492868e-02 */
> -       .quad   0x3FF3990C5B12FC16 /* A01 = +1.224865298994477935679e+00 */
> -       .quad   0xBFDD5AABB0D01390 /* A02 = -4.586590983092770912322e-01 */
> -       .quad   0x3FA21DAF5CA162DB /* A03 = +3.538272863142363083844e-02 */
> -       .quad   0xBFA7645E4D7BF28B /* A00 = -4.568762489177399105378e-02 */
> -       .quad   0x3FF3EF2FD51C0D9F /* A01 = +1.245895225962932562069e+00 */
> -       .quad   0xBFDF26377E1B686E /* A02 = -4.867075664057044503963e-01 */
> -       .quad   0x3FA8803E756EE812 /* A03 = +4.785342391501513914509e-02 */
> -       .quad   0xBFAA210925C64413 /* A00 = -5.103329263796054643398e-02 */
> -       .quad   0x3FF44349F897D8E7 /* A01 = +1.266427966181760345066e+00 */
> -       .quad   0xBFE06A7B02C6D8E2 /* A02 = -5.129981092675530707226e-01 */
> -       .quad   0x3FAE3F194734F5D0 /* A03 = +5.907515520309980505687e-02 */
> -       .quad   0xBFACDE48F8A19BBB /* A00 = -5.638340029764018351832e-02 */
> -       .quad   0x3FF49439D5466582 /* A01 = +1.286187966447272845727e+00 */
> -       .quad   0xBFE131C7C1063DDC /* A02 = -5.373266954429101183166e-01 */
> -       .quad   0x3FB1ADEEC36AD805 /* A03 = +6.906025191241844940482e-02 */
> -       .quad   0xBFAF905D8F585680 /* A00 = -6.164829611604449866036e-02 */
> -       .quad   0x3FF4E0ED1FD27F99 /* A01 = +1.304913639360142818546e+00 */
> -       .quad   0xBFE1E7A859DC1D3D /* A02 = -5.595285182070380836095e-01 */
> -       .quad   0x3FB3ED018E4642A1 /* A03 = +7.783517573831001679086e-02 */
> -       .quad   0xBFB11595104160BA /* A00 = -6.673556944713512906198e-02 */
> -       .quad   0x3FF528650340490B /* A01 = +1.322361958217302513319e+00 */
> -       .quad   0xBFE28B14B40BC974 /* A02 = -5.794776455425521000109e-01 */
> -       .quad   0x3FB5DF49F5BAF6D7 /* A03 = +8.543836831355676453281e-02 */
> -       .quad   0xBFB2513A97344BA4 /* A00 = -7.155195418844911836587e-02 */
> -       .quad   0x3FF569BA0DB5EE14 /* A01 = +1.338312200124055273420e+00 */
> -       .quad   0xBFE31B53A8B67B20 /* A02 = -5.970857901737396389308e-01 */
> -       .quad   0x3FB787F297BB0544 /* A03 = +9.191814617499455275507e-02 */
> -       .quad   0xBFB37512E848FAFA /* A00 = -7.600515528700305112331e-02 */
> -       .quad   0x3FF5A41F33B403C8 /* A01 = +1.352568819013173495591e+00 */
> -       .quad   0xBFE397F6EA9A58A5 /* A02 = -6.123003561103997904880e-01 */
> -       .quad   0x3FB8EAA9FF25CA06 /* A03 = +9.733068923177520814782e-02 */
> -       .quad   0xBFB47B3E603AFC5D /* A00 = -8.000554894805263217439e-02 */
> -       .quad   0x3FF5D6E3EDE40487 /* A01 = +1.364963464031718975988e+00 */
> -       .quad   0xBFE400D5BCA6D631 /* A02 = -6.251019177058819709103e-01 */
> -       .quad   0x3FBA0B830ED567FE /* A03 = +1.017381583418739132707e-01 */
> -       .quad   0xBFB5BBFE8AC90496 /* A00 = -8.489981544791400103200e-02 */
> -       .quad   0x3FF612BA70107E95 /* A01 = +1.379572332145390989311e+00 */
> -       .quad   0xBFE477EAF1FA7693 /* A02 = -6.396383978023599814478e-01 */
> -       .quad   0x3FBB4784B7C08A95 /* A03 = +1.065600346196709652391e-01 */
> -       .quad   0xBFB6D5D940743939 /* A00 = -8.920057128509463473254e-02 */
> -       .quad   0x3FF644A8748F70CE /* A01 = +1.391762214006166953340e+00 */
> -       .quad   0xBFE4D646AB07EA37 /* A02 = -6.511567440459832267763e-01 */
> -       .quad   0x3FBC354F4E1D5292 /* A03 = +1.101884427747086558913e-01 */
> -       .quad   0xBFB7223D19E4F3D1 /* A00 = -9.036619074045339206069e-02 */
> -       .quad   0x3FF6518FEB42B7FA /* A01 = +1.394912642466350494175e+00 */
> -       .quad   0xBFE4ED86CB87498C /* A02 = -6.539949393430091184598e-01 */
> -       .quad   0x3FBC6D29F28CCA9B /* A03 = +1.110407082713131127205e-01 */
> -       .quad   0xBFB6878652FF6312 /* A00 = -8.800544287022329936754e-02 */
> -       .quad   0x3FF63948C302D040 /* A01 = +1.388985406648330922508e+00 */
> -       .quad   0xBFE4C4E2E7904E17 /* A02 = -6.490339777687407218920e-01 */
> -       .quad   0x3FBC127356CA1ABE /* A03 = +1.096565329445224612481e-01 */
> -       .quad   0xBFB4F5D18B0C91D6 /* A00 = -8.187589306596207427980e-02 */
> -       .quad   0x3FF5FD27EB7DD0B8 /* A01 = +1.374305648697413673176e+00 */
> -       .quad   0xBFE464E01A2B2FC6 /* A02 = -6.373138915164353601739e-01 */
> -       .quad   0x3FBB460547674A30 /* A03 = +1.065371798825160976065e-01 */
> -       .quad   0xBFB26642FA16A685 /* A00 = -7.187288861919156890412e-02 */
> -       .quad   0x3FF59F9BEDE1C95A /* A01 = +1.351467065073470141812e+00 */
> -       .quad   0xBFE3D67920C8FBEA /* A02 = -6.199308052381387046381e-01 */
> -       .quad   0x3FBA24F6A8D3CBC1 /* A03 = +1.021265184570401413078e-01 */
> -       .quad   0xBFADB5294794F097 /* A00 = -5.802277563859197656582e-02 */
> -       .quad   0x3FF523EA7B9CF453 /* A01 = +1.321268542159732772845e+00 */
> -       .quad   0xBFE322A8B55E35DB /* A02 = -5.979808370918208160205e-01 */
> -       .quad   0x3FB8C8673B1B3E37 /* A03 = +9.680791085269722928697e-02 */
> -       .quad   0xBFA4B7D661965C6A /* A00 = -4.046506825687219699450e-02 */
> -       .quad   0x3FF48DE3E2CE3122 /* A01 = +1.284641157110919085227e+00 */
> -       .quad   0xBFE251FED1A7F445 /* A02 = -5.725092024655472622285e-01 */
> -       .quad   0x3FB745699FCABDB9 /* A03 = +9.090290213747821701507e-02 */
> -       .quad   0xBF93E60456E4EE1D /* A00 = -1.943213253365004902773e-02 */
> -       .quad   0x3FF3E1A14E628A59 /* A01 = +1.242585474196536532432e+00 */
> -       .quad   0xBFE16C5AB660E876 /* A02 = -5.444768488007543094653e-01 */
> -       .quad   0x3FB5AD33AA8C188F /* A03 = +8.467410005332197397987e-02 */
> -       .quad   0x3F738C17C47C7961 /* A00 = +4.772274820224659853951e-03 */
> -       .quad   0x3FF3234DDE3BD146 /* A01 = +1.196119182682268355933e+00 */
> -       .quad   0xBFE078C0D77A9D3B /* A02 = -5.147403915952176722826e-01 */
> -       .quad   0x3FB40D74B3E276B8 /* A03 = +7.833032027925923568290e-02 */
> -       .quad   0x3FA0474BECC689C7 /* A00 = +3.179394975019849550746e-02 */
> -       .quad   0x3FF256FB4FA7D18A /* A01 = +1.146235762743432307076e+00 */
> -       .quad   0xBFDEFA8E3FB285E2 /* A02 = -4.840427038235174395098e-01 */
> -       .quad   0x3FB270C007493D59 /* A03 = +7.203293016322244446403e-02 */
> -       .quad   0x3FAF5BD51E479BDC /* A00 = +6.124750132203590768931e-02 */
> -       .quad   0x3FF18081D0B53BC5 /* A01 = +1.093873801484492647162e+00 */
> -       .quad   0xBFDCFE2439BD0C03 /* A02 = -4.530115665294831006626e-01 */
> -       .quad   0x3FB0DEFE5A45AFDD /* A03 = +6.590261176978580437424e-02 */
> -       .quad   0x3FB7BD5D2806EA26 /* A00 = +9.273321368429118805032e-02 */
> -       .quad   0x3FF0A369E35B4440 /* A01 = +1.039895904647224256223e+00 */
> -       .quad   0xBFDB04BC5C9951E7 /* A02 = -4.221640495573226181669e-01 */
> -       .quad   0x3FAEBBBAA9D6DEEF /* A03 = +6.002600978120919278380e-02 */
> -       .quad   0x3FC01BE411098DBC /* A00 = +1.258511622610124502941e-01 */
> -       .quad   0x3FEF85BDABC031C1 /* A01 = +9.850757936961188621083e-01 */
> -       .quad   0xBFD91521375097C2 /* A02 = -3.919146576102968682065e-01 */
> -       .quad   0x3FABE26F0086D982 /* A03 = +5.446192628317005068883e-02 */
> -       .quad   0x3FC481D7FF5776B9 /* A00 = +1.602125164781023347604e-01 */
> -       .quad   0x3FEDC3506C1E7218 /* A01 = +9.300920592973538347792e-01 */
> -       .quad   0xBFD7349A88DA7D4F /* A02 = -3.625856720409119104964e-01 */
> -       .quad   0x3FA936E2DFF8E2AE /* A03 = +4.924687370334389358018e-02 */
> -       .quad   0x3FC90471F96FA27A /* A00 = +1.954481571149420671141e-01 */
> -       .quad   0x3FEC0451601987A2 /* A01 = +8.755270840595026360376e-01 */
> -       .quad   0xBFD5671CD4B898DC /* A02 = -3.344184949259110251063e-01 */
> -       .quad   0x3FA6BB9594603B67 /* A03 = +4.439990459660841243261e-02 */
> -       .quad   0x3FCFD8ADB9ED944C /* A00 = +2.488000066615846384011e-01 */
> -       .quad   0x3FE978C073F6809A /* A01 = +7.959902062321078108909e-01 */
> -       .quad   0xBFD2DF7E00BCD5A9 /* A02 = -2.948908812716931060471e-01 */
> -       .quad   0x3FA3614033D490B2 /* A03 = +3.785133965200894456959e-02 */
> -       .quad   0x3FD4846A12AFE5A0 /* A00 = +3.205819303981005674586e-01 */
> -       .quad   0x3FE63A1147D40472 /* A01 = +6.945883181471244061100e-01 */
> -       .quad   0xBFCFA2268AD34450 /* A02 = -2.471359422548027318101e-01 */
> -       .quad   0x3F9F150201D9FFE0 /* A03 = +3.035357605267552383310e-02 */
> -       .quad   0x3FD9018641F82BEB /* A00 = +3.907180446846598154131e-01 */
> -       .quad   0x3FE33B7C220FFBDC /* A01 = +6.010113396913498995389e-01 */
> -       .quad   0xBFCA4E4187E29C86 /* A02 = -2.055131829740483584423e-01 */
> -       .quad   0x3F98C30CED19F8F4 /* A03 = +2.418155858185229434287e-02 */
> -       .quad   0x3FDD4B8255BEB078 /* A00 = +4.577337109901757905561e-01 */
> -       .quad   0x3FE0858B19D3A49B /* A01 = +5.163016800335243905451e-01 */
> -       .quad   0xBFC5BC929EACE564 /* A02 = -1.698172831327539045176e-01 */
> -       .quad   0x3F93A083CE57DE2B /* A03 = +1.916700312537337677621e-02 */
> -       .quad   0x3FE0A8E5E039295C /* A00 = +5.206174258576470315063e-01 */
> -       .quad   0x3FDC35E1234583FE /* A01 = +4.407885403107342225937e-01 */
> -       .quad   0xBFC1DE034E31AEB9 /* A02 = -1.395877963835710222629e-01 */
> -       .quad   0x3F8EFDEBB3471BDC /* A03 = +1.513275280821162888101e-02 */
> -       .quad   0x3FE2851B603CB2A5 /* A00 = +5.787484054213406503564e-01 */
> -       .quad   0x3FD7F4A44ABBB286 /* A01 = +3.743067483726821853551e-01 */
> -       .quad   0xBFBD3EEB67087DE7 /* A02 = -1.142413260026767657385e-01 */
> -       .quad   0x3F8864F38329E8BD /* A03 = +1.191129917173260922836e-02 */
> -       .quad   0x3FE437DBE3C34AC1 /* A00 = +6.318187187665317283702e-01 */
> -       .quad   0x3FD43F6F789441B5 /* A01 = +3.163717916040938438194e-01 */
> -       .quad   0xBFB7D92E7901B9A4 /* A02 = -9.315767721429907277653e-02 */
> -       .quad   0x3F8327ED342308E1 /* A03 = +9.353497651663324544136e-03 */
> -       .quad   0x3FE5C0977766D55C /* A00 = +6.797597248138731451661e-01 */
> -       .quad   0x3FD10B42A764D8F9 /* A01 = +2.663122782427219115142e-01 */
> -       .quad   0xBFB3633351D3D70F /* A02 = -7.573242900602060456716e-02 */
> -       .quad   0x3F7E079E30FF899C /* A03 = +7.331483779099558922843e-03 */
> -       .quad   0x3FE7202CE08A88C4 /* A00 = +7.226776490754436288455e-01 */
> -       .quad   0x3FCC973EB5662B01 /* A01 = +2.233656297433626314319e-01 */
> -       .quad   0xBFAF70A455F9920B /* A02 = -6.140626477716545211782e-02 */
> -       .quad   0x3F77812411CE99B6 /* A03 = +5.738392731393584730859e-03 */
> -       .quad   0x3FE85879424095B1 /* A00 = +7.608000082006382003286e-01 */
> -       .quad   0x3FC7E73BD1674D84 /* A01 = +1.867441914060742336190e-01 */
> -       .quad   0xBFA96F84E4BF333B /* A02 = -4.967894832916504993525e-02 */
> -       .quad   0x3F72606DDCA6E117 /* A03 = +4.486493251924870105662e-03 */
> -       .quad   0x3FE96BFE4957F4DD /* A00 = +7.944327766887472330737e-01 */
> -       .quad   0x3FC3ED4780D25478 /* A01 = +1.556786898624158421711e-01 */
> -       .quad   0xBFA489C5F9A56B58 /* A02 = -4.011362717093075458408e-02 */
> -       .quad   0x3F6CB5DC17E9AD2A /* A03 = +3.504686231556104931972e-03 */
> -       .quad   0x3FEA5D9CB2F41234 /* A00 = +8.239272589858672724006e-01 */
> -       .quad   0x3FC091A758374DCF /* A01 = +1.294449978582705440555e-01 */
> -       .quad   0xBFA08E436D4B5CE0 /* A02 = -3.233538350257858517978e-02 */
> -       .quad   0x3F666997AD53E6B7 /* A03 = +2.735897297154145629133e-03 */
> -       .quad   0x3FEB3060342CB850 /* A00 = +8.496552485501158713532e-01 */
> -       .quad   0x3FBB7D30BBC7DC1B /* A01 = +1.073790033768634993860e-01 */
> -       .quad   0xBF9AA6BA3443D9E3 /* A02 = -2.602663940430173170060e-02 */
> -       .quad   0x3F617CA764B7850B /* A03 = +2.134634914668814050648e-03 */
> -       .quad   0x3FEBE759A6A0C7B8 /* A00 = +8.719909910635044170135e-01 */
> -       .quad   0x3FB6C10DE6A703FF /* A01 = +8.888327485239243264115e-02 */
> -       .quad   0xBF956C566D8BE1F6 /* A02 = -2.092108768099084498138e-02 */
> -       .quad   0x3F5B46D1A4A59CF8 /* A03 = +1.664833764687232917079e-03 */
> -       .quad   0x3FEC858494887A04 /* A00 = +8.912985707318630268503e-01 */
> -       .quad   0x3FB2CC31F543394D /* A01 = +7.342827070099140762682e-02 */
> -       .quad   0xBF9133477FF69137 /* A02 = -1.679717749142747504343e-02 */
> -       .quad   0x3F5544482FBB4DA5 /* A03 = +1.298017973501022466823e-03 */
> -       .quad   0x3FED0DB59D0E32E9 /* A00 = +9.079235141267335551518e-01 */
> -       .quad   0x3FAF006BAFFC6EF4 /* A01 = +6.055008433597022787787e-02 */
> -       .quad   0xBF8B97146FA2B97A /* A02 = -1.347175565419144252499e-02 */
> -       .quad   0x3F5093B01F4CDC69 /* A03 = +1.011774057770665211434e-03 */
> -       .quad   0x3FEDB487C3EC457C /* A00 = +9.282873942012623835751e-01 */
> -       .quad   0x3FA7390C09D0BD1D /* A01 = +4.535710925881118044112e-02 */
> -       .quad   0xBF83D9F7C3181106 /* A02 = -9.693084374710735778846e-03 */
> -       .quad   0x3F46E34A0A3C0E64 /* A03 = +6.984817050299072134500e-04 */
> -       .quad   0x3FEE5FFCB4E6EB00 /* A00 = +9.492171796076434020506e-01 */
> -       .quad   0x3F9F4913ED00AADF /* A01 = +3.055220731782070861526e-02 */
> -       .quad   0xBF79670BD0E59B5C /* A02 = -6.201788097633133961528e-03 */
> -       .quad   0x3F3BC998EBCAF96D /* A03 = +4.240034429975534616304e-04 */
> -       .quad   0x3FEEDBA41E9542FE /* A00 = +9.643116566968215064293e-01 */
> -       .quad   0x3F94F5DD18D9C24D /* A01 = +2.046914543319848858727e-02 */
> -       .quad   0xBF7034896AA122B9 /* A02 = -3.956352980886528904192e-03 */
> -       .quad   0x3F30DCCB47810B39 /* A03 = +2.573009765038273091199e-04 */
> -       .quad   0x3FEF33F2882520ED /* A00 = +9.750912341196716903724e-01 */
> -       .quad   0x3F8BF37F2CF553FF /* A01 = +1.364802699996836392315e-02 */
> -       .quad   0xBF649F6F05A69619 /* A02 = -2.517430152880317534986e-03 */
> -       .quad   0x3F247623C950AAC9 /* A03 = +1.561087307505231250044e-04 */
> -       .quad   0x3FEF727757751741 /* A00 = +9.827229221489021115943e-01 */
> -       .quad   0x3F828E67912C4400 /* A01 = +9.060677640748693306705e-03 */
> -       .quad   0xBF5A2F51A806CC2C /* A02 = -1.598195784123355826789e-03 */
> -       .quad   0x3F18D35D7687E613 /* A03 = +9.470231965016282719549e-05 */
> -       .quad   0x3FEF9E6325C5942A /* A00 = +9.880843866091073568469e-01 */
> -       .quad   0x3F788AB117618F76 /* A01 = +5.991641772286606867914e-03 */
> -       .quad   0xBF5096EAB0B1EA89 /* A02 = -1.012543859160305046233e-03 */
> -       .quad   0x3F0E1E50EC4435AB /* A03 = +5.744633156910412119652e-05 */
> -       .quad   0x3FEFBD0784049369 /* A00 = +9.918248728250605994461e-01 */
> -       .quad   0x3F702BBD8294035F /* A01 = +3.947963975634432264028e-03 */
> -       .quad   0xBF44FB55E0F00593 /* A02 = -6.403130845457509273330e-04 */
> -       .quad   0x3F0244DCD723230A /* A03 = +3.484534217219031730379e-05 */
> -       .quad   0x3FEFD245E2366A43 /* A00 = +9.944180887426415926811e-01 */
> -       .quad   0x3F653D82EC088433 /* A01 = +2.592807490387838333795e-03 */
> -       .quad   0xBF3A7DF75E013CB8 /* A02 = -4.042366908878036561859e-04 */
> -       .quad   0x3EF6298E69F991CD /* A03 = +2.113564425911141559972e-05 */
> -       .quad   0x3FEFE0EAA508BC69 /* A00 = +9.962056372950317539861e-01 */
> -       .quad   0x3F5BD0771AF3FDDA /* A01 = +1.697651208644282514598e-03 */
> -       .quad   0xBF30B2E1254DE571 /* A02 = -2.548026725928887099328e-04 */
> -       .quad   0x3EEAE28B70EC0256 /* A03 = +1.281973848454955042307e-05 */
> -       .quad   0x3FEFEAF5303D7F96 /* A00 = +9.974313680831865536192e-01 */
> -       .quad   0x3F5229111365657E /* A01 = +1.108423877289460134782e-03 */
> -       .quad   0xBF250572D04DFE66 /* A02 = -1.603796628408704519168e-04 */
> -       .quad   0x3EE04E89BB57C981 /* A03 = +7.775682983689149966743e-06 */
> -       .quad   0x3FEFF1CF52F1CF44 /* A00 = +9.982678051005469122003e-01 */
> -       .quad   0x3F47A71316147CEB /* A01 = +7.218211359577819110842e-04 */
> -       .quad   0xBF1A6D7604055719 /* A02 = -1.008132248946049582547e-04 */
> -       .quad   0x3ED3C8047586A85C /* A03 = +4.716233739913014633626e-06 */
> -       .quad   0x3FEFF6770369EF69 /* A00 = +9.988360468555416149528e-01 */
> -       .quad   0x3F3EBB261180FBF0 /* A01 = +4.689186039321105101130e-04 */
> -       .quad   0xBF1097754FE19D7F /* A02 = -6.329206004950480057066e-05 */
> -       .quad   0x3EC7FEFF83BCA0A7 /* A03 = +2.860556404988488738366e-06 */
> -       .quad   0x3FEFF99D42371AC4 /* A00 = +9.992204945818561334647e-01 */
> -       .quad   0x3F33EB2AEC271F59 /* A01 = +3.039340773764907474054e-04 */
> -       .quad   0xBF04CF18E0FC0D79 /* A02 = -3.968996690952969588805e-05 */
> -       .quad   0x3EBD1BDBD6019BE9 /* A03 = +1.735021065507727833886e-06 */
> -       .quad   0x3FEFFBBCA32B0D91 /* A00 = +9.994795977476532700123e-01 */
> -       .quad   0x3F29C41E1615110A /* A01 = +1.965796209707565346710e-04 */
> -       .quad   0xBEFA11F93D9DCB5A /* A02 = -2.486248909101414873235e-05 */
> -       .quad   0x3EB1A7CA4546F7A7 /* A03 = +1.052345642723709228769e-06 */
> -       .quad   0x3FEFFD298B8E8DE2 /* A00 = +9.996535993308806045121e-01 */
> -       .quad   0x3F20A1C42D523C5B /* A01 = +1.268913244172078754520e-04 */
> -       .quad   0xBEF0507A364AFAE4 /* A02 = -1.555859070622834605755e-05 */
> -       .quad   0x3EA56ACA17E7CDF4 /* A03 = +6.382806956848098872313e-07 */
> -       .quad   0x3FEFFE1DC82BA5A3 /* A00 = +9.997700604991915929176e-01 */
> -       .quad   0x3F156E73B90F1769 /* A01 = +8.175450626798714452801e-05 */
> -       .quad   0xBEE4663579D0A09F /* A02 = -9.727122057226747625365e-06 */
> -       .quad   0x3E99FAF6FEC5D4C1 /* A03 = +3.871371052824002996020e-07 */
> -       .quad   0x3FEFFEF8D0BB5E81 /* A00 = +9.998745037837154514548e-01 */
> -       .quad   0x3F06686DA18D39C3 /* A01 = +4.273972098777251447726e-05 */
> -       .quad   0xBED46BC298073E90 /* A02 = -4.868731025855742842491e-06 */
> -       .quad   0x3E88E42286B9D0FD /* A03 = +1.854535328530838170114e-07 */
> -       .quad   0x3FEFFF8DBC68DDC7 /* A00 = +9.999455146670975791423e-01 */
> -       .quad   0x3EF26B2953A80AF0 /* A01 = +1.756534514108903368909e-05 */
> -       .quad   0xBEBFC4472D580F83 /* A02 = -1.893443529411295465239e-06 */
> -       .quad   0x3E72505B4553D19F /* A03 = +6.822456673547912277047e-08 */
> -       .quad   0x3FEFFFCED1276609 /* A00 = +9.999765477215883935358e-01 */
> -       .quad   0x3EDE1A94C7CC58F5 /* A01 = +7.177313020153979672606e-06 */
> -       .quad   0xBEA8A2C988744E57 /* A02 = -7.342066660497443762363e-07 */
> -       .quad   0x3E5AF30036BBBAF4 /* A03 = +2.509841882843541084885e-08 */
> -       .quad   0x3FEFFFEAFE70FCFC /* A00 = +9.999899835164849370983e-01 */
> -       .quad   0x3EC879175E3549F5 /* A01 = +2.917410471128503564412e-06 */
> -       .quad   0xBE930E36677D1813 /* A02 = -2.839493400307523115929e-07 */
> -       .quad   0x3E43D4005B42D48F /* A03 = +9.233192745401904898013e-09 */
> -       .quad   0x3ff0000000000000
> -       .quad   0x0000000000000000
> -       .quad   0x0000000000000000
> -       .quad   0x0000000000000000
> -       .align  32
> -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSignMask */
> -       .align  32
> -       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _sAbsMask */
> -       .align  32
> -       .long   0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000 /* _iExpMantMask */
> -       .align  32
> -       .long   0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000 /* _iExpMask */
> -       .align  32
> -       .long   0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000 /* _iMinIdxOfsMask */
> -       .align  32
> -       .long   0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000 /* _iMaxIdxMask */
> -       .align  32
> -       .type   __svml_stanh_data_internal, @object
> -       .size   __svml_stanh_data_internal, .-__svml_stanh_data_internal
> +       /* All results have been written to 32(%rsp).  */

Please remove the 32 from this comment; the load below reads from (%rsp).
> +       vmovups (%rsp), %ymm0
> +       /* Restore rsp.  */
> +       movq    %r13, %rsp
> +       cfi_def_cfa_register(rsp)
> +       /* Restore callee save registers.  */
> +       popq    %rbp
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_restore(rbp)
> +       popq    %rbx
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_restore(rbp)
> +       popq    %r13
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_restore(r13)
> +       ret
> +END(_ZGVdN8v_tanhf_avx2)
> --
> 2.34.1
>


-- 
H.J.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v3 7/7] x86: Optimize svml_s_tanhf4_core_sse4.S
  2022-06-09 16:58   ` [PATCH v3 7/7] x86: Optimize svml_s_tanhf4_core_sse4.S Noah Goldstein
@ 2022-06-09 17:10     ` H.J. Lu
  0 siblings, 0 replies; 48+ messages in thread
From: H.J. Lu @ 2022-06-09 17:10 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Thu, Jun 9, 2022 at 9:59 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Optimizations are:
>     1. Reduce code size (-112 bytes).
>     2. Remove redundant move instructions.
>     3. Slightly improve instruction selection/scheduling where
>        possible.
>     4. Prefer registers which get short instruction encoding.
>     5. Reduce rodata size (-4k+ rodata is shared with avx2).
>
> Result is roughly a 15-16% speedup:
>
>        Function, New Time, Old Time, New / Old
>  _ZGVbN4v_tanhf,    3.158,    3.749,     0.842
> ---
>  .../fpu/multiarch/svml_s_tanhf4_core_sse4.S   | 865 +++---------------
>  1 file changed, 138 insertions(+), 727 deletions(-)
>
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
> index 532ebbac65..bf7687d8ba 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
> @@ -70,761 +70,172 @@
>   *
>   */
>
> -/* Offsets for data table __svml_stanh_data_internal
> - */
> -#define _dbP                           0
> -#define _sSignMask                     4288
> -#define _sAbsMask                      4304
> -#define _iExpMantMask                  4320
> -#define _iExpMask                      4336
> -#define _iMinIdxOfsMask                        4352
> -#define _iMaxIdxMask                   4368
>
>  #include <sysdep.h>
>
> +/* tanhf data tables for avx2 and sse4 implementations defined here.
> + */
> +#define ONLY_DECL_OFFSET
> +#include "svml_s_tanhf_rodata.S"
> +
>         .section .text.sse4, "ax", @progbits
>  ENTRY(_ZGVbN4v_tanhf_sse4)
> -       subq    $72, %rsp
> -       cfi_def_cfa_offset(80)
> -       movaps  %xmm0, %xmm5
> +       /* Save copy of input in xmm12.  */
> +       movaps  %xmm0, %xmm12
>
>         /* Here huge arguments, INF and NaNs are filtered out to callout. */
> -       movdqu  _iExpMantMask+__svml_stanh_data_internal(%rip), %xmm9
> -       lea     _dbP+16+__svml_stanh_data_internal(%rip), %r8
> -       pand    %xmm5, %xmm9
> +       movdqu  TANHF_DATA(_iExpMantMask)(%rip), %xmm3
> +       pand    %xmm0, %xmm3
>
> -       /* if VMIN, VMAX is defined for I type */
> +
> +       /* Selection of arguments between [0, 0x04280000] into xmm3.  */
>         pxor    %xmm7, %xmm7
> -       movdqa  %xmm9, %xmm6
> -       psubd   _iMinIdxOfsMask+__svml_stanh_data_internal(%rip), %xmm9
> +       /* Save xmm3 for special values check at end.  */
> +       movdqa  %xmm3, %xmm8
> +       psubd   TANHF_DATA(_iMinIdxOfsMask)(%rip), %xmm3
> +       pmaxsd  %xmm7, %xmm3
> +       pminsd  TANHF_DATA(_iMaxIdxMask)(%rip), %xmm3
> +       psrld   $14, %xmm3
> +
> +       movq    %xmm3, %rcx
> +       movl    %ecx, %edx
> +       shrq    $32, %rcx
> +
> +       pshufd  $0x0e, %xmm3, %xmm3
> +       movq    %xmm3, %rdi
> +       movl    %edi, %esi
> +       shrq    $32, %rdi
> +
> +       movaps  TANHF_DATA(_sAbsMask)(%rip), %xmm1
> +       andps   %xmm1, %xmm0
> +
> +       leaq    TANHF_DATA(_lookupTable)(%rip), %rax
> +       movups  (%rdx, %rax), %xmm2
> +       movups  (%rcx, %rax), %xmm6
>
>         /*
>          *  small table specific variables *
>          *  Constant loading
>          */
> -       movdqu  _iMaxIdxMask+__svml_stanh_data_internal(%rip), %xmm10
> -       movdqa  %xmm9, %xmm11
> -       movdqa  %xmm9, %xmm8
> -       pcmpgtd %xmm10, %xmm11
> -       pcmpgtd %xmm7, %xmm8
> -       movdqa  %xmm11, %xmm14
> -       pand    %xmm8, %xmm9
> -       andps   %xmm11, %xmm10
> -       andnps  %xmm9, %xmm14
> -       orps    %xmm10, %xmm14
> -       psrld   $14, %xmm14
> -       movd    %xmm14, %edx
> -       pshufd  $1, %xmm14, %xmm12
> -       pshufd  $2, %xmm14, %xmm13
> -       movd    %xmm12, %ecx
> -       pshufd  $3, %xmm14, %xmm15
> -       movups  _sAbsMask+__svml_stanh_data_internal(%rip), %xmm3
> -       movslq  %edx, %rdx
> -       andps   %xmm5, %xmm3
> -       movslq  %ecx, %rcx
> -       pcmpgtd _iExpMask+__svml_stanh_data_internal(%rip), %xmm6
> -       movd    %xmm13, %esi
> -       movups  -16(%rdx, %r8), %xmm2
> -       movaps  %xmm2, %xmm0
> -       movd    %xmm15, %edi
> -       movmskps %xmm6, %eax
> -       movups  -16(%rcx, %r8), %xmm6
> -       unpcklpd %xmm6, %xmm0
> +       movaps  %xmm2, %xmm4
> +       movlhps %xmm6, %xmm4
>         unpckhpd %xmm6, %xmm2
> -       cvtps2pd %xmm3, %xmm6
> -       movhlps %xmm3, %xmm3
> -       cvtps2pd %xmm3, %xmm3
> -       movslq  %esi, %rsi
> -       movslq  %edi, %rdi
> -       movups  (%rcx, %r8), %xmm8
> -       movups  (%rdx, %r8), %xmm12
> -       movups  (%rsi, %r8), %xmm13
> -       movaps  %xmm12, %xmm10
> -       movups  (%rdi, %r8), %xmm9
> +
> +       cvtps2pd %xmm0, %xmm6
> +       movhlps %xmm0, %xmm0
> +       cvtps2pd %xmm0, %xmm0
> +
> +       movups  16(%rdx, %rax), %xmm5
> +       movups  16(%rsi, %rax), %xmm13
> +
> +       movaps  %xmm5, %xmm10
>         movaps  %xmm13, %xmm11
> -       unpckhpd %xmm8, %xmm12
> -       unpckhpd %xmm9, %xmm13
> -       mulpd   %xmm6, %xmm12
> -       mulpd   %xmm3, %xmm13
> -       unpcklpd %xmm8, %xmm10
> -       unpcklpd %xmm9, %xmm11
> -       addpd   %xmm10, %xmm12
> +
> +       movups  16(%rcx, %rax), %xmm7
> +       movups  16(%rdi, %rax), %xmm3
> +
> +       unpckhpd %xmm7, %xmm5
> +       unpckhpd %xmm3, %xmm13
> +
> +       mulpd   %xmm6, %xmm5
> +       mulpd   %xmm0, %xmm13
> +
> +       movlhps %xmm7, %xmm10
> +       movlhps %xmm3, %xmm11
> +
> +       addpd   %xmm10, %xmm5
>         addpd   %xmm11, %xmm13
> -       mulpd   %xmm6, %xmm12
> -       mulpd   %xmm3, %xmm13
> -       addpd   %xmm2, %xmm12
> -       movups  -16(%rsi, %r8), %xmm1
> -       movups  -16(%rdi, %r8), %xmm7
> -       movaps  %xmm1, %xmm14
> -       unpckhpd %xmm7, %xmm1
> -       addpd   %xmm1, %xmm13
> -       mulpd   %xmm12, %xmm6
> -       mulpd   %xmm13, %xmm3
> -       addpd   %xmm0, %xmm6
> -       unpcklpd %xmm7, %xmm14
> -       addpd   %xmm14, %xmm3
> -       cvtpd2ps %xmm6, %xmm0
> -       cvtpd2ps %xmm3, %xmm1
> -       movups  _sSignMask+__svml_stanh_data_internal(%rip), %xmm4
> -       movlhps %xmm1, %xmm0
> -       andps   %xmm5, %xmm4
> -       orps    %xmm4, %xmm0
> -       testl   %eax, %eax
>
> -       /* Go to special inputs processing branch */
> -       jne     L(SPECIAL_VALUES_BRANCH)
> -       # LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm5
> +       mulpd   %xmm6, %xmm5
> +       mulpd   %xmm0, %xmm13
>
> -       /* Restore registers
> -        * and exit the function
> -        */
> +       addpd   %xmm2, %xmm5
>
> -L(EXIT):
> -       addq    $72, %rsp
> -       cfi_def_cfa_offset(8)
> -       ret
> -       cfi_def_cfa_offset(80)
> +       movups  (%rsi, %rax), %xmm2
> +       movups  (%rdi, %rax), %xmm7
>
> -       /* Branch to process
> -        * special inputs
> -        */
> +       movaps  %xmm2, %xmm3
>
> -L(SPECIAL_VALUES_BRANCH):
> -       movups  %xmm5, 32(%rsp)
> -       movups  %xmm0, 48(%rsp)
> -       # LOE rbx rbp r12 r13 r14 r15 eax
> -
> -       xorl    %edx, %edx
> -       movq    %r12, 16(%rsp)
> -       cfi_offset(12, -64)
> -       movl    %edx, %r12d
> -       movq    %r13, 8(%rsp)
> -       cfi_offset(13, -72)
> -       movl    %eax, %r13d
> -       movq    %r14, (%rsp)
> -       cfi_offset(14, -80)
> -       # LOE rbx rbp r15 r12d r13d
> -
> -       /* Range mask
> -        * bits check
> -        */
> +       unpckhpd %xmm7, %xmm2
> +       movlhps %xmm7, %xmm3
>
> -L(RANGEMASK_CHECK):
> -       btl     %r12d, %r13d
> +       addpd   %xmm13, %xmm2
>
> -       /* Call scalar math function */
> -       jc      L(SCALAR_MATH_CALL)
> -       # LOE rbx rbp r15 r12d r13d
> +       mulpd   %xmm5, %xmm6
> +       addpd   %xmm4, %xmm6
>
> -       /* Special inputs
> -        * processing loop
> -        */
> +       mulpd   %xmm2, %xmm0
> +       addpd   %xmm3, %xmm0
>
> -L(SPECIAL_VALUES_LOOP):
> -       incl    %r12d
> -       cmpl    $4, %r12d
> -
> -       /* Check bits in range mask */
> -       jl      L(RANGEMASK_CHECK)
> -       # LOE rbx rbp r15 r12d r13d
> -
> -       movq    16(%rsp), %r12
> -       cfi_restore(12)
> -       movq    8(%rsp), %r13
> -       cfi_restore(13)
> -       movq    (%rsp), %r14
> -       cfi_restore(14)
> -       movups  48(%rsp), %xmm0
> -
> -       /* Go to exit */
> -       jmp     L(EXIT)
> -       cfi_offset(12, -64)
> -       cfi_offset(13, -72)
> -       cfi_offset(14, -80)
> -       # LOE rbx rbp r12 r13 r14 r15 xmm0
> +       cvtpd2ps %xmm0, %xmm2
> +       cvtpd2ps %xmm6, %xmm0
>
> -       /* Scalar math fucntion call
> -        * to process special input
> -        */
> +       movlhps %xmm2, %xmm0
> +       andnps  %xmm12, %xmm1
> +       orps    %xmm1, %xmm0
>
> -L(SCALAR_MATH_CALL):
> -       movl    %r12d, %r14d
> -       movss   32(%rsp, %r14, 4), %xmm0
> -       call    tanhf@PLT
> -       # LOE rbx rbp r14 r15 r12d r13d xmm0
> +       /* xmm8 contains mask of special values.  */
> +       pcmpgtd TANHF_DATA(_iExpMask)(%rip), %xmm8
>
> -       movss   %xmm0, 48(%rsp, %r14, 4)
> +       movmskps %xmm8, %edx
> +       testl   %edx, %edx
>
> -       /* Process special inputs in loop */
> -       jmp     L(SPECIAL_VALUES_LOOP)
> -       # LOE rbx rbp r15 r12d r13d
> -END(_ZGVbN4v_tanhf_sse4)
> +       /* Go to special inputs processing branch */
> +       jne     L(SPECIAL_VALUES_BRANCH)
> +       # LOE rbx rbp r12 r13 r14 r15 xmm0
> +       /* No stack restoration on the fastpath.  */
> +       ret
>
> -       .section .rodata, "a"
> -       .align  16
> -
> -#ifdef __svml_stanh_data_internal_typedef
> -typedef unsigned int VUINT32;
> -typedef struct {
> -       __declspec(align(16)) VUINT32 _dbP[(134*4)][2];
> -       __declspec(align(16)) VUINT32 _sSignMask[4][1];
> -       __declspec(align(16)) VUINT32 _sAbsMask[4][1];
> -       __declspec(align(16)) VUINT32 _iExpMantMask[4][1];
> -       __declspec(align(16)) VUINT32 _iExpMask[4][1];
> -       __declspec(align(16)) VUINT32 _iMinIdxOfsMask[4][1];
> -       __declspec(align(16)) VUINT32 _iMaxIdxMask[4][1];
> -} __svml_stanh_data_internal;
> -#endif
> -__svml_stanh_data_internal:
> -       /* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500]. */
> -       .quad   0x0000000000000000 /* A00 = +0.000000000000000000000e-01 */
> -       .quad   0x3FF00000022C70EB /* A01 = +1.000000008097283510367e+00 */
> -       .quad   0xBED00E878CFFA194 /* A02 = -3.828228912518614443549e-06 */
> -       .quad   0xBFD551766D0607A9 /* A03 = -3.330970825846813476723e-01 */
> -       .quad   0xBE53D60CE3E4C297 /* A00 = -1.847383956330407336230e-08 */
> -       .quad   0x3FF000024177CF5C /* A01 = +1.000002151235967140508e+00 */
> -       .quad   0xBF1758BC94A51A25 /* A02 = -8.906031613262943753568e-05 */
> -       .quad   0xBFD53EAE67E0D4F0 /* A03 = -3.319507612644221339337e-01 */
> -       .quad   0xBE5A9E47EF32D6FE /* A00 = -2.479020984039698285657e-08 */
> -       .quad   0x3FF00002DA983057 /* A01 = +1.000002721676556793895e+00 */
> -       .quad   0xBF1BD953509E94AA /* A02 = -1.062352277175377670507e-04 */
> -       .quad   0xBFD53BDB562EEDD5 /* A03 = -3.317783681520414806876e-01 */
> -       .quad   0xBE6191BBE496D294 /* A00 = -3.272532162914017685901e-08 */
> -       .quad   0x3FF0000390492017 /* A01 = +1.000003398528866105366e+00 */
> -       .quad   0xBF20727E814A57CE /* A02 = -1.254825043772153972919e-04 */
> -       .quad   0xBFD538DE060A6F22 /* A03 = -3.315959033004550748913e-01 */
> -       .quad   0xBE66DAFA2A893A25 /* A00 = -4.257146219278012568149e-08 */
> -       .quad   0x3FF0000465E08CD1 /* A01 = +1.000004194219219266770e+00 */
> -       .quad   0xBF2341C765EF91B6 /* A02 = -1.469188600530365522261e-04 */
> -       .quad   0xBFD535B6841FAF9E /* A03 = -3.314033785124993469751e-01 */
> -       .quad   0xBE6D5794E361E964 /* A00 = -5.465394929765249413434e-08 */
> -       .quad   0x3FF000055EE2A0CB /* A01 = +1.000005121846742950353e+00 */
> -       .quad   0xBF265E6C77E66C8B /* A02 = -1.706607253709506650304e-04 */
> -       .quad   0xBFD53264DDCCEDA6 /* A03 = -3.312008062382240103361e-01 */
> -       .quad   0xBE729C844D374A6E /* A00 = -6.933284462462096107184e-08 */
> -       .quad   0x3FF000067F019093 /* A01 = +1.000006195180536350264e+00 */
> -       .quad   0xBF29CC5348D6DCE5 /* A02 = -1.968242326435338705130e-04 */
> -       .quad   0xBFD52EE92121ED35 /* A03 = -3.309881995734998416658e-01 */
> -       .quad   0xBE775AEA17EAA872 /* A00 = -8.700465590574974405858e-08 */
> -       .quad   0x3FF00007CA1D66B8 /* A01 = +1.000007428656699559610e+00 */
> -       .quad   0xBF2D8F5EB98A2637 /* A02 = -2.255252009216044881395e-04 */
> -       .quad   0xBFD52B435CDF9128 /* A03 = -3.307655722585587376727e-01 */
> -       .quad   0xBE7D04DA28C343F0 /* A00 = -1.081040272327705484794e-07 */
> -       .quad   0x3FF000094443CCF5 /* A01 = +1.000008837375216730337e+00 */
> -       .quad   0xBF30D5B76C947AE5 /* A02 = -2.568791210978817814332e-04 */
> -       .quad   0xBFD52773A0776FAD /* A03 = -3.305329386764651045105e-01 */
> -       .quad   0xBE81DD77A12C51C7 /* A00 = -1.331054169875768625701e-07 */
> -       .quad   0x3FF0000AF1AFD2DA /* A01 = +1.000010437096696680470e+00 */
> -       .quad   0xBF331230624C1680 /* A02 = -2.910011410651516805537e-04 */
> -       .quad   0xBFD52379FC0B61DF /* A03 = -3.302903138515186909352e-01 */
> -       .quad   0xBE85D04EEEB3C435 /* A00 = -1.625247628488202841012e-07 */
> -       .quad   0x3FF0000CD6C9B1F2 /* A01 = +1.000012244238970726684e+00 */
> -       .quad   0xBF357F0742FADDD4 /* A02 = -3.280060509313874068243e-04 */
> -       .quad   0xBFD51F56806D0E81 /* A03 = -3.300377134475880880338e-01 */
> -       .quad   0xBE8A6E289B59681B /* A00 = -1.969211333326924655065e-07 */
> -       .quad   0x3FF0000EF8268F72 /* A01 = +1.000014275873550406715e+00 */
> -       .quad   0xBF381E277A1B747A /* A02 = -3.680082682942575423093e-04 */
> -       .quad   0xBFD51B093F1D6FD4 /* A03 = -3.297751537663746734808e-01 */
> -       .quad   0xBE8FCBC40EE9ABD5 /* A00 = -2.368983653301529373887e-07 */
> -       .quad   0x3FF000115A883B6C /* A01 = +1.000016549721943981410e+00 */
> -       .quad   0xBF3AF17AC974B3D9 /* A02 = -4.111218235774406434303e-04 */
> -       .quad   0xBFD516924A4C549C /* A03 = -3.295026517456081105450e-01 */
> -       .quad   0xBE92FFBC60A3F956 /* A00 = -2.831066871072026054144e-07 */
> -       .quad   0x3FF0001402DCED8A /* A01 = +1.000019084151832604590e+00 */
> -       .quad   0xBF3DFAE9390C4801 /* A02 = -4.574603454311488280083e-04 */
> -       .quad   0xBFD511F1B4D7DC3A /* A03 = -3.292202249571719585575e-01 */
> -       .quad   0xBE9690A22F96D5AD /* A00 = -3.362443262393081632612e-07 */
> -       .quad   0x3FF00016F63EFF5D /* A01 = +1.000021898173108825247e+00 */
> -       .quad   0xBF409E2C839605BB /* A02 = -5.071370461992499986334e-04 */
> -       .quad   0xBFD50D27924BEE00 /* A03 = -3.289278916051614487515e-01 */
> -       .quad   0xBE9AA56C65E72A73 /* A00 = -3.970591019557469835586e-07 */
> -       .quad   0x3FF0001A39F4A43E /* A01 = +1.000025011433776978009e+00 */
> -       .quad   0xBF425BD74C3D6667 /* A02 = -5.602647074553602319844e-04 */
> -       .quad   0xBFD50833F6E1ABA2 /* A03 = -3.286256705238718156536e-01 */
> -       .quad   0xBE9F4BD4FF1A83B0 /* A00 = -4.663500013744687071912e-07 */
> -       .quad   0x3FF0001DD36F9EC2 /* A01 = +1.000028444215715683896e+00 */
> -       .quad   0xBF44376634149405 /* A02 = -6.169556656102642569831e-04 */
> -       .quad   0xBFD50316F77EDEE5 /* A03 = -3.283135811757190158922e-01 */
> -       .quad   0xBEA3B625387BB079 /* A00 = -5.874486399249461304297e-07 */
> -       .quad   0x3FF00023E14CFBA9 /* A01 = +1.000034217911642153709e+00 */
> -       .quad   0xBF47392F923218D2 /* A02 = -7.087213783883111826306e-04 */
> -       .quad   0xBFD4FB1FACDEB938 /* A03 = -3.278273761924483942209e-01 */
> -       .quad   0xBEAA6E24F543500A /* A00 = -7.876828740601738750574e-07 */
> -       .quad   0x3FF0002D5C6E8412 /* A01 = +1.000043259679163742959e+00 */
> -       .quad   0xBF4BAF02BD7FDD70 /* A02 = -8.448375110664940040861e-04 */
> -       .quad   0xBFD4EFEE6527A7DE /* A03 = -3.271442401734229177279e-01 */
> -       .quad   0xBEB16E3EBE2157D0 /* A00 = -1.038947396133402500647e-06 */
> -       .quad   0x3FF00038990FEE2F /* A01 = +1.000053975962952312884e+00 */
> -       .quad   0xBF50569481C574CB /* A02 = -9.972048056490652716971e-04 */
> -       .quad   0xBFD4E419278DA2B4 /* A03 = -3.264220129263251113372e-01 */
> -       .quad   0xBEB6A7B6723165D4 /* A00 = -1.350350836279403750524e-06 */
> -       .quad   0x3FF00045CAB4158E /* A01 = +1.000066558657042303793e+00 */
> -       .quad   0xBF531D7C9C849108 /* A02 = -1.166698160951775212202e-03 */
> -       .quad   0xBFD4D7A0BB33B152 /* A03 = -3.256608799117844954552e-01 */
> -       .quad   0xBEBD0EE2A8654AFD /* A00 = -1.732000471561702711532e-06 */
> -       .quad   0x3FF00055276F18D6 /* A01 = +1.000081209219890521211e+00 */
> -       .quad   0xBF562FDBA3FB6C6C /* A02 = -1.354183666925102939860e-03 */
> -       .quad   0xBFD4CA85F1B93DB2 /* A03 = -3.248610363561638125773e-01 */
> -       .quad   0xBEC269D4036A207E /* A00 = -2.195047297096822741730e-06 */
> -       .quad   0x3FF00066E7DA6E4E /* A01 = +1.000098138500919997540e+00 */
> -       .quad   0xBF5991499FC36B3A /* A02 = -1.560518167983372759405e-03 */
> -       .quad   0xBFD4BCC9A72283D6 /* A03 = -3.240226871658341556426e-01 */
> -       .quad   0xBEC7154B6C09CFE1 /* A00 = -2.751729738565190291276e-06 */
> -       .quad   0x3FF0007B47086B80 /* A01 = +1.000117566559055148900e+00 */
> -       .quad   0xBF5D455433B4F8F4 /* A02 = -1.786548832412968197680e-03 */
> -       .quad   0xBFD4AE6CC1BFE145 /* A03 = -3.231460468373550942722e-01 */
> -       .quad   0xBECCA68CC64A0F8A /* A00 = -3.415415948561670285790e-06 */
> -       .quad   0x3FF00092827742F7 /* A01 = +1.000139722473418535387e+00 */
> -       .quad   0xBF60A7BF15A527AF /* A02 = -2.033112728132522705610e-03 */
> -       .quad   0xBFD49F703214084C /* A03 = -3.222313393636155876010e-01 */
> -       .quad   0xBED19E68676B241B /* A00 = -4.200644630977303616698e-06 */
> -       .quad   0x3FF000ACDA037B26 /* A01 = +1.000164844146362863597e+00 */
> -       .quad   0xBF62D99F836A02F8 /* A02 = -2.301036405072284102280e-03 */
> -       .quad   0xBFD48FD4F2B91B28 /* A03 = -3.212787981359945810311e-01 */
> -       .quad   0xBED57CF4B0C7AA54 /* A00 = -5.123164339408145209103e-06 */
> -       .quad   0x3FF000CA8FD9E1A1 /* A01 = +1.000193178099017865534e+00 */
> -       .quad   0xBF653A014548E686 /* A02 = -2.591135484433962181405e-03 */
> -       .quad   0xBFD47F9C0844B38F /* A03 = -3.202886658426046806447e-01 */
> -       .quad   0xBEDA012B1B1A41E2 /* A00 = -6.199971197454598722328e-06 */
> -       .quad   0x3FF000EBE868FDF4 /* A01 = +1.000224979259539459520e+00 */
> -       .quad   0xBF67CA9427E0A544 /* A02 = -2.904214255086275467410e-03 */
> -       .quad   0xBFD46EC6812ADB37 /* A03 = -3.192611943626845749655e-01 */
> -       .quad   0xBEDF3EAC5BF12194 /* A00 = -7.449344990702664567927e-06 */
> -       .quad   0x3FF001112A520784 /* A01 = +1.000260510744255704196e+00 */
> -       .quad   0xBF6A8D01ABDA4DC4 /* A02 = -3.241065277345108255891e-03 */
> -       .quad   0xBFD45D55759FFA4A /* A03 = -3.181966446572103146551e-01 */
> -       .quad   0xBEE2A541BC274267 /* A00 = -8.890883582164319970972e-06 */
> -       .quad   0x3FF0013A9E5961F2 /* A01 = +1.000300043631906721231e+00 */
> -       .quad   0xBF6D82ECD080C540 /* A02 = -3.602468994380686462264e-03 */
> -       .quad   0xBFD44B4A0779C0AD /* A03 = -3.170952866557950611259e-01 */
> -       .quad   0xBEE61D97609A27F4 /* A00 = -1.054553560499505625520e-05 */
> -       .quad   0x3FF001688F56A3AF /* A01 = +1.000343856731187974773e+00 */
> -       .quad   0xBF7056F8EFB683EC /* A02 = -3.989193351487490407647e-03 */
> -       .quad   0xBFD438A5620F0F74 /* A03 = -3.159573991399533543500e-01 */
> -       .quad   0xBEEA145429EDD370 /* A00 = -1.243563138839952927732e-05 */
> -       .quad   0x3FF0019B4A242A67 /* A01 = +1.000392236341804297339e+00 */
> -       .quad   0xBF7207D31CA78D9B /* A02 = -4.401993423445739288258e-03 */
> -       .quad   0xBFD42568BA16E7CD /* A03 = -3.147832696228050619602e-01 */
> -       .quad   0xBEEE96370D52680F /* A00 = -1.458491207477835326165e-05 */
> -       .quad   0x3FF001D31D8E4115 /* A01 = +1.000445476009251821736e+00 */
> -       .quad   0xBF73D4CC11EDC094 /* A02 = -4.841611050196221316400e-03 */
> -       .quad   0xBFD411954D8664E7 /* A03 = -3.135731942252974469021e-01 */
> -       .quad   0xBEF338C046215EF8 /* A00 = -1.833122622260562810219e-05 */
> -       .quad   0x3FF00230C32C2EC1 /* A01 = +1.000534784691737621998e+00 */
> -       .quad   0xBF76BD019BCC5DAF /* A02 = -5.551344188254799492943e-03 */
> -       .quad   0xBFD3F2C7156DC21E /* A03 = -3.116929730668135389848e-01 */
> -       .quad   0xBEF9B15EAE411EAE /* A00 = -2.450261207822986676092e-05 */
> -       .quad   0x3FF002C2DF057A4D /* A01 = +1.000674124886830940184e+00 */
> -       .quad   0xBF7B08CCD9AC1E30 /* A02 = -6.600189396301511801646e-03 */
> -       .quad   0xBFD3C7A7A114FED8 /* A03 = -3.090609620157755976777e-01 */
> -       .quad   0xBF00E36483C373B3 /* A00 = -3.221178528332122595812e-05 */
> -       .quad   0x3FF0036F419480D7 /* A01 = +1.000838524028997644777e+00 */
> -       .quad   0xBF7FD255D1777007 /* A02 = -7.768950679260206403087e-03 */
> -       .quad   0xBFD39A453911D6CE /* A03 = -3.062909180947429588215e-01 */
> -       .quad   0xBF05DFA04DD12059 /* A00 = -4.172046622180685472624e-05 */
> -       .quad   0x3FF00438B2A03D8D /* A01 = +1.001030633695197069599e+00 */
> -       .quad   0xBF828F8DBB4A9D10 /* A02 = -9.062869337255224921890e-03 */
> -       .quad   0xBFD36AAB704697D9 /* A03 = -3.033856007044711255993e-01 */
> -       .quad   0xBF0BF3E0C647DEFB /* A00 = -5.331544597092331081714e-05 */
> -       .quad   0x3FF005221063D36D /* A01 = +1.001253189109060359741e+00 */
> -       .quad   0xBF857A2CB3C96102 /* A02 = -1.048693584122917590862e-02 */
> -       .quad   0xBFD338E65BBB4FEC /* A03 = -3.003478904549854444639e-01 */
> -       .quad   0xBF11A506ED7C9D31 /* A00 = -6.730894835681591541979e-05 */
> -       .quad   0x3FF0062E4D0EA92A /* A01 = +1.001508999829250345925e+00 */
> -       .quad   0xBF88AB82C2761AF3 /* A02 = -1.204588085125866091241e-02 */
> -       .quad   0xBFD305028D6BD206 /* A03 = -2.971807843271395688234e-01 */
> -       .quad   0xBF1607C0922D9BF1 /* A00 = -8.403885708006799337092e-05 */
> -       .quad   0x3FF007606C341961 /* A01 = +1.001800940198869449560e+00 */
> -       .quad   0xBF8C25E6DA487BCF /* A02 = -1.374416688582682892494e-02 */
> -       .quad   0xBFD2CF0D0EE8F7B5 /* A03 = -2.938873906713255768075e-01 */
> -       .quad   0xBF1B3A8480A0A16D /* A00 = -1.038688061788578038307e-04 */
> -       .quad   0x3FF008BB802D02D6 /* A01 = +1.002131939589323561535e+00 */
> -       .quad   0xBF8FEB8AE99FD100 /* A02 = -1.558598065819483124983e-02 */
> -       .quad   0xBFD297135BD0911B /* A03 = -2.904709240558688843059e-01 */
> -       .quad   0xBF20ABB9BDB75C65 /* A00 = -1.271881327357976163798e-04 */
> -       .quad   0x3FF00A42A76D8CD1 /* A01 = +1.002504972472525901495e+00 */
> -       .quad   0xBF91FF3D752BB9E6 /* A02 = -1.757522609380570560722e-02 */
> -       .quad   0xBFD25D235C1F88B4 /* A03 = -2.869346999779154305799e-01 */
> -       .quad   0xBF243D3254425461 /* A00 = -1.544116913733432829448e-04 */
> -       .quad   0x3FF00BF909D1795E /* A01 = +1.002923048355647051011e+00 */
> -       .quad   0xBF94304E04D44942 /* A02 = -1.971551804042204897316e-02 */
> -       .quad   0xBFD2214B5E61CFA6 /* A03 = -2.832821294498394371075e-01 */
> -       .quad   0xBF286070011B61CE /* A00 = -1.859795307186510085994e-04 */
> -       .quad   0x3FF00DE1D5E1627E /* A01 = +1.003389201612804537689e+00 */
> -       .quad   0xBF9689D5F4163F59 /* A02 = -2.201017668045266231780e-02 */
> -       .quad   0xBFD1E39A11C3B42C /* A03 = -2.795167134743816728104e-01 */
> -       .quad   0xBF2D250B366A79E8 /* A00 = -2.223564326486314902259e-04 */
> -       .quad   0x3FF010003E134001 /* A01 = +1.003906481248123094829e+00 */
> -       .quad   0xBF990C9FF91F6F81 /* A02 = -2.446222265267250853271e-02 */
> -       .quad   0xBFD1A41E80084CDC /* A03 = -2.756420374218586655246e-01 */
> -       .quad   0xBF314DB5DDC2A30E /* A00 = -2.640313157465248123865e-04 */
> -       .quad   0x3FF012577608921B /* A01 = +1.004477940624503018441e+00 */
> -       .quad   0xBF9BB9626875B0C9 /* A02 = -2.707437288829409385849e-02 */
> -       .quad   0xBFD162E80768A9D0 /* A03 = -2.716617653228725615122e-01 */
> -       .quad   0xBF346A6133808864 /* A00 = -3.115165050094957730625e-04 */
> -       .quad   0x3FF014EAAFCC88A3 /* A01 = +1.005106627192198898157e+00 */
> -       .quad   0xBF9E90BEF9BF7419 /* A02 = -2.984903716411588595059e-02 */
> -       .quad   0xBFD12006545F7FAD /* A03 = -2.675796340899932457269e-01 */
> -       .quad   0xBF37F180DC3848EA /* A00 = -3.653468704395550778821e-04 */
> -       .quad   0x3FF017BD19147861 /* A01 = +1.005795572250939295955e+00 */
> -       .quad   0xBFA0C9A14C702E07 /* A02 = -3.278831537326359207851e-02 */
> -       .quad   0xBFD0DB895B650092 /* A03 = -2.633994476818851682154e-01 */
> -       .quad   0xBF3BEC6AAC6D7635 /* A00 = -4.260788377246944457107e-04 */
> -       .quad   0x3FF01AD1D884E719 /* A01 = +1.006547780778822565040e+00 */
> -       .quad   0xBFA260B2A1B1434A /* A02 = -3.589399551186163439542e-02 */
> -       .quad   0xBFD09581529E93D6 /* A03 = -2.591250712233067465817e-01 */
> -       .quad   0xBF4164E26167882B /* A00 = -5.308251737086202562063e-04 */
> -       .quad   0x3FF01FEF14B62B81 /* A01 = +1.007796364693348545316e+00 */
> -       .quad   0xBFA4EB014538AA42 /* A02 = -4.085544557559163403315e-02 */
> -       .quad   0xBFD029D36FEAF41F /* A03 = -2.525528519580024222613e-01 */
> -       .quad   0xBF46F6FFF4E53DC8 /* A00 = -7.008313930700277652464e-04 */
> -       .quad   0x3FF027CBB51CBBA0 /* A01 = +1.009715754956893363214e+00 */
> -       .quad   0xBFA89DEC9FEC112E /* A02 = -4.807986690687680864098e-02 */
> -       .quad   0xBFCF2A99464D0DB4 /* A03 = -2.434875100390009317053e-01 */
> -       .quad   0xBF4DCC9C4F66A4D9 /* A00 = -9.094012482836712945103e-04 */
> -       .quad   0x3FF030E7CFCCD583 /* A01 = +1.011939822882909068014e+00 */
> -       .quad   0xBFACAA3B95814081 /* A02 = -5.598627281199331645611e-02 */
> -       .quad   0xBFCDF78F156BE7CF /* A03 = -2.341173987004467604844e-01 */
> -       .quad   0xBF5308ED74E5C7A6 /* A00 = -1.161796466103906435435e-03 */
> -       .quad   0x3FF03B5986412ECB /* A01 = +1.014489674026594512313e+00 */
> -       .quad   0xBFB087EBA88DCC3F /* A02 = -6.457398285947223148806e-02 */
> -       .quad   0xBFCCBB9BD134862F /* A03 = -2.244753619680052991736e-01 */
> -       .quad   0xBF57FA23C00DF4B5 /* A00 = -1.463446533505758208674e-03 */
> -       .quad   0x3FF0473558A1BCC0 /* A01 = +1.017384859292903342975e+00 */
> -       .quad   0xBFB2E702BC6360EF /* A02 = -7.383744334527241048871e-02 */
> -       .quad   0xBFCB77D546379288 /* A03 = -2.145945160729250122955e-01 */
> -       .quad   0xBF5DD12971557F71 /* A00 = -1.819887610814388068450e-03 */
> -       .quad   0x3FF0548DDF5000A8 /* A01 = +1.020643112482540360020e+00 */
> -       .quad   0xBFB571B63DA186E1 /* A02 = -8.376635555898871710045e-02 */
> -       .quad   0xBFCA2D5202605148 /* A03 = -2.045080672838912594358e-01 */
> -       .quad   0xBF6252B1AD5D4F17 /* A00 = -2.236697221556737096709e-03 */
> -       .quad   0x3FF063738A910BF7 /* A01 = +1.024280110622155737232e+00 */
> -       .quad   0xBFB8270C8E6B601B /* A02 = -9.434584118878357184013e-02 */
> -       .quad   0xBFC8DD27D950A07E /* A03 = -1.942491351230763441116e-01 */
> -       .quad   0xBF66470C91730CFC /* A00 = -2.719425723258004842786e-03 */
> -       .quad   0x3FF073F468FCF331 /* A01 = +1.028309259519300633556e+00 */
> -       .quad   0xBFBB05C2952191E4 /* A02 = -1.055566419686964629854e-01 */
> -       .quad   0xBFC7886A770DE2BD /* A03 = -1.838505822486435070662e-01 */
> -       .quad   0xBF6AD114AC8E98EC /* A00 = -3.273525599485007861467e-03 */
> -       .quad   0x3FF0861BF53E5226 /* A01 = +1.032741506559554434119e+00 */
> -       .quad   0xBFBE0C4F9B461507 /* A02 = -1.173753503881763554650e-01 */
> -       .quad   0xBFC6302A037CDE3A /* A03 = -1.733448521642786954722e-01 */
> -       .quad   0xBF6FFBDE2A6C2AF8 /* A00 = -3.904279630096648551207e-03 */
> -       .quad   0x3FF099F2EB8E7DA3 /* A01 = +1.037585182326304034106e+00 */
> -       .quad   0xBFC09C74D192DDF0 /* A02 = -1.297746680554463516444e-01 */
> -       .quad   0xBFC4D571D8E3079F /* A03 = -1.627638157861470424859e-01 */
> -       .quad   0xBF72E8FDC0B952AA /* A00 = -4.616728994353872309042e-03 */
> -       .quad   0x3FF0AF7F273C9533 /* A01 = +1.042845872181101141152e+00 */
> -       .quad   0xBFC244C512736F10 /* A02 = -1.427236881344176033792e-01 */
> -       .quad   0xBFC379474F58B902 /* A03 = -1.521386277613104298645e-01 */
> -       .quad   0xBF762EABAF17395B /* A00 = -5.415602341101023557701e-03 */
> -       .quad   0x3FF0C6C3886F63FB /* A01 = +1.048526318502125631582e+00 */
> -       .quad   0xBFC3FDF9918EA12A /* A02 = -1.561881981590514389957e-01 */
> -       .quad   0xBFC21CA89ECAB895 /* A03 = -1.414995932913753196036e-01 */
> -       .quad   0xBF79D387CE5B2BAE /* A00 = -6.305246822828998107258e-03 */
> -       .quad   0x3FF0DFBFE2346376 /* A01 = +1.054626353847394337748e+00 */
> -       .quad   0xBFC5C6DA43602620 /* A02 = -1.701309994680721970894e-01 */
> -       .quad   0xBFC0C08BD8DB6631 /* A03 = -1.308760460731704100557e-01 */
> -       .quad   0xBF7DDBA8E8DA9060 /* A00 = -7.289562037531366334164e-03 */
> -       .quad   0x3FF0FA70F0D1B464 /* A01 = +1.061142864894713433443e+00 */
> -       .quad   0xBFC79E18D92BAA7C /* A02 = -1.845122394946264732241e-01 */
> -       .quad   0xBFBECBBBF74C2669 /* A03 = -1.202962378266875381749e-01 */
> -       .quad   0xBF81254E76EA25DA /* A00 = -8.371937755572145950511e-03 */
> -       .quad   0x3FF116D05835EBD0 /* A01 = +1.068069786618014660462e+00 */
> -       .quad   0xBFC982539E2ED224 /* A02 = -1.992897531869327609755e-01 */
> -       .quad   0xBFBC1B043C350159 /* A03 = -1.097872397413132278254e-01 */
> -       .quad   0xBF8391ACBA863403 /* A00 = -9.555196230190082448686e-03 */
> -       .quad   0x3FF134D4AA477FE2 /* A01 = +1.075398125794884141015e+00 */
> -       .quad   0xBFCB7218609FEAFB /* A02 = -2.144194099235717521079e-01 */
> -       .quad   0xBFB970A16CB88329 /* A03 = -9.937485603633135211599e-02 */
> -       .quad   0xBF87935088E48E8B /* A00 = -1.151144902957603431692e-02 */
> -       .quad   0x3FF1649892AD7DD3 /* A01 = +1.087059567413110938716e+00 */
> -       .quad   0xBFCE6971DDE75409 /* A02 = -2.375929196847723912089e-01 */
> -       .quad   0xBFB58291E88CB251 /* A03 = -8.402358939628952472223e-02 */
> -       .quad   0xBF8DB3A62C325325 /* A00 = -1.450280973794233242702e-02 */
> -       .quad   0x3FF1A9C900C6DEEA /* A01 = +1.103951457056548068891e+00 */
> -       .quad   0xBFD13DBC65B0E08E /* A02 = -2.693930619311765140012e-01 */
> -       .quad   0xBFB06696F62696D1 /* A03 = -6.406539449252625362252e-02 */
> -       .quad   0xBF92583699F2E27A /* A00 = -1.791463198307716858659e-02 */
> -       .quad   0x3FF1F451B85AA9F0 /* A01 = +1.122148246892376022288e+00 */
> -       .quad   0xBFD34FD5F8288180 /* A02 = -3.017477916164565954205e-01 */
> -       .quad   0xBFA6FB692825B683 /* A03 = -4.488686194495718900788e-02 */
> -       .quad   0xBF9641C26E673D6F /* A00 = -2.173522757385398448959e-02 */
> -       .quad   0x3FF24364DA5E2B07 /* A01 = +1.141453602790251542487e+00 */
> -       .quad   0xBFD564A5A5EF5890 /* A02 = -3.342680092295120530821e-01 */
> -       .quad   0xBF9B43712011A982 /* A03 = -2.662445791467283467968e-02 */
> -       .quad   0xBF9A901038EC2F39 /* A00 = -2.594018313816024226548e-02 */
> -       .quad   0x3FF2961356DFFEBA /* A01 = +1.161639537196534011088e+00 */
> -       .quad   0xBFD775EBB17198C7 /* A02 = -3.665723069046972759644e-01 */
> -       .quad   0xBF833B1A926CD462 /* A03 = -9.390075295963199591975e-03 */
> -       .quad   0xBF9F396A6A461B91 /* A00 = -3.049246095317987084727e-02 */
> -       .quad   0x3FF2EB53BAEF534B /* A01 = +1.182452898229899629357e+00 */
> -       .quad   0xBFD97DABF8AD8BBD /* A02 = -3.982953957076310058660e-01 */
> -       .quad   0x3F7B8F6A3E0F8837 /* A03 = +6.728568086119371925713e-03 */
> -       .quad   0xBFA21878590F8BAA /* A00 = -3.534294211546946951064e-02 */
> -       .quad   0x3FF34209790236E1 /* A01 = +1.203622315111197105253e+00 */
> -       .quad   0xBFDB764C0E71BECB /* A02 = -4.290952817018306997277e-01 */
> -       .quad   0x3F962FE0C03F84C0 /* A03 = +2.166701482190513949888e-02 */
> -       .quad   0xBFA4B36B9AD27ECC /* A00 = -4.043136849327097492868e-02 */
> -       .quad   0x3FF3990C5B12FC16 /* A01 = +1.224865298994477935679e+00 */
> -       .quad   0xBFDD5AABB0D01390 /* A02 = -4.586590983092770912322e-01 */
> -       .quad   0x3FA21DAF5CA162DB /* A03 = +3.538272863142363083844e-02 */
> -       .quad   0xBFA7645E4D7BF28B /* A00 = -4.568762489177399105378e-02 */
> -       .quad   0x3FF3EF2FD51C0D9F /* A01 = +1.245895225962932562069e+00 */
> -       .quad   0xBFDF26377E1B686E /* A02 = -4.867075664057044503963e-01 */
> -       .quad   0x3FA8803E756EE812 /* A03 = +4.785342391501513914509e-02 */
> -       .quad   0xBFAA210925C64413 /* A00 = -5.103329263796054643398e-02 */
> -       .quad   0x3FF44349F897D8E7 /* A01 = +1.266427966181760345066e+00 */
> -       .quad   0xBFE06A7B02C6D8E2 /* A02 = -5.129981092675530707226e-01 */
> -       .quad   0x3FAE3F194734F5D0 /* A03 = +5.907515520309980505687e-02 */
> -       .quad   0xBFACDE48F8A19BBB /* A00 = -5.638340029764018351832e-02 */
> -       .quad   0x3FF49439D5466582 /* A01 = +1.286187966447272845727e+00 */
> -       .quad   0xBFE131C7C1063DDC /* A02 = -5.373266954429101183166e-01 */
> -       .quad   0x3FB1ADEEC36AD805 /* A03 = +6.906025191241844940482e-02 */
> -       .quad   0xBFAF905D8F585680 /* A00 = -6.164829611604449866036e-02 */
> -       .quad   0x3FF4E0ED1FD27F99 /* A01 = +1.304913639360142818546e+00 */
> -       .quad   0xBFE1E7A859DC1D3D /* A02 = -5.595285182070380836095e-01 */
> -       .quad   0x3FB3ED018E4642A1 /* A03 = +7.783517573831001679086e-02 */
> -       .quad   0xBFB11595104160BA /* A00 = -6.673556944713512906198e-02 */
> -       .quad   0x3FF528650340490B /* A01 = +1.322361958217302513319e+00 */
> -       .quad   0xBFE28B14B40BC974 /* A02 = -5.794776455425521000109e-01 */
> -       .quad   0x3FB5DF49F5BAF6D7 /* A03 = +8.543836831355676453281e-02 */
> -       .quad   0xBFB2513A97344BA4 /* A00 = -7.155195418844911836587e-02 */
> -       .quad   0x3FF569BA0DB5EE14 /* A01 = +1.338312200124055273420e+00 */
> -       .quad   0xBFE31B53A8B67B20 /* A02 = -5.970857901737396389308e-01 */
> -       .quad   0x3FB787F297BB0544 /* A03 = +9.191814617499455275507e-02 */
> -       .quad   0xBFB37512E848FAFA /* A00 = -7.600515528700305112331e-02 */
> -       .quad   0x3FF5A41F33B403C8 /* A01 = +1.352568819013173495591e+00 */
> -       .quad   0xBFE397F6EA9A58A5 /* A02 = -6.123003561103997904880e-01 */
> -       .quad   0x3FB8EAA9FF25CA06 /* A03 = +9.733068923177520814782e-02 */
> -       .quad   0xBFB47B3E603AFC5D /* A00 = -8.000554894805263217439e-02 */
> -       .quad   0x3FF5D6E3EDE40487 /* A01 = +1.364963464031718975988e+00 */
> -       .quad   0xBFE400D5BCA6D631 /* A02 = -6.251019177058819709103e-01 */
> -       .quad   0x3FBA0B830ED567FE /* A03 = +1.017381583418739132707e-01 */
> -       .quad   0xBFB5BBFE8AC90496 /* A00 = -8.489981544791400103200e-02 */
> -       .quad   0x3FF612BA70107E95 /* A01 = +1.379572332145390989311e+00 */
> -       .quad   0xBFE477EAF1FA7693 /* A02 = -6.396383978023599814478e-01 */
> -       .quad   0x3FBB4784B7C08A95 /* A03 = +1.065600346196709652391e-01 */
> -       .quad   0xBFB6D5D940743939 /* A00 = -8.920057128509463473254e-02 */
> -       .quad   0x3FF644A8748F70CE /* A01 = +1.391762214006166953340e+00 */
> -       .quad   0xBFE4D646AB07EA37 /* A02 = -6.511567440459832267763e-01 */
> -       .quad   0x3FBC354F4E1D5292 /* A03 = +1.101884427747086558913e-01 */
> -       .quad   0xBFB7223D19E4F3D1 /* A00 = -9.036619074045339206069e-02 */
> -       .quad   0x3FF6518FEB42B7FA /* A01 = +1.394912642466350494175e+00 */
> -       .quad   0xBFE4ED86CB87498C /* A02 = -6.539949393430091184598e-01 */
> -       .quad   0x3FBC6D29F28CCA9B /* A03 = +1.110407082713131127205e-01 */
> -       .quad   0xBFB6878652FF6312 /* A00 = -8.800544287022329936754e-02 */
> -       .quad   0x3FF63948C302D040 /* A01 = +1.388985406648330922508e+00 */
> -       .quad   0xBFE4C4E2E7904E17 /* A02 = -6.490339777687407218920e-01 */
> -       .quad   0x3FBC127356CA1ABE /* A03 = +1.096565329445224612481e-01 */
> -       .quad   0xBFB4F5D18B0C91D6 /* A00 = -8.187589306596207427980e-02 */
> -       .quad   0x3FF5FD27EB7DD0B8 /* A01 = +1.374305648697413673176e+00 */
> -       .quad   0xBFE464E01A2B2FC6 /* A02 = -6.373138915164353601739e-01 */
> -       .quad   0x3FBB460547674A30 /* A03 = +1.065371798825160976065e-01 */
> -       .quad   0xBFB26642FA16A685 /* A00 = -7.187288861919156890412e-02 */
> -       .quad   0x3FF59F9BEDE1C95A /* A01 = +1.351467065073470141812e+00 */
> -       .quad   0xBFE3D67920C8FBEA /* A02 = -6.199308052381387046381e-01 */
> -       .quad   0x3FBA24F6A8D3CBC1 /* A03 = +1.021265184570401413078e-01 */
> -       .quad   0xBFADB5294794F097 /* A00 = -5.802277563859197656582e-02 */
> -       .quad   0x3FF523EA7B9CF453 /* A01 = +1.321268542159732772845e+00 */
> -       .quad   0xBFE322A8B55E35DB /* A02 = -5.979808370918208160205e-01 */
> -       .quad   0x3FB8C8673B1B3E37 /* A03 = +9.680791085269722928697e-02 */
> -       .quad   0xBFA4B7D661965C6A /* A00 = -4.046506825687219699450e-02 */
> -       .quad   0x3FF48DE3E2CE3122 /* A01 = +1.284641157110919085227e+00 */
> -       .quad   0xBFE251FED1A7F445 /* A02 = -5.725092024655472622285e-01 */
> -       .quad   0x3FB745699FCABDB9 /* A03 = +9.090290213747821701507e-02 */
> -       .quad   0xBF93E60456E4EE1D /* A00 = -1.943213253365004902773e-02 */
> -       .quad   0x3FF3E1A14E628A59 /* A01 = +1.242585474196536532432e+00 */
> -       .quad   0xBFE16C5AB660E876 /* A02 = -5.444768488007543094653e-01 */
> -       .quad   0x3FB5AD33AA8C188F /* A03 = +8.467410005332197397987e-02 */
> -       .quad   0x3F738C17C47C7961 /* A00 = +4.772274820224659853951e-03 */
> -       .quad   0x3FF3234DDE3BD146 /* A01 = +1.196119182682268355933e+00 */
> -       .quad   0xBFE078C0D77A9D3B /* A02 = -5.147403915952176722826e-01 */
> -       .quad   0x3FB40D74B3E276B8 /* A03 = +7.833032027925923568290e-02 */
> -       .quad   0x3FA0474BECC689C7 /* A00 = +3.179394975019849550746e-02 */
> -       .quad   0x3FF256FB4FA7D18A /* A01 = +1.146235762743432307076e+00 */
> -       .quad   0xBFDEFA8E3FB285E2 /* A02 = -4.840427038235174395098e-01 */
> -       .quad   0x3FB270C007493D59 /* A03 = +7.203293016322244446403e-02 */
> -       .quad   0x3FAF5BD51E479BDC /* A00 = +6.124750132203590768931e-02 */
> -       .quad   0x3FF18081D0B53BC5 /* A01 = +1.093873801484492647162e+00 */
> -       .quad   0xBFDCFE2439BD0C03 /* A02 = -4.530115665294831006626e-01 */
> -       .quad   0x3FB0DEFE5A45AFDD /* A03 = +6.590261176978580437424e-02 */
> -       .quad   0x3FB7BD5D2806EA26 /* A00 = +9.273321368429118805032e-02 */
> -       .quad   0x3FF0A369E35B4440 /* A01 = +1.039895904647224256223e+00 */
> -       .quad   0xBFDB04BC5C9951E7 /* A02 = -4.221640495573226181669e-01 */
> -       .quad   0x3FAEBBBAA9D6DEEF /* A03 = +6.002600978120919278380e-02 */
> -       .quad   0x3FC01BE411098DBC /* A00 = +1.258511622610124502941e-01 */
> -       .quad   0x3FEF85BDABC031C1 /* A01 = +9.850757936961188621083e-01 */
> -       .quad   0xBFD91521375097C2 /* A02 = -3.919146576102968682065e-01 */
> -       .quad   0x3FABE26F0086D982 /* A03 = +5.446192628317005068883e-02 */
> -       .quad   0x3FC481D7FF5776B9 /* A00 = +1.602125164781023347604e-01 */
> -       .quad   0x3FEDC3506C1E7218 /* A01 = +9.300920592973538347792e-01 */
> -       .quad   0xBFD7349A88DA7D4F /* A02 = -3.625856720409119104964e-01 */
> -       .quad   0x3FA936E2DFF8E2AE /* A03 = +4.924687370334389358018e-02 */
> -       .quad   0x3FC90471F96FA27A /* A00 = +1.954481571149420671141e-01 */
> -       .quad   0x3FEC0451601987A2 /* A01 = +8.755270840595026360376e-01 */
> -       .quad   0xBFD5671CD4B898DC /* A02 = -3.344184949259110251063e-01 */
> -       .quad   0x3FA6BB9594603B67 /* A03 = +4.439990459660841243261e-02 */
> -       .quad   0x3FCFD8ADB9ED944C /* A00 = +2.488000066615846384011e-01 */
> -       .quad   0x3FE978C073F6809A /* A01 = +7.959902062321078108909e-01 */
> -       .quad   0xBFD2DF7E00BCD5A9 /* A02 = -2.948908812716931060471e-01 */
> -       .quad   0x3FA3614033D490B2 /* A03 = +3.785133965200894456959e-02 */
> -       .quad   0x3FD4846A12AFE5A0 /* A00 = +3.205819303981005674586e-01 */
> -       .quad   0x3FE63A1147D40472 /* A01 = +6.945883181471244061100e-01 */
> -       .quad   0xBFCFA2268AD34450 /* A02 = -2.471359422548027318101e-01 */
> -       .quad   0x3F9F150201D9FFE0 /* A03 = +3.035357605267552383310e-02 */
> -       .quad   0x3FD9018641F82BEB /* A00 = +3.907180446846598154131e-01 */
> -       .quad   0x3FE33B7C220FFBDC /* A01 = +6.010113396913498995389e-01 */
> -       .quad   0xBFCA4E4187E29C86 /* A02 = -2.055131829740483584423e-01 */
> -       .quad   0x3F98C30CED19F8F4 /* A03 = +2.418155858185229434287e-02 */
> -       .quad   0x3FDD4B8255BEB078 /* A00 = +4.577337109901757905561e-01 */
> -       .quad   0x3FE0858B19D3A49B /* A01 = +5.163016800335243905451e-01 */
> -       .quad   0xBFC5BC929EACE564 /* A02 = -1.698172831327539045176e-01 */
> -       .quad   0x3F93A083CE57DE2B /* A03 = +1.916700312537337677621e-02 */
> -       .quad   0x3FE0A8E5E039295C /* A00 = +5.206174258576470315063e-01 */
> -       .quad   0x3FDC35E1234583FE /* A01 = +4.407885403107342225937e-01 */
> -       .quad   0xBFC1DE034E31AEB9 /* A02 = -1.395877963835710222629e-01 */
> -       .quad   0x3F8EFDEBB3471BDC /* A03 = +1.513275280821162888101e-02 */
> -       .quad   0x3FE2851B603CB2A5 /* A00 = +5.787484054213406503564e-01 */
> -       .quad   0x3FD7F4A44ABBB286 /* A01 = +3.743067483726821853551e-01 */
> -       .quad   0xBFBD3EEB67087DE7 /* A02 = -1.142413260026767657385e-01 */
> -       .quad   0x3F8864F38329E8BD /* A03 = +1.191129917173260922836e-02 */
> -       .quad   0x3FE437DBE3C34AC1 /* A00 = +6.318187187665317283702e-01 */
> -       .quad   0x3FD43F6F789441B5 /* A01 = +3.163717916040938438194e-01 */
> -       .quad   0xBFB7D92E7901B9A4 /* A02 = -9.315767721429907277653e-02 */
> -       .quad   0x3F8327ED342308E1 /* A03 = +9.353497651663324544136e-03 */
> -       .quad   0x3FE5C0977766D55C /* A00 = +6.797597248138731451661e-01 */
> -       .quad   0x3FD10B42A764D8F9 /* A01 = +2.663122782427219115142e-01 */
> -       .quad   0xBFB3633351D3D70F /* A02 = -7.573242900602060456716e-02 */
> -       .quad   0x3F7E079E30FF899C /* A03 = +7.331483779099558922843e-03 */
> -       .quad   0x3FE7202CE08A88C4 /* A00 = +7.226776490754436288455e-01 */
> -       .quad   0x3FCC973EB5662B01 /* A01 = +2.233656297433626314319e-01 */
> -       .quad   0xBFAF70A455F9920B /* A02 = -6.140626477716545211782e-02 */
> -       .quad   0x3F77812411CE99B6 /* A03 = +5.738392731393584730859e-03 */
> -       .quad   0x3FE85879424095B1 /* A00 = +7.608000082006382003286e-01 */
> -       .quad   0x3FC7E73BD1674D84 /* A01 = +1.867441914060742336190e-01 */
> -       .quad   0xBFA96F84E4BF333B /* A02 = -4.967894832916504993525e-02 */
> -       .quad   0x3F72606DDCA6E117 /* A03 = +4.486493251924870105662e-03 */
> -       .quad   0x3FE96BFE4957F4DD /* A00 = +7.944327766887472330737e-01 */
> -       .quad   0x3FC3ED4780D25478 /* A01 = +1.556786898624158421711e-01 */
> -       .quad   0xBFA489C5F9A56B58 /* A02 = -4.011362717093075458408e-02 */
> -       .quad   0x3F6CB5DC17E9AD2A /* A03 = +3.504686231556104931972e-03 */
> -       .quad   0x3FEA5D9CB2F41234 /* A00 = +8.239272589858672724006e-01 */
> -       .quad   0x3FC091A758374DCF /* A01 = +1.294449978582705440555e-01 */
> -       .quad   0xBFA08E436D4B5CE0 /* A02 = -3.233538350257858517978e-02 */
> -       .quad   0x3F666997AD53E6B7 /* A03 = +2.735897297154145629133e-03 */
> -       .quad   0x3FEB3060342CB850 /* A00 = +8.496552485501158713532e-01 */
> -       .quad   0x3FBB7D30BBC7DC1B /* A01 = +1.073790033768634993860e-01 */
> -       .quad   0xBF9AA6BA3443D9E3 /* A02 = -2.602663940430173170060e-02 */
> -       .quad   0x3F617CA764B7850B /* A03 = +2.134634914668814050648e-03 */
> -       .quad   0x3FEBE759A6A0C7B8 /* A00 = +8.719909910635044170135e-01 */
> -       .quad   0x3FB6C10DE6A703FF /* A01 = +8.888327485239243264115e-02 */
> -       .quad   0xBF956C566D8BE1F6 /* A02 = -2.092108768099084498138e-02 */
> -       .quad   0x3F5B46D1A4A59CF8 /* A03 = +1.664833764687232917079e-03 */
> -       .quad   0x3FEC858494887A04 /* A00 = +8.912985707318630268503e-01 */
> -       .quad   0x3FB2CC31F543394D /* A01 = +7.342827070099140762682e-02 */
> -       .quad   0xBF9133477FF69137 /* A02 = -1.679717749142747504343e-02 */
> -       .quad   0x3F5544482FBB4DA5 /* A03 = +1.298017973501022466823e-03 */
> -       .quad   0x3FED0DB59D0E32E9 /* A00 = +9.079235141267335551518e-01 */
> -       .quad   0x3FAF006BAFFC6EF4 /* A01 = +6.055008433597022787787e-02 */
> -       .quad   0xBF8B97146FA2B97A /* A02 = -1.347175565419144252499e-02 */
> -       .quad   0x3F5093B01F4CDC69 /* A03 = +1.011774057770665211434e-03 */
> -       .quad   0x3FEDB487C3EC457C /* A00 = +9.282873942012623835751e-01 */
> -       .quad   0x3FA7390C09D0BD1D /* A01 = +4.535710925881118044112e-02 */
> -       .quad   0xBF83D9F7C3181106 /* A02 = -9.693084374710735778846e-03 */
> -       .quad   0x3F46E34A0A3C0E64 /* A03 = +6.984817050299072134500e-04 */
> -       .quad   0x3FEE5FFCB4E6EB00 /* A00 = +9.492171796076434020506e-01 */
> -       .quad   0x3F9F4913ED00AADF /* A01 = +3.055220731782070861526e-02 */
> -       .quad   0xBF79670BD0E59B5C /* A02 = -6.201788097633133961528e-03 */
> -       .quad   0x3F3BC998EBCAF96D /* A03 = +4.240034429975534616304e-04 */
> -       .quad   0x3FEEDBA41E9542FE /* A00 = +9.643116566968215064293e-01 */
> -       .quad   0x3F94F5DD18D9C24D /* A01 = +2.046914543319848858727e-02 */
> -       .quad   0xBF7034896AA122B9 /* A02 = -3.956352980886528904192e-03 */
> -       .quad   0x3F30DCCB47810B39 /* A03 = +2.573009765038273091199e-04 */
> -       .quad   0x3FEF33F2882520ED /* A00 = +9.750912341196716903724e-01 */
> -       .quad   0x3F8BF37F2CF553FF /* A01 = +1.364802699996836392315e-02 */
> -       .quad   0xBF649F6F05A69619 /* A02 = -2.517430152880317534986e-03 */
> -       .quad   0x3F247623C950AAC9 /* A03 = +1.561087307505231250044e-04 */
> -       .quad   0x3FEF727757751741 /* A00 = +9.827229221489021115943e-01 */
> -       .quad   0x3F828E67912C4400 /* A01 = +9.060677640748693306705e-03 */
> -       .quad   0xBF5A2F51A806CC2C /* A02 = -1.598195784123355826789e-03 */
> -       .quad   0x3F18D35D7687E613 /* A03 = +9.470231965016282719549e-05 */
> -       .quad   0x3FEF9E6325C5942A /* A00 = +9.880843866091073568469e-01 */
> -       .quad   0x3F788AB117618F76 /* A01 = +5.991641772286606867914e-03 */
> -       .quad   0xBF5096EAB0B1EA89 /* A02 = -1.012543859160305046233e-03 */
> -       .quad   0x3F0E1E50EC4435AB /* A03 = +5.744633156910412119652e-05 */
> -       .quad   0x3FEFBD0784049369 /* A00 = +9.918248728250605994461e-01 */
> -       .quad   0x3F702BBD8294035F /* A01 = +3.947963975634432264028e-03 */
> -       .quad   0xBF44FB55E0F00593 /* A02 = -6.403130845457509273330e-04 */
> -       .quad   0x3F0244DCD723230A /* A03 = +3.484534217219031730379e-05 */
> -       .quad   0x3FEFD245E2366A43 /* A00 = +9.944180887426415926811e-01 */
> -       .quad   0x3F653D82EC088433 /* A01 = +2.592807490387838333795e-03 */
> -       .quad   0xBF3A7DF75E013CB8 /* A02 = -4.042366908878036561859e-04 */
> -       .quad   0x3EF6298E69F991CD /* A03 = +2.113564425911141559972e-05 */
> -       .quad   0x3FEFE0EAA508BC69 /* A00 = +9.962056372950317539861e-01 */
> -       .quad   0x3F5BD0771AF3FDDA /* A01 = +1.697651208644282514598e-03 */
> -       .quad   0xBF30B2E1254DE571 /* A02 = -2.548026725928887099328e-04 */
> -       .quad   0x3EEAE28B70EC0256 /* A03 = +1.281973848454955042307e-05 */
> -       .quad   0x3FEFEAF5303D7F96 /* A00 = +9.974313680831865536192e-01 */
> -       .quad   0x3F5229111365657E /* A01 = +1.108423877289460134782e-03 */
> -       .quad   0xBF250572D04DFE66 /* A02 = -1.603796628408704519168e-04 */
> -       .quad   0x3EE04E89BB57C981 /* A03 = +7.775682983689149966743e-06 */
> -       .quad   0x3FEFF1CF52F1CF44 /* A00 = +9.982678051005469122003e-01 */
> -       .quad   0x3F47A71316147CEB /* A01 = +7.218211359577819110842e-04 */
> -       .quad   0xBF1A6D7604055719 /* A02 = -1.008132248946049582547e-04 */
> -       .quad   0x3ED3C8047586A85C /* A03 = +4.716233739913014633626e-06 */
> -       .quad   0x3FEFF6770369EF69 /* A00 = +9.988360468555416149528e-01 */
> -       .quad   0x3F3EBB261180FBF0 /* A01 = +4.689186039321105101130e-04 */
> -       .quad   0xBF1097754FE19D7F /* A02 = -6.329206004950480057066e-05 */
> -       .quad   0x3EC7FEFF83BCA0A7 /* A03 = +2.860556404988488738366e-06 */
> -       .quad   0x3FEFF99D42371AC4 /* A00 = +9.992204945818561334647e-01 */
> -       .quad   0x3F33EB2AEC271F59 /* A01 = +3.039340773764907474054e-04 */
> -       .quad   0xBF04CF18E0FC0D79 /* A02 = -3.968996690952969588805e-05 */
> -       .quad   0x3EBD1BDBD6019BE9 /* A03 = +1.735021065507727833886e-06 */
> -       .quad   0x3FEFFBBCA32B0D91 /* A00 = +9.994795977476532700123e-01 */
> -       .quad   0x3F29C41E1615110A /* A01 = +1.965796209707565346710e-04 */
> -       .quad   0xBEFA11F93D9DCB5A /* A02 = -2.486248909101414873235e-05 */
> -       .quad   0x3EB1A7CA4546F7A7 /* A03 = +1.052345642723709228769e-06 */
> -       .quad   0x3FEFFD298B8E8DE2 /* A00 = +9.996535993308806045121e-01 */
> -       .quad   0x3F20A1C42D523C5B /* A01 = +1.268913244172078754520e-04 */
> -       .quad   0xBEF0507A364AFAE4 /* A02 = -1.555859070622834605755e-05 */
> -       .quad   0x3EA56ACA17E7CDF4 /* A03 = +6.382806956848098872313e-07 */
> -       .quad   0x3FEFFE1DC82BA5A3 /* A00 = +9.997700604991915929176e-01 */
> -       .quad   0x3F156E73B90F1769 /* A01 = +8.175450626798714452801e-05 */
> -       .quad   0xBEE4663579D0A09F /* A02 = -9.727122057226747625365e-06 */
> -       .quad   0x3E99FAF6FEC5D4C1 /* A03 = +3.871371052824002996020e-07 */
> -       .quad   0x3FEFFEF8D0BB5E81 /* A00 = +9.998745037837154514548e-01 */
> -       .quad   0x3F06686DA18D39C3 /* A01 = +4.273972098777251447726e-05 */
> -       .quad   0xBED46BC298073E90 /* A02 = -4.868731025855742842491e-06 */
> -       .quad   0x3E88E42286B9D0FD /* A03 = +1.854535328530838170114e-07 */
> -       .quad   0x3FEFFF8DBC68DDC7 /* A00 = +9.999455146670975791423e-01 */
> -       .quad   0x3EF26B2953A80AF0 /* A01 = +1.756534514108903368909e-05 */
> -       .quad   0xBEBFC4472D580F83 /* A02 = -1.893443529411295465239e-06 */
> -       .quad   0x3E72505B4553D19F /* A03 = +6.822456673547912277047e-08 */
> -       .quad   0x3FEFFFCED1276609 /* A00 = +9.999765477215883935358e-01 */
> -       .quad   0x3EDE1A94C7CC58F5 /* A01 = +7.177313020153979672606e-06 */
> -       .quad   0xBEA8A2C988744E57 /* A02 = -7.342066660497443762363e-07 */
> -       .quad   0x3E5AF30036BBBAF4 /* A03 = +2.509841882843541084885e-08 */
> -       .quad   0x3FEFFFEAFE70FCFC /* A00 = +9.999899835164849370983e-01 */
> -       .quad   0x3EC879175E3549F5 /* A01 = +2.917410471128503564412e-06 */
> -       .quad   0xBE930E36677D1813 /* A02 = -2.839493400307523115929e-07 */
> -       .quad   0x3E43D4005B42D48F /* A03 = +9.233192745401904898013e-09 */
> -       .quad   0x3ff0000000000000
> -       .quad   0x0000000000000000
> -       .quad   0x0000000000000000
> -       .quad   0x0000000000000000
> -       .align  16
> -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSignMask */
> -       .align  16
> -       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _sAbsMask */
> -       .align  16
> -       .long   0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000 /* _iExpMantMask */
> -       .align  16
> -       .long   0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000 /* _iExpMask */
> -       .align  16
> -       .long   0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000 /* _iMinIdxOfsMask */
> -       .align  16
> -       .long   0x04280000, 0x04280000, 0x04280000, 0x04280000 /* _iMaxIdxMask */
> -       .align  16
> -       .type   __svml_stanh_data_internal, @object
> -       .size   __svml_stanh_data_internal, .-__svml_stanh_data_internal
> +       /* Cold case. edx has 1s where there was a special value that
> +          needs to be handled by a tanhf call. Optimize for code size
> +          more so than speed here. */
> +L(SPECIAL_VALUES_BRANCH):
> +       # LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm12
> +       /* Stack comes in 8-byte misaligned (return address was pushed).
> +       Subtracting 56 makes it 16-byte aligned at the call to tanhf. */
> +       subq    $56, %rsp
> +       cfi_def_cfa_offset(64)
> +       movups  %xmm0, 24(%rsp)
> +       movups  %xmm12, 40(%rsp)
> +
> +       /* Use rbx/rbp for callee save registers as they get short
> +       encoding for many instructions (as compared with r12/r13). */
> +       movq    %rbx, (%rsp)
> +       cfi_offset(rbx, -64)
> +       movq    %rbp, 8(%rsp)
> +       cfi_offset(rbp, -56)
> +       /* edx has 1s where there was a special value that needs to be handled
> +          by a tanhf call.  */
> +       movl    %edx, %ebx
> +L(SPECIAL_VALUES_LOOP):
> +       # LOE rbx rbp r12 r13 r14 r15
> +       /* use rbp as index for special value that is saved across calls to
> +          tanhf. We technically don't need a callee save register here as offset
> +          to rsp is always [0, 12] so we can restore rsp by realigning to 64.
> +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> +          in the loop.  */
> +       xorl    %ebp, %ebp
> +       bsfl    %ebx, %ebp
> +
> +       /* Scalar math function call to process special input.  */
> +       movss   40(%rsp, %rbp, 4), %xmm0
> +       call    tanhf@PLT
> +       /* No good way to avoid the store-forwarding fault this will cause on
> +          return. `lfence` avoids the SF fault but at greater cost as it
> +          serializes stack/callee save restoration.  */
> +       movss   %xmm0, 24(%rsp, %rbp, 4)
> +
> +       leal    -1(%rbx), %eax
> +       andl    %eax, %ebx
> +       jnz     L(SPECIAL_VALUES_LOOP)
> +       # LOE r12 r13 r14 r15
> +       /* All results have been written to 24(%rsp).  */
> +       movups  24(%rsp), %xmm0
> +       movq    (%rsp), %rbx
> +       cfi_restore(rbx)
> +       movq    8(%rsp), %rbp
> +       cfi_restore(rbp)
> +       addq    $56, %rsp
> +       cfi_def_cfa_offset(8)
> +       ret
> +END(_ZGVbN4v_tanhf_sse4)
> --
> 2.34.1
>

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v3 5/7] x86: Add data file that can be shared by tanhf-avx2 and tanhf-sse4
  2022-06-09 16:58   ` [PATCH v3 5/7] x86: Add data file that can be shared by tanhf-avx2 and tanhf-sse4 Noah Goldstein
@ 2022-06-09 17:11     ` H.J. Lu
  0 siblings, 0 replies; 48+ messages in thread
From: H.J. Lu @ 2022-06-09 17:11 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Thu, Jun 9, 2022 at 9:59 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> tanhf-avx2 and tanhf-sse4 use the same data tables so we can save
> over 4kb using a shared datatable. This does increase the memory
> footprint of the sse4 version (as now all the targets are 32 bytes
> instead of 16), but generally it seems worth the code size savings.
>
> NB: This patch doesn't do anything itself, it is setup for future
> patches.
> ---
>  .../fpu/multiarch/svml_s_tanhf_rodata.S       | 621 ++++++++++++++++++
>  1 file changed, 621 insertions(+)
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_tanhf_rodata.S
>
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf_rodata.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf_rodata.S
> new file mode 100644
> index 0000000000..904fe5f588
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf_rodata.S
> @@ -0,0 +1,621 @@
> +/* Datatables for tanhf AVX2 and tanhf SSE4.
> +   Copyright (C) 2021-2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   https://www.gnu.org/licenses/.  */
> +
> +/* Offsets are ordered by use in the function. On cold-starts this
> +   might help the prefetcher. If the streaming prefetchers kick in it
> +   will prefetch into the lookup table.  */
> +#define _iExpMantMask                  0
> +#define _iMinIdxOfsMask                        32
> +#define _iMaxIdxMask                   64
> +#define _sAbsMask                      96
> +#define _iExpMask                      128
> +#define _lookupTable                   160
> +
> +#define TANHF_DATA(offset)             ((offset)+__svml_stanh_data_internal_avx2)
> +#ifndef ONLY_DECL_OFFSET
> +       .section .rodata, "a"
> +       .align  32
> +
> +# ifdef __svml_stanh_data_internal_typedef
> +typedef unsigned int VUINT32;
> +typedef struct
> +       {
> +       __declspec(align(32)) VUINT32 _iExpMantMask[8][1];
> +       __declspec(align(32)) VUINT32 _iMinIdxOfsMask[8][1];
> +       __declspec(align(32)) VUINT32 _iMaxIdxMask[8][1];
> +       __declspec(align(32)) VUINT32 _sAbsMask[8][1];
> +       __declspec(align(32)) VUINT32 _iExpMask[8][1];
> +       __declspec(align(32)) VUINT32 _lookupTable[(134*4)][2];
> +} __svml_stanh_data_internal;
> +# endif
> +
> +
> +__svml_stanh_data_internal:
> +       .globl  __svml_stanh_data_internal_avx2
> +__svml_stanh_data_internal_avx2:
> +       .align  32
> +       /* _iExpMantMask.  */
> +       .long   0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000
> +       .long   0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000
> +
> +       .align  32
> +       /* _iMinIdxOfsMask.  */
> +       .long   0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000
> +       .long   0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000
> +
> +       .align  32
> +       /* _iMaxIdxMask.  */
> +       .long   0x04280000, 0x04280000, 0x04280000, 0x04280000
> +       .long   0x04280000, 0x04280000, 0x04280000, 0x04280000
> +
> +       .align  32
> +       /* _sAbsMask.  */
> +       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> +       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> +
> +       .align  32
> +       /* _iExpMask.  */
> +       .long   0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000
> +       .long   0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000
> +
> +       .align  32
> +       /* _lookupTable.  */
> +       /* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500].  */
> +       .quad   0x0000000000000000 /* A00 = +0.000000000000000000000e-01.  */
> +       .quad   0x3FF00000022C70EB /* A01 = +1.000000008097283510367e+00.  */
> +       .quad   0xBED00E878CFFA194 /* A02 = -3.828228912518614443549e-06.  */
> +       .quad   0xBFD551766D0607A9 /* A03 = -3.330970825846813476723e-01.  */
> +       .quad   0xBE53D60CE3E4C297 /* A00 = -1.847383956330407336230e-08.  */
> +       .quad   0x3FF000024177CF5C /* A01 = +1.000002151235967140508e+00.  */
> +       .quad   0xBF1758BC94A51A25 /* A02 = -8.906031613262943753568e-05.  */
> +       .quad   0xBFD53EAE67E0D4F0 /* A03 = -3.319507612644221339337e-01.  */
> +       .quad   0xBE5A9E47EF32D6FE /* A00 = -2.479020984039698285657e-08.  */
> +       .quad   0x3FF00002DA983057 /* A01 = +1.000002721676556793895e+00.  */
> +       .quad   0xBF1BD953509E94AA /* A02 = -1.062352277175377670507e-04.  */
> +       .quad   0xBFD53BDB562EEDD5 /* A03 = -3.317783681520414806876e-01.  */
> +       .quad   0xBE6191BBE496D294 /* A00 = -3.272532162914017685901e-08.  */
> +       .quad   0x3FF0000390492017 /* A01 = +1.000003398528866105366e+00.  */
> +       .quad   0xBF20727E814A57CE /* A02 = -1.254825043772153972919e-04.  */
> +       .quad   0xBFD538DE060A6F22 /* A03 = -3.315959033004550748913e-01.  */
> +       .quad   0xBE66DAFA2A893A25 /* A00 = -4.257146219278012568149e-08.  */
> +       .quad   0x3FF0000465E08CD1 /* A01 = +1.000004194219219266770e+00.  */
> +       .quad   0xBF2341C765EF91B6 /* A02 = -1.469188600530365522261e-04.  */
> +       .quad   0xBFD535B6841FAF9E /* A03 = -3.314033785124993469751e-01.  */
> +       .quad   0xBE6D5794E361E964 /* A00 = -5.465394929765249413434e-08.  */
> +       .quad   0x3FF000055EE2A0CB /* A01 = +1.000005121846742950353e+00.  */
> +       .quad   0xBF265E6C77E66C8B /* A02 = -1.706607253709506650304e-04.  */
> +       .quad   0xBFD53264DDCCEDA6 /* A03 = -3.312008062382240103361e-01.  */
> +       .quad   0xBE729C844D374A6E /* A00 = -6.933284462462096107184e-08.  */
> +       .quad   0x3FF000067F019093 /* A01 = +1.000006195180536350264e+00.  */
> +       .quad   0xBF29CC5348D6DCE5 /* A02 = -1.968242326435338705130e-04.  */
> +       .quad   0xBFD52EE92121ED35 /* A03 = -3.309881995734998416658e-01.  */
> +       .quad   0xBE775AEA17EAA872 /* A00 = -8.700465590574974405858e-08.  */
> +       .quad   0x3FF00007CA1D66B8 /* A01 = +1.000007428656699559610e+00.  */
> +       .quad   0xBF2D8F5EB98A2637 /* A02 = -2.255252009216044881395e-04.  */
> +       .quad   0xBFD52B435CDF9128 /* A03 = -3.307655722585587376727e-01.  */
> +       .quad   0xBE7D04DA28C343F0 /* A00 = -1.081040272327705484794e-07.  */
> +       .quad   0x3FF000094443CCF5 /* A01 = +1.000008837375216730337e+00.  */
> +       .quad   0xBF30D5B76C947AE5 /* A02 = -2.568791210978817814332e-04.  */
> +       .quad   0xBFD52773A0776FAD /* A03 = -3.305329386764651045105e-01.  */
> +       .quad   0xBE81DD77A12C51C7 /* A00 = -1.331054169875768625701e-07.  */
> +       .quad   0x3FF0000AF1AFD2DA /* A01 = +1.000010437096696680470e+00.  */
> +       .quad   0xBF331230624C1680 /* A02 = -2.910011410651516805537e-04.  */
> +       .quad   0xBFD52379FC0B61DF /* A03 = -3.302903138515186909352e-01.  */
> +       .quad   0xBE85D04EEEB3C435 /* A00 = -1.625247628488202841012e-07.  */
> +       .quad   0x3FF0000CD6C9B1F2 /* A01 = +1.000012244238970726684e+00.  */
> +       .quad   0xBF357F0742FADDD4 /* A02 = -3.280060509313874068243e-04.  */
> +       .quad   0xBFD51F56806D0E81 /* A03 = -3.300377134475880880338e-01.  */
> +       .quad   0xBE8A6E289B59681B /* A00 = -1.969211333326924655065e-07.  */
> +       .quad   0x3FF0000EF8268F72 /* A01 = +1.000014275873550406715e+00.  */
> +       .quad   0xBF381E277A1B747A /* A02 = -3.680082682942575423093e-04.  */
> +       .quad   0xBFD51B093F1D6FD4 /* A03 = -3.297751537663746734808e-01.  */
> +       .quad   0xBE8FCBC40EE9ABD5 /* A00 = -2.368983653301529373887e-07.  */
> +       .quad   0x3FF000115A883B6C /* A01 = +1.000016549721943981410e+00.  */
> +       .quad   0xBF3AF17AC974B3D9 /* A02 = -4.111218235774406434303e-04.  */
> +       .quad   0xBFD516924A4C549C /* A03 = -3.295026517456081105450e-01.  */
> +       .quad   0xBE92FFBC60A3F956 /* A00 = -2.831066871072026054144e-07.  */
> +       .quad   0x3FF0001402DCED8A /* A01 = +1.000019084151832604590e+00.  */
> +       .quad   0xBF3DFAE9390C4801 /* A02 = -4.574603454311488280083e-04.  */
> +       .quad   0xBFD511F1B4D7DC3A /* A03 = -3.292202249571719585575e-01.  */
> +       .quad   0xBE9690A22F96D5AD /* A00 = -3.362443262393081632612e-07.  */
> +       .quad   0x3FF00016F63EFF5D /* A01 = +1.000021898173108825247e+00.  */
> +       .quad   0xBF409E2C839605BB /* A02 = -5.071370461992499986334e-04.  */
> +       .quad   0xBFD50D27924BEE00 /* A03 = -3.289278916051614487515e-01.  */
> +       .quad   0xBE9AA56C65E72A73 /* A00 = -3.970591019557469835586e-07.  */
> +       .quad   0x3FF0001A39F4A43E /* A01 = +1.000025011433776978009e+00.  */
> +       .quad   0xBF425BD74C3D6667 /* A02 = -5.602647074553602319844e-04.  */
> +       .quad   0xBFD50833F6E1ABA2 /* A03 = -3.286256705238718156536e-01.  */
> +       .quad   0xBE9F4BD4FF1A83B0 /* A00 = -4.663500013744687071912e-07.  */
> +       .quad   0x3FF0001DD36F9EC2 /* A01 = +1.000028444215715683896e+00.  */
> +       .quad   0xBF44376634149405 /* A02 = -6.169556656102642569831e-04.  */
> +       .quad   0xBFD50316F77EDEE5 /* A03 = -3.283135811757190158922e-01.  */
> +       .quad   0xBEA3B625387BB079 /* A00 = -5.874486399249461304297e-07.  */
> +       .quad   0x3FF00023E14CFBA9 /* A01 = +1.000034217911642153709e+00.  */
> +       .quad   0xBF47392F923218D2 /* A02 = -7.087213783883111826306e-04.  */
> +       .quad   0xBFD4FB1FACDEB938 /* A03 = -3.278273761924483942209e-01.  */
> +       .quad   0xBEAA6E24F543500A /* A00 = -7.876828740601738750574e-07.  */
> +       .quad   0x3FF0002D5C6E8412 /* A01 = +1.000043259679163742959e+00.  */
> +       .quad   0xBF4BAF02BD7FDD70 /* A02 = -8.448375110664940040861e-04.  */
> +       .quad   0xBFD4EFEE6527A7DE /* A03 = -3.271442401734229177279e-01.  */
> +       .quad   0xBEB16E3EBE2157D0 /* A00 = -1.038947396133402500647e-06.  */
> +       .quad   0x3FF00038990FEE2F /* A01 = +1.000053975962952312884e+00.  */
> +       .quad   0xBF50569481C574CB /* A02 = -9.972048056490652716971e-04.  */
> +       .quad   0xBFD4E419278DA2B4 /* A03 = -3.264220129263251113372e-01.  */
> +       .quad   0xBEB6A7B6723165D4 /* A00 = -1.350350836279403750524e-06.  */
> +       .quad   0x3FF00045CAB4158E /* A01 = +1.000066558657042303793e+00.  */
> +       .quad   0xBF531D7C9C849108 /* A02 = -1.166698160951775212202e-03.  */
> +       .quad   0xBFD4D7A0BB33B152 /* A03 = -3.256608799117844954552e-01.  */
> +       .quad   0xBEBD0EE2A8654AFD /* A00 = -1.732000471561702711532e-06.  */
> +       .quad   0x3FF00055276F18D6 /* A01 = +1.000081209219890521211e+00.  */
> +       .quad   0xBF562FDBA3FB6C6C /* A02 = -1.354183666925102939860e-03.  */
> +       .quad   0xBFD4CA85F1B93DB2 /* A03 = -3.248610363561638125773e-01.  */
> +       .quad   0xBEC269D4036A207E /* A00 = -2.195047297096822741730e-06.  */
> +       .quad   0x3FF00066E7DA6E4E /* A01 = +1.000098138500919997540e+00.  */
> +       .quad   0xBF5991499FC36B3A /* A02 = -1.560518167983372759405e-03.  */
> +       .quad   0xBFD4BCC9A72283D6 /* A03 = -3.240226871658341556426e-01.  */
> +       .quad   0xBEC7154B6C09CFE1 /* A00 = -2.751729738565190291276e-06.  */
> +       .quad   0x3FF0007B47086B80 /* A01 = +1.000117566559055148900e+00.  */
> +       .quad   0xBF5D455433B4F8F4 /* A02 = -1.786548832412968197680e-03.  */
> +       .quad   0xBFD4AE6CC1BFE145 /* A03 = -3.231460468373550942722e-01.  */
> +       .quad   0xBECCA68CC64A0F8A /* A00 = -3.415415948561670285790e-06.  */
> +       .quad   0x3FF00092827742F7 /* A01 = +1.000139722473418535387e+00.  */
> +       .quad   0xBF60A7BF15A527AF /* A02 = -2.033112728132522705610e-03.  */
> +       .quad   0xBFD49F703214084C /* A03 = -3.222313393636155876010e-01.  */
> +       .quad   0xBED19E68676B241B /* A00 = -4.200644630977303616698e-06.  */
> +       .quad   0x3FF000ACDA037B26 /* A01 = +1.000164844146362863597e+00.  */
> +       .quad   0xBF62D99F836A02F8 /* A02 = -2.301036405072284102280e-03.  */
> +       .quad   0xBFD48FD4F2B91B28 /* A03 = -3.212787981359945810311e-01.  */
> +       .quad   0xBED57CF4B0C7AA54 /* A00 = -5.123164339408145209103e-06.  */
> +       .quad   0x3FF000CA8FD9E1A1 /* A01 = +1.000193178099017865534e+00.  */
> +       .quad   0xBF653A014548E686 /* A02 = -2.591135484433962181405e-03.  */
> +       .quad   0xBFD47F9C0844B38F /* A03 = -3.202886658426046806447e-01.  */
> +       .quad   0xBEDA012B1B1A41E2 /* A00 = -6.199971197454598722328e-06.  */
> +       .quad   0x3FF000EBE868FDF4 /* A01 = +1.000224979259539459520e+00.  */
> +       .quad   0xBF67CA9427E0A544 /* A02 = -2.904214255086275467410e-03.  */
> +       .quad   0xBFD46EC6812ADB37 /* A03 = -3.192611943626845749655e-01.  */
> +       .quad   0xBEDF3EAC5BF12194 /* A00 = -7.449344990702664567927e-06.  */
> +       .quad   0x3FF001112A520784 /* A01 = +1.000260510744255704196e+00.  */
> +       .quad   0xBF6A8D01ABDA4DC4 /* A02 = -3.241065277345108255891e-03.  */
> +       .quad   0xBFD45D55759FFA4A /* A03 = -3.181966446572103146551e-01.  */
> +       .quad   0xBEE2A541BC274267 /* A00 = -8.890883582164319970972e-06.  */
> +       .quad   0x3FF0013A9E5961F2 /* A01 = +1.000300043631906721231e+00.  */
> +       .quad   0xBF6D82ECD080C540 /* A02 = -3.602468994380686462264e-03.  */
> +       .quad   0xBFD44B4A0779C0AD /* A03 = -3.170952866557950611259e-01.  */
> +       .quad   0xBEE61D97609A27F4 /* A00 = -1.054553560499505625520e-05.  */
> +       .quad   0x3FF001688F56A3AF /* A01 = +1.000343856731187974773e+00.  */
> +       .quad   0xBF7056F8EFB683EC /* A02 = -3.989193351487490407647e-03.  */
> +       .quad   0xBFD438A5620F0F74 /* A03 = -3.159573991399533543500e-01.  */
> +       .quad   0xBEEA145429EDD370 /* A00 = -1.243563138839952927732e-05.  */
> +       .quad   0x3FF0019B4A242A67 /* A01 = +1.000392236341804297339e+00.  */
> +       .quad   0xBF7207D31CA78D9B /* A02 = -4.401993423445739288258e-03.  */
> +       .quad   0xBFD42568BA16E7CD /* A03 = -3.147832696228050619602e-01.  */
> +       .quad   0xBEEE96370D52680F /* A00 = -1.458491207477835326165e-05.  */
> +       .quad   0x3FF001D31D8E4115 /* A01 = +1.000445476009251821736e+00.  */
> +       .quad   0xBF73D4CC11EDC094 /* A02 = -4.841611050196221316400e-03.  */
> +       .quad   0xBFD411954D8664E7 /* A03 = -3.135731942252974469021e-01.  */
> +       .quad   0xBEF338C046215EF8 /* A00 = -1.833122622260562810219e-05.  */
> +       .quad   0x3FF00230C32C2EC1 /* A01 = +1.000534784691737621998e+00.  */
> +       .quad   0xBF76BD019BCC5DAF /* A02 = -5.551344188254799492943e-03.  */
> +       .quad   0xBFD3F2C7156DC21E /* A03 = -3.116929730668135389848e-01.  */
> +       .quad   0xBEF9B15EAE411EAE /* A00 = -2.450261207822986676092e-05.  */
> +       .quad   0x3FF002C2DF057A4D /* A01 = +1.000674124886830940184e+00.  */
> +       .quad   0xBF7B08CCD9AC1E30 /* A02 = -6.600189396301511801646e-03.  */
> +       .quad   0xBFD3C7A7A114FED8 /* A03 = -3.090609620157755976777e-01.  */
> +       .quad   0xBF00E36483C373B3 /* A00 = -3.221178528332122595812e-05.  */
> +       .quad   0x3FF0036F419480D7 /* A01 = +1.000838524028997644777e+00.  */
> +       .quad   0xBF7FD255D1777007 /* A02 = -7.768950679260206403087e-03.  */
> +       .quad   0xBFD39A453911D6CE /* A03 = -3.062909180947429588215e-01.  */
> +       .quad   0xBF05DFA04DD12059 /* A00 = -4.172046622180685472624e-05.  */
> +       .quad   0x3FF00438B2A03D8D /* A01 = +1.001030633695197069599e+00.  */
> +       .quad   0xBF828F8DBB4A9D10 /* A02 = -9.062869337255224921890e-03.  */
> +       .quad   0xBFD36AAB704697D9 /* A03 = -3.033856007044711255993e-01.  */
> +       .quad   0xBF0BF3E0C647DEFB /* A00 = -5.331544597092331081714e-05.  */
> +       .quad   0x3FF005221063D36D /* A01 = +1.001253189109060359741e+00.  */
> +       .quad   0xBF857A2CB3C96102 /* A02 = -1.048693584122917590862e-02.  */
> +       .quad   0xBFD338E65BBB4FEC /* A03 = -3.003478904549854444639e-01.  */
> +       .quad   0xBF11A506ED7C9D31 /* A00 = -6.730894835681591541979e-05.  */
> +       .quad   0x3FF0062E4D0EA92A /* A01 = +1.001508999829250345925e+00.  */
> +       .quad   0xBF88AB82C2761AF3 /* A02 = -1.204588085125866091241e-02.  */
> +       .quad   0xBFD305028D6BD206 /* A03 = -2.971807843271395688234e-01.  */
> +       .quad   0xBF1607C0922D9BF1 /* A00 = -8.403885708006799337092e-05.  */
> +       .quad   0x3FF007606C341961 /* A01 = +1.001800940198869449560e+00.  */
> +       .quad   0xBF8C25E6DA487BCF /* A02 = -1.374416688582682892494e-02.  */
> +       .quad   0xBFD2CF0D0EE8F7B5 /* A03 = -2.938873906713255768075e-01.  */
> +       .quad   0xBF1B3A8480A0A16D /* A00 = -1.038688061788578038307e-04.  */
> +       .quad   0x3FF008BB802D02D6 /* A01 = +1.002131939589323561535e+00.  */
> +       .quad   0xBF8FEB8AE99FD100 /* A02 = -1.558598065819483124983e-02.  */
> +       .quad   0xBFD297135BD0911B /* A03 = -2.904709240558688843059e-01.  */
> +       .quad   0xBF20ABB9BDB75C65 /* A00 = -1.271881327357976163798e-04.  */
> +       .quad   0x3FF00A42A76D8CD1 /* A01 = +1.002504972472525901495e+00.  */
> +       .quad   0xBF91FF3D752BB9E6 /* A02 = -1.757522609380570560722e-02.  */
> +       .quad   0xBFD25D235C1F88B4 /* A03 = -2.869346999779154305799e-01.  */
> +       .quad   0xBF243D3254425461 /* A00 = -1.544116913733432829448e-04.  */
> +       .quad   0x3FF00BF909D1795E /* A01 = +1.002923048355647051011e+00.  */
> +       .quad   0xBF94304E04D44942 /* A02 = -1.971551804042204897316e-02.  */
> +       .quad   0xBFD2214B5E61CFA6 /* A03 = -2.832821294498394371075e-01.  */
> +       .quad   0xBF286070011B61CE /* A00 = -1.859795307186510085994e-04.  */
> +       .quad   0x3FF00DE1D5E1627E /* A01 = +1.003389201612804537689e+00.  */
> +       .quad   0xBF9689D5F4163F59 /* A02 = -2.201017668045266231780e-02.  */
> +       .quad   0xBFD1E39A11C3B42C /* A03 = -2.795167134743816728104e-01.  */
> +       .quad   0xBF2D250B366A79E8 /* A00 = -2.223564326486314902259e-04.  */
> +       .quad   0x3FF010003E134001 /* A01 = +1.003906481248123094829e+00.  */
> +       .quad   0xBF990C9FF91F6F81 /* A02 = -2.446222265267250853271e-02.  */
> +       .quad   0xBFD1A41E80084CDC /* A03 = -2.756420374218586655246e-01.  */
> +       .quad   0xBF314DB5DDC2A30E /* A00 = -2.640313157465248123865e-04.  */
> +       .quad   0x3FF012577608921B /* A01 = +1.004477940624503018441e+00.  */
> +       .quad   0xBF9BB9626875B0C9 /* A02 = -2.707437288829409385849e-02.  */
> +       .quad   0xBFD162E80768A9D0 /* A03 = -2.716617653228725615122e-01.  */
> +       .quad   0xBF346A6133808864 /* A00 = -3.115165050094957730625e-04.  */
> +       .quad   0x3FF014EAAFCC88A3 /* A01 = +1.005106627192198898157e+00.  */
> +       .quad   0xBF9E90BEF9BF7419 /* A02 = -2.984903716411588595059e-02.  */
> +       .quad   0xBFD12006545F7FAD /* A03 = -2.675796340899932457269e-01.  */
> +       .quad   0xBF37F180DC3848EA /* A00 = -3.653468704395550778821e-04.  */
> +       .quad   0x3FF017BD19147861 /* A01 = +1.005795572250939295955e+00.  */
> +       .quad   0xBFA0C9A14C702E07 /* A02 = -3.278831537326359207851e-02.  */
> +       .quad   0xBFD0DB895B650092 /* A03 = -2.633994476818851682154e-01.  */
> +       .quad   0xBF3BEC6AAC6D7635 /* A00 = -4.260788377246944457107e-04.  */
> +       .quad   0x3FF01AD1D884E719 /* A01 = +1.006547780778822565040e+00.  */
> +       .quad   0xBFA260B2A1B1434A /* A02 = -3.589399551186163439542e-02.  */
> +       .quad   0xBFD09581529E93D6 /* A03 = -2.591250712233067465817e-01.  */
> +       .quad   0xBF4164E26167882B /* A00 = -5.308251737086202562063e-04.  */
> +       .quad   0x3FF01FEF14B62B81 /* A01 = +1.007796364693348545316e+00.  */
> +       .quad   0xBFA4EB014538AA42 /* A02 = -4.085544557559163403315e-02.  */
> +       .quad   0xBFD029D36FEAF41F /* A03 = -2.525528519580024222613e-01.  */
> +       .quad   0xBF46F6FFF4E53DC8 /* A00 = -7.008313930700277652464e-04.  */
> +       .quad   0x3FF027CBB51CBBA0 /* A01 = +1.009715754956893363214e+00.  */
> +       .quad   0xBFA89DEC9FEC112E /* A02 = -4.807986690687680864098e-02.  */
> +       .quad   0xBFCF2A99464D0DB4 /* A03 = -2.434875100390009317053e-01.  */
> +       .quad   0xBF4DCC9C4F66A4D9 /* A00 = -9.094012482836712945103e-04.  */
> +       .quad   0x3FF030E7CFCCD583 /* A01 = +1.011939822882909068014e+00.  */
> +       .quad   0xBFACAA3B95814081 /* A02 = -5.598627281199331645611e-02.  */
> +       .quad   0xBFCDF78F156BE7CF /* A03 = -2.341173987004467604844e-01.  */
> +       .quad   0xBF5308ED74E5C7A6 /* A00 = -1.161796466103906435435e-03.  */
> +       .quad   0x3FF03B5986412ECB /* A01 = +1.014489674026594512313e+00.  */
> +       .quad   0xBFB087EBA88DCC3F /* A02 = -6.457398285947223148806e-02.  */
> +       .quad   0xBFCCBB9BD134862F /* A03 = -2.244753619680052991736e-01.  */
> +       .quad   0xBF57FA23C00DF4B5 /* A00 = -1.463446533505758208674e-03.  */
> +       .quad   0x3FF0473558A1BCC0 /* A01 = +1.017384859292903342975e+00.  */
> +       .quad   0xBFB2E702BC6360EF /* A02 = -7.383744334527241048871e-02.  */
> +       .quad   0xBFCB77D546379288 /* A03 = -2.145945160729250122955e-01.  */
> +       .quad   0xBF5DD12971557F71 /* A00 = -1.819887610814388068450e-03.  */
> +       .quad   0x3FF0548DDF5000A8 /* A01 = +1.020643112482540360020e+00.  */
> +       .quad   0xBFB571B63DA186E1 /* A02 = -8.376635555898871710045e-02.  */
> +       .quad   0xBFCA2D5202605148 /* A03 = -2.045080672838912594358e-01.  */
> +       .quad   0xBF6252B1AD5D4F17 /* A00 = -2.236697221556737096709e-03.  */
> +       .quad   0x3FF063738A910BF7 /* A01 = +1.024280110622155737232e+00.  */
> +       .quad   0xBFB8270C8E6B601B /* A02 = -9.434584118878357184013e-02.  */
> +       .quad   0xBFC8DD27D950A07E /* A03 = -1.942491351230763441116e-01.  */
> +       .quad   0xBF66470C91730CFC /* A00 = -2.719425723258004842786e-03.  */
> +       .quad   0x3FF073F468FCF331 /* A01 = +1.028309259519300633556e+00.  */
> +       .quad   0xBFBB05C2952191E4 /* A02 = -1.055566419686964629854e-01.  */
> +       .quad   0xBFC7886A770DE2BD /* A03 = -1.838505822486435070662e-01.  */
> +       .quad   0xBF6AD114AC8E98EC /* A00 = -3.273525599485007861467e-03.  */
> +       .quad   0x3FF0861BF53E5226 /* A01 = +1.032741506559554434119e+00.  */
> +       .quad   0xBFBE0C4F9B461507 /* A02 = -1.173753503881763554650e-01.  */
> +       .quad   0xBFC6302A037CDE3A /* A03 = -1.733448521642786954722e-01.  */
> +       .quad   0xBF6FFBDE2A6C2AF8 /* A00 = -3.904279630096648551207e-03.  */
> +       .quad   0x3FF099F2EB8E7DA3 /* A01 = +1.037585182326304034106e+00.  */
> +       .quad   0xBFC09C74D192DDF0 /* A02 = -1.297746680554463516444e-01.  */
> +       .quad   0xBFC4D571D8E3079F /* A03 = -1.627638157861470424859e-01.  */
> +       .quad   0xBF72E8FDC0B952AA /* A00 = -4.616728994353872309042e-03.  */
> +       .quad   0x3FF0AF7F273C9533 /* A01 = +1.042845872181101141152e+00.  */
> +       .quad   0xBFC244C512736F10 /* A02 = -1.427236881344176033792e-01.  */
> +       .quad   0xBFC379474F58B902 /* A03 = -1.521386277613104298645e-01.  */
> +       .quad   0xBF762EABAF17395B /* A00 = -5.415602341101023557701e-03.  */
> +       .quad   0x3FF0C6C3886F63FB /* A01 = +1.048526318502125631582e+00.  */
> +       .quad   0xBFC3FDF9918EA12A /* A02 = -1.561881981590514389957e-01.  */
> +       .quad   0xBFC21CA89ECAB895 /* A03 = -1.414995932913753196036e-01.  */
> +       .quad   0xBF79D387CE5B2BAE /* A00 = -6.305246822828998107258e-03.  */
> +       .quad   0x3FF0DFBFE2346376 /* A01 = +1.054626353847394337748e+00.  */
> +       .quad   0xBFC5C6DA43602620 /* A02 = -1.701309994680721970894e-01.  */
> +       .quad   0xBFC0C08BD8DB6631 /* A03 = -1.308760460731704100557e-01.  */
> +       .quad   0xBF7DDBA8E8DA9060 /* A00 = -7.289562037531366334164e-03.  */
> +       .quad   0x3FF0FA70F0D1B464 /* A01 = +1.061142864894713433443e+00.  */
> +       .quad   0xBFC79E18D92BAA7C /* A02 = -1.845122394946264732241e-01.  */
> +       .quad   0xBFBECBBBF74C2669 /* A03 = -1.202962378266875381749e-01.  */
> +       .quad   0xBF81254E76EA25DA /* A00 = -8.371937755572145950511e-03.  */
> +       .quad   0x3FF116D05835EBD0 /* A01 = +1.068069786618014660462e+00.  */
> +       .quad   0xBFC982539E2ED224 /* A02 = -1.992897531869327609755e-01.  */
> +       .quad   0xBFBC1B043C350159 /* A03 = -1.097872397413132278254e-01.  */
> +       .quad   0xBF8391ACBA863403 /* A00 = -9.555196230190082448686e-03.  */
> +       .quad   0x3FF134D4AA477FE2 /* A01 = +1.075398125794884141015e+00.  */
> +       .quad   0xBFCB7218609FEAFB /* A02 = -2.144194099235717521079e-01.  */
> +       .quad   0xBFB970A16CB88329 /* A03 = -9.937485603633135211599e-02.  */
> +       .quad   0xBF87935088E48E8B /* A00 = -1.151144902957603431692e-02.  */
> +       .quad   0x3FF1649892AD7DD3 /* A01 = +1.087059567413110938716e+00.  */
> +       .quad   0xBFCE6971DDE75409 /* A02 = -2.375929196847723912089e-01.  */
> +       .quad   0xBFB58291E88CB251 /* A03 = -8.402358939628952472223e-02.  */
> +       .quad   0xBF8DB3A62C325325 /* A00 = -1.450280973794233242702e-02.  */
> +       .quad   0x3FF1A9C900C6DEEA /* A01 = +1.103951457056548068891e+00.  */
> +       .quad   0xBFD13DBC65B0E08E /* A02 = -2.693930619311765140012e-01.  */
> +       .quad   0xBFB06696F62696D1 /* A03 = -6.406539449252625362252e-02.  */
> +       .quad   0xBF92583699F2E27A /* A00 = -1.791463198307716858659e-02.  */
> +       .quad   0x3FF1F451B85AA9F0 /* A01 = +1.122148246892376022288e+00.  */
> +       .quad   0xBFD34FD5F8288180 /* A02 = -3.017477916164565954205e-01.  */
> +       .quad   0xBFA6FB692825B683 /* A03 = -4.488686194495718900788e-02.  */
> +       .quad   0xBF9641C26E673D6F /* A00 = -2.173522757385398448959e-02.  */
> +       .quad   0x3FF24364DA5E2B07 /* A01 = +1.141453602790251542487e+00.  */
> +       .quad   0xBFD564A5A5EF5890 /* A02 = -3.342680092295120530821e-01.  */
> +       .quad   0xBF9B43712011A982 /* A03 = -2.662445791467283467968e-02.  */
> +       .quad   0xBF9A901038EC2F39 /* A00 = -2.594018313816024226548e-02.  */
> +       .quad   0x3FF2961356DFFEBA /* A01 = +1.161639537196534011088e+00.  */
> +       .quad   0xBFD775EBB17198C7 /* A02 = -3.665723069046972759644e-01.  */
> +       .quad   0xBF833B1A926CD462 /* A03 = -9.390075295963199591975e-03.  */
> +       .quad   0xBF9F396A6A461B91 /* A00 = -3.049246095317987084727e-02.  */
> +       .quad   0x3FF2EB53BAEF534B /* A01 = +1.182452898229899629357e+00.  */
> +       .quad   0xBFD97DABF8AD8BBD /* A02 = -3.982953957076310058660e-01.  */
> +       .quad   0x3F7B8F6A3E0F8837 /* A03 = +6.728568086119371925713e-03.  */
> +       .quad   0xBFA21878590F8BAA /* A00 = -3.534294211546946951064e-02.  */
> +       .quad   0x3FF34209790236E1 /* A01 = +1.203622315111197105253e+00.  */
> +       .quad   0xBFDB764C0E71BECB /* A02 = -4.290952817018306997277e-01.  */
> +       .quad   0x3F962FE0C03F84C0 /* A03 = +2.166701482190513949888e-02.  */
> +       .quad   0xBFA4B36B9AD27ECC /* A00 = -4.043136849327097492868e-02.  */
> +       .quad   0x3FF3990C5B12FC16 /* A01 = +1.224865298994477935679e+00.  */
> +       .quad   0xBFDD5AABB0D01390 /* A02 = -4.586590983092770912322e-01.  */
> +       .quad   0x3FA21DAF5CA162DB /* A03 = +3.538272863142363083844e-02.  */
> +       .quad   0xBFA7645E4D7BF28B /* A00 = -4.568762489177399105378e-02.  */
> +       .quad   0x3FF3EF2FD51C0D9F /* A01 = +1.245895225962932562069e+00.  */
> +       .quad   0xBFDF26377E1B686E /* A02 = -4.867075664057044503963e-01.  */
> +       .quad   0x3FA8803E756EE812 /* A03 = +4.785342391501513914509e-02.  */
> +       .quad   0xBFAA210925C64413 /* A00 = -5.103329263796054643398e-02.  */
> +       .quad   0x3FF44349F897D8E7 /* A01 = +1.266427966181760345066e+00.  */
> +       .quad   0xBFE06A7B02C6D8E2 /* A02 = -5.129981092675530707226e-01.  */
> +       .quad   0x3FAE3F194734F5D0 /* A03 = +5.907515520309980505687e-02.  */
> +       .quad   0xBFACDE48F8A19BBB /* A00 = -5.638340029764018351832e-02.  */
> +       .quad   0x3FF49439D5466582 /* A01 = +1.286187966447272845727e+00.  */
> +       .quad   0xBFE131C7C1063DDC /* A02 = -5.373266954429101183166e-01.  */
> +       .quad   0x3FB1ADEEC36AD805 /* A03 = +6.906025191241844940482e-02.  */
> +       .quad   0xBFAF905D8F585680 /* A00 = -6.164829611604449866036e-02.  */
> +       .quad   0x3FF4E0ED1FD27F99 /* A01 = +1.304913639360142818546e+00.  */
> +       .quad   0xBFE1E7A859DC1D3D /* A02 = -5.595285182070380836095e-01.  */
> +       .quad   0x3FB3ED018E4642A1 /* A03 = +7.783517573831001679086e-02.  */
> +       .quad   0xBFB11595104160BA /* A00 = -6.673556944713512906198e-02.  */
> +       .quad   0x3FF528650340490B /* A01 = +1.322361958217302513319e+00.  */
> +       .quad   0xBFE28B14B40BC974 /* A02 = -5.794776455425521000109e-01.  */
> +       .quad   0x3FB5DF49F5BAF6D7 /* A03 = +8.543836831355676453281e-02.  */
> +       .quad   0xBFB2513A97344BA4 /* A00 = -7.155195418844911836587e-02.  */
> +       .quad   0x3FF569BA0DB5EE14 /* A01 = +1.338312200124055273420e+00.  */
> +       .quad   0xBFE31B53A8B67B20 /* A02 = -5.970857901737396389308e-01.  */
> +       .quad   0x3FB787F297BB0544 /* A03 = +9.191814617499455275507e-02.  */
> +       .quad   0xBFB37512E848FAFA /* A00 = -7.600515528700305112331e-02.  */
> +       .quad   0x3FF5A41F33B403C8 /* A01 = +1.352568819013173495591e+00.  */
> +       .quad   0xBFE397F6EA9A58A5 /* A02 = -6.123003561103997904880e-01.  */
> +       .quad   0x3FB8EAA9FF25CA06 /* A03 = +9.733068923177520814782e-02.  */
> +       .quad   0xBFB47B3E603AFC5D /* A00 = -8.000554894805263217439e-02.  */
> +       .quad   0x3FF5D6E3EDE40487 /* A01 = +1.364963464031718975988e+00.  */
> +       .quad   0xBFE400D5BCA6D631 /* A02 = -6.251019177058819709103e-01.  */
> +       .quad   0x3FBA0B830ED567FE /* A03 = +1.017381583418739132707e-01.  */
> +       .quad   0xBFB5BBFE8AC90496 /* A00 = -8.489981544791400103200e-02.  */
> +       .quad   0x3FF612BA70107E95 /* A01 = +1.379572332145390989311e+00.  */
> +       .quad   0xBFE477EAF1FA7693 /* A02 = -6.396383978023599814478e-01.  */
> +       .quad   0x3FBB4784B7C08A95 /* A03 = +1.065600346196709652391e-01.  */
> +       .quad   0xBFB6D5D940743939 /* A00 = -8.920057128509463473254e-02.  */
> +       .quad   0x3FF644A8748F70CE /* A01 = +1.391762214006166953340e+00.  */
> +       .quad   0xBFE4D646AB07EA37 /* A02 = -6.511567440459832267763e-01.  */
> +       .quad   0x3FBC354F4E1D5292 /* A03 = +1.101884427747086558913e-01.  */
> +       .quad   0xBFB7223D19E4F3D1 /* A00 = -9.036619074045339206069e-02.  */
> +       .quad   0x3FF6518FEB42B7FA /* A01 = +1.394912642466350494175e+00.  */
> +       .quad   0xBFE4ED86CB87498C /* A02 = -6.539949393430091184598e-01.  */
> +       .quad   0x3FBC6D29F28CCA9B /* A03 = +1.110407082713131127205e-01.  */
> +       .quad   0xBFB6878652FF6312 /* A00 = -8.800544287022329936754e-02.  */
> +       .quad   0x3FF63948C302D040 /* A01 = +1.388985406648330922508e+00.  */
> +       .quad   0xBFE4C4E2E7904E17 /* A02 = -6.490339777687407218920e-01.  */
> +       .quad   0x3FBC127356CA1ABE /* A03 = +1.096565329445224612481e-01.  */
> +       .quad   0xBFB4F5D18B0C91D6 /* A00 = -8.187589306596207427980e-02.  */
> +       .quad   0x3FF5FD27EB7DD0B8 /* A01 = +1.374305648697413673176e+00.  */
> +       .quad   0xBFE464E01A2B2FC6 /* A02 = -6.373138915164353601739e-01.  */
> +       .quad   0x3FBB460547674A30 /* A03 = +1.065371798825160976065e-01.  */
> +       .quad   0xBFB26642FA16A685 /* A00 = -7.187288861919156890412e-02.  */
> +       .quad   0x3FF59F9BEDE1C95A /* A01 = +1.351467065073470141812e+00.  */
> +       .quad   0xBFE3D67920C8FBEA /* A02 = -6.199308052381387046381e-01.  */
> +       .quad   0x3FBA24F6A8D3CBC1 /* A03 = +1.021265184570401413078e-01.  */
> +       .quad   0xBFADB5294794F097 /* A00 = -5.802277563859197656582e-02.  */
> +       .quad   0x3FF523EA7B9CF453 /* A01 = +1.321268542159732772845e+00.  */
> +       .quad   0xBFE322A8B55E35DB /* A02 = -5.979808370918208160205e-01.  */
> +       .quad   0x3FB8C8673B1B3E37 /* A03 = +9.680791085269722928697e-02.  */
> +       .quad   0xBFA4B7D661965C6A /* A00 = -4.046506825687219699450e-02.  */
> +       .quad   0x3FF48DE3E2CE3122 /* A01 = +1.284641157110919085227e+00.  */
> +       .quad   0xBFE251FED1A7F445 /* A02 = -5.725092024655472622285e-01.  */
> +       .quad   0x3FB745699FCABDB9 /* A03 = +9.090290213747821701507e-02.  */
> +       .quad   0xBF93E60456E4EE1D /* A00 = -1.943213253365004902773e-02.  */
> +       .quad   0x3FF3E1A14E628A59 /* A01 = +1.242585474196536532432e+00.  */
> +       .quad   0xBFE16C5AB660E876 /* A02 = -5.444768488007543094653e-01.  */
> +       .quad   0x3FB5AD33AA8C188F /* A03 = +8.467410005332197397987e-02.  */
> +       .quad   0x3F738C17C47C7961 /* A00 = +4.772274820224659853951e-03.  */
> +       .quad   0x3FF3234DDE3BD146 /* A01 = +1.196119182682268355933e+00.  */
> +       .quad   0xBFE078C0D77A9D3B /* A02 = -5.147403915952176722826e-01.  */
> +       .quad   0x3FB40D74B3E276B8 /* A03 = +7.833032027925923568290e-02.  */
> +       .quad   0x3FA0474BECC689C7 /* A00 = +3.179394975019849550746e-02.  */
> +       .quad   0x3FF256FB4FA7D18A /* A01 = +1.146235762743432307076e+00.  */
> +       .quad   0xBFDEFA8E3FB285E2 /* A02 = -4.840427038235174395098e-01.  */
> +       .quad   0x3FB270C007493D59 /* A03 = +7.203293016322244446403e-02.  */
> +       .quad   0x3FAF5BD51E479BDC /* A00 = +6.124750132203590768931e-02.  */
> +       .quad   0x3FF18081D0B53BC5 /* A01 = +1.093873801484492647162e+00.  */
> +       .quad   0xBFDCFE2439BD0C03 /* A02 = -4.530115665294831006626e-01.  */
> +       .quad   0x3FB0DEFE5A45AFDD /* A03 = +6.590261176978580437424e-02.  */
> +       .quad   0x3FB7BD5D2806EA26 /* A00 = +9.273321368429118805032e-02.  */
> +       .quad   0x3FF0A369E35B4440 /* A01 = +1.039895904647224256223e+00.  */
> +       .quad   0xBFDB04BC5C9951E7 /* A02 = -4.221640495573226181669e-01.  */
> +       .quad   0x3FAEBBBAA9D6DEEF /* A03 = +6.002600978120919278380e-02.  */
> +       .quad   0x3FC01BE411098DBC /* A00 = +1.258511622610124502941e-01.  */
> +       .quad   0x3FEF85BDABC031C1 /* A01 = +9.850757936961188621083e-01.  */
> +       .quad   0xBFD91521375097C2 /* A02 = -3.919146576102968682065e-01.  */
> +       .quad   0x3FABE26F0086D982 /* A03 = +5.446192628317005068883e-02.  */
> +       .quad   0x3FC481D7FF5776B9 /* A00 = +1.602125164781023347604e-01.  */
> +       .quad   0x3FEDC3506C1E7218 /* A01 = +9.300920592973538347792e-01.  */
> +       .quad   0xBFD7349A88DA7D4F /* A02 = -3.625856720409119104964e-01.  */
> +       .quad   0x3FA936E2DFF8E2AE /* A03 = +4.924687370334389358018e-02.  */
> +       .quad   0x3FC90471F96FA27A /* A00 = +1.954481571149420671141e-01.  */
> +       .quad   0x3FEC0451601987A2 /* A01 = +8.755270840595026360376e-01.  */
> +       .quad   0xBFD5671CD4B898DC /* A02 = -3.344184949259110251063e-01.  */
> +       .quad   0x3FA6BB9594603B67 /* A03 = +4.439990459660841243261e-02.  */
> +       .quad   0x3FCFD8ADB9ED944C /* A00 = +2.488000066615846384011e-01.  */
> +       .quad   0x3FE978C073F6809A /* A01 = +7.959902062321078108909e-01.  */
> +       .quad   0xBFD2DF7E00BCD5A9 /* A02 = -2.948908812716931060471e-01.  */
> +       .quad   0x3FA3614033D490B2 /* A03 = +3.785133965200894456959e-02.  */
> +       .quad   0x3FD4846A12AFE5A0 /* A00 = +3.205819303981005674586e-01.  */
> +       .quad   0x3FE63A1147D40472 /* A01 = +6.945883181471244061100e-01.  */
> +       .quad   0xBFCFA2268AD34450 /* A02 = -2.471359422548027318101e-01.  */
> +       .quad   0x3F9F150201D9FFE0 /* A03 = +3.035357605267552383310e-02.  */
> +       .quad   0x3FD9018641F82BEB /* A00 = +3.907180446846598154131e-01.  */
> +       .quad   0x3FE33B7C220FFBDC /* A01 = +6.010113396913498995389e-01.  */
> +       .quad   0xBFCA4E4187E29C86 /* A02 = -2.055131829740483584423e-01.  */
> +       .quad   0x3F98C30CED19F8F4 /* A03 = +2.418155858185229434287e-02.  */
> +       .quad   0x3FDD4B8255BEB078 /* A00 = +4.577337109901757905561e-01.  */
> +       .quad   0x3FE0858B19D3A49B /* A01 = +5.163016800335243905451e-01.  */
> +       .quad   0xBFC5BC929EACE564 /* A02 = -1.698172831327539045176e-01.  */
> +       .quad   0x3F93A083CE57DE2B /* A03 = +1.916700312537337677621e-02.  */
> +       .quad   0x3FE0A8E5E039295C /* A00 = +5.206174258576470315063e-01.  */
> +       .quad   0x3FDC35E1234583FE /* A01 = +4.407885403107342225937e-01.  */
> +       .quad   0xBFC1DE034E31AEB9 /* A02 = -1.395877963835710222629e-01.  */
> +       .quad   0x3F8EFDEBB3471BDC /* A03 = +1.513275280821162888101e-02.  */
> +       .quad   0x3FE2851B603CB2A5 /* A00 = +5.787484054213406503564e-01.  */
> +       .quad   0x3FD7F4A44ABBB286 /* A01 = +3.743067483726821853551e-01.  */
> +       .quad   0xBFBD3EEB67087DE7 /* A02 = -1.142413260026767657385e-01.  */
> +       .quad   0x3F8864F38329E8BD /* A03 = +1.191129917173260922836e-02.  */
> +       .quad   0x3FE437DBE3C34AC1 /* A00 = +6.318187187665317283702e-01.  */
> +       .quad   0x3FD43F6F789441B5 /* A01 = +3.163717916040938438194e-01.  */
> +       .quad   0xBFB7D92E7901B9A4 /* A02 = -9.315767721429907277653e-02.  */
> +       .quad   0x3F8327ED342308E1 /* A03 = +9.353497651663324544136e-03.  */
> +       .quad   0x3FE5C0977766D55C /* A00 = +6.797597248138731451661e-01.  */
> +       .quad   0x3FD10B42A764D8F9 /* A01 = +2.663122782427219115142e-01.  */
> +       .quad   0xBFB3633351D3D70F /* A02 = -7.573242900602060456716e-02.  */
> +       .quad   0x3F7E079E30FF899C /* A03 = +7.331483779099558922843e-03.  */
> +       .quad   0x3FE7202CE08A88C4 /* A00 = +7.226776490754436288455e-01.  */
> +       .quad   0x3FCC973EB5662B01 /* A01 = +2.233656297433626314319e-01.  */
> +       .quad   0xBFAF70A455F9920B /* A02 = -6.140626477716545211782e-02.  */
> +       .quad   0x3F77812411CE99B6 /* A03 = +5.738392731393584730859e-03.  */
> +       .quad   0x3FE85879424095B1 /* A00 = +7.608000082006382003286e-01.  */
> +       .quad   0x3FC7E73BD1674D84 /* A01 = +1.867441914060742336190e-01.  */
> +       .quad   0xBFA96F84E4BF333B /* A02 = -4.967894832916504993525e-02.  */
> +       .quad   0x3F72606DDCA6E117 /* A03 = +4.486493251924870105662e-03.  */
> +       .quad   0x3FE96BFE4957F4DD /* A00 = +7.944327766887472330737e-01.  */
> +       .quad   0x3FC3ED4780D25478 /* A01 = +1.556786898624158421711e-01.  */
> +       .quad   0xBFA489C5F9A56B58 /* A02 = -4.011362717093075458408e-02.  */
> +       .quad   0x3F6CB5DC17E9AD2A /* A03 = +3.504686231556104931972e-03.  */
> +       .quad   0x3FEA5D9CB2F41234 /* A00 = +8.239272589858672724006e-01.  */
> +       .quad   0x3FC091A758374DCF /* A01 = +1.294449978582705440555e-01.  */
> +       .quad   0xBFA08E436D4B5CE0 /* A02 = -3.233538350257858517978e-02.  */
> +       .quad   0x3F666997AD53E6B7 /* A03 = +2.735897297154145629133e-03.  */
> +       .quad   0x3FEB3060342CB850 /* A00 = +8.496552485501158713532e-01.  */
> +       .quad   0x3FBB7D30BBC7DC1B /* A01 = +1.073790033768634993860e-01.  */
> +       .quad   0xBF9AA6BA3443D9E3 /* A02 = -2.602663940430173170060e-02.  */
> +       .quad   0x3F617CA764B7850B /* A03 = +2.134634914668814050648e-03.  */
> +       .quad   0x3FEBE759A6A0C7B8 /* A00 = +8.719909910635044170135e-01.  */
> +       .quad   0x3FB6C10DE6A703FF /* A01 = +8.888327485239243264115e-02.  */
> +       .quad   0xBF956C566D8BE1F6 /* A02 = -2.092108768099084498138e-02.  */
> +       .quad   0x3F5B46D1A4A59CF8 /* A03 = +1.664833764687232917079e-03.  */
> +       .quad   0x3FEC858494887A04 /* A00 = +8.912985707318630268503e-01.  */
> +       .quad   0x3FB2CC31F543394D /* A01 = +7.342827070099140762682e-02.  */
> +       .quad   0xBF9133477FF69137 /* A02 = -1.679717749142747504343e-02.  */
> +       .quad   0x3F5544482FBB4DA5 /* A03 = +1.298017973501022466823e-03.  */
> +       .quad   0x3FED0DB59D0E32E9 /* A00 = +9.079235141267335551518e-01.  */
> +       .quad   0x3FAF006BAFFC6EF4 /* A01 = +6.055008433597022787787e-02.  */
> +       .quad   0xBF8B97146FA2B97A /* A02 = -1.347175565419144252499e-02.  */
> +       .quad   0x3F5093B01F4CDC69 /* A03 = +1.011774057770665211434e-03.  */
> +       .quad   0x3FEDB487C3EC457C /* A00 = +9.282873942012623835751e-01.  */
> +       .quad   0x3FA7390C09D0BD1D /* A01 = +4.535710925881118044112e-02.  */
> +       .quad   0xBF83D9F7C3181106 /* A02 = -9.693084374710735778846e-03.  */
> +       .quad   0x3F46E34A0A3C0E64 /* A03 = +6.984817050299072134500e-04.  */
> +       .quad   0x3FEE5FFCB4E6EB00 /* A00 = +9.492171796076434020506e-01.  */
> +       .quad   0x3F9F4913ED00AADF /* A01 = +3.055220731782070861526e-02.  */
> +       .quad   0xBF79670BD0E59B5C /* A02 = -6.201788097633133961528e-03.  */
> +       .quad   0x3F3BC998EBCAF96D /* A03 = +4.240034429975534616304e-04.  */
> +       .quad   0x3FEEDBA41E9542FE /* A00 = +9.643116566968215064293e-01.  */
> +       .quad   0x3F94F5DD18D9C24D /* A01 = +2.046914543319848858727e-02.  */
> +       .quad   0xBF7034896AA122B9 /* A02 = -3.956352980886528904192e-03.  */
> +       .quad   0x3F30DCCB47810B39 /* A03 = +2.573009765038273091199e-04.  */
> +       .quad   0x3FEF33F2882520ED /* A00 = +9.750912341196716903724e-01.  */
> +       .quad   0x3F8BF37F2CF553FF /* A01 = +1.364802699996836392315e-02.  */
> +       .quad   0xBF649F6F05A69619 /* A02 = -2.517430152880317534986e-03.  */
> +       .quad   0x3F247623C950AAC9 /* A03 = +1.561087307505231250044e-04.  */
> +       .quad   0x3FEF727757751741 /* A00 = +9.827229221489021115943e-01.  */
> +       .quad   0x3F828E67912C4400 /* A01 = +9.060677640748693306705e-03.  */
> +       .quad   0xBF5A2F51A806CC2C /* A02 = -1.598195784123355826789e-03.  */
> +       .quad   0x3F18D35D7687E613 /* A03 = +9.470231965016282719549e-05.  */
> +       .quad   0x3FEF9E6325C5942A /* A00 = +9.880843866091073568469e-01.  */
> +       .quad   0x3F788AB117618F76 /* A01 = +5.991641772286606867914e-03.  */
> +       .quad   0xBF5096EAB0B1EA89 /* A02 = -1.012543859160305046233e-03.  */
> +       .quad   0x3F0E1E50EC4435AB /* A03 = +5.744633156910412119652e-05.  */
> +       .quad   0x3FEFBD0784049369 /* A00 = +9.918248728250605994461e-01.  */
> +       .quad   0x3F702BBD8294035F /* A01 = +3.947963975634432264028e-03.  */
> +       .quad   0xBF44FB55E0F00593 /* A02 = -6.403130845457509273330e-04.  */
> +       .quad   0x3F0244DCD723230A /* A03 = +3.484534217219031730379e-05.  */
> +       .quad   0x3FEFD245E2366A43 /* A00 = +9.944180887426415926811e-01.  */
> +       .quad   0x3F653D82EC088433 /* A01 = +2.592807490387838333795e-03.  */
> +       .quad   0xBF3A7DF75E013CB8 /* A02 = -4.042366908878036561859e-04.  */
> +       .quad   0x3EF6298E69F991CD /* A03 = +2.113564425911141559972e-05.  */
> +       .quad   0x3FEFE0EAA508BC69 /* A00 = +9.962056372950317539861e-01.  */
> +       .quad   0x3F5BD0771AF3FDDA /* A01 = +1.697651208644282514598e-03.  */
> +       .quad   0xBF30B2E1254DE571 /* A02 = -2.548026725928887099328e-04.  */
> +       .quad   0x3EEAE28B70EC0256 /* A03 = +1.281973848454955042307e-05.  */
> +       .quad   0x3FEFEAF5303D7F96 /* A00 = +9.974313680831865536192e-01.  */
> +       .quad   0x3F5229111365657E /* A01 = +1.108423877289460134782e-03.  */
> +       .quad   0xBF250572D04DFE66 /* A02 = -1.603796628408704519168e-04.  */
> +       .quad   0x3EE04E89BB57C981 /* A03 = +7.775682983689149966743e-06.  */
> +       .quad   0x3FEFF1CF52F1CF44 /* A00 = +9.982678051005469122003e-01.  */
> +       .quad   0x3F47A71316147CEB /* A01 = +7.218211359577819110842e-04.  */
> +       .quad   0xBF1A6D7604055719 /* A02 = -1.008132248946049582547e-04.  */
> +       .quad   0x3ED3C8047586A85C /* A03 = +4.716233739913014633626e-06.  */
> +       .quad   0x3FEFF6770369EF69 /* A00 = +9.988360468555416149528e-01.  */
> +       .quad   0x3F3EBB261180FBF0 /* A01 = +4.689186039321105101130e-04.  */
> +       .quad   0xBF1097754FE19D7F /* A02 = -6.329206004950480057066e-05.  */
> +       .quad   0x3EC7FEFF83BCA0A7 /* A03 = +2.860556404988488738366e-06.  */
> +       .quad   0x3FEFF99D42371AC4 /* A00 = +9.992204945818561334647e-01.  */
> +       .quad   0x3F33EB2AEC271F59 /* A01 = +3.039340773764907474054e-04.  */
> +       .quad   0xBF04CF18E0FC0D79 /* A02 = -3.968996690952969588805e-05.  */
> +       .quad   0x3EBD1BDBD6019BE9 /* A03 = +1.735021065507727833886e-06.  */
> +       .quad   0x3FEFFBBCA32B0D91 /* A00 = +9.994795977476532700123e-01.  */
> +       .quad   0x3F29C41E1615110A /* A01 = +1.965796209707565346710e-04.  */
> +       .quad   0xBEFA11F93D9DCB5A /* A02 = -2.486248909101414873235e-05.  */
> +       .quad   0x3EB1A7CA4546F7A7 /* A03 = +1.052345642723709228769e-06.  */
> +       .quad   0x3FEFFD298B8E8DE2 /* A00 = +9.996535993308806045121e-01.  */
> +       .quad   0x3F20A1C42D523C5B /* A01 = +1.268913244172078754520e-04.  */
> +       .quad   0xBEF0507A364AFAE4 /* A02 = -1.555859070622834605755e-05.  */
> +       .quad   0x3EA56ACA17E7CDF4 /* A03 = +6.382806956848098872313e-07.  */
> +       .quad   0x3FEFFE1DC82BA5A3 /* A00 = +9.997700604991915929176e-01.  */
> +       .quad   0x3F156E73B90F1769 /* A01 = +8.175450626798714452801e-05.  */
> +       .quad   0xBEE4663579D0A09F /* A02 = -9.727122057226747625365e-06.  */
> +       .quad   0x3E99FAF6FEC5D4C1 /* A03 = +3.871371052824002996020e-07.  */
> +       .quad   0x3FEFFEF8D0BB5E81 /* A00 = +9.998745037837154514548e-01.  */
> +       .quad   0x3F06686DA18D39C3 /* A01 = +4.273972098777251447726e-05.  */
> +       .quad   0xBED46BC298073E90 /* A02 = -4.868731025855742842491e-06.  */
> +       .quad   0x3E88E42286B9D0FD /* A03 = +1.854535328530838170114e-07.  */
> +       .quad   0x3FEFFF8DBC68DDC7 /* A00 = +9.999455146670975791423e-01.  */
> +       .quad   0x3EF26B2953A80AF0 /* A01 = +1.756534514108903368909e-05.  */
> +       .quad   0xBEBFC4472D580F83 /* A02 = -1.893443529411295465239e-06.  */
> +       .quad   0x3E72505B4553D19F /* A03 = +6.822456673547912277047e-08.  */
> +       .quad   0x3FEFFFCED1276609 /* A00 = +9.999765477215883935358e-01.  */
> +       .quad   0x3EDE1A94C7CC58F5 /* A01 = +7.177313020153979672606e-06.  */
> +       .quad   0xBEA8A2C988744E57 /* A02 = -7.342066660497443762363e-07.  */
> +       .quad   0x3E5AF30036BBBAF4 /* A03 = +2.509841882843541084885e-08.  */
> +       .quad   0x3FEFFFEAFE70FCFC /* A00 = +9.999899835164849370983e-01.  */
> +       .quad   0x3EC879175E3549F5 /* A01 = +2.917410471128503564412e-06.  */
> +       .quad   0xBE930E36677D1813 /* A02 = -2.839493400307523115929e-07.  */
> +       .quad   0x3E43D4005B42D48F /* A03 = +9.233192745401904898013e-09.  */
> +       .quad   0x3ff0000000000000
> +       .quad   0x0000000000000000
> +       .quad   0x0000000000000000
> +       .quad   0x0000000000000000
> +
> +       .align  32
> +       .type   __svml_stanh_data_internal_avx2, @object
> +       .size   __svml_stanh_data_internal_avx2, .-__svml_stanh_data_internal_avx2
> +       .type   __svml_stanh_data_internal, @object
> +       .size   __svml_stanh_data_internal, .-__svml_stanh_data_internal
> +#endif
> --
> 2.34.1
>

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH v4 1/7] x86: Improve svml_s_atanhf16_core_avx512.S
  2022-06-07 20:06 [PATCH v1 1/7] x86: Improve svml_s_atanhf16_core_avx512.S Noah Goldstein
                   ` (7 preceding siblings ...)
  2022-06-09 16:58 ` [PATCH v3 " Noah Goldstein
@ 2022-06-09 18:16 ` Noah Goldstein
  2022-06-09 18:16   ` [PATCH v4 2/7] x86: Improve svml_s_atanhf8_core_avx2.S Noah Goldstein
                     ` (2 more replies)
  8 siblings, 3 replies; 48+ messages in thread
From: Noah Goldstein @ 2022-06-09 18:16 UTC (permalink / raw)
  To: libc-alpha

Improvements are:
    1. Reduce code size (-64 bytes).
    2. Remove redundant move instructions.
    3. Slightly improve instruction selection/scheduling where
       possible.
    4. Reduce rodata size ([-128, -188] bytes).

The throughput improvement is not significant as the port 0 bottleneck
is unavoidable.

        Function, New Time, Old Time, New / Old
_ZGVeN16v_atanhf,     1.39,    1.408,     0.987
---
 .../multiarch/svml_s_atanhf16_core_avx512.S   | 474 +++++++++---------
 1 file changed, 244 insertions(+), 230 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
index a1cd920a0f..f42462c581 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
@@ -31,53 +31,50 @@
  *
  */
 
-/* Offsets for data table __svml_satanh_data_internal_avx512
- */
-#define Log_tbl_H			0
-#define Log_tbl_L			128
-#define One				256
-#define AbsMask				320
-#define AddB5				384
-#define RcpBitMask			448
-#define poly_coeff3			512
-#define poly_coeff2			576
-#define poly_coeff1			640
-#define poly_coeff0			704
-#define Half				768
-#define L2H				832
-#define L2L				896
+/* Offsets for data table __svml_satanh_data_internal_avx512 and
+   __svml_satanh_data_internal_avx512_al64. Ordered by use in the
+   function. On cold-starts this might help the prefetcher. Possibly
+   a better idea is to interleave start/end so that the prefetcher is
+   less likely to detect a stream and pull irrelevant lines into
+   cache.  */
+
+/* Offset into __svml_satanh_data_internal_avx512. 4-byte aligned as
+   the memory is broadcast to {1to16}.  */
+#define AbsMask				0
+
+/* Offset into __svml_satanh_data_internal_avx512_al64. The full value
+   is used here.  */
+#define One				0
+#define AddB5				64
+#define RcpBitMask			128
+#define Log_tbl_L_lo			192
+#define Log_tbl_L_hi			256
+#define Log_tbl_H_lo			320
+#define Log_tbl_H_hi			384
+#define L2H				448
+#define L2L				512
+#define poly_coeff3			576
+#define poly_coeff2			640
+#define poly_coeff1			704
 
 #include <sysdep.h>
 
+#define ATANHF_DATA(x)			((x)+__svml_satanh_data_internal_avx512_al64)
+
 	.section .text.exex512, "ax", @progbits
 ENTRY(_ZGVeN16v_atanhf_skx)
-	pushq	%rbp
-	cfi_def_cfa_offset(16)
-	movq	%rsp, %rbp
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
-	andq	$-64, %rsp
-	subq	$192, %rsp
-	vmovups	One+__svml_satanh_data_internal_avx512(%rip), %zmm4
-
-	/* round reciprocals to 1+5b mantissas */
-	vmovups	AddB5+__svml_satanh_data_internal_avx512(%rip), %zmm14
-	vmovups	RcpBitMask+__svml_satanh_data_internal_avx512(%rip), %zmm1
-	vmovaps	%zmm0, %zmm11
-	vandps	AbsMask+__svml_satanh_data_internal_avx512(%rip), %zmm11, %zmm6
+	vandps	AbsMask+__svml_satanh_data_internal_avx512(%rip){1to16}, %zmm0, %zmm6
+	vmovups	ATANHF_DATA(One)(%rip), %zmm4
 
 	/* 1+y */
 	vaddps	{rn-sae}, %zmm4, %zmm6, %zmm9
 
 	/* 1-y */
 	vsubps	{rn-sae}, %zmm6, %zmm4, %zmm8
-	vxorps	%zmm6, %zmm11, %zmm10
-
-	/* Yp_high */
-	vsubps	{rn-sae}, %zmm4, %zmm9, %zmm2
 
-	/* -Ym_high */
-	vsubps	{rn-sae}, %zmm4, %zmm8, %zmm5
+	/* round reciprocals to 1+5b mantissas */
+	vmovups	ATANHF_DATA(AddB5)(%rip), %zmm14
+	vmovups	ATANHF_DATA(RcpBitMask)(%rip), %zmm1
 
 	/* RcpP ~ 1/Yp */
 	vrcp14ps %zmm9, %zmm12
@@ -85,15 +82,21 @@ ENTRY(_ZGVeN16v_atanhf_skx)
 	/* RcpM ~ 1/Ym */
 	vrcp14ps %zmm8, %zmm13
 
+	/* Yp_high */
+	vsubps	{rn-sae}, %zmm4, %zmm9, %zmm2
+
+	/* -Ym_high */
+	vsubps	{rn-sae}, %zmm4, %zmm8, %zmm5
+
+
 	/* input outside (-1, 1) ? */
-	vcmpps	$21, {sae}, %zmm4, %zmm6, %k0
 	vpaddd	%zmm14, %zmm12, %zmm15
-	vpaddd	%zmm14, %zmm13, %zmm0
+	vpaddd	%zmm14, %zmm13, %zmm12
 
 	/* Yp_low */
 	vsubps	{rn-sae}, %zmm2, %zmm6, %zmm3
 	vandps	%zmm1, %zmm15, %zmm7
-	vandps	%zmm1, %zmm0, %zmm12
+	vandps	%zmm1, %zmm12, %zmm12
 
 	/* Ym_low */
 	vaddps	{rn-sae}, %zmm5, %zmm6, %zmm5
@@ -102,225 +105,199 @@ ENTRY(_ZGVeN16v_atanhf_skx)
 	vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9
 
 	/* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low */
-	vfmsub231ps {rn-sae}, %zmm12, %zmm8, %zmm4
-	vmovups	Log_tbl_L+__svml_satanh_data_internal_avx512(%rip), %zmm8
-	vmovups	Log_tbl_L+64+__svml_satanh_data_internal_avx512(%rip), %zmm13
+	vfmsub213ps {rn-sae}, %zmm4, %zmm12, %zmm8
+
+	vmovups	ATANHF_DATA(Log_tbl_L_lo)(%rip), %zmm10
+	vmovups	ATANHF_DATA(Log_tbl_L_hi)(%rip), %zmm13
 
 	/* exponents */
-	vgetexpps {sae}, %zmm7, %zmm15
 	vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9
+	vgetexpps {sae}, %zmm7, %zmm15
+
 
 	/* Table lookups */
-	vmovups	__svml_satanh_data_internal_avx512(%rip), %zmm6
+	vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm8
 	vgetexpps {sae}, %zmm12, %zmm14
-	vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm4
+
 
 	/* Prepare table index */
 	vpsrld	$18, %zmm7, %zmm3
 	vpsrld	$18, %zmm12, %zmm2
-	vmovups	Log_tbl_H+64+__svml_satanh_data_internal_avx512(%rip), %zmm7
-	vmovups	poly_coeff1+__svml_satanh_data_internal_avx512(%rip), %zmm12
-
+	vmovups	ATANHF_DATA(Log_tbl_H_lo)(%rip), %zmm11
+	vmovups	ATANHF_DATA(Log_tbl_H_hi)(%rip), %zmm7
 	/* Km-Kp */
+
+	vmovaps	%zmm3, %zmm5
+	vpermi2ps %zmm13, %zmm10, %zmm3
+	vpermt2ps %zmm13, %zmm2, %zmm10
+	vpermi2ps %zmm7, %zmm11, %zmm5
+	vpermt2ps %zmm7, %zmm2, %zmm11
 	vsubps	{rn-sae}, %zmm15, %zmm14, %zmm1
-	kmovw	%k0, %edx
-	vmovaps	%zmm3, %zmm0
-	vpermi2ps %zmm13, %zmm8, %zmm3
-	vpermt2ps %zmm13, %zmm2, %zmm8
-	vpermi2ps %zmm7, %zmm6, %zmm0
-	vpermt2ps %zmm7, %zmm2, %zmm6
-	vsubps	{rn-sae}, %zmm3, %zmm8, %zmm5
+	vsubps	{rn-sae}, %zmm3, %zmm10, %zmm7
 
 	/* K*L2H + Th */
-	vmovups	L2H+__svml_satanh_data_internal_avx512(%rip), %zmm2
+	vmovups	ATANHF_DATA(L2H)(%rip), %zmm2
 
 	/* K*L2L + Tl */
-	vmovups	L2L+__svml_satanh_data_internal_avx512(%rip), %zmm3
-
-	/* polynomials */
-	vmovups	poly_coeff3+__svml_satanh_data_internal_avx512(%rip), %zmm7
-	vmovups	poly_coeff0+__svml_satanh_data_internal_avx512(%rip), %zmm13
+	vmovups	ATANHF_DATA(L2L)(%rip), %zmm3
 
 	/* table values */
-	vsubps	{rn-sae}, %zmm0, %zmm6, %zmm0
-	vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm0
-	vfmadd213ps {rn-sae}, %zmm5, %zmm3, %zmm1
-	vmovups	poly_coeff2+__svml_satanh_data_internal_avx512(%rip), %zmm3
-	vmovaps	%zmm3, %zmm2
-	vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm2
-	vfmadd231ps {rn-sae}, %zmm4, %zmm7, %zmm3
-	vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm2
-	vfmadd213ps {rn-sae}, %zmm12, %zmm4, %zmm3
-	vfmadd213ps {rn-sae}, %zmm13, %zmm9, %zmm2
-	vfmadd213ps {rn-sae}, %zmm13, %zmm4, %zmm3
+	vsubps	{rn-sae}, %zmm5, %zmm11, %zmm5
+	vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm5
+	vfmadd213ps {rn-sae}, %zmm7, %zmm3, %zmm1
+	/* polynomials */
+	vmovups	ATANHF_DATA(poly_coeff3)(%rip), %zmm7
+	vmovups	ATANHF_DATA(poly_coeff2)(%rip), %zmm10
+	vmovaps	%zmm10, %zmm14
+	vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm10
+	vfmadd231ps {rn-sae}, %zmm8, %zmm7, %zmm14
+	vmovups	ATANHF_DATA(poly_coeff1)(%rip), %zmm12
+	vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm10
+	vfmadd213ps {rn-sae}, %zmm12, %zmm8, %zmm14
+	vfmadd213ps {rn-sae}, %zmm4, %zmm9, %zmm10
+	vfmadd213ps {rn-sae}, %zmm4, %zmm8, %zmm14
 
 	/* (K*L2L + Tl) + Rp*PolyP */
-	vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm2
-	vorps	Half+__svml_satanh_data_internal_avx512(%rip), %zmm10, %zmm9
+	vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm10
+
+	/* zmm12 = zmm12 & (zmm4 | zmm0).  */
+	vpternlogq $0xe0, %zmm0, %zmm4, %zmm12
 
 	/* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM */
-	vfnmadd213ps {rn-sae}, %zmm2, %zmm4, %zmm3
-	vaddps	{rn-sae}, %zmm3, %zmm0, %zmm4
-	vmulps	{rn-sae}, %zmm9, %zmm4, %zmm0
+	vfnmadd213ps {rn-sae}, %zmm5, %zmm8, %zmm14
+	vaddps	{rn-sae}, %zmm14, %zmm10, %zmm8
+
+	vcmpps	$21, {sae}, %zmm4, %zmm6, %k0
+	kmovw	%k0, %edx
 	testl	%edx, %edx
 
 	/* Go to special inputs processing branch */
 	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm11
+	# LOE rbx r12 r13 r14 r15 zmm0 zmm8 zmm12
+	vmulps	{rn-sae}, %zmm12, %zmm8, %zmm0
 
-	/* Restore registers
-	 * and exit the function
-	 */
-
-L(EXIT):
-	movq	%rbp, %rsp
-	popq	%rbp
-	cfi_def_cfa(7, 8)
-	cfi_restore(6)
+	/* No register to restore on fast path.  */
 	ret
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
-
-	/* Branch to process
-	 * special inputs
-	 */
 
+	/* Cold case. edx has 1s where there was a special value that
+	   needs to be handled by an atanhf call. Optimize for code size
+	   more so than speed here. */
 L(SPECIAL_VALUES_BRANCH):
-	vmovups	%zmm11, 64(%rsp)
-	vmovups	%zmm0, 128(%rsp)
-	# LOE rbx r12 r13 r14 r15 edx zmm0
-
-	xorl	%eax, %eax
-	# LOE rbx r12 r13 r14 r15 eax edx
-
-	vzeroupper
-	movq	%r12, 16(%rsp)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
-	movl	%eax, %r12d
-	movq	%r13, 8(%rsp)
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-	movl	%edx, %r13d
-	movq	%r14, (%rsp)
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r15 r12d r13d
-
-	/* Range mask
-	 * bits check
+	# LOE rbx rdx r12 r13 r14 r15 zmm0 zmm8 zmm12
+    /* Use r13 to save/restore the stack. This allows us to use rbp as
+       callee save register saving code size. */
+	pushq	%r13
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(r13, -16)
+	/* Need to callee save registers to preserve state across atanhf calls.
 	 */
+	pushq	%rbx
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(rbx, -24)
+	pushq	%rbp
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(rbp, -32)
+	movq	%rsp, %r13
+	cfi_def_cfa_register(r13)
 
-L(RANGEMASK_CHECK):
-	btl	%r12d, %r13d
-
-	/* Call scalar math function */
-	jc	L(SCALAR_MATH_CALL)
-	# LOE rbx r15 r12d r13d
-
-	/* Special inputs
-	 * processing loop
-	 */
+	/* Align stack and make room for 2x zmm vectors.  */
+	andq	$-64, %rsp
+	addq	$-128, %rsp
+	vmulps	{rn-sae}, %zmm12, %zmm8, %zmm1
+	vmovaps	%zmm1, (%rsp)
+	vmovaps	%zmm0, 64(%rsp)
+	vzeroupper
 
+	/* edx has 1s where there was a special value that needs to be handled
+	   by an atanhf call.  */
+	movl	%edx, %ebx
 L(SPECIAL_VALUES_LOOP):
-	incl	%r12d
-	cmpl	$16, %r12d
-
-	/* Check bits in range mask */
-	jl	L(RANGEMASK_CHECK)
-	# LOE rbx r15 r12d r13d
-
-	movq	16(%rsp), %r12
-	cfi_restore(12)
-	movq	8(%rsp), %r13
-	cfi_restore(13)
-	movq	(%rsp), %r14
-	cfi_restore(14)
-	vmovups	128(%rsp), %zmm0
-
-	/* Go to exit */
-	jmp	L(EXIT)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r12 r13 r14 r15 zmm0
-
-	/* Scalar math fucntion call
-	 * to process special input
-	 */
-
-L(SCALAR_MATH_CALL):
-	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	# LOE rbx rbp r12 r13 r14 r15
+	/* use rbp as index for special value that is saved across calls to
+	   atanhf. We technically don't need a callee save register here as offset
+	   to rsp is always [0, 60] so we can restore rsp by realigning to 64.
+	   Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
+	   in the loop. Realigning also costs more code size.  */
+	xorl	%ebp, %ebp
+	tzcntl	%ebx, %ebp
+
+	/* Scalar math function call to process special input.  */
+	movss	64(%rsp, %rbp, 4), %xmm0
 	call	atanhf@PLT
-	# LOE rbx r14 r15 r12d r13d xmm0
-
-	movss	%xmm0, 128(%rsp, %r14, 4)
 
-	/* Process special inputs in loop */
-	jmp	L(SPECIAL_VALUES_LOOP)
-	# LOE rbx r15 r12d r13d
+	/* No good way to avoid the store-forwarding fault this will cause on
+	   return. `lfence` avoids the SF fault but at greater cost as it
+	   serializes stack/callee save restoration.  */
+	movss	%xmm0, (%rsp, %rbp, 4)
+
+	blsrl   %ebx, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+	# LOE r12 r13 r14 r15
+
+	/* All results have been written to (%rsp).  */
+	vmovaps	(%rsp), %zmm0
+	/* Restore rsp (r13 holds the value rsp had before alignment).  */
+	movq	%r13, %rsp
+	cfi_def_cfa_register(rsp)
+	/* Restore callee save registers; CFI must name the popped register.  */
+	popq	%rbp
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(rbp)
+	popq	%rbx
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(rbx)
+	popq	%r13
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(r13)
+	ret
 END(_ZGVeN16v_atanhf_skx)
 
 	.section .rodata, "a"
-	.align	64
-
+	.align	4
 #ifdef __svml_satanh_data_internal_avx512_typedef
 typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(64)) VUINT32 Log_tbl_H[32][1];
-	__declspec(align(64)) VUINT32 Log_tbl_L[32][1];
+typedef struct{
+	__declspec(align(4)) VUINT32 AbsMask[1][1];
 	__declspec(align(64)) VUINT32 One[16][1];
-	__declspec(align(64)) VUINT32 AbsMask[16][1];
 	__declspec(align(64)) VUINT32 AddB5[16][1];
 	__declspec(align(64)) VUINT32 RcpBitMask[16][1];
+	__declspec(align(64)) VUINT32 Log_tbl_L_lo[16][1];
+	__declspec(align(64)) VUINT32 Log_tbl_L_hi[16][1];
+	__declspec(align(64)) VUINT32 Log_tbl_H_lo[16][1];
+	__declspec(align(64)) VUINT32 Log_tbl_H_hi[16][1];
+	__declspec(align(64)) VUINT32 L2H[16][1];
+	__declspec(align(64)) VUINT32 L2L[16][1];
 	__declspec(align(64)) VUINT32 poly_coeff3[16][1];
 	__declspec(align(64)) VUINT32 poly_coeff2[16][1];
 	__declspec(align(64)) VUINT32 poly_coeff1[16][1];
-	__declspec(align(64)) VUINT32 poly_coeff0[16][1];
-	__declspec(align(64)) VUINT32 Half[16][1];
-	__declspec(align(64)) VUINT32 L2H[16][1];
-	__declspec(align(64)) VUINT32 L2L[16][1];
 } __svml_satanh_data_internal_avx512;
 #endif
 __svml_satanh_data_internal_avx512:
-	/* Log_tbl_H */
-	.long	0x00000000
-	.long	0x3cfc0000
-	.long	0x3d780000
-	.long	0x3db78000
-	.long	0x3df10000
-	.long	0x3e14c000
-	.long	0x3e300000
-	.long	0x3e4a8000
-	.long	0x3e648000
-	.long	0x3e7dc000
-	.long	0x3e8b4000
-	.long	0x3e974000
-	.long	0x3ea30000
-	.long	0x3eae8000
-	.long	0x3eb9c000
-	.long	0x3ec4e000
-	.long	0x3ecfa000
-	.long	0x3eda2000
-	.long	0x3ee48000
-	.long	0x3eeea000
-	.long	0x3ef8a000
-	.long	0x3f013000
-	.long	0x3f05f000
-	.long	0x3f0aa000
-	.long	0x3f0f4000
-	.long	0x3f13d000
-	.long	0x3f184000
-	.long	0x3f1ca000
-	.long	0x3f20f000
-	.long	0x3f252000
-	.long	0x3f295000
-	.long	0x3f2d7000
-	/* Log_tbl_L */
+	/* Leave this at front so we can potentially save space due to
+	   smaller alignment constraint.  */
+	.align	4
+    /* AbsMask */
+	.long	0x7fffffff
+	.align	64
+__svml_satanh_data_internal_avx512_al64:
+	/* One */
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	/* AddB5 */
+	.align	64
+	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
+	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
+	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
+	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
+	/* RcpBitMask */
+	.align	64
+	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
+	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
+	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
+	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
+	/* Log_tbl_L_lo */
 	.align	64
 	.long	0x00000000
 	.long	0x3726c39e
@@ -338,6 +315,8 @@ __svml_satanh_data_internal_avx512:
 	.long	0x38dedfac
 	.long	0x38ebfb5e
 	.long	0xb8e63c9f
+	/* Log_tbl_L_hi */
+	.align	64
 	.long	0xb85c1340
 	.long	0x38777bcd
 	.long	0xb6038656
@@ -354,39 +333,74 @@ __svml_satanh_data_internal_avx512:
 	.long	0x38f85db0
 	.long	0x37b4996f
 	.long	0xb8bfb3ca
-	/* One */
+	/* Log_tbl_H_lo */
 	.align	64
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	/* AbsMask */
+	.long	0x00000000
+	.long	0x3cfc0000
+	.long	0x3d780000
+	.long	0x3db78000
+	.long	0x3df10000
+	.long	0x3e14c000
+	.long	0x3e300000
+	.long	0x3e4a8000
+	.long	0x3e648000
+	.long	0x3e7dc000
+	.long	0x3e8b4000
+	.long	0x3e974000
+	.long	0x3ea30000
+	.long	0x3eae8000
+	.long	0x3eb9c000
+	.long	0x3ec4e000
+	/* Log_tbl_H_hi */
 	.align	64
-	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
-	/* AddB5 */
+	.long	0x3ecfa000
+	.long	0x3eda2000
+	.long	0x3ee48000
+	.long	0x3eeea000
+	.long	0x3ef8a000
+	.long	0x3f013000
+	.long	0x3f05f000
+	.long	0x3f0aa000
+	.long	0x3f0f4000
+	.long	0x3f13d000
+	.long	0x3f184000
+	.long	0x3f1ca000
+	.long	0x3f20f000
+	.long	0x3f252000
+	.long	0x3f295000
+	.long	0x3f2d7000
+	/* L2H = log(2)_high */
 	.align	64
-	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000
-	/* RcpBitMask */
+	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
+	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
+	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
+	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
+	/* L2L = log(2)_low */
 	.align	64
-	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
+	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
+	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
+	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
+	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
 	/* poly_coeff3 */
 	.align	64
-	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
+	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
+	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
+	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
+	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
 	/* poly_coeff2 */
 	.align	64
-	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
+	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
+	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
+	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
+	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
 	/* poly_coeff1 */
 	.align	64
-	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
-	/* poly_coeff0 */
-	.align	64
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	/* Half */
-	.align	64
-	.long	0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
-	/* L2H = log(2)_high */
-	.align	64
-	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
-	/* L2L = log(2)_low */
-	.align	64
-	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
 	.align	64
+	.type	__svml_satanh_data_internal_avx512_al64, @object
+	.size	__svml_satanh_data_internal_avx512_al64, .-__svml_satanh_data_internal_avx512_al64
 	.type	__svml_satanh_data_internal_avx512, @object
 	.size	__svml_satanh_data_internal_avx512, .-__svml_satanh_data_internal_avx512
-- 
2.34.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH v4 2/7] x86: Improve svml_s_atanhf8_core_avx2.S
  2022-06-09 18:16 ` [PATCH v4 " Noah Goldstein
@ 2022-06-09 18:16   ` Noah Goldstein
  2022-06-09 19:34     ` H.J. Lu
  2022-06-09 18:16   ` [PATCH v4 6/7] x86: Optimize svml_s_tanhf8_core_avx2.S Noah Goldstein
  2022-06-09 19:33   ` [PATCH v4 1/7] x86: Improve svml_s_atanhf16_core_avx512.S H.J. Lu
  2 siblings, 1 reply; 48+ messages in thread
From: Noah Goldstein @ 2022-06-09 18:16 UTC (permalink / raw)
  To: libc-alpha

Improvements are:
    1. Reduce code size (-60 bytes).
    2. Remove redundant move instructions.
    3. Slightly improve instruction selection/scheduling where
       possible.
    4. Prefer registers which get short instruction encoding.
    5. Shrink rodata usage (-32 bytes).

The throughput improvement is not that significant (3-5%) as the
port 0 bottleneck is unavoidable.

       Function, New Time, Old Time, New / Old
_ZGVdN8v_atanhf,    2.799,    2.923,     0.958
---
 .../fpu/multiarch/svml_s_atanhf8_core_avx2.S  | 405 +++++++++---------
 1 file changed, 202 insertions(+), 203 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
index c1ea1c3353..43eb423831 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
@@ -30,305 +30,304 @@
  *
  */
 
-/* Offsets for data table __svml_satanh_data_internal
- */
+/* Offsets for data table __svml_satanh_data_internal. Ordered
+   by use in the function. On cold-starts this might help the
+   prefetcher. Possibly a better idea is to interleave start/end so
+   that the prefetcher is less likely to detect a stream and pull
+   irrelevant lines into cache.  */
 #define SgnMask				0
 #define sOne				32
-#define sPoly				64
-#define iBrkValue			320
-#define iOffExpoMask			352
-#define sHalf				384
-#define sSign				416
-#define sTopMask12			448
-#define TinyRange			480
-#define sLn2				512
+#define sTopMask12			64
+#define TinyRange			96
+#define iBrkValue			128
+#define iOffExpoMask			160
+#define sPoly				192
+#define sLn2				448
+#define sHalf				480
 
 #include <sysdep.h>
+#define ATANHF_DATA(x)			((x)+__svml_satanh_data_internal)
 
 	.section .text.avx2, "ax", @progbits
 ENTRY(_ZGVdN8v_atanhf_avx2)
-	pushq	%rbp
-	cfi_def_cfa_offset(16)
-	movq	%rsp, %rbp
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
-	andq	$-32, %rsp
-	subq	$96, %rsp
-
+	/* Strip off the sign, so treat X as positive until right at the end */
+	vmovaps	ATANHF_DATA(SgnMask)(%rip), %ymm2
+	vandps	%ymm2, %ymm0, %ymm3
 	/* Load constants including One = 1 */
-	vmovups	sOne+__svml_satanh_data_internal(%rip), %ymm5
-	vmovups	sTopMask12+__svml_satanh_data_internal(%rip), %ymm13
-	vmovaps	%ymm0, %ymm6
+	vmovups	ATANHF_DATA(sOne)(%rip), %ymm5
+	vsubps	%ymm3, %ymm5, %ymm1
+	vmovups	ATANHF_DATA(sTopMask12)(%rip), %ymm4
 
-	/* Strip off the sign, so treat X as positive until right at the end */
-	vandps	SgnMask+__svml_satanh_data_internal(%rip), %ymm6, %ymm10
-	vsubps	%ymm10, %ymm5, %ymm1
+	vrcpps	%ymm1, %ymm7
+	vsubps	%ymm1, %ymm5, %ymm9
+	vandps	%ymm4, %ymm7, %ymm6
+	vsubps	%ymm3, %ymm9, %ymm7
 
-	/*
-	 * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
-	 * the upper part UHi being <= 12 bits long. Then we have
-	 * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
-	 */
-	vaddps	%ymm10, %ymm10, %ymm14
+	/* No need to split sU when FMA is available */
+	vfnmadd213ps %ymm5, %ymm6, %ymm1
+	vmovaps	%ymm0, %ymm8
+	vfmadd213ps %ymm0, %ymm0, %ymm0
+	vfnmadd231ps %ymm6, %ymm7, %ymm1
 
 	/*
 	 * Check whether |X| < 1, in which case we use the main function.
 	 * Otherwise set the rangemask so that the callout will get used.
 	 * Note that this will also use the callout for NaNs since not(NaN < 1).
 	 */
-	vcmpnlt_uqps %ymm5, %ymm10, %ymm7
-	vsubps	%ymm1, %ymm5, %ymm9
-	vcmplt_oqps TinyRange+__svml_satanh_data_internal(%rip), %ymm10, %ymm4
-	vrcpps	%ymm1, %ymm11
-	vsubps	%ymm10, %ymm9, %ymm12
-	vandps	%ymm13, %ymm11, %ymm0
+	vcmpnlt_uqps %ymm5, %ymm3, %ymm14
+	vcmplt_oqps ATANHF_DATA(TinyRange)(%rip), %ymm3, %ymm15
 
-	/* No need to split sU when FMA is available */
-	vfnmadd213ps %ymm5, %ymm0, %ymm1
-	vmovaps	%ymm6, %ymm8
-	vfmadd213ps %ymm6, %ymm6, %ymm8
-	vfnmadd231ps %ymm0, %ymm12, %ymm1
+	/*
+	 * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
+	 * the upper part UHi being <= 12 bits long. Then we have
+	 * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
+	 */
+	vaddps	%ymm3, %ymm3, %ymm3
 
 	/*
 	 * Split V as well into upper 12 bits and lower part, so that we can get
 	 * a preliminary quotient estimate without rounding error.
 	 */
-	vandps	%ymm13, %ymm14, %ymm15
-	vmovmskps %ymm7, %edx
-	vsubps	%ymm15, %ymm14, %ymm7
+	vandps	%ymm4, %ymm3, %ymm4
+	vsubps	%ymm4, %ymm3, %ymm7
 
 	/* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
-	vmulps	%ymm15, %ymm0, %ymm10
+	vmulps	%ymm4, %ymm6, %ymm4
 
 	/* Compute D = E + E^2 */
 	vfmadd213ps %ymm1, %ymm1, %ymm1
 
-	/* Record the sign for eventual reincorporation. */
-	vandps	sSign+__svml_satanh_data_internal(%rip), %ymm6, %ymm3
+	/* Record the sign for eventual reincorporation.  */
+	vandnps	%ymm8, %ymm2, %ymm3
 
 	/* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
-	vorps	%ymm3, %ymm8, %ymm2
-	vmulps	%ymm7, %ymm0, %ymm8
+	vorps	%ymm3, %ymm0, %ymm13
+	vmulps	%ymm7, %ymm6, %ymm2
 
 	/*
 	 * Compute R * (VHi + VLo) * (1 + E + E^2)
 	 * = R *  (VHi + VLo) * (1 + D)
 	 * = QHi + (QHi * D + QLo + QLo * D)
 	 */
-	vmulps	%ymm1, %ymm10, %ymm9
-	vfmadd213ps %ymm8, %ymm8, %ymm1
-	vaddps	%ymm1, %ymm9, %ymm1
 
-	/* reduction: compute r, n */
-	vmovups	iBrkValue+__svml_satanh_data_internal(%rip), %ymm9
+	/*
+	 * If less precision is acceptable the `vmulps %ymm1, %ymm4, %ymm9;
+	 * vaddps %ymm1, %ymm9, %ymm1` can be replaced with
+	 * `vfmadd231ps %ymm1, %ymm4, %ymm4`.
+	 */
+	vmulps	%ymm1, %ymm4, %ymm6
+	vfmadd213ps %ymm2, %ymm2, %ymm1
+	vaddps	%ymm1, %ymm6, %ymm1
 
 	/*
 	 * Now finally accumulate the high and low parts of the
 	 * argument to log1p, H + L, with a final compensated summation.
 	 */
-	vaddps	%ymm1, %ymm10, %ymm12
-	vsubps	%ymm12, %ymm10, %ymm11
+	vaddps	%ymm1, %ymm4, %ymm2
+
+	/* reduction: compute r, n */
+	vmovups	ATANHF_DATA(iBrkValue)(%rip), %ymm9
 
 	/*
 	 * Now we feed into the log1p code, using H in place of _VARG1 and
 	 * later incorporating L into the reduced argument.
 	 * compute 1+x as high, low parts
 	 */
-	vmaxps	%ymm12, %ymm5, %ymm13
-	vminps	%ymm12, %ymm5, %ymm14
-	vaddps	%ymm11, %ymm1, %ymm0
-	vaddps	%ymm14, %ymm13, %ymm1
-	vpsubd	%ymm9, %ymm1, %ymm7
-	vsubps	%ymm1, %ymm13, %ymm15
-	vpsrad	$23, %ymm7, %ymm10
-	vpand	iOffExpoMask+__svml_satanh_data_internal(%rip), %ymm7, %ymm8
-	vaddps	%ymm15, %ymm14, %ymm13
-	vpslld	$23, %ymm10, %ymm11
-	vpaddd	%ymm9, %ymm8, %ymm15
-	vaddps	%ymm13, %ymm0, %ymm14
-	vcvtdq2ps %ymm10, %ymm0
-	vpsubd	%ymm11, %ymm5, %ymm12
+	vmaxps	%ymm2, %ymm5, %ymm0
+	vminps	%ymm2, %ymm5, %ymm6
+
+	/* This is needed for rounding (see `vaddps %ymm1, %ymm4, %ymm2`).  */
+	vsubps	%ymm2, %ymm4, %ymm2
+	vaddps	%ymm6, %ymm0, %ymm4
+	vpsubd	%ymm9, %ymm4, %ymm7
+	vsubps	%ymm4, %ymm0, %ymm4
+	vaddps	%ymm2, %ymm1, %ymm2
+	vmovaps	ATANHF_DATA(iOffExpoMask)(%rip), %ymm1
+
+	vandps	%ymm1, %ymm7, %ymm0
+	vaddps	%ymm4, %ymm6, %ymm4
+	vandnps	%ymm7, %ymm1, %ymm6
+	vmovups	ATANHF_DATA(sPoly+0)(%rip), %ymm1
+	vpaddd	%ymm9, %ymm0, %ymm0
+	vaddps	%ymm4, %ymm2, %ymm4
+	vpsubd	%ymm6, %ymm5, %ymm6
 
 	/* polynomial evaluation */
-	vsubps	%ymm5, %ymm15, %ymm5
-	vmulps	%ymm14, %ymm12, %ymm1
-	vaddps	%ymm5, %ymm1, %ymm5
-	vmovups	sPoly+224+__svml_satanh_data_internal(%rip), %ymm1
-	vfmadd213ps sPoly+192+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-	vfmadd213ps sPoly+160+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-	vfmadd213ps sPoly+128+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-	vfmadd213ps sPoly+96+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-	vfmadd213ps sPoly+64+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-	vfmadd213ps sPoly+32+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-	vfmadd213ps sPoly+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-	vmulps	%ymm1, %ymm5, %ymm7
-	vfmadd213ps %ymm5, %ymm5, %ymm7
+	vsubps	%ymm5, %ymm0, %ymm2
+	vfmadd231ps %ymm4, %ymm6, %ymm2
+	vfmadd213ps ATANHF_DATA(sPoly+32)(%rip), %ymm2, %ymm1
+	vfmadd213ps ATANHF_DATA(sPoly+64)(%rip), %ymm2, %ymm1
+	vfmadd213ps ATANHF_DATA(sPoly+96)(%rip), %ymm2, %ymm1
+	vfmadd213ps ATANHF_DATA(sPoly+128)(%rip), %ymm2, %ymm1
+	vfmadd213ps ATANHF_DATA(sPoly+160)(%rip), %ymm2, %ymm1
+	vfmadd213ps ATANHF_DATA(sPoly+192)(%rip), %ymm2, %ymm1
+	vfmadd213ps ATANHF_DATA(sPoly+224)(%rip), %ymm2, %ymm1
+
+	vmulps	%ymm1, %ymm2, %ymm1
+	vfmadd213ps %ymm2, %ymm2, %ymm1
 
 	/* final reconstruction */
-	vfmadd132ps sLn2+__svml_satanh_data_internal(%rip), %ymm7, %ymm0
+	vpsrad	$23, %ymm7, %ymm6
+	vcvtdq2ps %ymm6, %ymm2
+	vfmadd132ps ATANHF_DATA(sLn2)(%rip), %ymm1, %ymm2
 
 	/* Finally, halve the result and reincorporate the sign */
-	vxorps	sHalf+__svml_satanh_data_internal(%rip), %ymm3, %ymm3
-	vmulps	%ymm0, %ymm3, %ymm0
-	vblendvps %ymm4, %ymm2, %ymm0, %ymm0
+	vxorps	ATANHF_DATA(sHalf)(%rip), %ymm3, %ymm3
+	vmulps	%ymm2, %ymm3, %ymm2
+	vmovmskps %ymm14, %edx
 	testl	%edx, %edx
 
+	vblendvps %ymm15, %ymm13, %ymm2, %ymm0
 	/* Go to special inputs processing branch */
 	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx r12 r13 r14 r15 edx ymm0 ymm6
-
-	/* Restore registers
-	 * and exit the function
-	 */
-
-L(EXIT):
-	movq	%rbp, %rsp
-	popq	%rbp
-	cfi_def_cfa(7, 8)
-	cfi_restore(6)
+	# LOE rbx rdx r12 r13 r14 r15 ymm0
+	/* No registers to restore on fast path.  */
 	ret
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
 
-	/* Branch to process
-	 * special inputs
-	 */
 
+	/* Cold case. edx has 1s where there was a special value that
+	   needs to be handled by an atanhf call. Optimize for code size
+	   more so than speed here. */
 L(SPECIAL_VALUES_BRANCH):
-	vmovups	%ymm6, 32(%rsp)
-	vmovups	%ymm0, 64(%rsp)
-	# LOE rbx r12 r13 r14 r15 edx ymm0
-
-	xorl	%eax, %eax
-	# LOE rbx r12 r13 r14 r15 eax edx
-
-	vzeroupper
-	movq	%r12, 16(%rsp)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
-	movl	%eax, %r12d
-	movq	%r13, 8(%rsp)
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
-	movl	%edx, %r13d
-	movq	%r14, (%rsp)
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r15 r12d r13d
-
-	/* Range mask
-	 * bits check
+	# LOE rbx rdx r12 r13 r14 r15 ymm0 ymm8
+    /* Use r13 to save/restore the stack. This allows us to use rbp as
+       callee save register saving code size. */
+	pushq	%r13
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(r13, -16)
+	/* Need to callee save registers to preserve state across atanhf calls.
 	 */
+	pushq	%rbx
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(rbx, -24)
+	pushq	%rbp
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(rbp, -32)
+	movq	%rsp, %r13
+	cfi_def_cfa_register(r13)
 
-L(RANGEMASK_CHECK):
-	btl	%r12d, %r13d
+	/* Align stack and make room for 2x ymm vectors.  */
+	andq	$-32, %rsp
+	addq	$-64, %rsp
 
-	/* Call scalar math function */
-	jc	L(SCALAR_MATH_CALL)
-	# LOE rbx r15 r12d r13d
+	/* Save all already computed inputs.  */
+	vmovups	%ymm0, (%rsp)
+	/* Save original input (ymm8 unchanged up to this point).  */
+	vmovups	%ymm8, 32(%rsp)
 
-	/* Special inputs
-	 * processing loop
-	 */
+	vzeroupper
 
+	/* edx has 1s where there was a special value that needs to be handled
+	   by an atanhf call.  */
+	movl	%edx, %ebx
 L(SPECIAL_VALUES_LOOP):
-	incl	%r12d
-	cmpl	$8, %r12d
-
-	/* Check bits in range mask */
-	jl	L(RANGEMASK_CHECK)
-	# LOE rbx r15 r12d r13d
-
-	movq	16(%rsp), %r12
-	cfi_restore(12)
-	movq	8(%rsp), %r13
-	cfi_restore(13)
-	movq	(%rsp), %r14
-	cfi_restore(14)
-	vmovups	64(%rsp), %ymm0
-
-	/* Go to exit */
-	jmp	L(EXIT)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r12 r13 r14 r15 ymm0
-
-	/* Scalar math fucntion call
-	 * to process special input
-	 */
-
-L(SCALAR_MATH_CALL):
-	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	# LOE rbx rbp r12 r13 r14 r15
+	/* use rbp as index for special value that is saved across calls to
+	   atanhf. We technically don't need a callee save register here as offset
+	   to rsp is always [0, 28] so we can restore rsp by realigning to 32.
+	   Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
+	   in the loop. Realigning also costs more code size.  */
+	xorl	%ebp, %ebp
+	tzcntl	%ebx, %ebp
+
+	/* Scalar math function call to process special input.  */
+	movss	32(%rsp, %rbp, 4), %xmm0
 	call	atanhf@PLT
-	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	/* No good way to avoid the store-forwarding fault this will cause on
+	   return. `lfence` avoids the SF fault but at greater cost as it
+	   serializes stack/callee save restoration.  */
+	movss	%xmm0, (%rsp, %rbp, 4)
+
+	blsrl   %ebx, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+	# LOE r12 r13 r14 r15
+
 
-	/* Process special inputs in loop */
-	jmp	L(SPECIAL_VALUES_LOOP)
-	# LOE rbx r15 r12d r13d
+	/* All results have been written to (%rsp).  */
+	vmovups	(%rsp), %ymm0
+	/* Restore rsp (r13 holds the value rsp had before alignment).  */
+	movq	%r13, %rsp
+	cfi_def_cfa_register(rsp)
+	/* Restore callee save registers in reverse order of the pushes.  */
+	popq	%rbp
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(rbp)
+	popq	%rbx
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(rbx)
+	popq	%r13
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(r13)
+	ret
 END(_ZGVdN8v_atanhf_avx2)
 
 	.section .rodata, "a"
 	.align	32
-
 #ifdef __svml_satanh_data_internal_typedef
 typedef unsigned int VUINT32;
-typedef struct {
+typedef struct{
 	__declspec(align(32)) VUINT32 SgnMask[8][1];
 	__declspec(align(32)) VUINT32 sOne[8][1];
-	__declspec(align(32)) VUINT32 sPoly[8][8][1];
-	__declspec(align(32)) VUINT32 iBrkValue[8][1];
-	__declspec(align(32)) VUINT32 iOffExpoMask[8][1];
-	__declspec(align(32)) VUINT32 sHalf[8][1];
-	__declspec(align(32)) VUINT32 sSign[8][1];
 	__declspec(align(32)) VUINT32 sTopMask12[8][1];
 	__declspec(align(32)) VUINT32 TinyRange[8][1];
+	__declspec(align(32)) VUINT32 iBrkValue[8][1];
+	__declspec(align(32)) VUINT32 iOffExpoMask[8][1];
+	__declspec(align(32)) VUINT32 sPoly[8][8][1];
 	__declspec(align(32)) VUINT32 sLn2[8][1];
+	__declspec(align(32)) VUINT32 sHalf[8][1];
 } __svml_satanh_data_internal;
 #endif
 __svml_satanh_data_internal:
 	/* SgnMask */
-	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
 	/* sOne = SP 1.0 */
 	.align	32
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	/* sPoly[] = SP polynomial */
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	/* sTopMask12 */
+	.align	32
+	.long	0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
+	.long	0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
+	/* TinyRange */
 	.align	32
-	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
-	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
-	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
-	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
-	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
-	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
-	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
-	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
+	.long	0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
+	.long	0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
 	/* iBrkValue = SP 2/3 */
 	.align	32
-	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
+	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
+	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
 	/* iOffExpoMask = SP significand mask */
 	.align	32
-	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
-	/* sHalf */
-	.align	32
-	.long	0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
-	/* sSign */
-	.align	32
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
-	/* sTopMask12 */
-	.align	32
-	.long	0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
-	/* TinyRange */
+	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
+	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
+	/* sPoly[] = SP polynomial */
 	.align	32
-	.long	0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
+	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed
+	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
+	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3
+	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
+	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12
+	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
+	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37
+	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
+	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190
+	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
+	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e
+	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
+	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94
+	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
 	/* sLn2 = SP ln(2) */
 	.align	32
-	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
+	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
+	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
+	/* sHalf */
+	.align	32
+	.long	0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
+	.long	0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
 	.align	32
 	.type	__svml_satanh_data_internal, @object
 	.size	__svml_satanh_data_internal, .-__svml_satanh_data_internal
-- 
2.34.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH v4 6/7] x86: Optimize svml_s_tanhf8_core_avx2.S
  2022-06-09 18:16 ` [PATCH v4 " Noah Goldstein
  2022-06-09 18:16   ` [PATCH v4 2/7] x86: Improve svml_s_atanhf8_core_avx2.S Noah Goldstein
@ 2022-06-09 18:16   ` Noah Goldstein
  2022-06-09 19:33   ` [PATCH v4 1/7] x86: Improve svml_s_atanhf16_core_avx512.S H.J. Lu
  2 siblings, 0 replies; 48+ messages in thread
From: Noah Goldstein @ 2022-06-09 18:16 UTC (permalink / raw)
  To: libc-alpha

Optimizations are:
    1. Reduce code size (-81 bytes).
    2. Remove redundant move instructions.
    3. Slightly improve instruction selection/scheduling where
       possible.
    4. Prefer registers which get short instruction encoding.
    5. Reduce rodata size (-32 bytes).

Result is roughly a 17-18% speedup:

       Function, New Time, Old Time, New / Old
_ZGVdN8v_tanhf,     1.977,    2.402,     0.823
---
 .../fpu/multiarch/svml_s_tanhf8_core_avx2.S   | 912 ++++--------------
 1 file changed, 171 insertions(+), 741 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
index c5c87bf5b0..55df346a00 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
@@ -70,773 +70,203 @@
  *
  */
 
-/* Offsets for data table __svml_stanh_data_internal
- */
-#define _dbP				0
-#define _sSignMask			4288
-#define _sAbsMask			4320
-#define _iExpMantMask			4352
-#define _iExpMask			4384
-#define _iMinIdxOfsMask			4416
-#define _iMaxIdxMask			4448
-
 #include <sysdep.h>
 
+/* tanhf data tables for avx2 and sse4 implementations defined here.
+ */
+#include "svml_s_tanhf_rodata.S"
+
 	.section .text.avx2, "ax", @progbits
 ENTRY(_ZGVdN8v_tanhf_avx2)
-	pushq	%rbp
-	cfi_def_cfa_offset(16)
-	movq	%rsp, %rbp
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
-	andq	$-32, %rsp
-	pushq	%r12
-	subq	$120, %rsp
-	lea	_dbP+16+__svml_stanh_data_internal(%rip), %r10
-	vmovaps	%ymm0, %ymm12
-
 	/* Here huge arguments, INF and NaNs are filtered out to callout. */
-	vpand	_iExpMantMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm14
+	vpand	TANHF_DATA(_iExpMantMask)(%rip), %ymm0, %ymm4
+	vpsubd	TANHF_DATA(_iMinIdxOfsMask)(%rip), %ymm4, %ymm2
+
+	/* Selection of arguments between [0, 0x04280000] into ymm2.  */
+	vpxor	%ymm3, %ymm3, %ymm3
+	vpmaxsd	%ymm3, %ymm2, %ymm2
+	vpminsd	TANHF_DATA(_iMaxIdxMask)(%rip), %ymm2, %ymm2
 
 	/*
 	 *  small table specific variables *
 	 *  Constant loading
 	 */
-	vmovups	_iMaxIdxMask+__svml_stanh_data_internal(%rip), %ymm8
-	vpsubd	_iMinIdxOfsMask+__svml_stanh_data_internal(%rip), %ymm14, %ymm9
-
-	/* if VMIN, VMAX is defined for I type */
-	vxorps	%ymm15, %ymm15, %ymm15
-	vpcmpgtd %ymm15, %ymm9, %ymm0
-	vpand	%ymm0, %ymm9, %ymm7
-	vpcmpgtd %ymm8, %ymm9, %ymm6
-	vblendvps %ymm6, %ymm8, %ymm7, %ymm3
-	vpsrld	$14, %ymm3, %ymm1
-	vpcmpgtd _iExpMask+__svml_stanh_data_internal(%rip), %ymm14, %ymm13
-	vmovmskps %ymm13, %r11d
-	vandps	_sAbsMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm10
-	vandps	_sSignMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm11
-	vextractf128 $1, %ymm1, %xmm2
-	vmovd	%xmm1, %r9d
-	vmovd	%xmm2, %ecx
-	vpextrd	$1, %xmm2, %edx
-	vpextrd	$1, %xmm1, %r8d
-	movslq	%r9d, %r9
-	movslq	%edx, %rdx
-	movslq	%r8d, %r8
-	vpextrd	$2, %xmm1, %edi
-	movslq	%ecx, %rcx
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
-	vpextrd	$3, %xmm2, %r12d
-	vpextrd	$3, %xmm1, %esi
-	vpextrd	$2, %xmm2, %eax
-	movslq	%edi, %rdi
-	movslq	%r12d, %r12
-	movslq	%esi, %rsi
-	movslq	%eax, %rax
-	vmovupd	-16(%r9, %r10), %xmm5
-	vmovupd	-16(%rdx, %r10), %xmm14
-	vmovupd	-16(%rcx, %r10), %xmm13
-	vmovupd	(%r9, %r10), %xmm1
-	vmovupd	(%r8, %r10), %xmm2
-	vmovupd	-16(%r8, %r10), %xmm4
-	vinsertf128 $1, -16(%rdi, %r10), %ymm5, %ymm15
-	vinsertf128 $1, -16(%r12, %r10), %ymm14, %ymm3
-	vinsertf128 $1, -16(%rax, %r10), %ymm13, %ymm6
-	vinsertf128 $1, (%rdi, %r10), %ymm1, %ymm5
-	vinsertf128 $1, (%rsi, %r10), %ymm2, %ymm14
-	vunpcklpd %ymm3, %ymm6, %ymm8
+	vpsrld	$14, %ymm2, %ymm1
+
+	/* We are splitting ymm1 into 8 GPRs. This may be faster to do with
+	   store/load as we can take advantage of store-forwarding.  */
+	vmovq	%xmm1, %r8
+	/* We have eliminated all negative values for ymm1 so no need to sign
+	   extend.  */
+	movl	%r8d, %r9d
+	shrq	$32, %r8
+
+	/* Store base of lookup table in rax.  */
+	leaq	TANHF_DATA(_lookupTable)(%rip), %rax
+
+	/* Instead of using cross-lane permutes on ymm vectors, use vinsertf128
+	   with memory operand. This helps alleviate bottleneck on p5.  */
+	vmovupd	16(%r9, %rax), %xmm5
+
+	vpextrq	$1, %xmm1, %rsi
+	movl	%esi, %edi
+	shrq	$32, %rsi
+
+	vinsertf128 $1, 16(%rdi, %rax), %ymm5, %ymm5
+
+	vextracti128 $1, %ymm1, %xmm2
+	vmovq	%xmm2, %rdx
+	movl	%edx, %ecx
+	shrq	$32, %rdx
+
+	vmovupd	(%rcx, %rax), %xmm6
+
+	vpextrq	$1, %xmm2, %r10
+	movl	%r10d, %r11d
+	shrq	$32, %r10
+
+	vinsertf128 $1, (%r11, %rax), %ymm6, %ymm6
+
+	vmovupd	16(%r8, %rax), %xmm1
+	vinsertf128 $1, 16(%rsi, %rax), %ymm1, %ymm1
+	vmovupd	(%rdx, %rax), %xmm3
+	vinsertf128 $1, (%r10, %rax), %ymm3, %ymm3
+
+	vunpcklpd %ymm3, %ymm6, %ymm7
 	vunpckhpd %ymm3, %ymm6, %ymm6
-	vunpcklpd %ymm14, %ymm5, %ymm3
-	vunpckhpd %ymm14, %ymm5, %ymm2
-	vmovupd	(%rcx, %r10), %xmm13
-	vcvtps2pd %xmm10, %ymm5
-	vextractf128 $1, %ymm10, %xmm10
-	vfmadd213pd %ymm3, %ymm5, %ymm2
-	vinsertf128 $1, -16(%rsi, %r10), %ymm4, %ymm0
-	vmovupd	(%rdx, %r10), %xmm4
-	vunpcklpd %ymm0, %ymm15, %ymm9
-	vunpckhpd %ymm0, %ymm15, %ymm7
-	vfmadd213pd %ymm7, %ymm5, %ymm2
-	vfmadd213pd %ymm9, %ymm5, %ymm2
-	vinsertf128 $1, (%r12, %r10), %ymm4, %ymm0
-	vcvtps2pd %xmm10, %ymm4
-	vinsertf128 $1, (%rax, %r10), %ymm13, %ymm15
-	vunpcklpd %ymm0, %ymm15, %ymm1
-	vunpckhpd %ymm0, %ymm15, %ymm0
-	vfmadd213pd %ymm1, %ymm4, %ymm0
-	vcvtpd2ps %ymm2, %xmm1
-	vfmadd213pd %ymm6, %ymm4, %ymm0
-	vfmadd213pd %ymm8, %ymm4, %ymm0
-	vcvtpd2ps %ymm0, %xmm0
-	vinsertf128 $1, %xmm0, %ymm1, %ymm2
-	vorps	%ymm11, %ymm2, %ymm0
-	testl	%r11d, %r11d
 
-	/* Go to special inputs processing branch */
-	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx r13 r14 r15 r11d ymm0 ymm12
+	vunpcklpd %ymm1, %ymm5, %ymm3
+	vunpckhpd %ymm1, %ymm5, %ymm1
 
-	/* Restore registers
-	 * and exit the function
-	 */
+	vmovaps	TANHF_DATA(_sAbsMask)(%rip), %ymm11
+	/* Store special cases in ymm15.  */
+	vpcmpgtd TANHF_DATA(_iExpMask)(%rip), %ymm4, %ymm15
 
-L(EXIT):
-	addq	$120, %rsp
-	cfi_restore(12)
-	popq	%r12
-	movq	%rbp, %rsp
-	popq	%rbp
-	cfi_def_cfa(7, 8)
-	cfi_restore(6)
-	ret
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
+	vandps	%ymm11, %ymm0, %ymm4
 
-	/* Branch to process
-	 * special inputs
-	 */
+	vcvtps2pd %xmm4, %ymm5
 
-L(SPECIAL_VALUES_BRANCH):
-	vmovups	%ymm12, 32(%rsp)
-	vmovups	%ymm0, 64(%rsp)
-	# LOE rbx r13 r14 r15 r11d ymm0
+	vextractf128 $1, %ymm4, %xmm4
+	vcvtps2pd %xmm4, %ymm4
 
-	xorl	%r12d, %r12d
-	# LOE rbx r13 r14 r15 r11d r12d
+	vmovupd	16(%rcx, %rax), %xmm2
+	vinsertf128 $1, 16(%r11, %rax), %ymm2, %ymm2
 
-	vzeroupper
-	movq	%r13, 8(%rsp)
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
-	movl	%r11d, %r13d
-	movq	%r14, (%rsp)
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r15 r12d r13d
-
-	/* Range mask
-	 * bits check
-	 */
+	vfmadd213pd %ymm3, %ymm5, %ymm1
+
+	vmovupd	16(%rdx, %rax), %xmm3
+	vinsertf128 $1, 16(%r10, %rax), %ymm3, %ymm3
+
+	vunpcklpd %ymm3, %ymm2, %ymm10
+	vunpckhpd %ymm3, %ymm2, %ymm2
+
+	vfmadd213pd %ymm10, %ymm4, %ymm2
+	vfmadd213pd %ymm6, %ymm4, %ymm2
+	vfmadd213pd %ymm7, %ymm4, %ymm2
+	vcvtpd2ps %ymm2, %xmm2
+
+	vmovupd	(%r9, %rax), %xmm7
+	vinsertf128 $1, (%rdi, %rax), %ymm7, %ymm7
+
+	vmovupd	(%r8, %rax), %xmm3
+	vinsertf128 $1, (%rsi, %rax), %ymm3, %ymm3
+
+	vunpckhpd %ymm3, %ymm7, %ymm4
+	vunpcklpd %ymm3, %ymm7, %ymm7
 
-L(RANGEMASK_CHECK):
-	btl	%r12d, %r13d
+	vfmadd213pd %ymm4, %ymm5, %ymm1
+	vfmadd213pd %ymm7, %ymm5, %ymm1
+
+
+	vcvtpd2ps %ymm1, %xmm1
+	vinsertf128 $1, %xmm2, %ymm1, %ymm1
+
+	vmovmskps %ymm15, %edx
+	vandnps	%ymm0, %ymm11, %ymm2
+	testl	%edx, %edx
+	/* Go to special inputs processing branch */
+	jne	L(SPECIAL_VALUES_BRANCH)
+	# LOE rbx r12 r13 r14 r15 ymm0 ymm1 ymm2
+	/* Wait until after branch to write over ymm0.  */
+	vorps	%ymm2, %ymm1, %ymm0
+	/* No stack restoration on the fastpath.  */
+	ret
 
-	/* Call scalar math function */
-	jc	L(SCALAR_MATH_CALL)
-	# LOE rbx r15 r12d r13d
 
-	/* Special inputs
-	 * processing loop
+	/* Cold case. edx has 1s where there was a special value that
+	   needs to be handled by a tanhf call. Optimize for code size
+	   more so than speed here. */
+L(SPECIAL_VALUES_BRANCH):
+	# LOE rbx rdx r12 r13 r14 r15 ymm0 ymm1 ymm2
+    /* Use r13 to save/restore the stack. This allows us to use rbp as
+       a callee-saved register, saving code size. */
+	pushq	%r13
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(r13, -16)
+	/* Need to callee save registers to preserve state across tanhf calls.
 	 */
+	pushq	%rbx
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(rbx, -24)
+	pushq	%rbp
+	cfi_adjust_cfa_offset(8)
+	cfi_offset(rbp, -32)
+	movq	%rsp, %r13
+	cfi_def_cfa_register(r13)
+
+	/* Align stack and make room for 2x ymm vectors.  */
+	andq	$-32, %rsp
+	addq	$-64, %rsp
+
+	/* Save all already computed results.  */
+	vorps	%ymm2, %ymm1, %ymm1
+	vmovaps	%ymm1, (%rsp)
+	/* Save original input (ymm0 unchanged up to this point).  */
+	vmovaps	%ymm0, 32(%rsp)
+
+	vzeroupper
 
+	/* edx has 1s where there was a special value that needs to be handled
+	   by a tanhf call.  */
+	movl	%edx, %ebx
 L(SPECIAL_VALUES_LOOP):
-	incl	%r12d
-	cmpl	$8, %r12d
-
-	/* Check bits in range mask */
-	jl	L(RANGEMASK_CHECK)
-	# LOE rbx r15 r12d r13d
-
-	movq	8(%rsp), %r13
-	cfi_restore(13)
-	movq	(%rsp), %r14
-	cfi_restore(14)
-	vmovups	64(%rsp), %ymm0
-
-	/* Go to exit */
-	jmp	L(EXIT)
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r13 r14 r15 ymm0
-
-	/* Scalar math fucntion call
-	 * to process special input
-	 */
+	# LOE rbx rbp r12 r13 r14 r15
+	/* use rbp as index for special value that is saved across calls to
+	   tanhf. We technically don't need a callee save register here as offset
+	   to rsp is always [0, 28] so we can restore rsp by realigning to 64.
+	   Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
+	   in the loop. Realigning also costs more code size.  */
+	xorl	%ebp, %ebp
+	tzcntl	%ebx, %ebp
 
-L(SCALAR_MATH_CALL):
-	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	/* Scalar math function call to process special input.  */
+	movss	32(%rsp, %rbp, 4), %xmm0
 	call	tanhf@PLT
-	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	/* No good way to avoid the store-forwarding fault this will cause on
+	   return. `lfence` avoids the SF fault but at greater cost as it
+	   serializes stack/callee save restoration.  */
+	movss	%xmm0, (%rsp, %rbp, 4)
+
+	blsrl   %ebx, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+	# LOE r12 r13 r14 r15
 
-	/* Process special inputs in loop */
-	jmp	L(SPECIAL_VALUES_LOOP)
-	# LOE rbx r15 r12d r13d
-END(_ZGVdN8v_tanhf_avx2)
 
-	.section .rodata, "a"
-	.align	32
-
-#ifdef __svml_stanh_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(32)) VUINT32 _dbP[(134*4)][2];
-	__declspec(align(32)) VUINT32 _sSignMask[8][1];
-	__declspec(align(32)) VUINT32 _sAbsMask[8][1];
-	__declspec(align(32)) VUINT32 _iExpMantMask[8][1];
-	__declspec(align(32)) VUINT32 _iExpMask[8][1];
-	__declspec(align(32)) VUINT32 _iMinIdxOfsMask[8][1];
-	__declspec(align(32)) VUINT32 _iMaxIdxMask[8][1];
-} __svml_stanh_data_internal;
-#endif
-__svml_stanh_data_internal:
-	/* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500]. */
-	.quad	0x0000000000000000 /* A00 = +0.000000000000000000000e-01 */
-	.quad	0x3FF00000022C70EB /* A01 = +1.000000008097283510367e+00 */
-	.quad	0xBED00E878CFFA194 /* A02 = -3.828228912518614443549e-06 */
-	.quad	0xBFD551766D0607A9 /* A03 = -3.330970825846813476723e-01 */
-	.quad	0xBE53D60CE3E4C297 /* A00 = -1.847383956330407336230e-08 */
-	.quad	0x3FF000024177CF5C /* A01 = +1.000002151235967140508e+00 */
-	.quad	0xBF1758BC94A51A25 /* A02 = -8.906031613262943753568e-05 */
-	.quad	0xBFD53EAE67E0D4F0 /* A03 = -3.319507612644221339337e-01 */
-	.quad	0xBE5A9E47EF32D6FE /* A00 = -2.479020984039698285657e-08 */
-	.quad	0x3FF00002DA983057 /* A01 = +1.000002721676556793895e+00 */
-	.quad	0xBF1BD953509E94AA /* A02 = -1.062352277175377670507e-04 */
-	.quad	0xBFD53BDB562EEDD5 /* A03 = -3.317783681520414806876e-01 */
-	.quad	0xBE6191BBE496D294 /* A00 = -3.272532162914017685901e-08 */
-	.quad	0x3FF0000390492017 /* A01 = +1.000003398528866105366e+00 */
-	.quad	0xBF20727E814A57CE /* A02 = -1.254825043772153972919e-04 */
-	.quad	0xBFD538DE060A6F22 /* A03 = -3.315959033004550748913e-01 */
-	.quad	0xBE66DAFA2A893A25 /* A00 = -4.257146219278012568149e-08 */
-	.quad	0x3FF0000465E08CD1 /* A01 = +1.000004194219219266770e+00 */
-	.quad	0xBF2341C765EF91B6 /* A02 = -1.469188600530365522261e-04 */
-	.quad	0xBFD535B6841FAF9E /* A03 = -3.314033785124993469751e-01 */
-	.quad	0xBE6D5794E361E964 /* A00 = -5.465394929765249413434e-08 */
-	.quad	0x3FF000055EE2A0CB /* A01 = +1.000005121846742950353e+00 */
-	.quad	0xBF265E6C77E66C8B /* A02 = -1.706607253709506650304e-04 */
-	.quad	0xBFD53264DDCCEDA6 /* A03 = -3.312008062382240103361e-01 */
-	.quad	0xBE729C844D374A6E /* A00 = -6.933284462462096107184e-08 */
-	.quad	0x3FF000067F019093 /* A01 = +1.000006195180536350264e+00 */
-	.quad	0xBF29CC5348D6DCE5 /* A02 = -1.968242326435338705130e-04 */
-	.quad	0xBFD52EE92121ED35 /* A03 = -3.309881995734998416658e-01 */
-	.quad	0xBE775AEA17EAA872 /* A00 = -8.700465590574974405858e-08 */
-	.quad	0x3FF00007CA1D66B8 /* A01 = +1.000007428656699559610e+00 */
-	.quad	0xBF2D8F5EB98A2637 /* A02 = -2.255252009216044881395e-04 */
-	.quad	0xBFD52B435CDF9128 /* A03 = -3.307655722585587376727e-01 */
-	.quad	0xBE7D04DA28C343F0 /* A00 = -1.081040272327705484794e-07 */
-	.quad	0x3FF000094443CCF5 /* A01 = +1.000008837375216730337e+00 */
-	.quad	0xBF30D5B76C947AE5 /* A02 = -2.568791210978817814332e-04 */
-	.quad	0xBFD52773A0776FAD /* A03 = -3.305329386764651045105e-01 */
-	.quad	0xBE81DD77A12C51C7 /* A00 = -1.331054169875768625701e-07 */
-	.quad	0x3FF0000AF1AFD2DA /* A01 = +1.000010437096696680470e+00 */
-	.quad	0xBF331230624C1680 /* A02 = -2.910011410651516805537e-04 */
-	.quad	0xBFD52379FC0B61DF /* A03 = -3.302903138515186909352e-01 */
-	.quad	0xBE85D04EEEB3C435 /* A00 = -1.625247628488202841012e-07 */
-	.quad	0x3FF0000CD6C9B1F2 /* A01 = +1.000012244238970726684e+00 */
-	.quad	0xBF357F0742FADDD4 /* A02 = -3.280060509313874068243e-04 */
-	.quad	0xBFD51F56806D0E81 /* A03 = -3.300377134475880880338e-01 */
-	.quad	0xBE8A6E289B59681B /* A00 = -1.969211333326924655065e-07 */
-	.quad	0x3FF0000EF8268F72 /* A01 = +1.000014275873550406715e+00 */
-	.quad	0xBF381E277A1B747A /* A02 = -3.680082682942575423093e-04 */
-	.quad	0xBFD51B093F1D6FD4 /* A03 = -3.297751537663746734808e-01 */
-	.quad	0xBE8FCBC40EE9ABD5 /* A00 = -2.368983653301529373887e-07 */
-	.quad	0x3FF000115A883B6C /* A01 = +1.000016549721943981410e+00 */
-	.quad	0xBF3AF17AC974B3D9 /* A02 = -4.111218235774406434303e-04 */
-	.quad	0xBFD516924A4C549C /* A03 = -3.295026517456081105450e-01 */
-	.quad	0xBE92FFBC60A3F956 /* A00 = -2.831066871072026054144e-07 */
-	.quad	0x3FF0001402DCED8A /* A01 = +1.000019084151832604590e+00 */
-	.quad	0xBF3DFAE9390C4801 /* A02 = -4.574603454311488280083e-04 */
-	.quad	0xBFD511F1B4D7DC3A /* A03 = -3.292202249571719585575e-01 */
-	.quad	0xBE9690A22F96D5AD /* A00 = -3.362443262393081632612e-07 */
-	.quad	0x3FF00016F63EFF5D /* A01 = +1.000021898173108825247e+00 */
-	.quad	0xBF409E2C839605BB /* A02 = -5.071370461992499986334e-04 */
-	.quad	0xBFD50D27924BEE00 /* A03 = -3.289278916051614487515e-01 */
-	.quad	0xBE9AA56C65E72A73 /* A00 = -3.970591019557469835586e-07 */
-	.quad	0x3FF0001A39F4A43E /* A01 = +1.000025011433776978009e+00 */
-	.quad	0xBF425BD74C3D6667 /* A02 = -5.602647074553602319844e-04 */
-	.quad	0xBFD50833F6E1ABA2 /* A03 = -3.286256705238718156536e-01 */
-	.quad	0xBE9F4BD4FF1A83B0 /* A00 = -4.663500013744687071912e-07 */
-	.quad	0x3FF0001DD36F9EC2 /* A01 = +1.000028444215715683896e+00 */
-	.quad	0xBF44376634149405 /* A02 = -6.169556656102642569831e-04 */
-	.quad	0xBFD50316F77EDEE5 /* A03 = -3.283135811757190158922e-01 */
-	.quad	0xBEA3B625387BB079 /* A00 = -5.874486399249461304297e-07 */
-	.quad	0x3FF00023E14CFBA9 /* A01 = +1.000034217911642153709e+00 */
-	.quad	0xBF47392F923218D2 /* A02 = -7.087213783883111826306e-04 */
-	.quad	0xBFD4FB1FACDEB938 /* A03 = -3.278273761924483942209e-01 */
-	.quad	0xBEAA6E24F543500A /* A00 = -7.876828740601738750574e-07 */
-	.quad	0x3FF0002D5C6E8412 /* A01 = +1.000043259679163742959e+00 */
-	.quad	0xBF4BAF02BD7FDD70 /* A02 = -8.448375110664940040861e-04 */
-	.quad	0xBFD4EFEE6527A7DE /* A03 = -3.271442401734229177279e-01 */
-	.quad	0xBEB16E3EBE2157D0 /* A00 = -1.038947396133402500647e-06 */
-	.quad	0x3FF00038990FEE2F /* A01 = +1.000053975962952312884e+00 */
-	.quad	0xBF50569481C574CB /* A02 = -9.972048056490652716971e-04 */
-	.quad	0xBFD4E419278DA2B4 /* A03 = -3.264220129263251113372e-01 */
-	.quad	0xBEB6A7B6723165D4 /* A00 = -1.350350836279403750524e-06 */
-	.quad	0x3FF00045CAB4158E /* A01 = +1.000066558657042303793e+00 */
-	.quad	0xBF531D7C9C849108 /* A02 = -1.166698160951775212202e-03 */
-	.quad	0xBFD4D7A0BB33B152 /* A03 = -3.256608799117844954552e-01 */
-	.quad	0xBEBD0EE2A8654AFD /* A00 = -1.732000471561702711532e-06 */
-	.quad	0x3FF00055276F18D6 /* A01 = +1.000081209219890521211e+00 */
-	.quad	0xBF562FDBA3FB6C6C /* A02 = -1.354183666925102939860e-03 */
-	.quad	0xBFD4CA85F1B93DB2 /* A03 = -3.248610363561638125773e-01 */
-	.quad	0xBEC269D4036A207E /* A00 = -2.195047297096822741730e-06 */
-	.quad	0x3FF00066E7DA6E4E /* A01 = +1.000098138500919997540e+00 */
-	.quad	0xBF5991499FC36B3A /* A02 = -1.560518167983372759405e-03 */
-	.quad	0xBFD4BCC9A72283D6 /* A03 = -3.240226871658341556426e-01 */
-	.quad	0xBEC7154B6C09CFE1 /* A00 = -2.751729738565190291276e-06 */
-	.quad	0x3FF0007B47086B80 /* A01 = +1.000117566559055148900e+00 */
-	.quad	0xBF5D455433B4F8F4 /* A02 = -1.786548832412968197680e-03 */
-	.quad	0xBFD4AE6CC1BFE145 /* A03 = -3.231460468373550942722e-01 */
-	.quad	0xBECCA68CC64A0F8A /* A00 = -3.415415948561670285790e-06 */
-	.quad	0x3FF00092827742F7 /* A01 = +1.000139722473418535387e+00 */
-	.quad	0xBF60A7BF15A527AF /* A02 = -2.033112728132522705610e-03 */
-	.quad	0xBFD49F703214084C /* A03 = -3.222313393636155876010e-01 */
-	.quad	0xBED19E68676B241B /* A00 = -4.200644630977303616698e-06 */
-	.quad	0x3FF000ACDA037B26 /* A01 = +1.000164844146362863597e+00 */
-	.quad	0xBF62D99F836A02F8 /* A02 = -2.301036405072284102280e-03 */
-	.quad	0xBFD48FD4F2B91B28 /* A03 = -3.212787981359945810311e-01 */
-	.quad	0xBED57CF4B0C7AA54 /* A00 = -5.123164339408145209103e-06 */
-	.quad	0x3FF000CA8FD9E1A1 /* A01 = +1.000193178099017865534e+00 */
-	.quad	0xBF653A014548E686 /* A02 = -2.591135484433962181405e-03 */
-	.quad	0xBFD47F9C0844B38F /* A03 = -3.202886658426046806447e-01 */
-	.quad	0xBEDA012B1B1A41E2 /* A00 = -6.199971197454598722328e-06 */
-	.quad	0x3FF000EBE868FDF4 /* A01 = +1.000224979259539459520e+00 */
-	.quad	0xBF67CA9427E0A544 /* A02 = -2.904214255086275467410e-03 */
-	.quad	0xBFD46EC6812ADB37 /* A03 = -3.192611943626845749655e-01 */
-	.quad	0xBEDF3EAC5BF12194 /* A00 = -7.449344990702664567927e-06 */
-	.quad	0x3FF001112A520784 /* A01 = +1.000260510744255704196e+00 */
-	.quad	0xBF6A8D01ABDA4DC4 /* A02 = -3.241065277345108255891e-03 */
-	.quad	0xBFD45D55759FFA4A /* A03 = -3.181966446572103146551e-01 */
-	.quad	0xBEE2A541BC274267 /* A00 = -8.890883582164319970972e-06 */
-	.quad	0x3FF0013A9E5961F2 /* A01 = +1.000300043631906721231e+00 */
-	.quad	0xBF6D82ECD080C540 /* A02 = -3.602468994380686462264e-03 */
-	.quad	0xBFD44B4A0779C0AD /* A03 = -3.170952866557950611259e-01 */
-	.quad	0xBEE61D97609A27F4 /* A00 = -1.054553560499505625520e-05 */
-	.quad	0x3FF001688F56A3AF /* A01 = +1.000343856731187974773e+00 */
-	.quad	0xBF7056F8EFB683EC /* A02 = -3.989193351487490407647e-03 */
-	.quad	0xBFD438A5620F0F74 /* A03 = -3.159573991399533543500e-01 */
-	.quad	0xBEEA145429EDD370 /* A00 = -1.243563138839952927732e-05 */
-	.quad	0x3FF0019B4A242A67 /* A01 = +1.000392236341804297339e+00 */
-	.quad	0xBF7207D31CA78D9B /* A02 = -4.401993423445739288258e-03 */
-	.quad	0xBFD42568BA16E7CD /* A03 = -3.147832696228050619602e-01 */
-	.quad	0xBEEE96370D52680F /* A00 = -1.458491207477835326165e-05 */
-	.quad	0x3FF001D31D8E4115 /* A01 = +1.000445476009251821736e+00 */
-	.quad	0xBF73D4CC11EDC094 /* A02 = -4.841611050196221316400e-03 */
-	.quad	0xBFD411954D8664E7 /* A03 = -3.135731942252974469021e-01 */
-	.quad	0xBEF338C046215EF8 /* A00 = -1.833122622260562810219e-05 */
-	.quad	0x3FF00230C32C2EC1 /* A01 = +1.000534784691737621998e+00 */
-	.quad	0xBF76BD019BCC5DAF /* A02 = -5.551344188254799492943e-03 */
-	.quad	0xBFD3F2C7156DC21E /* A03 = -3.116929730668135389848e-01 */
-	.quad	0xBEF9B15EAE411EAE /* A00 = -2.450261207822986676092e-05 */
-	.quad	0x3FF002C2DF057A4D /* A01 = +1.000674124886830940184e+00 */
-	.quad	0xBF7B08CCD9AC1E30 /* A02 = -6.600189396301511801646e-03 */
-	.quad	0xBFD3C7A7A114FED8 /* A03 = -3.090609620157755976777e-01 */
-	.quad	0xBF00E36483C373B3 /* A00 = -3.221178528332122595812e-05 */
-	.quad	0x3FF0036F419480D7 /* A01 = +1.000838524028997644777e+00 */
-	.quad	0xBF7FD255D1777007 /* A02 = -7.768950679260206403087e-03 */
-	.quad	0xBFD39A453911D6CE /* A03 = -3.062909180947429588215e-01 */
-	.quad	0xBF05DFA04DD12059 /* A00 = -4.172046622180685472624e-05 */
-	.quad	0x3FF00438B2A03D8D /* A01 = +1.001030633695197069599e+00 */
-	.quad	0xBF828F8DBB4A9D10 /* A02 = -9.062869337255224921890e-03 */
-	.quad	0xBFD36AAB704697D9 /* A03 = -3.033856007044711255993e-01 */
-	.quad	0xBF0BF3E0C647DEFB /* A00 = -5.331544597092331081714e-05 */
-	.quad	0x3FF005221063D36D /* A01 = +1.001253189109060359741e+00 */
-	.quad	0xBF857A2CB3C96102 /* A02 = -1.048693584122917590862e-02 */
-	.quad	0xBFD338E65BBB4FEC /* A03 = -3.003478904549854444639e-01 */
-	.quad	0xBF11A506ED7C9D31 /* A00 = -6.730894835681591541979e-05 */
-	.quad	0x3FF0062E4D0EA92A /* A01 = +1.001508999829250345925e+00 */
-	.quad	0xBF88AB82C2761AF3 /* A02 = -1.204588085125866091241e-02 */
-	.quad	0xBFD305028D6BD206 /* A03 = -2.971807843271395688234e-01 */
-	.quad	0xBF1607C0922D9BF1 /* A00 = -8.403885708006799337092e-05 */
-	.quad	0x3FF007606C341961 /* A01 = +1.001800940198869449560e+00 */
-	.quad	0xBF8C25E6DA487BCF /* A02 = -1.374416688582682892494e-02 */
-	.quad	0xBFD2CF0D0EE8F7B5 /* A03 = -2.938873906713255768075e-01 */
-	.quad	0xBF1B3A8480A0A16D /* A00 = -1.038688061788578038307e-04 */
-	.quad	0x3FF008BB802D02D6 /* A01 = +1.002131939589323561535e+00 */
-	.quad	0xBF8FEB8AE99FD100 /* A02 = -1.558598065819483124983e-02 */
-	.quad	0xBFD297135BD0911B /* A03 = -2.904709240558688843059e-01 */
-	.quad	0xBF20ABB9BDB75C65 /* A00 = -1.271881327357976163798e-04 */
-	.quad	0x3FF00A42A76D8CD1 /* A01 = +1.002504972472525901495e+00 */
-	.quad	0xBF91FF3D752BB9E6 /* A02 = -1.757522609380570560722e-02 */
-	.quad	0xBFD25D235C1F88B4 /* A03 = -2.869346999779154305799e-01 */
-	.quad	0xBF243D3254425461 /* A00 = -1.544116913733432829448e-04 */
-	.quad	0x3FF00BF909D1795E /* A01 = +1.002923048355647051011e+00 */
-	.quad	0xBF94304E04D44942 /* A02 = -1.971551804042204897316e-02 */
-	.quad	0xBFD2214B5E61CFA6 /* A03 = -2.832821294498394371075e-01 */
-	.quad	0xBF286070011B61CE /* A00 = -1.859795307186510085994e-04 */
-	.quad	0x3FF00DE1D5E1627E /* A01 = +1.003389201612804537689e+00 */
-	.quad	0xBF9689D5F4163F59 /* A02 = -2.201017668045266231780e-02 */
-	.quad	0xBFD1E39A11C3B42C /* A03 = -2.795167134743816728104e-01 */
-	.quad	0xBF2D250B366A79E8 /* A00 = -2.223564326486314902259e-04 */
-	.quad	0x3FF010003E134001 /* A01 = +1.003906481248123094829e+00 */
-	.quad	0xBF990C9FF91F6F81 /* A02 = -2.446222265267250853271e-02 */
-	.quad	0xBFD1A41E80084CDC /* A03 = -2.756420374218586655246e-01 */
-	.quad	0xBF314DB5DDC2A30E /* A00 = -2.640313157465248123865e-04 */
-	.quad	0x3FF012577608921B /* A01 = +1.004477940624503018441e+00 */
-	.quad	0xBF9BB9626875B0C9 /* A02 = -2.707437288829409385849e-02 */
-	.quad	0xBFD162E80768A9D0 /* A03 = -2.716617653228725615122e-01 */
-	.quad	0xBF346A6133808864 /* A00 = -3.115165050094957730625e-04 */
-	.quad	0x3FF014EAAFCC88A3 /* A01 = +1.005106627192198898157e+00 */
-	.quad	0xBF9E90BEF9BF7419 /* A02 = -2.984903716411588595059e-02 */
-	.quad	0xBFD12006545F7FAD /* A03 = -2.675796340899932457269e-01 */
-	.quad	0xBF37F180DC3848EA /* A00 = -3.653468704395550778821e-04 */
-	.quad	0x3FF017BD19147861 /* A01 = +1.005795572250939295955e+00 */
-	.quad	0xBFA0C9A14C702E07 /* A02 = -3.278831537326359207851e-02 */
-	.quad	0xBFD0DB895B650092 /* A03 = -2.633994476818851682154e-01 */
-	.quad	0xBF3BEC6AAC6D7635 /* A00 = -4.260788377246944457107e-04 */
-	.quad	0x3FF01AD1D884E719 /* A01 = +1.006547780778822565040e+00 */
-	.quad	0xBFA260B2A1B1434A /* A02 = -3.589399551186163439542e-02 */
-	.quad	0xBFD09581529E93D6 /* A03 = -2.591250712233067465817e-01 */
-	.quad	0xBF4164E26167882B /* A00 = -5.308251737086202562063e-04 */
-	.quad	0x3FF01FEF14B62B81 /* A01 = +1.007796364693348545316e+00 */
-	.quad	0xBFA4EB014538AA42 /* A02 = -4.085544557559163403315e-02 */
-	.quad	0xBFD029D36FEAF41F /* A03 = -2.525528519580024222613e-01 */
-	.quad	0xBF46F6FFF4E53DC8 /* A00 = -7.008313930700277652464e-04 */
-	.quad	0x3FF027CBB51CBBA0 /* A01 = +1.009715754956893363214e+00 */
-	.quad	0xBFA89DEC9FEC112E /* A02 = -4.807986690687680864098e-02 */
-	.quad	0xBFCF2A99464D0DB4 /* A03 = -2.434875100390009317053e-01 */
-	.quad	0xBF4DCC9C4F66A4D9 /* A00 = -9.094012482836712945103e-04 */
-	.quad	0x3FF030E7CFCCD583 /* A01 = +1.011939822882909068014e+00 */
-	.quad	0xBFACAA3B95814081 /* A02 = -5.598627281199331645611e-02 */
-	.quad	0xBFCDF78F156BE7CF /* A03 = -2.341173987004467604844e-01 */
-	.quad	0xBF5308ED74E5C7A6 /* A00 = -1.161796466103906435435e-03 */
-	.quad	0x3FF03B5986412ECB /* A01 = +1.014489674026594512313e+00 */
-	.quad	0xBFB087EBA88DCC3F /* A02 = -6.457398285947223148806e-02 */
-	.quad	0xBFCCBB9BD134862F /* A03 = -2.244753619680052991736e-01 */
-	.quad	0xBF57FA23C00DF4B5 /* A00 = -1.463446533505758208674e-03 */
-	.quad	0x3FF0473558A1BCC0 /* A01 = +1.017384859292903342975e+00 */
-	.quad	0xBFB2E702BC6360EF /* A02 = -7.383744334527241048871e-02 */
-	.quad	0xBFCB77D546379288 /* A03 = -2.145945160729250122955e-01 */
-	.quad	0xBF5DD12971557F71 /* A00 = -1.819887610814388068450e-03 */
-	.quad	0x3FF0548DDF5000A8 /* A01 = +1.020643112482540360020e+00 */
-	.quad	0xBFB571B63DA186E1 /* A02 = -8.376635555898871710045e-02 */
-	.quad	0xBFCA2D5202605148 /* A03 = -2.045080672838912594358e-01 */
-	.quad	0xBF6252B1AD5D4F17 /* A00 = -2.236697221556737096709e-03 */
-	.quad	0x3FF063738A910BF7 /* A01 = +1.024280110622155737232e+00 */
-	.quad	0xBFB8270C8E6B601B /* A02 = -9.434584118878357184013e-02 */
-	.quad	0xBFC8DD27D950A07E /* A03 = -1.942491351230763441116e-01 */
-	.quad	0xBF66470C91730CFC /* A00 = -2.719425723258004842786e-03 */
-	.quad	0x3FF073F468FCF331 /* A01 = +1.028309259519300633556e+00 */
-	.quad	0xBFBB05C2952191E4 /* A02 = -1.055566419686964629854e-01 */
-	.quad	0xBFC7886A770DE2BD /* A03 = -1.838505822486435070662e-01 */
-	.quad	0xBF6AD114AC8E98EC /* A00 = -3.273525599485007861467e-03 */
-	.quad	0x3FF0861BF53E5226 /* A01 = +1.032741506559554434119e+00 */
-	.quad	0xBFBE0C4F9B461507 /* A02 = -1.173753503881763554650e-01 */
-	.quad	0xBFC6302A037CDE3A /* A03 = -1.733448521642786954722e-01 */
-	.quad	0xBF6FFBDE2A6C2AF8 /* A00 = -3.904279630096648551207e-03 */
-	.quad	0x3FF099F2EB8E7DA3 /* A01 = +1.037585182326304034106e+00 */
-	.quad	0xBFC09C74D192DDF0 /* A02 = -1.297746680554463516444e-01 */
-	.quad	0xBFC4D571D8E3079F /* A03 = -1.627638157861470424859e-01 */
-	.quad	0xBF72E8FDC0B952AA /* A00 = -4.616728994353872309042e-03 */
-	.quad	0x3FF0AF7F273C9533 /* A01 = +1.042845872181101141152e+00 */
-	.quad	0xBFC244C512736F10 /* A02 = -1.427236881344176033792e-01 */
-	.quad	0xBFC379474F58B902 /* A03 = -1.521386277613104298645e-01 */
-	.quad	0xBF762EABAF17395B /* A00 = -5.415602341101023557701e-03 */
-	.quad	0x3FF0C6C3886F63FB /* A01 = +1.048526318502125631582e+00 */
-	.quad	0xBFC3FDF9918EA12A /* A02 = -1.561881981590514389957e-01 */
-	.quad	0xBFC21CA89ECAB895 /* A03 = -1.414995932913753196036e-01 */
-	.quad	0xBF79D387CE5B2BAE /* A00 = -6.305246822828998107258e-03 */
-	.quad	0x3FF0DFBFE2346376 /* A01 = +1.054626353847394337748e+00 */
-	.quad	0xBFC5C6DA43602620 /* A02 = -1.701309994680721970894e-01 */
-	.quad	0xBFC0C08BD8DB6631 /* A03 = -1.308760460731704100557e-01 */
-	.quad	0xBF7DDBA8E8DA9060 /* A00 = -7.289562037531366334164e-03 */
-	.quad	0x3FF0FA70F0D1B464 /* A01 = +1.061142864894713433443e+00 */
-	.quad	0xBFC79E18D92BAA7C /* A02 = -1.845122394946264732241e-01 */
-	.quad	0xBFBECBBBF74C2669 /* A03 = -1.202962378266875381749e-01 */
-	.quad	0xBF81254E76EA25DA /* A00 = -8.371937755572145950511e-03 */
-	.quad	0x3FF116D05835EBD0 /* A01 = +1.068069786618014660462e+00 */
-	.quad	0xBFC982539E2ED224 /* A02 = -1.992897531869327609755e-01 */
-	.quad	0xBFBC1B043C350159 /* A03 = -1.097872397413132278254e-01 */
-	.quad	0xBF8391ACBA863403 /* A00 = -9.555196230190082448686e-03 */
-	.quad	0x3FF134D4AA477FE2 /* A01 = +1.075398125794884141015e+00 */
-	.quad	0xBFCB7218609FEAFB /* A02 = -2.144194099235717521079e-01 */
-	.quad	0xBFB970A16CB88329 /* A03 = -9.937485603633135211599e-02 */
-	.quad	0xBF87935088E48E8B /* A00 = -1.151144902957603431692e-02 */
-	.quad	0x3FF1649892AD7DD3 /* A01 = +1.087059567413110938716e+00 */
-	.quad	0xBFCE6971DDE75409 /* A02 = -2.375929196847723912089e-01 */
-	.quad	0xBFB58291E88CB251 /* A03 = -8.402358939628952472223e-02 */
-	.quad	0xBF8DB3A62C325325 /* A00 = -1.450280973794233242702e-02 */
-	.quad	0x3FF1A9C900C6DEEA /* A01 = +1.103951457056548068891e+00 */
-	.quad	0xBFD13DBC65B0E08E /* A02 = -2.693930619311765140012e-01 */
-	.quad	0xBFB06696F62696D1 /* A03 = -6.406539449252625362252e-02 */
-	.quad	0xBF92583699F2E27A /* A00 = -1.791463198307716858659e-02 */
-	.quad	0x3FF1F451B85AA9F0 /* A01 = +1.122148246892376022288e+00 */
-	.quad	0xBFD34FD5F8288180 /* A02 = -3.017477916164565954205e-01 */
-	.quad	0xBFA6FB692825B683 /* A03 = -4.488686194495718900788e-02 */
-	.quad	0xBF9641C26E673D6F /* A00 = -2.173522757385398448959e-02 */
-	.quad	0x3FF24364DA5E2B07 /* A01 = +1.141453602790251542487e+00 */
-	.quad	0xBFD564A5A5EF5890 /* A02 = -3.342680092295120530821e-01 */
-	.quad	0xBF9B43712011A982 /* A03 = -2.662445791467283467968e-02 */
-	.quad	0xBF9A901038EC2F39 /* A00 = -2.594018313816024226548e-02 */
-	.quad	0x3FF2961356DFFEBA /* A01 = +1.161639537196534011088e+00 */
-	.quad	0xBFD775EBB17198C7 /* A02 = -3.665723069046972759644e-01 */
-	.quad	0xBF833B1A926CD462 /* A03 = -9.390075295963199591975e-03 */
-	.quad	0xBF9F396A6A461B91 /* A00 = -3.049246095317987084727e-02 */
-	.quad	0x3FF2EB53BAEF534B /* A01 = +1.182452898229899629357e+00 */
-	.quad	0xBFD97DABF8AD8BBD /* A02 = -3.982953957076310058660e-01 */
-	.quad	0x3F7B8F6A3E0F8837 /* A03 = +6.728568086119371925713e-03 */
-	.quad	0xBFA21878590F8BAA /* A00 = -3.534294211546946951064e-02 */
-	.quad	0x3FF34209790236E1 /* A01 = +1.203622315111197105253e+00 */
-	.quad	0xBFDB764C0E71BECB /* A02 = -4.290952817018306997277e-01 */
-	.quad	0x3F962FE0C03F84C0 /* A03 = +2.166701482190513949888e-02 */
-	.quad	0xBFA4B36B9AD27ECC /* A00 = -4.043136849327097492868e-02 */
-	.quad	0x3FF3990C5B12FC16 /* A01 = +1.224865298994477935679e+00 */
-	.quad	0xBFDD5AABB0D01390 /* A02 = -4.586590983092770912322e-01 */
-	.quad	0x3FA21DAF5CA162DB /* A03 = +3.538272863142363083844e-02 */
-	.quad	0xBFA7645E4D7BF28B /* A00 = -4.568762489177399105378e-02 */
-	.quad	0x3FF3EF2FD51C0D9F /* A01 = +1.245895225962932562069e+00 */
-	.quad	0xBFDF26377E1B686E /* A02 = -4.867075664057044503963e-01 */
-	.quad	0x3FA8803E756EE812 /* A03 = +4.785342391501513914509e-02 */
-	.quad	0xBFAA210925C64413 /* A00 = -5.103329263796054643398e-02 */
-	.quad	0x3FF44349F897D8E7 /* A01 = +1.266427966181760345066e+00 */
-	.quad	0xBFE06A7B02C6D8E2 /* A02 = -5.129981092675530707226e-01 */
-	.quad	0x3FAE3F194734F5D0 /* A03 = +5.907515520309980505687e-02 */
-	.quad	0xBFACDE48F8A19BBB /* A00 = -5.638340029764018351832e-02 */
-	.quad	0x3FF49439D5466582 /* A01 = +1.286187966447272845727e+00 */
-	.quad	0xBFE131C7C1063DDC /* A02 = -5.373266954429101183166e-01 */
-	.quad	0x3FB1ADEEC36AD805 /* A03 = +6.906025191241844940482e-02 */
-	.quad	0xBFAF905D8F585680 /* A00 = -6.164829611604449866036e-02 */
-	.quad	0x3FF4E0ED1FD27F99 /* A01 = +1.304913639360142818546e+00 */
-	.quad	0xBFE1E7A859DC1D3D /* A02 = -5.595285182070380836095e-01 */
-	.quad	0x3FB3ED018E4642A1 /* A03 = +7.783517573831001679086e-02 */
-	.quad	0xBFB11595104160BA /* A00 = -6.673556944713512906198e-02 */
-	.quad	0x3FF528650340490B /* A01 = +1.322361958217302513319e+00 */
-	.quad	0xBFE28B14B40BC974 /* A02 = -5.794776455425521000109e-01 */
-	.quad	0x3FB5DF49F5BAF6D7 /* A03 = +8.543836831355676453281e-02 */
-	.quad	0xBFB2513A97344BA4 /* A00 = -7.155195418844911836587e-02 */
-	.quad	0x3FF569BA0DB5EE14 /* A01 = +1.338312200124055273420e+00 */
-	.quad	0xBFE31B53A8B67B20 /* A02 = -5.970857901737396389308e-01 */
-	.quad	0x3FB787F297BB0544 /* A03 = +9.191814617499455275507e-02 */
-	.quad	0xBFB37512E848FAFA /* A00 = -7.600515528700305112331e-02 */
-	.quad	0x3FF5A41F33B403C8 /* A01 = +1.352568819013173495591e+00 */
-	.quad	0xBFE397F6EA9A58A5 /* A02 = -6.123003561103997904880e-01 */
-	.quad	0x3FB8EAA9FF25CA06 /* A03 = +9.733068923177520814782e-02 */
-	.quad	0xBFB47B3E603AFC5D /* A00 = -8.000554894805263217439e-02 */
-	.quad	0x3FF5D6E3EDE40487 /* A01 = +1.364963464031718975988e+00 */
-	.quad	0xBFE400D5BCA6D631 /* A02 = -6.251019177058819709103e-01 */
-	.quad	0x3FBA0B830ED567FE /* A03 = +1.017381583418739132707e-01 */
-	.quad	0xBFB5BBFE8AC90496 /* A00 = -8.489981544791400103200e-02 */
-	.quad	0x3FF612BA70107E95 /* A01 = +1.379572332145390989311e+00 */
-	.quad	0xBFE477EAF1FA7693 /* A02 = -6.396383978023599814478e-01 */
-	.quad	0x3FBB4784B7C08A95 /* A03 = +1.065600346196709652391e-01 */
-	.quad	0xBFB6D5D940743939 /* A00 = -8.920057128509463473254e-02 */
-	.quad	0x3FF644A8748F70CE /* A01 = +1.391762214006166953340e+00 */
-	.quad	0xBFE4D646AB07EA37 /* A02 = -6.511567440459832267763e-01 */
-	.quad	0x3FBC354F4E1D5292 /* A03 = +1.101884427747086558913e-01 */
-	.quad	0xBFB7223D19E4F3D1 /* A00 = -9.036619074045339206069e-02 */
-	.quad	0x3FF6518FEB42B7FA /* A01 = +1.394912642466350494175e+00 */
-	.quad	0xBFE4ED86CB87498C /* A02 = -6.539949393430091184598e-01 */
-	.quad	0x3FBC6D29F28CCA9B /* A03 = +1.110407082713131127205e-01 */
-	.quad	0xBFB6878652FF6312 /* A00 = -8.800544287022329936754e-02 */
-	.quad	0x3FF63948C302D040 /* A01 = +1.388985406648330922508e+00 */
-	.quad	0xBFE4C4E2E7904E17 /* A02 = -6.490339777687407218920e-01 */
-	.quad	0x3FBC127356CA1ABE /* A03 = +1.096565329445224612481e-01 */
-	.quad	0xBFB4F5D18B0C91D6 /* A00 = -8.187589306596207427980e-02 */
-	.quad	0x3FF5FD27EB7DD0B8 /* A01 = +1.374305648697413673176e+00 */
-	.quad	0xBFE464E01A2B2FC6 /* A02 = -6.373138915164353601739e-01 */
-	.quad	0x3FBB460547674A30 /* A03 = +1.065371798825160976065e-01 */
-	.quad	0xBFB26642FA16A685 /* A00 = -7.187288861919156890412e-02 */
-	.quad	0x3FF59F9BEDE1C95A /* A01 = +1.351467065073470141812e+00 */
-	.quad	0xBFE3D67920C8FBEA /* A02 = -6.199308052381387046381e-01 */
-	.quad	0x3FBA24F6A8D3CBC1 /* A03 = +1.021265184570401413078e-01 */
-	.quad	0xBFADB5294794F097 /* A00 = -5.802277563859197656582e-02 */
-	.quad	0x3FF523EA7B9CF453 /* A01 = +1.321268542159732772845e+00 */
-	.quad	0xBFE322A8B55E35DB /* A02 = -5.979808370918208160205e-01 */
-	.quad	0x3FB8C8673B1B3E37 /* A03 = +9.680791085269722928697e-02 */
-	.quad	0xBFA4B7D661965C6A /* A00 = -4.046506825687219699450e-02 */
-	.quad	0x3FF48DE3E2CE3122 /* A01 = +1.284641157110919085227e+00 */
-	.quad	0xBFE251FED1A7F445 /* A02 = -5.725092024655472622285e-01 */
-	.quad	0x3FB745699FCABDB9 /* A03 = +9.090290213747821701507e-02 */
-	.quad	0xBF93E60456E4EE1D /* A00 = -1.943213253365004902773e-02 */
-	.quad	0x3FF3E1A14E628A59 /* A01 = +1.242585474196536532432e+00 */
-	.quad	0xBFE16C5AB660E876 /* A02 = -5.444768488007543094653e-01 */
-	.quad	0x3FB5AD33AA8C188F /* A03 = +8.467410005332197397987e-02 */
-	.quad	0x3F738C17C47C7961 /* A00 = +4.772274820224659853951e-03 */
-	.quad	0x3FF3234DDE3BD146 /* A01 = +1.196119182682268355933e+00 */
-	.quad	0xBFE078C0D77A9D3B /* A02 = -5.147403915952176722826e-01 */
-	.quad	0x3FB40D74B3E276B8 /* A03 = +7.833032027925923568290e-02 */
-	.quad	0x3FA0474BECC689C7 /* A00 = +3.179394975019849550746e-02 */
-	.quad	0x3FF256FB4FA7D18A /* A01 = +1.146235762743432307076e+00 */
-	.quad	0xBFDEFA8E3FB285E2 /* A02 = -4.840427038235174395098e-01 */
-	.quad	0x3FB270C007493D59 /* A03 = +7.203293016322244446403e-02 */
-	.quad	0x3FAF5BD51E479BDC /* A00 = +6.124750132203590768931e-02 */
-	.quad	0x3FF18081D0B53BC5 /* A01 = +1.093873801484492647162e+00 */
-	.quad	0xBFDCFE2439BD0C03 /* A02 = -4.530115665294831006626e-01 */
-	.quad	0x3FB0DEFE5A45AFDD /* A03 = +6.590261176978580437424e-02 */
-	.quad	0x3FB7BD5D2806EA26 /* A00 = +9.273321368429118805032e-02 */
-	.quad	0x3FF0A369E35B4440 /* A01 = +1.039895904647224256223e+00 */
-	.quad	0xBFDB04BC5C9951E7 /* A02 = -4.221640495573226181669e-01 */
-	.quad	0x3FAEBBBAA9D6DEEF /* A03 = +6.002600978120919278380e-02 */
-	.quad	0x3FC01BE411098DBC /* A00 = +1.258511622610124502941e-01 */
-	.quad	0x3FEF85BDABC031C1 /* A01 = +9.850757936961188621083e-01 */
-	.quad	0xBFD91521375097C2 /* A02 = -3.919146576102968682065e-01 */
-	.quad	0x3FABE26F0086D982 /* A03 = +5.446192628317005068883e-02 */
-	.quad	0x3FC481D7FF5776B9 /* A00 = +1.602125164781023347604e-01 */
-	.quad	0x3FEDC3506C1E7218 /* A01 = +9.300920592973538347792e-01 */
-	.quad	0xBFD7349A88DA7D4F /* A02 = -3.625856720409119104964e-01 */
-	.quad	0x3FA936E2DFF8E2AE /* A03 = +4.924687370334389358018e-02 */
-	.quad	0x3FC90471F96FA27A /* A00 = +1.954481571149420671141e-01 */
-	.quad	0x3FEC0451601987A2 /* A01 = +8.755270840595026360376e-01 */
-	.quad	0xBFD5671CD4B898DC /* A02 = -3.344184949259110251063e-01 */
-	.quad	0x3FA6BB9594603B67 /* A03 = +4.439990459660841243261e-02 */
-	.quad	0x3FCFD8ADB9ED944C /* A00 = +2.488000066615846384011e-01 */
-	.quad	0x3FE978C073F6809A /* A01 = +7.959902062321078108909e-01 */
-	.quad	0xBFD2DF7E00BCD5A9 /* A02 = -2.948908812716931060471e-01 */
-	.quad	0x3FA3614033D490B2 /* A03 = +3.785133965200894456959e-02 */
-	.quad	0x3FD4846A12AFE5A0 /* A00 = +3.205819303981005674586e-01 */
-	.quad	0x3FE63A1147D40472 /* A01 = +6.945883181471244061100e-01 */
-	.quad	0xBFCFA2268AD34450 /* A02 = -2.471359422548027318101e-01 */
-	.quad	0x3F9F150201D9FFE0 /* A03 = +3.035357605267552383310e-02 */
-	.quad	0x3FD9018641F82BEB /* A00 = +3.907180446846598154131e-01 */
-	.quad	0x3FE33B7C220FFBDC /* A01 = +6.010113396913498995389e-01 */
-	.quad	0xBFCA4E4187E29C86 /* A02 = -2.055131829740483584423e-01 */
-	.quad	0x3F98C30CED19F8F4 /* A03 = +2.418155858185229434287e-02 */
-	.quad	0x3FDD4B8255BEB078 /* A00 = +4.577337109901757905561e-01 */
-	.quad	0x3FE0858B19D3A49B /* A01 = +5.163016800335243905451e-01 */
-	.quad	0xBFC5BC929EACE564 /* A02 = -1.698172831327539045176e-01 */
-	.quad	0x3F93A083CE57DE2B /* A03 = +1.916700312537337677621e-02 */
-	.quad	0x3FE0A8E5E039295C /* A00 = +5.206174258576470315063e-01 */
-	.quad	0x3FDC35E1234583FE /* A01 = +4.407885403107342225937e-01 */
-	.quad	0xBFC1DE034E31AEB9 /* A02 = -1.395877963835710222629e-01 */
-	.quad	0x3F8EFDEBB3471BDC /* A03 = +1.513275280821162888101e-02 */
-	.quad	0x3FE2851B603CB2A5 /* A00 = +5.787484054213406503564e-01 */
-	.quad	0x3FD7F4A44ABBB286 /* A01 = +3.743067483726821853551e-01 */
-	.quad	0xBFBD3EEB67087DE7 /* A02 = -1.142413260026767657385e-01 */
-	.quad	0x3F8864F38329E8BD /* A03 = +1.191129917173260922836e-02 */
-	.quad	0x3FE437DBE3C34AC1 /* A00 = +6.318187187665317283702e-01 */
-	.quad	0x3FD43F6F789441B5 /* A01 = +3.163717916040938438194e-01 */
-	.quad	0xBFB7D92E7901B9A4 /* A02 = -9.315767721429907277653e-02 */
-	.quad	0x3F8327ED342308E1 /* A03 = +9.353497651663324544136e-03 */
-	.quad	0x3FE5C0977766D55C /* A00 = +6.797597248138731451661e-01 */
-	.quad	0x3FD10B42A764D8F9 /* A01 = +2.663122782427219115142e-01 */
-	.quad	0xBFB3633351D3D70F /* A02 = -7.573242900602060456716e-02 */
-	.quad	0x3F7E079E30FF899C /* A03 = +7.331483779099558922843e-03 */
-	.quad	0x3FE7202CE08A88C4 /* A00 = +7.226776490754436288455e-01 */
-	.quad	0x3FCC973EB5662B01 /* A01 = +2.233656297433626314319e-01 */
-	.quad	0xBFAF70A455F9920B /* A02 = -6.140626477716545211782e-02 */
-	.quad	0x3F77812411CE99B6 /* A03 = +5.738392731393584730859e-03 */
-	.quad	0x3FE85879424095B1 /* A00 = +7.608000082006382003286e-01 */
-	.quad	0x3FC7E73BD1674D84 /* A01 = +1.867441914060742336190e-01 */
-	.quad	0xBFA96F84E4BF333B /* A02 = -4.967894832916504993525e-02 */
-	.quad	0x3F72606DDCA6E117 /* A03 = +4.486493251924870105662e-03 */
-	.quad	0x3FE96BFE4957F4DD /* A00 = +7.944327766887472330737e-01 */
-	.quad	0x3FC3ED4780D25478 /* A01 = +1.556786898624158421711e-01 */
-	.quad	0xBFA489C5F9A56B58 /* A02 = -4.011362717093075458408e-02 */
-	.quad	0x3F6CB5DC17E9AD2A /* A03 = +3.504686231556104931972e-03 */
-	.quad	0x3FEA5D9CB2F41234 /* A00 = +8.239272589858672724006e-01 */
-	.quad	0x3FC091A758374DCF /* A01 = +1.294449978582705440555e-01 */
-	.quad	0xBFA08E436D4B5CE0 /* A02 = -3.233538350257858517978e-02 */
-	.quad	0x3F666997AD53E6B7 /* A03 = +2.735897297154145629133e-03 */
-	.quad	0x3FEB3060342CB850 /* A00 = +8.496552485501158713532e-01 */
-	.quad	0x3FBB7D30BBC7DC1B /* A01 = +1.073790033768634993860e-01 */
-	.quad	0xBF9AA6BA3443D9E3 /* A02 = -2.602663940430173170060e-02 */
-	.quad	0x3F617CA764B7850B /* A03 = +2.134634914668814050648e-03 */
-	.quad	0x3FEBE759A6A0C7B8 /* A00 = +8.719909910635044170135e-01 */
-	.quad	0x3FB6C10DE6A703FF /* A01 = +8.888327485239243264115e-02 */
-	.quad	0xBF956C566D8BE1F6 /* A02 = -2.092108768099084498138e-02 */
-	.quad	0x3F5B46D1A4A59CF8 /* A03 = +1.664833764687232917079e-03 */
-	.quad	0x3FEC858494887A04 /* A00 = +8.912985707318630268503e-01 */
-	.quad	0x3FB2CC31F543394D /* A01 = +7.342827070099140762682e-02 */
-	.quad	0xBF9133477FF69137 /* A02 = -1.679717749142747504343e-02 */
-	.quad	0x3F5544482FBB4DA5 /* A03 = +1.298017973501022466823e-03 */
-	.quad	0x3FED0DB59D0E32E9 /* A00 = +9.079235141267335551518e-01 */
-	.quad	0x3FAF006BAFFC6EF4 /* A01 = +6.055008433597022787787e-02 */
-	.quad	0xBF8B97146FA2B97A /* A02 = -1.347175565419144252499e-02 */
-	.quad	0x3F5093B01F4CDC69 /* A03 = +1.011774057770665211434e-03 */
-	.quad	0x3FEDB487C3EC457C /* A00 = +9.282873942012623835751e-01 */
-	.quad	0x3FA7390C09D0BD1D /* A01 = +4.535710925881118044112e-02 */
-	.quad	0xBF83D9F7C3181106 /* A02 = -9.693084374710735778846e-03 */
-	.quad	0x3F46E34A0A3C0E64 /* A03 = +6.984817050299072134500e-04 */
-	.quad	0x3FEE5FFCB4E6EB00 /* A00 = +9.492171796076434020506e-01 */
-	.quad	0x3F9F4913ED00AADF /* A01 = +3.055220731782070861526e-02 */
-	.quad	0xBF79670BD0E59B5C /* A02 = -6.201788097633133961528e-03 */
-	.quad	0x3F3BC998EBCAF96D /* A03 = +4.240034429975534616304e-04 */
-	.quad	0x3FEEDBA41E9542FE /* A00 = +9.643116566968215064293e-01 */
-	.quad	0x3F94F5DD18D9C24D /* A01 = +2.046914543319848858727e-02 */
-	.quad	0xBF7034896AA122B9 /* A02 = -3.956352980886528904192e-03 */
-	.quad	0x3F30DCCB47810B39 /* A03 = +2.573009765038273091199e-04 */
-	.quad	0x3FEF33F2882520ED /* A00 = +9.750912341196716903724e-01 */
-	.quad	0x3F8BF37F2CF553FF /* A01 = +1.364802699996836392315e-02 */
-	.quad	0xBF649F6F05A69619 /* A02 = -2.517430152880317534986e-03 */
-	.quad	0x3F247623C950AAC9 /* A03 = +1.561087307505231250044e-04 */
-	.quad	0x3FEF727757751741 /* A00 = +9.827229221489021115943e-01 */
-	.quad	0x3F828E67912C4400 /* A01 = +9.060677640748693306705e-03 */
-	.quad	0xBF5A2F51A806CC2C /* A02 = -1.598195784123355826789e-03 */
-	.quad	0x3F18D35D7687E613 /* A03 = +9.470231965016282719549e-05 */
-	.quad	0x3FEF9E6325C5942A /* A00 = +9.880843866091073568469e-01 */
-	.quad	0x3F788AB117618F76 /* A01 = +5.991641772286606867914e-03 */
-	.quad	0xBF5096EAB0B1EA89 /* A02 = -1.012543859160305046233e-03 */
-	.quad	0x3F0E1E50EC4435AB /* A03 = +5.744633156910412119652e-05 */
-	.quad	0x3FEFBD0784049369 /* A00 = +9.918248728250605994461e-01 */
-	.quad	0x3F702BBD8294035F /* A01 = +3.947963975634432264028e-03 */
-	.quad	0xBF44FB55E0F00593 /* A02 = -6.403130845457509273330e-04 */
-	.quad	0x3F0244DCD723230A /* A03 = +3.484534217219031730379e-05 */
-	.quad	0x3FEFD245E2366A43 /* A00 = +9.944180887426415926811e-01 */
-	.quad	0x3F653D82EC088433 /* A01 = +2.592807490387838333795e-03 */
-	.quad	0xBF3A7DF75E013CB8 /* A02 = -4.042366908878036561859e-04 */
-	.quad	0x3EF6298E69F991CD /* A03 = +2.113564425911141559972e-05 */
-	.quad	0x3FEFE0EAA508BC69 /* A00 = +9.962056372950317539861e-01 */
-	.quad	0x3F5BD0771AF3FDDA /* A01 = +1.697651208644282514598e-03 */
-	.quad	0xBF30B2E1254DE571 /* A02 = -2.548026725928887099328e-04 */
-	.quad	0x3EEAE28B70EC0256 /* A03 = +1.281973848454955042307e-05 */
-	.quad	0x3FEFEAF5303D7F96 /* A00 = +9.974313680831865536192e-01 */
-	.quad	0x3F5229111365657E /* A01 = +1.108423877289460134782e-03 */
-	.quad	0xBF250572D04DFE66 /* A02 = -1.603796628408704519168e-04 */
-	.quad	0x3EE04E89BB57C981 /* A03 = +7.775682983689149966743e-06 */
-	.quad	0x3FEFF1CF52F1CF44 /* A00 = +9.982678051005469122003e-01 */
-	.quad	0x3F47A71316147CEB /* A01 = +7.218211359577819110842e-04 */
-	.quad	0xBF1A6D7604055719 /* A02 = -1.008132248946049582547e-04 */
-	.quad	0x3ED3C8047586A85C /* A03 = +4.716233739913014633626e-06 */
-	.quad	0x3FEFF6770369EF69 /* A00 = +9.988360468555416149528e-01 */
-	.quad	0x3F3EBB261180FBF0 /* A01 = +4.689186039321105101130e-04 */
-	.quad	0xBF1097754FE19D7F /* A02 = -6.329206004950480057066e-05 */
-	.quad	0x3EC7FEFF83BCA0A7 /* A03 = +2.860556404988488738366e-06 */
-	.quad	0x3FEFF99D42371AC4 /* A00 = +9.992204945818561334647e-01 */
-	.quad	0x3F33EB2AEC271F59 /* A01 = +3.039340773764907474054e-04 */
-	.quad	0xBF04CF18E0FC0D79 /* A02 = -3.968996690952969588805e-05 */
-	.quad	0x3EBD1BDBD6019BE9 /* A03 = +1.735021065507727833886e-06 */
-	.quad	0x3FEFFBBCA32B0D91 /* A00 = +9.994795977476532700123e-01 */
-	.quad	0x3F29C41E1615110A /* A01 = +1.965796209707565346710e-04 */
-	.quad	0xBEFA11F93D9DCB5A /* A02 = -2.486248909101414873235e-05 */
-	.quad	0x3EB1A7CA4546F7A7 /* A03 = +1.052345642723709228769e-06 */
-	.quad	0x3FEFFD298B8E8DE2 /* A00 = +9.996535993308806045121e-01 */
-	.quad	0x3F20A1C42D523C5B /* A01 = +1.268913244172078754520e-04 */
-	.quad	0xBEF0507A364AFAE4 /* A02 = -1.555859070622834605755e-05 */
-	.quad	0x3EA56ACA17E7CDF4 /* A03 = +6.382806956848098872313e-07 */
-	.quad	0x3FEFFE1DC82BA5A3 /* A00 = +9.997700604991915929176e-01 */
-	.quad	0x3F156E73B90F1769 /* A01 = +8.175450626798714452801e-05 */
-	.quad	0xBEE4663579D0A09F /* A02 = -9.727122057226747625365e-06 */
-	.quad	0x3E99FAF6FEC5D4C1 /* A03 = +3.871371052824002996020e-07 */
-	.quad	0x3FEFFEF8D0BB5E81 /* A00 = +9.998745037837154514548e-01 */
-	.quad	0x3F06686DA18D39C3 /* A01 = +4.273972098777251447726e-05 */
-	.quad	0xBED46BC298073E90 /* A02 = -4.868731025855742842491e-06 */
-	.quad	0x3E88E42286B9D0FD /* A03 = +1.854535328530838170114e-07 */
-	.quad	0x3FEFFF8DBC68DDC7 /* A00 = +9.999455146670975791423e-01 */
-	.quad	0x3EF26B2953A80AF0 /* A01 = +1.756534514108903368909e-05 */
-	.quad	0xBEBFC4472D580F83 /* A02 = -1.893443529411295465239e-06 */
-	.quad	0x3E72505B4553D19F /* A03 = +6.822456673547912277047e-08 */
-	.quad	0x3FEFFFCED1276609 /* A00 = +9.999765477215883935358e-01 */
-	.quad	0x3EDE1A94C7CC58F5 /* A01 = +7.177313020153979672606e-06 */
-	.quad	0xBEA8A2C988744E57 /* A02 = -7.342066660497443762363e-07 */
-	.quad	0x3E5AF30036BBBAF4 /* A03 = +2.509841882843541084885e-08 */
-	.quad	0x3FEFFFEAFE70FCFC /* A00 = +9.999899835164849370983e-01 */
-	.quad	0x3EC879175E3549F5 /* A01 = +2.917410471128503564412e-06 */
-	.quad	0xBE930E36677D1813 /* A02 = -2.839493400307523115929e-07 */
-	.quad	0x3E43D4005B42D48F /* A03 = +9.233192745401904898013e-09 */
-	.quad	0x3ff0000000000000
-	.quad	0x0000000000000000
-	.quad	0x0000000000000000
-	.quad	0x0000000000000000
-	.align	32
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSignMask */
-	.align	32
-	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _sAbsMask */
-	.align	32
-	.long	0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000 /* _iExpMantMask */
-	.align	32
-	.long	0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000 /* _iExpMask */
-	.align	32
-	.long	0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000 /* _iMinIdxOfsMask */
-	.align	32
-	.long	0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000 /* _iMaxIdxMask */
-	.align	32
-	.type	__svml_stanh_data_internal, @object
-	.size	__svml_stanh_data_internal, .-__svml_stanh_data_internal
+	/* All results have been written to (%rsp).  */
+	vmovups	(%rsp), %ymm0
+	/* Restore rsp.  */
+	movq	%r13, %rsp
+	cfi_def_cfa_register(rsp)
+	/* Restore callee save registers.  */
+	popq	%rbp
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(rbp)
+	popq	%rbx
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(rbp)
+	popq	%r13
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(r13)
+	ret
+END(_ZGVdN8v_tanhf_avx2)
-- 
2.34.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v4 1/7] x86: Improve svml_s_atanhf16_core_avx512.S
  2022-06-09 18:16 ` [PATCH v4 " Noah Goldstein
  2022-06-09 18:16   ` [PATCH v4 2/7] x86: Improve svml_s_atanhf8_core_avx2.S Noah Goldstein
  2022-06-09 18:16   ` [PATCH v4 6/7] x86: Optimize svml_s_tanhf8_core_avx2.S Noah Goldstein
@ 2022-06-09 19:33   ` H.J. Lu
  2 siblings, 0 replies; 48+ messages in thread
From: H.J. Lu @ 2022-06-09 19:33 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Thu, Jun 9, 2022 at 11:16 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Improvements are:
>     1. Reduce code size (-64 bytes).
>     2. Remove redundant move instructions.
>     3. Slightly improve instruction selection/scheduling where
>        possible.
>     4. Reduce rodata size ([-128, -188] bytes).
>
> The throughput improvement is not significant as the port 0 bottleneck
> is unavoidable.
>
>         Function, New Time, Old Time, New / Old
> _ZGVeN16v_atanhf,     1.39,    1.408,     0.987
> ---
>  .../multiarch/svml_s_atanhf16_core_avx512.S   | 474 +++++++++---------
>  1 file changed, 244 insertions(+), 230 deletions(-)
>
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> index a1cd920a0f..f42462c581 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> @@ -31,53 +31,50 @@
>   *
>   */
>
> -/* Offsets for data table __svml_satanh_data_internal_avx512
> - */
> -#define Log_tbl_H                      0
> -#define Log_tbl_L                      128
> -#define One                            256
> -#define AbsMask                                320
> -#define AddB5                          384
> -#define RcpBitMask                     448
> -#define poly_coeff3                    512
> -#define poly_coeff2                    576
> -#define poly_coeff1                    640
> -#define poly_coeff0                    704
> -#define Half                           768
> -#define L2H                            832
> -#define L2L                            896
> +/* Offsets for data table __svml_satanh_data_internal_avx512 and
> +   __svml_satanh_data_internal_avx512_al64. Ordered by use in the
> +   function. On cold-starts this might help the prefetcher. Possibly
> +   a better idea is to interleave start/end so that the prefetcher is
> +   less likely to detect a stream and pull irrelevant lines into
> +   cache.  */
> +
> +/* Offset into __svml_satanh_data_internal_avx512. 4-byte aligned as
> +   the memory is broadcast to {1to16}.  */
> +#define AbsMask                                0
> +
> +/* Offset into __svml_satanh_data_internal_avx512_al64. The full value
> +   is used here.  */
> +#define One                            0
> +#define AddB5                          64
> +#define RcpBitMask                     128
> +#define Log_tbl_L_lo                   192
> +#define Log_tbl_L_hi                   256
> +#define Log_tbl_H_lo                   320
> +#define Log_tbl_H_hi                   384
> +#define L2H                            448
> +#define L2L                            512
> +#define poly_coeff3                    576
> +#define poly_coeff2                    640
> +#define poly_coeff1                    704
>
>  #include <sysdep.h>
>
> +#define ATANHF_DATA(x)                 ((x)+__svml_satanh_data_internal_avx512_al64)
> +
>         .section .text.exex512, "ax", @progbits
>  ENTRY(_ZGVeN16v_atanhf_skx)
> -       pushq   %rbp
> -       cfi_def_cfa_offset(16)
> -       movq    %rsp, %rbp
> -       cfi_def_cfa(6, 16)
> -       cfi_offset(6, -16)
> -       andq    $-64, %rsp
> -       subq    $192, %rsp
> -       vmovups One+__svml_satanh_data_internal_avx512(%rip), %zmm4
> -
> -       /* round reciprocals to 1+5b mantissas */
> -       vmovups AddB5+__svml_satanh_data_internal_avx512(%rip), %zmm14
> -       vmovups RcpBitMask+__svml_satanh_data_internal_avx512(%rip), %zmm1
> -       vmovaps %zmm0, %zmm11
> -       vandps  AbsMask+__svml_satanh_data_internal_avx512(%rip), %zmm11, %zmm6
> +       vandps  AbsMask+__svml_satanh_data_internal_avx512(%rip){1to16}, %zmm0, %zmm6
> +       vmovups ATANHF_DATA(One)(%rip), %zmm4
>
>         /* 1+y */
>         vaddps  {rn-sae}, %zmm4, %zmm6, %zmm9
>
>         /* 1-y */
>         vsubps  {rn-sae}, %zmm6, %zmm4, %zmm8
> -       vxorps  %zmm6, %zmm11, %zmm10
> -
> -       /* Yp_high */
> -       vsubps  {rn-sae}, %zmm4, %zmm9, %zmm2
>
> -       /* -Ym_high */
> -       vsubps  {rn-sae}, %zmm4, %zmm8, %zmm5
> +       /* round reciprocals to 1+5b mantissas */
> +       vmovups ATANHF_DATA(AddB5)(%rip), %zmm14
> +       vmovups ATANHF_DATA(RcpBitMask)(%rip), %zmm1
>
>         /* RcpP ~ 1/Yp */
>         vrcp14ps %zmm9, %zmm12
> @@ -85,15 +82,21 @@ ENTRY(_ZGVeN16v_atanhf_skx)
>         /* RcpM ~ 1/Ym */
>         vrcp14ps %zmm8, %zmm13
>
> +       /* Yp_high */
> +       vsubps  {rn-sae}, %zmm4, %zmm9, %zmm2
> +
> +       /* -Ym_high */
> +       vsubps  {rn-sae}, %zmm4, %zmm8, %zmm5
> +
> +
>         /* input outside (-1, 1) ? */
> -       vcmpps  $21, {sae}, %zmm4, %zmm6, %k0
>         vpaddd  %zmm14, %zmm12, %zmm15
> -       vpaddd  %zmm14, %zmm13, %zmm0
> +       vpaddd  %zmm14, %zmm13, %zmm12
>
>         /* Yp_low */
>         vsubps  {rn-sae}, %zmm2, %zmm6, %zmm3
>         vandps  %zmm1, %zmm15, %zmm7
> -       vandps  %zmm1, %zmm0, %zmm12
> +       vandps  %zmm1, %zmm12, %zmm12
>
>         /* Ym_low */
>         vaddps  {rn-sae}, %zmm5, %zmm6, %zmm5
> @@ -102,225 +105,199 @@ ENTRY(_ZGVeN16v_atanhf_skx)
>         vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9
>
>         /* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low */
> -       vfmsub231ps {rn-sae}, %zmm12, %zmm8, %zmm4
> -       vmovups Log_tbl_L+__svml_satanh_data_internal_avx512(%rip), %zmm8
> -       vmovups Log_tbl_L+64+__svml_satanh_data_internal_avx512(%rip), %zmm13
> +       vfmsub213ps {rn-sae}, %zmm4, %zmm12, %zmm8
> +
> +       vmovups ATANHF_DATA(Log_tbl_L_lo)(%rip), %zmm10
> +       vmovups ATANHF_DATA(Log_tbl_L_hi)(%rip), %zmm13
>
>         /* exponents */
> -       vgetexpps {sae}, %zmm7, %zmm15
>         vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9
> +       vgetexpps {sae}, %zmm7, %zmm15
> +
>
>         /* Table lookups */
> -       vmovups __svml_satanh_data_internal_avx512(%rip), %zmm6
> +       vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm8
>         vgetexpps {sae}, %zmm12, %zmm14
> -       vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm4
> +
>
>         /* Prepare table index */
>         vpsrld  $18, %zmm7, %zmm3
>         vpsrld  $18, %zmm12, %zmm2
> -       vmovups Log_tbl_H+64+__svml_satanh_data_internal_avx512(%rip), %zmm7
> -       vmovups poly_coeff1+__svml_satanh_data_internal_avx512(%rip), %zmm12
> -
> +       vmovups ATANHF_DATA(Log_tbl_H_lo)(%rip), %zmm11
> +       vmovups ATANHF_DATA(Log_tbl_H_hi)(%rip), %zmm7
>         /* Km-Kp */
> +
> +       vmovaps %zmm3, %zmm5
> +       vpermi2ps %zmm13, %zmm10, %zmm3
> +       vpermt2ps %zmm13, %zmm2, %zmm10
> +       vpermi2ps %zmm7, %zmm11, %zmm5
> +       vpermt2ps %zmm7, %zmm2, %zmm11
>         vsubps  {rn-sae}, %zmm15, %zmm14, %zmm1
> -       kmovw   %k0, %edx
> -       vmovaps %zmm3, %zmm0
> -       vpermi2ps %zmm13, %zmm8, %zmm3
> -       vpermt2ps %zmm13, %zmm2, %zmm8
> -       vpermi2ps %zmm7, %zmm6, %zmm0
> -       vpermt2ps %zmm7, %zmm2, %zmm6
> -       vsubps  {rn-sae}, %zmm3, %zmm8, %zmm5
> +       vsubps  {rn-sae}, %zmm3, %zmm10, %zmm7
>
>         /* K*L2H + Th */
> -       vmovups L2H+__svml_satanh_data_internal_avx512(%rip), %zmm2
> +       vmovups ATANHF_DATA(L2H)(%rip), %zmm2
>
>         /* K*L2L + Tl */
> -       vmovups L2L+__svml_satanh_data_internal_avx512(%rip), %zmm3
> -
> -       /* polynomials */
> -       vmovups poly_coeff3+__svml_satanh_data_internal_avx512(%rip), %zmm7
> -       vmovups poly_coeff0+__svml_satanh_data_internal_avx512(%rip), %zmm13
> +       vmovups ATANHF_DATA(L2L)(%rip), %zmm3
>
>         /* table values */
> -       vsubps  {rn-sae}, %zmm0, %zmm6, %zmm0
> -       vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm0
> -       vfmadd213ps {rn-sae}, %zmm5, %zmm3, %zmm1
> -       vmovups poly_coeff2+__svml_satanh_data_internal_avx512(%rip), %zmm3
> -       vmovaps %zmm3, %zmm2
> -       vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm2
> -       vfmadd231ps {rn-sae}, %zmm4, %zmm7, %zmm3
> -       vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm2
> -       vfmadd213ps {rn-sae}, %zmm12, %zmm4, %zmm3
> -       vfmadd213ps {rn-sae}, %zmm13, %zmm9, %zmm2
> -       vfmadd213ps {rn-sae}, %zmm13, %zmm4, %zmm3
> +       vsubps  {rn-sae}, %zmm5, %zmm11, %zmm5
> +       vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm5
> +       vfmadd213ps {rn-sae}, %zmm7, %zmm3, %zmm1
> +       /* polynomials */
> +       vmovups ATANHF_DATA(poly_coeff3)(%rip), %zmm7
> +       vmovups ATANHF_DATA(poly_coeff2)(%rip), %zmm10
> +       vmovaps %zmm10, %zmm14
> +       vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm10
> +       vfmadd231ps {rn-sae}, %zmm8, %zmm7, %zmm14
> +       vmovups ATANHF_DATA(poly_coeff1)(%rip), %zmm12
> +       vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm10
> +       vfmadd213ps {rn-sae}, %zmm12, %zmm8, %zmm14
> +       vfmadd213ps {rn-sae}, %zmm4, %zmm9, %zmm10
> +       vfmadd213ps {rn-sae}, %zmm4, %zmm8, %zmm14
>
>         /* (K*L2L + Tl) + Rp*PolyP */
> -       vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm2
> -       vorps   Half+__svml_satanh_data_internal_avx512(%rip), %zmm10, %zmm9
> +       vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm10
> +
> +       /* zmm12 = zmm12 & (zmm4 | zmm0).  */
> +       vpternlogq $0xe0, %zmm0, %zmm4, %zmm12
>
>         /* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM */
> -       vfnmadd213ps {rn-sae}, %zmm2, %zmm4, %zmm3
> -       vaddps  {rn-sae}, %zmm3, %zmm0, %zmm4
> -       vmulps  {rn-sae}, %zmm9, %zmm4, %zmm0
> +       vfnmadd213ps {rn-sae}, %zmm5, %zmm8, %zmm14
> +       vaddps  {rn-sae}, %zmm14, %zmm10, %zmm8
> +
> +       vcmpps  $21, {sae}, %zmm4, %zmm6, %k0
> +       kmovw   %k0, %edx
>         testl   %edx, %edx
>
>         /* Go to special inputs processing branch */
>         jne     L(SPECIAL_VALUES_BRANCH)
> -       # LOE rbx r12 r13 r14 r15 edx zmm0 zmm11
> +       # LOE rbx r12 r13 r14 r15 zmm0 zmm8 zmm12
> +       vmulps  {rn-sae}, %zmm12, %zmm8, %zmm0
>
> -       /* Restore registers
> -        * and exit the function
> -        */
> -
> -L(EXIT):
> -       movq    %rbp, %rsp
> -       popq    %rbp
> -       cfi_def_cfa(7, 8)
> -       cfi_restore(6)
> +       /* No register to restore on fast path.  */
>         ret
> -       cfi_def_cfa(6, 16)
> -       cfi_offset(6, -16)
> -
> -       /* Branch to process
> -        * special inputs
> -        */
>
> +       /* Cold case. edx has 1s where there was a special value that
> +          needs to be handled by a atanhf call. Optimize for code size
> +          more so than speed here. */
>  L(SPECIAL_VALUES_BRANCH):
> -       vmovups %zmm11, 64(%rsp)
> -       vmovups %zmm0, 128(%rsp)
> -       # LOE rbx r12 r13 r14 r15 edx zmm0
> -
> -       xorl    %eax, %eax
> -       # LOE rbx r12 r13 r14 r15 eax edx
> -
> -       vzeroupper
> -       movq    %r12, 16(%rsp)
> -       /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> -       movl    %eax, %r12d
> -       movq    %r13, 8(%rsp)
> -       /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> -       movl    %edx, %r13d
> -       movq    %r14, (%rsp)
> -       /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> -       # LOE rbx r15 r12d r13d
> -
> -       /* Range mask
> -        * bits check
> +       # LOE rbx rdx r12 r13 r14 r15 zmm0 zmm8 zmm12
> +    /* Use r13 to save/restore the stack. This allows us to use rbp as
> +       callee save register saving code size. */
> +       pushq   %r13
> +       cfi_adjust_cfa_offset(8)
> +       cfi_offset(r13, -16)
> +       /* Need to callee save registers to preserve state across atanhf calls.
>          */
> +       pushq   %rbx
> +       cfi_adjust_cfa_offset(8)
> +       cfi_offset(rbx, -24)
> +       pushq   %rbp
> +       cfi_adjust_cfa_offset(8)
> +       cfi_offset(rbp, -32)
> +       movq    %rsp, %r13
> +       cfi_def_cfa_register(r13)
>
> -L(RANGEMASK_CHECK):
> -       btl     %r12d, %r13d
> -
> -       /* Call scalar math function */
> -       jc      L(SCALAR_MATH_CALL)
> -       # LOE rbx r15 r12d r13d
> -
> -       /* Special inputs
> -        * processing loop
> -        */
> +       /* Align stack and make room for 2x zmm vectors.  */
> +       andq    $-64, %rsp
> +       addq    $-128, %rsp
> +       vmulps  {rn-sae}, %zmm12, %zmm8, %zmm1
> +       vmovaps %zmm1, (%rsp)
> +       vmovaps %zmm0, 64(%rsp)
> +       vzeroupper
>
> +       /* edx has 1s where there was a special value that needs to be handled
> +          by a atanhf call.  */
> +       movl    %edx, %ebx
>  L(SPECIAL_VALUES_LOOP):
> -       incl    %r12d
> -       cmpl    $16, %r12d
> -
> -       /* Check bits in range mask */
> -       jl      L(RANGEMASK_CHECK)
> -       # LOE rbx r15 r12d r13d
> -
> -       movq    16(%rsp), %r12
> -       cfi_restore(12)
> -       movq    8(%rsp), %r13
> -       cfi_restore(13)
> -       movq    (%rsp), %r14
> -       cfi_restore(14)
> -       vmovups 128(%rsp), %zmm0
> -
> -       /* Go to exit */
> -       jmp     L(EXIT)
> -       /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> -       /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> -       /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> -       # LOE rbx r12 r13 r14 r15 zmm0
> -
> -       /* Scalar math fucntion call
> -        * to process special input
> -        */
> -
> -L(SCALAR_MATH_CALL):
> -       movl    %r12d, %r14d
> -       movss   64(%rsp, %r14, 4), %xmm0
> +       # LOE rbx rbp r12 r13 r14 r15
> +       /* use rbp as index for special value that is saved across calls to
> +          atanhf. We technically don't need a callee save register here as offset
> +          to rsp is always [0, 56] so we can restore rsp by realigning to 64.
> +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> +          in the loop. Realigning also costs more code size.  */
> +       xorl    %ebp, %ebp
> +       tzcntl  %ebx, %ebp
> +
> +       /* Scalar math function call to process special input.  */
> +       movss   64(%rsp, %rbp, 4), %xmm0
>         call    atanhf@PLT
> -       # LOE rbx r14 r15 r12d r13d xmm0
> -
> -       movss   %xmm0, 128(%rsp, %r14, 4)
>
> -       /* Process special inputs in loop */
> -       jmp     L(SPECIAL_VALUES_LOOP)
> -       # LOE rbx r15 r12d r13d
> +       /* No good way to avoid the store-forwarding fault this will cause on
> +          return. `lfence` avoids the SF fault but at greater cost as it
> +          serializes stack/callee save restoration.  */
> +       movss   %xmm0, (%rsp, %rbp, 4)
> +
> +       blsrl   %ebx, %ebx
> +       jnz     L(SPECIAL_VALUES_LOOP)
> +       # LOE r12 r13 r14 r15
> +
> +       /* All results have been written to (%rsp).  */
> +       vmovaps (%rsp), %zmm0
> +       /* Restore rsp.  */
> +       movq    %r13, %rsp
> +       cfi_def_cfa_register(rsp)
> +       /* Restore callee save registers.  */
> +       popq    %rbp
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_restore(rbp)
> +       popq    %rbx
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_restore(rbp)
> +       popq    %r13
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_restore(r13)
> +       ret
>  END(_ZGVeN16v_atanhf_skx)
>
>         .section .rodata, "a"
> -       .align  64
> -
> +       .align  4
>  #ifdef __svml_satanh_data_internal_avx512_typedef
>  typedef unsigned int VUINT32;
> -typedef struct {
> -       __declspec(align(64)) VUINT32 Log_tbl_H[32][1];
> -       __declspec(align(64)) VUINT32 Log_tbl_L[32][1];
> +typedef struct{
> +       __declspec(align(4)) VUINT32 AbsMask[1][1];
>         __declspec(align(64)) VUINT32 One[16][1];
> -       __declspec(align(64)) VUINT32 AbsMask[16][1];
>         __declspec(align(64)) VUINT32 AddB5[16][1];
>         __declspec(align(64)) VUINT32 RcpBitMask[16][1];
> +       __declspec(align(64)) VUINT32 Log_tbl_L_lo[16][1];
> +       __declspec(align(64)) VUINT32 Log_tbl_L_hi[16][1];
> +       __declspec(align(64)) VUINT32 Log_tbl_H_lo[16][1];
> +       __declspec(align(64)) VUINT32 Log_tbl_H_hi[16][1];
> +       __declspec(align(64)) VUINT32 L2H[16][1];
> +       __declspec(align(64)) VUINT32 L2L[16][1];
>         __declspec(align(64)) VUINT32 poly_coeff3[16][1];
>         __declspec(align(64)) VUINT32 poly_coeff2[16][1];
>         __declspec(align(64)) VUINT32 poly_coeff1[16][1];
> -       __declspec(align(64)) VUINT32 poly_coeff0[16][1];
> -       __declspec(align(64)) VUINT32 Half[16][1];
> -       __declspec(align(64)) VUINT32 L2H[16][1];
> -       __declspec(align(64)) VUINT32 L2L[16][1];
>  } __svml_satanh_data_internal_avx512;
>  #endif
>  __svml_satanh_data_internal_avx512:
> -       /* Log_tbl_H */
> -       .long   0x00000000
> -       .long   0x3cfc0000
> -       .long   0x3d780000
> -       .long   0x3db78000
> -       .long   0x3df10000
> -       .long   0x3e14c000
> -       .long   0x3e300000
> -       .long   0x3e4a8000
> -       .long   0x3e648000
> -       .long   0x3e7dc000
> -       .long   0x3e8b4000
> -       .long   0x3e974000
> -       .long   0x3ea30000
> -       .long   0x3eae8000
> -       .long   0x3eb9c000
> -       .long   0x3ec4e000
> -       .long   0x3ecfa000
> -       .long   0x3eda2000
> -       .long   0x3ee48000
> -       .long   0x3eeea000
> -       .long   0x3ef8a000
> -       .long   0x3f013000
> -       .long   0x3f05f000
> -       .long   0x3f0aa000
> -       .long   0x3f0f4000
> -       .long   0x3f13d000
> -       .long   0x3f184000
> -       .long   0x3f1ca000
> -       .long   0x3f20f000
> -       .long   0x3f252000
> -       .long   0x3f295000
> -       .long   0x3f2d7000
> -       /* Log_tbl_L */
> +       /* Leave this at front so we can potentially save space due to
> +          smaller alignment constraint.  */
> +       .align  4
> +    /* AbsMask */
> +       .long   0x7fffffff
> +       .align  64
> +__svml_satanh_data_internal_avx512_al64:
> +       /* One */
> +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> +       /* AddB5 */
> +       .align  64
> +       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000
> +       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000
> +       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000
> +       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000
> +       /* RcpBitMask */
> +       .align  64
> +       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> +       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> +       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> +       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> +       /* Log_tbl_L_lo */
>         .align  64
>         .long   0x00000000
>         .long   0x3726c39e
> @@ -338,6 +315,8 @@ __svml_satanh_data_internal_avx512:
>         .long   0x38dedfac
>         .long   0x38ebfb5e
>         .long   0xb8e63c9f
> +       /* Log_tbl_L_hi */
> +       .align  64
>         .long   0xb85c1340
>         .long   0x38777bcd
>         .long   0xb6038656
> @@ -354,39 +333,74 @@ __svml_satanh_data_internal_avx512:
>         .long   0x38f85db0
>         .long   0x37b4996f
>         .long   0xb8bfb3ca
> -       /* One */
> +       /* Log_tbl_H_lo */
>         .align  64
> -       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> -       /* AbsMask */
> +       .long   0x00000000
> +       .long   0x3cfc0000
> +       .long   0x3d780000
> +       .long   0x3db78000
> +       .long   0x3df10000
> +       .long   0x3e14c000
> +       .long   0x3e300000
> +       .long   0x3e4a8000
> +       .long   0x3e648000
> +       .long   0x3e7dc000
> +       .long   0x3e8b4000
> +       .long   0x3e974000
> +       .long   0x3ea30000
> +       .long   0x3eae8000
> +       .long   0x3eb9c000
> +       .long   0x3ec4e000
> +       /* Log_tbl_H_hi */
>         .align  64
> -       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> -       /* AddB5 */
> +       .long   0x3ecfa000
> +       .long   0x3eda2000
> +       .long   0x3ee48000
> +       .long   0x3eeea000
> +       .long   0x3ef8a000
> +       .long   0x3f013000
> +       .long   0x3f05f000
> +       .long   0x3f0aa000
> +       .long   0x3f0f4000
> +       .long   0x3f13d000
> +       .long   0x3f184000
> +       .long   0x3f1ca000
> +       .long   0x3f20f000
> +       .long   0x3f252000
> +       .long   0x3f295000
> +       .long   0x3f2d7000
> +       /* L2H = log(2)_high */
>         .align  64
> -       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000
> -       /* RcpBitMask */
> +       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> +       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> +       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> +       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> +       /* L2L = log(2)_low */
>         .align  64
> -       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> +       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> +       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> +       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> +       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
>         /* poly_coeff3 */
>         .align  64
> -       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> +       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> +       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> +       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> +       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
>         /* poly_coeff2 */
>         .align  64
> -       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> +       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> +       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> +       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> +       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
>         /* poly_coeff1 */
>         .align  64
> -       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> -       /* poly_coeff0 */
> -       .align  64
> -       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> -       /* Half */
> -       .align  64
> -       .long   0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
> -       /* L2H = log(2)_high */
> -       .align  64
> -       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> -       /* L2L = log(2)_low */
> -       .align  64
> -       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
>         .align  64
> +       .type   __svml_satanh_data_internal_avx512_al64, @object
> +       .size   __svml_satanh_data_internal_avx512_al64, .-__svml_satanh_data_internal_avx512_al64
>         .type   __svml_satanh_data_internal_avx512, @object
>         .size   __svml_satanh_data_internal_avx512, .-__svml_satanh_data_internal_avx512
> --
> 2.34.1
>

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v4 2/7] x86: Improve svml_s_atanhf8_core_avx2.S
  2022-06-09 18:16   ` [PATCH v4 2/7] x86: Improve svml_s_atanhf8_core_avx2.S Noah Goldstein
@ 2022-06-09 19:34     ` H.J. Lu
  0 siblings, 0 replies; 48+ messages in thread
From: H.J. Lu @ 2022-06-09 19:34 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Thu, Jun 9, 2022 at 11:16 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Improvements are:
>     1. Reduce code size (-60 bytes).
>     2. Remove redundant move instructions.
>     3. Slightly improve instruction selection/scheduling where
>        possible.
>     4. Prefer registers which get short instruction encoding.
>     5. Shrink rodata usage (-32 bytes).
>
> The throughput improvement is not that significant (3-5%) as the
> port 0 bottleneck is unavoidable.
>
>        Function, New Time, Old Time, New / Old
> _ZGVdN8v_atanhf,    2.799,    2.923,     0.958
> ---
>  .../fpu/multiarch/svml_s_atanhf8_core_avx2.S  | 405 +++++++++---------
>  1 file changed, 202 insertions(+), 203 deletions(-)
>
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
> index c1ea1c3353..43eb423831 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
> @@ -30,305 +30,304 @@
>   *
>   */
>
> -/* Offsets for data table __svml_satanh_data_internal
> - */
> +/* Offsets for data table __svml_satanh_data_internal. Ordered
> +   by use in the function. On cold-starts this might help the
> +   prefetcher. Possibly a better idea is to interleave start/end so
> +   that the prefetcher is less likely to detect a stream and pull
> +   irrelevant lines into cache.  */
>  #define SgnMask                                0
>  #define sOne                           32
> -#define sPoly                          64
> -#define iBrkValue                      320
> -#define iOffExpoMask                   352
> -#define sHalf                          384
> -#define sSign                          416
> -#define sTopMask12                     448
> -#define TinyRange                      480
> -#define sLn2                           512
> +#define sTopMask12                     64
> +#define TinyRange                      96
> +#define iBrkValue                      128
> +#define iOffExpoMask                   160
> +#define sPoly                          192
> +#define sLn2                           448
> +#define sHalf                          480
>
>  #include <sysdep.h>
> +#define ATANHF_DATA(x)                 ((x)+__svml_satanh_data_internal)
>
>         .section .text.avx2, "ax", @progbits
>  ENTRY(_ZGVdN8v_atanhf_avx2)
> -       pushq   %rbp
> -       cfi_def_cfa_offset(16)
> -       movq    %rsp, %rbp
> -       cfi_def_cfa(6, 16)
> -       cfi_offset(6, -16)
> -       andq    $-32, %rsp
> -       subq    $96, %rsp
> -
> +       /* Strip off the sign, so treat X as positive until right at the end */
> +       vmovaps ATANHF_DATA(SgnMask)(%rip), %ymm2
> +       vandps  %ymm2, %ymm0, %ymm3
>         /* Load constants including One = 1 */
> -       vmovups sOne+__svml_satanh_data_internal(%rip), %ymm5
> -       vmovups sTopMask12+__svml_satanh_data_internal(%rip), %ymm13
> -       vmovaps %ymm0, %ymm6
> +       vmovups ATANHF_DATA(sOne)(%rip), %ymm5
> +       vsubps  %ymm3, %ymm5, %ymm1
> +       vmovups ATANHF_DATA(sTopMask12)(%rip), %ymm4
>
> -       /* Strip off the sign, so treat X as positive until right at the end */
> -       vandps  SgnMask+__svml_satanh_data_internal(%rip), %ymm6, %ymm10
> -       vsubps  %ymm10, %ymm5, %ymm1
> +       vrcpps  %ymm1, %ymm7
> +       vsubps  %ymm1, %ymm5, %ymm9
> +       vandps  %ymm4, %ymm7, %ymm6
> +       vsubps  %ymm3, %ymm9, %ymm7
>
> -       /*
> -        * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
> -        * the upper part UHi being <= 12 bits long. Then we have
> -        * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
> -        */
> -       vaddps  %ymm10, %ymm10, %ymm14
> +       /* No need to split sU when FMA is available */
> +       vfnmadd213ps %ymm5, %ymm6, %ymm1
> +       vmovaps %ymm0, %ymm8
> +       vfmadd213ps %ymm0, %ymm0, %ymm0
> +       vfnmadd231ps %ymm6, %ymm7, %ymm1
>
>         /*
>          * Check whether |X| < 1, in which case we use the main function.
>          * Otherwise set the rangemask so that the callout will get used.
>          * Note that this will also use the callout for NaNs since not(NaN < 1).
>          */
> -       vcmpnlt_uqps %ymm5, %ymm10, %ymm7
> -       vsubps  %ymm1, %ymm5, %ymm9
> -       vcmplt_oqps TinyRange+__svml_satanh_data_internal(%rip), %ymm10, %ymm4
> -       vrcpps  %ymm1, %ymm11
> -       vsubps  %ymm10, %ymm9, %ymm12
> -       vandps  %ymm13, %ymm11, %ymm0
> +       vcmpnlt_uqps %ymm5, %ymm3, %ymm14
> +       vcmplt_oqps ATANHF_DATA(TinyRange)(%rip), %ymm3, %ymm15
>
> -       /* No need to split sU when FMA is available */
> -       vfnmadd213ps %ymm5, %ymm0, %ymm1
> -       vmovaps %ymm6, %ymm8
> -       vfmadd213ps %ymm6, %ymm6, %ymm8
> -       vfnmadd231ps %ymm0, %ymm12, %ymm1
> +       /*
> +        * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
> +        * the upper part UHi being <= 12 bits long. Then we have
> +        * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
> +        */
> +       vaddps  %ymm3, %ymm3, %ymm3
>
>         /*
>          * Split V as well into upper 12 bits and lower part, so that we can get
>          * a preliminary quotient estimate without rounding error.
>          */
> -       vandps  %ymm13, %ymm14, %ymm15
> -       vmovmskps %ymm7, %edx
> -       vsubps  %ymm15, %ymm14, %ymm7
> +       vandps  %ymm4, %ymm3, %ymm4
> +       vsubps  %ymm4, %ymm3, %ymm7
>
>         /* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
> -       vmulps  %ymm15, %ymm0, %ymm10
> +       vmulps  %ymm4, %ymm6, %ymm4
>
>         /* Compute D = E + E^2 */
>         vfmadd213ps %ymm1, %ymm1, %ymm1
>
> -       /* Record the sign for eventual reincorporation. */
> -       vandps  sSign+__svml_satanh_data_internal(%rip), %ymm6, %ymm3
> +       /* Record the sign for eventual reincorporation.  */
> +       vandnps %ymm8, %ymm2, %ymm3
>
>         /* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
> -       vorps   %ymm3, %ymm8, %ymm2
> -       vmulps  %ymm7, %ymm0, %ymm8
> +       vorps   %ymm3, %ymm0, %ymm13
> +       vmulps  %ymm7, %ymm6, %ymm2
>
>         /*
>          * Compute R * (VHi + VLo) * (1 + E + E^2)
>          * = R *  (VHi + VLo) * (1 + D)
>          * = QHi + (QHi * D + QLo + QLo * D)
>          */
> -       vmulps  %ymm1, %ymm10, %ymm9
> -       vfmadd213ps %ymm8, %ymm8, %ymm1
> -       vaddps  %ymm1, %ymm9, %ymm1
>
> -       /* reduction: compute r, n */
> -       vmovups iBrkValue+__svml_satanh_data_internal(%rip), %ymm9
> +       /*
> +        * If less precision is acceptable the `vmulps %ymm1, %ymm4, %ymm9;
> +        * vaddps %ymm1, %ymm9, %ymm1` can be replaced with
> +        * `vfmadd231ps %ymm1, %ymm4, %ymm4`.
> +        */
> +       vmulps  %ymm1, %ymm4, %ymm6
> +       vfmadd213ps %ymm2, %ymm2, %ymm1
> +       vaddps  %ymm1, %ymm6, %ymm1
>
>         /*
>          * Now finally accumulate the high and low parts of the
>          * argument to log1p, H + L, with a final compensated summation.
>          */
> -       vaddps  %ymm1, %ymm10, %ymm12
> -       vsubps  %ymm12, %ymm10, %ymm11
> +       vaddps  %ymm1, %ymm4, %ymm2
> +
> +       /* reduction: compute r, n */
> +       vmovups ATANHF_DATA(iBrkValue)(%rip), %ymm9
>
>         /*
>          * Now we feed into the log1p code, using H in place of _VARG1 and
>          * later incorporating L into the reduced argument.
>          * compute 1+x as high, low parts
>          */
> -       vmaxps  %ymm12, %ymm5, %ymm13
> -       vminps  %ymm12, %ymm5, %ymm14
> -       vaddps  %ymm11, %ymm1, %ymm0
> -       vaddps  %ymm14, %ymm13, %ymm1
> -       vpsubd  %ymm9, %ymm1, %ymm7
> -       vsubps  %ymm1, %ymm13, %ymm15
> -       vpsrad  $23, %ymm7, %ymm10
> -       vpand   iOffExpoMask+__svml_satanh_data_internal(%rip), %ymm7, %ymm8
> -       vaddps  %ymm15, %ymm14, %ymm13
> -       vpslld  $23, %ymm10, %ymm11
> -       vpaddd  %ymm9, %ymm8, %ymm15
> -       vaddps  %ymm13, %ymm0, %ymm14
> -       vcvtdq2ps %ymm10, %ymm0
> -       vpsubd  %ymm11, %ymm5, %ymm12
> +       vmaxps  %ymm2, %ymm5, %ymm0
> +       vminps  %ymm2, %ymm5, %ymm6
> +
> +       /* This is needed for rounding (see `vaddps %ymm1, %ymm4, %ymm2`).  */
> +       vsubps  %ymm2, %ymm4, %ymm2
> +       vaddps  %ymm6, %ymm0, %ymm4
> +       vpsubd  %ymm9, %ymm4, %ymm7
> +       vsubps  %ymm4, %ymm0, %ymm4
> +       vaddps  %ymm2, %ymm1, %ymm2
> +       vmovaps ATANHF_DATA(iOffExpoMask)(%rip), %ymm1
> +
> +       vandps  %ymm1, %ymm7, %ymm0
> +       vaddps  %ymm4, %ymm6, %ymm4
> +       vandnps %ymm7, %ymm1, %ymm6
> +       vmovups ATANHF_DATA(sPoly+0)(%rip), %ymm1
> +       vpaddd  %ymm9, %ymm0, %ymm0
> +       vaddps  %ymm4, %ymm2, %ymm4
> +       vpsubd  %ymm6, %ymm5, %ymm6
>
>         /* polynomial evaluation */
> -       vsubps  %ymm5, %ymm15, %ymm5
> -       vmulps  %ymm14, %ymm12, %ymm1
> -       vaddps  %ymm5, %ymm1, %ymm5
> -       vmovups sPoly+224+__svml_satanh_data_internal(%rip), %ymm1
> -       vfmadd213ps sPoly+192+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> -       vfmadd213ps sPoly+160+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> -       vfmadd213ps sPoly+128+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> -       vfmadd213ps sPoly+96+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> -       vfmadd213ps sPoly+64+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> -       vfmadd213ps sPoly+32+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> -       vfmadd213ps sPoly+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> -       vmulps  %ymm1, %ymm5, %ymm7
> -       vfmadd213ps %ymm5, %ymm5, %ymm7
> +       vsubps  %ymm5, %ymm0, %ymm2
> +       vfmadd231ps %ymm4, %ymm6, %ymm2
> +       vfmadd213ps ATANHF_DATA(sPoly+32)(%rip), %ymm2, %ymm1
> +       vfmadd213ps ATANHF_DATA(sPoly+64)(%rip), %ymm2, %ymm1
> +       vfmadd213ps ATANHF_DATA(sPoly+96)(%rip), %ymm2, %ymm1
> +       vfmadd213ps ATANHF_DATA(sPoly+128)(%rip), %ymm2, %ymm1
> +       vfmadd213ps ATANHF_DATA(sPoly+160)(%rip), %ymm2, %ymm1
> +       vfmadd213ps ATANHF_DATA(sPoly+192)(%rip), %ymm2, %ymm1
> +       vfmadd213ps ATANHF_DATA(sPoly+224)(%rip), %ymm2, %ymm1
> +
> +       vmulps  %ymm1, %ymm2, %ymm1
> +       vfmadd213ps %ymm2, %ymm2, %ymm1
>
>         /* final reconstruction */
> -       vfmadd132ps sLn2+__svml_satanh_data_internal(%rip), %ymm7, %ymm0
> +       vpsrad  $23, %ymm7, %ymm6
> +       vcvtdq2ps %ymm6, %ymm2
> +       vfmadd132ps ATANHF_DATA(sLn2)(%rip), %ymm1, %ymm2
>
>         /* Finally, halve the result and reincorporate the sign */
> -       vxorps  sHalf+__svml_satanh_data_internal(%rip), %ymm3, %ymm3
> -       vmulps  %ymm0, %ymm3, %ymm0
> -       vblendvps %ymm4, %ymm2, %ymm0, %ymm0
> +       vxorps  ATANHF_DATA(sHalf)(%rip), %ymm3, %ymm3
> +       vmulps  %ymm2, %ymm3, %ymm2
> +       vmovmskps %ymm14, %edx
>         testl   %edx, %edx
>
> +       vblendvps %ymm15, %ymm13, %ymm2, %ymm0
>         /* Go to special inputs processing branch */
>         jne     L(SPECIAL_VALUES_BRANCH)
> -       # LOE rbx r12 r13 r14 r15 edx ymm0 ymm6
> -
> -       /* Restore registers
> -        * and exit the function
> -        */
> -
> -L(EXIT):
> -       movq    %rbp, %rsp
> -       popq    %rbp
> -       cfi_def_cfa(7, 8)
> -       cfi_restore(6)
> +       # LOE rbx rdx r12 r13 r14 r15 ymm0
> +       /* No registers to restore on fast path.  */
>         ret
> -       cfi_def_cfa(6, 16)
> -       cfi_offset(6, -16)
>
> -       /* Branch to process
> -        * special inputs
> -        */
>
> +       /* Cold case. edx has 1s where there was a special value that
> +          needs to be handled by an atanhf call. Optimize for code size
> +          more so than speed here. */
>  L(SPECIAL_VALUES_BRANCH):
> -       vmovups %ymm6, 32(%rsp)
> -       vmovups %ymm0, 64(%rsp)
> -       # LOE rbx r12 r13 r14 r15 edx ymm0
> -
> -       xorl    %eax, %eax
> -       # LOE rbx r12 r13 r14 r15 eax edx
> -
> -       vzeroupper
> -       movq    %r12, 16(%rsp)
> -       /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
> -       movl    %eax, %r12d
> -       movq    %r13, 8(%rsp)
> -       /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
> -       movl    %edx, %r13d
> -       movq    %r14, (%rsp)
> -       /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
> -       # LOE rbx r15 r12d r13d
> -
> -       /* Range mask
> -        * bits check
> +       # LOE rbx rdx r12 r13 r14 r15 ymm0 ymm8
> +    /* Use r13 to save/restore the stack. This allows us to use rbp as
> +       callee save register saving code size. */
> +       pushq   %r13
> +       cfi_adjust_cfa_offset(8)
> +       cfi_offset(r13, -16)
> +       /* Need to callee save registers to preserve state across atanhf calls.
>          */
> +       pushq   %rbx
> +       cfi_adjust_cfa_offset(8)
> +       cfi_offset(rbx, -24)
> +       pushq   %rbp
> +       cfi_adjust_cfa_offset(8)
> +       cfi_offset(rbp, -32)
> +       movq    %rsp, %r13
> +       cfi_def_cfa_register(r13)
>
> -L(RANGEMASK_CHECK):
> -       btl     %r12d, %r13d
> +       /* Align stack and make room for 2x ymm vectors.  */
> +       andq    $-32, %rsp
> +       addq    $-64, %rsp
>
> -       /* Call scalar math function */
> -       jc      L(SCALAR_MATH_CALL)
> -       # LOE rbx r15 r12d r13d
> +       /* Save all already computed inputs.  */
> +       vmovups %ymm0, (%rsp)
> +       /* Save original input (ymm8 unchanged up to this point).  */
> +       vmovups %ymm8, 32(%rsp)
>
> -       /* Special inputs
> -        * processing loop
> -        */
> +       vzeroupper
>
> +       /* edx has 1s where there was a special value that needs to be handled
> +          by an atanhf call.  */
> +       movl    %edx, %ebx
>  L(SPECIAL_VALUES_LOOP):
> -       incl    %r12d
> -       cmpl    $8, %r12d
> -
> -       /* Check bits in range mask */
> -       jl      L(RANGEMASK_CHECK)
> -       # LOE rbx r15 r12d r13d
> -
> -       movq    16(%rsp), %r12
> -       cfi_restore(12)
> -       movq    8(%rsp), %r13
> -       cfi_restore(13)
> -       movq    (%rsp), %r14
> -       cfi_restore(14)
> -       vmovups 64(%rsp), %ymm0
> -
> -       /* Go to exit */
> -       jmp     L(EXIT)
> -       /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
> -       /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
> -       /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
> -       .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
> -       # LOE rbx r12 r13 r14 r15 ymm0
> -
> -       /* Scalar math fucntion call
> -        * to process special input
> -        */
> -
> -L(SCALAR_MATH_CALL):
> -       movl    %r12d, %r14d
> -       movss   32(%rsp, %r14, 4), %xmm0
> +       # LOE rbx rbp r12 r13 r14 r15
> +       /* Use rbp as index for special value that is saved across calls to
> +          atanhf. We technically don't need a callee save register here as offset
> +          to rsp is always [0, 28] so we can restore rsp by realigning to 32.
> +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> +          in the loop. Realigning also costs more code size.  */
> +       xorl    %ebp, %ebp
> +       tzcntl  %ebx, %ebp
> +
> +       /* Scalar math function call to process special input.  */
> +       movss   32(%rsp, %rbp, 4), %xmm0
>         call    atanhf@PLT
> -       # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 64(%rsp, %r14, 4)
> +       /* No good way to avoid the store-forwarding fault this will cause on
> +          return. `lfence` avoids the SF fault but at greater cost as it
> +          serializes stack/callee save restoration.  */
> +       movss   %xmm0, (%rsp, %rbp, 4)
> +
> +       blsrl   %ebx, %ebx
> +       jnz     L(SPECIAL_VALUES_LOOP)
> +       # LOE r12 r13 r14 r15
> +
>
> -       /* Process special inputs in loop */
> -       jmp     L(SPECIAL_VALUES_LOOP)
> -       # LOE rbx r15 r12d r13d
> +       /* All results have been written to (%rsp).  */
> +       vmovups (%rsp), %ymm0
> +       /* Restore rsp.  */
> +       movq    %r13, %rsp
> +       cfi_def_cfa_register(rsp)
> +       /* Restore callee save registers.  */
> +       popq    %rbp
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_restore(rbp)
> +       popq    %rbx
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_restore(rbp)
> +       popq    %r13
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_restore(r13)
> +       ret
>  END(_ZGVdN8v_atanhf_avx2)
>
>         .section .rodata, "a"
>         .align  32
> -
>  #ifdef __svml_satanh_data_internal_typedef
>  typedef unsigned int VUINT32;
> -typedef struct {
> +typedef struct{
>         __declspec(align(32)) VUINT32 SgnMask[8][1];
>         __declspec(align(32)) VUINT32 sOne[8][1];
> -       __declspec(align(32)) VUINT32 sPoly[8][8][1];
> -       __declspec(align(32)) VUINT32 iBrkValue[8][1];
> -       __declspec(align(32)) VUINT32 iOffExpoMask[8][1];
> -       __declspec(align(32)) VUINT32 sHalf[8][1];
> -       __declspec(align(32)) VUINT32 sSign[8][1];
>         __declspec(align(32)) VUINT32 sTopMask12[8][1];
>         __declspec(align(32)) VUINT32 TinyRange[8][1];
> +       __declspec(align(32)) VUINT32 iBrkValue[8][1];
> +       __declspec(align(32)) VUINT32 iOffExpoMask[8][1];
> +       __declspec(align(32)) VUINT32 sPoly[8][8][1];
>         __declspec(align(32)) VUINT32 sLn2[8][1];
> +       __declspec(align(32)) VUINT32 sHalf[8][1];
>  } __svml_satanh_data_internal;
>  #endif
>  __svml_satanh_data_internal:
>         /* SgnMask */
> -       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> +       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> +       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
>         /* sOne = SP 1.0 */
>         .align  32
> -       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> -       /* sPoly[] = SP polynomial */
> +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> +       /* sTopMask12 */
> +       .align  32
> +       .long   0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
> +       .long   0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
> +       /* TinyRange */
>         .align  32
> -       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
> -       .long   0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
> -       .long   0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
> -       .long   0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
> -       .long   0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
> -       .long   0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
> -       .long   0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
> -       .long   0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
> +       .long   0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
> +       .long   0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
>         /* iBrkValue = SP 2/3 */
>         .align  32
> -       .long   0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
> +       .long   0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
> +       .long   0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
>         /* iOffExpoMask = SP significand mask */
>         .align  32
> -       .long   0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
> -       /* sHalf */
> -       .align  32
> -       .long   0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
> -       /* sSign */
> -       .align  32
> -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
> -       /* sTopMask12 */
> -       .align  32
> -       .long   0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
> -       /* TinyRange */
> +       .long   0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
> +       .long   0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
> +       /* sPoly[] = SP polynomial */
>         .align  32
> -       .long   0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
> +       .long   0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed
> +       .long   0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
> +       .long   0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3
> +       .long   0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
> +       .long   0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12
> +       .long   0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
> +       .long   0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37
> +       .long   0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
> +       .long   0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190
> +       .long   0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
> +       .long   0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e
> +       .long   0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
> +       .long   0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94
> +       .long   0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
> +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
>         /* sLn2 = SP ln(2) */
>         .align  32
> -       .long   0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
> +       .long   0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
> +       .long   0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
> +       /* sHalf */
> +       .align  32
> +       .long   0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
> +       .long   0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
>         .align  32
>         .type   __svml_satanh_data_internal, @object
>         .size   __svml_satanh_data_internal, .-__svml_satanh_data_internal
> --
> 2.34.1
>

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 48+ messages in thread

end of thread, other threads:[~2022-06-09 19:35 UTC | newest]

Thread overview: 48+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-06-07 20:06 [PATCH v1 1/7] x86: Improve svml_s_atanhf16_core_avx512.S Noah Goldstein
2022-06-07 20:06 ` [PATCH v1 2/7] x86: Improvement svml_s_atanhf8_core_avx2.S Noah Goldstein
2022-06-07 20:06 ` [PATCH v1 3/7] x86: Improve svml_s_atanhf4_core_sse4.S Noah Goldstein
2022-06-07 20:06 ` [PATCH v1 4/7] x86: Optimize svml_s_tanhf16_core_avx512.S Noah Goldstein
2022-06-07 20:06 ` [PATCH v1 5/7] x86: Add data file that can be shared by tanhf-avx2 and tanhf-sse4 Noah Goldstein
2022-06-07 20:06 ` [PATCH v1 6/7] x86: Optimize svml_s_tanhf8_core_avx2.S Noah Goldstein
2022-06-07 20:06 ` [PATCH v1 7/7] x86: Optimize svml_s_tanhf4_core_sse4.S Noah Goldstein
2022-06-08  2:42   ` H.J. Lu
2022-06-08  3:07   ` H.J. Lu
2022-06-09  0:06     ` Noah Goldstein
2022-06-09  0:05 ` [PATCH v2 1/7] x86: Improve svml_s_atanhf16_core_avx512.S Noah Goldstein
2022-06-09  0:05   ` [PATCH v2 2/7] x86: Improvement svml_s_atanhf8_core_avx2.S Noah Goldstein
2022-06-09 16:01     ` H.J. Lu
2022-06-09 16:56       ` Noah Goldstein
2022-06-09  0:05   ` [PATCH v2 3/7] x86: Improve svml_s_atanhf4_core_sse4.S Noah Goldstein
2022-06-09 16:03     ` H.J. Lu
2022-06-09 16:56       ` Noah Goldstein
2022-06-09  0:05   ` [PATCH v2 4/7] x86: Optimize svml_s_tanhf16_core_avx512.S Noah Goldstein
2022-06-09 16:04     ` H.J. Lu
2022-06-09 16:57       ` Noah Goldstein
2022-06-09  0:05   ` [PATCH v2 5/7] x86: Add data file that can be shared by tanhf-avx2 and tanhf-sse4 Noah Goldstein
2022-06-09 16:05     ` H.J. Lu
2022-06-09  0:05   ` [PATCH v2 6/7] x86: Optimize svml_s_tanhf8_core_avx2.S Noah Goldstein
2022-06-09 16:10     ` H.J. Lu
2022-06-09 16:58       ` Noah Goldstein
2022-06-09  0:05   ` [PATCH v2 7/7] x86: Optimize svml_s_tanhf4_core_sse4.S Noah Goldstein
2022-06-09 15:59   ` [PATCH v2 1/7] x86: Improve svml_s_atanhf16_core_avx512.S H.J. Lu
2022-06-09 16:56     ` Noah Goldstein
2022-06-09 16:57       ` H.J. Lu
2022-06-09 16:58 ` [PATCH v3 " Noah Goldstein
2022-06-09 16:58   ` [PATCH v3 2/7] x86: Improvement svml_s_atanhf8_core_avx2.S Noah Goldstein
2022-06-09 17:05     ` H.J. Lu
2022-06-09 16:58   ` [PATCH v3 3/7] x86: Improve svml_s_atanhf4_core_sse4.S Noah Goldstein
2022-06-09 17:07     ` H.J. Lu
2022-06-09 16:58   ` [PATCH v3 4/7] x86: Optimize svml_s_tanhf16_core_avx512.S Noah Goldstein
2022-06-09 17:07     ` H.J. Lu
2022-06-09 16:58   ` [PATCH v3 5/7] x86: Add data file that can be shared by tanhf-avx2 and tanhf-sse4 Noah Goldstein
2022-06-09 17:11     ` H.J. Lu
2022-06-09 16:58   ` [PATCH v3 6/7] x86: Optimize svml_s_tanhf8_core_avx2.S Noah Goldstein
2022-06-09 17:09     ` H.J. Lu
2022-06-09 16:58   ` [PATCH v3 7/7] x86: Optimize svml_s_tanhf4_core_sse4.S Noah Goldstein
2022-06-09 17:10     ` H.J. Lu
2022-06-09 17:04   ` [PATCH v3 1/7] x86: Improve svml_s_atanhf16_core_avx512.S H.J. Lu
2022-06-09 18:16 ` [PATCH v4 " Noah Goldstein
2022-06-09 18:16   ` [PATCH v4 2/7] x86: Improve svml_s_atanhf8_core_avx2.S Noah Goldstein
2022-06-09 19:34     ` H.J. Lu
2022-06-09 18:16   ` [PATCH v4 6/7] x86: Optimize svml_s_tanhf8_core_avx2.S Noah Goldstein
2022-06-09 19:33   ` [PATCH v4 1/7] x86: Improve svml_s_atanhf16_core_avx512.S H.J. Lu

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).