[RFC][PATCH 1/2] aarch64: Add optimized ASIMD version of sinf
From: Ashwin Sekhar T K @ 2017-06-13 7:17 UTC
To: libc-alpha; +Cc: Ashwin Sekhar T K
This patch adds an optimized ASIMD version of sinf for AArch64.
The algorithm and code flow are based on the SSE sinf implementation
for x86_64 in sysdeps/x86_64/fpu/s_sinf.S.  A rough C model of the
fast paths is included after the ChangeLog below, for review.
* sysdeps/aarch64/fpu/multiarch/Makefile: New file.
* sysdeps/aarch64/fpu/multiarch/s_sinf.c: Likewise.
* sysdeps/aarch64/fpu/multiarch/s_sinf-asimd.S: Likewise.
---
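[Review notes, not part of the commit]

Below is a rough C model of the fast paths (steps 2-5 of the algorithm
description in s_sinf-asimd.S), intended only to make review easier.
The file and function names are invented; the coefficient bit patterns
are the DP_S/DP_C/DP_PIO4/DP_INVPIO4 tables from the patch; the
evaluation order differs from the assembly's parallel polynomial
scheme; and the tiny-argument (|x| < 2^-27) and large-argument
(|x| >= 9*Pi/4) paths are omitted.

/* sinf-model.c: review-only C model of the |x| < 9*Pi/4 paths of
   __sinf_asimd.  Illustrative, not the committed code.  */
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static double
from_bits (uint64_t u)
{
  double d;
  memcpy (&d, &u, sizeof d);
  return d;
}

static float
sinf_model (float x)
{
  /* Sin/cos polynomial coefficients for |t| < Pi/4 (DP_S, DP_C).  */
  static const uint64_t S[5] =
    { 0xbfc5555555551cd9ULL, 0x3f81111110c2688bULL, 0xbf2a019f8b4bd1f9ULL,
      0x3ec71d7264e6b5b4ULL, 0xbe5a947e1674b58aULL };
  static const uint64_t C[5] =
    { 0xbfdffffffffe98aeULL, 0x3fa55555545c50c7ULL, 0xbf56c16b348b6874ULL,
      0x3efa00eb9ac43cc0ULL, 0xbe923c97dd8844d7ULL };
  const double pio4 = from_bits (0x3fe921fb54442d18ULL);    /* Pi/4 */
  const double invpio4 = from_bits (0x3ff45f306dc9c883ULL); /* 4/Pi */

  double t = x;            /* reduced argument; t = x when |x| < Pi/4 */
  double s = 1.0;          /* s = sign(x) * (-1.0)^((n>>2)&1) */
  const uint64_t *K = S;   /* sin polynomial unless n&2 != 0 */
  double ax = fabs ((double) x);

  if (ax >= pio4)          /* steps 5.1/5.2: Pi/4 <= |x| < 9*Pi/4 */
    {
      unsigned int k = (unsigned int) (ax * invpio4); /* trunc(|x|/(Pi/4)) */
      unsigned int n = k + 1;
      t = ax - (double) (n & 0x0e) * pio4;            /* t = |x|-j*Pi/4 */
      if (n & 2)
        K = C;             /* this octant needs the cos(t) polynomial */
      if ((n >> 2) & 1)
        s = -s;
      if (x < 0.0f)
        s = -s;
    }

  /* r * (1.0 + t^2*(K0+t^2*(K1+t^2*(K2+t^2*(K3+t^2*K4))))),
     with r = s*t for the sin polynomial and r = s for cos.  */
  double y = t * t;
  double p = from_bits (K[4]);
  for (int i = 3; i >= 0; i--)
    p = p * y + from_bits (K[i]);
  double r = (K == C) ? s : s * t;
  return (float) (r + r * y * p);
}

int
main (void)
{
  /* Quick comparison against the system sinf over [-7, 7].  */
  for (float x = -7.0f; x <= 7.0f; x += 0.618f)
    printf ("x=% .3f model=% .7f sinf=% .7f\n", x, sinf_model (x), sinf (x));
  return 0;
}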
sysdeps/aarch64/fpu/multiarch/Makefile | 3 +
sysdeps/aarch64/fpu/multiarch/s_sinf-asimd.S | 382 +++++++++++++++++++++++++++
sysdeps/aarch64/fpu/multiarch/s_sinf.c | 31 +++
3 files changed, 416 insertions(+)
create mode 100644 sysdeps/aarch64/fpu/multiarch/Makefile
create mode 100644 sysdeps/aarch64/fpu/multiarch/s_sinf-asimd.S
create mode 100644 sysdeps/aarch64/fpu/multiarch/s_sinf.c
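Also for review, a minimal spot-check of the special cases documented
in the algorithm notes (NaN result and errno == EDOM for Inf, sign
preservation at +-0, tiny arguments returned almost unchanged).  This
is a hypothetical harness, not part of the patch:

#include <errno.h>
#include <math.h>
#include <stdio.h>

int
main (void)
{
  errno = 0;
  float r = sinf (INFINITY);          /* expect NaN, errno set to EDOM */
  printf ("sinf(Inf)=%f errno==EDOM:%d\n", r, errno == EDOM);
  printf ("sinf(-0.0f)=%f\n", sinf (-0.0f));        /* expect -0.0 */
  printf ("sinf(0x1p-30f)=%a\n", sinf (0x1p-30f));  /* expect ~0x1p-30 */
  return 0;
}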
diff --git a/sysdeps/aarch64/fpu/multiarch/Makefile b/sysdeps/aarch64/fpu/multiarch/Makefile
new file mode 100644
index 0000000000..2092e9a885
--- /dev/null
+++ b/sysdeps/aarch64/fpu/multiarch/Makefile
@@ -0,0 +1,3 @@
+ifeq ($(subdir),math)
+libm-sysdep_routines += s_sinf-asimd
+endif
diff --git a/sysdeps/aarch64/fpu/multiarch/s_sinf-asimd.S b/sysdeps/aarch64/fpu/multiarch/s_sinf-asimd.S
new file mode 100644
index 0000000000..83f26c0c33
--- /dev/null
+++ b/sysdeps/aarch64/fpu/multiarch/s_sinf-asimd.S
@@ -0,0 +1,382 @@
+/* Optimized ASIMD version of sinf
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#define __need_Emath
+#include <bits/errno.h>
+
+/* Short algorithm description:
+ *
+ * 1) if |x| == 0: return x.
+ * 2) if |x| < 2^-27: return x-x*DP_SMALL, raise underflow only when needed.
+ * 3) if |x| < 2^-5 : return x+x^3*DP_SIN2_0+x^5*DP_SIN2_1.
+ * 4) if |x| < Pi/4: return x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))).
+ * 5) if |x| < 9*Pi/4:
+ * 5.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+1,
+ * t=|x|-j*Pi/4.
+ * 5.2) Reconstruction:
+ * s = sign(x) * (-1.0)^((n>>2)&1)
+ * if(n&2 != 0) {
+ * using cos(t) polynomial for |t|<Pi/4, result is
+ * s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))).
+ * } else {
+ * using sin(t) polynomial for |t|<Pi/4, result is
+ * s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))).
+ * }
+ * 6) if |x| < 2^23, large args:
+ * 6.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1,
+ * t=|x|-j*Pi/4.
+ * 6.2) Reconstruction same as (5.2).
+ * 7) if |x| >= 2^23, very large args:
+ * 7.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1,
+ * t=|x|-j*Pi/4.
+ * 7.2) Reconstruction same as (5.2).
+ * 8) if x is Inf, return x-x, and set errno=EDOM.
+ * 9) if x is NaN, return x-x.
+ *
+ * Special cases:
+ * sin(+-0) = +-0 not raising inexact/underflow,
+ * sin(subnormal) raises inexact/underflow,
+ * sin(min_normalized) raises inexact/underflow,
+ * sin(normalized) raises inexact,
+ * sin(Inf) = NaN, raises invalid, sets errno to EDOM,
+ * sin(NaN) = NaN.
+ */
+
+sp_x .req s0 /* SP x */
+dp_x .req d1 /* DP x */
+sp_abs_x .req s2 /* SP |x| */
+dp_abs_x .req d3 /* DP |x| */
+t_val .req d4 /* DP t */
+dp_pio4 .req d5 /* DP Pi/4 */
+dp_zero .req d6 /* DP 0.0 */
+dp_one .req d7 /* DP 1.0 */
+
+bits_abs_x .req w0 /* Bits of SP |x| */
+sign_x .req w1 /* Sign bit of x */
+n_val .req w2 /* n */
+bits_x .req w3 /* Bits of SP x */
+
+ENTRY_ALIGN(__sinf_asimd, 6)
+ /* Input: single precision x in s0 */
+ ldr w9, L(SP_PIO4) /* Pi/4 */
+ fmov bits_x, sp_x /* Bits of x */
+ ubfx bits_abs_x, bits_x, #0, #31 /* Bits of |x| */
+ cmp bits_abs_x, w9 /* |x|<Pi/4? */
+ blt L(arg_less_pio4)
+
+ /* Here if |x|>=Pi/4 */
+ ldr w10, L(SP_9PIO4) /* 9*Pi/4 */
+ fmov sp_abs_x, bits_abs_x /* SP |x| */
+ cmp bits_abs_x, w10 /* |x|>9*Pi/4? */
+ bge L(large_args)
+
+ /* Here if Pi/4<=|x|<9*Pi/4 */
+ ldr s16, L(SP_INVPIO4) /* SP 1/(Pi/4) */
+ fcvt dp_abs_x, sp_abs_x /* DP |x| */
+ ldr dp_pio4, L(DP_PIO4)
+ fmul s16, sp_abs_x, s16 /* SP |x|/(Pi/4) */
+ fcvtzu w12, s16 /* k=trunc(|x|/(Pi/4)) */
+ add n_val, w12, #1 /* n=k+1 */
+ and w12, n_val, #0x0e /* j=n&0x0e */
+ ucvtf d16, w12 /* DP j */
+ fmsub t_val, d16, dp_pio4, dp_abs_x /* t=|x|-j*Pi/4 */
+
+ .p2align 3
+L(reconstruction):
+ /* Input: w2=n, d4=t, w3=bits_x */
+ tst n_val, #2 /* n&2? */
+ fmov dp_one, #1.0 /* DP 1.0 */
+ adr x14, L(DP_C) /* Cos Poly Coefficients */
+ adr x15, L(DP_S) /* Sin Poly Coefficients */
+ fcsel d17, t_val, dp_one, EQ /* q=t or 1.0 */
+ lsr sign_x, bits_x, #31 /* Sign bit of x */
+ eor w9, sign_x, n_val, LSR #2 /* (n>>2) XOR sign(x) */
+ lsl x9, x9, #63 /* sign_sin */
+ fmov d20, x9 /* sign_sin */
+ csel x14, x15, x14, EQ /* K=Sin or Cos Coefficients */
+ eor v17.8b, v17.8b, v20.8b /* r=sign_sin XOR (1.0 or t) */
+
+ .p2align 3
+L(sin_cos_poly):
+ /*
+ * Here if sin(x) is evaluated via the sin(t)/cos(t) polynomial for |t|<Pi/4:
+ * y = t*t; z = y*y;
+ * s = sign(x) * (-1.0)^((n>>2)&1)
+ * result = r * (1.0+t^2*(K0+t^2*(K1+t^2*(K2+t^2*(K3+t^2*K4)))))
+ * where r=s*t, Kx=Sx (Sin Polynomial Coefficients) if n&2==0
+ * r=s, Kx=Cx (Cos Polynomial Coefficients) otherwise
+ */
+ fmul d18, t_val, t_val /* y=t^2 */
+ fmul d19, d18, d18 /* z=t^4 */
+ ldr d21, [x14, #0*8] /* K0 */
+ ldp d22, d23, [x14, #1*8] /* K1,K2 */
+ ldp d24, d25, [x14, #3*8] /* K3,K4 */
+ fmadd d23, d25, d19, d23 /* K2+z*K4 */
+ fmadd d22, d24, d19, d22 /* K1+z*K3 */
+ fmadd d21, d23, d19, d21 /* K0+z*(K2+z*K4) */
+ fmul d22, d22, d19 /* z*(K1+z*K3) */
+ /* y*(K0+y*(K1+y*(K2+y*(K3+y*K4)))) */
+ fmadd d22, d21, d18, d22
+ fmadd d22, d22, d17, d17 /* DP result */
+ fcvt s0, d22 /* SP result */
+ ret
+
+ .p2align 3
+L(large_args):
+ /* Here if |x|>=9*Pi/4 */
+ mov w8, #0x7f8 /* InfNan>>20 */
+ cmp bits_abs_x, w8, LSL #20 /* x is Inf or NaN? */
+ bge L(arg_inf_or_nan)
+
+ /* Here if finite |x|>=9*Pi/4 */
+ fcvt dp_abs_x, sp_abs_x /* DP |x| */
+ fmov dp_one, #1.0 /* DP 1.0 */
+ mov w11, #0x4b0 /* 2^23>>20 */
+ cmp bits_abs_x, w11, LSL #20 /* |x|>=2^23? */
+ bge L(very_large_args)
+
+ /* Here if 9*Pi/4<=|x|<2^23 */
+ adr x14, L(DP_PIO4HILO)
+ ldr d16, L(DP_INVPIO4) /* 1/(Pi/4) */
+ ldp d17, d18, [x14] /* -PIO4HI,-PIO4LO */
+ fmadd d16, dp_abs_x, d16, dp_one /* |x|/(Pi/4)+1.0 */
+ fcvtzu n_val, d16 /* n=trunc(|x|/(Pi/4)+1.0) */
+ and w10, n_val, #0xfffffffe /* j=n&0xfffffffe */
+ ucvtf d16, w10 /* DP j */
+ fmadd d17, d16, d17, dp_abs_x /* |x|-j*PIO4HI */
+ fmadd t_val, d16, d18, d17 /* t=|x|-j*PIO4HI-j*PIO4LO */
+ b L(reconstruction)
+
+L(very_large_args):
+ /* Here if finite |x|>=2^23 */
+ movz x11, #0x4330, LSL #48 /* 2^52 */
+ fmov d21, x11 /* DP 2^52 */
+ ldr dp_pio4, L(DP_PIO4) /* Pi/4 */
+ fmov dp_zero, xzr /* 0.0 */
+ adr x14, L(_FPI)
+ lsr w8, bits_abs_x, #23 /* eb = biased exponent of x */
+ add w8, w8, #-0x7f+59 /* bitpos=eb-BIAS_32+59 */
+ mov w9, #28 /* bits per FPI chunk */
+ udiv w10, w8, w9 /* j=bitpos/28 */
+ mov x11, #0xffffffff00000000 /* DP_HI_MASK */
+ add x14, x14, x10, LSL #3
+ ldr d16, [x14, #-2*8] /* FPI[j-2] */
+ ldr d17, [x14, #-1*8] /* FPI[j-1] */
+ ldr q18, [x14] /* FPI[j+1]|FPI[j] */
+ mul w10, w10, w9 /* j*28 */
+ add w10, w10, #19 /* j*28+19 */
+ fmov d20, x11 /* DP_HI_MASK */
+ fmul d16, dp_abs_x, d16 /* tmp3 */
+ fmul d17, dp_abs_x, d17 /* tmp2 */
+ fmul v18.2d, v18.2d, v3.d[0] /* tmp1|tmp0 */
+ cmp w8, w10 /* bitpos>=j*28+19? */
+ and v20.8b, v16.8b, v20.8b /* HI(tmp3) */
+ fcsel d20, dp_zero, d20, LT /* d=HI(tmp3) OR 0.0 */
+ fsub d16, d16, d20 /* tmp3=tmp3-d */
+ fadd d22, d16, d17 /* tmp5=tmp3+tmp2 */
+ fadd d20, d22, d21 /* tmp6=tmp5+2^52 */
+ fsub d21, d20, d21 /* tmp4=tmp6-2^52 */
+ faddp d18, v18.2d /* tmp7=tmp0+tmp1 */
+ fmov w10, s20 /* k=I64_LO(tmp6) */
+ fcmp d21, d22 /* tmp4>tmp5? */
+ cset w9, GT /* c=1 or 0 */
+ fcsel d22, dp_one, dp_zero, GT /* d=1.0 or 0.0 */
+ sub w10, w10, w9 /* k-=c */
+ fsub d21, d21, d22 /* tmp4-=d */
+ and w11, w10, #1 /* k&1 */
+ ucvtf d20, w11 /* DP k&1 */
+ fsub d16, d16, d21 /* tmp3-=tmp4 */
+ fmsub d20, d20, dp_one, d16 /* t=-1.0*[k&1]+tmp3 */
+ fadd d20, d20, d17 /* t+=tmp2 */
+ add n_val, w10, #1 /* n=k+1 */
+ fadd d20, d20, d18 /* t+=tmp7 */
+ fmul t_val, d20, dp_pio4 /* t*=PI/4 */
+ b L(reconstruction)
+
+ .p2align 3
+L(arg_less_pio4):
+ /* Here if |x|<Pi/4 */
+ fcvt dp_x, sp_x /* DP x */
+ mov w10, #0x3d0 /* 2^-5>>20 */
+ cmp bits_abs_x, w10, LSL #20 /* |x|<2^-5? */
+ blt L(arg_less_2pn5)
+
+ /* Here if 2^-5<=|x|<Pi/4 */
+ fmov t_val, dp_x /* DP t=x */
+ adr x14, L(DP_S) /* Kx=Sx */
+ fmov d17, dp_x /* r=x */
+ b L(sin_cos_poly)
+
+L(arg_less_2pn5):
+ /* Here if |x|<2^-5 */
+ mov w11, #0x320 /* 2^-27>>20 */
+ cmp bits_abs_x, w11, LSL #20 /* |x|<2^-27? */
+ blt L(arg_less_2pn27)
+
+ /* Here if 2^-27<=|x|<2^-5 */
+ adr x14, L(DP_SIN2)
+ ldp d16, d17, [x14] /* DP SIN2_0,SIN2_1 */
+ fmul d18, dp_x, dp_x /* y=x^2 */
+ fmadd d16, d17, d18, d16 /* DP SIN2_0+x^2*SIN2_1 */
+ fmul d16, d16, d18 /* DP x^2*SIN2_0+x^4*SIN2_1 */
+ fmadd d16, d16, dp_x, dp_x /* DP result */
+ fcvt s0, d16 /* SP result */
+ ret
+
+L(arg_less_2pn27):
+ /* Here if |x|<2^-27 */
+ cbnz bits_abs_x, L(arg_not_zero)
+ /* Here if |x|==0 */
+ ret
+
+L(arg_not_zero):
+ /* Here if 0<|x|<2^-27 */
+ /*
+ * Special cases here:
+ * sin(subnormal) raises inexact/underflow
+ * sin(min_normalized) raises inexact/underflow
+ * sin(normalized) raises inexact
+ */
+ movz x8, #0x3cd0, LSL #48 /* DP_SMALL=2^-50 */
+ fmov d16, x8 /* DP_SMALL */
+ fmsub dp_x, dp_x, d16, dp_x /* Result is x-x*DP_SMALL */
+ fcvt s0, dp_x
+ ret
+
+ .p2align 3
+L(arg_inf_or_nan):
+ /* Here if |x| is Inf or NaN */
+ bne L(skip_errno_setting) /* branch if x is NaN */
+
+ /* Here if x is Inf. Set errno to EDOM. */
+ adrp x14, :gottprel:errno
+ ldr PTR_REG(14), [x14, #:gottprel_lo12:errno]
+ mrs x15, tpidr_el0
+ mov w8, #EDOM /* EDOM */
+ str w8, [x15, x14] /* Store EDOM in errno */
+
+L(skip_errno_setting):
+ /* Here if |x| is Inf or NaN. Continued. */
+ fsub s0, sp_x, sp_x /* Result is NaN */
+ ret
+
+END(__sinf_asimd)
+
+ .section .rodata, "a"
+ .p2align 3
+L(_FPI): /* 4/Pi broken into sum of positive DP values */
+ .long 0x00000000,0x00000000
+ .long 0x6c000000,0x3ff45f30
+ .long 0x2a000000,0x3e3c9c88
+ .long 0xa8000000,0x3c54fe13
+ .long 0xd0000000,0x3aaf47d4
+ .long 0x6c000000,0x38fbb81b
+ .long 0xe0000000,0x3714acc9
+ .long 0x7c000000,0x3560e410
+ .long 0x56000000,0x33bca2c7
+ .long 0xac000000,0x31fbd778
+ .long 0xe0000000,0x300b7246
+ .long 0xe8000000,0x2e5d2126
+ .long 0x48000000,0x2c970032
+ .long 0xe8000000,0x2ad77504
+ .long 0xe0000000,0x290921cf
+ .long 0xb0000000,0x274deb1c
+ .long 0xe0000000,0x25829a73
+ .long 0xbe000000,0x23fd1046
+ .long 0x10000000,0x2224baed
+ .long 0x8e000000,0x20709d33
+ .long 0x80000000,0x1e535a2f
+ .long 0x64000000,0x1cef904e
+ .long 0x30000000,0x1b0d6398
+ .long 0x24000000,0x1964ce7d
+ .long 0x16000000,0x17b908bf
+ .type L(_FPI), @object
+ ASM_SIZE_DIRECTIVE(L(_FPI))
+
+/* Coefficients of polynomial
+ for sin(x)~=x+x^3*DP_SIN2_0+x^5*DP_SIN2_1, |x|<2^-5. */
+ .p2align 3
+L(DP_SIN2):
+ .long 0x5543d49d,0xbfc55555 /* DP_SIN2_0 */
+ .long 0x75cec8c5,0x3f8110f4 /* DP_SIN2_1 */
+ .type L(DP_SIN2), @object
+ ASM_SIZE_DIRECTIVE(L(DP_SIN2))
+
+/* Coefficients of polynomial
+ for sin(t)~=t+t^3*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))), |t|<Pi/4. */
+ .p2align 3
+L(DP_S):
+ .long 0x55551cd9,0xbfc55555 /* S0 */
+ .long 0x10c2688b,0x3f811111 /* S1 */
+ .long 0x8b4bd1f9,0xbf2a019f /* S2 */
+ .long 0x64e6b5b4,0x3ec71d72 /* S3 */
+ .long 0x1674b58a,0xbe5a947e /* S4 */
+ .type L(DP_S), @object
+ ASM_SIZE_DIRECTIVE(L(DP_S))
+
+ .p2align 3
+/* Coefficients of polynomial
+ for cos(t)~=1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))), |t|<Pi/4. */
+L(DP_C):
+ .long 0xfffe98ae,0xbfdfffff /* C0 */
+ .long 0x545c50c7,0x3fa55555 /* C1 */
+ .long 0x348b6874,0xbf56c16b /* C2 */
+ .long 0x9ac43cc0,0x3efa00eb /* C3 */
+ .long 0xdd8844d7,0xbe923c97 /* C4 */
+ .type L(DP_C), @object
+ ASM_SIZE_DIRECTIVE(L(DP_C))
+
+ .p2align 3
+L(DP_PIO4):
+ .long 0x54442d18,0x3fe921fb /* Pi/4 */
+ .type L(DP_PIO4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_PIO4))
+
+ .p2align 3
+L(DP_INVPIO4):
+ .long 0x6dc9c883,0x3ff45f30 /* 4/Pi */
+ .type L(DP_INVPIO4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_INVPIO4))
+
+ .p2align 3
+L(DP_PIO4HILO):
+ .long 0x54000000,0xbfe921fb /* -(high part of Pi/4) */
+ .long 0x11a62633,0xbe010b46 /* -(low part of Pi/4) */
+ .type L(DP_PIO4HILO), @object
+ ASM_SIZE_DIRECTIVE(L(DP_PIO4HILO))
+
+ .p2align 2
+L(SP_PIO4):
+ .long 0x3f490fdb /* Pi/4 */
+ .type L(SP_PIO4), @object
+ ASM_SIZE_DIRECTIVE(L(SP_PIO4))
+
+ .p2align 2
+L(SP_9PIO4):
+ .long 0x40e231d6 /* 9*Pi/4 */
+ .type L(SP_9PIO4), @object
+ ASM_SIZE_DIRECTIVE(L(SP_9PIO4))
+
+ .p2align 2
+L(SP_INVPIO4):
+ .long 0x3fa2f983 /* 4/Pi */
+ .type L(SP_INVPIO4), @object
+ ASM_SIZE_DIRECTIVE(L(SP_INVPIO4))
+
+weak_alias (__sinf, sinf)
diff --git a/sysdeps/aarch64/fpu/multiarch/s_sinf.c b/sysdeps/aarch64/fpu/multiarch/s_sinf.c
new file mode 100644
index 0000000000..78fdc9e5d0
--- /dev/null
+++ b/sysdeps/aarch64/fpu/multiarch/s_sinf.c
@@ -0,0 +1,31 @@
+/* Multiple versions of sinf. AARCH64 Version.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <init-arch.h>
+#include <sys/auxv.h>
+
+extern float __sinf_asimd (float);
+extern float __sinf_aarch64 (float);
+float __sinf (float);
+
+libm_ifunc (__sinf,
+ (GLRO (dl_hwcap) & HWCAP_ASIMD) ? __sinf_asimd : __sinf_aarch64);
+weak_alias (__sinf, sinf);
+
+#define SINF __sinf_aarch64
+#include <sysdeps/ieee754/flt-32/s_sinf.c>
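For completeness: the libm_ifunc above resolves sinf once, at (or
before) first call, from the ASIMD hwcap bit.  A stand-alone sketch of
the equivalent selection logic follows; the stub functions and the
fallback HWCAP_ASIMD define are assumptions for illustration only:

#include <stdio.h>
#include <sys/auxv.h>
#ifndef HWCAP_ASIMD
# define HWCAP_ASIMD (1 << 1)   /* from Linux asm/hwcap.h (AArch64) */
#endif

/* Stubs standing in for the real implementations.  */
static float sinf_asimd_stub (float x)   { return x; }
static float sinf_aarch64_stub (float x) { return x; }

int
main (void)
{
  /* What the ifunc resolver boils down to: pick once from hwcaps.  */
  float (*impl) (float) = (getauxval (AT_HWCAP) & HWCAP_ASIMD)
                          ? sinf_asimd_stub : sinf_aarch64_stub;
  printf ("ASIMD variant selected: %d\n", impl == sinf_asimd_stub);
  return 0;
}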
--
2.12.2