* [PATCH 1/6] aarch64: Add vector implementations of asin routines
@ 2023-11-03 12:12 Joe Ramsay
2023-11-03 12:12 ` [PATCH 2/6] aarch64: Add vector implementations of acos routines Joe Ramsay
` (5 more replies)
0 siblings, 6 replies; 14+ messages in thread
From: Joe Ramsay @ 2023-11-03 12:12 UTC (permalink / raw)
To: libc-alpha; +Cc: Joe Ramsay
---
Thanks,
Joe
sysdeps/aarch64/fpu/Makefile | 3 +-
sysdeps/aarch64/fpu/Versions | 4 +
sysdeps/aarch64/fpu/asin_advsimd.c | 113 ++++++++++++++++++
sysdeps/aarch64/fpu/asin_sve.c | 86 +++++++++++++
sysdeps/aarch64/fpu/asinf_advsimd.c | 104 ++++++++++++++++
sysdeps/aarch64/fpu/asinf_sve.c | 78 ++++++++++++
sysdeps/aarch64/fpu/bits/math-vector.h | 4 +
.../fpu/test-double-advsimd-wrappers.c | 1 +
.../aarch64/fpu/test-double-sve-wrappers.c | 1 +
.../aarch64/fpu/test-float-advsimd-wrappers.c | 1 +
sysdeps/aarch64/fpu/test-float-sve-wrappers.c | 1 +
sysdeps/aarch64/libm-test-ulps | 8 ++
.../unix/sysv/linux/aarch64/libmvec.abilist | 4 +
13 files changed, 407 insertions(+), 1 deletion(-)
create mode 100644 sysdeps/aarch64/fpu/asin_advsimd.c
create mode 100644 sysdeps/aarch64/fpu/asin_sve.c
create mode 100644 sysdeps/aarch64/fpu/asinf_advsimd.c
create mode 100644 sysdeps/aarch64/fpu/asinf_sve.c
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index 1f1ac2a2b8..d7c0bd2ed5 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -1,4 +1,5 @@
-libmvec-supported-funcs = cos \
+libmvec-supported-funcs = asin \
+ cos \
exp \
exp10 \
exp2 \
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index eb5ad50017..0f365a1e2e 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -18,6 +18,10 @@ libmvec {
_ZGVsMxv_sinf;
}
GLIBC_2.39 {
+ _ZGVnN4v_asinf;
+ _ZGVnN2v_asin;
+ _ZGVsMxv_asinf;
+ _ZGVsMxv_asin;
_ZGVnN4v_exp10f;
_ZGVnN2v_exp10;
_ZGVsMxv_exp10f;
diff --git a/sysdeps/aarch64/fpu/asin_advsimd.c b/sysdeps/aarch64/fpu/asin_advsimd.c
new file mode 100644
index 0000000000..d2adbc0d87
--- /dev/null
+++ b/sysdeps/aarch64/fpu/asin_advsimd.c
@@ -0,0 +1,113 @@
+/* Double-precision AdvSIMD inverse sin
+
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "poly_advsimd_f64.h"
+
+static const struct data
+{
+ float64x2_t poly[12];
+ float64x2_t pi_over_2;
+ uint64x2_t abs_mask;
+} data = {
+ /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x))
+ on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */
+ .poly = { V2 (0x1.555555555554ep-3), V2 (0x1.3333333337233p-4),
+ V2 (0x1.6db6db67f6d9fp-5), V2 (0x1.f1c71fbd29fbbp-6),
+ V2 (0x1.6e8b264d467d6p-6), V2 (0x1.1c5997c357e9dp-6),
+ V2 (0x1.c86a22cd9389dp-7), V2 (0x1.856073c22ebbep-7),
+ V2 (0x1.fd1151acb6bedp-8), V2 (0x1.087182f799c1dp-6),
+ V2 (-0x1.6602748120927p-7), V2 (0x1.cfa0dd1f9478p-6), },
+ .pi_over_2 = V2 (0x1.921fb54442d18p+0),
+ .abs_mask = V2 (0x7fffffffffffffff),
+};
+
+#define AllMask v_u64 (0xffffffffffffffff)
+#define One (0x3ff0000000000000)
+#define Small (0x3e50000000000000) /* 2^-26. */
+
+#if WANT_SIMD_EXCEPT
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+{
+ return v_call_f64 (asin, x, y, special);
+}
+#endif
+
+/* Double-precision implementation of vector asin(x).
+
+ For |x| < Small, approximate asin(x) by x. Small = 2^-26 for correct
+ rounding. If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the
+ following approximation.
+
+ For |x| in [Small, 0.5], use an order 11 polynomial P such that the final
+ approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2).
+
+ The largest observed error in this region is 1.01 ulps,
+ _ZGVnN2v_asin (0x1.da9735b5a9277p-2) got 0x1.ed78525a927efp-2
+ want 0x1.ed78525a927eep-2.
+
+ For |x| in [0.5, 1.0], use same approximation with a change of variable
+
+ asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z).
+
+ The largest observed error in this region is 2.69 ulps,
+ _ZGVnN2v_asin (0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1
+ want 0x1.110d7e85fdd53p-1. */
+float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t ax = vabsq_f64 (x);
+
+#if WANT_SIMD_EXCEPT
+ /* Special values need to be computed with scalar fallbacks so
+ that appropriate exceptions are raised. */
+ uint64x2_t special
+ = vcgtq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (ax), v_u64 (Small)),
+ v_u64 (One - Small));
+ if (__glibc_unlikely (v_any_u64 (special)))
+ return special_case (x, x, AllMask);
+#endif
+
+ uint64x2_t a_lt_half = vcltq_f64 (ax, v_f64 (0.5));
+
+ /* Evaluate polynomial Q(x) = y + y * z * P(z) with
+ z = x ^ 2 and y = |x| , if |x| < 0.5
+ z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */
+ float64x2_t z2 = vbslq_f64 (a_lt_half, vmulq_f64 (x, x),
+ vfmsq_n_f64 (v_f64 (0.5), ax, 0.5));
+ float64x2_t z = vbslq_f64 (a_lt_half, ax, vsqrtq_f64 (z2));
+
+ /* Use a single polynomial approximation P for both intervals. */
+ float64x2_t z4 = vmulq_f64 (z2, z2);
+ float64x2_t z8 = vmulq_f64 (z4, z4);
+ float64x2_t z16 = vmulq_f64 (z8, z8);
+ float64x2_t p = v_estrin_11_f64 (z2, z4, z8, z16, d->poly);
+
+ /* Finalize polynomial: z + z * z2 * P(z2). */
+ p = vfmaq_f64 (z, vmulq_f64 (z, z2), p);
+
+ /* asin(|x|) = Q(|x|) , for |x| < 0.5
+ = pi/2 - 2 Q(|x|), for |x| >= 0.5. */
+ float64x2_t y = vbslq_f64 (a_lt_half, p, vfmsq_n_f64 (d->pi_over_2, p, 2.0));
+
+ /* Copy sign. */
+ return vbslq_f64 (d->abs_mask, y, x);
+}
diff --git a/sysdeps/aarch64/fpu/asin_sve.c b/sysdeps/aarch64/fpu/asin_sve.c
new file mode 100644
index 0000000000..fa04d7fca6
--- /dev/null
+++ b/sysdeps/aarch64/fpu/asin_sve.c
@@ -0,0 +1,86 @@
+/* Double-precision SVE inverse sin
+
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+
+static const struct data
+{
+ float64_t poly[12];
+ float64_t pi_over_2f;
+} data = {
+ /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x))
+ on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */
+ .poly = { 0x1.555555555554ep-3, 0x1.3333333337233p-4,
+ 0x1.6db6db67f6d9fp-5, 0x1.f1c71fbd29fbbp-6,
+ 0x1.6e8b264d467d6p-6, 0x1.1c5997c357e9dp-6,
+ 0x1.c86a22cd9389dp-7, 0x1.856073c22ebbep-7,
+ 0x1.fd1151acb6bedp-8, 0x1.087182f799c1dp-6,
+ -0x1.6602748120927p-7, 0x1.cfa0dd1f9478p-6, },
+ .pi_over_2f = 0x1.921fb54442d18p+0,
+};
+
+#define P(i) sv_f64 (d->poly[i])
+
+/* Double-precision SVE implementation of vector asin(x).
+
+ For |x| in [0, 0.5], use an order 11 polynomial P such that the final
+ approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2).
+
+ The largest observed error in this region is 0.52 ulps,
+ _ZGVsMxv_asin(0x1.d95ae04998b6cp-2) got 0x1.ec13757305f27p-2
+ want 0x1.ec13757305f26p-2.
+
+ For |x| in [0.5, 1.0], use same approximation with a change of variable
+
+ asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z).
+
+ The largest observed error in this region is 2.69 ulps,
+ _ZGVsMxv_asin(0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1
+ want 0x1.110d7e85fdd53p-1. */
+svfloat64_t SV_NAME_D1 (asin) (svfloat64_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svuint64_t sign = svand_x (pg, svreinterpret_u64 (x), 0x8000000000000000);
+ svfloat64_t ax = svabs_x (pg, x);
+ svbool_t a_ge_half = svacge (pg, x, 0.5);
+
+ /* Evaluate polynomial Q(x) = y + y * z * P(z) with
+ z = x ^ 2 and y = |x| , if |x| < 0.5
+ z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */
+ svfloat64_t z2 = svsel (a_ge_half, svmls_x (pg, sv_f64 (0.5), ax, 0.5),
+ svmul_x (pg, x, x));
+ svfloat64_t z = svsqrt_m (ax, a_ge_half, z2);
+
+ /* Use a single polynomial approximation P for both intervals. */
+ svfloat64_t z4 = svmul_x (pg, z2, z2);
+ svfloat64_t z8 = svmul_x (pg, z4, z4);
+ svfloat64_t z16 = svmul_x (pg, z8, z8);
+ svfloat64_t p = sv_estrin_11_f64_x (pg, z2, z4, z8, z16, d->poly);
+ /* Finalize polynomial: z + z * z2 * P(z2). */
+ p = svmla_x (pg, z, svmul_x (pg, z, z2), p);
+
+ /* asin(|x|) = Q(|x|) , for |x| < 0.5
+ = pi/2 - 2 Q(|x|), for |x| >= 0.5. */
+ svfloat64_t y = svmad_m (a_ge_half, p, sv_f64 (-2.0), d->pi_over_2f);
+
+ /* Copy sign. */
+ return svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign));
+}
diff --git a/sysdeps/aarch64/fpu/asinf_advsimd.c b/sysdeps/aarch64/fpu/asinf_advsimd.c
new file mode 100644
index 0000000000..3180ae7c8e
--- /dev/null
+++ b/sysdeps/aarch64/fpu/asinf_advsimd.c
@@ -0,0 +1,104 @@
+/* Single-precision AdvSIMD inverse sin
+
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "poly_advsimd_f32.h"
+
+static const struct data
+{
+ float32x4_t poly[5];
+ float32x4_t pi_over_2f;
+} data = {
+ /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on
+ [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */
+ .poly = { V4 (0x1.55555ep-3), V4 (0x1.33261ap-4), V4 (0x1.70d7dcp-5),
+ V4 (0x1.b059dp-6), V4 (0x1.3af7d8p-5) },
+ .pi_over_2f = V4 (0x1.921fb6p+0f),
+};
+
+#define AbsMask 0x7fffffff
+#define Half 0x3f000000
+#define One 0x3f800000
+#define Small 0x39800000 /* 2^-12. */
+
+#if WANT_SIMD_EXCEPT
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+ return v_call_f32 (asinf, x, y, special);
+}
+#endif
+
+/* Single-precision implementation of vector asin(x).
+
+ For |x| < Small, approximate asin(x) by x. Small = 2^-12 for correct
+ rounding. If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the
+ following approximation.
+
+ For |x| in [Small, 0.5], use order 4 polynomial P such that the final
+ approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2).
+
+ The largest observed error in this region is 0.83 ulps,
+ _ZGVnN4v_asinf (0x1.ea00f4p-2) got 0x1.fef15ep-2 want 0x1.fef15cp-2.
+
+ For |x| in [0.5, 1.0], use same approximation with a change of variable
+
+ asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z).
+
+ The largest observed error in this region is 2.41 ulps,
+ _ZGVnN4v_asinf (0x1.00203ep-1) got 0x1.0c3a64p-1 want 0x1.0c3a6p-1. */
+float32x4_t VPCS_ATTR V_NAME_F1 (asin) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ uint32x4_t ia = vandq_u32 (ix, v_u32 (AbsMask));
+
+#if WANT_SIMD_EXCEPT
+ /* Special values need to be computed with scalar fallbacks so
+ that appropriate fp exceptions are raised. */
+ uint32x4_t special
+ = vcgtq_u32 (vsubq_u32 (ia, v_u32 (Small)), v_u32 (One - Small));
+ if (__glibc_unlikely (v_any_u32 (special)))
+ return special_case (x, x, v_u32 (0xffffffff));
+#endif
+
+ float32x4_t ax = vreinterpretq_f32_u32 (ia);
+ uint32x4_t a_lt_half = vcltq_u32 (ia, v_u32 (Half));
+
+ /* Evaluate polynomial Q(x) = y + y * z * P(z) with
+ z = x ^ 2 and y = |x| , if |x| < 0.5
+ z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */
+ float32x4_t z2 = vbslq_f32 (a_lt_half, vmulq_f32 (x, x),
+ vfmsq_n_f32 (v_f32 (0.5), ax, 0.5));
+ float32x4_t z = vbslq_f32 (a_lt_half, ax, vsqrtq_f32 (z2));
+
+ /* Use a single polynomial approximation P for both intervals. */
+ float32x4_t p = v_horner_4_f32 (z2, d->poly);
+ /* Finalize polynomial: z + z * z2 * P(z2). */
+ p = vfmaq_f32 (z, vmulq_f32 (z, z2), p);
+
+ /* asin(|x|) = Q(|x|) , for |x| < 0.5
+ = pi/2 - 2 Q(|x|), for |x| >= 0.5. */
+ float32x4_t y
+ = vbslq_f32 (a_lt_half, p, vfmsq_n_f32 (d->pi_over_2f, p, 2.0));
+
+ /* Copy sign. */
+ return vbslq_f32 (v_u32 (AbsMask), y, x);
+}
diff --git a/sysdeps/aarch64/fpu/asinf_sve.c b/sysdeps/aarch64/fpu/asinf_sve.c
new file mode 100644
index 0000000000..5abe710b5a
--- /dev/null
+++ b/sysdeps/aarch64/fpu/asinf_sve.c
@@ -0,0 +1,78 @@
+/* Single-precision SVE inverse sin
+
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+#include "poly_sve_f32.h"
+
+static const struct data
+{
+ float32_t poly[5];
+ float32_t pi_over_2f;
+} data = {
+ /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on
+ [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */
+ .poly = { 0x1.55555ep-3, 0x1.33261ap-4, 0x1.70d7dcp-5, 0x1.b059dp-6,
+ 0x1.3af7d8p-5, },
+ .pi_over_2f = 0x1.921fb6p+0f,
+};
+
+/* Single-precision SVE implementation of vector asin(x).
+
+ For |x| in [0, 0.5], use order 4 polynomial P such that the final
+ approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2).
+
+ The largest observed error in this region is 0.83 ulps,
+ _ZGVsMxv_asinf (0x1.ea00f4p-2) got 0x1.fef15ep-2
+ want 0x1.fef15cp-2.
+
+ For |x| in [0.5, 1.0], use same approximation with a change of variable
+
+ asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z).
+
+ The largest observed error in this region is 2.41 ulps,
+ _ZGVsMxv_asinf (-0x1.00203ep-1) got -0x1.0c3a64p-1
+ want -0x1.0c3a6p-1. */
+svfloat32_t SV_NAME_F1 (asin) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svuint32_t sign = svand_x (pg, svreinterpret_u32 (x), 0x80000000);
+
+ svfloat32_t ax = svabs_x (pg, x);
+ svbool_t a_ge_half = svacge (pg, x, 0.5);
+
+ /* Evaluate polynomial Q(x) = y + y * z * P(z) with
+ z = x ^ 2 and y = |x| , if |x| < 0.5
+ z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */
+ svfloat32_t z2 = svsel (a_ge_half, svmls_x (pg, sv_f32 (0.5), ax, 0.5),
+ svmul_x (pg, x, x));
+ svfloat32_t z = svsqrt_m (ax, a_ge_half, z2);
+
+ /* Use a single polynomial approximation P for both intervals. */
+ svfloat32_t p = sv_horner_4_f32_x (pg, z2, d->poly);
+ /* Finalize polynomial: z + z * z2 * P(z2). */
+ p = svmla_x (pg, z, svmul_x (pg, z, z2), p);
+
+ /* asin(|x|) = Q(|x|) , for |x| < 0.5
+ = pi/2 - 2 Q(|x|), for |x| >= 0.5. */
+ svfloat32_t y = svmad_m (a_ge_half, p, sv_f32 (-2.0), d->pi_over_2f);
+
+ /* Copy sign. */
+ return svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (y), sign));
+}
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index 06587ffa91..03778faf96 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -49,6 +49,7 @@ typedef __SVBool_t __sv_bool_t;
# define __vpcs __attribute__ ((__aarch64_vector_pcs__))
+__vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_exp10f (__f32x4_t);
@@ -59,6 +60,7 @@ __vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t);
+__vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_exp10 (__f64x2_t);
@@ -74,6 +76,7 @@ __vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t);
#ifdef __SVE_VEC_MATH_SUPPORTED
+__sv_f32_t _ZGVsMxv_asinf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_expf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_exp10f (__sv_f32_t, __sv_bool_t);
@@ -84,6 +87,7 @@ __sv_f32_t _ZGVsMxv_log2f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_tanf (__sv_f32_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_asin (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_exp (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_exp10 (__sv_f64_t, __sv_bool_t);
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index 26d5ecf66f..b5ccd6b1cc 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -23,6 +23,7 @@
#define VEC_TYPE float64x2_t
+VPCS_VECTOR_WRAPPER (asin_advsimd, _ZGVnN2v_asin)
VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos)
VPCS_VECTOR_WRAPPER (exp_advsimd, _ZGVnN2v_exp)
VPCS_VECTOR_WRAPPER (exp10_advsimd, _ZGVnN2v_exp10)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index 86efd60779..fc3b20f421 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -32,6 +32,7 @@
return svlastb_f64 (svptrue_b64 (), mr); \
}
+SVE_VECTOR_WRAPPER (asin_sve, _ZGVsMxv_asin)
SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos)
SVE_VECTOR_WRAPPER (exp_sve, _ZGVsMxv_exp)
SVE_VECTOR_WRAPPER (exp10_sve, _ZGVsMxv_exp10)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index 8f7ebea1ac..0a36aa91f5 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -23,6 +23,7 @@
#define VEC_TYPE float32x4_t
+VPCS_VECTOR_WRAPPER (asinf_advsimd, _ZGVnN4v_asinf)
VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf)
VPCS_VECTOR_WRAPPER (expf_advsimd, _ZGVnN4v_expf)
VPCS_VECTOR_WRAPPER (exp10f_advsimd, _ZGVnN4v_exp10f)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index 885e58ac39..f7e4882c7a 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -32,6 +32,7 @@
return svlastb_f32 (svptrue_b32 (), mr); \
}
+SVE_VECTOR_WRAPPER (asinf_sve, _ZGVsMxv_asinf)
SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf)
SVE_VECTOR_WRAPPER (expf_sve, _ZGVsMxv_expf)
SVE_VECTOR_WRAPPER (exp10f_sve, _ZGVsMxv_exp10f)
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index d117209c06..1edc0fc343 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -46,11 +46,19 @@ double: 1
float: 1
ldouble: 1
+Function: "asin_advsimd":
+double: 2
+float: 2
+
Function: "asin_downward":
double: 1
float: 1
ldouble: 2
+Function: "asin_sve":
+double: 2
+float: 2
+
Function: "asin_towardzero":
double: 1
float: 1
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index cad774521a..6431c3fe65 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -14,16 +14,20 @@ GLIBC_2.38 _ZGVsMxv_log F
GLIBC_2.38 _ZGVsMxv_logf F
GLIBC_2.38 _ZGVsMxv_sin F
GLIBC_2.38 _ZGVsMxv_sinf F
+GLIBC_2.39 _ZGVnN2v_asin F
GLIBC_2.39 _ZGVnN2v_exp10 F
GLIBC_2.39 _ZGVnN2v_exp2 F
GLIBC_2.39 _ZGVnN2v_log10 F
GLIBC_2.39 _ZGVnN2v_log2 F
GLIBC_2.39 _ZGVnN2v_tan F
+GLIBC_2.39 _ZGVnN4v_asinf F
GLIBC_2.39 _ZGVnN4v_exp10f F
GLIBC_2.39 _ZGVnN4v_exp2f F
GLIBC_2.39 _ZGVnN4v_log10f F
GLIBC_2.39 _ZGVnN4v_log2f F
GLIBC_2.39 _ZGVnN4v_tanf F
+GLIBC_2.39 _ZGVsMxv_asin F
+GLIBC_2.39 _ZGVsMxv_asinf F
GLIBC_2.39 _ZGVsMxv_exp10 F
GLIBC_2.39 _ZGVsMxv_exp10f F
GLIBC_2.39 _ZGVsMxv_exp2 F
--
2.27.0
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH 2/6] aarch64: Add vector implementations of acos routines
2023-11-03 12:12 [PATCH 1/6] aarch64: Add vector implementations of asin routines Joe Ramsay
@ 2023-11-03 12:12 ` Joe Ramsay
2023-11-10 17:53 ` Szabolcs Nagy
2023-11-03 12:12 ` [PATCH 3/6] aarch64: Add vector implementations of atan routines Joe Ramsay
` (4 subsequent siblings)
5 siblings, 1 reply; 14+ messages in thread
From: Joe Ramsay @ 2023-11-03 12:12 UTC (permalink / raw)
To: libc-alpha; +Cc: Joe Ramsay
---
Thanks,
Joe
sysdeps/aarch64/fpu/Makefile | 3 +-
sysdeps/aarch64/fpu/Versions | 4 +
sysdeps/aarch64/fpu/acos_advsimd.c | 122 ++++++++++++++++++
sysdeps/aarch64/fpu/acos_sve.c | 93 +++++++++++++
sysdeps/aarch64/fpu/acosf_advsimd.c | 113 ++++++++++++++++
sysdeps/aarch64/fpu/acosf_sve.c | 86 ++++++++++++
sysdeps/aarch64/fpu/bits/math-vector.h | 4 +
.../fpu/test-double-advsimd-wrappers.c | 1 +
.../aarch64/fpu/test-double-sve-wrappers.c | 1 +
.../aarch64/fpu/test-float-advsimd-wrappers.c | 1 +
sysdeps/aarch64/fpu/test-float-sve-wrappers.c | 1 +
sysdeps/aarch64/libm-test-ulps | 8 ++
.../unix/sysv/linux/aarch64/libmvec.abilist | 4 +
13 files changed, 440 insertions(+), 1 deletion(-)
create mode 100644 sysdeps/aarch64/fpu/acos_advsimd.c
create mode 100644 sysdeps/aarch64/fpu/acos_sve.c
create mode 100644 sysdeps/aarch64/fpu/acosf_advsimd.c
create mode 100644 sysdeps/aarch64/fpu/acosf_sve.c
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index d7c0bd2ed5..606fdd804f 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -1,4 +1,5 @@
-libmvec-supported-funcs = asin \
+libmvec-supported-funcs = acos \
+ asin \
cos \
exp \
exp10 \
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index 0f365a1e2e..1037cd92bd 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -18,6 +18,10 @@ libmvec {
_ZGVsMxv_sinf;
}
GLIBC_2.39 {
+ _ZGVnN4v_acosf;
+ _ZGVnN2v_acos;
+ _ZGVsMxv_acosf;
+ _ZGVsMxv_acos;
_ZGVnN4v_asinf;
_ZGVnN2v_asin;
_ZGVsMxv_asinf;
diff --git a/sysdeps/aarch64/fpu/acos_advsimd.c b/sysdeps/aarch64/fpu/acos_advsimd.c
new file mode 100644
index 0000000000..3121cf66b1
--- /dev/null
+++ b/sysdeps/aarch64/fpu/acos_advsimd.c
@@ -0,0 +1,122 @@
+/* Double-precision AdvSIMD inverse cos
+
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "poly_advsimd_f64.h"
+
+static const struct data
+{
+ float64x2_t poly[12];
+ float64x2_t pi, pi_over_2;
+ uint64x2_t abs_mask;
+} data = {
+ /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x))
+ on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */
+ .poly = { V2 (0x1.555555555554ep-3), V2 (0x1.3333333337233p-4),
+ V2 (0x1.6db6db67f6d9fp-5), V2 (0x1.f1c71fbd29fbbp-6),
+ V2 (0x1.6e8b264d467d6p-6), V2 (0x1.1c5997c357e9dp-6),
+ V2 (0x1.c86a22cd9389dp-7), V2 (0x1.856073c22ebbep-7),
+ V2 (0x1.fd1151acb6bedp-8), V2 (0x1.087182f799c1dp-6),
+ V2 (-0x1.6602748120927p-7), V2 (0x1.cfa0dd1f9478p-6), },
+ .pi = V2 (0x1.921fb54442d18p+1),
+ .pi_over_2 = V2 (0x1.921fb54442d18p+0),
+ .abs_mask = V2 (0x7fffffffffffffff),
+};
+
+#define AllMask v_u64 (0xffffffffffffffff)
+#define Oneu (0x3ff0000000000000)
+#define Small (0x3e50000000000000) /* 2^-53. */
+
+#if WANT_SIMD_EXCEPT
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+{
+ return v_call_f64 (acos, x, y, special);
+}
+#endif
+
+/* Double-precision implementation of vector acos(x).
+
+ For |x| < Small, approximate acos(x) by pi/2 - x. Small = 2^-53 for correct
+ rounding.
+ If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the following
+ approximation.
+
+ For |x| in [Small, 0.5], use an order 11 polynomial P such that the final
+ approximation of asin is an odd polynomial:
+
+ acos(x) ~ pi/2 - (x + x^3 P(x^2)).
+
+ The largest observed error in this region is 1.18 ulps,
+ _ZGVnN2v_acos (0x1.fbab0a7c460f6p-2) got 0x1.0d54d1985c068p+0
+ want 0x1.0d54d1985c069p+0.
+
+ For |x| in [0.5, 1.0], use same approximation with a change of variable
+
+ acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z).
+
+ The largest observed error in this region is 1.52 ulps,
+ _ZGVnN2v_acos (0x1.23d362722f591p-1) got 0x1.edbbedf8a7d6ep-1
+ want 0x1.edbbedf8a7d6cp-1. */
+float64x2_t VPCS_ATTR V_NAME_D1 (acos) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t ax = vabsq_f64 (x);
+
+#if WANT_SIMD_EXCEPT
+ /* A single comparison for One, Small and QNaN. */
+ uint64x2_t special
+ = vcgtq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (ax), v_u64 (Small)),
+ v_u64 (Oneu - Small));
+ if (__glibc_unlikely (v_any_u64 (special)))
+ return special_case (x, x, AllMask);
+#endif
+
+ uint64x2_t a_le_half = vcleq_f64 (ax, v_f64 (0.5));
+
+ /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with
+ z2 = x ^ 2 and z = |x| , if |x| < 0.5
+ z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */
+ float64x2_t z2 = vbslq_f64 (a_le_half, vmulq_f64 (x, x),
+ vfmaq_f64 (v_f64 (0.5), v_f64 (-0.5), ax));
+ float64x2_t z = vbslq_f64 (a_le_half, ax, vsqrtq_f64 (z2));
+
+ /* Use a single polynomial approximation P for both intervals. */
+ float64x2_t z4 = vmulq_f64 (z2, z2);
+ float64x2_t z8 = vmulq_f64 (z4, z4);
+ float64x2_t z16 = vmulq_f64 (z8, z8);
+ float64x2_t p = v_estrin_11_f64 (z2, z4, z8, z16, d->poly);
+
+ /* Finalize polynomial: z + z * z2 * P(z2). */
+ p = vfmaq_f64 (z, vmulq_f64 (z, z2), p);
+
+ /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5
+ = 2 Q(|x|) , for 0.5 < x < 1.0
+ = pi - 2 Q(|x|) , for -1.0 < x < -0.5. */
+ float64x2_t y = vbslq_f64 (d->abs_mask, p, x);
+
+ uint64x2_t is_neg = vcltzq_f64 (x);
+ float64x2_t off = vreinterpretq_f64_u64 (
+ vandq_u64 (is_neg, vreinterpretq_u64_f64 (d->pi)));
+ float64x2_t mul = vbslq_f64 (a_le_half, v_f64 (-1.0), v_f64 (2.0));
+ float64x2_t add = vbslq_f64 (a_le_half, d->pi_over_2, off);
+
+ return vfmaq_f64 (add, mul, y);
+}
diff --git a/sysdeps/aarch64/fpu/acos_sve.c b/sysdeps/aarch64/fpu/acos_sve.c
new file mode 100644
index 0000000000..1138a04e73
--- /dev/null
+++ b/sysdeps/aarch64/fpu/acos_sve.c
@@ -0,0 +1,93 @@
+/* Double-precision SVE inverse cos
+
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+
+static const struct data
+{
+ float64_t poly[12];
+ float64_t pi, pi_over_2;
+} data = {
+ /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x))
+ on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */
+ .poly = { 0x1.555555555554ep-3, 0x1.3333333337233p-4, 0x1.6db6db67f6d9fp-5,
+ 0x1.f1c71fbd29fbbp-6, 0x1.6e8b264d467d6p-6, 0x1.1c5997c357e9dp-6,
+ 0x1.c86a22cd9389dp-7, 0x1.856073c22ebbep-7, 0x1.fd1151acb6bedp-8,
+ 0x1.087182f799c1dp-6, -0x1.6602748120927p-7, 0x1.cfa0dd1f9478p-6, },
+ .pi = 0x1.921fb54442d18p+1,
+ .pi_over_2 = 0x1.921fb54442d18p+0,
+};
+
+/* Double-precision SVE implementation of vector acos(x).
+
+ For |x| in [0, 0.5], use an order 11 polynomial P such that the final
+ approximation of asin is an odd polynomial:
+
+ acos(x) ~ pi/2 - (x + x^3 P(x^2)).
+
+ The largest observed error in this region is 1.18 ulps,
+ _ZGVsMxv_acos (0x1.fbc5fe28ee9e3p-2) got 0x1.0d4d0f55667f6p+0
+ want 0x1.0d4d0f55667f7p+0.
+
+ For |x| in [0.5, 1.0], use same approximation with a change of variable
+
+ acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z).
+
+ The largest observed error in this region is 1.52 ulps,
+ _ZGVsMxv_acos (0x1.24024271a500ap-1) got 0x1.ed82df4243f0dp-1
+ want 0x1.ed82df4243f0bp-1. */
+svfloat64_t SV_NAME_D1 (acos) (svfloat64_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svuint64_t sign = svand_x (pg, svreinterpret_u64 (x), 0x8000000000000000);
+ svfloat64_t ax = svabs_x (pg, x);
+
+ svbool_t a_gt_half = svacgt (pg, x, 0.5);
+
+ /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with
+ z2 = x ^ 2 and z = |x| , if |x| < 0.5
+ z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */
+ svfloat64_t z2 = svsel (a_gt_half, svmls_x (pg, sv_f64 (0.5), ax, 0.5),
+ svmul_x (pg, x, x));
+ svfloat64_t z = svsqrt_m (ax, a_gt_half, z2);
+
+ /* Use a single polynomial approximation P for both intervals. */
+ svfloat64_t z4 = svmul_x (pg, z2, z2);
+ svfloat64_t z8 = svmul_x (pg, z4, z4);
+ svfloat64_t z16 = svmul_x (pg, z8, z8);
+ svfloat64_t p = sv_estrin_11_f64_x (pg, z2, z4, z8, z16, d->poly);
+
+ /* Finalize polynomial: z + z * z2 * P(z2). */
+ p = svmla_x (pg, z, svmul_x (pg, z, z2), p);
+
+ /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5
+ = 2 Q(|x|) , for 0.5 < x < 1.0
+ = pi - 2 Q(|x|) , for -1.0 < x < -0.5. */
+ svfloat64_t y
+ = svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (p), sign));
+
+ svbool_t is_neg = svcmplt (pg, x, 0.0);
+ svfloat64_t off = svdup_f64_z (is_neg, d->pi);
+ svfloat64_t mul = svsel (a_gt_half, sv_f64 (2.0), sv_f64 (-1.0));
+ svfloat64_t add = svsel (a_gt_half, off, sv_f64 (d->pi_over_2));
+
+ return svmla_x (pg, add, mul, y);
+}
diff --git a/sysdeps/aarch64/fpu/acosf_advsimd.c b/sysdeps/aarch64/fpu/acosf_advsimd.c
new file mode 100644
index 0000000000..7d39e9b805
--- /dev/null
+++ b/sysdeps/aarch64/fpu/acosf_advsimd.c
@@ -0,0 +1,113 @@
+/* Single-precision AdvSIMD inverse cos
+
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "poly_advsimd_f32.h"
+
+static const struct data
+{
+ float32x4_t poly[5];
+ float32x4_t pi_over_2f, pif;
+} data = {
+ /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on
+ [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */
+ .poly = { V4 (0x1.55555ep-3), V4 (0x1.33261ap-4), V4 (0x1.70d7dcp-5),
+ V4 (0x1.b059dp-6), V4 (0x1.3af7d8p-5) },
+ .pi_over_2f = V4 (0x1.921fb6p+0f),
+ .pif = V4 (0x1.921fb6p+1f),
+};
+
+#define AbsMask 0x7fffffff
+#define Half 0x3f000000
+#define One 0x3f800000
+#define Small 0x32800000 /* 2^-26. */
+
+#if WANT_SIMD_EXCEPT
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+ return v_call_f32 (acosf, x, y, special);
+}
+#endif
+
+/* Single-precision implementation of vector acos(x).
+
+ For |x| < Small, approximate acos(x) by pi/2 - x. Small = 2^-26 for correct
+ rounding.
+ If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the following
+ approximation.
+
+ For |x| in [Small, 0.5], use order 4 polynomial P such that the final
+ approximation of asin is an odd polynomial:
+
+ acos(x) ~ pi/2 - (x + x^3 P(x^2)).
+
+ The largest observed error in this region is 1.26 ulps,
+ _ZGVnN4v_acosf (0x1.843bfcp-2) got 0x1.2e934cp+0 want 0x1.2e934ap+0.
+
+ For |x| in [0.5, 1.0], use same approximation with a change of variable
+
+ acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z).
+
+ The largest observed error in this region is 1.32 ulps,
+ _ZGVnN4v_acosf (0x1.15ba56p-1) got 0x1.feb33p-1
+ want 0x1.feb32ep-1. */
+float32x4_t VPCS_ATTR V_NAME_F1 (acos) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ uint32x4_t ia = vandq_u32 (ix, v_u32 (AbsMask));
+
+#if WANT_SIMD_EXCEPT
+ /* A single comparison for One, Small and QNaN. */
+ uint32x4_t special
+ = vcgtq_u32 (vsubq_u32 (ia, v_u32 (Small)), v_u32 (One - Small));
+ if (__glibc_unlikely (v_any_u32 (special)))
+ return special_case (x, x, v_u32 (0xffffffff));
+#endif
+
+ float32x4_t ax = vreinterpretq_f32_u32 (ia);
+ uint32x4_t a_le_half = vcleq_u32 (ia, v_u32 (Half));
+
+ /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with
+ z2 = x ^ 2 and z = |x| , if |x| < 0.5
+ z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */
+ float32x4_t z2 = vbslq_f32 (a_le_half, vmulq_f32 (x, x),
+ vfmsq_n_f32 (v_f32 (0.5), ax, 0.5));
+ float32x4_t z = vbslq_f32 (a_le_half, ax, vsqrtq_f32 (z2));
+
+ /* Use a single polynomial approximation P for both intervals. */
+ float32x4_t p = v_horner_4_f32 (z2, d->poly);
+ /* Finalize polynomial: z + z * z2 * P(z2). */
+ p = vfmaq_f32 (z, vmulq_f32 (z, z2), p);
+
+ /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5
+ = 2 Q(|x|) , for 0.5 < x < 1.0
+ = pi - 2 Q(|x|) , for -1.0 < x < -0.5. */
+ float32x4_t y = vbslq_f32 (v_u32 (AbsMask), p, x);
+
+ uint32x4_t is_neg = vcltzq_f32 (x);
+ float32x4_t off = vreinterpretq_f32_u32 (
+ vandq_u32 (vreinterpretq_u32_f32 (d->pif), is_neg));
+ float32x4_t mul = vbslq_f32 (a_le_half, v_f32 (-1.0), v_f32 (2.0));
+ float32x4_t add = vbslq_f32 (a_le_half, d->pi_over_2f, off);
+
+ return vfmaq_f32 (add, mul, y);
+}
diff --git a/sysdeps/aarch64/fpu/acosf_sve.c b/sysdeps/aarch64/fpu/acosf_sve.c
new file mode 100644
index 0000000000..44253fa999
--- /dev/null
+++ b/sysdeps/aarch64/fpu/acosf_sve.c
@@ -0,0 +1,86 @@
+/* Single-precision SVE inverse cos
+
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+#include "poly_sve_f32.h"
+
+static const struct data
+{
+ float32_t poly[5];
+ float32_t pi, pi_over_2;
+} data = {
+ /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on
+ [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */
+ .poly = { 0x1.55555ep-3, 0x1.33261ap-4, 0x1.70d7dcp-5, 0x1.b059dp-6,
+ 0x1.3af7d8p-5, },
+ .pi = 0x1.921fb6p+1f,
+ .pi_over_2 = 0x1.921fb6p+0f,
+};
+
+/* Single-precision SVE implementation of vector acos(x).
+
+ For |x| in [0, 0.5], use order 4 polynomial P such that the final
+ approximation of asin is an odd polynomial:
+
+ acos(x) ~ pi/2 - (x + x^3 P(x^2)).
+
+ The largest observed error in this region is 1.16 ulps,
+ _ZGVsMxv_acosf(0x1.ffbeccp-2) got 0x1.0c27f8p+0
+ want 0x1.0c27f6p+0.
+
+ For |x| in [0.5, 1.0], use same approximation with a change of variable
+
+ acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z).
+
+ The largest observed error in this region is 1.32 ulps,
+ _ZGVsMxv_acosf (0x1.15ba56p-1) got 0x1.feb33p-1
+ want 0x1.feb32ep-1. */
+svfloat32_t SV_NAME_F1 (acos) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svuint32_t sign = svand_x (pg, svreinterpret_u32 (x), 0x80000000);
+ svfloat32_t ax = svabs_x (pg, x);
+ svbool_t a_gt_half = svacgt (pg, x, 0.5);
+
+ /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with
+ z2 = x ^ 2 and z = |x| , if |x| < 0.5
+ z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */
+ svfloat32_t z2 = svsel (a_gt_half, svmls_x (pg, sv_f32 (0.5), ax, 0.5),
+ svmul_x (pg, x, x));
+ svfloat32_t z = svsqrt_m (ax, a_gt_half, z2);
+
+ /* Use a single polynomial approximation P for both intervals. */
+ svfloat32_t p = sv_horner_4_f32_x (pg, z2, d->poly);
+ /* Finalize polynomial: z + z * z2 * P(z2). */
+ p = svmla_x (pg, z, svmul_x (pg, z, z2), p);
+
+ /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5
+ = 2 Q(|x|) , for 0.5 < x < 1.0
+ = pi - 2 Q(|x|) , for -1.0 < x < -0.5. */
+ svfloat32_t y
+ = svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (p), sign));
+
+ svbool_t is_neg = svcmplt (pg, x, 0.0);
+ svfloat32_t off = svdup_f32_z (is_neg, d->pi);
+ svfloat32_t mul = svsel (a_gt_half, sv_f32 (2.0), sv_f32 (-1.0));
+ svfloat32_t add = svsel (a_gt_half, off, sv_f32 (d->pi_over_2));
+
+ return svmla_x (pg, add, mul, y);
+}
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index 03778faf96..f313993d70 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -49,6 +49,7 @@ typedef __SVBool_t __sv_bool_t;
# define __vpcs __attribute__ ((__aarch64_vector_pcs__))
+__vpcs __f32x4_t _ZGVnN4v_acosf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
@@ -60,6 +61,7 @@ __vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t);
+__vpcs __f64x2_t _ZGVnN2v_acos (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
@@ -76,6 +78,7 @@ __vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t);
#ifdef __SVE_VEC_MATH_SUPPORTED
+__sv_f32_t _ZGVsMxv_acosf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_asinf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_expf (__sv_f32_t, __sv_bool_t);
@@ -87,6 +90,7 @@ __sv_f32_t _ZGVsMxv_log2f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_tanf (__sv_f32_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_acos (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_asin (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_exp (__sv_f64_t, __sv_bool_t);
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index b5ccd6b1cc..5a0cbf743b 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -23,6 +23,7 @@
#define VEC_TYPE float64x2_t
+VPCS_VECTOR_WRAPPER (acos_advsimd, _ZGVnN2v_acos)
VPCS_VECTOR_WRAPPER (asin_advsimd, _ZGVnN2v_asin)
VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos)
VPCS_VECTOR_WRAPPER (exp_advsimd, _ZGVnN2v_exp)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index fc3b20f421..bd89ff6133 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -32,6 +32,7 @@
return svlastb_f64 (svptrue_b64 (), mr); \
}
+SVE_VECTOR_WRAPPER (acos_sve, _ZGVsMxv_acos)
SVE_VECTOR_WRAPPER (asin_sve, _ZGVsMxv_asin)
SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos)
SVE_VECTOR_WRAPPER (exp_sve, _ZGVsMxv_exp)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index 0a36aa91f5..3fafca7557 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -23,6 +23,7 @@
#define VEC_TYPE float32x4_t
+VPCS_VECTOR_WRAPPER (acosf_advsimd, _ZGVnN4v_acosf)
VPCS_VECTOR_WRAPPER (asinf_advsimd, _ZGVnN4v_asinf)
VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf)
VPCS_VECTOR_WRAPPER (expf_advsimd, _ZGVnN4v_expf)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index f7e4882c7a..b4ec9f777b 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -32,6 +32,7 @@
return svlastb_f32 (svptrue_b32 (), mr); \
}
+SVE_VECTOR_WRAPPER (acosf_sve, _ZGVsMxv_acosf)
SVE_VECTOR_WRAPPER (asinf_sve, _ZGVsMxv_asinf)
SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf)
SVE_VECTOR_WRAPPER (expf_sve, _ZGVsMxv_expf)
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index 1edc0fc343..c2b6f21b9d 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -6,11 +6,19 @@ double: 1
float: 1
ldouble: 1
+Function: "acos_advsimd":
+double: 1
+float: 1
+
Function: "acos_downward":
double: 1
float: 1
ldouble: 1
+Function: "acos_sve":
+double: 1
+float: 1
+
Function: "acos_towardzero":
double: 1
float: 1
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index 6431c3fe65..f79eaaf241 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -14,18 +14,22 @@ GLIBC_2.38 _ZGVsMxv_log F
GLIBC_2.38 _ZGVsMxv_logf F
GLIBC_2.38 _ZGVsMxv_sin F
GLIBC_2.38 _ZGVsMxv_sinf F
+GLIBC_2.39 _ZGVnN2v_acos F
GLIBC_2.39 _ZGVnN2v_asin F
GLIBC_2.39 _ZGVnN2v_exp10 F
GLIBC_2.39 _ZGVnN2v_exp2 F
GLIBC_2.39 _ZGVnN2v_log10 F
GLIBC_2.39 _ZGVnN2v_log2 F
GLIBC_2.39 _ZGVnN2v_tan F
+GLIBC_2.39 _ZGVnN4v_acosf F
GLIBC_2.39 _ZGVnN4v_asinf F
GLIBC_2.39 _ZGVnN4v_exp10f F
GLIBC_2.39 _ZGVnN4v_exp2f F
GLIBC_2.39 _ZGVnN4v_log10f F
GLIBC_2.39 _ZGVnN4v_log2f F
GLIBC_2.39 _ZGVnN4v_tanf F
+GLIBC_2.39 _ZGVsMxv_acos F
+GLIBC_2.39 _ZGVsMxv_acosf F
GLIBC_2.39 _ZGVsMxv_asin F
GLIBC_2.39 _ZGVsMxv_asinf F
GLIBC_2.39 _ZGVsMxv_exp10 F
--
2.27.0
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH 3/6] aarch64: Add vector implementations of atan routines
2023-11-03 12:12 [PATCH 1/6] aarch64: Add vector implementations of asin routines Joe Ramsay
2023-11-03 12:12 ` [PATCH 2/6] aarch64: Add vector implementations of acos routines Joe Ramsay
@ 2023-11-03 12:12 ` Joe Ramsay
2023-11-10 17:54 ` Szabolcs Nagy
2023-11-03 12:12 ` [PATCH 4/6] aarch64: Add vector implementations of atan2 routines Joe Ramsay
` (3 subsequent siblings)
5 siblings, 1 reply; 14+ messages in thread
From: Joe Ramsay @ 2023-11-03 12:12 UTC (permalink / raw)
To: libc-alpha; +Cc: Joe Ramsay
---
Thanks,
Joe
sysdeps/aarch64/fpu/Makefile | 1 +
sysdeps/aarch64/fpu/Versions | 4 +
sysdeps/aarch64/fpu/atan_advsimd.c | 104 +++++++++++++++++
sysdeps/aarch64/fpu/atan_sve.c | 90 +++++++++++++++
sysdeps/aarch64/fpu/atanf_advsimd.c | 109 ++++++++++++++++++
sysdeps/aarch64/fpu/atanf_sve.c | 79 +++++++++++++
sysdeps/aarch64/fpu/bits/math-vector.h | 4 +
.../fpu/test-double-advsimd-wrappers.c | 1 +
.../aarch64/fpu/test-double-sve-wrappers.c | 1 +
.../aarch64/fpu/test-float-advsimd-wrappers.c | 1 +
sysdeps/aarch64/fpu/test-float-sve-wrappers.c | 1 +
sysdeps/aarch64/libm-test-ulps | 8 ++
.../unix/sysv/linux/aarch64/libmvec.abilist | 4 +
13 files changed, 407 insertions(+)
create mode 100644 sysdeps/aarch64/fpu/atan_advsimd.c
create mode 100644 sysdeps/aarch64/fpu/atan_sve.c
create mode 100644 sysdeps/aarch64/fpu/atanf_advsimd.c
create mode 100644 sysdeps/aarch64/fpu/atanf_sve.c
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index 606fdd804f..5bd77a749d 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -1,5 +1,6 @@
libmvec-supported-funcs = acos \
asin \
+ atan \
cos \
exp \
exp10 \
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index 1037cd92bd..dfc3d2dad3 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -26,6 +26,10 @@ libmvec {
_ZGVnN2v_asin;
_ZGVsMxv_asinf;
_ZGVsMxv_asin;
+ _ZGVnN4v_atanf;
+ _ZGVnN2v_atan;
+ _ZGVsMxv_atanf;
+ _ZGVsMxv_atan;
_ZGVnN4v_exp10f;
_ZGVnN2v_exp10;
_ZGVsMxv_exp10f;
diff --git a/sysdeps/aarch64/fpu/atan_advsimd.c b/sysdeps/aarch64/fpu/atan_advsimd.c
new file mode 100644
index 0000000000..d52c07d8a0
--- /dev/null
+++ b/sysdeps/aarch64/fpu/atan_advsimd.c
@@ -0,0 +1,104 @@
+/* Double-precision AdvSIMD inverse tan
+
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "poly_advsimd_f64.h"
+
+static const struct data
+{
+ float64x2_t pi_over_2;
+ float64x2_t poly[20];
+} data = {
+ /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
+ [2**-1022, 1.0]. */
+ .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3),
+ V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4),
+ V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4),
+ V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5),
+ V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5),
+ V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5),
+ V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6),
+ V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7),
+ V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10),
+ V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), },
+ .pi_over_2 = V2 (0x1.921fb54442d18p+0),
+};
+
+#define SignMask v_u64 (0x8000000000000000)
+#define TinyBound 0x3e10000000000000 /* asuint64(0x1p-30). */
+#define BigBound 0x4340000000000000 /* asuint64(0x1p53). */
+
+/* Fast implementation of vector atan.
+ Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using
+ z=1/x and shift = pi/2. Maximum observed error is 2.27 ulps:
+ _ZGVnN2v_atan (0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1
+ want 0x1.9225645bdd7c3p-1. */
+float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* Small cases, infs and nans are supported by our approximation technique,
+ but do not set fenv flags correctly. Only trigger special case if we need
+ fenv. */
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ uint64x2_t sign = vandq_u64 (ix, SignMask);
+
+#if WANT_SIMD_EXCEPT
+ uint64x2_t ia12 = vandq_u64 (ix, v_u64 (0x7ff0000000000000));
+ uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia12, v_u64 (TinyBound)),
+ v_u64 (BigBound - TinyBound));
+ /* If any lane is special, fall back to the scalar routine for all lanes. */
+ if (__glibc_unlikely (v_any_u64 (special)))
+ return v_call_f64 (atan, x, v_f64 (0), v_u64 (-1));
+#endif
+
+ /* Argument reduction:
+ y := arctan(x) for x < 1
+ y := pi/2 + arctan(-1/x) for x > 1
+ Hence, use z=-1/a if x>=1, otherwise z=a. */
+ uint64x2_t red = vcagtq_f64 (x, v_f64 (1.0));
+ /* Avoid dependency in abs(x) in division (and comparison). */
+ float64x2_t z = vbslq_f64 (red, vdivq_f64 (v_f64 (1.0), x), x);
+ float64x2_t shift = vreinterpretq_f64_u64 (
+ vandq_u64 (red, vreinterpretq_u64_f64 (d->pi_over_2)));
+ /* Use absolute value only when needed (odd powers of z). */
+ float64x2_t az = vbslq_f64 (
+ SignMask, vreinterpretq_f64_u64 (vandq_u64 (SignMask, red)), z);
+
+ /* Calculate the polynomial approximation.
+ Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of
+ full scheme to avoid underflow in x^16.
+ The order 19 polynomial P approximates
+ (atan(sqrt(x))-sqrt(x))/x^(3/2). */
+ float64x2_t z2 = vmulq_f64 (z, z);
+ float64x2_t x2 = vmulq_f64 (z2, z2);
+ float64x2_t x4 = vmulq_f64 (x2, x2);
+ float64x2_t x8 = vmulq_f64 (x4, x4);
+ float64x2_t y
+ = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, d->poly),
+ v_estrin_11_f64 (z2, x2, x4, x8, d->poly + 8), x8);
+
+ /* Finalize. y = shift + z + z^3 * P(z^2). */
+ y = vfmaq_f64 (az, y, vmulq_f64 (z2, az));
+ y = vaddq_f64 (y, shift);
+
+ /* y = atan(x) if x>0, -atan(-x) otherwise. */
+ y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), sign));
+ return y;
+}
diff --git a/sysdeps/aarch64/fpu/atan_sve.c b/sysdeps/aarch64/fpu/atan_sve.c
new file mode 100644
index 0000000000..35587ef212
--- /dev/null
+++ b/sysdeps/aarch64/fpu/atan_sve.c
@@ -0,0 +1,90 @@
+/* Double-precision SVE inverse tan
+
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+
+static const struct data
+{
+ float64_t poly[20];
+ float64_t pi_over_2;
+} data = {
+ /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
+ [2**-1022, 1.0]. */
+ .poly = { -0x1.5555555555555p-2, 0x1.99999999996c1p-3, -0x1.2492492478f88p-3,
+ 0x1.c71c71bc3951cp-4, -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4,
+ -0x1.11100ee084227p-4, 0x1.e1d0f9696f63bp-5, -0x1.aebfe7b418581p-5,
+ 0x1.842dbe9b0d916p-5, -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5,
+ -0x1.00e6eece7de8p-5, 0x1.860897b29e5efp-6, -0x1.0051381722a59p-6,
+ 0x1.14e9dc19a4a4ep-7, -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10,
+ -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16, },
+ .pi_over_2 = 0x1.921fb54442d18p+0,
+};
+
+/* Useful constants. */
+#define SignMask (0x8000000000000000)
+
+/* Fast implementation of SVE atan.
+ Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using
+ z=1/x and shift = pi/2. Largest errors are close to 1. The maximum observed
+ error is 2.27 ulps:
+ _ZGVsMxv_atan (0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1
+ want 0x1.9225645bdd7c3p-1. */
+svfloat64_t SV_NAME_D1 (atan) (svfloat64_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* No need to trigger special case. Small cases, infs and nans
+ are supported by our approximation technique. */
+ svuint64_t ix = svreinterpret_u64 (x);
+ svuint64_t sign = svand_x (pg, ix, SignMask);
+
+ /* Argument reduction:
+ y := arctan(x) for x < 1
+ y := pi/2 + arctan(-1/x) for x > 1
+ Hence, use z=-1/a if x>=1, otherwise z=a. */
+ svbool_t red = svacgt (pg, x, 1.0);
+ /* Avoid dependency in abs(x) in division (and comparison). */
+ svfloat64_t z = svsel (red, svdivr_x (pg, x, 1.0), x);
+ /* Use absolute value only when needed (odd powers of z). */
+ svfloat64_t az = svabs_x (pg, z);
+ az = svneg_m (az, red, az);
+
+ /* Use split Estrin scheme for P(z^2) with deg(P)=19. */
+ svfloat64_t z2 = svmul_x (pg, z, z);
+ svfloat64_t x2 = svmul_x (pg, z2, z2);
+ svfloat64_t x4 = svmul_x (pg, x2, x2);
+ svfloat64_t x8 = svmul_x (pg, x4, x4);
+
+ svfloat64_t y
+ = svmla_x (pg, sv_estrin_7_f64_x (pg, z2, x2, x4, d->poly),
+ sv_estrin_11_f64_x (pg, z2, x2, x4, x8, d->poly + 8), x8);
+
+ /* y = shift + z + z^3 * P(z^2). */
+ svfloat64_t z3 = svmul_x (pg, z2, az);
+ y = svmla_x (pg, az, z3, y);
+
+ /* Apply shift as indicated by `red` predicate. */
+ y = svadd_m (red, y, d->pi_over_2);
+
+ /* y = atan(x) if x>0, -atan(-x) otherwise. */
+ y = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign));
+
+ return y;
+}
diff --git a/sysdeps/aarch64/fpu/atanf_advsimd.c b/sysdeps/aarch64/fpu/atanf_advsimd.c
new file mode 100644
index 0000000000..589b0e8c96
--- /dev/null
+++ b/sysdeps/aarch64/fpu/atanf_advsimd.c
@@ -0,0 +1,109 @@
+/* Single-precision AdvSIMD inverse tan
+
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "poly_advsimd_f32.h"
+
+static const struct data
+{
+ float32x4_t poly[8];
+ float32x4_t pi_over_2;
+} data = {
+ /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
+ [2**-128, 1.0].
+ Generated using fpminimax between FLT_MIN and 1. */
+ .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f),
+ V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f),
+ V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) },
+ .pi_over_2 = V4 (0x1.921fb6p+0f),
+};
+
+#define SignMask v_u32 (0x80000000)
+
+#define P(i) d->poly[i]
+
+#define TinyBound 0x30800000 /* asuint(0x1p-30). */
+#define BigBound 0x4e800000 /* asuint(0x1p30). */
+
+#if WANT_SIMD_EXCEPT
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+ return v_call_f32 (atanf, x, y, special);
+}
+#endif
+
+/* Fast implementation of vector atanf based on
+ atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1]
+ using z=-1/x and shift = pi/2. Maximum observed error is 2.9ulps:
+ _ZGVnN4v_atanf (0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1. */
+float32x4_t VPCS_ATTR V_NAME_F1 (atan) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* Small cases, infs and nans are supported by our approximation technique,
+ but do not set fenv flags correctly. Only trigger special case if we need
+ fenv. */
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ uint32x4_t sign = vandq_u32 (ix, SignMask);
+
+#if WANT_SIMD_EXCEPT
+ uint32x4_t ia = vandq_u32 (ix, v_u32 (0x7ff00000));
+ uint32x4_t special = vcgtq_u32 (vsubq_u32 (ia, v_u32 (TinyBound)),
+ v_u32 (BigBound - TinyBound));
+ /* If any lane is special, fall back to the scalar routine for all lanes. */
+ if (__glibc_unlikely (v_any_u32 (special)))
+ return special_case (x, x, v_u32 (-1));
+#endif
+
+ /* Argument reduction:
+ y := arctan(x) for x < 1
+ y := pi/2 + arctan(-1/x) for x > 1
+ Hence, use z=-1/a if x>=1, otherwise z=a. */
+ uint32x4_t red = vcagtq_f32 (x, v_f32 (1.0));
+ /* Avoid dependency in abs(x) in division (and comparison). */
+ float32x4_t z = vbslq_f32 (red, vdivq_f32 (v_f32 (1.0f), x), x);
+ float32x4_t shift = vreinterpretq_f32_u32 (
+ vandq_u32 (red, vreinterpretq_u32_f32 (d->pi_over_2)));
+ /* Use absolute value only when needed (odd powers of z). */
+ float32x4_t az = vbslq_f32 (
+ SignMask, vreinterpretq_f32_u32 (vandq_u32 (SignMask, red)), z);
+
+ /* Calculate the polynomial approximation.
+ Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However,
+ a standard implementation using z8 creates spurious underflow
+ in the very last fma (when z^8 is small enough).
+ Therefore, we split the last fma into a mul and an fma.
+ Horner and single-level Estrin have higher errors that exceed
+ threshold. */
+ float32x4_t z2 = vmulq_f32 (z, z);
+ float32x4_t z4 = vmulq_f32 (z2, z2);
+
+ float32x4_t y = vfmaq_f32 (
+ v_pairwise_poly_3_f32 (z2, z4, d->poly), z4,
+ vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, d->poly + 4)));
+
+ /* y = shift + z * P(z^2). */
+ y = vaddq_f32 (vfmaq_f32 (az, y, vmulq_f32 (z2, az)), shift);
+
+ /* y = atan(x) if x>0, -atan(-x) otherwise. */
+ y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), sign));
+
+ return y;
+}
diff --git a/sysdeps/aarch64/fpu/atanf_sve.c b/sysdeps/aarch64/fpu/atanf_sve.c
new file mode 100644
index 0000000000..9453e7aa29
--- /dev/null
+++ b/sysdeps/aarch64/fpu/atanf_sve.c
@@ -0,0 +1,79 @@
+/* Single-precision SVE inverse tan
+
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+#include "poly_sve_f32.h"
+
+static const struct data
+{
+ float32_t poly[8];
+ float32_t pi_over_2;
+} data = {
+ /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
+ [2**-128, 1.0]. */
+ .poly = { -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f,
+ -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, 0x1.01fd88p-8f },
+ .pi_over_2 = 0x1.921fb6p+0f,
+};
+
+#define SignMask (0x80000000)
+
+/* Fast implementation of SVE atanf based on
+ atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using
+ z=-1/x and shift = pi/2.
+ Largest observed error is 2.9 ULP, close to +/-1.0:
+ _ZGVsMxv_atanf (0x1.0468f6p+0) got -0x1.967f06p-1
+ want -0x1.967fp-1. */
+svfloat32_t SV_NAME_F1 (atan) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* No need to trigger special case. Small cases, infs and nans
+ are supported by our approximation technique. */
+ svuint32_t ix = svreinterpret_u32 (x);
+ svuint32_t sign = svand_x (pg, ix, SignMask);
+
+ /* Argument reduction:
+ y := arctan(x) for x < 1
+ y := pi/2 + arctan(-1/x) for x > 1
+ Hence, use z=-1/a if x>=1, otherwise z=a. */
+ svbool_t red = svacgt (pg, x, 1.0f);
+ /* Avoid dependency in abs(x) in division (and comparison). */
+ svfloat32_t z = svsel (red, svdiv_x (pg, sv_f32 (1.0f), x), x);
+ /* Use absolute value only when needed (odd powers of z). */
+ svfloat32_t az = svabs_x (pg, z);
+ az = svneg_m (az, red, az);
+
+ /* Use split Estrin scheme for P(z^2) with deg(P)=7. */
+ svfloat32_t z2 = svmul_x (pg, z, z);
+ svfloat32_t z4 = svmul_x (pg, z2, z2);
+ svfloat32_t z8 = svmul_x (pg, z4, z4);
+
+ svfloat32_t y = sv_estrin_7_f32_x (pg, z2, z4, z8, d->poly);
+
+ /* y = shift + z + z^3 * P(z^2). */
+ svfloat32_t z3 = svmul_x (pg, z2, az);
+ y = svmla_x (pg, az, z3, y);
+
+ /* Apply shift as indicated by 'red' predicate. */
+ y = svadd_m (red, y, sv_f32 (d->pi_over_2));
+
+ /* y = atan(x) if x>0, -atan(-x) otherwise. */
+ return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign));
+}
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index f313993d70..37aa74fe50 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -51,6 +51,7 @@ typedef __SVBool_t __sv_bool_t;
__vpcs __f32x4_t _ZGVnN4v_acosf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_exp10f (__f32x4_t);
@@ -63,6 +64,7 @@ __vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2v_acos (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_exp10 (__f64x2_t);
@@ -80,6 +82,7 @@ __vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t);
__sv_f32_t _ZGVsMxv_acosf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_asinf (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_atanf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_expf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_exp10f (__sv_f32_t, __sv_bool_t);
@@ -92,6 +95,7 @@ __sv_f32_t _ZGVsMxv_tanf (__sv_f32_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_acos (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_asin (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_atan (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_exp (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_exp10 (__sv_f64_t, __sv_bool_t);
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index 5a0cbf743b..6954fe7435 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -25,6 +25,7 @@
VPCS_VECTOR_WRAPPER (acos_advsimd, _ZGVnN2v_acos)
VPCS_VECTOR_WRAPPER (asin_advsimd, _ZGVnN2v_asin)
+VPCS_VECTOR_WRAPPER (atan_advsimd, _ZGVnN2v_atan)
VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos)
VPCS_VECTOR_WRAPPER (exp_advsimd, _ZGVnN2v_exp)
VPCS_VECTOR_WRAPPER (exp10_advsimd, _ZGVnN2v_exp10)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index bd89ff6133..1173d8f9ae 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -34,6 +34,7 @@
SVE_VECTOR_WRAPPER (acos_sve, _ZGVsMxv_acos)
SVE_VECTOR_WRAPPER (asin_sve, _ZGVsMxv_asin)
+SVE_VECTOR_WRAPPER (atan_sve, _ZGVsMxv_atan)
SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos)
SVE_VECTOR_WRAPPER (exp_sve, _ZGVsMxv_exp)
SVE_VECTOR_WRAPPER (exp10_sve, _ZGVsMxv_exp10)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index 3fafca7557..387efc30f8 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -25,6 +25,7 @@
VPCS_VECTOR_WRAPPER (acosf_advsimd, _ZGVnN4v_acosf)
VPCS_VECTOR_WRAPPER (asinf_advsimd, _ZGVnN4v_asinf)
+VPCS_VECTOR_WRAPPER (atanf_advsimd, _ZGVnN4v_atanf)
VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf)
VPCS_VECTOR_WRAPPER (expf_advsimd, _ZGVnN4v_expf)
VPCS_VECTOR_WRAPPER (exp10f_advsimd, _ZGVnN4v_exp10f)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index b4ec9f777b..dddd4cb213 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -34,6 +34,7 @@
SVE_VECTOR_WRAPPER (acosf_sve, _ZGVsMxv_acosf)
SVE_VECTOR_WRAPPER (asinf_sve, _ZGVsMxv_asinf)
+SVE_VECTOR_WRAPPER (atanf_sve, _ZGVsMxv_atanf)
SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf)
SVE_VECTOR_WRAPPER (expf_sve, _ZGVsMxv_expf)
SVE_VECTOR_WRAPPER (exp10f_sve, _ZGVsMxv_exp10f)
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index c2b6f21b9d..24a99e10da 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -121,11 +121,19 @@ double: 1
float: 1
ldouble: 2
+Function: "atan_advsimd":
+double: 1
+float: 1
+
Function: "atan_downward":
double: 1
float: 2
ldouble: 2
+Function: "atan_sve":
+double: 1
+float: 1
+
Function: "atan_towardzero":
double: 1
float: 1
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index f79eaaf241..a2d1b8fb6d 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -16,6 +16,7 @@ GLIBC_2.38 _ZGVsMxv_sin F
GLIBC_2.38 _ZGVsMxv_sinf F
GLIBC_2.39 _ZGVnN2v_acos F
GLIBC_2.39 _ZGVnN2v_asin F
+GLIBC_2.39 _ZGVnN2v_atan F
GLIBC_2.39 _ZGVnN2v_exp10 F
GLIBC_2.39 _ZGVnN2v_exp2 F
GLIBC_2.39 _ZGVnN2v_log10 F
@@ -23,6 +24,7 @@ GLIBC_2.39 _ZGVnN2v_log2 F
GLIBC_2.39 _ZGVnN2v_tan F
GLIBC_2.39 _ZGVnN4v_acosf F
GLIBC_2.39 _ZGVnN4v_asinf F
+GLIBC_2.39 _ZGVnN4v_atanf F
GLIBC_2.39 _ZGVnN4v_exp10f F
GLIBC_2.39 _ZGVnN4v_exp2f F
GLIBC_2.39 _ZGVnN4v_log10f F
@@ -32,6 +34,8 @@ GLIBC_2.39 _ZGVsMxv_acos F
GLIBC_2.39 _ZGVsMxv_acosf F
GLIBC_2.39 _ZGVsMxv_asin F
GLIBC_2.39 _ZGVsMxv_asinf F
+GLIBC_2.39 _ZGVsMxv_atan F
+GLIBC_2.39 _ZGVsMxv_atanf F
GLIBC_2.39 _ZGVsMxv_exp10 F
GLIBC_2.39 _ZGVsMxv_exp10f F
GLIBC_2.39 _ZGVsMxv_exp2 F
--
2.27.0
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH 4/6] aarch64: Add vector implementations of atan2 routines
2023-11-03 12:12 [PATCH 1/6] aarch64: Add vector implementations of asin routines Joe Ramsay
2023-11-03 12:12 ` [PATCH 2/6] aarch64: Add vector implementations of acos routines Joe Ramsay
2023-11-03 12:12 ` [PATCH 3/6] aarch64: Add vector implementations of atan routines Joe Ramsay
@ 2023-11-03 12:12 ` Joe Ramsay
2023-11-10 18:01 ` Szabolcs Nagy
2023-11-03 12:12 ` [PATCH 5/6] aarch64: Add vector implementations of log1p routines Joe Ramsay
` (2 subsequent siblings)
5 siblings, 1 reply; 14+ messages in thread
From: Joe Ramsay @ 2023-11-03 12:12 UTC (permalink / raw)
To: libc-alpha; +Cc: Joe Ramsay
---
Thanks,
Joe
sysdeps/aarch64/fpu/Makefile | 1 +
sysdeps/aarch64/fpu/Versions | 4 +
sysdeps/aarch64/fpu/atan2_advsimd.c | 121 ++++++++++++++++++
sysdeps/aarch64/fpu/atan2_sve.c | 118 +++++++++++++++++
sysdeps/aarch64/fpu/atan2f_advsimd.c | 116 +++++++++++++++++
sysdeps/aarch64/fpu/atan2f_sve.c | 110 ++++++++++++++++
sysdeps/aarch64/fpu/bits/math-vector.h | 4 +
.../fpu/test-double-advsimd-wrappers.c | 1 +
.../aarch64/fpu/test-double-sve-wrappers.c | 11 ++
.../aarch64/fpu/test-float-advsimd-wrappers.c | 1 +
sysdeps/aarch64/fpu/test-float-sve-wrappers.c | 11 ++
.../aarch64/fpu/test-vpcs-vector-wrapper.h | 14 ++
sysdeps/aarch64/fpu/vecmath_config.h | 11 ++
sysdeps/aarch64/libm-test-ulps | 8 ++
.../unix/sysv/linux/aarch64/libmvec.abilist | 4 +
15 files changed, 535 insertions(+)
create mode 100644 sysdeps/aarch64/fpu/atan2_advsimd.c
create mode 100644 sysdeps/aarch64/fpu/atan2_sve.c
create mode 100644 sysdeps/aarch64/fpu/atan2f_advsimd.c
create mode 100644 sysdeps/aarch64/fpu/atan2f_sve.c
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index 5bd77a749d..364efbeac1 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -1,6 +1,7 @@
libmvec-supported-funcs = acos \
asin \
atan \
+ atan2 \
cos \
exp \
exp10 \
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index dfc3d2dad3..99492b3d33 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -30,6 +30,10 @@ libmvec {
_ZGVnN2v_atan;
_ZGVsMxv_atanf;
_ZGVsMxv_atan;
+ _ZGVnN4vv_atan2f;
+ _ZGVnN2vv_atan2;
+ _ZGVsMxvv_atan2f;
+ _ZGVsMxvv_atan2;
_ZGVnN4v_exp10f;
_ZGVnN2v_exp10;
_ZGVsMxv_exp10f;
diff --git a/sysdeps/aarch64/fpu/atan2_advsimd.c b/sysdeps/aarch64/fpu/atan2_advsimd.c
new file mode 100644
index 0000000000..fcc6be0d6c
--- /dev/null
+++ b/sysdeps/aarch64/fpu/atan2_advsimd.c
@@ -0,0 +1,121 @@
+/* Double-precision AdvSIMD atan2
+
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "poly_advsimd_f64.h"
+
+static const struct data
+{
+ float64x2_t pi_over_2;
+ float64x2_t poly[20];
+} data = {
+ /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
+ the interval [2**-1022, 1.0]. */
+ .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3),
+ V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4),
+ V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4),
+ V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5),
+ V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5),
+ V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5),
+ V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6),
+ V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7),
+ V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10),
+ V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), },
+ .pi_over_2 = V2 (0x1.921fb54442d18p+0),
+};
+
+#define SignMask v_u64 (0x8000000000000000)
+
+/* Special cases i.e. 0, infinity, NaN (fall back to scalar calls). */
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t y, float64x2_t x, float64x2_t ret, uint64x2_t cmp)
+{
+ return v_call2_f64 (atan2, y, x, ret, cmp);
+}
+
+/* Returns 1 if input is the bit representation of 0, infinity or nan. */
+static inline uint64x2_t
+zeroinfnan (uint64x2_t i)
+{
+ /* (2 * i - 1) >= (2 * asuint64 (INFINITY) - 1). */
+ return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)),
+ v_u64 (2 * asuint64 (INFINITY) - 1));
+}
+
+/* Fast implementation of vector atan2.
+ Maximum observed error is 2.8 ulps:
+ _ZGVnN2vv_atan2 (0x1.9651a429a859ap+5, 0x1.953075f4ee26p+5)
+ got 0x1.92d628ab678ccp-1
+ want 0x1.92d628ab678cfp-1. */
+float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x)
+{
+ const struct data *data_ptr = ptr_barrier (&data);
+
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ uint64x2_t iy = vreinterpretq_u64_f64 (y);
+
+ uint64x2_t special_cases = vorrq_u64 (zeroinfnan (ix), zeroinfnan (iy));
+
+ uint64x2_t sign_x = vandq_u64 (ix, SignMask);
+ uint64x2_t sign_y = vandq_u64 (iy, SignMask);
+ uint64x2_t sign_xy = veorq_u64 (sign_x, sign_y);
+
+ float64x2_t ax = vabsq_f64 (x);
+ float64x2_t ay = vabsq_f64 (y);
+
+ uint64x2_t pred_xlt0 = vcltzq_f64 (x);
+ uint64x2_t pred_aygtax = vcgtq_f64 (ay, ax);
+
+ /* Set up z for call to atan. */
+ float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay);
+ float64x2_t d = vbslq_f64 (pred_aygtax, ay, ax);
+ float64x2_t z = vdivq_f64 (n, d);
+
+ /* Work out the correct shift. */
+ float64x2_t shift = vreinterpretq_f64_u64 (
+ vandq_u64 (pred_xlt0, vreinterpretq_u64_f64 (v_f64 (-2.0))));
+ shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift);
+ shift = vmulq_f64 (shift, data_ptr->pi_over_2);
+
+ /* Calculate the polynomial approximation.
+ Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of
+ full scheme to avoid underflow in x^16.
+ The order 19 polynomial P approximates
+ (atan(sqrt(x))-sqrt(x))/x^(3/2). */
+ float64x2_t z2 = vmulq_f64 (z, z);
+ float64x2_t x2 = vmulq_f64 (z2, z2);
+ float64x2_t x4 = vmulq_f64 (x2, x2);
+ float64x2_t x8 = vmulq_f64 (x4, x4);
+ float64x2_t ret
+ = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, data_ptr->poly),
+ v_estrin_11_f64 (z2, x2, x4, x8, data_ptr->poly + 8), x8);
+
+ /* Finalize. y = shift + z + z^3 * P(z^2). */
+ ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z));
+ ret = vaddq_f64 (ret, shift);
+
+ /* Account for the sign of x and y. */
+ ret = vreinterpretq_f64_u64 (
+ veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy));
+
+ if (__glibc_unlikely (v_any_u64 (special_cases)))
+ return special_case (y, x, ret, special_cases);
+
+ return ret;
+}
diff --git a/sysdeps/aarch64/fpu/atan2_sve.c b/sysdeps/aarch64/fpu/atan2_sve.c
new file mode 100644
index 0000000000..6dbc2f3769
--- /dev/null
+++ b/sysdeps/aarch64/fpu/atan2_sve.c
@@ -0,0 +1,118 @@
+/* Double-precision SVE atan2
+
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+
+static const struct data
+{
+ float64_t poly[20];
+ float64_t pi_over_2;
+} data = {
+ /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
+ [2**-1022, 1.0]. */
+ .poly = { -0x1.5555555555555p-2, 0x1.99999999996c1p-3, -0x1.2492492478f88p-3,
+ 0x1.c71c71bc3951cp-4, -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4,
+ -0x1.11100ee084227p-4, 0x1.e1d0f9696f63bp-5, -0x1.aebfe7b418581p-5,
+ 0x1.842dbe9b0d916p-5, -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5,
+ -0x1.00e6eece7de8p-5, 0x1.860897b29e5efp-6, -0x1.0051381722a59p-6,
+ 0x1.14e9dc19a4a4ep-7, -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10,
+ -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16, },
+ .pi_over_2 = 0x1.921fb54442d18p+0,
+};
+
+/* Useful constants. */
+#define SignMask sv_u64 (0x8000000000000000)
+
+/* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */
+static svfloat64_t NOINLINE
+special_case (svfloat64_t y, svfloat64_t x, svfloat64_t ret,
+ const svbool_t cmp)
+{
+ return sv_call2_f64 (atan2, y, x, ret, cmp);
+}
+
+/* Returns a predicate indicating true if the input is the bit representation
+ of 0, infinity or nan. */
+static inline svbool_t
+zeroinfnan (svuint64_t i, const svbool_t pg)
+{
+ return svcmpge (pg, svsub_x (pg, svlsl_x (pg, i, 1), 1),
+ sv_u64 (2 * asuint64 (INFINITY) - 1));
+}
+
+/* Fast implementation of SVE atan2. Errors are greatest when y and
+ x are reasonably close together. The greatest observed error is 2.28 ULP:
+ _ZGVsMxvv_atan2 (-0x1.5915b1498e82fp+732, 0x1.54d11ef838826p+732)
+ got -0x1.954f42f1fa841p-1 want -0x1.954f42f1fa843p-1. */
+svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg)
+{
+ const struct data *data_ptr = ptr_barrier (&data);
+
+ svuint64_t ix = svreinterpret_u64 (x);
+ svuint64_t iy = svreinterpret_u64 (y);
+
+ svbool_t cmp_x = zeroinfnan (ix, pg);
+ svbool_t cmp_y = zeroinfnan (iy, pg);
+ svbool_t cmp_xy = svorr_z (pg, cmp_x, cmp_y);
+
+ svuint64_t sign_x = svand_x (pg, ix, SignMask);
+ svuint64_t sign_y = svand_x (pg, iy, SignMask);
+ svuint64_t sign_xy = sveor_x (pg, sign_x, sign_y);
+
+ svfloat64_t ax = svabs_x (pg, x);
+ svfloat64_t ay = svabs_x (pg, y);
+
+ svbool_t pred_xlt0 = svcmplt (pg, x, 0.0);
+ svbool_t pred_aygtax = svcmpgt (pg, ay, ax);
+
+ /* Set up z for call to atan. */
+ svfloat64_t n = svsel (pred_aygtax, svneg_x (pg, ax), ay);
+ svfloat64_t d = svsel (pred_aygtax, ay, ax);
+ svfloat64_t z = svdiv_x (pg, n, d);
+
+ /* Work out the correct shift. */
+ svfloat64_t shift = svsel (pred_xlt0, sv_f64 (-2.0), sv_f64 (0.0));
+ shift = svsel (pred_aygtax, svadd_x (pg, shift, 1.0), shift);
+ shift = svmul_x (pg, shift, data_ptr->pi_over_2);
+
+ /* Use split Estrin scheme for P(z^2) with deg(P)=19. */
+ svfloat64_t z2 = svmul_x (pg, z, z);
+ svfloat64_t x2 = svmul_x (pg, z2, z2);
+ svfloat64_t x4 = svmul_x (pg, x2, x2);
+ svfloat64_t x8 = svmul_x (pg, x4, x4);
+
+ svfloat64_t ret = svmla_x (
+ pg, sv_estrin_7_f64_x (pg, z2, x2, x4, data_ptr->poly),
+ sv_estrin_11_f64_x (pg, z2, x2, x4, x8, data_ptr->poly + 8), x8);
+
+ /* y = shift + z + z^3 * P(z^2). */
+ svfloat64_t z3 = svmul_x (pg, z2, z);
+ ret = svmla_x (pg, z, z3, ret);
+
+ ret = svadd_m (pg, ret, shift);
+
+ /* Account for the sign of x and y. */
+ ret = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ret), sign_xy));
+
+ if (__glibc_unlikely (svptest_any (pg, cmp_xy)))
+ return special_case (y, x, ret, cmp_xy);
+
+ return ret;
+}
diff --git a/sysdeps/aarch64/fpu/atan2f_advsimd.c b/sysdeps/aarch64/fpu/atan2f_advsimd.c
new file mode 100644
index 0000000000..5a5a6202d1
--- /dev/null
+++ b/sysdeps/aarch64/fpu/atan2f_advsimd.c
@@ -0,0 +1,116 @@
+/* Single-precision AdvSIMD atan2
+
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "poly_advsimd_f32.h"
+
+static const struct data
+{
+ float32x4_t poly[8];
+ float32x4_t pi_over_2;
+} data = {
+ /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
+ [2**-128, 1.0].
+ Generated using fpminimax between FLT_MIN and 1. */
+ .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f),
+ V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f),
+ V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) },
+ .pi_over_2 = V4 (0x1.921fb6p+0f),
+};
+
+#define SignMask v_u32 (0x80000000)
+
+/* Special cases i.e. 0, infinity and nan (fall back to scalar calls). */
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t y, float32x4_t x, float32x4_t ret, uint32x4_t cmp)
+{
+ return v_call2_f32 (atan2f, y, x, ret, cmp);
+}
+
+/* Returns 1 if input is the bit representation of 0, infinity or nan. */
+static inline uint32x4_t
+zeroinfnan (uint32x4_t i)
+{
+ /* 2 * i - 1 >= 2 * 0x7f800000lu - 1. */
+ return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)),
+ v_u32 (2 * 0x7f800000lu - 1));
+}
+
+/* Fast implementation of vector atan2f. Maximum observed error is
+ 2.95 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]:
+ _ZGVnN4vv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1
+ want 0x1.967f00p-1. */
+float32x4_t VPCS_ATTR V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
+{
+ const struct data *data_ptr = ptr_barrier (&data);
+
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ uint32x4_t iy = vreinterpretq_u32_f32 (y);
+
+ uint32x4_t special_cases = vorrq_u32 (zeroinfnan (ix), zeroinfnan (iy));
+
+ uint32x4_t sign_x = vandq_u32 (ix, SignMask);
+ uint32x4_t sign_y = vandq_u32 (iy, SignMask);
+ uint32x4_t sign_xy = veorq_u32 (sign_x, sign_y);
+
+ float32x4_t ax = vabsq_f32 (x);
+ float32x4_t ay = vabsq_f32 (y);
+
+ uint32x4_t pred_xlt0 = vcltzq_f32 (x);
+ uint32x4_t pred_aygtax = vcgtq_f32 (ay, ax);
+
+ /* Set up z for call to atanf. */
+ float32x4_t n = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay);
+ float32x4_t d = vbslq_f32 (pred_aygtax, ay, ax);
+ float32x4_t z = vdivq_f32 (n, d);
+
+ /* Work out the correct shift. */
+ float32x4_t shift = vreinterpretq_f32_u32 (
+ vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-2.0f))));
+ shift = vbslq_f32 (pred_aygtax, vaddq_f32 (shift, v_f32 (1.0f)), shift);
+ shift = vmulq_f32 (shift, data_ptr->pi_over_2);
+
+ /* Calculate the polynomial approximation.
+ Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However,
+ a standard implementation using z8 creates spurious underflow
+ in the very last fma (when z^8 is small enough).
+ Therefore, we split the last fma into a mul and an fma.
+ Horner and single-level Estrin have higher errors that exceed
+ threshold. */
+ float32x4_t z2 = vmulq_f32 (z, z);
+ float32x4_t z4 = vmulq_f32 (z2, z2);
+
+ float32x4_t ret = vfmaq_f32 (
+ v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly), z4,
+ vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly + 4)));
+
+ /* y = shift + z * P(z^2). */
+ ret = vaddq_f32 (vfmaq_f32 (z, ret, vmulq_f32 (z2, z)), shift);
+
+  /* Account for the sign of x and y. */
+ ret = vreinterpretq_f32_u32 (
+ veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
+
+ if (__glibc_unlikely (v_any_u32 (special_cases)))
+ {
+ return special_case (y, x, ret, special_cases);
+ }
+
+ return ret;
+}
diff --git a/sysdeps/aarch64/fpu/atan2f_sve.c b/sysdeps/aarch64/fpu/atan2f_sve.c
new file mode 100644
index 0000000000..606a62c144
--- /dev/null
+++ b/sysdeps/aarch64/fpu/atan2f_sve.c
@@ -0,0 +1,110 @@
+/* Single-precision SVE atan2
+
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+#include "poly_sve_f32.h"
+
+static const struct data
+{
+ float32_t poly[8];
+ float32_t pi_over_2;
+} data = {
+ /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
+ [2**-128, 1.0]. */
+ .poly = { -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f,
+ -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, 0x1.01fd88p-8f },
+ .pi_over_2 = 0x1.921fb6p+0f,
+};
+
+#define SignMask sv_u32 (0x80000000)
+
+/* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */
+static inline svfloat32_t
+special_case (svfloat32_t y, svfloat32_t x, svfloat32_t ret,
+ const svbool_t cmp)
+{
+ return sv_call2_f32 (atan2f, y, x, ret, cmp);
+}
+
+/* Returns a predicate indicating true if the input is the bit representation
+ of 0, infinity or nan. */
+static inline svbool_t
+zeroinfnan (svuint32_t i, const svbool_t pg)
+{
+ return svcmpge (pg, svsub_x (pg, svlsl_x (pg, i, 1), 1),
+ sv_u32 (2 * 0x7f800000lu - 1));
+}
+
+/* Fast implementation of SVE atan2f based on atan(x) ~ shift + z + z^3 *
+ P(z^2) with reduction to [0,1] using z=1/x and shift = pi/2. Maximum
+ observed error is 2.95 ULP:
+ _ZGVsMxvv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1
+ want 0x1.967f00p-1. */
+svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg)
+{
+ const struct data *data_ptr = ptr_barrier (&data);
+
+ svuint32_t ix = svreinterpret_u32 (x);
+ svuint32_t iy = svreinterpret_u32 (y);
+
+ svbool_t cmp_x = zeroinfnan (ix, pg);
+ svbool_t cmp_y = zeroinfnan (iy, pg);
+ svbool_t cmp_xy = svorr_z (pg, cmp_x, cmp_y);
+
+ svuint32_t sign_x = svand_x (pg, ix, SignMask);
+ svuint32_t sign_y = svand_x (pg, iy, SignMask);
+ svuint32_t sign_xy = sveor_x (pg, sign_x, sign_y);
+
+ svfloat32_t ax = svabs_x (pg, x);
+ svfloat32_t ay = svabs_x (pg, y);
+
+ svbool_t pred_xlt0 = svcmplt (pg, x, 0.0);
+ svbool_t pred_aygtax = svcmpgt (pg, ay, ax);
+
+ /* Set up z for call to atan. */
+ svfloat32_t n = svsel (pred_aygtax, svneg_x (pg, ax), ay);
+ svfloat32_t d = svsel (pred_aygtax, ay, ax);
+ svfloat32_t z = svdiv_x (pg, n, d);
+
+ /* Work out the correct shift. */
+ svfloat32_t shift = svsel (pred_xlt0, sv_f32 (-2.0), sv_f32 (0.0));
+ shift = svsel (pred_aygtax, svadd_x (pg, shift, 1.0), shift);
+ shift = svmul_x (pg, shift, sv_f32 (data_ptr->pi_over_2));
+
+ /* Use split Estrin scheme for P(z^2) with deg(P)=7. */
+ svfloat32_t z2 = svmul_x (pg, z, z);
+ svfloat32_t z4 = svmul_x (pg, z2, z2);
+ svfloat32_t z8 = svmul_x (pg, z4, z4);
+
+ svfloat32_t ret = sv_estrin_7_f32_x (pg, z2, z4, z8, data_ptr->poly);
+
+ /* ret = shift + z + z^3 * P(z^2). */
+ svfloat32_t z3 = svmul_x (pg, z2, z);
+ ret = svmla_x (pg, z, z3, ret);
+
+ ret = svadd_m (pg, ret, shift);
+
+ /* Account for the sign of x and y. */
+ ret = svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy));
+
+ if (__glibc_unlikely (svptest_any (pg, cmp_xy)))
+ return special_case (y, x, ret, cmp_xy);
+
+ return ret;
+}
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index 37aa74fe50..7666c09083 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -49,6 +49,7 @@ typedef __SVBool_t __sv_bool_t;
# define __vpcs __attribute__ ((__aarch64_vector_pcs__))
+__vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_acosf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t);
@@ -62,6 +63,7 @@ __vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t);
+__vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_acos (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t);
@@ -80,6 +82,7 @@ __vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t);
#ifdef __SVE_VEC_MATH_SUPPORTED
+__sv_f32_t _ZGVsMxvv_atan2f (__sv_f32_t, __sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_acosf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_asinf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_atanf (__sv_f32_t, __sv_bool_t);
@@ -93,6 +96,7 @@ __sv_f32_t _ZGVsMxv_log2f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_tanf (__sv_f32_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxvv_atan2 (__sv_f64_t, __sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_acos (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_asin (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_atan (__sv_f64_t, __sv_bool_t);
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index 6954fe7435..0ac0240171 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -26,6 +26,7 @@
VPCS_VECTOR_WRAPPER (acos_advsimd, _ZGVnN2v_acos)
VPCS_VECTOR_WRAPPER (asin_advsimd, _ZGVnN2v_asin)
VPCS_VECTOR_WRAPPER (atan_advsimd, _ZGVnN2v_atan)
+VPCS_VECTOR_WRAPPER_ff (atan2_advsimd, _ZGVnN2vv_atan2)
VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos)
VPCS_VECTOR_WRAPPER (exp_advsimd, _ZGVnN2v_exp)
VPCS_VECTOR_WRAPPER (exp10_advsimd, _ZGVnN2v_exp10)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index 1173d8f9ae..5bbc4d58c1 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -32,9 +32,20 @@
return svlastb_f64 (svptrue_b64 (), mr); \
}
+#define SVE_VECTOR_WRAPPER_ff(scalar_func, vector_func) \
+ extern VEC_TYPE vector_func (VEC_TYPE, VEC_TYPE, svbool_t); \
+ FLOAT scalar_func (FLOAT x, FLOAT y) \
+ { \
+ VEC_TYPE mx = svdup_n_f64 (x); \
+ VEC_TYPE my = svdup_n_f64 (y); \
+ VEC_TYPE mr = vector_func (mx, my, svptrue_b64 ()); \
+ return svlastb_f64 (svptrue_b64 (), mr); \
+ }
+
SVE_VECTOR_WRAPPER (acos_sve, _ZGVsMxv_acos)
SVE_VECTOR_WRAPPER (asin_sve, _ZGVsMxv_asin)
SVE_VECTOR_WRAPPER (atan_sve, _ZGVsMxv_atan)
+SVE_VECTOR_WRAPPER_ff (atan2_sve, _ZGVsMxvv_atan2)
SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos)
SVE_VECTOR_WRAPPER (exp_sve, _ZGVsMxv_exp)
SVE_VECTOR_WRAPPER (exp10_sve, _ZGVsMxv_exp10)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index 387efc30f8..a557bfc3a6 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -26,6 +26,7 @@
VPCS_VECTOR_WRAPPER (acosf_advsimd, _ZGVnN4v_acosf)
VPCS_VECTOR_WRAPPER (asinf_advsimd, _ZGVnN4v_asinf)
VPCS_VECTOR_WRAPPER (atanf_advsimd, _ZGVnN4v_atanf)
+VPCS_VECTOR_WRAPPER_ff (atan2f_advsimd, _ZGVnN4vv_atan2f)
VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf)
VPCS_VECTOR_WRAPPER (expf_advsimd, _ZGVnN4v_expf)
VPCS_VECTOR_WRAPPER (exp10f_advsimd, _ZGVnN4v_exp10f)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index dddd4cb213..f36939e2c4 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -32,9 +32,20 @@
return svlastb_f32 (svptrue_b32 (), mr); \
}
+#define SVE_VECTOR_WRAPPER_ff(scalar_func, vector_func) \
+ extern VEC_TYPE vector_func (VEC_TYPE, VEC_TYPE, svbool_t); \
+ FLOAT scalar_func (FLOAT x, FLOAT y) \
+ { \
+ VEC_TYPE mx = svdup_n_f32 (x); \
+ VEC_TYPE my = svdup_n_f32 (y); \
+ VEC_TYPE mr = vector_func (mx, my, svptrue_b32 ()); \
+ return svlastb_f32 (svptrue_b32 (), mr); \
+ }
+
SVE_VECTOR_WRAPPER (acosf_sve, _ZGVsMxv_acosf)
SVE_VECTOR_WRAPPER (asinf_sve, _ZGVsMxv_asinf)
SVE_VECTOR_WRAPPER (atanf_sve, _ZGVsMxv_atanf)
+SVE_VECTOR_WRAPPER_ff (atan2f_sve, _ZGVsMxvv_atan2f)
SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf)
SVE_VECTOR_WRAPPER (expf_sve, _ZGVsMxv_expf)
SVE_VECTOR_WRAPPER (exp10f_sve, _ZGVsMxv_exp10f)
diff --git a/sysdeps/aarch64/fpu/test-vpcs-vector-wrapper.h b/sysdeps/aarch64/fpu/test-vpcs-vector-wrapper.h
index f8e6a3fb9d..9551a9ea6f 100644
--- a/sysdeps/aarch64/fpu/test-vpcs-vector-wrapper.h
+++ b/sysdeps/aarch64/fpu/test-vpcs-vector-wrapper.h
@@ -29,3 +29,17 @@
TEST_VEC_LOOP (mr, VEC_LEN); \
return ((FLOAT) mr[0]); \
}
+
+#define VPCS_VECTOR_WRAPPER_ff(scalar_func, vector_func) \
+ extern __attribute__ ((aarch64_vector_pcs)) \
+ VEC_TYPE vector_func (VEC_TYPE, VEC_TYPE); \
+ FLOAT scalar_func (FLOAT x, FLOAT y) \
+ { \
+ int i; \
+ VEC_TYPE mx, my; \
+ INIT_VEC_LOOP (mx, x, VEC_LEN); \
+ INIT_VEC_LOOP (my, y, VEC_LEN); \
+ VEC_TYPE mr = vector_func (mx, my); \
+ TEST_VEC_LOOP (mr, VEC_LEN); \
+ return ((FLOAT) mr[0]); \
+ }
diff --git a/sysdeps/aarch64/fpu/vecmath_config.h b/sysdeps/aarch64/fpu/vecmath_config.h
index 2c8e243236..0e631fbdd5 100644
--- a/sysdeps/aarch64/fpu/vecmath_config.h
+++ b/sysdeps/aarch64/fpu/vecmath_config.h
@@ -35,6 +35,17 @@
__ptr; \
})
+static inline uint64_t
+asuint64 (double f)
+{
+ union
+ {
+ double f;
+ uint64_t i;
+ } u = { f };
+ return u.i;
+}
+
#define V_LOG_POLY_ORDER 6
#define V_LOG_TABLE_BITS 7
extern const struct v_log_data
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index 24a99e10da..e0699c44d8 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -106,11 +106,19 @@ Function: "atan2":
float: 1
ldouble: 2
+Function: "atan2_advsimd":
+double: 1
+float: 2
+
Function: "atan2_downward":
double: 1
float: 2
ldouble: 2
+Function: "atan2_sve":
+double: 1
+float: 2
+
Function: "atan2_towardzero":
double: 1
float: 2
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index a2d1b8fb6d..7961a2f374 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -22,6 +22,7 @@ GLIBC_2.39 _ZGVnN2v_exp2 F
GLIBC_2.39 _ZGVnN2v_log10 F
GLIBC_2.39 _ZGVnN2v_log2 F
GLIBC_2.39 _ZGVnN2v_tan F
+GLIBC_2.39 _ZGVnN2vv_atan2 F
GLIBC_2.39 _ZGVnN4v_acosf F
GLIBC_2.39 _ZGVnN4v_asinf F
GLIBC_2.39 _ZGVnN4v_atanf F
@@ -30,6 +31,7 @@ GLIBC_2.39 _ZGVnN4v_exp2f F
GLIBC_2.39 _ZGVnN4v_log10f F
GLIBC_2.39 _ZGVnN4v_log2f F
GLIBC_2.39 _ZGVnN4v_tanf F
+GLIBC_2.39 _ZGVnN4vv_atan2f F
GLIBC_2.39 _ZGVsMxv_acos F
GLIBC_2.39 _ZGVsMxv_acosf F
GLIBC_2.39 _ZGVsMxv_asin F
@@ -46,3 +48,5 @@ GLIBC_2.39 _ZGVsMxv_log2 F
GLIBC_2.39 _ZGVsMxv_log2f F
GLIBC_2.39 _ZGVsMxv_tan F
GLIBC_2.39 _ZGVsMxv_tanf F
+GLIBC_2.39 _ZGVsMxvv_atan2 F
+GLIBC_2.39 _ZGVsMxvv_atan2f F
--
2.27.0
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH 5/6] aarch64: Add vector implementations of log1p routines
2023-11-03 12:12 [PATCH 1/6] aarch64: Add vector implementations of asin routines Joe Ramsay
` (2 preceding siblings ...)
2023-11-03 12:12 ` [PATCH 4/6] aarch64: Add vector implementations of atan2 routines Joe Ramsay
@ 2023-11-03 12:12 ` Joe Ramsay
2023-11-10 18:06 ` Szabolcs Nagy
2023-11-03 12:12 ` [PATCH 6/6] aarch64: Add vector implementations of expm1 routines Joe Ramsay
2023-11-10 17:51 ` [PATCH 1/6] aarch64: Add vector implementations of asin routines Szabolcs Nagy
5 siblings, 1 reply; 14+ messages in thread
From: Joe Ramsay @ 2023-11-03 12:12 UTC (permalink / raw)
To: libc-alpha; +Cc: Joe Ramsay
The vector log1p routines may discard the sign of zero (returning +0 for an input of -0), so mark the log1p (-0) testcase as no-mathvec.
---
Thanks,
Joe
math/auto-libm-test-in | 2 +-
math/auto-libm-test-out-log1p | 50 +++----
sysdeps/aarch64/fpu/Makefile | 1 +
sysdeps/aarch64/fpu/Versions | 4 +
sysdeps/aarch64/fpu/bits/math-vector.h | 4 +
sysdeps/aarch64/fpu/log1p_advsimd.c | 129 ++++++++++++++++++
sysdeps/aarch64/fpu/log1p_sve.c | 118 ++++++++++++++++
sysdeps/aarch64/fpu/log1pf_advsimd.c | 128 +++++++++++++++++
sysdeps/aarch64/fpu/log1pf_sve.c | 100 ++++++++++++++
.../fpu/test-double-advsimd-wrappers.c | 1 +
.../aarch64/fpu/test-double-sve-wrappers.c | 1 +
.../aarch64/fpu/test-float-advsimd-wrappers.c | 1 +
sysdeps/aarch64/fpu/test-float-sve-wrappers.c | 1 +
sysdeps/aarch64/libm-test-ulps | 8 ++
.../unix/sysv/linux/aarch64/libmvec.abilist | 4 +
15 files changed, 526 insertions(+), 26 deletions(-)
create mode 100644 sysdeps/aarch64/fpu/log1p_advsimd.c
create mode 100644 sysdeps/aarch64/fpu/log1p_sve.c
create mode 100644 sysdeps/aarch64/fpu/log1pf_advsimd.c
create mode 100644 sysdeps/aarch64/fpu/log1pf_sve.c
diff --git a/math/auto-libm-test-in b/math/auto-libm-test-in
index 70892503d6..a8d6674c98 100644
--- a/math/auto-libm-test-in
+++ b/math/auto-libm-test-in
@@ -6577,7 +6577,7 @@ log10 0xf.bf1b2p-4
log10 0x1.6b5f7ap+96
log1p 0
-log1p -0
+log1p -0 no-mathvec
log1p e-1
log1p -0.25
log1p -0.875
diff --git a/math/auto-libm-test-out-log1p b/math/auto-libm-test-out-log1p
index f7d3b35e6d..f83241f51a 100644
--- a/math/auto-libm-test-out-log1p
+++ b/math/auto-libm-test-out-log1p
@@ -23,31 +23,31 @@ log1p 0
= log1p tonearest ibm128 0x0p+0 : 0x0p+0 : inexact-ok
= log1p towardzero ibm128 0x0p+0 : 0x0p+0 : inexact-ok
= log1p upward ibm128 0x0p+0 : 0x0p+0 : inexact-ok
-log1p -0
-= log1p downward binary32 -0x0p+0 : -0x0p+0 : inexact-ok
-= log1p tonearest binary32 -0x0p+0 : -0x0p+0 : inexact-ok
-= log1p towardzero binary32 -0x0p+0 : -0x0p+0 : inexact-ok
-= log1p upward binary32 -0x0p+0 : -0x0p+0 : inexact-ok
-= log1p downward binary64 -0x0p+0 : -0x0p+0 : inexact-ok
-= log1p tonearest binary64 -0x0p+0 : -0x0p+0 : inexact-ok
-= log1p towardzero binary64 -0x0p+0 : -0x0p+0 : inexact-ok
-= log1p upward binary64 -0x0p+0 : -0x0p+0 : inexact-ok
-= log1p downward intel96 -0x0p+0 : -0x0p+0 : inexact-ok
-= log1p tonearest intel96 -0x0p+0 : -0x0p+0 : inexact-ok
-= log1p towardzero intel96 -0x0p+0 : -0x0p+0 : inexact-ok
-= log1p upward intel96 -0x0p+0 : -0x0p+0 : inexact-ok
-= log1p downward m68k96 -0x0p+0 : -0x0p+0 : inexact-ok
-= log1p tonearest m68k96 -0x0p+0 : -0x0p+0 : inexact-ok
-= log1p towardzero m68k96 -0x0p+0 : -0x0p+0 : inexact-ok
-= log1p upward m68k96 -0x0p+0 : -0x0p+0 : inexact-ok
-= log1p downward binary128 -0x0p+0 : -0x0p+0 : inexact-ok
-= log1p tonearest binary128 -0x0p+0 : -0x0p+0 : inexact-ok
-= log1p towardzero binary128 -0x0p+0 : -0x0p+0 : inexact-ok
-= log1p upward binary128 -0x0p+0 : -0x0p+0 : inexact-ok
-= log1p downward ibm128 -0x0p+0 : -0x0p+0 : inexact-ok
-= log1p tonearest ibm128 -0x0p+0 : -0x0p+0 : inexact-ok
-= log1p towardzero ibm128 -0x0p+0 : -0x0p+0 : inexact-ok
-= log1p upward ibm128 -0x0p+0 : -0x0p+0 : inexact-ok
+log1p -0 no-mathvec
+= log1p downward binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= log1p tonearest binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= log1p towardzero binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= log1p upward binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= log1p downward binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= log1p tonearest binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= log1p towardzero binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= log1p upward binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= log1p downward intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= log1p tonearest intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= log1p towardzero intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= log1p upward intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= log1p downward m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= log1p tonearest m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= log1p towardzero m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= log1p upward m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= log1p downward binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= log1p tonearest binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= log1p towardzero binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= log1p upward binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= log1p downward ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= log1p tonearest ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= log1p towardzero ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= log1p upward ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
log1p e-1
= log1p downward binary32 0x1.b7e152p+0 : 0x1p+0 : inexact-ok
= log1p tonearest binary32 0x1.b7e152p+0 : 0x1p+0 : inexact-ok
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index 364efbeac1..c77c709edd 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -8,6 +8,7 @@ libmvec-supported-funcs = acos \
exp2 \
log \
log10 \
+ log1p \
log2 \
sin \
tan
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index 99492b3d33..2543649fbe 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -46,6 +46,10 @@ libmvec {
_ZGVnN2v_log10;
_ZGVsMxv_log10f;
_ZGVsMxv_log10;
+ _ZGVnN4v_log1pf;
+ _ZGVnN2v_log1p;
+ _ZGVsMxv_log1pf;
+ _ZGVsMxv_log1p;
_ZGVnN4v_log2f;
_ZGVnN2v_log2;
_ZGVsMxv_log2f;
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index 7666c09083..51915cef22 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -59,6 +59,7 @@ __vpcs __f32x4_t _ZGVnN4v_exp10f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_exp2f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t);
@@ -73,6 +74,7 @@ __vpcs __f64x2_t _ZGVnN2v_exp10 (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_exp2 (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t);
@@ -92,6 +94,7 @@ __sv_f32_t _ZGVsMxv_exp10f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_exp2f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_logf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_log10f (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_log1pf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_log2f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_tanf (__sv_f32_t, __sv_bool_t);
@@ -106,6 +109,7 @@ __sv_f64_t _ZGVsMxv_exp10 (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_exp2 (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_log (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_log10 (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_log1p (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_log2 (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_sin (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_tan (__sv_f64_t, __sv_bool_t);
diff --git a/sysdeps/aarch64/fpu/log1p_advsimd.c b/sysdeps/aarch64/fpu/log1p_advsimd.c
new file mode 100644
index 0000000000..a117e1b6dc
--- /dev/null
+++ b/sysdeps/aarch64/fpu/log1p_advsimd.c
@@ -0,0 +1,129 @@
+/* Double-precision AdvSIMD log1p
+
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "poly_advsimd_f64.h"
+
+const static struct data
+{
+ float64x2_t poly[19], ln2[2];
+ uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask, inf, minus_one;
+ int64x2_t one_top;
+} data = {
+ /* Generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. */
+ .poly = { V2 (-0x1.ffffffffffffbp-2), V2 (0x1.55555555551a9p-2),
+ V2 (-0x1.00000000008e3p-2), V2 (0x1.9999999a32797p-3),
+ V2 (-0x1.555555552fecfp-3), V2 (0x1.249248e071e5ap-3),
+ V2 (-0x1.ffffff8bf8482p-4), V2 (0x1.c71c8f07da57ap-4),
+ V2 (-0x1.9999ca4ccb617p-4), V2 (0x1.7459ad2e1dfa3p-4),
+ V2 (-0x1.554d2680a3ff2p-4), V2 (0x1.3b4c54d487455p-4),
+ V2 (-0x1.2548a9ffe80e6p-4), V2 (0x1.0f389a24b2e07p-4),
+ V2 (-0x1.eee4db15db335p-5), V2 (0x1.e95b494d4a5ddp-5),
+ V2 (-0x1.15fdf07cb7c73p-4), V2 (0x1.0310b70800fcfp-4),
+ V2 (-0x1.cfa7385bdb37ep-6) },
+ /* ln2 split into a high part and a low correction term so that k*ln2 can
+ be accumulated with extra precision. */
+ .ln2 = { V2 (0x1.62e42fefa3800p-1), V2 (0x1.ef35793c76730p-45) },
+ /* top32(asuint64(sqrt(2)/2)) << 32. */
+ .hf_rt2_top = V2 (0x3fe6a09e00000000),
+ /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) << 32. */
+ .one_m_hf_rt2_top = V2 (0x00095f6200000000),
+ /* Mask of the top mantissa bits (bits 32-51) of a double. */
+ .umask = V2 (0x000fffff00000000),
+ /* Double-precision exponent bias. */
+ .one_top = V2 (0x3ff),
+ /* Bit patterns of +Inf and -1.0 respectively. */
+ .inf = V2 (0x7ff0000000000000),
+ .minus_one = V2 (0xbff0000000000000)
+};
+
+#define BottomMask v_u64 (0xffffffff)
+
+/* Fall back to the scalar log1p for lanes flagged in SPECIAL; Y carries the
+ vector result already computed for the remaining lanes. */
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+{
+ return v_call_f64 (log1p, x, y, special);
+}
+
+/* Vector log1p approximation using polynomial on reduced interval. Routine is
+ a modification of the algorithm used in scalar log1p, with no shortcut for
+ k=0 and no narrowing for f and k. Maximum observed error is 2.45 ULP:
+ _ZGVnN2v_log1p(0x1.658f7035c4014p+11) got 0x1.fd61d0727429dp+2
+ want 0x1.fd61d0727429fp+2 . */
+VPCS_ATTR float64x2_t V_NAME_D1 (log1p) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
+ /* Lanes with |x| >= Inf, i.e. +/-Inf or NaN. */
+ uint64x2_t special = vcgeq_u64 (ia, d->inf);
+
+#if WANT_SIMD_EXCEPT
+ /* The unsigned compare with the bit pattern of -1.0 additionally flags all
+ x <= -1 (and negative NaNs). Flagged lanes are zeroed out below so the
+ main computation cannot raise spurious fp exceptions on them. */
+ special = vorrq_u64 (special,
+ vcgeq_u64 (ix, vreinterpretq_u64_f64 (v_f64 (-1))));
+ if (__glibc_unlikely (v_any_u64 (special)))
+ x = v_zerofy_f64 (x, special);
+#else
+ /* Out of domain: log1p is undefined for x < -1 and is -Inf at x == -1. */
+ special = vorrq_u64 (special, vcleq_f64 (x, v_f64 (-1)));
+#endif
+
+ /* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f
+ is in [sqrt(2)/2, sqrt(2)]):
+ log1p(x) = k*log(2) + log1p(f).
+
+ f may not be representable exactly, so we need a correction term:
+ let m = round(1 + x), c = (1 + x) - m.
+ c << m: at very small x, log1p(x) ~ x, hence:
+ log(1+x) - log(m) ~ c/m.
+
+ We therefore calculate log1p(x) by k*log2 + log1p(f) + c/m. */
+
+ /* Obtain correctly scaled k by manipulation in the exponent.
+ The scalar algorithm casts down to 32-bit at this point to calculate k and
+ u_red. We stay in double-width to obtain f and k, using the same constants
+ as the scalar algorithm but shifted left by 32. */
+ float64x2_t m = vaddq_f64 (x, v_f64 (1));
+ uint64x2_t mi = vreinterpretq_u64_f64 (m);
+ uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top);
+
+ int64x2_t ki
+ = vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top);
+ float64x2_t k = vcvtq_f64_s64 (ki);
+
+ /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */
+ uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top);
+ uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask));
+ float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1));
+
+ /* Correction term c/m. */
+ float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1))), m);
+
+ /* Approximate log1p(x) on the reduced input using a polynomial. Because
+ log1p(0)=0 we choose an approximation of the form:
+ x + C0*x^2 + C1*x^3 + C2x^4 + ...
+ Hence approximation has the form f + f^2 * P(f)
+ where P(x) = C0 + C1*x + C2x^2 + ...
+ Assembling this all correctly is dealt with at the final step. */
+ float64x2_t f2 = vmulq_f64 (f, f);
+ float64x2_t p = v_pw_horner_18_f64 (f, f2, d->poly);
+
+ /* Assemble k*ln2 from its hi and lo parts: the lo part and the c/m
+ correction are summed separately from the dominant terms to limit
+ rounding error. */
+ float64x2_t ylo = vfmaq_f64 (cm, k, d->ln2[1]);
+ float64x2_t yhi = vfmaq_f64 (f, k, d->ln2[0]);
+ float64x2_t y = vaddq_f64 (ylo, yhi);
+
+ if (__glibc_unlikely (v_any_u64 (special)))
+ /* Rebuild the original argument from ix - x may have been zeroed above -
+ so the scalar fallback sees the unmodified input. */
+ return special_case (vreinterpretq_f64_u64 (ix), vfmaq_f64 (y, f2, p),
+ special);
+
+ return vfmaq_f64 (y, f2, p);
+}
diff --git a/sysdeps/aarch64/fpu/log1p_sve.c b/sysdeps/aarch64/fpu/log1p_sve.c
new file mode 100644
index 0000000000..169156748d
--- /dev/null
+++ b/sysdeps/aarch64/fpu/log1p_sve.c
@@ -0,0 +1,118 @@
+/* Double-precision SVE log1p
+
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+
+static const struct data
+{
+ double poly[19];
+ double ln2_hi, ln2_lo;
+ uint64_t hfrt2_top, onemhfrt2_top, inf, mone;
+} data = {
+ /* Generated using Remez in [ sqrt(2)/2 - 1, sqrt(2) - 1]. Order 20
+ polynomial, however first 2 coefficients are 0 and 1 so are not stored. */
+ .poly = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2,
+ 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3,
+ -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4,
+ 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4,
+ -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5,
+ 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4,
+ -0x1.cfa7385bdb37ep-6, },
+ /* ln2 split into hi and lo parts for extra-precision k*ln2. */
+ .ln2_hi = 0x1.62e42fefa3800p-1,
+ .ln2_lo = 0x1.ef35793c76730p-45,
+ /* top32(asuint64(sqrt(2)/2)) << 32. */
+ .hfrt2_top = 0x3fe6a09e00000000,
+ /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) << 32. */
+ .onemhfrt2_top = 0x00095f6200000000,
+ /* Bit patterns of +Inf and -1.0 respectively. */
+ .inf = 0x7ff0000000000000,
+ .mone = 0xbff0000000000000,
+};
+
+#define AbsMask 0x7fffffffffffffff
+#define BottomMask 0xffffffff
+
+/* Fall back to the scalar log1p for lanes selected by SPECIAL; Y carries the
+ vector result already computed for the remaining lanes. */
+static svfloat64_t NOINLINE
+special_case (svbool_t special, svfloat64_t x, svfloat64_t y)
+{
+ return sv_call_f64 (log1p, x, y, special);
+}
+
+/* Vector approximation for log1p using polynomial on reduced interval. Maximum
+ observed error is 2.46 ULP:
+ _ZGVsMxv_log1p(0x1.654a1307242a4p+11) got 0x1.fd5565fb590f4p+2
+ want 0x1.fd5565fb590f6p+2. */
+svfloat64_t SV_NAME_D1 (log1p) (svfloat64_t x, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+ svuint64_t ix = svreinterpret_u64 (x);
+ svuint64_t ax = svand_x (pg, ix, AbsMask);
+ /* Special lanes: |x| >= Inf (Inf or NaN), or - via the unsigned compare
+ with the bit pattern of -1.0 - all x <= -1 where log1p is undefined. */
+ svbool_t special
+ = svorr_z (pg, svcmpge (pg, ax, d->inf), svcmpge (pg, ix, d->mone));
+
+ /* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f
+ is in [sqrt(2)/2, sqrt(2)]):
+ log1p(x) = k*log(2) + log1p(f).
+
+ f may not be representable exactly, so we need a correction term:
+ let m = round(1 + x), c = (1 + x) - m.
+ c << m: at very small x, log1p(x) ~ x, hence:
+ log(1+x) - log(m) ~ c/m.
+
+ We therefore calculate log1p(x) by k*log2 + log1p(f) + c/m. */
+
+ /* Obtain correctly scaled k by manipulation in the exponent.
+ The scalar algorithm casts down to 32-bit at this point to calculate k and
+ u_red. We stay in double-width to obtain f and k, using the same constants
+ as the scalar algorithm but shifted left by 32. */
+ svfloat64_t m = svadd_x (pg, x, 1);
+ svuint64_t mi = svreinterpret_u64 (m);
+ svuint64_t u = svadd_x (pg, mi, d->onemhfrt2_top);
+
+ svint64_t ki = svsub_x (pg, svreinterpret_s64 (svlsr_x (pg, u, 52)), 0x3ff);
+ svfloat64_t k = svcvt_f64_x (pg, ki);
+
+ /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */
+ svuint64_t utop
+ = svadd_x (pg, svand_x (pg, u, 0x000fffff00000000), d->hfrt2_top);
+ svuint64_t u_red = svorr_x (pg, utop, svand_x (pg, mi, BottomMask));
+ svfloat64_t f = svsub_x (pg, svreinterpret_f64 (u_red), 1);
+
+ /* Correction term c/m. */
+ svfloat64_t cm = svdiv_x (pg, svsub_x (pg, x, svsub_x (pg, m, 1)), m);
+
+ /* Approximate log1p(x) on the reduced input using a polynomial. Because
+ log1p(0)=0 we choose an approximation of the form:
+ x + C0*x^2 + C1*x^3 + C2x^4 + ...
+ Hence approximation has the form f + f^2 * P(f)
+ where P(x) = C0 + C1*x + C2x^2 + ...
+ Assembling this all correctly is dealt with at the final step. */
+ svfloat64_t f2 = svmul_x (pg, f, f), f4 = svmul_x (pg, f2, f2),
+ f8 = svmul_x (pg, f4, f4), f16 = svmul_x (pg, f8, f8);
+ svfloat64_t p = sv_estrin_18_f64_x (pg, f, f2, f4, f8, f16, d->poly);
+
+ /* Assemble k*ln2 from hi/lo parts; the lo part and the c/m correction are
+ accumulated separately from the dominant terms to limit rounding error. */
+ svfloat64_t ylo = svmla_x (pg, cm, k, d->ln2_lo);
+ svfloat64_t yhi = svmla_x (pg, f, k, d->ln2_hi);
+ svfloat64_t y = svmla_x (pg, svadd_x (pg, ylo, yhi), f2, p);
+
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (special, x, y);
+
+ return y;
+}
diff --git a/sysdeps/aarch64/fpu/log1pf_advsimd.c b/sysdeps/aarch64/fpu/log1pf_advsimd.c
new file mode 100644
index 0000000000..3748830de8
--- /dev/null
+++ b/sysdeps/aarch64/fpu/log1pf_advsimd.c
@@ -0,0 +1,128 @@
+/* Single-precision AdvSIMD log1p
+
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "poly_advsimd_f32.h"
+
+const static struct data
+{
+ float32x4_t poly[8], ln2;
+ uint32x4_t tiny_bound, minus_one, four, thresh;
+ int32x4_t three_quarters;
+} data = {
+ .poly = { /* Generated using FPMinimax in [-0.25, 0.5]. First two coefficients
+ (1, -0.5) are not stored as they can be generated more
+ efficiently. */
+ V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f),
+ V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f),
+ V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) },
+ .ln2 = V4 (0x1.62e43p-1f),
+ .tiny_bound = V4 (0x34000000), /* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */
+ .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - tiny_bound. */
+ /* Bit patterns of -1.0f, 4.0f and 0.75f respectively. */
+ .minus_one = V4 (0xbf800000),
+ .four = V4 (0x40800000),
+ .three_quarters = V4 (0x3f400000)
+};
+
+/* Evaluate the log(1+m) polynomial on [-0.25, 0.5]. The first two terms
+ (m, -0.5*m^2) are not stored in P: the leading m is folded in via p_02 and
+ -0.5 is materialised directly in p_12. */
+static inline float32x4_t
+eval_poly (float32x4_t m, const float32x4_t *p)
+{
+ /* Approximate log(1+m) on [-0.25, 0.5] using split Estrin scheme. */
+ float32x4_t p_12 = vfmaq_f32 (v_f32 (-0.5), m, p[0]);
+ float32x4_t p_34 = vfmaq_f32 (p[1], m, p[2]);
+ float32x4_t p_56 = vfmaq_f32 (p[3], m, p[4]);
+ float32x4_t p_78 = vfmaq_f32 (p[5], m, p[6]);
+
+ float32x4_t m2 = vmulq_f32 (m, m);
+ float32x4_t p_02 = vfmaq_f32 (m, m2, p_12);
+ float32x4_t p_36 = vfmaq_f32 (p_34, m2, p_56);
+ float32x4_t p_79 = vfmaq_f32 (p_78, m2, p[7]);
+
+ float32x4_t m4 = vmulq_f32 (m2, m2);
+ float32x4_t p_06 = vfmaq_f32 (p_02, m4, p_36);
+ return vfmaq_f32 (p_06, m4, vmulq_f32 (m4, p_79));
+}
+
+/* Fall back to the scalar log1pf for lanes flagged in SPECIAL; Y carries the
+ vector result already computed for the remaining lanes. */
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+ return v_call_f32 (log1pf, x, y, special);
+}
+
+/* Vector log1pf approximation using polynomial on reduced interval. Accuracy
+ is roughly 2.02 ULP:
+ log1pf(0x1.21e13ap-2) got 0x1.fe8028p-3 want 0x1.fe802cp-3. */
+VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
+ /* Flag |x| < 2^-23 or |x| >= Inf with a single biased unsigned compare
+ (|x| - tiny_bound wraps for tiny |x|, so it lands above thresh), and all
+ x <= -1 via the unsigned compare with the bit pattern of -1.0. */
+ uint32x4_t special_cases
+ = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, d->tiny_bound), d->thresh),
+ vcgeq_u32 (ix, d->minus_one));
+ /* Keep the unmodified input for the scalar fallback, since x may be
+ zeroed below. */
+ float32x4_t special_arg = x;
+
+#if WANT_SIMD_EXCEPT
+ if (__glibc_unlikely (v_any_u32 (special_cases)))
+ /* Side-step special lanes so fenv exceptions are not triggered
+ inadvertently. */
+ x = v_zerofy_f32 (x, special_cases);
+#endif
+
+ /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
+ is in [-0.25, 0.5]):
+ log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
+
+ We approximate log1p(m) with a polynomial, then scale by
+ k*log(2). Instead of doing this directly, we use an intermediate
+ scale factor s = 4*k*log(2) to ensure the scale is representable
+ as a normalised fp32 number. */
+
+ float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));
+
+ /* Choose k to scale x to the range [-1/4, 1/2]. k is kept in the sign and
+ exponent field (masked with 0xff800000), i.e. as a multiple of 2^23. */
+ int32x4_t k
+ = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters),
+ v_s32 (0xff800000));
+ uint32x4_t ku = vreinterpretq_u32_s32 (k);
+
+ /* Scale x by exponent manipulation. */
+ float32x4_t m_scale
+ = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
+
+ /* Scale up to ensure that the scale factor is representable as normalised
+ fp32 number, and scale m down accordingly. */
+ float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku));
+ m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));
+
+ /* Evaluate polynomial on the reduced interval. */
+ float32x4_t p = eval_poly (m_scale, d->poly);
+
+ /* The scale factor to be applied back at the end - by multiplying float(k)
+ by 2^-23 we get the unbiased exponent of k. */
+ float32x4_t scale_back = vcvtq_f32_s32 (vshrq_n_s32 (k, 23));
+
+ /* Apply the scaling back. */
+ float32x4_t y = vfmaq_f32 (p, scale_back, d->ln2);
+
+ if (__glibc_unlikely (v_any_u32 (special_cases)))
+ return special_case (special_arg, y, special_cases);
+ return y;
+}
diff --git a/sysdeps/aarch64/fpu/log1pf_sve.c b/sysdeps/aarch64/fpu/log1pf_sve.c
new file mode 100644
index 0000000000..712f62b9ce
--- /dev/null
+++ b/sysdeps/aarch64/fpu/log1pf_sve.c
@@ -0,0 +1,100 @@
+/* Single-precision SVE log1p
+
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+#include "poly_sve_f32.h"
+
+static const struct data
+{
+ float poly[8];
+ float ln2, exp_bias;
+ uint32_t four, three_quarters;
+} data = {.poly = {/* Do not store first term of polynomial, which is -0.5, as
+ this can be fmov-ed directly instead of including it in
+ the main load-and-mla polynomial schedule. */
+ 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f,
+ -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f,
+ 0x1.abcb6p-4f, -0x1.6f0d5ep-5f},
+ .ln2 = 0x1.62e43p-1f,
+ /* 2^-23: converts an exponent-field integer k to the unbiased
+ exponent value. */
+ .exp_bias = 0x1p-23f,
+ /* Bit patterns of 4.0f and 0.75f respectively. */
+ .four = 0x40800000,
+ .three_quarters = 0x3f400000};
+
+#define SignExponentMask 0xff800000
+
+/* Fall back to the scalar log1pf for lanes selected by SPECIAL; Y carries the
+ vector result already computed for the remaining lanes. */
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+{
+ return sv_call_f32 (log1pf, x, y, special);
+}
+
+/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
+ error is 1.27 ULP very close to 0.5.
+ _ZGVsMxv_log1pf(0x1.fffffep-2) got 0x1.9f324p-2
+ want 0x1.9f323ep-2. */
+svfloat32_t SV_NAME_F1 (log1p) (svfloat32_t x, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+ /* Special lanes: x < -1, Inf/NaN. The equality test catches +Inf exactly;
+ ORN with the negation of (x >= -1) adds x < -1, -Inf and NaNs (for which
+ the compare is false). */
+ svbool_t special = svcmpeq (pg, svreinterpret_u32 (x), 0x7f800000);
+ special = svorn_z (pg, special, svcmpge (pg, x, -1));
+
+ /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
+ is in [-0.25, 0.5]):
+ log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
+
+ We approximate log1p(m) with a polynomial, then scale by
+ k*log(2). Instead of doing this directly, we use an intermediate
+ scale factor s = 4*k*log(2) to ensure the scale is representable
+ as a normalised fp32 number. */
+ svfloat32_t m = svadd_x (pg, x, 1);
+
+ /* Choose k to scale x to the range [-1/4, 1/2]. k is kept in the sign and
+ exponent field (masked with SignExponentMask), i.e. as a multiple
+ of 2^23. */
+ svint32_t k
+ = svand_x (pg, svsub_x (pg, svreinterpret_s32 (m), d->three_quarters),
+ sv_s32 (SignExponentMask));
+
+ /* Scale x by exponent manipulation. */
+ svfloat32_t m_scale = svreinterpret_f32 (
+ svsub_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (k)));
+
+ /* Scale up to ensure that the scale factor is representable as normalised
+ fp32 number, and scale m down accordingly. */
+ svfloat32_t s = svreinterpret_f32 (svsubr_x (pg, k, d->four));
+ m_scale = svadd_x (pg, m_scale, svmla_x (pg, sv_f32 (-1), s, 0.25));
+
+ /* Evaluate polynomial on reduced interval. */
+ svfloat32_t ms2 = svmul_x (pg, m_scale, m_scale),
+ ms4 = svmul_x (pg, ms2, ms2);
+ svfloat32_t p = sv_estrin_7_f32_x (pg, m_scale, ms2, ms4, d->poly);
+ /* Combine to m + m^2 * (-0.5 + m * P(m)), restoring the leading m and
+ -0.5*m^2 terms which are not stored in the table. */
+ p = svmad_x (pg, m_scale, p, -0.5);
+ p = svmla_x (pg, m_scale, m_scale, svmul_x (pg, m_scale, p));
+
+ /* The scale factor to be applied back at the end - by multiplying float(k)
+ by 2^-23 we get the unbiased exponent of k. */
+ svfloat32_t scale_back = svmul_x (pg, svcvt_f32_x (pg, k), d->exp_bias);
+
+ /* Apply the scaling back. */
+ svfloat32_t y = svmla_x (pg, p, scale_back, d->ln2);
+
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (x, y, special);
+
+ return y;
+}
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index 0ac0240171..fc9e7aec47 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -33,6 +33,7 @@ VPCS_VECTOR_WRAPPER (exp10_advsimd, _ZGVnN2v_exp10)
VPCS_VECTOR_WRAPPER (exp2_advsimd, _ZGVnN2v_exp2)
VPCS_VECTOR_WRAPPER (log_advsimd, _ZGVnN2v_log)
VPCS_VECTOR_WRAPPER (log10_advsimd, _ZGVnN2v_log10)
+VPCS_VECTOR_WRAPPER (log1p_advsimd, _ZGVnN2v_log1p)
VPCS_VECTOR_WRAPPER (log2_advsimd, _ZGVnN2v_log2)
VPCS_VECTOR_WRAPPER (sin_advsimd, _ZGVnN2v_sin)
VPCS_VECTOR_WRAPPER (tan_advsimd, _ZGVnN2v_tan)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index 5bbc4d58c1..aea589d5fb 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -52,6 +52,7 @@ SVE_VECTOR_WRAPPER (exp10_sve, _ZGVsMxv_exp10)
SVE_VECTOR_WRAPPER (exp2_sve, _ZGVsMxv_exp2)
SVE_VECTOR_WRAPPER (log_sve, _ZGVsMxv_log)
SVE_VECTOR_WRAPPER (log10_sve, _ZGVsMxv_log10)
+SVE_VECTOR_WRAPPER (log1p_sve, _ZGVsMxv_log1p)
SVE_VECTOR_WRAPPER (log2_sve, _ZGVsMxv_log2)
SVE_VECTOR_WRAPPER (sin_sve, _ZGVsMxv_sin)
SVE_VECTOR_WRAPPER (tan_sve, _ZGVsMxv_tan)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index a557bfc3a6..446fd7f538 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -33,6 +33,7 @@ VPCS_VECTOR_WRAPPER (exp10f_advsimd, _ZGVnN4v_exp10f)
VPCS_VECTOR_WRAPPER (exp2f_advsimd, _ZGVnN4v_exp2f)
VPCS_VECTOR_WRAPPER (logf_advsimd, _ZGVnN4v_logf)
VPCS_VECTOR_WRAPPER (log10f_advsimd, _ZGVnN4v_log10f)
+VPCS_VECTOR_WRAPPER (log1pf_advsimd, _ZGVnN4v_log1pf)
VPCS_VECTOR_WRAPPER (log2f_advsimd, _ZGVnN4v_log2f)
VPCS_VECTOR_WRAPPER (sinf_advsimd, _ZGVnN4v_sinf)
VPCS_VECTOR_WRAPPER (tanf_advsimd, _ZGVnN4v_tanf)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index f36939e2c4..ac17f60856 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -52,6 +52,7 @@ SVE_VECTOR_WRAPPER (exp10f_sve, _ZGVsMxv_exp10f)
SVE_VECTOR_WRAPPER (exp2f_sve, _ZGVsMxv_exp2f)
SVE_VECTOR_WRAPPER (logf_sve, _ZGVsMxv_logf)
SVE_VECTOR_WRAPPER (log10f_sve, _ZGVsMxv_log10f)
+SVE_VECTOR_WRAPPER (log1pf_sve, _ZGVsMxv_log1pf)
SVE_VECTOR_WRAPPER (log2f_sve, _ZGVsMxv_log2f)
SVE_VECTOR_WRAPPER (sinf_sve, _ZGVsMxv_sinf)
SVE_VECTOR_WRAPPER (tanf_sve, _ZGVsMxv_tanf)
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index e0699c44d8..a6b2f29a6f 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -1248,11 +1248,19 @@ double: 1
float: 1
ldouble: 3
+Function: "log1p_advsimd":
+double: 1
+float: 1
+
Function: "log1p_downward":
double: 1
float: 2
ldouble: 3
+Function: "log1p_sve":
+double: 1
+float: 1
+
Function: "log1p_towardzero":
double: 2
float: 2
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index 7961a2f374..0f20b5be29 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -20,6 +20,7 @@ GLIBC_2.39 _ZGVnN2v_atan F
GLIBC_2.39 _ZGVnN2v_exp10 F
GLIBC_2.39 _ZGVnN2v_exp2 F
GLIBC_2.39 _ZGVnN2v_log10 F
+GLIBC_2.39 _ZGVnN2v_log1p F
GLIBC_2.39 _ZGVnN2v_log2 F
GLIBC_2.39 _ZGVnN2v_tan F
GLIBC_2.39 _ZGVnN2vv_atan2 F
@@ -29,6 +30,7 @@ GLIBC_2.39 _ZGVnN4v_atanf F
GLIBC_2.39 _ZGVnN4v_exp10f F
GLIBC_2.39 _ZGVnN4v_exp2f F
GLIBC_2.39 _ZGVnN4v_log10f F
+GLIBC_2.39 _ZGVnN4v_log1pf F
GLIBC_2.39 _ZGVnN4v_log2f F
GLIBC_2.39 _ZGVnN4v_tanf F
GLIBC_2.39 _ZGVnN4vv_atan2f F
@@ -44,6 +46,8 @@ GLIBC_2.39 _ZGVsMxv_exp2 F
GLIBC_2.39 _ZGVsMxv_exp2f F
GLIBC_2.39 _ZGVsMxv_log10 F
GLIBC_2.39 _ZGVsMxv_log10f F
+GLIBC_2.39 _ZGVsMxv_log1p F
+GLIBC_2.39 _ZGVsMxv_log1pf F
GLIBC_2.39 _ZGVsMxv_log2 F
GLIBC_2.39 _ZGVsMxv_log2f F
GLIBC_2.39 _ZGVsMxv_tan F
--
2.27.0
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH 6/6] aarch64: Add vector implementations of expm1 routines
2023-11-03 12:12 [PATCH 1/6] aarch64: Add vector implementations of asin routines Joe Ramsay
` (3 preceding siblings ...)
2023-11-03 12:12 ` [PATCH 5/6] aarch64: Add vector implementations of log1p routines Joe Ramsay
@ 2023-11-03 12:12 ` Joe Ramsay
2023-11-06 13:31 ` Adhemerval Zanella Netto
2023-11-10 18:11 ` Szabolcs Nagy
2023-11-10 17:51 ` [PATCH 1/6] aarch64: Add vector implementations of asin routines Szabolcs Nagy
5 siblings, 2 replies; 14+ messages in thread
From: Joe Ramsay @ 2023-11-03 12:12 UTC (permalink / raw)
To: libc-alpha; +Cc: Joe Ramsay
---
Thanks,
Joe
sysdeps/aarch64/fpu/Makefile | 1 +
sysdeps/aarch64/fpu/Versions | 4 +
sysdeps/aarch64/fpu/bits/math-vector.h | 4 +
sysdeps/aarch64/fpu/expm1_advsimd.c | 122 ++++++++++++++++++
sysdeps/aarch64/fpu/expm1_sve.c | 99 ++++++++++++++
sysdeps/aarch64/fpu/expm1f_advsimd.c | 117 +++++++++++++++++
sysdeps/aarch64/fpu/expm1f_sve.c | 99 ++++++++++++++
.../fpu/test-double-advsimd-wrappers.c | 1 +
.../aarch64/fpu/test-double-sve-wrappers.c | 1 +
.../aarch64/fpu/test-float-advsimd-wrappers.c | 1 +
sysdeps/aarch64/fpu/test-float-sve-wrappers.c | 1 +
sysdeps/aarch64/libm-test-ulps | 8 ++
.../unix/sysv/linux/aarch64/libmvec.abilist | 4 +
13 files changed, 462 insertions(+)
create mode 100644 sysdeps/aarch64/fpu/expm1_advsimd.c
create mode 100644 sysdeps/aarch64/fpu/expm1_sve.c
create mode 100644 sysdeps/aarch64/fpu/expm1f_advsimd.c
create mode 100644 sysdeps/aarch64/fpu/expm1f_sve.c
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index c77c709edd..1fe4b52682 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -6,6 +6,7 @@ libmvec-supported-funcs = acos \
exp \
exp10 \
exp2 \
+ expm1 \
log \
log10 \
log1p \
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index 2543649fbe..aaacacaebe 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -42,6 +42,10 @@ libmvec {
_ZGVnN2v_exp2;
_ZGVsMxv_exp2f;
_ZGVsMxv_exp2;
+ _ZGVnN4v_expm1f;
+ _ZGVnN2v_expm1;
+ _ZGVsMxv_expm1f;
+ _ZGVsMxv_expm1;
_ZGVnN4v_log10f;
_ZGVnN2v_log10;
_ZGVsMxv_log10f;
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index 51915cef22..52aad95e3b 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -57,6 +57,7 @@ __vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_exp10f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_exp2f (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_expm1f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t);
@@ -72,6 +73,7 @@ __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_exp10 (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_exp2 (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_expm1 (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t);
@@ -92,6 +94,7 @@ __sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_expf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_exp10f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_exp2f (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_expm1f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_logf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_log10f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_log1pf (__sv_f32_t, __sv_bool_t);
@@ -107,6 +110,7 @@ __sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_exp (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_exp10 (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_exp2 (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_expm1 (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_log (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_log10 (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_log1p (__sv_f64_t, __sv_bool_t);
diff --git a/sysdeps/aarch64/fpu/expm1_advsimd.c b/sysdeps/aarch64/fpu/expm1_advsimd.c
new file mode 100644
index 0000000000..a3aed8e35b
--- /dev/null
+++ b/sysdeps/aarch64/fpu/expm1_advsimd.c
@@ -0,0 +1,122 @@
+/* Double-precision AdvSIMD expm1
+
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "poly_advsimd_f64.h"
+
+static const struct data
+{
+ float64x2_t poly[11];
+ float64x2_t invln2, ln2_lo, ln2_hi, shift;
+ int64x2_t exponent_bias;
+#if WANT_SIMD_EXCEPT
+ uint64x2_t thresh, tiny_bound;
+#else
+ float64x2_t oflow_bound;
+#endif
+} data = {
+ /* Generated using fpminimax, with degree=12 in [-log(2)/2, log(2)/2]. */
+ .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5),
+ V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10),
+ V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16),
+ V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22),
+ V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29) },
+ .invln2 = V2 (0x1.71547652b82fep0),
+ .ln2_hi = V2 (0x1.62e42fefa39efp-1),
+ .ln2_lo = V2 (0x1.abc9e3b39803fp-56),
+ .shift = V2 (0x1.8p52),
+ .exponent_bias = V2 (0x3ff0000000000000),
+#if WANT_SIMD_EXCEPT
+ /* asuint64(oflow_bound) - asuint64(0x1p-51), shifted left by 1 for abs
+ compare. */
+ .thresh = V2 (0x78c56fa6d34b552),
+ /* asuint64(0x1p-51) << 1. */
+ .tiny_bound = V2 (0x3cc0000000000000 << 1),
+#else
+ /* Value above which expm1(x) should overflow. Absolute value of the
+ underflow bound is greater than this, so it catches both cases - there is
+ a small window where fallbacks are triggered unnecessarily. */
+ .oflow_bound = V2 (0x1.62b7d369a5aa9p+9),
+#endif
+};
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+{
+ return v_call_f64 (expm1, x, y, special);
+}
+
+/* Double-precision vector exp(x) - 1 function.
+ The maximum observed error is 2.18 ULP:
+ _ZGVnN2v_expm1 (0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2
+ want 0x1.a8b9ea8d66e2p-2. */
+float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+
+#if WANT_SIMD_EXCEPT
+ /* If fp exceptions are to be triggered correctly, fall back to scalar for
+ |x| < 2^-51, |x| > oflow_bound, Inf & NaN. Add ix to itself for
+ shift-left by 1, and compare with thresh which was left-shifted offline -
+ this is effectively an absolute compare. */
+ uint64x2_t special
+ = vcgeq_u64 (vsubq_u64 (vaddq_u64 (ix, ix), d->tiny_bound), d->thresh);
+ if (__glibc_unlikely (v_any_u64 (special)))
+ x = v_zerofy_f64 (x, special);
+#else
+ /* Large input, NaNs and Infs. */
+ uint64x2_t special = vceqzq_u64 (vcaltq_f64 (x, d->oflow_bound));
+#endif
+
+ /* Reduce argument to smaller range:
+ Let i = round(x / ln2)
+ and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
+ exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+ where 2^i is exact because i is an integer. */
+ float64x2_t n = vsubq_f64 (vfmaq_f64 (d->shift, d->invln2, x), d->shift);
+ int64x2_t i = vcvtq_s64_f64 (n);
+ float64x2_t f = vfmsq_f64 (x, n, d->ln2_hi);
+ f = vfmsq_f64 (f, n, d->ln2_lo);
+
+ /* Approximate expm1(f) using polynomial.
+ Taylor expansion for expm1(x) has the form:
+ x + ax^2 + bx^3 + cx^4 ....
+ So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+ and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
+ float64x2_t f2 = vmulq_f64 (f, f);
+ float64x2_t f4 = vmulq_f64 (f2, f2);
+ float64x2_t f8 = vmulq_f64 (f4, f4);
+ float64x2_t p = vfmaq_f64 (f, f2, v_estrin_10_f64 (f, f2, f4, f8, d->poly));
+
+ /* Assemble the result.
+ expm1(x) ~= 2^i * (p + 1) - 1
+ Let t = 2^i. */
+ int64x2_t u = vaddq_s64 (vshlq_n_s64 (i, 52), d->exponent_bias);
+ float64x2_t t = vreinterpretq_f64_s64 (u);
+
+ if (__glibc_unlikely (v_any_u64 (special)))
+ return special_case (vreinterpretq_f64_u64 (ix),
+ vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t),
+ special);
+
+ /* expm1(x) ~= p * t + (t - 1). */
+ return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t);
+}
diff --git a/sysdeps/aarch64/fpu/expm1_sve.c b/sysdeps/aarch64/fpu/expm1_sve.c
new file mode 100644
index 0000000000..50646aff7c
--- /dev/null
+++ b/sysdeps/aarch64/fpu/expm1_sve.c
@@ -0,0 +1,99 @@
+/* Double-precision SVE expm1
+
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+
+#define SpecialBound 0x1.62b7d369a5aa9p+9
+#define ExponentBias 0x3ff0000000000000
+
+static const struct data
+{
+ double poly[11];
+ double shift, inv_ln2, special_bound;
+ /* To be loaded in one quad-word. */
+ double ln2_hi, ln2_lo;
+} data = {
+ /* Generated using fpminimax. */
+ .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5,
+ 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10, 0x1.a01a01affa35dp-13,
+ 0x1.a01a018b4ecbbp-16, 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
+ 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, },
+
+ .special_bound = SpecialBound,
+ .inv_ln2 = 0x1.71547652b82fep0,
+ .ln2_hi = 0x1.62e42fefa39efp-1,
+ .ln2_lo = 0x1.abc9e3b39803fp-56,
+ .shift = 0x1.8p52,
+};
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t pg)
+{
+ return sv_call_f64 (expm1, x, y, pg);
+}
+
+/* Double-precision vector exp(x) - 1 function.
+ The maximum observed error is 2.18 ULP:
+ _ZGVsMxv_expm1(0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2
+ want 0x1.a8b9ea8d66e2p-2. */
+svfloat64_t SV_NAME_D1 (expm1) (svfloat64_t x, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* Large, Nan/Inf. */
+ svbool_t special = svnot_z (pg, svaclt (pg, x, d->special_bound));
+
+ /* Reduce argument to smaller range:
+ Let i = round(x / ln2)
+ and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
+ exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+ where 2^i is exact because i is an integer. */
+ svfloat64_t shift = sv_f64 (d->shift);
+ svfloat64_t n = svsub_x (pg, svmla_x (pg, shift, x, d->inv_ln2), shift);
+ svint64_t i = svcvt_s64_x (pg, n);
+ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
+ svfloat64_t f = svmls_lane (x, n, ln2, 0);
+ f = svmls_lane (f, n, ln2, 1);
+
+ /* Approximate expm1(f) using polynomial.
+ Taylor expansion for expm1(x) has the form:
+ x + ax^2 + bx^3 + cx^4 ....
+ So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+ and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
+ svfloat64_t f2 = svmul_x (pg, f, f);
+ svfloat64_t f4 = svmul_x (pg, f2, f2);
+ svfloat64_t f8 = svmul_x (pg, f4, f4);
+ svfloat64_t p
+ = svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly));
+
+ /* Assemble the result.
+ expm1(x) ~= 2^i * (p + 1) - 1
+ Let t = 2^i. */
+ svint64_t u = svadd_x (pg, svlsl_x (pg, i, 52), ExponentBias);
+ svfloat64_t t = svreinterpret_f64 (u);
+
+ /* expm1(x) ~= p * t + (t - 1). */
+ svfloat64_t y = svmla_x (pg, svsub_x (pg, t, 1), p, t);
+
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (x, y, special);
+
+ return y;
+}
diff --git a/sysdeps/aarch64/fpu/expm1f_advsimd.c b/sysdeps/aarch64/fpu/expm1f_advsimd.c
new file mode 100644
index 0000000000..b27b75068a
--- /dev/null
+++ b/sysdeps/aarch64/fpu/expm1f_advsimd.c
@@ -0,0 +1,117 @@
+/* Single-precision AdvSIMD expm1
+
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "poly_advsimd_f32.h"
+
+static const struct data
+{
+ float32x4_t poly[5];
+ float32x4_t invln2, ln2_lo, ln2_hi, shift;
+ int32x4_t exponent_bias;
+#if WANT_SIMD_EXCEPT
+ uint32x4_t thresh;
+#else
+ float32x4_t oflow_bound;
+#endif
+} data = {
+ /* Generated using fpminimax with degree=5 in [-log(2)/2, log(2)/2]. */
+ .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5),
+ V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) },
+ .invln2 = V4 (0x1.715476p+0f),
+ .ln2_hi = V4 (0x1.62e4p-1f),
+ .ln2_lo = V4 (0x1.7f7d1cp-20f),
+ .shift = V4 (0x1.8p23f),
+ .exponent_bias = V4 (0x3f800000),
+#if !WANT_SIMD_EXCEPT
+ /* Value above which expm1f(x) should overflow. Absolute value of the
+ underflow bound is greater than this, so it catches both cases - there is
+ a small window where fallbacks are triggered unnecessarily. */
+ .oflow_bound = V4 (0x1.5ebc4p+6),
+#else
+ /* asuint(oflow_bound) - asuint(0x1p-23), shifted left by 1 for absolute
+ compare. */
+ .thresh = V4 (0x1d5ebc40),
+#endif
+};
+
+/* asuint(0x1p-23), shifted by 1 for abs compare. */
+#define TinyBound v_u32 (0x34000000 << 1)
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+ return v_call_f32 (expm1f, x, y, special);
+}
+
+/* Single-precision vector exp(x) - 1 function.
+ The maximum error is 1.51 ULP:
+ _ZGVnN4v_expm1f (0x1.8baa96p-2) got 0x1.e2fb9p-2
+ want 0x1.e2fb94p-2. */
+float32x4_t VPCS_ATTR V_NAME_F1 (expm1) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+
+#if WANT_SIMD_EXCEPT
+ /* If fp exceptions are to be triggered correctly, fall back to scalar for
+ |x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for
+ shift-left by 1, and compare with thresh which was left-shifted offline -
+ this is effectively an absolute compare. */
+ uint32x4_t special
+ = vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh);
+ if (__glibc_unlikely (v_any_u32 (special)))
+ x = v_zerofy_f32 (x, special);
+#else
+ /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */
+ uint32x4_t special = vceqzq_u32 (vcaltq_f32 (x, d->oflow_bound));
+#endif
+
+ /* Reduce argument to smaller range:
+ Let i = round(x / ln2)
+ and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
+ exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+ where 2^i is exact because i is an integer. */
+ float32x4_t j = vsubq_f32 (vfmaq_f32 (d->shift, d->invln2, x), d->shift);
+ int32x4_t i = vcvtq_s32_f32 (j);
+ float32x4_t f = vfmsq_f32 (x, j, d->ln2_hi);
+ f = vfmsq_f32 (f, j, d->ln2_lo);
+
+ /* Approximate expm1(f) using polynomial.
+ Taylor expansion for expm1(x) has the form:
+ x + ax^2 + bx^3 + cx^4 ....
+ So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+ and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
+ float32x4_t p = v_horner_4_f32 (f, d->poly);
+ p = vfmaq_f32 (f, vmulq_f32 (f, f), p);
+
+ /* Assemble the result.
+ expm1(x) ~= 2^i * (p + 1) - 1
+ Let t = 2^i. */
+ int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias);
+ float32x4_t t = vreinterpretq_f32_s32 (u);
+
+ if (__glibc_unlikely (v_any_u32 (special)))
+ return special_case (vreinterpretq_f32_u32 (ix),
+ vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t),
+ special);
+
+ /* expm1(x) ~= p * t + (t - 1). */
+ return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t);
+}
diff --git a/sysdeps/aarch64/fpu/expm1f_sve.c b/sysdeps/aarch64/fpu/expm1f_sve.c
new file mode 100644
index 0000000000..96e579e5b7
--- /dev/null
+++ b/sysdeps/aarch64/fpu/expm1f_sve.c
@@ -0,0 +1,99 @@
+/* Single-precision SVE expm1
+
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+#include "poly_sve_f32.h"
+
+/* Value above which expm1f(x) should overflow; the absolute compare below also
+ catches large negative inputs. */
+#define SpecialBound 0x1.5ebc4p+6f
+
+static const struct data
+{
+ /* These 4 are grouped together so they can be loaded as one quadword, then
+ used with _lane forms of svmla/svmls. */
+ float c2, c4, ln2_hi, ln2_lo;
+ float c0, c1, c3, inv_ln2, special_bound, shift;
+} data = {
+ /* Generated using fpminimax. */
+ .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3,
+ .c2 = 0x1.555736p-5, .c3 = 0x1.12287cp-7,
+ .c4 = 0x1.6b55a2p-10,
+
+ .special_bound = SpecialBound, .shift = 0x1.8p23f,
+ .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f,
+ .ln2_lo = 0x1.7f7d1cp-20f,
+};
+
+#define C(i) sv_f32 (d->c##i)
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svbool_t pg)
+{
+ return sv_call_f32 (expm1f, x, x, pg);
+}
+
+/* Single-precision SVE exp(x) - 1. Maximum error is 1.52 ULP:
+ _ZGVsMxv_expm1f(0x1.8f4ebcp-2) got 0x1.e859dp-2
+ want 0x1.e859d4p-2. */
+svfloat32_t SV_NAME_F1 (expm1) (svfloat32_t x, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* Large, NaN/Inf. */
+ svbool_t special = svnot_z (pg, svaclt (pg, x, d->special_bound));
+
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (x, pg);
+
+ /* This vector is reliant on layout of data - it contains constants
+ that can be used with _lane forms of svmla/svmls. Values are:
+ [ coeff_2, coeff_4, ln2_hi, ln2_lo ]. */
+ svfloat32_t lane_constants = svld1rq (svptrue_b32 (), &d->c2);
+
+ /* Reduce argument to smaller range:
+ Let i = round(x / ln2)
+ and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
+ exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+ where 2^i is exact because i is an integer. */
+ svfloat32_t j = svmla_x (pg, sv_f32 (d->shift), x, d->inv_ln2);
+ j = svsub_x (pg, j, d->shift);
+ svint32_t i = svcvt_s32_x (pg, j);
+
+ svfloat32_t f = svmls_lane (x, j, lane_constants, 2);
+ f = svmls_lane (f, j, lane_constants, 3);
+
+ /* Approximate expm1(f) using polynomial.
+ Taylor expansion for expm1(x) has the form:
+ x + ax^2 + bx^3 + cx^4 ....
+ So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+ and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
+ svfloat32_t p12 = svmla_lane (C (1), f, lane_constants, 0);
+ svfloat32_t p34 = svmla_lane (C (3), f, lane_constants, 1);
+ svfloat32_t f2 = svmul_x (pg, f, f);
+ svfloat32_t p = svmla_x (pg, p12, f2, p34);
+ p = svmla_x (pg, C (0), f, p);
+ p = svmla_x (pg, f, f2, p);
+
+ /* Assemble the result.
+ expm1(x) ~= 2^i * (p + 1) - 1
+ Let t = 2^i. */
+ svfloat32_t t = svreinterpret_f32 (
+ svadd_x (pg, svreinterpret_u32 (svlsl_x (pg, i, 23)), 0x3f800000));
+ return svmla_x (pg, svsub_x (pg, t, 1), p, t);
+}
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index fc9e7aec47..bf495450d7 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -31,6 +31,7 @@ VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos)
VPCS_VECTOR_WRAPPER (exp_advsimd, _ZGVnN2v_exp)
VPCS_VECTOR_WRAPPER (exp10_advsimd, _ZGVnN2v_exp10)
VPCS_VECTOR_WRAPPER (exp2_advsimd, _ZGVnN2v_exp2)
+VPCS_VECTOR_WRAPPER (expm1_advsimd, _ZGVnN2v_expm1)
VPCS_VECTOR_WRAPPER (log_advsimd, _ZGVnN2v_log)
VPCS_VECTOR_WRAPPER (log10_advsimd, _ZGVnN2v_log10)
VPCS_VECTOR_WRAPPER (log1p_advsimd, _ZGVnN2v_log1p)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index aea589d5fb..b5a627ad47 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -50,6 +50,7 @@ SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos)
SVE_VECTOR_WRAPPER (exp_sve, _ZGVsMxv_exp)
SVE_VECTOR_WRAPPER (exp10_sve, _ZGVsMxv_exp10)
SVE_VECTOR_WRAPPER (exp2_sve, _ZGVsMxv_exp2)
+SVE_VECTOR_WRAPPER (expm1_sve, _ZGVsMxv_expm1)
SVE_VECTOR_WRAPPER (log_sve, _ZGVsMxv_log)
SVE_VECTOR_WRAPPER (log10_sve, _ZGVsMxv_log10)
SVE_VECTOR_WRAPPER (log1p_sve, _ZGVsMxv_log1p)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index 446fd7f538..26d9e98739 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -31,6 +31,7 @@ VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf)
VPCS_VECTOR_WRAPPER (expf_advsimd, _ZGVnN4v_expf)
VPCS_VECTOR_WRAPPER (exp10f_advsimd, _ZGVnN4v_exp10f)
VPCS_VECTOR_WRAPPER (exp2f_advsimd, _ZGVnN4v_exp2f)
+VPCS_VECTOR_WRAPPER (expm1f_advsimd, _ZGVnN4v_expm1f)
VPCS_VECTOR_WRAPPER (logf_advsimd, _ZGVnN4v_logf)
VPCS_VECTOR_WRAPPER (log10f_advsimd, _ZGVnN4v_log10f)
VPCS_VECTOR_WRAPPER (log1pf_advsimd, _ZGVnN4v_log1pf)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index ac17f60856..f286ee64c9 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -50,6 +50,7 @@ SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf)
SVE_VECTOR_WRAPPER (expf_sve, _ZGVsMxv_expf)
SVE_VECTOR_WRAPPER (exp10f_sve, _ZGVsMxv_exp10f)
SVE_VECTOR_WRAPPER (exp2f_sve, _ZGVsMxv_exp2f)
+SVE_VECTOR_WRAPPER (expm1f_sve, _ZGVsMxv_expm1f)
SVE_VECTOR_WRAPPER (logf_sve, _ZGVsMxv_logf)
SVE_VECTOR_WRAPPER (log10f_sve, _ZGVsMxv_log10f)
SVE_VECTOR_WRAPPER (log1pf_sve, _ZGVsMxv_log1pf)
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index a6b2f29a6f..1d52bf9ebf 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -1078,11 +1078,19 @@ double: 1
float: 1
ldouble: 2
+Function: "expm1_advsimd":
+double: 2
+float: 1
+
Function: "expm1_downward":
double: 1
float: 1
ldouble: 2
+Function: "expm1_sve":
+double: 2
+float: 1
+
Function: "expm1_towardzero":
double: 1
float: 2
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index 0f20b5be29..2bf4ea6332 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -19,6 +19,7 @@ GLIBC_2.39 _ZGVnN2v_asin F
GLIBC_2.39 _ZGVnN2v_atan F
GLIBC_2.39 _ZGVnN2v_exp10 F
GLIBC_2.39 _ZGVnN2v_exp2 F
+GLIBC_2.39 _ZGVnN2v_expm1 F
GLIBC_2.39 _ZGVnN2v_log10 F
GLIBC_2.39 _ZGVnN2v_log1p F
GLIBC_2.39 _ZGVnN2v_log2 F
@@ -29,6 +30,7 @@ GLIBC_2.39 _ZGVnN4v_asinf F
GLIBC_2.39 _ZGVnN4v_atanf F
GLIBC_2.39 _ZGVnN4v_exp10f F
GLIBC_2.39 _ZGVnN4v_exp2f F
+GLIBC_2.39 _ZGVnN4v_expm1f F
GLIBC_2.39 _ZGVnN4v_log10f F
GLIBC_2.39 _ZGVnN4v_log1pf F
GLIBC_2.39 _ZGVnN4v_log2f F
@@ -44,6 +46,8 @@ GLIBC_2.39 _ZGVsMxv_exp10 F
GLIBC_2.39 _ZGVsMxv_exp10f F
GLIBC_2.39 _ZGVsMxv_exp2 F
GLIBC_2.39 _ZGVsMxv_exp2f F
+GLIBC_2.39 _ZGVsMxv_expm1 F
+GLIBC_2.39 _ZGVsMxv_expm1f F
GLIBC_2.39 _ZGVsMxv_log10 F
GLIBC_2.39 _ZGVsMxv_log10f F
GLIBC_2.39 _ZGVsMxv_log1p F
--
2.27.0
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 6/6] aarch64: Add vector implementations of expm1 routines
2023-11-03 12:12 ` [PATCH 6/6] aarch64: Add vector implementations of expm1 routines Joe Ramsay
@ 2023-11-06 13:31 ` Adhemerval Zanella Netto
2023-11-10 18:11 ` Szabolcs Nagy
1 sibling, 0 replies; 14+ messages in thread
From: Adhemerval Zanella Netto @ 2023-11-06 13:31 UTC (permalink / raw)
To: Joe Ramsay, libc-alpha
This has triggered a regression on the aarch64 patchwork [1]:
FAIL: math/test-float-advsimd-expm1
original exit status 1
testing float (vector length 4)
Failure: Test: expm1_advsimd (-0)
Result:
is: 0.00000000e+00 0x0.000000p+0
should be: -0.00000000e+00 -0x0.000000p+0
difference: 0.00000000e+00 0x0.000000p+0
ulp : 0.0000
max.ulp : 1.0000
Test suite completed:
96 test cases plus 0 tests for exception flags and
0 tests for errno executed.
1 errors occurred.
FAIL: math/test-double-advsimd-expm1
original exit status 1
testing double (vector length 2)
Failure: Test: expm1_advsimd (-0)
Result:
is: 0.0000000000000000e+00 0x0.0000000000000p+0
should be: -0.0000000000000000e+00 -0x0.0000000000000p+0
difference: 0.0000000000000000e+00 0x0.0000000000000p+0
ulp : 0.0000
max.ulp : 2.0000
Test suite completed:
119 test cases plus 0 tests for exception flags and
0 tests for errno executed.
1 errors occurred.
I think you will need to do something similar to 5bc9b3a1f6a003f6456f717b590615ea98e2d6fb
and remove the negative zero checks for expm1 as well.
[1] https://ci.linaro.org/job/tcwg_glibc_check--master-aarch64-precommit/965/artifact/artifacts/artifacts.precommit/notify/mail-body.txt
On 03/11/23 09:12, Joe Ramsay wrote:
> ---
> Thanks,
> Joe
> sysdeps/aarch64/fpu/Makefile | 1 +
> sysdeps/aarch64/fpu/Versions | 4 +
> sysdeps/aarch64/fpu/bits/math-vector.h | 4 +
> sysdeps/aarch64/fpu/expm1_advsimd.c | 122 ++++++++++++++++++
> sysdeps/aarch64/fpu/expm1_sve.c | 99 ++++++++++++++
> sysdeps/aarch64/fpu/expm1f_advsimd.c | 117 +++++++++++++++++
> sysdeps/aarch64/fpu/expm1f_sve.c | 99 ++++++++++++++
> .../fpu/test-double-advsimd-wrappers.c | 1 +
> .../aarch64/fpu/test-double-sve-wrappers.c | 1 +
> .../aarch64/fpu/test-float-advsimd-wrappers.c | 1 +
> sysdeps/aarch64/fpu/test-float-sve-wrappers.c | 1 +
> sysdeps/aarch64/libm-test-ulps | 8 ++
> .../unix/sysv/linux/aarch64/libmvec.abilist | 4 +
> 13 files changed, 462 insertions(+)
> create mode 100644 sysdeps/aarch64/fpu/expm1_advsimd.c
> create mode 100644 sysdeps/aarch64/fpu/expm1_sve.c
> create mode 100644 sysdeps/aarch64/fpu/expm1f_advsimd.c
> create mode 100644 sysdeps/aarch64/fpu/expm1f_sve.c
>
> diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
> index c77c709edd..1fe4b52682 100644
> --- a/sysdeps/aarch64/fpu/Makefile
> +++ b/sysdeps/aarch64/fpu/Makefile
> @@ -6,6 +6,7 @@ libmvec-supported-funcs = acos \
> exp \
> exp10 \
> exp2 \
> + expm1 \
> log \
> log10 \
> log1p \
> diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
> index 2543649fbe..aaacacaebe 100644
> --- a/sysdeps/aarch64/fpu/Versions
> +++ b/sysdeps/aarch64/fpu/Versions
> @@ -42,6 +42,10 @@ libmvec {
> _ZGVnN2v_exp2;
> _ZGVsMxv_exp2f;
> _ZGVsMxv_exp2;
> + _ZGVnN4v_expm1f;
> + _ZGVnN2v_expm1;
> + _ZGVsMxv_expm1f;
> + _ZGVsMxv_expm1;
> _ZGVnN4v_log10f;
> _ZGVnN2v_log10;
> _ZGVsMxv_log10f;
> diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
> index 51915cef22..52aad95e3b 100644
> --- a/sysdeps/aarch64/fpu/bits/math-vector.h
> +++ b/sysdeps/aarch64/fpu/bits/math-vector.h
> @@ -57,6 +57,7 @@ __vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_exp10f (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_exp2f (__f32x4_t);
> +__vpcs __f32x4_t _ZGVnN4v_expm1f (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t);
> @@ -72,6 +73,7 @@ __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
> __vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
> __vpcs __f64x2_t _ZGVnN2v_exp10 (__f64x2_t);
> __vpcs __f64x2_t _ZGVnN2v_exp2 (__f64x2_t);
> +__vpcs __f64x2_t _ZGVnN2v_expm1 (__f64x2_t);
> __vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t);
> __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t);
> __vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t);
> @@ -92,6 +94,7 @@ __sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_expf (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_exp10f (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_exp2f (__sv_f32_t, __sv_bool_t);
> +__sv_f32_t _ZGVsMxv_expm1f (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_logf (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_log10f (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_log1pf (__sv_f32_t, __sv_bool_t);
> @@ -107,6 +110,7 @@ __sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
> __sv_f64_t _ZGVsMxv_exp (__sv_f64_t, __sv_bool_t);
> __sv_f64_t _ZGVsMxv_exp10 (__sv_f64_t, __sv_bool_t);
> __sv_f64_t _ZGVsMxv_exp2 (__sv_f64_t, __sv_bool_t);
> +__sv_f64_t _ZGVsMxv_expm1 (__sv_f64_t, __sv_bool_t);
> __sv_f64_t _ZGVsMxv_log (__sv_f64_t, __sv_bool_t);
> __sv_f64_t _ZGVsMxv_log10 (__sv_f64_t, __sv_bool_t);
> __sv_f64_t _ZGVsMxv_log1p (__sv_f64_t, __sv_bool_t);
> diff --git a/sysdeps/aarch64/fpu/expm1_advsimd.c b/sysdeps/aarch64/fpu/expm1_advsimd.c
> new file mode 100644
> index 0000000000..a3aed8e35b
> --- /dev/null
> +++ b/sysdeps/aarch64/fpu/expm1_advsimd.c
> @@ -0,0 +1,122 @@
> +/* Double-precision AdvSIMD expm1
> +
> + Copyright (C) 2023 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "v_math.h"
> +#include "poly_advsimd_f64.h"
> +
> +static const struct data
> +{
> + float64x2_t poly[11];
> + float64x2_t invln2, ln2_lo, ln2_hi, shift;
> + int64x2_t exponent_bias;
> +#if WANT_SIMD_EXCEPT
> + uint64x2_t thresh, tiny_bound;
> +#else
> + float64x2_t oflow_bound;
> +#endif
> +} data = {
> + /* Generated using fpminimax, with degree=12 in [log(2)/2, log(2)/2]. */
> + .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5),
> + V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10),
> + V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16),
> + V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22),
> + V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29) },
> + .invln2 = V2 (0x1.71547652b82fep0),
> + .ln2_hi = V2 (0x1.62e42fefa39efp-1),
> + .ln2_lo = V2 (0x1.abc9e3b39803fp-56),
> + .shift = V2 (0x1.8p52),
> + .exponent_bias = V2 (0x3ff0000000000000),
> +#if WANT_SIMD_EXCEPT
> + /* asuint64(oflow_bound) - asuint64(0x1p-51), shifted left by 1 for abs
> + compare. */
> + .thresh = V2 (0x78c56fa6d34b552),
> + /* asuint64(0x1p-51) << 1. */
> + .tiny_bound = V2 (0x3cc0000000000000 << 1),
> +#else
> + /* Value above which expm1(x) should overflow. Absolute value of the
> + underflow bound is greater than this, so it catches both cases - there is
> + a small window where fallbacks are triggered unnecessarily. */
> + .oflow_bound = V2 (0x1.62b7d369a5aa9p+9),
> +#endif
> +};
> +
> +static float64x2_t VPCS_ATTR NOINLINE
> +special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
> +{
> + return v_call_f64 (expm1, x, y, special);
> +}
> +
> +/* Double-precision vector exp(x) - 1 function.
> + The maximum error observed error is 2.18 ULP:
> + _ZGVnN2v_expm1 (0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2
> + want 0x1.a8b9ea8d66e2p-2. */
> +float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
> +{
> + const struct data *d = ptr_barrier (&data);
> +
> + uint64x2_t ix = vreinterpretq_u64_f64 (x);
> +
> +#if WANT_SIMD_EXCEPT
> + /* If fp exceptions are to be triggered correctly, fall back to scalar for
> + |x| < 2^-51, |x| > oflow_bound, Inf & NaN. Add ix to itself for
> + shift-left by 1, and compare with thresh which was left-shifted offline -
> + this is effectively an absolute compare. */
> + uint64x2_t special
> + = vcgeq_u64 (vsubq_u64 (vaddq_u64 (ix, ix), d->tiny_bound), d->thresh);
> + if (__glibc_unlikely (v_any_u64 (special)))
> + x = v_zerofy_f64 (x, special);
> +#else
> + /* Large input, NaNs and Infs. */
> + uint64x2_t special = vceqzq_u64 (vcaltq_f64 (x, d->oflow_bound));
> +#endif
> +
> + /* Reduce argument to smaller range:
> + Let i = round(x / ln2)
> + and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
> + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
> + where 2^i is exact because i is an integer. */
> + float64x2_t n = vsubq_f64 (vfmaq_f64 (d->shift, d->invln2, x), d->shift);
> + int64x2_t i = vcvtq_s64_f64 (n);
> + float64x2_t f = vfmsq_f64 (x, n, d->ln2_hi);
> + f = vfmsq_f64 (f, n, d->ln2_lo);
> +
> + /* Approximate expm1(f) using polynomial.
> + Taylor expansion for expm1(x) has the form:
> + x + ax^2 + bx^3 + cx^4 ....
> + So we calculate the polynomial P(f) = a + bf + cf^2 + ...
> + and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
> + float64x2_t f2 = vmulq_f64 (f, f);
> + float64x2_t f4 = vmulq_f64 (f2, f2);
> + float64x2_t f8 = vmulq_f64 (f4, f4);
> + float64x2_t p = vfmaq_f64 (f, f2, v_estrin_10_f64 (f, f2, f4, f8, d->poly));
> +
> + /* Assemble the result.
> + expm1(x) ~= 2^i * (p + 1) - 1
> + Let t = 2^i. */
> + int64x2_t u = vaddq_s64 (vshlq_n_s64 (i, 52), d->exponent_bias);
> + float64x2_t t = vreinterpretq_f64_s64 (u);
> +
> + if (__glibc_unlikely (v_any_u64 (special)))
> + return special_case (vreinterpretq_f64_u64 (ix),
> + vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t),
> + special);
> +
> + /* expm1(x) ~= p * t + (t - 1). */
> + return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t);
> +}
> diff --git a/sysdeps/aarch64/fpu/expm1_sve.c b/sysdeps/aarch64/fpu/expm1_sve.c
> new file mode 100644
> index 0000000000..50646aff7c
> --- /dev/null
> +++ b/sysdeps/aarch64/fpu/expm1_sve.c
> @@ -0,0 +1,99 @@
> +/* Double-precision SVE expm1
> +
> + Copyright (C) 2023 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "sv_math.h"
> +#include "poly_sve_f64.h"
> +
> +#define SpecialBound 0x1.62b7d369a5aa9p+9
> +#define ExponentBias 0x3ff0000000000000
> +
> +static const struct data
> +{
> + double poly[11];
> + double shift, inv_ln2, special_bound;
> + /* To be loaded in one quad-word. */
> + double ln2_hi, ln2_lo;
> +} data = {
> + /* Generated using fpminimax. */
> + .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5,
> + 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10, 0x1.a01a01affa35dp-13,
> + 0x1.a01a018b4ecbbp-16, 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
> + 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, },
> +
> + .special_bound = SpecialBound,
> + .inv_ln2 = 0x1.71547652b82fep0,
> + .ln2_hi = 0x1.62e42fefa39efp-1,
> + .ln2_lo = 0x1.abc9e3b39803fp-56,
> + .shift = 0x1.8p52,
> +};
> +
> +static svfloat64_t NOINLINE
> +special_case (svfloat64_t x, svfloat64_t y, svbool_t pg)
> +{
> + return sv_call_f64 (expm1, x, y, pg);
> +}
> +
> +/* Double-precision vector exp(x) - 1 function.
> +   The maximum observed error is 2.18 ULP:
> + _ZGVsMxv_expm1(0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2
> + want 0x1.a8b9ea8d66e2p-2. */
> +svfloat64_t SV_NAME_D1 (expm1) (svfloat64_t x, svbool_t pg)
> +{
> + const struct data *d = ptr_barrier (&data);
> +
> + /* Large, Nan/Inf. */
> + svbool_t special = svnot_z (pg, svaclt (pg, x, d->special_bound));
> +
> + /* Reduce argument to smaller range:
> + Let i = round(x / ln2)
> + and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
> + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
> + where 2^i is exact because i is an integer. */
> + svfloat64_t shift = sv_f64 (d->shift);
> + svfloat64_t n = svsub_x (pg, svmla_x (pg, shift, x, d->inv_ln2), shift);
> + svint64_t i = svcvt_s64_x (pg, n);
> + svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
> + svfloat64_t f = svmls_lane (x, n, ln2, 0);
> + f = svmls_lane (f, n, ln2, 1);
> +
> + /* Approximate expm1(f) using polynomial.
> + Taylor expansion for expm1(x) has the form:
> + x + ax^2 + bx^3 + cx^4 ....
> + So we calculate the polynomial P(f) = a + bf + cf^2 + ...
> + and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
> + svfloat64_t f2 = svmul_x (pg, f, f);
> + svfloat64_t f4 = svmul_x (pg, f2, f2);
> + svfloat64_t f8 = svmul_x (pg, f4, f4);
> + svfloat64_t p
> + = svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly));
> +
> + /* Assemble the result.
> + expm1(x) ~= 2^i * (p + 1) - 1
> + Let t = 2^i. */
> + svint64_t u = svadd_x (pg, svlsl_x (pg, i, 52), ExponentBias);
> + svfloat64_t t = svreinterpret_f64 (u);
> +
> + /* expm1(x) ~= p * t + (t - 1). */
> + svfloat64_t y = svmla_x (pg, svsub_x (pg, t, 1), p, t);
> +
> + if (__glibc_unlikely (svptest_any (pg, special)))
> + return special_case (x, y, special);
> +
> + return y;
> +}
> diff --git a/sysdeps/aarch64/fpu/expm1f_advsimd.c b/sysdeps/aarch64/fpu/expm1f_advsimd.c
> new file mode 100644
> index 0000000000..b27b75068a
> --- /dev/null
> +++ b/sysdeps/aarch64/fpu/expm1f_advsimd.c
> @@ -0,0 +1,117 @@
> +/* Single-precision AdvSIMD expm1
> +
> + Copyright (C) 2023 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "v_math.h"
> +#include "poly_advsimd_f32.h"
> +
> +static const struct data
> +{
> + float32x4_t poly[5];
> + float32x4_t invln2, ln2_lo, ln2_hi, shift;
> + int32x4_t exponent_bias;
> +#if WANT_SIMD_EXCEPT
> + uint32x4_t thresh;
> +#else
> + float32x4_t oflow_bound;
> +#endif
> +} data = {
> + /* Generated using fpminimax with degree=5 in [-log(2)/2, log(2)/2]. */
> + .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5),
> + V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) },
> + .invln2 = V4 (0x1.715476p+0f),
> + .ln2_hi = V4 (0x1.62e4p-1f),
> + .ln2_lo = V4 (0x1.7f7d1cp-20f),
> + .shift = V4 (0x1.8p23f),
> + .exponent_bias = V4 (0x3f800000),
> +#if !WANT_SIMD_EXCEPT
> + /* Value above which expm1f(x) should overflow. Absolute value of the
> + underflow bound is greater than this, so it catches both cases - there is
> + a small window where fallbacks are triggered unnecessarily. */
> + .oflow_bound = V4 (0x1.5ebc4p+6),
> +#else
> + /* asuint(oflow_bound) - asuint(0x1p-23), shifted left by 1 for absolute
> + compare. */
> + .thresh = V4 (0x1d5ebc40),
> +#endif
> +};
> +
> +/* asuint(0x1p-23), shifted by 1 for abs compare. */
> +#define TinyBound v_u32 (0x34000000 << 1)
> +
> +static float32x4_t VPCS_ATTR NOINLINE
> +special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
> +{
> + return v_call_f32 (expm1f, x, y, special);
> +}
> +
> +/* Single-precision vector exp(x) - 1 function.
> + The maximum error is 1.51 ULP:
> + _ZGVnN4v_expm1f (0x1.8baa96p-2) got 0x1.e2fb9p-2
> + want 0x1.e2fb94p-2. */
> +float32x4_t VPCS_ATTR V_NAME_F1 (expm1) (float32x4_t x)
> +{
> + const struct data *d = ptr_barrier (&data);
> + uint32x4_t ix = vreinterpretq_u32_f32 (x);
> +
> +#if WANT_SIMD_EXCEPT
> + /* If fp exceptions are to be triggered correctly, fall back to scalar for
> + |x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for
> + shift-left by 1, and compare with thresh which was left-shifted offline -
> + this is effectively an absolute compare. */
> + uint32x4_t special
> + = vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh);
> + if (__glibc_unlikely (v_any_u32 (special)))
> + x = v_zerofy_f32 (x, special);
> +#else
> + /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */
> + uint32x4_t special = vceqzq_u32 (vcaltq_f32 (x, d->oflow_bound));
> +#endif
> +
> + /* Reduce argument to smaller range:
> + Let i = round(x / ln2)
> + and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
> + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
> + where 2^i is exact because i is an integer. */
> + float32x4_t j = vsubq_f32 (vfmaq_f32 (d->shift, d->invln2, x), d->shift);
> + int32x4_t i = vcvtq_s32_f32 (j);
> + float32x4_t f = vfmsq_f32 (x, j, d->ln2_hi);
> + f = vfmsq_f32 (f, j, d->ln2_lo);
> +
> + /* Approximate expm1(f) using polynomial.
> + Taylor expansion for expm1(x) has the form:
> + x + ax^2 + bx^3 + cx^4 ....
> + So we calculate the polynomial P(f) = a + bf + cf^2 + ...
> + and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
> + float32x4_t p = v_horner_4_f32 (f, d->poly);
> + p = vfmaq_f32 (f, vmulq_f32 (f, f), p);
> +
> + /* Assemble the result.
> + expm1(x) ~= 2^i * (p + 1) - 1
> + Let t = 2^i. */
> + int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias);
> + float32x4_t t = vreinterpretq_f32_s32 (u);
> +
> + if (__glibc_unlikely (v_any_u32 (special)))
> + return special_case (vreinterpretq_f32_u32 (ix),
> + vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t),
> + special);
> +
> + /* expm1(x) ~= p * t + (t - 1). */
> + return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t);
> +}
> diff --git a/sysdeps/aarch64/fpu/expm1f_sve.c b/sysdeps/aarch64/fpu/expm1f_sve.c
> new file mode 100644
> index 0000000000..96e579e5b7
> --- /dev/null
> +++ b/sysdeps/aarch64/fpu/expm1f_sve.c
> @@ -0,0 +1,99 @@
> +/* Single-precision SVE expm1
> +
> + Copyright (C) 2023 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "sv_math.h"
> +#include "poly_sve_f32.h"
> +
> +/* Largest value of x for which expm1(x) should round to -1. */
> +#define SpecialBound 0x1.5ebc4p+6f
> +
> +static const struct data
> +{
> + /* These 4 are grouped together so they can be loaded as one quadword, then
> + used with _lane forms of svmla/svmls. */
> + float c2, c4, ln2_hi, ln2_lo;
> + float c0, c1, c3, inv_ln2, special_bound, shift;
> +} data = {
> + /* Generated using fpminimax. */
> + .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3,
> + .c2 = 0x1.555736p-5, .c3 = 0x1.12287cp-7,
> + .c4 = 0x1.6b55a2p-10,
> +
> + .special_bound = SpecialBound, .shift = 0x1.8p23f,
> + .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f,
> + .ln2_lo = 0x1.7f7d1cp-20f,
> +};
> +
> +#define C(i) sv_f32 (d->c##i)
> +
> +static svfloat32_t NOINLINE
> +special_case (svfloat32_t x, svbool_t pg)
> +{
> + return sv_call_f32 (expm1f, x, x, pg);
> +}
> +
> +/* Single-precision SVE exp(x) - 1. Maximum error is 1.52 ULP:
> + _ZGVsMxv_expm1f(0x1.8f4ebcp-2) got 0x1.e859dp-2
> + want 0x1.e859d4p-2. */
> +svfloat32_t SV_NAME_F1 (expm1) (svfloat32_t x, svbool_t pg)
> +{
> + const struct data *d = ptr_barrier (&data);
> +
> + /* Large, NaN/Inf. */
> + svbool_t special = svnot_z (pg, svaclt (pg, x, d->special_bound));
> +
> + if (__glibc_unlikely (svptest_any (pg, special)))
> + return special_case (x, pg);
> +
> + /* This vector is reliant on layout of data - it contains constants
> + that can be used with _lane forms of svmla/svmls. Values are:
> + [ coeff_2, coeff_4, ln2_hi, ln2_lo ]. */
> + svfloat32_t lane_constants = svld1rq (svptrue_b32 (), &d->c2);
> +
> + /* Reduce argument to smaller range:
> + Let i = round(x / ln2)
> + and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
> + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
> + where 2^i is exact because i is an integer. */
> + svfloat32_t j = svmla_x (pg, sv_f32 (d->shift), x, d->inv_ln2);
> + j = svsub_x (pg, j, d->shift);
> + svint32_t i = svcvt_s32_x (pg, j);
> +
> + svfloat32_t f = svmls_lane (x, j, lane_constants, 2);
> + f = svmls_lane (f, j, lane_constants, 3);
> +
> + /* Approximate expm1(f) using polynomial.
> + Taylor expansion for expm1(x) has the form:
> + x + ax^2 + bx^3 + cx^4 ....
> + So we calculate the polynomial P(f) = a + bf + cf^2 + ...
> + and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
> + svfloat32_t p12 = svmla_lane (C (1), f, lane_constants, 0);
> + svfloat32_t p34 = svmla_lane (C (3), f, lane_constants, 1);
> + svfloat32_t f2 = svmul_x (pg, f, f);
> + svfloat32_t p = svmla_x (pg, p12, f2, p34);
> + p = svmla_x (pg, C (0), f, p);
> + p = svmla_x (pg, f, f2, p);
> +
> + /* Assemble the result.
> + expm1(x) ~= 2^i * (p + 1) - 1
> + Let t = 2^i. */
> + svfloat32_t t = svreinterpret_f32 (
> + svadd_x (pg, svreinterpret_u32 (svlsl_x (pg, i, 23)), 0x3f800000));
> + return svmla_x (pg, svsub_x (pg, t, 1), p, t);
> +}
> diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
> index fc9e7aec47..bf495450d7 100644
> --- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
> +++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
> @@ -31,6 +31,7 @@ VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos)
> VPCS_VECTOR_WRAPPER (exp_advsimd, _ZGVnN2v_exp)
> VPCS_VECTOR_WRAPPER (exp10_advsimd, _ZGVnN2v_exp10)
> VPCS_VECTOR_WRAPPER (exp2_advsimd, _ZGVnN2v_exp2)
> +VPCS_VECTOR_WRAPPER (expm1_advsimd, _ZGVnN2v_expm1)
> VPCS_VECTOR_WRAPPER (log_advsimd, _ZGVnN2v_log)
> VPCS_VECTOR_WRAPPER (log10_advsimd, _ZGVnN2v_log10)
> VPCS_VECTOR_WRAPPER (log1p_advsimd, _ZGVnN2v_log1p)
> diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
> index aea589d5fb..b5a627ad47 100644
> --- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
> +++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
> @@ -50,6 +50,7 @@ SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos)
> SVE_VECTOR_WRAPPER (exp_sve, _ZGVsMxv_exp)
> SVE_VECTOR_WRAPPER (exp10_sve, _ZGVsMxv_exp10)
> SVE_VECTOR_WRAPPER (exp2_sve, _ZGVsMxv_exp2)
> +SVE_VECTOR_WRAPPER (expm1_sve, _ZGVsMxv_expm1)
> SVE_VECTOR_WRAPPER (log_sve, _ZGVsMxv_log)
> SVE_VECTOR_WRAPPER (log10_sve, _ZGVsMxv_log10)
> SVE_VECTOR_WRAPPER (log1p_sve, _ZGVsMxv_log1p)
> diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
> index 446fd7f538..26d9e98739 100644
> --- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
> +++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
> @@ -31,6 +31,7 @@ VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf)
> VPCS_VECTOR_WRAPPER (expf_advsimd, _ZGVnN4v_expf)
> VPCS_VECTOR_WRAPPER (exp10f_advsimd, _ZGVnN4v_exp10f)
> VPCS_VECTOR_WRAPPER (exp2f_advsimd, _ZGVnN4v_exp2f)
> +VPCS_VECTOR_WRAPPER (expm1f_advsimd, _ZGVnN4v_expm1f)
> VPCS_VECTOR_WRAPPER (logf_advsimd, _ZGVnN4v_logf)
> VPCS_VECTOR_WRAPPER (log10f_advsimd, _ZGVnN4v_log10f)
> VPCS_VECTOR_WRAPPER (log1pf_advsimd, _ZGVnN4v_log1pf)
> diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
> index ac17f60856..f286ee64c9 100644
> --- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
> +++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
> @@ -50,6 +50,7 @@ SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf)
> SVE_VECTOR_WRAPPER (expf_sve, _ZGVsMxv_expf)
> SVE_VECTOR_WRAPPER (exp10f_sve, _ZGVsMxv_exp10f)
> SVE_VECTOR_WRAPPER (exp2f_sve, _ZGVsMxv_exp2f)
> +SVE_VECTOR_WRAPPER (expm1f_sve, _ZGVsMxv_expm1f)
> SVE_VECTOR_WRAPPER (logf_sve, _ZGVsMxv_logf)
> SVE_VECTOR_WRAPPER (log10f_sve, _ZGVsMxv_log10f)
> SVE_VECTOR_WRAPPER (log1pf_sve, _ZGVsMxv_log1pf)
> diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
> index a6b2f29a6f..1d52bf9ebf 100644
> --- a/sysdeps/aarch64/libm-test-ulps
> +++ b/sysdeps/aarch64/libm-test-ulps
> @@ -1078,11 +1078,19 @@ double: 1
> float: 1
> ldouble: 2
>
> +Function: "expm1_advsimd":
> +double: 2
> +float: 1
> +
> Function: "expm1_downward":
> double: 1
> float: 1
> ldouble: 2
>
> +Function: "expm1_sve":
> +double: 2
> +float: 1
> +
> Function: "expm1_towardzero":
> double: 1
> float: 2
> diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
> index 0f20b5be29..2bf4ea6332 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
> +++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
> @@ -19,6 +19,7 @@ GLIBC_2.39 _ZGVnN2v_asin F
> GLIBC_2.39 _ZGVnN2v_atan F
> GLIBC_2.39 _ZGVnN2v_exp10 F
> GLIBC_2.39 _ZGVnN2v_exp2 F
> +GLIBC_2.39 _ZGVnN2v_expm1 F
> GLIBC_2.39 _ZGVnN2v_log10 F
> GLIBC_2.39 _ZGVnN2v_log1p F
> GLIBC_2.39 _ZGVnN2v_log2 F
> @@ -29,6 +30,7 @@ GLIBC_2.39 _ZGVnN4v_asinf F
> GLIBC_2.39 _ZGVnN4v_atanf F
> GLIBC_2.39 _ZGVnN4v_exp10f F
> GLIBC_2.39 _ZGVnN4v_exp2f F
> +GLIBC_2.39 _ZGVnN4v_expm1f F
> GLIBC_2.39 _ZGVnN4v_log10f F
> GLIBC_2.39 _ZGVnN4v_log1pf F
> GLIBC_2.39 _ZGVnN4v_log2f F
> @@ -44,6 +46,8 @@ GLIBC_2.39 _ZGVsMxv_exp10 F
> GLIBC_2.39 _ZGVsMxv_exp10f F
> GLIBC_2.39 _ZGVsMxv_exp2 F
> GLIBC_2.39 _ZGVsMxv_exp2f F
> +GLIBC_2.39 _ZGVsMxv_expm1 F
> +GLIBC_2.39 _ZGVsMxv_expm1f F
> GLIBC_2.39 _ZGVsMxv_log10 F
> GLIBC_2.39 _ZGVsMxv_log10f F
> GLIBC_2.39 _ZGVsMxv_log1p F
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 1/6] aarch64: Add vector implementations of asin routines
2023-11-03 12:12 [PATCH 1/6] aarch64: Add vector implementations of asin routines Joe Ramsay
` (4 preceding siblings ...)
2023-11-03 12:12 ` [PATCH 6/6] aarch64: Add vector implementations of expm1 routines Joe Ramsay
@ 2023-11-10 17:51 ` Szabolcs Nagy
5 siblings, 0 replies; 14+ messages in thread
From: Szabolcs Nagy @ 2023-11-10 17:51 UTC (permalink / raw)
To: Joe Ramsay, libc-alpha
The 11/03/2023 12:12, Joe Ramsay wrote:
> ---
this is OK to commit.
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
> Thanks,
> Joe
> sysdeps/aarch64/fpu/Makefile | 3 +-
> sysdeps/aarch64/fpu/Versions | 4 +
> sysdeps/aarch64/fpu/asin_advsimd.c | 113 ++++++++++++++++++
> sysdeps/aarch64/fpu/asin_sve.c | 86 +++++++++++++
> sysdeps/aarch64/fpu/asinf_advsimd.c | 104 ++++++++++++++++
> sysdeps/aarch64/fpu/asinf_sve.c | 78 ++++++++++++
> sysdeps/aarch64/fpu/bits/math-vector.h | 4 +
> .../fpu/test-double-advsimd-wrappers.c | 1 +
> .../aarch64/fpu/test-double-sve-wrappers.c | 1 +
> .../aarch64/fpu/test-float-advsimd-wrappers.c | 1 +
> sysdeps/aarch64/fpu/test-float-sve-wrappers.c | 1 +
> sysdeps/aarch64/libm-test-ulps | 8 ++
> .../unix/sysv/linux/aarch64/libmvec.abilist | 4 +
> 13 files changed, 407 insertions(+), 1 deletion(-)
> create mode 100644 sysdeps/aarch64/fpu/asin_advsimd.c
> create mode 100644 sysdeps/aarch64/fpu/asin_sve.c
> create mode 100644 sysdeps/aarch64/fpu/asinf_advsimd.c
> create mode 100644 sysdeps/aarch64/fpu/asinf_sve.c
>
> diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
> index 1f1ac2a2b8..d7c0bd2ed5 100644
> --- a/sysdeps/aarch64/fpu/Makefile
> +++ b/sysdeps/aarch64/fpu/Makefile
> @@ -1,4 +1,5 @@
> -libmvec-supported-funcs = cos \
> +libmvec-supported-funcs = asin \
> + cos \
> exp \
> exp10 \
> exp2 \
> diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
> index eb5ad50017..0f365a1e2e 100644
> --- a/sysdeps/aarch64/fpu/Versions
> +++ b/sysdeps/aarch64/fpu/Versions
> @@ -18,6 +18,10 @@ libmvec {
> _ZGVsMxv_sinf;
> }
> GLIBC_2.39 {
> + _ZGVnN4v_asinf;
> + _ZGVnN2v_asin;
> + _ZGVsMxv_asinf;
> + _ZGVsMxv_asin;
> _ZGVnN4v_exp10f;
> _ZGVnN2v_exp10;
> _ZGVsMxv_exp10f;
> diff --git a/sysdeps/aarch64/fpu/asin_advsimd.c b/sysdeps/aarch64/fpu/asin_advsimd.c
> new file mode 100644
> index 0000000000..d2adbc0d87
> --- /dev/null
> +++ b/sysdeps/aarch64/fpu/asin_advsimd.c
> @@ -0,0 +1,113 @@
> +/* Double-precision AdvSIMD inverse sin
> +
> + Copyright (C) 2023 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "v_math.h"
> +#include "poly_advsimd_f64.h"
> +
> +static const struct data
> +{
> + float64x2_t poly[12];
> + float64x2_t pi_over_2;
> + uint64x2_t abs_mask;
> +} data = {
> + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x))
> + on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */
> + .poly = { V2 (0x1.555555555554ep-3), V2 (0x1.3333333337233p-4),
> + V2 (0x1.6db6db67f6d9fp-5), V2 (0x1.f1c71fbd29fbbp-6),
> + V2 (0x1.6e8b264d467d6p-6), V2 (0x1.1c5997c357e9dp-6),
> + V2 (0x1.c86a22cd9389dp-7), V2 (0x1.856073c22ebbep-7),
> + V2 (0x1.fd1151acb6bedp-8), V2 (0x1.087182f799c1dp-6),
> + V2 (-0x1.6602748120927p-7), V2 (0x1.cfa0dd1f9478p-6), },
> + .pi_over_2 = V2 (0x1.921fb54442d18p+0),
> + .abs_mask = V2 (0x7fffffffffffffff),
> +};
> +
> +#define AllMask v_u64 (0xffffffffffffffff)
> +#define One (0x3ff0000000000000)
> +#define Small (0x3e50000000000000) /* 2^-12. */
> +
> +#if WANT_SIMD_EXCEPT
> +static float64x2_t VPCS_ATTR NOINLINE
> +special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
> +{
> + return v_call_f64 (asin, x, y, special);
> +}
> +#endif
> +
> +/* Double-precision implementation of vector asin(x).
> +
> + For |x| < Small, approximate asin(x) by x. Small = 2^-12 for correct
> + rounding. If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the
> + following approximation.
> +
> + For |x| in [Small, 0.5], use an order 11 polynomial P such that the final
> + approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2).
> +
> + The largest observed error in this region is 1.01 ulps,
> + _ZGVnN2v_asin (0x1.da9735b5a9277p-2) got 0x1.ed78525a927efp-2
> + want 0x1.ed78525a927eep-2.
> +
> + For |x| in [0.5, 1.0], use same approximation with a change of variable
> +
> + asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z).
> +
> + The largest observed error in this region is 2.69 ulps,
> + _ZGVnN2v_asin (0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1
> + want 0x1.110d7e85fdd53p-1. */
> +float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x)
> +{
> + const struct data *d = ptr_barrier (&data);
> +
> + float64x2_t ax = vabsq_f64 (x);
> +
> +#if WANT_SIMD_EXCEPT
> + /* Special values need to be computed with scalar fallbacks so
> + that appropriate exceptions are raised. */
> + uint64x2_t special
> + = vcgtq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (ax), v_u64 (Small)),
> + v_u64 (One - Small));
> + if (__glibc_unlikely (v_any_u64 (special)))
> + return special_case (x, x, AllMask);
> +#endif
> +
> + uint64x2_t a_lt_half = vcltq_f64 (ax, v_f64 (0.5));
> +
> + /* Evaluate polynomial Q(x) = y + y * z * P(z) with
> + z = x ^ 2 and y = |x| , if |x| < 0.5
> + z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */
> + float64x2_t z2 = vbslq_f64 (a_lt_half, vmulq_f64 (x, x),
> + vfmsq_n_f64 (v_f64 (0.5), ax, 0.5));
> + float64x2_t z = vbslq_f64 (a_lt_half, ax, vsqrtq_f64 (z2));
> +
> + /* Use a single polynomial approximation P for both intervals. */
> + float64x2_t z4 = vmulq_f64 (z2, z2);
> + float64x2_t z8 = vmulq_f64 (z4, z4);
> + float64x2_t z16 = vmulq_f64 (z8, z8);
> + float64x2_t p = v_estrin_11_f64 (z2, z4, z8, z16, d->poly);
> +
> + /* Finalize polynomial: z + z * z2 * P(z2). */
> + p = vfmaq_f64 (z, vmulq_f64 (z, z2), p);
> +
> + /* asin(|x|) = Q(|x|) , for |x| < 0.5
> + = pi/2 - 2 Q(|x|), for |x| >= 0.5. */
> + float64x2_t y = vbslq_f64 (a_lt_half, p, vfmsq_n_f64 (d->pi_over_2, p, 2.0));
> +
> + /* Copy sign. */
> + return vbslq_f64 (d->abs_mask, y, x);
> +}
> diff --git a/sysdeps/aarch64/fpu/asin_sve.c b/sysdeps/aarch64/fpu/asin_sve.c
> new file mode 100644
> index 0000000000..fa04d7fca6
> --- /dev/null
> +++ b/sysdeps/aarch64/fpu/asin_sve.c
> @@ -0,0 +1,86 @@
> +/* Double-precision SVE inverse sin
> +
> + Copyright (C) 2023 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "sv_math.h"
> +#include "poly_sve_f64.h"
> +
> +static const struct data
> +{
> + float64_t poly[12];
> + float64_t pi_over_2f;
> +} data = {
> + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x))
> + on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */
> + .poly = { 0x1.555555555554ep-3, 0x1.3333333337233p-4,
> + 0x1.6db6db67f6d9fp-5, 0x1.f1c71fbd29fbbp-6,
> + 0x1.6e8b264d467d6p-6, 0x1.1c5997c357e9dp-6,
> + 0x1.c86a22cd9389dp-7, 0x1.856073c22ebbep-7,
> + 0x1.fd1151acb6bedp-8, 0x1.087182f799c1dp-6,
> + -0x1.6602748120927p-7, 0x1.cfa0dd1f9478p-6, },
> + .pi_over_2f = 0x1.921fb54442d18p+0,
> +};
> +
> +#define P(i) sv_f64 (d->poly[i])
> +
> +/* Double-precision SVE implementation of vector asin(x).
> +
> + For |x| in [0, 0.5], use an order 11 polynomial P such that the final
> + approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2).
> +
> + The largest observed error in this region is 0.52 ulps,
> + _ZGVsMxv_asin(0x1.d95ae04998b6cp-2) got 0x1.ec13757305f27p-2
> + want 0x1.ec13757305f26p-2.
> +
> + For |x| in [0.5, 1.0], use same approximation with a change of variable
> +
> + asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z).
> +
> + The largest observed error in this region is 2.69 ulps,
> + _ZGVsMxv_asin(0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1
> + want 0x1.110d7e85fdd53p-1. */
> +svfloat64_t SV_NAME_D1 (asin) (svfloat64_t x, const svbool_t pg)
> +{
> + const struct data *d = ptr_barrier (&data);
> +
> + svuint64_t sign = svand_x (pg, svreinterpret_u64 (x), 0x8000000000000000);
> + svfloat64_t ax = svabs_x (pg, x);
> + svbool_t a_ge_half = svacge (pg, x, 0.5);
> +
> + /* Evaluate polynomial Q(x) = y + y * z * P(z) with
> + z = x ^ 2 and y = |x| , if |x| < 0.5
> + z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */
> + svfloat64_t z2 = svsel (a_ge_half, svmls_x (pg, sv_f64 (0.5), ax, 0.5),
> + svmul_x (pg, x, x));
> + svfloat64_t z = svsqrt_m (ax, a_ge_half, z2);
> +
> + /* Use a single polynomial approximation P for both intervals. */
> + svfloat64_t z4 = svmul_x (pg, z2, z2);
> + svfloat64_t z8 = svmul_x (pg, z4, z4);
> + svfloat64_t z16 = svmul_x (pg, z8, z8);
> + svfloat64_t p = sv_estrin_11_f64_x (pg, z2, z4, z8, z16, d->poly);
> + /* Finalize polynomial: z + z * z2 * P(z2). */
> + p = svmla_x (pg, z, svmul_x (pg, z, z2), p);
> +
> + /* asin(|x|) = Q(|x|) , for |x| < 0.5
> + = pi/2 - 2 Q(|x|), for |x| >= 0.5. */
> + svfloat64_t y = svmad_m (a_ge_half, p, sv_f64 (-2.0), d->pi_over_2f);
> +
> + /* Copy sign. */
> + return svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign));
> +}
> diff --git a/sysdeps/aarch64/fpu/asinf_advsimd.c b/sysdeps/aarch64/fpu/asinf_advsimd.c
> new file mode 100644
> index 0000000000..3180ae7c8e
> --- /dev/null
> +++ b/sysdeps/aarch64/fpu/asinf_advsimd.c
> @@ -0,0 +1,104 @@
> +/* Single-precision AdvSIMD inverse sin
> +
> + Copyright (C) 2023 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "v_math.h"
> +#include "poly_advsimd_f32.h"
> +
> +static const struct data
> +{
> + float32x4_t poly[5];
> + float32x4_t pi_over_2f;
> +} data = {
> + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on
> + [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */
> + .poly = { V4 (0x1.55555ep-3), V4 (0x1.33261ap-4), V4 (0x1.70d7dcp-5),
> + V4 (0x1.b059dp-6), V4 (0x1.3af7d8p-5) },
> + .pi_over_2f = V4 (0x1.921fb6p+0f),
> +};
> +
> +#define AbsMask 0x7fffffff
> +#define Half 0x3f000000
> +#define One 0x3f800000
> +#define Small 0x39800000 /* 2^-12. */
> +
> +#if WANT_SIMD_EXCEPT
> +static float32x4_t VPCS_ATTR NOINLINE
> +special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
> +{
> + return v_call_f32 (asinf, x, y, special);
> +}
> +#endif
> +
> +/* Single-precision implementation of vector asin(x).
> +
> + For |x| < Small, approximate asin(x) by x. Small = 2^-12 for correct
> + rounding. If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the
> + following approximation.
> +
> + For |x| in [Small, 0.5], use order 4 polynomial P such that the final
> + approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2).
> +
> + The largest observed error in this region is 0.83 ulps,
> + _ZGVnN4v_asinf (0x1.ea00f4p-2) got 0x1.fef15ep-2 want 0x1.fef15cp-2.
> +
> + For |x| in [0.5, 1.0], use same approximation with a change of variable
> +
> + asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z).
> +
> + The largest observed error in this region is 2.41 ulps,
> + _ZGVnN4v_asinf (0x1.00203ep-1) got 0x1.0c3a64p-1 want 0x1.0c3a6p-1. */
> +float32x4_t VPCS_ATTR V_NAME_F1 (asin) (float32x4_t x)
> +{
> + const struct data *d = ptr_barrier (&data);
> +
> + uint32x4_t ix = vreinterpretq_u32_f32 (x);
> + uint32x4_t ia = vandq_u32 (ix, v_u32 (AbsMask));
> +
> +#if WANT_SIMD_EXCEPT
> + /* Special values need to be computed with scalar fallbacks so
> + that appropriate fp exceptions are raised. */
> + uint32x4_t special
> + = vcgtq_u32 (vsubq_u32 (ia, v_u32 (Small)), v_u32 (One - Small));
> + if (__glibc_unlikely (v_any_u32 (special)))
> + return special_case (x, x, v_u32 (0xffffffff));
> +#endif
> +
> + float32x4_t ax = vreinterpretq_f32_u32 (ia);
> + uint32x4_t a_lt_half = vcltq_u32 (ia, v_u32 (Half));
> +
> + /* Evaluate polynomial Q(x) = y + y * z * P(z) with
> + z = x ^ 2 and y = |x| , if |x| < 0.5
> + z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */
> + float32x4_t z2 = vbslq_f32 (a_lt_half, vmulq_f32 (x, x),
> + vfmsq_n_f32 (v_f32 (0.5), ax, 0.5));
> + float32x4_t z = vbslq_f32 (a_lt_half, ax, vsqrtq_f32 (z2));
> +
> + /* Use a single polynomial approximation P for both intervals. */
> + float32x4_t p = v_horner_4_f32 (z2, d->poly);
> + /* Finalize polynomial: z + z * z2 * P(z2). */
> + p = vfmaq_f32 (z, vmulq_f32 (z, z2), p);
> +
> + /* asin(|x|) = Q(|x|) , for |x| < 0.5
> + = pi/2 - 2 Q(|x|), for |x| >= 0.5. */
> + float32x4_t y
> + = vbslq_f32 (a_lt_half, p, vfmsq_n_f32 (d->pi_over_2f, p, 2.0));
> +
> + /* Copy sign. */
> + return vbslq_f32 (v_u32 (AbsMask), y, x);
> +}
> diff --git a/sysdeps/aarch64/fpu/asinf_sve.c b/sysdeps/aarch64/fpu/asinf_sve.c
> new file mode 100644
> index 0000000000..5abe710b5a
> --- /dev/null
> +++ b/sysdeps/aarch64/fpu/asinf_sve.c
> @@ -0,0 +1,78 @@
> +/* Single-precision SVE inverse sin
> +
> + Copyright (C) 2023 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "sv_math.h"
> +#include "poly_sve_f32.h"
> +
> +static const struct data
> +{
> + float32_t poly[5];
> + float32_t pi_over_2f;
> +} data = {
> + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on
> + [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */
> + .poly = { 0x1.55555ep-3, 0x1.33261ap-4, 0x1.70d7dcp-5, 0x1.b059dp-6,
> + 0x1.3af7d8p-5, },
> + .pi_over_2f = 0x1.921fb6p+0f,
> +};
> +
> +/* Single-precision SVE implementation of vector asin(x).
> +
> + For |x| in [0, 0.5], use order 4 polynomial P such that the final
> + approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2).
> +
> + The largest observed error in this region is 0.83 ulps,
> + _ZGVsMxv_asinf (0x1.ea00f4p-2) got 0x1.fef15ep-2
> + want 0x1.fef15cp-2.
> +
> + For |x| in [0.5, 1.0], use same approximation with a change of variable
> +
> + asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z).
> +
> + The largest observed error in this region is 2.41 ulps,
> + _ZGVsMxv_asinf (-0x1.00203ep-1) got -0x1.0c3a64p-1
> + want -0x1.0c3a6p-1. */
> +svfloat32_t SV_NAME_F1 (asin) (svfloat32_t x, const svbool_t pg)
> +{
> + const struct data *d = ptr_barrier (&data);
> +
> + svuint32_t sign = svand_x (pg, svreinterpret_u32 (x), 0x80000000);
> +
> + svfloat32_t ax = svabs_x (pg, x);
> + svbool_t a_ge_half = svacge (pg, x, 0.5);
> +
> + /* Evaluate polynomial Q(x) = y + y * z * P(z) with
> + z = x ^ 2 and y = |x| , if |x| < 0.5
> + z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */
> + svfloat32_t z2 = svsel (a_ge_half, svmls_x (pg, sv_f32 (0.5), ax, 0.5),
> + svmul_x (pg, x, x));
> + svfloat32_t z = svsqrt_m (ax, a_ge_half, z2);
> +
> + /* Use a single polynomial approximation P for both intervals. */
> + svfloat32_t p = sv_horner_4_f32_x (pg, z2, d->poly);
> + /* Finalize polynomial: z + z * z2 * P(z2). */
> + p = svmla_x (pg, z, svmul_x (pg, z, z2), p);
> +
> + /* asin(|x|) = Q(|x|) , for |x| < 0.5
> + = pi/2 - 2 Q(|x|), for |x| >= 0.5. */
> + svfloat32_t y = svmad_m (a_ge_half, p, sv_f32 (-2.0), d->pi_over_2f);
> +
> + /* Copy sign. */
> + return svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (y), sign));
> +}
> diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
> index 06587ffa91..03778faf96 100644
> --- a/sysdeps/aarch64/fpu/bits/math-vector.h
> +++ b/sysdeps/aarch64/fpu/bits/math-vector.h
> @@ -49,6 +49,7 @@ typedef __SVBool_t __sv_bool_t;
>
> # define __vpcs __attribute__ ((__aarch64_vector_pcs__))
>
> +__vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_exp10f (__f32x4_t);
> @@ -59,6 +60,7 @@ __vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t);
>
> +__vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t);
> __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
> __vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
> __vpcs __f64x2_t _ZGVnN2v_exp10 (__f64x2_t);
> @@ -74,6 +76,7 @@ __vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t);
>
> #ifdef __SVE_VEC_MATH_SUPPORTED
>
> +__sv_f32_t _ZGVsMxv_asinf (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_expf (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_exp10f (__sv_f32_t, __sv_bool_t);
> @@ -84,6 +87,7 @@ __sv_f32_t _ZGVsMxv_log2f (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_tanf (__sv_f32_t, __sv_bool_t);
>
> +__sv_f64_t _ZGVsMxv_asin (__sv_f64_t, __sv_bool_t);
> __sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
> __sv_f64_t _ZGVsMxv_exp (__sv_f64_t, __sv_bool_t);
> __sv_f64_t _ZGVsMxv_exp10 (__sv_f64_t, __sv_bool_t);
> diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
> index 26d5ecf66f..b5ccd6b1cc 100644
> --- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
> +++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
> @@ -23,6 +23,7 @@
>
> #define VEC_TYPE float64x2_t
>
> +VPCS_VECTOR_WRAPPER (asin_advsimd, _ZGVnN2v_asin)
> VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos)
> VPCS_VECTOR_WRAPPER (exp_advsimd, _ZGVnN2v_exp)
> VPCS_VECTOR_WRAPPER (exp10_advsimd, _ZGVnN2v_exp10)
> diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
> index 86efd60779..fc3b20f421 100644
> --- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
> +++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
> @@ -32,6 +32,7 @@
> return svlastb_f64 (svptrue_b64 (), mr); \
> }
>
> +SVE_VECTOR_WRAPPER (asin_sve, _ZGVsMxv_asin)
> SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos)
> SVE_VECTOR_WRAPPER (exp_sve, _ZGVsMxv_exp)
> SVE_VECTOR_WRAPPER (exp10_sve, _ZGVsMxv_exp10)
> diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
> index 8f7ebea1ac..0a36aa91f5 100644
> --- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
> +++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
> @@ -23,6 +23,7 @@
>
> #define VEC_TYPE float32x4_t
>
> +VPCS_VECTOR_WRAPPER (asinf_advsimd, _ZGVnN4v_asinf)
> VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf)
> VPCS_VECTOR_WRAPPER (expf_advsimd, _ZGVnN4v_expf)
> VPCS_VECTOR_WRAPPER (exp10f_advsimd, _ZGVnN4v_exp10f)
> diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
> index 885e58ac39..f7e4882c7a 100644
> --- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
> +++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
> @@ -32,6 +32,7 @@
> return svlastb_f32 (svptrue_b32 (), mr); \
> }
>
> +SVE_VECTOR_WRAPPER (asinf_sve, _ZGVsMxv_asinf)
> SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf)
> SVE_VECTOR_WRAPPER (expf_sve, _ZGVsMxv_expf)
> SVE_VECTOR_WRAPPER (exp10f_sve, _ZGVsMxv_exp10f)
> diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
> index d117209c06..1edc0fc343 100644
> --- a/sysdeps/aarch64/libm-test-ulps
> +++ b/sysdeps/aarch64/libm-test-ulps
> @@ -46,11 +46,19 @@ double: 1
> float: 1
> ldouble: 1
>
> +Function: "asin_advsimd":
> +double: 2
> +float: 2
> +
> Function: "asin_downward":
> double: 1
> float: 1
> ldouble: 2
>
> +Function: "asin_sve":
> +double: 2
> +float: 2
> +
> Function: "asin_towardzero":
> double: 1
> float: 1
> diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
> index cad774521a..6431c3fe65 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
> +++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
> @@ -14,16 +14,20 @@ GLIBC_2.38 _ZGVsMxv_log F
> GLIBC_2.38 _ZGVsMxv_logf F
> GLIBC_2.38 _ZGVsMxv_sin F
> GLIBC_2.38 _ZGVsMxv_sinf F
> +GLIBC_2.39 _ZGVnN2v_asin F
> GLIBC_2.39 _ZGVnN2v_exp10 F
> GLIBC_2.39 _ZGVnN2v_exp2 F
> GLIBC_2.39 _ZGVnN2v_log10 F
> GLIBC_2.39 _ZGVnN2v_log2 F
> GLIBC_2.39 _ZGVnN2v_tan F
> +GLIBC_2.39 _ZGVnN4v_asinf F
> GLIBC_2.39 _ZGVnN4v_exp10f F
> GLIBC_2.39 _ZGVnN4v_exp2f F
> GLIBC_2.39 _ZGVnN4v_log10f F
> GLIBC_2.39 _ZGVnN4v_log2f F
> GLIBC_2.39 _ZGVnN4v_tanf F
> +GLIBC_2.39 _ZGVsMxv_asin F
> +GLIBC_2.39 _ZGVsMxv_asinf F
> GLIBC_2.39 _ZGVsMxv_exp10 F
> GLIBC_2.39 _ZGVsMxv_exp10f F
> GLIBC_2.39 _ZGVsMxv_exp2 F
> --
> 2.27.0
>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 2/6] aarch64: Add vector implementations of acos routines
2023-11-03 12:12 ` [PATCH 2/6] aarch64: Add vector implementations of acos routines Joe Ramsay
@ 2023-11-10 17:53 ` Szabolcs Nagy
0 siblings, 0 replies; 14+ messages in thread
From: Szabolcs Nagy @ 2023-11-10 17:53 UTC (permalink / raw)
To: Joe Ramsay, libc-alpha
The 11/03/2023 12:12, Joe Ramsay wrote:
> ---
this is OK to commit.
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
> Thanks,
> Joe
> sysdeps/aarch64/fpu/Makefile | 3 +-
> sysdeps/aarch64/fpu/Versions | 4 +
> sysdeps/aarch64/fpu/acos_advsimd.c | 122 ++++++++++++++++++
> sysdeps/aarch64/fpu/acos_sve.c | 93 +++++++++++++
> sysdeps/aarch64/fpu/acosf_advsimd.c | 113 ++++++++++++++++
> sysdeps/aarch64/fpu/acosf_sve.c | 86 ++++++++++++
> sysdeps/aarch64/fpu/bits/math-vector.h | 4 +
> .../fpu/test-double-advsimd-wrappers.c | 1 +
> .../aarch64/fpu/test-double-sve-wrappers.c | 1 +
> .../aarch64/fpu/test-float-advsimd-wrappers.c | 1 +
> sysdeps/aarch64/fpu/test-float-sve-wrappers.c | 1 +
> sysdeps/aarch64/libm-test-ulps | 8 ++
> .../unix/sysv/linux/aarch64/libmvec.abilist | 4 +
> 13 files changed, 440 insertions(+), 1 deletion(-)
> create mode 100644 sysdeps/aarch64/fpu/acos_advsimd.c
> create mode 100644 sysdeps/aarch64/fpu/acos_sve.c
> create mode 100644 sysdeps/aarch64/fpu/acosf_advsimd.c
> create mode 100644 sysdeps/aarch64/fpu/acosf_sve.c
>
> diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
> index d7c0bd2ed5..606fdd804f 100644
> --- a/sysdeps/aarch64/fpu/Makefile
> +++ b/sysdeps/aarch64/fpu/Makefile
> @@ -1,4 +1,5 @@
> -libmvec-supported-funcs = asin \
> +libmvec-supported-funcs = acos \
> + asin \
> cos \
> exp \
> exp10 \
> diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
> index 0f365a1e2e..1037cd92bd 100644
> --- a/sysdeps/aarch64/fpu/Versions
> +++ b/sysdeps/aarch64/fpu/Versions
> @@ -18,6 +18,10 @@ libmvec {
> _ZGVsMxv_sinf;
> }
> GLIBC_2.39 {
> + _ZGVnN4v_acosf;
> + _ZGVnN2v_acos;
> + _ZGVsMxv_acosf;
> + _ZGVsMxv_acos;
> _ZGVnN4v_asinf;
> _ZGVnN2v_asin;
> _ZGVsMxv_asinf;
> diff --git a/sysdeps/aarch64/fpu/acos_advsimd.c b/sysdeps/aarch64/fpu/acos_advsimd.c
> new file mode 100644
> index 0000000000..3121cf66b1
> --- /dev/null
> +++ b/sysdeps/aarch64/fpu/acos_advsimd.c
> @@ -0,0 +1,122 @@
> +/* Double-precision AdvSIMD inverse cos
> +
> + Copyright (C) 2023 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "v_math.h"
> +#include "poly_advsimd_f64.h"
> +
> +static const struct data
> +{
> + float64x2_t poly[12];
> + float64x2_t pi, pi_over_2;
> + uint64x2_t abs_mask;
> +} data = {
> + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x))
> + on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */
> + .poly = { V2 (0x1.555555555554ep-3), V2 (0x1.3333333337233p-4),
> + V2 (0x1.6db6db67f6d9fp-5), V2 (0x1.f1c71fbd29fbbp-6),
> + V2 (0x1.6e8b264d467d6p-6), V2 (0x1.1c5997c357e9dp-6),
> + V2 (0x1.c86a22cd9389dp-7), V2 (0x1.856073c22ebbep-7),
> + V2 (0x1.fd1151acb6bedp-8), V2 (0x1.087182f799c1dp-6),
> + V2 (-0x1.6602748120927p-7), V2 (0x1.cfa0dd1f9478p-6), },
> + .pi = V2 (0x1.921fb54442d18p+1),
> + .pi_over_2 = V2 (0x1.921fb54442d18p+0),
> + .abs_mask = V2 (0x7fffffffffffffff),
> +};
> +
> +#define AllMask v_u64 (0xffffffffffffffff)
> +#define Oneu (0x3ff0000000000000)
> +#define Small (0x3e50000000000000) /* 2^-26. */
> +
> +#if WANT_SIMD_EXCEPT
> +static float64x2_t VPCS_ATTR NOINLINE
> +special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
> +{
> + return v_call_f64 (acos, x, y, special);
> +}
> +#endif
> +
> +/* Double-precision implementation of vector acos(x).
> +
> + For |x| < Small, approximate acos(x) by pi/2 - x. Small = 2^-26 for correct
> + rounding.
> + If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the following
> + approximation.
> +
> + For |x| in [Small, 0.5], use an order 11 polynomial P such that the final
> + approximation of asin is an odd polynomial:
> +
> + acos(x) ~ pi/2 - (x + x^3 P(x^2)).
> +
> + The largest observed error in this region is 1.18 ulps,
> + _ZGVnN2v_acos (0x1.fbab0a7c460f6p-2) got 0x1.0d54d1985c068p+0
> + want 0x1.0d54d1985c069p+0.
> +
> + For |x| in [0.5, 1.0], use same approximation with a change of variable
> +
> + acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z).
> +
> + The largest observed error in this region is 1.52 ulps,
> + _ZGVnN2v_acos (0x1.23d362722f591p-1) got 0x1.edbbedf8a7d6ep-1
> + want 0x1.edbbedf8a7d6cp-1. */
> +float64x2_t VPCS_ATTR V_NAME_D1 (acos) (float64x2_t x)
> +{
> + const struct data *d = ptr_barrier (&data);
> +
> + float64x2_t ax = vabsq_f64 (x);
> +
> +#if WANT_SIMD_EXCEPT
> + /* A single comparison for One, Small and QNaN. */
> + uint64x2_t special
> + = vcgtq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (ax), v_u64 (Small)),
> + v_u64 (Oneu - Small));
> + if (__glibc_unlikely (v_any_u64 (special)))
> + return special_case (x, x, AllMask);
> +#endif
> +
> + uint64x2_t a_le_half = vcleq_f64 (ax, v_f64 (0.5));
> +
> + /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with
> + z2 = x ^ 2 and z = |x| , if |x| < 0.5
> + z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */
> + float64x2_t z2 = vbslq_f64 (a_le_half, vmulq_f64 (x, x),
> + vfmaq_f64 (v_f64 (0.5), v_f64 (-0.5), ax));
> + float64x2_t z = vbslq_f64 (a_le_half, ax, vsqrtq_f64 (z2));
> +
> + /* Use a single polynomial approximation P for both intervals. */
> + float64x2_t z4 = vmulq_f64 (z2, z2);
> + float64x2_t z8 = vmulq_f64 (z4, z4);
> + float64x2_t z16 = vmulq_f64 (z8, z8);
> + float64x2_t p = v_estrin_11_f64 (z2, z4, z8, z16, d->poly);
> +
> + /* Finalize polynomial: z + z * z2 * P(z2). */
> + p = vfmaq_f64 (z, vmulq_f64 (z, z2), p);
> +
> + /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5
> + = 2 Q(|x|) , for 0.5 < x < 1.0
> + = pi - 2 Q(|x|) , for -1.0 < x < -0.5. */
> + float64x2_t y = vbslq_f64 (d->abs_mask, p, x);
> +
> + uint64x2_t is_neg = vcltzq_f64 (x);
> + float64x2_t off = vreinterpretq_f64_u64 (
> + vandq_u64 (is_neg, vreinterpretq_u64_f64 (d->pi)));
> + float64x2_t mul = vbslq_f64 (a_le_half, v_f64 (-1.0), v_f64 (2.0));
> + float64x2_t add = vbslq_f64 (a_le_half, d->pi_over_2, off);
> +
> + return vfmaq_f64 (add, mul, y);
> +}
> diff --git a/sysdeps/aarch64/fpu/acos_sve.c b/sysdeps/aarch64/fpu/acos_sve.c
> new file mode 100644
> index 0000000000..1138a04e73
> --- /dev/null
> +++ b/sysdeps/aarch64/fpu/acos_sve.c
> @@ -0,0 +1,93 @@
> +/* Double-precision SVE inverse cos
> +
> + Copyright (C) 2023 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "sv_math.h"
> +#include "poly_sve_f64.h"
> +
> +static const struct data
> +{
> + float64_t poly[12];
> + float64_t pi, pi_over_2;
> +} data = {
> + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x))
> + on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */
> + .poly = { 0x1.555555555554ep-3, 0x1.3333333337233p-4, 0x1.6db6db67f6d9fp-5,
> + 0x1.f1c71fbd29fbbp-6, 0x1.6e8b264d467d6p-6, 0x1.1c5997c357e9dp-6,
> + 0x1.c86a22cd9389dp-7, 0x1.856073c22ebbep-7, 0x1.fd1151acb6bedp-8,
> + 0x1.087182f799c1dp-6, -0x1.6602748120927p-7, 0x1.cfa0dd1f9478p-6, },
> + .pi = 0x1.921fb54442d18p+1,
> + .pi_over_2 = 0x1.921fb54442d18p+0,
> +};
> +
> +/* Double-precision SVE implementation of vector acos(x).
> +
> + For |x| in [0, 0.5], use an order 11 polynomial P such that the final
> + approximation of asin is an odd polynomial:
> +
> + acos(x) ~ pi/2 - (x + x^3 P(x^2)).
> +
> + The largest observed error in this region is 1.18 ulps,
> + _ZGVsMxv_acos (0x1.fbc5fe28ee9e3p-2) got 0x1.0d4d0f55667f6p+0
> + want 0x1.0d4d0f55667f7p+0.
> +
> + For |x| in [0.5, 1.0], use same approximation with a change of variable
> +
> + acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z).
> +
> + The largest observed error in this region is 1.52 ulps,
> + _ZGVsMxv_acos (0x1.24024271a500ap-1) got 0x1.ed82df4243f0dp-1
> + want 0x1.ed82df4243f0bp-1. */
> +svfloat64_t SV_NAME_D1 (acos) (svfloat64_t x, const svbool_t pg)
> +{
> + const struct data *d = ptr_barrier (&data);
> +
> + svuint64_t sign = svand_x (pg, svreinterpret_u64 (x), 0x8000000000000000);
> + svfloat64_t ax = svabs_x (pg, x);
> +
> + svbool_t a_gt_half = svacgt (pg, x, 0.5);
> +
> + /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with
> + z2 = x ^ 2 and z = |x| , if |x| < 0.5
> + z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */
> + svfloat64_t z2 = svsel (a_gt_half, svmls_x (pg, sv_f64 (0.5), ax, 0.5),
> + svmul_x (pg, x, x));
> + svfloat64_t z = svsqrt_m (ax, a_gt_half, z2);
> +
> + /* Use a single polynomial approximation P for both intervals. */
> + svfloat64_t z4 = svmul_x (pg, z2, z2);
> + svfloat64_t z8 = svmul_x (pg, z4, z4);
> + svfloat64_t z16 = svmul_x (pg, z8, z8);
> + svfloat64_t p = sv_estrin_11_f64_x (pg, z2, z4, z8, z16, d->poly);
> +
> + /* Finalize polynomial: z + z * z2 * P(z2). */
> + p = svmla_x (pg, z, svmul_x (pg, z, z2), p);
> +
> + /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5
> + = 2 Q(|x|) , for 0.5 < x < 1.0
> + = pi - 2 Q(|x|) , for -1.0 < x < -0.5. */
> + svfloat64_t y
> + = svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (p), sign));
> +
> + svbool_t is_neg = svcmplt (pg, x, 0.0);
> + svfloat64_t off = svdup_f64_z (is_neg, d->pi);
> + svfloat64_t mul = svsel (a_gt_half, sv_f64 (2.0), sv_f64 (-1.0));
> + svfloat64_t add = svsel (a_gt_half, off, sv_f64 (d->pi_over_2));
> +
> + return svmla_x (pg, add, mul, y);
> +}
> diff --git a/sysdeps/aarch64/fpu/acosf_advsimd.c b/sysdeps/aarch64/fpu/acosf_advsimd.c
> new file mode 100644
> index 0000000000..7d39e9b805
> --- /dev/null
> +++ b/sysdeps/aarch64/fpu/acosf_advsimd.c
> @@ -0,0 +1,113 @@
> +/* Single-precision AdvSIMD inverse cos
> +
> + Copyright (C) 2023 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "v_math.h"
> +#include "poly_advsimd_f32.h"
> +
> +static const struct data
> +{
> + float32x4_t poly[5];
> + float32x4_t pi_over_2f, pif;
> +} data = {
> + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on
> + [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */
> + .poly = { V4 (0x1.55555ep-3), V4 (0x1.33261ap-4), V4 (0x1.70d7dcp-5),
> + V4 (0x1.b059dp-6), V4 (0x1.3af7d8p-5) },
> + .pi_over_2f = V4 (0x1.921fb6p+0f),
> + .pif = V4 (0x1.921fb6p+1f),
> +};
> +
> +#define AbsMask 0x7fffffff
> +#define Half 0x3f000000
> +#define One 0x3f800000
> +#define Small 0x32800000 /* 2^-26. */
> +
> +#if WANT_SIMD_EXCEPT
> +static float32x4_t VPCS_ATTR NOINLINE
> +special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
> +{
> + return v_call_f32 (acosf, x, y, special);
> +}
> +#endif
> +
> +/* Single-precision implementation of vector acos(x).
> +
> + For |x| < Small, approximate acos(x) by pi/2 - x. Small = 2^-26 for correct
> + rounding.
> + If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the following
> + approximation.
> +
> + For |x| in [Small, 0.5], use order 4 polynomial P such that the final
> + approximation of asin is an odd polynomial:
> +
> + acos(x) ~ pi/2 - (x + x^3 P(x^2)).
> +
> + The largest observed error in this region is 1.26 ulps,
> + _ZGVnN4v_acosf (0x1.843bfcp-2) got 0x1.2e934cp+0 want 0x1.2e934ap+0.
> +
> + For |x| in [0.5, 1.0], use same approximation with a change of variable
> +
> + acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z).
> +
> + The largest observed error in this region is 1.32 ulps,
> + _ZGVnN4v_acosf (0x1.15ba56p-1) got 0x1.feb33p-1
> + want 0x1.feb32ep-1. */
> +float32x4_t VPCS_ATTR V_NAME_F1 (acos) (float32x4_t x)
> +{
> + const struct data *d = ptr_barrier (&data);
> +
> + uint32x4_t ix = vreinterpretq_u32_f32 (x);
> + uint32x4_t ia = vandq_u32 (ix, v_u32 (AbsMask));
> +
> +#if WANT_SIMD_EXCEPT
> + /* A single comparison for One, Small and QNaN. */
> + uint32x4_t special
> + = vcgtq_u32 (vsubq_u32 (ia, v_u32 (Small)), v_u32 (One - Small));
> + if (__glibc_unlikely (v_any_u32 (special)))
> + return special_case (x, x, v_u32 (0xffffffff));
> +#endif
> +
> + float32x4_t ax = vreinterpretq_f32_u32 (ia);
> + uint32x4_t a_le_half = vcleq_u32 (ia, v_u32 (Half));
> +
> + /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with
> + z2 = x ^ 2 and z = |x| , if |x| < 0.5
> + z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */
> + float32x4_t z2 = vbslq_f32 (a_le_half, vmulq_f32 (x, x),
> + vfmsq_n_f32 (v_f32 (0.5), ax, 0.5));
> + float32x4_t z = vbslq_f32 (a_le_half, ax, vsqrtq_f32 (z2));
> +
> + /* Use a single polynomial approximation P for both intervals. */
> + float32x4_t p = v_horner_4_f32 (z2, d->poly);
> + /* Finalize polynomial: z + z * z2 * P(z2). */
> + p = vfmaq_f32 (z, vmulq_f32 (z, z2), p);
> +
> + /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5
> + = 2 Q(|x|) , for 0.5 < x < 1.0
> + = pi - 2 Q(|x|) , for -1.0 < x < -0.5. */
> + float32x4_t y = vbslq_f32 (v_u32 (AbsMask), p, x);
> +
> + uint32x4_t is_neg = vcltzq_f32 (x);
> + float32x4_t off = vreinterpretq_f32_u32 (
> + vandq_u32 (vreinterpretq_u32_f32 (d->pif), is_neg));
> + float32x4_t mul = vbslq_f32 (a_le_half, v_f32 (-1.0), v_f32 (2.0));
> + float32x4_t add = vbslq_f32 (a_le_half, d->pi_over_2f, off);
> +
> + return vfmaq_f32 (add, mul, y);
> +}
> diff --git a/sysdeps/aarch64/fpu/acosf_sve.c b/sysdeps/aarch64/fpu/acosf_sve.c
> new file mode 100644
> index 0000000000..44253fa999
> --- /dev/null
> +++ b/sysdeps/aarch64/fpu/acosf_sve.c
> @@ -0,0 +1,86 @@
> +/* Single-precision SVE inverse cos
> +
> + Copyright (C) 2023 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "sv_math.h"
> +#include "poly_sve_f32.h"
> +
> +static const struct data
> +{
> + float32_t poly[5];
> + float32_t pi, pi_over_2;
> +} data = {
> + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on
> + [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */
> + .poly = { 0x1.55555ep-3, 0x1.33261ap-4, 0x1.70d7dcp-5, 0x1.b059dp-6,
> + 0x1.3af7d8p-5, },
> + .pi = 0x1.921fb6p+1f,
> + .pi_over_2 = 0x1.921fb6p+0f,
> +};
> +
> +/* Single-precision SVE implementation of vector acos(x).
> +
> + For |x| in [0, 0.5], use order 4 polynomial P such that the final
> + approximation of asin is an odd polynomial:
> +
> + acos(x) ~ pi/2 - (x + x^3 P(x^2)).
> +
> + The largest observed error in this region is 1.16 ulps,
> + _ZGVsMxv_acosf(0x1.ffbeccp-2) got 0x1.0c27f8p+0
> + want 0x1.0c27f6p+0.
> +
> + For |x| in [0.5, 1.0], use same approximation with a change of variable
> +
> + acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z).
> +
> + The largest observed error in this region is 1.32 ulps,
> + _ZGVsMxv_acosf (0x1.15ba56p-1) got 0x1.feb33p-1
> + want 0x1.feb32ep-1. */
> +svfloat32_t SV_NAME_F1 (acos) (svfloat32_t x, const svbool_t pg)
> +{
> + const struct data *d = ptr_barrier (&data);
> +
> + svuint32_t sign = svand_x (pg, svreinterpret_u32 (x), 0x80000000);
> + svfloat32_t ax = svabs_x (pg, x);
> + svbool_t a_gt_half = svacgt (pg, x, 0.5);
> +
> + /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with
> + z2 = x ^ 2 and z = |x| , if |x| < 0.5
> + z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */
> + svfloat32_t z2 = svsel (a_gt_half, svmls_x (pg, sv_f32 (0.5), ax, 0.5),
> + svmul_x (pg, x, x));
> + svfloat32_t z = svsqrt_m (ax, a_gt_half, z2);
> +
> + /* Use a single polynomial approximation P for both intervals. */
> + svfloat32_t p = sv_horner_4_f32_x (pg, z2, d->poly);
> + /* Finalize polynomial: z + z * z2 * P(z2). */
> + p = svmla_x (pg, z, svmul_x (pg, z, z2), p);
> +
> + /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5
> + = 2 Q(|x|) , for 0.5 < x < 1.0
> + = pi - 2 Q(|x|) , for -1.0 < x < -0.5. */
> + svfloat32_t y
> + = svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (p), sign));
> +
> + svbool_t is_neg = svcmplt (pg, x, 0.0);
> + svfloat32_t off = svdup_f32_z (is_neg, d->pi);
> + svfloat32_t mul = svsel (a_gt_half, sv_f32 (2.0), sv_f32 (-1.0));
> + svfloat32_t add = svsel (a_gt_half, off, sv_f32 (d->pi_over_2));
> +
> + return svmla_x (pg, add, mul, y);
> +}
> diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
> index 03778faf96..f313993d70 100644
> --- a/sysdeps/aarch64/fpu/bits/math-vector.h
> +++ b/sysdeps/aarch64/fpu/bits/math-vector.h
> @@ -49,6 +49,7 @@ typedef __SVBool_t __sv_bool_t;
>
> # define __vpcs __attribute__ ((__aarch64_vector_pcs__))
>
> +__vpcs __f32x4_t _ZGVnN4v_acosf (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
> @@ -60,6 +61,7 @@ __vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t);
>
> +__vpcs __f64x2_t _ZGVnN2v_acos (__f64x2_t);
> __vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t);
> __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
> __vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
> @@ -76,6 +78,7 @@ __vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t);
>
> #ifdef __SVE_VEC_MATH_SUPPORTED
>
> +__sv_f32_t _ZGVsMxv_acosf (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_asinf (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_expf (__sv_f32_t, __sv_bool_t);
> @@ -87,6 +90,7 @@ __sv_f32_t _ZGVsMxv_log2f (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_tanf (__sv_f32_t, __sv_bool_t);
>
> +__sv_f64_t _ZGVsMxv_acos (__sv_f64_t, __sv_bool_t);
> __sv_f64_t _ZGVsMxv_asin (__sv_f64_t, __sv_bool_t);
> __sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
> __sv_f64_t _ZGVsMxv_exp (__sv_f64_t, __sv_bool_t);
> diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
> index b5ccd6b1cc..5a0cbf743b 100644
> --- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
> +++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
> @@ -23,6 +23,7 @@
>
> #define VEC_TYPE float64x2_t
>
> +VPCS_VECTOR_WRAPPER (acos_advsimd, _ZGVnN2v_acos)
> VPCS_VECTOR_WRAPPER (asin_advsimd, _ZGVnN2v_asin)
> VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos)
> VPCS_VECTOR_WRAPPER (exp_advsimd, _ZGVnN2v_exp)
> diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
> index fc3b20f421..bd89ff6133 100644
> --- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
> +++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
> @@ -32,6 +32,7 @@
> return svlastb_f64 (svptrue_b64 (), mr); \
> }
>
> +SVE_VECTOR_WRAPPER (acos_sve, _ZGVsMxv_acos)
> SVE_VECTOR_WRAPPER (asin_sve, _ZGVsMxv_asin)
> SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos)
> SVE_VECTOR_WRAPPER (exp_sve, _ZGVsMxv_exp)
> diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
> index 0a36aa91f5..3fafca7557 100644
> --- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
> +++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
> @@ -23,6 +23,7 @@
>
> #define VEC_TYPE float32x4_t
>
> +VPCS_VECTOR_WRAPPER (acosf_advsimd, _ZGVnN4v_acosf)
> VPCS_VECTOR_WRAPPER (asinf_advsimd, _ZGVnN4v_asinf)
> VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf)
> VPCS_VECTOR_WRAPPER (expf_advsimd, _ZGVnN4v_expf)
> diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
> index f7e4882c7a..b4ec9f777b 100644
> --- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
> +++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
> @@ -32,6 +32,7 @@
> return svlastb_f32 (svptrue_b32 (), mr); \
> }
>
> +SVE_VECTOR_WRAPPER (acosf_sve, _ZGVsMxv_acosf)
> SVE_VECTOR_WRAPPER (asinf_sve, _ZGVsMxv_asinf)
> SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf)
> SVE_VECTOR_WRAPPER (expf_sve, _ZGVsMxv_expf)
> diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
> index 1edc0fc343..c2b6f21b9d 100644
> --- a/sysdeps/aarch64/libm-test-ulps
> +++ b/sysdeps/aarch64/libm-test-ulps
> @@ -6,11 +6,19 @@ double: 1
> float: 1
> ldouble: 1
>
> +Function: "acos_advsimd":
> +double: 1
> +float: 1
> +
> Function: "acos_downward":
> double: 1
> float: 1
> ldouble: 1
>
> +Function: "acos_sve":
> +double: 1
> +float: 1
> +
> Function: "acos_towardzero":
> double: 1
> float: 1
> diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
> index 6431c3fe65..f79eaaf241 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
> +++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
> @@ -14,18 +14,22 @@ GLIBC_2.38 _ZGVsMxv_log F
> GLIBC_2.38 _ZGVsMxv_logf F
> GLIBC_2.38 _ZGVsMxv_sin F
> GLIBC_2.38 _ZGVsMxv_sinf F
> +GLIBC_2.39 _ZGVnN2v_acos F
> GLIBC_2.39 _ZGVnN2v_asin F
> GLIBC_2.39 _ZGVnN2v_exp10 F
> GLIBC_2.39 _ZGVnN2v_exp2 F
> GLIBC_2.39 _ZGVnN2v_log10 F
> GLIBC_2.39 _ZGVnN2v_log2 F
> GLIBC_2.39 _ZGVnN2v_tan F
> +GLIBC_2.39 _ZGVnN4v_acosf F
> GLIBC_2.39 _ZGVnN4v_asinf F
> GLIBC_2.39 _ZGVnN4v_exp10f F
> GLIBC_2.39 _ZGVnN4v_exp2f F
> GLIBC_2.39 _ZGVnN4v_log10f F
> GLIBC_2.39 _ZGVnN4v_log2f F
> GLIBC_2.39 _ZGVnN4v_tanf F
> +GLIBC_2.39 _ZGVsMxv_acos F
> +GLIBC_2.39 _ZGVsMxv_acosf F
> GLIBC_2.39 _ZGVsMxv_asin F
> GLIBC_2.39 _ZGVsMxv_asinf F
> GLIBC_2.39 _ZGVsMxv_exp10 F
> --
> 2.27.0
>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 3/6] aarch64: Add vector implementations of atan routines
2023-11-03 12:12 ` [PATCH 3/6] aarch64: Add vector implementations of atan routines Joe Ramsay
@ 2023-11-10 17:54 ` Szabolcs Nagy
0 siblings, 0 replies; 14+ messages in thread
From: Szabolcs Nagy @ 2023-11-10 17:54 UTC (permalink / raw)
To: Joe Ramsay, libc-alpha
The 11/03/2023 12:12, Joe Ramsay wrote:
> ---
this is OK to commit.
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
> Thanks,
> Joe
> sysdeps/aarch64/fpu/Makefile | 1 +
> sysdeps/aarch64/fpu/Versions | 4 +
> sysdeps/aarch64/fpu/atan_advsimd.c | 104 +++++++++++++++++
> sysdeps/aarch64/fpu/atan_sve.c | 90 +++++++++++++++
> sysdeps/aarch64/fpu/atanf_advsimd.c | 109 ++++++++++++++++++
> sysdeps/aarch64/fpu/atanf_sve.c | 79 +++++++++++++
> sysdeps/aarch64/fpu/bits/math-vector.h | 4 +
> .../fpu/test-double-advsimd-wrappers.c | 1 +
> .../aarch64/fpu/test-double-sve-wrappers.c | 1 +
> .../aarch64/fpu/test-float-advsimd-wrappers.c | 1 +
> sysdeps/aarch64/fpu/test-float-sve-wrappers.c | 1 +
> sysdeps/aarch64/libm-test-ulps | 8 ++
> .../unix/sysv/linux/aarch64/libmvec.abilist | 4 +
> 13 files changed, 407 insertions(+)
> create mode 100644 sysdeps/aarch64/fpu/atan_advsimd.c
> create mode 100644 sysdeps/aarch64/fpu/atan_sve.c
> create mode 100644 sysdeps/aarch64/fpu/atanf_advsimd.c
> create mode 100644 sysdeps/aarch64/fpu/atanf_sve.c
>
> diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
> index 606fdd804f..5bd77a749d 100644
> --- a/sysdeps/aarch64/fpu/Makefile
> +++ b/sysdeps/aarch64/fpu/Makefile
> @@ -1,5 +1,6 @@
> libmvec-supported-funcs = acos \
> asin \
> + atan \
> cos \
> exp \
> exp10 \
> diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
> index 1037cd92bd..dfc3d2dad3 100644
> --- a/sysdeps/aarch64/fpu/Versions
> +++ b/sysdeps/aarch64/fpu/Versions
> @@ -26,6 +26,10 @@ libmvec {
> _ZGVnN2v_asin;
> _ZGVsMxv_asinf;
> _ZGVsMxv_asin;
> + _ZGVnN4v_atanf;
> + _ZGVnN2v_atan;
> + _ZGVsMxv_atanf;
> + _ZGVsMxv_atan;
> _ZGVnN4v_exp10f;
> _ZGVnN2v_exp10;
> _ZGVsMxv_exp10f;
> diff --git a/sysdeps/aarch64/fpu/atan_advsimd.c b/sysdeps/aarch64/fpu/atan_advsimd.c
> new file mode 100644
> index 0000000000..d52c07d8a0
> --- /dev/null
> +++ b/sysdeps/aarch64/fpu/atan_advsimd.c
> @@ -0,0 +1,104 @@
> +/* Double-precision AdvSIMD inverse tan
> +
> + Copyright (C) 2023 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "v_math.h"
> +#include "poly_advsimd_f64.h"
> +
> +static const struct data
> +{
> + float64x2_t pi_over_2;
> + float64x2_t poly[20];
> +} data = {
> + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
> + [2**-1022, 1.0]. */
> + .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3),
> + V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4),
> + V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4),
> + V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5),
> + V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5),
> + V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5),
> + V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6),
> + V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7),
> + V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10),
> + V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), },
> + .pi_over_2 = V2 (0x1.921fb54442d18p+0),
> +};
> +
> +#define SignMask v_u64 (0x8000000000000000)
> +#define TinyBound 0x3e10000000000000 /* asuint64(0x1p-30). */
> +#define BigBound 0x4340000000000000 /* asuint64(0x1p53). */
> +
> +/* Fast implementation of vector atan.
> + Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using
> + z=1/x and shift = pi/2. Maximum observed error is 2.27 ulps:
> + _ZGVnN2v_atan (0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1
> + want 0x1.9225645bdd7c3p-1. */
> +float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x)
> +{
> + const struct data *d = ptr_barrier (&data);
> +
> + /* Small cases, infs and nans are supported by our approximation technique,
> + but do not set fenv flags correctly. Only trigger special case if we need
> + fenv. */
> + uint64x2_t ix = vreinterpretq_u64_f64 (x);
> + uint64x2_t sign = vandq_u64 (ix, SignMask);
> +
> +#if WANT_SIMD_EXCEPT
> + uint64x2_t ia12 = vandq_u64 (ix, v_u64 (0x7ff0000000000000));
> + uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia12, v_u64 (TinyBound)),
> + v_u64 (BigBound - TinyBound));
> + /* If any lane is special, fall back to the scalar routine for all lanes. */
> + if (__glibc_unlikely (v_any_u64 (special)))
> + return v_call_f64 (atan, x, v_f64 (0), v_u64 (-1));
> +#endif
> +
> + /* Argument reduction:
> + y := arctan(x) for x < 1
> + y := pi/2 + arctan(-1/x) for x > 1
> + Hence, use z=-1/a if x>=1, otherwise z=a. */
> + uint64x2_t red = vcagtq_f64 (x, v_f64 (1.0));
> + /* Avoid dependency in abs(x) in division (and comparison). */
> + float64x2_t z = vbslq_f64 (red, vdivq_f64 (v_f64 (1.0), x), x);
> + float64x2_t shift = vreinterpretq_f64_u64 (
> + vandq_u64 (red, vreinterpretq_u64_f64 (d->pi_over_2)));
> + /* Use absolute value only when needed (odd powers of z). */
> + float64x2_t az = vbslq_f64 (
> + SignMask, vreinterpretq_f64_u64 (vandq_u64 (SignMask, red)), z);
> +
> + /* Calculate the polynomial approximation.
> + Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of
> + full scheme to avoid underflow in x^16.
> + The order 19 polynomial P approximates
> + (atan(sqrt(x))-sqrt(x))/x^(3/2). */
> + float64x2_t z2 = vmulq_f64 (z, z);
> + float64x2_t x2 = vmulq_f64 (z2, z2);
> + float64x2_t x4 = vmulq_f64 (x2, x2);
> + float64x2_t x8 = vmulq_f64 (x4, x4);
> + float64x2_t y
> + = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, d->poly),
> + v_estrin_11_f64 (z2, x2, x4, x8, d->poly + 8), x8);
> +
> + /* Finalize. y = shift + z + z^3 * P(z^2). */
> + y = vfmaq_f64 (az, y, vmulq_f64 (z2, az));
> + y = vaddq_f64 (y, shift);
> +
> + /* y = atan(x) if x>0, -atan(-x) otherwise. */
> + y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), sign));
> + return y;
> +}
> diff --git a/sysdeps/aarch64/fpu/atan_sve.c b/sysdeps/aarch64/fpu/atan_sve.c
> new file mode 100644
> index 0000000000..35587ef212
> --- /dev/null
> +++ b/sysdeps/aarch64/fpu/atan_sve.c
> @@ -0,0 +1,90 @@
> +/* Double-precision SVE inverse tan
> +
> + Copyright (C) 2023 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "sv_math.h"
> +#include "poly_sve_f64.h"
> +
> +static const struct data
> +{
> + float64_t poly[20];
> + float64_t pi_over_2;
> +} data = {
> + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
> + [2**-1022, 1.0]. */
> + .poly = { -0x1.5555555555555p-2, 0x1.99999999996c1p-3, -0x1.2492492478f88p-3,
> + 0x1.c71c71bc3951cp-4, -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4,
> + -0x1.11100ee084227p-4, 0x1.e1d0f9696f63bp-5, -0x1.aebfe7b418581p-5,
> + 0x1.842dbe9b0d916p-5, -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5,
> + -0x1.00e6eece7de8p-5, 0x1.860897b29e5efp-6, -0x1.0051381722a59p-6,
> + 0x1.14e9dc19a4a4ep-7, -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10,
> + -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16, },
> + .pi_over_2 = 0x1.921fb54442d18p+0,
> +};
> +
> +/* Useful constants. */
> +#define SignMask (0x8000000000000000)
> +
> +/* Fast implementation of SVE atan.
> + Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using
> +   z=1/x and shift = pi/2. Largest errors occur for inputs close to +/-1.0.
> +   The maximum observed error is 2.27 ulps:
> + _ZGVsMxv_atan (0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1
> + want 0x1.9225645bdd7c3p-1. */
> +svfloat64_t SV_NAME_D1 (atan) (svfloat64_t x, const svbool_t pg)
> +{
> + const struct data *d = ptr_barrier (&data);
> +
> + /* No need to trigger special case. Small cases, infs and nans
> + are supported by our approximation technique. */
> + svuint64_t ix = svreinterpret_u64 (x);
> + svuint64_t sign = svand_x (pg, ix, SignMask);
> +
> + /* Argument reduction:
> + y := arctan(x) for x < 1
> + y := pi/2 + arctan(-1/x) for x > 1
> + Hence, use z=-1/a if x>=1, otherwise z=a. */
> + svbool_t red = svacgt (pg, x, 1.0);
> + /* Avoid dependency in abs(x) in division (and comparison). */
> + svfloat64_t z = svsel (red, svdivr_x (pg, x, 1.0), x);
> + /* Use absolute value only when needed (odd powers of z). */
> + svfloat64_t az = svabs_x (pg, z);
> + az = svneg_m (az, red, az);
> +
> + /* Use split Estrin scheme for P(z^2) with deg(P)=19. */
> + svfloat64_t z2 = svmul_x (pg, z, z);
> + svfloat64_t x2 = svmul_x (pg, z2, z2);
> + svfloat64_t x4 = svmul_x (pg, x2, x2);
> + svfloat64_t x8 = svmul_x (pg, x4, x4);
> +
> + svfloat64_t y
> + = svmla_x (pg, sv_estrin_7_f64_x (pg, z2, x2, x4, d->poly),
> + sv_estrin_11_f64_x (pg, z2, x2, x4, x8, d->poly + 8), x8);
> +
> + /* y = shift + z + z^3 * P(z^2). */
> + svfloat64_t z3 = svmul_x (pg, z2, az);
> + y = svmla_x (pg, az, z3, y);
> +
> + /* Apply shift as indicated by `red` predicate. */
> + y = svadd_m (red, y, d->pi_over_2);
> +
> + /* y = atan(x) if x>0, -atan(-x) otherwise. */
> + y = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign));
> +
> + return y;
> +}
> diff --git a/sysdeps/aarch64/fpu/atanf_advsimd.c b/sysdeps/aarch64/fpu/atanf_advsimd.c
> new file mode 100644
> index 0000000000..589b0e8c96
> --- /dev/null
> +++ b/sysdeps/aarch64/fpu/atanf_advsimd.c
> @@ -0,0 +1,109 @@
> +/* Single-precision AdvSIMD inverse tan
> +
> + Copyright (C) 2023 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "v_math.h"
> +#include "poly_advsimd_f32.h"
> +
> +static const struct data
> +{
> + float32x4_t poly[8];
> + float32x4_t pi_over_2;
> +} data = {
> + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
> + [2**-128, 1.0].
> + Generated using fpminimax between FLT_MIN and 1. */
> + .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f),
> + V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f),
> + V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) },
> + .pi_over_2 = V4 (0x1.921fb6p+0f),
> +};
> +
> +#define SignMask v_u32 (0x80000000)
> +
> +#define P(i) d->poly[i]
> +
> +#define TinyBound 0x30800000 /* asuint(0x1p-30). */
> +#define BigBound 0x4e800000 /* asuint(0x1p30). */
> +
> +#if WANT_SIMD_EXCEPT
> +static float32x4_t VPCS_ATTR NOINLINE
> +special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
> +{
> + return v_call_f32 (atanf, x, y, special);
> +}
> +#endif
> +
> +/* Fast implementation of vector atanf based on
> + atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1]
> +   using z=-1/x and shift = pi/2. Maximum observed error is 2.9 ulps:
> + _ZGVnN4v_atanf (0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1. */
> +float32x4_t VPCS_ATTR V_NAME_F1 (atan) (float32x4_t x)
> +{
> + const struct data *d = ptr_barrier (&data);
> +
> + /* Small cases, infs and nans are supported by our approximation technique,
> + but do not set fenv flags correctly. Only trigger special case if we need
> + fenv. */
> + uint32x4_t ix = vreinterpretq_u32_f32 (x);
> + uint32x4_t sign = vandq_u32 (ix, SignMask);
> +
> +#if WANT_SIMD_EXCEPT
> + uint32x4_t ia = vandq_u32 (ix, v_u32 (0x7ff00000));
> + uint32x4_t special = vcgtq_u32 (vsubq_u32 (ia, v_u32 (TinyBound)),
> + v_u32 (BigBound - TinyBound));
> + /* If any lane is special, fall back to the scalar routine for all lanes. */
> + if (__glibc_unlikely (v_any_u32 (special)))
> + return special_case (x, x, v_u32 (-1));
> +#endif
> +
> + /* Argument reduction:
> + y := arctan(x) for x < 1
> + y := pi/2 + arctan(-1/x) for x > 1
> + Hence, use z=-1/a if x>=1, otherwise z=a. */
> + uint32x4_t red = vcagtq_f32 (x, v_f32 (1.0));
> + /* Avoid dependency in abs(x) in division (and comparison). */
> + float32x4_t z = vbslq_f32 (red, vdivq_f32 (v_f32 (1.0f), x), x);
> + float32x4_t shift = vreinterpretq_f32_u32 (
> + vandq_u32 (red, vreinterpretq_u32_f32 (d->pi_over_2)));
> + /* Use absolute value only when needed (odd powers of z). */
> + float32x4_t az = vbslq_f32 (
> + SignMask, vreinterpretq_f32_u32 (vandq_u32 (SignMask, red)), z);
> +
> + /* Calculate the polynomial approximation.
> + Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However,
> + a standard implementation using z8 creates spurious underflow
> + in the very last fma (when z^8 is small enough).
> + Therefore, we split the last fma into a mul and an fma.
> + Horner and single-level Estrin have higher errors that exceed
> + threshold. */
> + float32x4_t z2 = vmulq_f32 (z, z);
> + float32x4_t z4 = vmulq_f32 (z2, z2);
> +
> + float32x4_t y = vfmaq_f32 (
> + v_pairwise_poly_3_f32 (z2, z4, d->poly), z4,
> + vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, d->poly + 4)));
> +
> + /* y = shift + z * P(z^2). */
> + y = vaddq_f32 (vfmaq_f32 (az, y, vmulq_f32 (z2, az)), shift);
> +
> + /* y = atan(x) if x>0, -atan(-x) otherwise. */
> + y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), sign));
> +
> + return y;
> +}
> diff --git a/sysdeps/aarch64/fpu/atanf_sve.c b/sysdeps/aarch64/fpu/atanf_sve.c
> new file mode 100644
> index 0000000000..9453e7aa29
> --- /dev/null
> +++ b/sysdeps/aarch64/fpu/atanf_sve.c
> @@ -0,0 +1,79 @@
> +/* Single-precision SVE inverse tan
> +
> + Copyright (C) 2023 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "sv_math.h"
> +#include "poly_sve_f32.h"
> +
> +static const struct data
> +{
> + float32_t poly[8];
> + float32_t pi_over_2;
> +} data = {
> + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
> + [2**-128, 1.0]. */
> + .poly = { -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f,
> + -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, 0x1.01fd88p-8f },
> + .pi_over_2 = 0x1.921fb6p+0f,
> +};
> +
> +#define SignMask (0x80000000)
> +
> +/* Fast implementation of SVE atanf based on
> + atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using
> + z=-1/x and shift = pi/2.
> + Largest observed error is 2.9 ULP, close to +/-1.0:
> + _ZGVsMxv_atanf (0x1.0468f6p+0) got -0x1.967f06p-1
> + want -0x1.967fp-1. */
> +svfloat32_t SV_NAME_F1 (atan) (svfloat32_t x, const svbool_t pg)
> +{
> + const struct data *d = ptr_barrier (&data);
> +
> + /* No need to trigger special case. Small cases, infs and nans
> + are supported by our approximation technique. */
> + svuint32_t ix = svreinterpret_u32 (x);
> + svuint32_t sign = svand_x (pg, ix, SignMask);
> +
> + /* Argument reduction:
> + y := arctan(x) for x < 1
> + y := pi/2 + arctan(-1/x) for x > 1
> + Hence, use z=-1/a if x>=1, otherwise z=a. */
> + svbool_t red = svacgt (pg, x, 1.0f);
> + /* Avoid dependency in abs(x) in division (and comparison). */
> + svfloat32_t z = svsel (red, svdiv_x (pg, sv_f32 (1.0f), x), x);
> + /* Use absolute value only when needed (odd powers of z). */
> + svfloat32_t az = svabs_x (pg, z);
> + az = svneg_m (az, red, az);
> +
> + /* Use split Estrin scheme for P(z^2) with deg(P)=7. */
> + svfloat32_t z2 = svmul_x (pg, z, z);
> + svfloat32_t z4 = svmul_x (pg, z2, z2);
> + svfloat32_t z8 = svmul_x (pg, z4, z4);
> +
> + svfloat32_t y = sv_estrin_7_f32_x (pg, z2, z4, z8, d->poly);
> +
> + /* y = shift + z + z^3 * P(z^2). */
> + svfloat32_t z3 = svmul_x (pg, z2, az);
> + y = svmla_x (pg, az, z3, y);
> +
> + /* Apply shift as indicated by 'red' predicate. */
> + y = svadd_m (red, y, sv_f32 (d->pi_over_2));
> +
> + /* y = atan(x) if x>0, -atan(-x) otherwise. */
> + return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign));
> +}
> diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
> index f313993d70..37aa74fe50 100644
> --- a/sysdeps/aarch64/fpu/bits/math-vector.h
> +++ b/sysdeps/aarch64/fpu/bits/math-vector.h
> @@ -51,6 +51,7 @@ typedef __SVBool_t __sv_bool_t;
>
> __vpcs __f32x4_t _ZGVnN4v_acosf (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t);
> +__vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_exp10f (__f32x4_t);
> @@ -63,6 +64,7 @@ __vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t);
>
> __vpcs __f64x2_t _ZGVnN2v_acos (__f64x2_t);
> __vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t);
> +__vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t);
> __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
> __vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
> __vpcs __f64x2_t _ZGVnN2v_exp10 (__f64x2_t);
> @@ -80,6 +82,7 @@ __vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t);
>
> __sv_f32_t _ZGVsMxv_acosf (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_asinf (__sv_f32_t, __sv_bool_t);
> +__sv_f32_t _ZGVsMxv_atanf (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_expf (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_exp10f (__sv_f32_t, __sv_bool_t);
> @@ -92,6 +95,7 @@ __sv_f32_t _ZGVsMxv_tanf (__sv_f32_t, __sv_bool_t);
>
> __sv_f64_t _ZGVsMxv_acos (__sv_f64_t, __sv_bool_t);
> __sv_f64_t _ZGVsMxv_asin (__sv_f64_t, __sv_bool_t);
> +__sv_f64_t _ZGVsMxv_atan (__sv_f64_t, __sv_bool_t);
> __sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
> __sv_f64_t _ZGVsMxv_exp (__sv_f64_t, __sv_bool_t);
> __sv_f64_t _ZGVsMxv_exp10 (__sv_f64_t, __sv_bool_t);
> diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
> index 5a0cbf743b..6954fe7435 100644
> --- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
> +++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
> @@ -25,6 +25,7 @@
>
> VPCS_VECTOR_WRAPPER (acos_advsimd, _ZGVnN2v_acos)
> VPCS_VECTOR_WRAPPER (asin_advsimd, _ZGVnN2v_asin)
> +VPCS_VECTOR_WRAPPER (atan_advsimd, _ZGVnN2v_atan)
> VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos)
> VPCS_VECTOR_WRAPPER (exp_advsimd, _ZGVnN2v_exp)
> VPCS_VECTOR_WRAPPER (exp10_advsimd, _ZGVnN2v_exp10)
> diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
> index bd89ff6133..1173d8f9ae 100644
> --- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
> +++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
> @@ -34,6 +34,7 @@
>
> SVE_VECTOR_WRAPPER (acos_sve, _ZGVsMxv_acos)
> SVE_VECTOR_WRAPPER (asin_sve, _ZGVsMxv_asin)
> +SVE_VECTOR_WRAPPER (atan_sve, _ZGVsMxv_atan)
> SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos)
> SVE_VECTOR_WRAPPER (exp_sve, _ZGVsMxv_exp)
> SVE_VECTOR_WRAPPER (exp10_sve, _ZGVsMxv_exp10)
> diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
> index 3fafca7557..387efc30f8 100644
> --- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
> +++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
> @@ -25,6 +25,7 @@
>
> VPCS_VECTOR_WRAPPER (acosf_advsimd, _ZGVnN4v_acosf)
> VPCS_VECTOR_WRAPPER (asinf_advsimd, _ZGVnN4v_asinf)
> +VPCS_VECTOR_WRAPPER (atanf_advsimd, _ZGVnN4v_atanf)
> VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf)
> VPCS_VECTOR_WRAPPER (expf_advsimd, _ZGVnN4v_expf)
> VPCS_VECTOR_WRAPPER (exp10f_advsimd, _ZGVnN4v_exp10f)
> diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
> index b4ec9f777b..dddd4cb213 100644
> --- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
> +++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
> @@ -34,6 +34,7 @@
>
> SVE_VECTOR_WRAPPER (acosf_sve, _ZGVsMxv_acosf)
> SVE_VECTOR_WRAPPER (asinf_sve, _ZGVsMxv_asinf)
> +SVE_VECTOR_WRAPPER (atanf_sve, _ZGVsMxv_atanf)
> SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf)
> SVE_VECTOR_WRAPPER (expf_sve, _ZGVsMxv_expf)
> SVE_VECTOR_WRAPPER (exp10f_sve, _ZGVsMxv_exp10f)
> diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
> index c2b6f21b9d..24a99e10da 100644
> --- a/sysdeps/aarch64/libm-test-ulps
> +++ b/sysdeps/aarch64/libm-test-ulps
> @@ -121,11 +121,19 @@ double: 1
> float: 1
> ldouble: 2
>
> +Function: "atan_advsimd":
> +double: 1
> +float: 1
> +
> Function: "atan_downward":
> double: 1
> float: 2
> ldouble: 2
>
> +Function: "atan_sve":
> +double: 1
> +float: 1
> +
> Function: "atan_towardzero":
> double: 1
> float: 1
> diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
> index f79eaaf241..a2d1b8fb6d 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
> +++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
> @@ -16,6 +16,7 @@ GLIBC_2.38 _ZGVsMxv_sin F
> GLIBC_2.38 _ZGVsMxv_sinf F
> GLIBC_2.39 _ZGVnN2v_acos F
> GLIBC_2.39 _ZGVnN2v_asin F
> +GLIBC_2.39 _ZGVnN2v_atan F
> GLIBC_2.39 _ZGVnN2v_exp10 F
> GLIBC_2.39 _ZGVnN2v_exp2 F
> GLIBC_2.39 _ZGVnN2v_log10 F
> @@ -23,6 +24,7 @@ GLIBC_2.39 _ZGVnN2v_log2 F
> GLIBC_2.39 _ZGVnN2v_tan F
> GLIBC_2.39 _ZGVnN4v_acosf F
> GLIBC_2.39 _ZGVnN4v_asinf F
> +GLIBC_2.39 _ZGVnN4v_atanf F
> GLIBC_2.39 _ZGVnN4v_exp10f F
> GLIBC_2.39 _ZGVnN4v_exp2f F
> GLIBC_2.39 _ZGVnN4v_log10f F
> @@ -32,6 +34,8 @@ GLIBC_2.39 _ZGVsMxv_acos F
> GLIBC_2.39 _ZGVsMxv_acosf F
> GLIBC_2.39 _ZGVsMxv_asin F
> GLIBC_2.39 _ZGVsMxv_asinf F
> +GLIBC_2.39 _ZGVsMxv_atan F
> +GLIBC_2.39 _ZGVsMxv_atanf F
> GLIBC_2.39 _ZGVsMxv_exp10 F
> GLIBC_2.39 _ZGVsMxv_exp10f F
> GLIBC_2.39 _ZGVsMxv_exp2 F
> --
> 2.27.0
>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 4/6] aarch64: Add vector implementations of atan2 routines
2023-11-03 12:12 ` [PATCH 4/6] aarch64: Add vector implementations of atan2 routines Joe Ramsay
@ 2023-11-10 18:01 ` Szabolcs Nagy
2023-11-14 8:54 ` Szabolcs Nagy
0 siblings, 1 reply; 14+ messages in thread
From: Szabolcs Nagy @ 2023-11-10 18:01 UTC (permalink / raw)
To: Joe Ramsay, libc-alpha
The 11/03/2023 12:12, Joe Ramsay wrote:
> ---
this is OK to commit.
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
> Thanks,
> Joe
> sysdeps/aarch64/fpu/Makefile | 1 +
> sysdeps/aarch64/fpu/Versions | 4 +
> sysdeps/aarch64/fpu/atan2_advsimd.c | 121 ++++++++++++++++++
> sysdeps/aarch64/fpu/atan2_sve.c | 118 +++++++++++++++++
> sysdeps/aarch64/fpu/atan2f_advsimd.c | 116 +++++++++++++++++
> sysdeps/aarch64/fpu/atan2f_sve.c | 110 ++++++++++++++++
> sysdeps/aarch64/fpu/bits/math-vector.h | 4 +
> .../fpu/test-double-advsimd-wrappers.c | 1 +
> .../aarch64/fpu/test-double-sve-wrappers.c | 11 ++
> .../aarch64/fpu/test-float-advsimd-wrappers.c | 1 +
> sysdeps/aarch64/fpu/test-float-sve-wrappers.c | 11 ++
> .../aarch64/fpu/test-vpcs-vector-wrapper.h | 14 ++
> sysdeps/aarch64/fpu/vecmath_config.h | 11 ++
> sysdeps/aarch64/libm-test-ulps | 8 ++
> .../unix/sysv/linux/aarch64/libmvec.abilist | 4 +
> 15 files changed, 535 insertions(+)
> create mode 100644 sysdeps/aarch64/fpu/atan2_advsimd.c
> create mode 100644 sysdeps/aarch64/fpu/atan2_sve.c
> create mode 100644 sysdeps/aarch64/fpu/atan2f_advsimd.c
> create mode 100644 sysdeps/aarch64/fpu/atan2f_sve.c
>
> diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
> index 5bd77a749d..364efbeac1 100644
> --- a/sysdeps/aarch64/fpu/Makefile
> +++ b/sysdeps/aarch64/fpu/Makefile
> @@ -1,6 +1,7 @@
> libmvec-supported-funcs = acos \
> asin \
> atan \
> + atan2 \
> cos \
> exp \
> exp10 \
> diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
> index dfc3d2dad3..99492b3d33 100644
> --- a/sysdeps/aarch64/fpu/Versions
> +++ b/sysdeps/aarch64/fpu/Versions
> @@ -30,6 +30,10 @@ libmvec {
> _ZGVnN2v_atan;
> _ZGVsMxv_atanf;
> _ZGVsMxv_atan;
> + _ZGVnN4vv_atan2f;
> + _ZGVnN2vv_atan2;
> + _ZGVsMxvv_atan2f;
> + _ZGVsMxvv_atan2;
> _ZGVnN4v_exp10f;
> _ZGVnN2v_exp10;
> _ZGVsMxv_exp10f;
> diff --git a/sysdeps/aarch64/fpu/atan2_advsimd.c b/sysdeps/aarch64/fpu/atan2_advsimd.c
> new file mode 100644
> index 0000000000..fcc6be0d6c
> --- /dev/null
> +++ b/sysdeps/aarch64/fpu/atan2_advsimd.c
> @@ -0,0 +1,121 @@
> +/* Double-precision AdvSIMD atan2
> +
> + Copyright (C) 2023 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "v_math.h"
> +#include "poly_advsimd_f64.h"
> +
> +static const struct data
> +{
> + float64x2_t pi_over_2;
> + float64x2_t poly[20];
> +} data = {
> + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
> + the interval [2**-1022, 1.0]. */
> + .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3),
> + V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4),
> + V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4),
> + V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5),
> + V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5),
> + V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5),
> + V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6),
> + V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7),
> + V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10),
> + V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), },
> + .pi_over_2 = V2 (0x1.921fb54442d18p+0),
> +};
> +
> +#define SignMask v_u64 (0x8000000000000000)
> +
> +/* Special cases i.e. 0, infinity, NaN (fall back to scalar calls). */
> +static float64x2_t VPCS_ATTR NOINLINE
> +special_case (float64x2_t y, float64x2_t x, float64x2_t ret, uint64x2_t cmp)
> +{
> + return v_call2_f64 (atan2, y, x, ret, cmp);
> +}
> +
> +/* Returns 1 if input is the bit representation of 0, infinity or nan. */
> +static inline uint64x2_t
> +zeroinfnan (uint64x2_t i)
> +{
> + /* (2 * i - 1) >= (2 * asuint64 (INFINITY) - 1). */
> + return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)),
> + v_u64 (2 * asuint64 (INFINITY) - 1));
> +}
> +
> +/* Fast implementation of vector atan2.
> + Maximum observed error is 2.8 ulps:
> + _ZGVnN2vv_atan2 (0x1.9651a429a859ap+5, 0x1.953075f4ee26p+5)
> + got 0x1.92d628ab678ccp-1
> + want 0x1.92d628ab678cfp-1. */
> +float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x)
> +{
> + const struct data *data_ptr = ptr_barrier (&data);
> +
> + uint64x2_t ix = vreinterpretq_u64_f64 (x);
> + uint64x2_t iy = vreinterpretq_u64_f64 (y);
> +
> + uint64x2_t special_cases = vorrq_u64 (zeroinfnan (ix), zeroinfnan (iy));
> +
> + uint64x2_t sign_x = vandq_u64 (ix, SignMask);
> + uint64x2_t sign_y = vandq_u64 (iy, SignMask);
> + uint64x2_t sign_xy = veorq_u64 (sign_x, sign_y);
> +
> + float64x2_t ax = vabsq_f64 (x);
> + float64x2_t ay = vabsq_f64 (y);
> +
> + uint64x2_t pred_xlt0 = vcltzq_f64 (x);
> + uint64x2_t pred_aygtax = vcgtq_f64 (ay, ax);
> +
> + /* Set up z for call to atan. */
> + float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay);
> + float64x2_t d = vbslq_f64 (pred_aygtax, ay, ax);
> + float64x2_t z = vdivq_f64 (n, d);
> +
> + /* Work out the correct shift. */
> + float64x2_t shift = vreinterpretq_f64_u64 (
> + vandq_u64 (pred_xlt0, vreinterpretq_u64_f64 (v_f64 (-2.0))));
> + shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift);
> + shift = vmulq_f64 (shift, data_ptr->pi_over_2);
> +
> + /* Calculate the polynomial approximation.
> + Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of
> + full scheme to avoid underflow in x^16.
> + The order 19 polynomial P approximates
> + (atan(sqrt(x))-sqrt(x))/x^(3/2). */
> + float64x2_t z2 = vmulq_f64 (z, z);
> + float64x2_t x2 = vmulq_f64 (z2, z2);
> + float64x2_t x4 = vmulq_f64 (x2, x2);
> + float64x2_t x8 = vmulq_f64 (x4, x4);
> + float64x2_t ret
> + = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, data_ptr->poly),
> + v_estrin_11_f64 (z2, x2, x4, x8, data_ptr->poly + 8), x8);
> +
> + /* Finalize. y = shift + z + z^3 * P(z^2). */
> + ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z));
> + ret = vaddq_f64 (ret, shift);
> +
> + /* Account for the sign of x and y. */
> + ret = vreinterpretq_f64_u64 (
> + veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy));
> +
> + if (__glibc_unlikely (v_any_u64 (special_cases)))
> + return special_case (y, x, ret, special_cases);
> +
> + return ret;
> +}
> diff --git a/sysdeps/aarch64/fpu/atan2_sve.c b/sysdeps/aarch64/fpu/atan2_sve.c
> new file mode 100644
> index 0000000000..6dbc2f3769
> --- /dev/null
> +++ b/sysdeps/aarch64/fpu/atan2_sve.c
> @@ -0,0 +1,118 @@
> +/* Double-precision SVE atan2
> +
> + Copyright (C) 2023 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "sv_math.h"
> +#include "poly_sve_f64.h"
> +
> +static const struct data
> +{
> + float64_t poly[20];
> + float64_t pi_over_2;
> +} data = {
> + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
> + [2**-1022, 1.0]. */
> + .poly = { -0x1.5555555555555p-2, 0x1.99999999996c1p-3, -0x1.2492492478f88p-3,
> + 0x1.c71c71bc3951cp-4, -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4,
> + -0x1.11100ee084227p-4, 0x1.e1d0f9696f63bp-5, -0x1.aebfe7b418581p-5,
> + 0x1.842dbe9b0d916p-5, -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5,
> + -0x1.00e6eece7de8p-5, 0x1.860897b29e5efp-6, -0x1.0051381722a59p-6,
> + 0x1.14e9dc19a4a4ep-7, -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10,
> + -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16, },
> + .pi_over_2 = 0x1.921fb54442d18p+0,
> +};
> +
> +/* Useful constants. */
> +#define SignMask sv_u64 (0x8000000000000000)
> +
> +/* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */
> +static svfloat64_t NOINLINE
> +special_case (svfloat64_t y, svfloat64_t x, svfloat64_t ret,
> + const svbool_t cmp)
> +{
> + return sv_call2_f64 (atan2, y, x, ret, cmp);
> +}
> +
> +/* Returns a predicate indicating true if the input is the bit representation
> + of 0, infinity or nan. */
> +static inline svbool_t
> +zeroinfnan (svuint64_t i, const svbool_t pg)
> +{
> + return svcmpge (pg, svsub_x (pg, svlsl_x (pg, i, 1), 1),
> + sv_u64 (2 * asuint64 (INFINITY) - 1));
> +}
> +
> +/* Fast implementation of SVE atan2. Errors are greatest when y and
> + x are reasonably close together. The greatest observed error is 2.28 ULP:
> + _ZGVsMxvv_atan2 (-0x1.5915b1498e82fp+732, 0x1.54d11ef838826p+732)
> + got -0x1.954f42f1fa841p-1 want -0x1.954f42f1fa843p-1. */
> +svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg)
> +{
> + const struct data *data_ptr = ptr_barrier (&data);
> +
> + svuint64_t ix = svreinterpret_u64 (x);
> + svuint64_t iy = svreinterpret_u64 (y);
> +
> + svbool_t cmp_x = zeroinfnan (ix, pg);
> + svbool_t cmp_y = zeroinfnan (iy, pg);
> + svbool_t cmp_xy = svorr_z (pg, cmp_x, cmp_y);
> +
> + svuint64_t sign_x = svand_x (pg, ix, SignMask);
> + svuint64_t sign_y = svand_x (pg, iy, SignMask);
> + svuint64_t sign_xy = sveor_x (pg, sign_x, sign_y);
> +
> + svfloat64_t ax = svabs_x (pg, x);
> + svfloat64_t ay = svabs_x (pg, y);
> +
> + svbool_t pred_xlt0 = svcmplt (pg, x, 0.0);
> + svbool_t pred_aygtax = svcmpgt (pg, ay, ax);
> +
> + /* Set up z for call to atan. */
> + svfloat64_t n = svsel (pred_aygtax, svneg_x (pg, ax), ay);
> + svfloat64_t d = svsel (pred_aygtax, ay, ax);
> + svfloat64_t z = svdiv_x (pg, n, d);
> +
> + /* Work out the correct shift. */
> + svfloat64_t shift = svsel (pred_xlt0, sv_f64 (-2.0), sv_f64 (0.0));
> + shift = svsel (pred_aygtax, svadd_x (pg, shift, 1.0), shift);
> + shift = svmul_x (pg, shift, data_ptr->pi_over_2);
> +
> + /* Use split Estrin scheme for P(z^2) with deg(P)=19. */
> + svfloat64_t z2 = svmul_x (pg, z, z);
> + svfloat64_t x2 = svmul_x (pg, z2, z2);
> + svfloat64_t x4 = svmul_x (pg, x2, x2);
> + svfloat64_t x8 = svmul_x (pg, x4, x4);
> +
> + svfloat64_t ret = svmla_x (
> + pg, sv_estrin_7_f64_x (pg, z2, x2, x4, data_ptr->poly),
> + sv_estrin_11_f64_x (pg, z2, x2, x4, x8, data_ptr->poly + 8), x8);
> +
> + /* y = shift + z + z^3 * P(z^2). */
> + svfloat64_t z3 = svmul_x (pg, z2, z);
> + ret = svmla_x (pg, z, z3, ret);
> +
> + ret = svadd_m (pg, ret, shift);
> +
> + /* Account for the sign of x and y. */
> + ret = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ret), sign_xy));
> +
> + if (__glibc_unlikely (svptest_any (pg, cmp_xy)))
> + return special_case (y, x, ret, cmp_xy);
> +
> + return ret;
> +}
> diff --git a/sysdeps/aarch64/fpu/atan2f_advsimd.c b/sysdeps/aarch64/fpu/atan2f_advsimd.c
> new file mode 100644
> index 0000000000..5a5a6202d1
> --- /dev/null
> +++ b/sysdeps/aarch64/fpu/atan2f_advsimd.c
> @@ -0,0 +1,116 @@
> +/* Single-precision AdvSIMD atan2
> +
> + Copyright (C) 2023 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "v_math.h"
> +#include "poly_advsimd_f32.h"
> +
> +static const struct data
> +{
> + float32x4_t poly[8];
> + float32x4_t pi_over_2;
> +} data = {
> + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
> + [2**-128, 1.0].
> + Generated using fpminimax between FLT_MIN and 1. */
> + .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f),
> + V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f),
> + V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) },
> + .pi_over_2 = V4 (0x1.921fb6p+0f),
> +};
> +
> +#define SignMask v_u32 (0x80000000)
> +
> +/* Special cases i.e. 0, infinity and nan (fall back to scalar calls). */
> +static float32x4_t VPCS_ATTR NOINLINE
> +special_case (float32x4_t y, float32x4_t x, float32x4_t ret, uint32x4_t cmp)
> +{
> + return v_call2_f32 (atan2f, y, x, ret, cmp);
> +}
> +
> +/* Returns 1 if input is the bit representation of 0, infinity or nan. */
> +static inline uint32x4_t
> +zeroinfnan (uint32x4_t i)
> +{
> + /* 2 * i - 1 >= 2 * 0x7f800000lu - 1. */
> + return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)),
> + v_u32 (2 * 0x7f800000lu - 1));
> +}
> +
> +/* Fast implementation of vector atan2f. Maximum observed error is
> + 2.95 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]:
> + _ZGVnN4vv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1
> + want 0x1.967f00p-1. */
> +float32x4_t VPCS_ATTR V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
> +{
> + const struct data *data_ptr = ptr_barrier (&data);
> +
> + uint32x4_t ix = vreinterpretq_u32_f32 (x);
> + uint32x4_t iy = vreinterpretq_u32_f32 (y);
> +
> + uint32x4_t special_cases = vorrq_u32 (zeroinfnan (ix), zeroinfnan (iy));
> +
> + uint32x4_t sign_x = vandq_u32 (ix, SignMask);
> + uint32x4_t sign_y = vandq_u32 (iy, SignMask);
> + uint32x4_t sign_xy = veorq_u32 (sign_x, sign_y);
> +
> + float32x4_t ax = vabsq_f32 (x);
> + float32x4_t ay = vabsq_f32 (y);
> +
> + uint32x4_t pred_xlt0 = vcltzq_f32 (x);
> + uint32x4_t pred_aygtax = vcgtq_f32 (ay, ax);
> +
> + /* Set up z for call to atanf. */
> + float32x4_t n = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay);
> + float32x4_t d = vbslq_f32 (pred_aygtax, ay, ax);
> + float32x4_t z = vdivq_f32 (n, d);
> +
> + /* Work out the correct shift. */
> + float32x4_t shift = vreinterpretq_f32_u32 (
> + vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-2.0f))));
> + shift = vbslq_f32 (pred_aygtax, vaddq_f32 (shift, v_f32 (1.0f)), shift);
> + shift = vmulq_f32 (shift, data_ptr->pi_over_2);
> +
> + /* Calculate the polynomial approximation.
> + Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However,
> + a standard implementation using z8 creates spurious underflow
> + in the very last fma (when z^8 is small enough).
> + Therefore, we split the last fma into a mul and an fma.
> + Horner and single-level Estrin have higher errors that exceed
> + threshold. */
> + float32x4_t z2 = vmulq_f32 (z, z);
> + float32x4_t z4 = vmulq_f32 (z2, z2);
> +
> + float32x4_t ret = vfmaq_f32 (
> + v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly), z4,
> + vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly + 4)));
> +
> + /* y = shift + z * P(z^2). */
> + ret = vaddq_f32 (vfmaq_f32 (z, ret, vmulq_f32 (z2, z)), shift);
> +
> + /* Account for the sign of y. */
> + ret = vreinterpretq_f32_u32 (
> + veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
> +
> + if (__glibc_unlikely (v_any_u32 (special_cases)))
> + {
> + return special_case (y, x, ret, special_cases);
> + }
> +
> + return ret;
> +}
> diff --git a/sysdeps/aarch64/fpu/atan2f_sve.c b/sysdeps/aarch64/fpu/atan2f_sve.c
> new file mode 100644
> index 0000000000..606a62c144
> --- /dev/null
> +++ b/sysdeps/aarch64/fpu/atan2f_sve.c
> @@ -0,0 +1,110 @@
> +/* Single-precision SVE atan2
> +
> + Copyright (C) 2023 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "sv_math.h"
> +#include "poly_sve_f32.h"
> +
> +static const struct data
> +{
> + float32_t poly[8];
> + float32_t pi_over_2;
> +} data = {
> + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
> + [2**-128, 1.0]. */
> + .poly = { -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f,
> + -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, 0x1.01fd88p-8f },
> + .pi_over_2 = 0x1.921fb6p+0f,
> +};
> +
> +#define SignMask sv_u32 (0x80000000)
> +
> +/* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */
> +static inline svfloat32_t
> +special_case (svfloat32_t y, svfloat32_t x, svfloat32_t ret,
> + const svbool_t cmp)
> +{
> + return sv_call2_f32 (atan2f, y, x, ret, cmp);
> +}
> +
> +/* Returns a predicate indicating true if the input is the bit representation
> + of 0, infinity or nan. */
> +static inline svbool_t
> +zeroinfnan (svuint32_t i, const svbool_t pg)
> +{
> + return svcmpge (pg, svsub_x (pg, svlsl_x (pg, i, 1), 1),
> + sv_u32 (2 * 0x7f800000lu - 1));
> +}
> +
> +/* Fast implementation of SVE atan2f based on atan(x) ~ shift + z + z^3 *
> + P(z^2) with reduction to [0,1] using z=1/x and shift = pi/2. Maximum
> + observed error is 2.95 ULP:
> + _ZGVsMxvv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1
> + want 0x1.967f00p-1. */
> +svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg)
> +{
> + const struct data *data_ptr = ptr_barrier (&data);
> +
> + svuint32_t ix = svreinterpret_u32 (x);
> + svuint32_t iy = svreinterpret_u32 (y);
> +
> + svbool_t cmp_x = zeroinfnan (ix, pg);
> + svbool_t cmp_y = zeroinfnan (iy, pg);
> + svbool_t cmp_xy = svorr_z (pg, cmp_x, cmp_y);
> +
> + svuint32_t sign_x = svand_x (pg, ix, SignMask);
> + svuint32_t sign_y = svand_x (pg, iy, SignMask);
> + svuint32_t sign_xy = sveor_x (pg, sign_x, sign_y);
> +
> + svfloat32_t ax = svabs_x (pg, x);
> + svfloat32_t ay = svabs_x (pg, y);
> +
> + svbool_t pred_xlt0 = svcmplt (pg, x, 0.0);
> + svbool_t pred_aygtax = svcmpgt (pg, ay, ax);
> +
> + /* Set up z for call to atan. */
> + svfloat32_t n = svsel (pred_aygtax, svneg_x (pg, ax), ay);
> + svfloat32_t d = svsel (pred_aygtax, ay, ax);
> + svfloat32_t z = svdiv_x (pg, n, d);
> +
> + /* Work out the correct shift. */
> + svfloat32_t shift = svsel (pred_xlt0, sv_f32 (-2.0), sv_f32 (0.0));
> + shift = svsel (pred_aygtax, svadd_x (pg, shift, 1.0), shift);
> + shift = svmul_x (pg, shift, sv_f32 (data_ptr->pi_over_2));
> +
> + /* Use split Estrin scheme for P(z^2) with deg(P)=7. */
> + svfloat32_t z2 = svmul_x (pg, z, z);
> + svfloat32_t z4 = svmul_x (pg, z2, z2);
> + svfloat32_t z8 = svmul_x (pg, z4, z4);
> +
> + svfloat32_t ret = sv_estrin_7_f32_x (pg, z2, z4, z8, data_ptr->poly);
> +
> + /* ret = shift + z + z^3 * P(z^2). */
> + svfloat32_t z3 = svmul_x (pg, z2, z);
> + ret = svmla_x (pg, z, z3, ret);
> +
> + ret = svadd_m (pg, ret, shift);
> +
> + /* Account for the sign of x and y. */
> + ret = svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy));
> +
> + if (__glibc_unlikely (svptest_any (pg, cmp_xy)))
> + return special_case (y, x, ret, cmp_xy);
> +
> + return ret;
> +}
> diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
> index 37aa74fe50..7666c09083 100644
> --- a/sysdeps/aarch64/fpu/bits/math-vector.h
> +++ b/sysdeps/aarch64/fpu/bits/math-vector.h
> @@ -49,6 +49,7 @@ typedef __SVBool_t __sv_bool_t;
>
> # define __vpcs __attribute__ ((__aarch64_vector_pcs__))
>
> +__vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_acosf (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t);
> @@ -62,6 +63,7 @@ __vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t);
>
> +__vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t);
> __vpcs __f64x2_t _ZGVnN2v_acos (__f64x2_t);
> __vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t);
> __vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t);
> @@ -80,6 +82,7 @@ __vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t);
>
> #ifdef __SVE_VEC_MATH_SUPPORTED
>
> +__sv_f32_t _ZGVsMxvv_atan2f (__sv_f32_t, __sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_acosf (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_asinf (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_atanf (__sv_f32_t, __sv_bool_t);
> @@ -93,6 +96,7 @@ __sv_f32_t _ZGVsMxv_log2f (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_tanf (__sv_f32_t, __sv_bool_t);
>
> +__sv_f64_t _ZGVsMxvv_atan2 (__sv_f64_t, __sv_f64_t, __sv_bool_t);
> __sv_f64_t _ZGVsMxv_acos (__sv_f64_t, __sv_bool_t);
> __sv_f64_t _ZGVsMxv_asin (__sv_f64_t, __sv_bool_t);
> __sv_f64_t _ZGVsMxv_atan (__sv_f64_t, __sv_bool_t);
> diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
> index 6954fe7435..0ac0240171 100644
> --- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
> +++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
> @@ -26,6 +26,7 @@
> VPCS_VECTOR_WRAPPER (acos_advsimd, _ZGVnN2v_acos)
> VPCS_VECTOR_WRAPPER (asin_advsimd, _ZGVnN2v_asin)
> VPCS_VECTOR_WRAPPER (atan_advsimd, _ZGVnN2v_atan)
> +VPCS_VECTOR_WRAPPER_ff (atan2_advsimd, _ZGVnN2vv_atan2)
> VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos)
> VPCS_VECTOR_WRAPPER (exp_advsimd, _ZGVnN2v_exp)
> VPCS_VECTOR_WRAPPER (exp10_advsimd, _ZGVnN2v_exp10)
> diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
> index 1173d8f9ae..5bbc4d58c1 100644
> --- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
> +++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
> @@ -32,9 +32,20 @@
> return svlastb_f64 (svptrue_b64 (), mr); \
> }
>
> +#define SVE_VECTOR_WRAPPER_ff(scalar_func, vector_func) \
> + extern VEC_TYPE vector_func (VEC_TYPE, VEC_TYPE, svbool_t); \
> + FLOAT scalar_func (FLOAT x, FLOAT y) \
> + { \
> + VEC_TYPE mx = svdup_n_f64 (x); \
> + VEC_TYPE my = svdup_n_f64 (y); \
> + VEC_TYPE mr = vector_func (mx, my, svptrue_b64 ()); \
> + return svlastb_f64 (svptrue_b64 (), mr); \
> + }
> +
> SVE_VECTOR_WRAPPER (acos_sve, _ZGVsMxv_acos)
> SVE_VECTOR_WRAPPER (asin_sve, _ZGVsMxv_asin)
> SVE_VECTOR_WRAPPER (atan_sve, _ZGVsMxv_atan)
> +SVE_VECTOR_WRAPPER_ff (atan2_sve, _ZGVsMxvv_atan2)
> SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos)
> SVE_VECTOR_WRAPPER (exp_sve, _ZGVsMxv_exp)
> SVE_VECTOR_WRAPPER (exp10_sve, _ZGVsMxv_exp10)
> diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
> index 387efc30f8..a557bfc3a6 100644
> --- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
> +++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
> @@ -26,6 +26,7 @@
> VPCS_VECTOR_WRAPPER (acosf_advsimd, _ZGVnN4v_acosf)
> VPCS_VECTOR_WRAPPER (asinf_advsimd, _ZGVnN4v_asinf)
> VPCS_VECTOR_WRAPPER (atanf_advsimd, _ZGVnN4v_atanf)
> +VPCS_VECTOR_WRAPPER_ff (atan2f_advsimd, _ZGVnN4vv_atan2f)
> VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf)
> VPCS_VECTOR_WRAPPER (expf_advsimd, _ZGVnN4v_expf)
> VPCS_VECTOR_WRAPPER (exp10f_advsimd, _ZGVnN4v_exp10f)
> diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
> index dddd4cb213..f36939e2c4 100644
> --- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
> +++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
> @@ -32,9 +32,20 @@
> return svlastb_f32 (svptrue_b32 (), mr); \
> }
>
> +#define SVE_VECTOR_WRAPPER_ff(scalar_func, vector_func) \
> + extern VEC_TYPE vector_func (VEC_TYPE, VEC_TYPE, svbool_t); \
> + FLOAT scalar_func (FLOAT x, FLOAT y) \
> + { \
> + VEC_TYPE mx = svdup_n_f32 (x); \
> + VEC_TYPE my = svdup_n_f32 (y); \
> + VEC_TYPE mr = vector_func (mx, my, svptrue_b32 ()); \
> + return svlastb_f32 (svptrue_b32 (), mr); \
> + }
> +
> SVE_VECTOR_WRAPPER (acosf_sve, _ZGVsMxv_acosf)
> SVE_VECTOR_WRAPPER (asinf_sve, _ZGVsMxv_asinf)
> SVE_VECTOR_WRAPPER (atanf_sve, _ZGVsMxv_atanf)
> +SVE_VECTOR_WRAPPER_ff (atan2f_sve, _ZGVsMxvv_atan2f)
> SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf)
> SVE_VECTOR_WRAPPER (expf_sve, _ZGVsMxv_expf)
> SVE_VECTOR_WRAPPER (exp10f_sve, _ZGVsMxv_exp10f)
> diff --git a/sysdeps/aarch64/fpu/test-vpcs-vector-wrapper.h b/sysdeps/aarch64/fpu/test-vpcs-vector-wrapper.h
> index f8e6a3fb9d..9551a9ea6f 100644
> --- a/sysdeps/aarch64/fpu/test-vpcs-vector-wrapper.h
> +++ b/sysdeps/aarch64/fpu/test-vpcs-vector-wrapper.h
> @@ -29,3 +29,17 @@
> TEST_VEC_LOOP (mr, VEC_LEN); \
> return ((FLOAT) mr[0]); \
> }
> +
> +#define VPCS_VECTOR_WRAPPER_ff(scalar_func, vector_func) \
> + extern __attribute__ ((aarch64_vector_pcs)) \
> + VEC_TYPE vector_func (VEC_TYPE, VEC_TYPE); \
> + FLOAT scalar_func (FLOAT x, FLOAT y) \
> + { \
> + int i; \
> + VEC_TYPE mx, my; \
> + INIT_VEC_LOOP (mx, x, VEC_LEN); \
> + INIT_VEC_LOOP (my, y, VEC_LEN); \
> + VEC_TYPE mr = vector_func (mx, my); \
> + TEST_VEC_LOOP (mr, VEC_LEN); \
> + return ((FLOAT) mr[0]); \
> + }
> diff --git a/sysdeps/aarch64/fpu/vecmath_config.h b/sysdeps/aarch64/fpu/vecmath_config.h
> index 2c8e243236..0e631fbdd5 100644
> --- a/sysdeps/aarch64/fpu/vecmath_config.h
> +++ b/sysdeps/aarch64/fpu/vecmath_config.h
> @@ -35,6 +35,17 @@
> __ptr; \
> })
>
> +static inline uint64_t
> +asuint64 (double f)
> +{
> + union
> + {
> + double f;
> + uint64_t i;
> + } u = { f };
> + return u.i;
> +}
> +
> #define V_LOG_POLY_ORDER 6
> #define V_LOG_TABLE_BITS 7
> extern const struct v_log_data
> diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
> index 24a99e10da..e0699c44d8 100644
> --- a/sysdeps/aarch64/libm-test-ulps
> +++ b/sysdeps/aarch64/libm-test-ulps
> @@ -106,11 +106,19 @@ Function: "atan2":
> float: 1
> ldouble: 2
>
> +Function: "atan2_advsimd":
> +double: 1
> +float: 2
> +
> Function: "atan2_downward":
> double: 1
> float: 2
> ldouble: 2
>
> +Function: "atan2_sve":
> +double: 1
> +float: 2
> +
> Function: "atan2_towardzero":
> double: 1
> float: 2
> diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
> index a2d1b8fb6d..7961a2f374 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
> +++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
> @@ -22,6 +22,7 @@ GLIBC_2.39 _ZGVnN2v_exp2 F
> GLIBC_2.39 _ZGVnN2v_log10 F
> GLIBC_2.39 _ZGVnN2v_log2 F
> GLIBC_2.39 _ZGVnN2v_tan F
> +GLIBC_2.39 _ZGVnN2vv_atan2 F
> GLIBC_2.39 _ZGVnN4v_acosf F
> GLIBC_2.39 _ZGVnN4v_asinf F
> GLIBC_2.39 _ZGVnN4v_atanf F
> @@ -30,6 +31,7 @@ GLIBC_2.39 _ZGVnN4v_exp2f F
> GLIBC_2.39 _ZGVnN4v_log10f F
> GLIBC_2.39 _ZGVnN4v_log2f F
> GLIBC_2.39 _ZGVnN4v_tanf F
> +GLIBC_2.39 _ZGVnN4vv_atan2f F
> GLIBC_2.39 _ZGVsMxv_acos F
> GLIBC_2.39 _ZGVsMxv_acosf F
> GLIBC_2.39 _ZGVsMxv_asin F
> @@ -46,3 +48,5 @@ GLIBC_2.39 _ZGVsMxv_log2 F
> GLIBC_2.39 _ZGVsMxv_log2f F
> GLIBC_2.39 _ZGVsMxv_tan F
> GLIBC_2.39 _ZGVsMxv_tanf F
> +GLIBC_2.39 _ZGVsMxvv_atan2 F
> +GLIBC_2.39 _ZGVsMxvv_atan2f F
> --
> 2.27.0
>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 5/6] aarch64: Add vector implementations of log1p routines
2023-11-03 12:12 ` [PATCH 5/6] aarch64: Add vector implementations of log1p routines Joe Ramsay
@ 2023-11-10 18:06 ` Szabolcs Nagy
0 siblings, 0 replies; 14+ messages in thread
From: Szabolcs Nagy @ 2023-11-10 18:06 UTC (permalink / raw)
To: Joe Ramsay, libc-alpha
The 11/03/2023 12:12, Joe Ramsay wrote:
> May discard sign of zero.
> ---
I reviewed the generic changes: ignoring the sign of zero for
log1p mathvec tests is OK, so this can be committed.
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
> Thanks,
> Joe
> math/auto-libm-test-in | 2 +-
> math/auto-libm-test-out-log1p | 50 +++----
> sysdeps/aarch64/fpu/Makefile | 1 +
> sysdeps/aarch64/fpu/Versions | 4 +
> sysdeps/aarch64/fpu/bits/math-vector.h | 4 +
> sysdeps/aarch64/fpu/log1p_advsimd.c | 129 ++++++++++++++++++
> sysdeps/aarch64/fpu/log1p_sve.c | 118 ++++++++++++++++
> sysdeps/aarch64/fpu/log1pf_advsimd.c | 128 +++++++++++++++++
> sysdeps/aarch64/fpu/log1pf_sve.c | 100 ++++++++++++++
> .../fpu/test-double-advsimd-wrappers.c | 1 +
> .../aarch64/fpu/test-double-sve-wrappers.c | 1 +
> .../aarch64/fpu/test-float-advsimd-wrappers.c | 1 +
> sysdeps/aarch64/fpu/test-float-sve-wrappers.c | 1 +
> sysdeps/aarch64/libm-test-ulps | 8 ++
> .../unix/sysv/linux/aarch64/libmvec.abilist | 4 +
> 15 files changed, 526 insertions(+), 26 deletions(-)
> create mode 100644 sysdeps/aarch64/fpu/log1p_advsimd.c
> create mode 100644 sysdeps/aarch64/fpu/log1p_sve.c
> create mode 100644 sysdeps/aarch64/fpu/log1pf_advsimd.c
> create mode 100644 sysdeps/aarch64/fpu/log1pf_sve.c
>
> diff --git a/math/auto-libm-test-in b/math/auto-libm-test-in
> index 70892503d6..a8d6674c98 100644
> --- a/math/auto-libm-test-in
> +++ b/math/auto-libm-test-in
> @@ -6577,7 +6577,7 @@ log10 0xf.bf1b2p-4
> log10 0x1.6b5f7ap+96
>
> log1p 0
> -log1p -0
> +log1p -0 no-mathvec
> log1p e-1
> log1p -0.25
> log1p -0.875
> diff --git a/math/auto-libm-test-out-log1p b/math/auto-libm-test-out-log1p
> index f7d3b35e6d..f83241f51a 100644
> --- a/math/auto-libm-test-out-log1p
> +++ b/math/auto-libm-test-out-log1p
> @@ -23,31 +23,31 @@ log1p 0
> = log1p tonearest ibm128 0x0p+0 : 0x0p+0 : inexact-ok
> = log1p towardzero ibm128 0x0p+0 : 0x0p+0 : inexact-ok
> = log1p upward ibm128 0x0p+0 : 0x0p+0 : inexact-ok
> -log1p -0
> -= log1p downward binary32 -0x0p+0 : -0x0p+0 : inexact-ok
> -= log1p tonearest binary32 -0x0p+0 : -0x0p+0 : inexact-ok
> -= log1p towardzero binary32 -0x0p+0 : -0x0p+0 : inexact-ok
> -= log1p upward binary32 -0x0p+0 : -0x0p+0 : inexact-ok
> -= log1p downward binary64 -0x0p+0 : -0x0p+0 : inexact-ok
> -= log1p tonearest binary64 -0x0p+0 : -0x0p+0 : inexact-ok
> -= log1p towardzero binary64 -0x0p+0 : -0x0p+0 : inexact-ok
> -= log1p upward binary64 -0x0p+0 : -0x0p+0 : inexact-ok
> -= log1p downward intel96 -0x0p+0 : -0x0p+0 : inexact-ok
> -= log1p tonearest intel96 -0x0p+0 : -0x0p+0 : inexact-ok
> -= log1p towardzero intel96 -0x0p+0 : -0x0p+0 : inexact-ok
> -= log1p upward intel96 -0x0p+0 : -0x0p+0 : inexact-ok
> -= log1p downward m68k96 -0x0p+0 : -0x0p+0 : inexact-ok
> -= log1p tonearest m68k96 -0x0p+0 : -0x0p+0 : inexact-ok
> -= log1p towardzero m68k96 -0x0p+0 : -0x0p+0 : inexact-ok
> -= log1p upward m68k96 -0x0p+0 : -0x0p+0 : inexact-ok
> -= log1p downward binary128 -0x0p+0 : -0x0p+0 : inexact-ok
> -= log1p tonearest binary128 -0x0p+0 : -0x0p+0 : inexact-ok
> -= log1p towardzero binary128 -0x0p+0 : -0x0p+0 : inexact-ok
> -= log1p upward binary128 -0x0p+0 : -0x0p+0 : inexact-ok
> -= log1p downward ibm128 -0x0p+0 : -0x0p+0 : inexact-ok
> -= log1p tonearest ibm128 -0x0p+0 : -0x0p+0 : inexact-ok
> -= log1p towardzero ibm128 -0x0p+0 : -0x0p+0 : inexact-ok
> -= log1p upward ibm128 -0x0p+0 : -0x0p+0 : inexact-ok
> +log1p -0 no-mathvec
> += log1p downward binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
> += log1p tonearest binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
> += log1p towardzero binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
> += log1p upward binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
> += log1p downward binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
> += log1p tonearest binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
> += log1p towardzero binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
> += log1p upward binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
> += log1p downward intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
> += log1p tonearest intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
> += log1p towardzero intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
> += log1p upward intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
> += log1p downward m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
> += log1p tonearest m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
> += log1p towardzero m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
> += log1p upward m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
> += log1p downward binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
> += log1p tonearest binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
> += log1p towardzero binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
> += log1p upward binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
> += log1p downward ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
> += log1p tonearest ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
> += log1p towardzero ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
> += log1p upward ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
> log1p e-1
> = log1p downward binary32 0x1.b7e152p+0 : 0x1p+0 : inexact-ok
> = log1p tonearest binary32 0x1.b7e152p+0 : 0x1p+0 : inexact-ok
> diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
> index 364efbeac1..c77c709edd 100644
> --- a/sysdeps/aarch64/fpu/Makefile
> +++ b/sysdeps/aarch64/fpu/Makefile
> @@ -8,6 +8,7 @@ libmvec-supported-funcs = acos \
> exp2 \
> log \
> log10 \
> + log1p \
> log2 \
> sin \
> tan
> diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
> index 99492b3d33..2543649fbe 100644
> --- a/sysdeps/aarch64/fpu/Versions
> +++ b/sysdeps/aarch64/fpu/Versions
> @@ -46,6 +46,10 @@ libmvec {
> _ZGVnN2v_log10;
> _ZGVsMxv_log10f;
> _ZGVsMxv_log10;
> + _ZGVnN4v_log1pf;
> + _ZGVnN2v_log1p;
> + _ZGVsMxv_log1pf;
> + _ZGVsMxv_log1p;
> _ZGVnN4v_log2f;
> _ZGVnN2v_log2;
> _ZGVsMxv_log2f;
> diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
> index 7666c09083..51915cef22 100644
> --- a/sysdeps/aarch64/fpu/bits/math-vector.h
> +++ b/sysdeps/aarch64/fpu/bits/math-vector.h
> @@ -59,6 +59,7 @@ __vpcs __f32x4_t _ZGVnN4v_exp10f (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_exp2f (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t);
> +__vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t);
> @@ -73,6 +74,7 @@ __vpcs __f64x2_t _ZGVnN2v_exp10 (__f64x2_t);
> __vpcs __f64x2_t _ZGVnN2v_exp2 (__f64x2_t);
> __vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t);
> __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t);
> +__vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t);
> __vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t);
> __vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t);
> __vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t);
> @@ -92,6 +94,7 @@ __sv_f32_t _ZGVsMxv_exp10f (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_exp2f (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_logf (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_log10f (__sv_f32_t, __sv_bool_t);
> +__sv_f32_t _ZGVsMxv_log1pf (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_log2f (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_tanf (__sv_f32_t, __sv_bool_t);
> @@ -106,6 +109,7 @@ __sv_f64_t _ZGVsMxv_exp10 (__sv_f64_t, __sv_bool_t);
> __sv_f64_t _ZGVsMxv_exp2 (__sv_f64_t, __sv_bool_t);
> __sv_f64_t _ZGVsMxv_log (__sv_f64_t, __sv_bool_t);
> __sv_f64_t _ZGVsMxv_log10 (__sv_f64_t, __sv_bool_t);
> +__sv_f64_t _ZGVsMxv_log1p (__sv_f64_t, __sv_bool_t);
> __sv_f64_t _ZGVsMxv_log2 (__sv_f64_t, __sv_bool_t);
> __sv_f64_t _ZGVsMxv_sin (__sv_f64_t, __sv_bool_t);
> __sv_f64_t _ZGVsMxv_tan (__sv_f64_t, __sv_bool_t);
> diff --git a/sysdeps/aarch64/fpu/log1p_advsimd.c b/sysdeps/aarch64/fpu/log1p_advsimd.c
> new file mode 100644
> index 0000000000..a117e1b6dc
> --- /dev/null
> +++ b/sysdeps/aarch64/fpu/log1p_advsimd.c
> @@ -0,0 +1,129 @@
> +/* Double-precision AdvSIMD log1p
> +
> + Copyright (C) 2023 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "v_math.h"
> +#include "poly_advsimd_f64.h"
> +
> +const static struct data
> +{
> + float64x2_t poly[19], ln2[2];
> + uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask, inf, minus_one;
> + int64x2_t one_top;
> +} data = {
> + /* Generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. */
> + .poly = { V2 (-0x1.ffffffffffffbp-2), V2 (0x1.55555555551a9p-2),
> + V2 (-0x1.00000000008e3p-2), V2 (0x1.9999999a32797p-3),
> + V2 (-0x1.555555552fecfp-3), V2 (0x1.249248e071e5ap-3),
> + V2 (-0x1.ffffff8bf8482p-4), V2 (0x1.c71c8f07da57ap-4),
> + V2 (-0x1.9999ca4ccb617p-4), V2 (0x1.7459ad2e1dfa3p-4),
> + V2 (-0x1.554d2680a3ff2p-4), V2 (0x1.3b4c54d487455p-4),
> + V2 (-0x1.2548a9ffe80e6p-4), V2 (0x1.0f389a24b2e07p-4),
> + V2 (-0x1.eee4db15db335p-5), V2 (0x1.e95b494d4a5ddp-5),
> + V2 (-0x1.15fdf07cb7c73p-4), V2 (0x1.0310b70800fcfp-4),
> + V2 (-0x1.cfa7385bdb37ep-6) },
> + .ln2 = { V2 (0x1.62e42fefa3800p-1), V2 (0x1.ef35793c76730p-45) },
> + /* top32(asuint64(sqrt(2)/2)) << 32. */
> + .hf_rt2_top = V2 (0x3fe6a09e00000000),
> + /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) << 32. */
> + .one_m_hf_rt2_top = V2 (0x00095f6200000000),
> + .umask = V2 (0x000fffff00000000),
> + .one_top = V2 (0x3ff),
> + .inf = V2 (0x7ff0000000000000),
> + .minus_one = V2 (0xbff0000000000000)
> +};
> +
> +#define BottomMask v_u64 (0xffffffff)
> +
> +static float64x2_t VPCS_ATTR NOINLINE
> +special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
> +{
> + return v_call_f64 (log1p, x, y, special);
> +}
> +
> +/* Vector log1p approximation using polynomial on reduced interval. Routine is
> + a modification of the algorithm used in scalar log1p, with no shortcut for
> + k=0 and no narrowing for f and k. Maximum observed error is 2.45 ULP:
> + _ZGVnN2v_log1p(0x1.658f7035c4014p+11) got 0x1.fd61d0727429dp+2
> + want 0x1.fd61d0727429fp+2 . */
> +VPCS_ATTR float64x2_t V_NAME_D1 (log1p) (float64x2_t x)
> +{
> + const struct data *d = ptr_barrier (&data);
> + uint64x2_t ix = vreinterpretq_u64_f64 (x);
> + uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
> + uint64x2_t special = vcgeq_u64 (ia, d->inf);
> +
> +#if WANT_SIMD_EXCEPT
> + special = vorrq_u64 (special,
> + vcgeq_u64 (ix, vreinterpretq_u64_f64 (v_f64 (-1))));
> + if (__glibc_unlikely (v_any_u64 (special)))
> + x = v_zerofy_f64 (x, special);
> +#else
> + special = vorrq_u64 (special, vcleq_f64 (x, v_f64 (-1)));
> +#endif
> +
> + /* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f
> + is in [sqrt(2)/2, sqrt(2)]):
> + log1p(x) = k*log(2) + log1p(f).
> +
> + f may not be representable exactly, so we need a correction term:
> + let m = round(1 + x), c = (1 + x) - m.
> + c << m: at very small x, log1p(x) ~ x, hence:
> + log(1+x) - log(m) ~ c/m.
> +
> + We therefore calculate log1p(x) by k*log2 + log1p(f) + c/m. */
> +
> + /* Obtain correctly scaled k by manipulation in the exponent.
> + The scalar algorithm casts down to 32-bit at this point to calculate k and
> + u_red. We stay in double-width to obtain f and k, using the same constants
> + as the scalar algorithm but shifted left by 32. */
> + float64x2_t m = vaddq_f64 (x, v_f64 (1));
> + uint64x2_t mi = vreinterpretq_u64_f64 (m);
> + uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top);
> +
> + int64x2_t ki
> + = vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top);
> + float64x2_t k = vcvtq_f64_s64 (ki);
> +
> + /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */
> + uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top);
> + uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask));
> + float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1));
> +
> + /* Correction term c/m. */
> + float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1))), m);
> +
> + /* Approximate log1p(x) on the reduced input using a polynomial. Because
> + log1p(0)=0 we choose an approximation of the form:
> + x + C0*x^2 + C1*x^3 + C2x^4 + ...
> + Hence approximation has the form f + f^2 * P(f)
> + where P(x) = C0 + C1*x + C2x^2 + ...
> + Assembling this all correctly is dealt with at the final step. */
> + float64x2_t f2 = vmulq_f64 (f, f);
> + float64x2_t p = v_pw_horner_18_f64 (f, f2, d->poly);
> +
> + float64x2_t ylo = vfmaq_f64 (cm, k, d->ln2[1]);
> + float64x2_t yhi = vfmaq_f64 (f, k, d->ln2[0]);
> + float64x2_t y = vaddq_f64 (ylo, yhi);
> +
> + if (__glibc_unlikely (v_any_u64 (special)))
> + return special_case (vreinterpretq_f64_u64 (ix), vfmaq_f64 (y, f2, p),
> + special);
> +
> + return vfmaq_f64 (y, f2, p);
> +}
> diff --git a/sysdeps/aarch64/fpu/log1p_sve.c b/sysdeps/aarch64/fpu/log1p_sve.c
> new file mode 100644
> index 0000000000..169156748d
> --- /dev/null
> +++ b/sysdeps/aarch64/fpu/log1p_sve.c
> @@ -0,0 +1,118 @@
> +/* Double-precision SVE log1p
> +
> + Copyright (C) 2023 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "sv_math.h"
> +#include "poly_sve_f64.h"
> +
> +static const struct data
> +{
> + double poly[19];
> + double ln2_hi, ln2_lo;
> + uint64_t hfrt2_top, onemhfrt2_top, inf, mone;
> +} data = {
> + /* Generated using Remez in [ sqrt(2)/2 - 1, sqrt(2) - 1]. Order 20
> + polynomial, however first 2 coefficients are 0 and 1 so are not stored. */
> + .poly = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2,
> + 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3,
> + -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4,
> + 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4,
> + -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5,
> + 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4,
> + -0x1.cfa7385bdb37ep-6, },
> + .ln2_hi = 0x1.62e42fefa3800p-1,
> + .ln2_lo = 0x1.ef35793c76730p-45,
> + /* top32(asuint64(sqrt(2)/2)) << 32. */
> + .hfrt2_top = 0x3fe6a09e00000000,
> + /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) << 32. */
> + .onemhfrt2_top = 0x00095f6200000000,
> + .inf = 0x7ff0000000000000,
> + .mone = 0xbff0000000000000,
> +};
> +
> +#define AbsMask 0x7fffffffffffffff
> +#define BottomMask 0xffffffff
> +
> +static svfloat64_t NOINLINE
> +special_case (svbool_t special, svfloat64_t x, svfloat64_t y)
> +{
> + return sv_call_f64 (log1p, x, y, special);
> +}
> +
> +/* Vector approximation for log1p using polynomial on reduced interval. Maximum
> + observed error is 2.46 ULP:
> + _ZGVsMxv_log1p(0x1.654a1307242a4p+11) got 0x1.fd5565fb590f4p+2
> + want 0x1.fd5565fb590f6p+2. */
> +svfloat64_t SV_NAME_D1 (log1p) (svfloat64_t x, svbool_t pg)
> +{
> + const struct data *d = ptr_barrier (&data);
> + svuint64_t ix = svreinterpret_u64 (x);
> + svuint64_t ax = svand_x (pg, ix, AbsMask);
> + svbool_t special
> + = svorr_z (pg, svcmpge (pg, ax, d->inf), svcmpge (pg, ix, d->mone));
> +
> + /* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f
> + is in [sqrt(2)/2, sqrt(2)]):
> + log1p(x) = k*log(2) + log1p(f).
> +
> + f may not be representable exactly, so we need a correction term:
> + let m = round(1 + x), c = (1 + x) - m.
> + c << m: at very small x, log1p(x) ~ x, hence:
> + log(1+x) - log(m) ~ c/m.
> +
> + We therefore calculate log1p(x) by k*log2 + log1p(f) + c/m. */
> +
> + /* Obtain correctly scaled k by manipulation in the exponent.
> + The scalar algorithm casts down to 32-bit at this point to calculate k and
> + u_red. We stay in double-width to obtain f and k, using the same constants
> + as the scalar algorithm but shifted left by 32. */
> + svfloat64_t m = svadd_x (pg, x, 1);
> + svuint64_t mi = svreinterpret_u64 (m);
> + svuint64_t u = svadd_x (pg, mi, d->onemhfrt2_top);
> +
> + svint64_t ki = svsub_x (pg, svreinterpret_s64 (svlsr_x (pg, u, 52)), 0x3ff);
> + svfloat64_t k = svcvt_f64_x (pg, ki);
> +
> + /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */
> + svuint64_t utop
> + = svadd_x (pg, svand_x (pg, u, 0x000fffff00000000), d->hfrt2_top);
> + svuint64_t u_red = svorr_x (pg, utop, svand_x (pg, mi, BottomMask));
> + svfloat64_t f = svsub_x (pg, svreinterpret_f64 (u_red), 1);
> +
> + /* Correction term c/m. */
> + svfloat64_t cm = svdiv_x (pg, svsub_x (pg, x, svsub_x (pg, m, 1)), m);
> +
> + /* Approximate log1p(x) on the reduced input using a polynomial. Because
> + log1p(0)=0 we choose an approximation of the form:
> + x + C0*x^2 + C1*x^3 + C2x^4 + ...
> + Hence approximation has the form f + f^2 * P(f)
> + where P(x) = C0 + C1*x + C2x^2 + ...
> + Assembling this all correctly is dealt with at the final step. */
> + svfloat64_t f2 = svmul_x (pg, f, f), f4 = svmul_x (pg, f2, f2),
> + f8 = svmul_x (pg, f4, f4), f16 = svmul_x (pg, f8, f8);
> + svfloat64_t p = sv_estrin_18_f64_x (pg, f, f2, f4, f8, f16, d->poly);
> +
> + svfloat64_t ylo = svmla_x (pg, cm, k, d->ln2_lo);
> + svfloat64_t yhi = svmla_x (pg, f, k, d->ln2_hi);
> + svfloat64_t y = svmla_x (pg, svadd_x (pg, ylo, yhi), f2, p);
> +
> + if (__glibc_unlikely (svptest_any (pg, special)))
> + return special_case (special, x, y);
> +
> + return y;
> +}
> diff --git a/sysdeps/aarch64/fpu/log1pf_advsimd.c b/sysdeps/aarch64/fpu/log1pf_advsimd.c
> new file mode 100644
> index 0000000000..3748830de8
> --- /dev/null
> +++ b/sysdeps/aarch64/fpu/log1pf_advsimd.c
> @@ -0,0 +1,128 @@
> +/* Single-precision AdvSIMD log1p
> +
> + Copyright (C) 2023 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "v_math.h"
> +#include "poly_advsimd_f32.h"
> +
> +const static struct data
> +{
> + float32x4_t poly[8], ln2;
> + uint32x4_t tiny_bound, minus_one, four, thresh;
> + int32x4_t three_quarters;
> +} data = {
> + .poly = { /* Generated using FPMinimax in [-0.25, 0.5]. First two coefficients
> + (1, -0.5) are not stored as they can be generated more
> + efficiently. */
> + V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f),
> + V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f),
> + V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) },
> + .ln2 = V4 (0x1.62e43p-1f),
> + .tiny_bound = V4 (0x34000000), /* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */
> + .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - tiny_bound. */
> + .minus_one = V4 (0xbf800000),
> + .four = V4 (0x40800000),
> + .three_quarters = V4 (0x3f400000)
> +};
> +
> +static inline float32x4_t
> +eval_poly (float32x4_t m, const float32x4_t *p)
> +{
> + /* Approximate log(1+m) on [-0.25, 0.5] using split Estrin scheme. */
> + float32x4_t p_12 = vfmaq_f32 (v_f32 (-0.5), m, p[0]);
> + float32x4_t p_34 = vfmaq_f32 (p[1], m, p[2]);
> + float32x4_t p_56 = vfmaq_f32 (p[3], m, p[4]);
> + float32x4_t p_78 = vfmaq_f32 (p[5], m, p[6]);
> +
> + float32x4_t m2 = vmulq_f32 (m, m);
> + float32x4_t p_02 = vfmaq_f32 (m, m2, p_12);
> + float32x4_t p_36 = vfmaq_f32 (p_34, m2, p_56);
> + float32x4_t p_79 = vfmaq_f32 (p_78, m2, p[7]);
> +
> + float32x4_t m4 = vmulq_f32 (m2, m2);
> + float32x4_t p_06 = vfmaq_f32 (p_02, m4, p_36);
> + return vfmaq_f32 (p_06, m4, vmulq_f32 (m4, p_79));
> +}
> +
> +static float32x4_t NOINLINE VPCS_ATTR
> +special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
> +{
> + return v_call_f32 (log1pf, x, y, special);
> +}
> +
> +/* Vector log1pf approximation using polynomial on reduced interval. Accuracy
> + is roughly 2.02 ULP:
> + log1pf(0x1.21e13ap-2) got 0x1.fe8028p-3 want 0x1.fe802cp-3. */
> +VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x)
> +{
> + const struct data *d = ptr_barrier (&data);
> +
> + uint32x4_t ix = vreinterpretq_u32_f32 (x);
> + uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
> + uint32x4_t special_cases
> + = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, d->tiny_bound), d->thresh),
> + vcgeq_u32 (ix, d->minus_one));
> + float32x4_t special_arg = x;
> +
> +#if WANT_SIMD_EXCEPT
> + if (__glibc_unlikely (v_any_u32 (special_cases)))
> + /* Side-step special lanes so fenv exceptions are not triggered
> + inadvertently. */
> + x = v_zerofy_f32 (x, special_cases);
> +#endif
> +
> + /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
> + is in [-0.25, 0.5]):
> + log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
> +
> + We approximate log1p(m) with a polynomial, then scale by
> + k*log(2). Instead of doing this directly, we use an intermediate
> + scale factor s = 4*k*log(2) to ensure the scale is representable
> + as a normalised fp32 number. */
> +
> + float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));
> +
> + /* Choose k to scale x to the range [-1/4, 1/2]. */
> + int32x4_t k
> + = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters),
> + v_s32 (0xff800000));
> + uint32x4_t ku = vreinterpretq_u32_s32 (k);
> +
> + /* Scale x by exponent manipulation. */
> + float32x4_t m_scale
> + = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
> +
> + /* Scale up to ensure that the scale factor is representable as normalised
> + fp32 number, and scale m down accordingly. */
> + float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku));
> + m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));
> +
> + /* Evaluate polynomial on the reduced interval. */
> + float32x4_t p = eval_poly (m_scale, d->poly);
> +
> + /* The scale factor to be applied back at the end - by multiplying float(k)
> + by 2^-23 we get the unbiased exponent of k. */
> + float32x4_t scale_back = vcvtq_f32_s32 (vshrq_n_s32 (k, 23));
> +
> + /* Apply the scaling back. */
> + float32x4_t y = vfmaq_f32 (p, scale_back, d->ln2);
> +
> + if (__glibc_unlikely (v_any_u32 (special_cases)))
> + return special_case (special_arg, y, special_cases);
> + return y;
> +}
> diff --git a/sysdeps/aarch64/fpu/log1pf_sve.c b/sysdeps/aarch64/fpu/log1pf_sve.c
> new file mode 100644
> index 0000000000..712f62b9ce
> --- /dev/null
> +++ b/sysdeps/aarch64/fpu/log1pf_sve.c
> @@ -0,0 +1,100 @@
> +/* Single-precision SVE log1p
> +
> + Copyright (C) 2023 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "sv_math.h"
> +#include "poly_sve_f32.h"
> +
> +static const struct data
> +{
> + float poly[8];
> + float ln2, exp_bias;
> + uint32_t four, three_quarters;
> +} data = {.poly = {/* Do not store first term of polynomial, which is -0.5, as
> + this can be fmov-ed directly instead of including it in
> + the main load-and-mla polynomial schedule. */
> + 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f,
> + -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f,
> + 0x1.abcb6p-4f, -0x1.6f0d5ep-5f},
> + .ln2 = 0x1.62e43p-1f,
> + .exp_bias = 0x1p-23f,
> + .four = 0x40800000,
> + .three_quarters = 0x3f400000};
> +
> +#define SignExponentMask 0xff800000
> +
> +static svfloat32_t NOINLINE
> +special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
> +{
> + return sv_call_f32 (log1pf, x, y, special);
> +}
> +
> +/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
> + error is 1.27 ULP very close to 0.5.
> + _ZGVsMxv_log1pf(0x1.fffffep-2) got 0x1.9f324p-2
> + want 0x1.9f323ep-2. */
> +svfloat32_t SV_NAME_F1 (log1p) (svfloat32_t x, svbool_t pg)
> +{
> + const struct data *d = ptr_barrier (&data);
> + /* x < -1, Inf/Nan. */
> + svbool_t special = svcmpeq (pg, svreinterpret_u32 (x), 0x7f800000);
> + special = svorn_z (pg, special, svcmpge (pg, x, -1));
> +
> + /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
> + is in [-0.25, 0.5]):
> + log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
> +
> + We approximate log1p(m) with a polynomial, then scale by
> + k*log(2). Instead of doing this directly, we use an intermediate
> + scale factor s = 4*k*log(2) to ensure the scale is representable
> + as a normalised fp32 number. */
> + svfloat32_t m = svadd_x (pg, x, 1);
> +
> + /* Choose k to scale x to the range [-1/4, 1/2]. */
> + svint32_t k
> + = svand_x (pg, svsub_x (pg, svreinterpret_s32 (m), d->three_quarters),
> + sv_s32 (SignExponentMask));
> +
> + /* Scale x by exponent manipulation. */
> + svfloat32_t m_scale = svreinterpret_f32 (
> + svsub_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (k)));
> +
> + /* Scale up to ensure that the scale factor is representable as normalised
> + fp32 number, and scale m down accordingly. */
> + svfloat32_t s = svreinterpret_f32 (svsubr_x (pg, k, d->four));
> + m_scale = svadd_x (pg, m_scale, svmla_x (pg, sv_f32 (-1), s, 0.25));
> +
> + /* Evaluate polynomial on reduced interval. */
> + svfloat32_t ms2 = svmul_x (pg, m_scale, m_scale),
> + ms4 = svmul_x (pg, ms2, ms2);
> + svfloat32_t p = sv_estrin_7_f32_x (pg, m_scale, ms2, ms4, d->poly);
> + p = svmad_x (pg, m_scale, p, -0.5);
> + p = svmla_x (pg, m_scale, m_scale, svmul_x (pg, m_scale, p));
> +
> + /* The scale factor to be applied back at the end - by multiplying float(k)
> + by 2^-23 we get the unbiased exponent of k. */
> + svfloat32_t scale_back = svmul_x (pg, svcvt_f32_x (pg, k), d->exp_bias);
> +
> + /* Apply the scaling back. */
> + svfloat32_t y = svmla_x (pg, p, scale_back, d->ln2);
> +
> + if (__glibc_unlikely (svptest_any (pg, special)))
> + return special_case (x, y, special);
> +
> + return y;
> +}
> diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
> index 0ac0240171..fc9e7aec47 100644
> --- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
> +++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
> @@ -33,6 +33,7 @@ VPCS_VECTOR_WRAPPER (exp10_advsimd, _ZGVnN2v_exp10)
> VPCS_VECTOR_WRAPPER (exp2_advsimd, _ZGVnN2v_exp2)
> VPCS_VECTOR_WRAPPER (log_advsimd, _ZGVnN2v_log)
> VPCS_VECTOR_WRAPPER (log10_advsimd, _ZGVnN2v_log10)
> +VPCS_VECTOR_WRAPPER (log1p_advsimd, _ZGVnN2v_log1p)
> VPCS_VECTOR_WRAPPER (log2_advsimd, _ZGVnN2v_log2)
> VPCS_VECTOR_WRAPPER (sin_advsimd, _ZGVnN2v_sin)
> VPCS_VECTOR_WRAPPER (tan_advsimd, _ZGVnN2v_tan)
> diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
> index 5bbc4d58c1..aea589d5fb 100644
> --- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
> +++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
> @@ -52,6 +52,7 @@ SVE_VECTOR_WRAPPER (exp10_sve, _ZGVsMxv_exp10)
> SVE_VECTOR_WRAPPER (exp2_sve, _ZGVsMxv_exp2)
> SVE_VECTOR_WRAPPER (log_sve, _ZGVsMxv_log)
> SVE_VECTOR_WRAPPER (log10_sve, _ZGVsMxv_log10)
> +SVE_VECTOR_WRAPPER (log1p_sve, _ZGVsMxv_log1p)
> SVE_VECTOR_WRAPPER (log2_sve, _ZGVsMxv_log2)
> SVE_VECTOR_WRAPPER (sin_sve, _ZGVsMxv_sin)
> SVE_VECTOR_WRAPPER (tan_sve, _ZGVsMxv_tan)
> diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
> index a557bfc3a6..446fd7f538 100644
> --- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
> +++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
> @@ -33,6 +33,7 @@ VPCS_VECTOR_WRAPPER (exp10f_advsimd, _ZGVnN4v_exp10f)
> VPCS_VECTOR_WRAPPER (exp2f_advsimd, _ZGVnN4v_exp2f)
> VPCS_VECTOR_WRAPPER (logf_advsimd, _ZGVnN4v_logf)
> VPCS_VECTOR_WRAPPER (log10f_advsimd, _ZGVnN4v_log10f)
> +VPCS_VECTOR_WRAPPER (log1pf_advsimd, _ZGVnN4v_log1pf)
> VPCS_VECTOR_WRAPPER (log2f_advsimd, _ZGVnN4v_log2f)
> VPCS_VECTOR_WRAPPER (sinf_advsimd, _ZGVnN4v_sinf)
> VPCS_VECTOR_WRAPPER (tanf_advsimd, _ZGVnN4v_tanf)
> diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
> index f36939e2c4..ac17f60856 100644
> --- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
> +++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
> @@ -52,6 +52,7 @@ SVE_VECTOR_WRAPPER (exp10f_sve, _ZGVsMxv_exp10f)
> SVE_VECTOR_WRAPPER (exp2f_sve, _ZGVsMxv_exp2f)
> SVE_VECTOR_WRAPPER (logf_sve, _ZGVsMxv_logf)
> SVE_VECTOR_WRAPPER (log10f_sve, _ZGVsMxv_log10f)
> +SVE_VECTOR_WRAPPER (log1pf_sve, _ZGVsMxv_log1pf)
> SVE_VECTOR_WRAPPER (log2f_sve, _ZGVsMxv_log2f)
> SVE_VECTOR_WRAPPER (sinf_sve, _ZGVsMxv_sinf)
> SVE_VECTOR_WRAPPER (tanf_sve, _ZGVsMxv_tanf)
> diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
> index e0699c44d8..a6b2f29a6f 100644
> --- a/sysdeps/aarch64/libm-test-ulps
> +++ b/sysdeps/aarch64/libm-test-ulps
> @@ -1248,11 +1248,19 @@ double: 1
> float: 1
> ldouble: 3
>
> +Function: "log1p_advsimd":
> +double: 1
> +float: 1
> +
> Function: "log1p_downward":
> double: 1
> float: 2
> ldouble: 3
>
> +Function: "log1p_sve":
> +double: 1
> +float: 1
> +
> Function: "log1p_towardzero":
> double: 2
> float: 2
> diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
> index 7961a2f374..0f20b5be29 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
> +++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
> @@ -20,6 +20,7 @@ GLIBC_2.39 _ZGVnN2v_atan F
> GLIBC_2.39 _ZGVnN2v_exp10 F
> GLIBC_2.39 _ZGVnN2v_exp2 F
> GLIBC_2.39 _ZGVnN2v_log10 F
> +GLIBC_2.39 _ZGVnN2v_log1p F
> GLIBC_2.39 _ZGVnN2v_log2 F
> GLIBC_2.39 _ZGVnN2v_tan F
> GLIBC_2.39 _ZGVnN2vv_atan2 F
> @@ -29,6 +30,7 @@ GLIBC_2.39 _ZGVnN4v_atanf F
> GLIBC_2.39 _ZGVnN4v_exp10f F
> GLIBC_2.39 _ZGVnN4v_exp2f F
> GLIBC_2.39 _ZGVnN4v_log10f F
> +GLIBC_2.39 _ZGVnN4v_log1pf F
> GLIBC_2.39 _ZGVnN4v_log2f F
> GLIBC_2.39 _ZGVnN4v_tanf F
> GLIBC_2.39 _ZGVnN4vv_atan2f F
> @@ -44,6 +46,8 @@ GLIBC_2.39 _ZGVsMxv_exp2 F
> GLIBC_2.39 _ZGVsMxv_exp2f F
> GLIBC_2.39 _ZGVsMxv_log10 F
> GLIBC_2.39 _ZGVsMxv_log10f F
> +GLIBC_2.39 _ZGVsMxv_log1p F
> +GLIBC_2.39 _ZGVsMxv_log1pf F
> GLIBC_2.39 _ZGVsMxv_log2 F
> GLIBC_2.39 _ZGVsMxv_log2f F
> GLIBC_2.39 _ZGVsMxv_tan F
> --
> 2.27.0
>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 6/6] aarch64: Add vector implementations of expm1 routines
2023-11-03 12:12 ` [PATCH 6/6] aarch64: Add vector implementations of expm1 routines Joe Ramsay
2023-11-06 13:31 ` Adhemerval Zanella Netto
@ 2023-11-10 18:11 ` Szabolcs Nagy
1 sibling, 0 replies; 14+ messages in thread
From: Szabolcs Nagy @ 2023-11-10 18:11 UTC (permalink / raw)
To: Joe Ramsay, libc-alpha
The 11/03/2023 12:12, Joe Ramsay wrote:
> ---
please add the sign of zero test exception for
expm1 mathvec test.
> Thanks,
> Joe
> sysdeps/aarch64/fpu/Makefile | 1 +
> sysdeps/aarch64/fpu/Versions | 4 +
> sysdeps/aarch64/fpu/bits/math-vector.h | 4 +
> sysdeps/aarch64/fpu/expm1_advsimd.c | 122 ++++++++++++++++++
> sysdeps/aarch64/fpu/expm1_sve.c | 99 ++++++++++++++
> sysdeps/aarch64/fpu/expm1f_advsimd.c | 117 +++++++++++++++++
> sysdeps/aarch64/fpu/expm1f_sve.c | 99 ++++++++++++++
> .../fpu/test-double-advsimd-wrappers.c | 1 +
> .../aarch64/fpu/test-double-sve-wrappers.c | 1 +
> .../aarch64/fpu/test-float-advsimd-wrappers.c | 1 +
> sysdeps/aarch64/fpu/test-float-sve-wrappers.c | 1 +
> sysdeps/aarch64/libm-test-ulps | 8 ++
> .../unix/sysv/linux/aarch64/libmvec.abilist | 4 +
> 13 files changed, 462 insertions(+)
> create mode 100644 sysdeps/aarch64/fpu/expm1_advsimd.c
> create mode 100644 sysdeps/aarch64/fpu/expm1_sve.c
> create mode 100644 sysdeps/aarch64/fpu/expm1f_advsimd.c
> create mode 100644 sysdeps/aarch64/fpu/expm1f_sve.c
>
> diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
> index c77c709edd..1fe4b52682 100644
> --- a/sysdeps/aarch64/fpu/Makefile
> +++ b/sysdeps/aarch64/fpu/Makefile
> @@ -6,6 +6,7 @@ libmvec-supported-funcs = acos \
> exp \
> exp10 \
> exp2 \
> + expm1 \
> log \
> log10 \
> log1p \
> diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
> index 2543649fbe..aaacacaebe 100644
> --- a/sysdeps/aarch64/fpu/Versions
> +++ b/sysdeps/aarch64/fpu/Versions
> @@ -42,6 +42,10 @@ libmvec {
> _ZGVnN2v_exp2;
> _ZGVsMxv_exp2f;
> _ZGVsMxv_exp2;
> + _ZGVnN4v_expm1f;
> + _ZGVnN2v_expm1;
> + _ZGVsMxv_expm1f;
> + _ZGVsMxv_expm1;
> _ZGVnN4v_log10f;
> _ZGVnN2v_log10;
> _ZGVsMxv_log10f;
> diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
> index 51915cef22..52aad95e3b 100644
> --- a/sysdeps/aarch64/fpu/bits/math-vector.h
> +++ b/sysdeps/aarch64/fpu/bits/math-vector.h
> @@ -57,6 +57,7 @@ __vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_exp10f (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_exp2f (__f32x4_t);
> +__vpcs __f32x4_t _ZGVnN4v_expm1f (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t);
> __vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t);
> @@ -72,6 +73,7 @@ __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
> __vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
> __vpcs __f64x2_t _ZGVnN2v_exp10 (__f64x2_t);
> __vpcs __f64x2_t _ZGVnN2v_exp2 (__f64x2_t);
> +__vpcs __f64x2_t _ZGVnN2v_expm1 (__f64x2_t);
> __vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t);
> __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t);
> __vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t);
> @@ -92,6 +94,7 @@ __sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_expf (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_exp10f (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_exp2f (__sv_f32_t, __sv_bool_t);
> +__sv_f32_t _ZGVsMxv_expm1f (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_logf (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_log10f (__sv_f32_t, __sv_bool_t);
> __sv_f32_t _ZGVsMxv_log1pf (__sv_f32_t, __sv_bool_t);
> @@ -107,6 +110,7 @@ __sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
> __sv_f64_t _ZGVsMxv_exp (__sv_f64_t, __sv_bool_t);
> __sv_f64_t _ZGVsMxv_exp10 (__sv_f64_t, __sv_bool_t);
> __sv_f64_t _ZGVsMxv_exp2 (__sv_f64_t, __sv_bool_t);
> +__sv_f64_t _ZGVsMxv_expm1 (__sv_f64_t, __sv_bool_t);
> __sv_f64_t _ZGVsMxv_log (__sv_f64_t, __sv_bool_t);
> __sv_f64_t _ZGVsMxv_log10 (__sv_f64_t, __sv_bool_t);
> __sv_f64_t _ZGVsMxv_log1p (__sv_f64_t, __sv_bool_t);
> diff --git a/sysdeps/aarch64/fpu/expm1_advsimd.c b/sysdeps/aarch64/fpu/expm1_advsimd.c
> new file mode 100644
> index 0000000000..a3aed8e35b
> --- /dev/null
> +++ b/sysdeps/aarch64/fpu/expm1_advsimd.c
> @@ -0,0 +1,122 @@
> +/* Double-precision AdvSIMD expm1
> +
> + Copyright (C) 2023 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "v_math.h"
> +#include "poly_advsimd_f64.h"
> +
> +static const struct data
> +{
> + float64x2_t poly[11];
> + float64x2_t invln2, ln2_lo, ln2_hi, shift;
> + int64x2_t exponent_bias;
> +#if WANT_SIMD_EXCEPT
> + uint64x2_t thresh, tiny_bound;
> +#else
> + float64x2_t oflow_bound;
> +#endif
> +} data = {
> + /* Generated using fpminimax, with degree=12 in [-log(2)/2, log(2)/2]. */
> + .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5),
> + V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10),
> + V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16),
> + V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22),
> + V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29) },
> + .invln2 = V2 (0x1.71547652b82fep0),
> + .ln2_hi = V2 (0x1.62e42fefa39efp-1),
> + .ln2_lo = V2 (0x1.abc9e3b39803fp-56),
> + .shift = V2 (0x1.8p52),
> + .exponent_bias = V2 (0x3ff0000000000000),
> +#if WANT_SIMD_EXCEPT
> + /* asuint64(oflow_bound) - asuint64(0x1p-51), shifted left by 1 for abs
> + compare. */
> + .thresh = V2 (0x78c56fa6d34b552),
> + /* asuint64(0x1p-51) << 1. */
> + .tiny_bound = V2 (0x3cc0000000000000 << 1),
> +#else
> + /* Value above which expm1(x) should overflow. Absolute value of the
> + underflow bound is greater than this, so it catches both cases - there is
> + a small window where fallbacks are triggered unnecessarily. */
> + .oflow_bound = V2 (0x1.62b7d369a5aa9p+9),
> +#endif
> +};
> +
> +static float64x2_t VPCS_ATTR NOINLINE
> +special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
> +{
> + return v_call_f64 (expm1, x, y, special);
> +}
> +
> +/* Double-precision vector exp(x) - 1 function.
> + The maximum observed error is 2.18 ULP:
> + _ZGVnN2v_expm1 (0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2
> + want 0x1.a8b9ea8d66e2p-2. */
> +float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
> +{
> + const struct data *d = ptr_barrier (&data);
> +
> + uint64x2_t ix = vreinterpretq_u64_f64 (x);
> +
> +#if WANT_SIMD_EXCEPT
> + /* If fp exceptions are to be triggered correctly, fall back to scalar for
> + |x| < 2^-51, |x| > oflow_bound, Inf & NaN. Add ix to itself for
> + shift-left by 1, and compare with thresh which was left-shifted offline -
> + this is effectively an absolute compare. */
> + uint64x2_t special
> + = vcgeq_u64 (vsubq_u64 (vaddq_u64 (ix, ix), d->tiny_bound), d->thresh);
> + if (__glibc_unlikely (v_any_u64 (special)))
> + x = v_zerofy_f64 (x, special);
> +#else
> + /* Large input, NaNs and Infs. */
> + uint64x2_t special = vceqzq_u64 (vcaltq_f64 (x, d->oflow_bound));
> +#endif
> +
> + /* Reduce argument to smaller range:
> + Let i = round(x / ln2)
> + and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
> + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
> + where 2^i is exact because i is an integer. */
> + float64x2_t n = vsubq_f64 (vfmaq_f64 (d->shift, d->invln2, x), d->shift);
> + int64x2_t i = vcvtq_s64_f64 (n);
> + float64x2_t f = vfmsq_f64 (x, n, d->ln2_hi);
> + f = vfmsq_f64 (f, n, d->ln2_lo);
> +
> + /* Approximate expm1(f) using polynomial.
> + Taylor expansion for expm1(x) has the form:
> + x + ax^2 + bx^3 + cx^4 ....
> + So we calculate the polynomial P(f) = a + bf + cf^2 + ...
> + and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
> + float64x2_t f2 = vmulq_f64 (f, f);
> + float64x2_t f4 = vmulq_f64 (f2, f2);
> + float64x2_t f8 = vmulq_f64 (f4, f4);
> + float64x2_t p = vfmaq_f64 (f, f2, v_estrin_10_f64 (f, f2, f4, f8, d->poly));
> +
> + /* Assemble the result.
> + expm1(x) ~= 2^i * (p + 1) - 1
> + Let t = 2^i. */
> + int64x2_t u = vaddq_s64 (vshlq_n_s64 (i, 52), d->exponent_bias);
> + float64x2_t t = vreinterpretq_f64_s64 (u);
> +
> + if (__glibc_unlikely (v_any_u64 (special)))
> + return special_case (vreinterpretq_f64_u64 (ix),
> + vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t),
> + special);
> +
> + /* expm1(x) ~= p * t + (t - 1). */
> + return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t);
> +}
> diff --git a/sysdeps/aarch64/fpu/expm1_sve.c b/sysdeps/aarch64/fpu/expm1_sve.c
> new file mode 100644
> index 0000000000..50646aff7c
> --- /dev/null
> +++ b/sysdeps/aarch64/fpu/expm1_sve.c
> @@ -0,0 +1,99 @@
> +/* Double-precision SVE expm1
> +
> + Copyright (C) 2023 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "sv_math.h"
> +#include "poly_sve_f64.h"
> +
> +#define SpecialBound 0x1.62b7d369a5aa9p+9
> +#define ExponentBias 0x3ff0000000000000
> +
> +static const struct data
> +{
> + double poly[11];
> + double shift, inv_ln2, special_bound;
> + /* To be loaded in one quad-word. */
> + double ln2_hi, ln2_lo;
> +} data = {
> + /* Generated using fpminimax. */
> + .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5,
> + 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10, 0x1.a01a01affa35dp-13,
> + 0x1.a01a018b4ecbbp-16, 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
> + 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, },
> +
> + .special_bound = SpecialBound,
> + .inv_ln2 = 0x1.71547652b82fep0,
> + .ln2_hi = 0x1.62e42fefa39efp-1,
> + .ln2_lo = 0x1.abc9e3b39803fp-56,
> + .shift = 0x1.8p52,
> +};
> +
> +static svfloat64_t NOINLINE
> +special_case (svfloat64_t x, svfloat64_t y, svbool_t pg)
> +{
> + return sv_call_f64 (expm1, x, y, pg);
> +}
> +
> +/* Double-precision vector exp(x) - 1 function.
> +   The maximum observed error is 2.18 ULP:
> + _ZGVsMxv_expm1(0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2
> + want 0x1.a8b9ea8d66e2p-2. */
> +svfloat64_t SV_NAME_D1 (expm1) (svfloat64_t x, svbool_t pg)
> +{
> + const struct data *d = ptr_barrier (&data);
> +
> + /* Large, Nan/Inf. */
> + svbool_t special = svnot_z (pg, svaclt (pg, x, d->special_bound));
> +
> + /* Reduce argument to smaller range:
> + Let i = round(x / ln2)
> + and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
> + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
> + where 2^i is exact because i is an integer. */
> + svfloat64_t shift = sv_f64 (d->shift);
> + svfloat64_t n = svsub_x (pg, svmla_x (pg, shift, x, d->inv_ln2), shift);
> + svint64_t i = svcvt_s64_x (pg, n);
> + svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
> + svfloat64_t f = svmls_lane (x, n, ln2, 0);
> + f = svmls_lane (f, n, ln2, 1);
> +
> + /* Approximate expm1(f) using polynomial.
> + Taylor expansion for expm1(x) has the form:
> + x + ax^2 + bx^3 + cx^4 ....
> + So we calculate the polynomial P(f) = a + bf + cf^2 + ...
> + and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
> + svfloat64_t f2 = svmul_x (pg, f, f);
> + svfloat64_t f4 = svmul_x (pg, f2, f2);
> + svfloat64_t f8 = svmul_x (pg, f4, f4);
> + svfloat64_t p
> + = svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly));
> +
> + /* Assemble the result.
> + expm1(x) ~= 2^i * (p + 1) - 1
> + Let t = 2^i. */
> + svint64_t u = svadd_x (pg, svlsl_x (pg, i, 52), ExponentBias);
> + svfloat64_t t = svreinterpret_f64 (u);
> +
> + /* expm1(x) ~= p * t + (t - 1). */
> + svfloat64_t y = svmla_x (pg, svsub_x (pg, t, 1), p, t);
> +
> + if (__glibc_unlikely (svptest_any (pg, special)))
> + return special_case (x, y, special);
> +
> + return y;
> +}
> diff --git a/sysdeps/aarch64/fpu/expm1f_advsimd.c b/sysdeps/aarch64/fpu/expm1f_advsimd.c
> new file mode 100644
> index 0000000000..b27b75068a
> --- /dev/null
> +++ b/sysdeps/aarch64/fpu/expm1f_advsimd.c
> @@ -0,0 +1,117 @@
> +/* Single-precision AdvSIMD expm1
> +
> + Copyright (C) 2023 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "v_math.h"
> +#include "poly_advsimd_f32.h"
> +
> +static const struct data
> +{
> + float32x4_t poly[5];
> + float32x4_t invln2, ln2_lo, ln2_hi, shift;
> + int32x4_t exponent_bias;
> +#if WANT_SIMD_EXCEPT
> + uint32x4_t thresh;
> +#else
> + float32x4_t oflow_bound;
> +#endif
> +} data = {
> + /* Generated using fpminimax with degree=5 in [-log(2)/2, log(2)/2]. */
> + .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5),
> + V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) },
> + .invln2 = V4 (0x1.715476p+0f),
> + .ln2_hi = V4 (0x1.62e4p-1f),
> + .ln2_lo = V4 (0x1.7f7d1cp-20f),
> + .shift = V4 (0x1.8p23f),
> + .exponent_bias = V4 (0x3f800000),
> +#if !WANT_SIMD_EXCEPT
> + /* Value above which expm1f(x) should overflow. Absolute value of the
> + underflow bound is greater than this, so it catches both cases - there is
> + a small window where fallbacks are triggered unnecessarily. */
> + .oflow_bound = V4 (0x1.5ebc4p+6),
> +#else
> + /* asuint(oflow_bound) - asuint(0x1p-23), shifted left by 1 for absolute
> + compare. */
> + .thresh = V4 (0x1d5ebc40),
> +#endif
> +};
> +
> +/* asuint(0x1p-23), shifted by 1 for abs compare. */
> +#define TinyBound v_u32 (0x34000000 << 1)
> +
> +static float32x4_t VPCS_ATTR NOINLINE
> +special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
> +{
> + return v_call_f32 (expm1f, x, y, special);
> +}
> +
> +/* Single-precision vector exp(x) - 1 function.
> + The maximum error is 1.51 ULP:
> + _ZGVnN4v_expm1f (0x1.8baa96p-2) got 0x1.e2fb9p-2
> + want 0x1.e2fb94p-2. */
> +float32x4_t VPCS_ATTR V_NAME_F1 (expm1) (float32x4_t x)
> +{
> + const struct data *d = ptr_barrier (&data);
> + uint32x4_t ix = vreinterpretq_u32_f32 (x);
> +
> +#if WANT_SIMD_EXCEPT
> + /* If fp exceptions are to be triggered correctly, fall back to scalar for
> + |x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for
> + shift-left by 1, and compare with thresh which was left-shifted offline -
> + this is effectively an absolute compare. */
> + uint32x4_t special
> + = vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh);
> + if (__glibc_unlikely (v_any_u32 (special)))
> + x = v_zerofy_f32 (x, special);
> +#else
> + /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */
> + uint32x4_t special = vceqzq_u32 (vcaltq_f32 (x, d->oflow_bound));
> +#endif
> +
> + /* Reduce argument to smaller range:
> + Let i = round(x / ln2)
> + and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
> + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
> + where 2^i is exact because i is an integer. */
> + float32x4_t j = vsubq_f32 (vfmaq_f32 (d->shift, d->invln2, x), d->shift);
> + int32x4_t i = vcvtq_s32_f32 (j);
> + float32x4_t f = vfmsq_f32 (x, j, d->ln2_hi);
> + f = vfmsq_f32 (f, j, d->ln2_lo);
> +
> + /* Approximate expm1(f) using polynomial.
> + Taylor expansion for expm1(x) has the form:
> + x + ax^2 + bx^3 + cx^4 ....
> + So we calculate the polynomial P(f) = a + bf + cf^2 + ...
> + and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
> + float32x4_t p = v_horner_4_f32 (f, d->poly);
> + p = vfmaq_f32 (f, vmulq_f32 (f, f), p);
> +
> + /* Assemble the result.
> + expm1(x) ~= 2^i * (p + 1) - 1
> + Let t = 2^i. */
> + int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias);
> + float32x4_t t = vreinterpretq_f32_s32 (u);
> +
> + if (__glibc_unlikely (v_any_u32 (special)))
> + return special_case (vreinterpretq_f32_u32 (ix),
> + vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t),
> + special);
> +
> + /* expm1(x) ~= p * t + (t - 1). */
> + return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t);
> +}
> diff --git a/sysdeps/aarch64/fpu/expm1f_sve.c b/sysdeps/aarch64/fpu/expm1f_sve.c
> new file mode 100644
> index 0000000000..96e579e5b7
> --- /dev/null
> +++ b/sysdeps/aarch64/fpu/expm1f_sve.c
> @@ -0,0 +1,99 @@
> +/* Single-precision SVE expm1
> +
> + Copyright (C) 2023 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "sv_math.h"
> +#include "poly_sve_f32.h"
> +
> +/* Largest value of x for which expm1(x) should round to -1. */
> +#define SpecialBound 0x1.5ebc4p+6f
> +
> +static const struct data
> +{
> + /* These 4 are grouped together so they can be loaded as one quadword, then
> + used with _lane forms of svmla/svmls. */
> + float c2, c4, ln2_hi, ln2_lo;
> + float c0, c1, c3, inv_ln2, special_bound, shift;
> +} data = {
> + /* Generated using fpminimax. */
> + .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3,
> + .c2 = 0x1.555736p-5, .c3 = 0x1.12287cp-7,
> + .c4 = 0x1.6b55a2p-10,
> +
> + .special_bound = SpecialBound, .shift = 0x1.8p23f,
> + .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f,
> + .ln2_lo = 0x1.7f7d1cp-20f,
> +};
> +
> +#define C(i) sv_f32 (d->c##i)
> +
> +static svfloat32_t NOINLINE
> +special_case (svfloat32_t x, svbool_t pg)
> +{
> + return sv_call_f32 (expm1f, x, x, pg);
> +}
> +
> +/* Single-precision SVE exp(x) - 1. Maximum error is 1.52 ULP:
> + _ZGVsMxv_expm1f(0x1.8f4ebcp-2) got 0x1.e859dp-2
> + want 0x1.e859d4p-2. */
> +svfloat32_t SV_NAME_F1 (expm1) (svfloat32_t x, svbool_t pg)
> +{
> + const struct data *d = ptr_barrier (&data);
> +
> + /* Large, NaN/Inf. */
> + svbool_t special = svnot_z (pg, svaclt (pg, x, d->special_bound));
> +
> + if (__glibc_unlikely (svptest_any (pg, special)))
> + return special_case (x, pg);
> +
> + /* This vector is reliant on layout of data - it contains constants
> + that can be used with _lane forms of svmla/svmls. Values are:
> + [ coeff_2, coeff_4, ln2_hi, ln2_lo ]. */
> + svfloat32_t lane_constants = svld1rq (svptrue_b32 (), &d->c2);
> +
> + /* Reduce argument to smaller range:
> + Let i = round(x / ln2)
> + and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
> + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
> + where 2^i is exact because i is an integer. */
> + svfloat32_t j = svmla_x (pg, sv_f32 (d->shift), x, d->inv_ln2);
> + j = svsub_x (pg, j, d->shift);
> + svint32_t i = svcvt_s32_x (pg, j);
> +
> + svfloat32_t f = svmls_lane (x, j, lane_constants, 2);
> + f = svmls_lane (f, j, lane_constants, 3);
> +
> + /* Approximate expm1(f) using polynomial.
> + Taylor expansion for expm1(x) has the form:
> + x + ax^2 + bx^3 + cx^4 ....
> + So we calculate the polynomial P(f) = a + bf + cf^2 + ...
> + and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
> + svfloat32_t p12 = svmla_lane (C (1), f, lane_constants, 0);
> + svfloat32_t p34 = svmla_lane (C (3), f, lane_constants, 1);
> + svfloat32_t f2 = svmul_x (pg, f, f);
> + svfloat32_t p = svmla_x (pg, p12, f2, p34);
> + p = svmla_x (pg, C (0), f, p);
> + p = svmla_x (pg, f, f2, p);
> +
> + /* Assemble the result.
> + expm1(x) ~= 2^i * (p + 1) - 1
> + Let t = 2^i. */
> + svfloat32_t t = svreinterpret_f32 (
> + svadd_x (pg, svreinterpret_u32 (svlsl_x (pg, i, 23)), 0x3f800000));
> + return svmla_x (pg, svsub_x (pg, t, 1), p, t);
> +}
> diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
> index fc9e7aec47..bf495450d7 100644
> --- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
> +++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
> @@ -31,6 +31,7 @@ VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos)
> VPCS_VECTOR_WRAPPER (exp_advsimd, _ZGVnN2v_exp)
> VPCS_VECTOR_WRAPPER (exp10_advsimd, _ZGVnN2v_exp10)
> VPCS_VECTOR_WRAPPER (exp2_advsimd, _ZGVnN2v_exp2)
> +VPCS_VECTOR_WRAPPER (expm1_advsimd, _ZGVnN2v_expm1)
> VPCS_VECTOR_WRAPPER (log_advsimd, _ZGVnN2v_log)
> VPCS_VECTOR_WRAPPER (log10_advsimd, _ZGVnN2v_log10)
> VPCS_VECTOR_WRAPPER (log1p_advsimd, _ZGVnN2v_log1p)
> diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
> index aea589d5fb..b5a627ad47 100644
> --- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
> +++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
> @@ -50,6 +50,7 @@ SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos)
> SVE_VECTOR_WRAPPER (exp_sve, _ZGVsMxv_exp)
> SVE_VECTOR_WRAPPER (exp10_sve, _ZGVsMxv_exp10)
> SVE_VECTOR_WRAPPER (exp2_sve, _ZGVsMxv_exp2)
> +SVE_VECTOR_WRAPPER (expm1_sve, _ZGVsMxv_expm1)
> SVE_VECTOR_WRAPPER (log_sve, _ZGVsMxv_log)
> SVE_VECTOR_WRAPPER (log10_sve, _ZGVsMxv_log10)
> SVE_VECTOR_WRAPPER (log1p_sve, _ZGVsMxv_log1p)
> diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
> index 446fd7f538..26d9e98739 100644
> --- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
> +++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
> @@ -31,6 +31,7 @@ VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf)
> VPCS_VECTOR_WRAPPER (expf_advsimd, _ZGVnN4v_expf)
> VPCS_VECTOR_WRAPPER (exp10f_advsimd, _ZGVnN4v_exp10f)
> VPCS_VECTOR_WRAPPER (exp2f_advsimd, _ZGVnN4v_exp2f)
> +VPCS_VECTOR_WRAPPER (expm1f_advsimd, _ZGVnN4v_expm1f)
> VPCS_VECTOR_WRAPPER (logf_advsimd, _ZGVnN4v_logf)
> VPCS_VECTOR_WRAPPER (log10f_advsimd, _ZGVnN4v_log10f)
> VPCS_VECTOR_WRAPPER (log1pf_advsimd, _ZGVnN4v_log1pf)
> diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
> index ac17f60856..f286ee64c9 100644
> --- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
> +++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
> @@ -50,6 +50,7 @@ SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf)
> SVE_VECTOR_WRAPPER (expf_sve, _ZGVsMxv_expf)
> SVE_VECTOR_WRAPPER (exp10f_sve, _ZGVsMxv_exp10f)
> SVE_VECTOR_WRAPPER (exp2f_sve, _ZGVsMxv_exp2f)
> +SVE_VECTOR_WRAPPER (expm1f_sve, _ZGVsMxv_expm1f)
> SVE_VECTOR_WRAPPER (logf_sve, _ZGVsMxv_logf)
> SVE_VECTOR_WRAPPER (log10f_sve, _ZGVsMxv_log10f)
> SVE_VECTOR_WRAPPER (log1pf_sve, _ZGVsMxv_log1pf)
> diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
> index a6b2f29a6f..1d52bf9ebf 100644
> --- a/sysdeps/aarch64/libm-test-ulps
> +++ b/sysdeps/aarch64/libm-test-ulps
> @@ -1078,11 +1078,19 @@ double: 1
> float: 1
> ldouble: 2
>
> +Function: "expm1_advsimd":
> +double: 2
> +float: 1
> +
> Function: "expm1_downward":
> double: 1
> float: 1
> ldouble: 2
>
> +Function: "expm1_sve":
> +double: 2
> +float: 1
> +
> Function: "expm1_towardzero":
> double: 1
> float: 2
> diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
> index 0f20b5be29..2bf4ea6332 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
> +++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
> @@ -19,6 +19,7 @@ GLIBC_2.39 _ZGVnN2v_asin F
> GLIBC_2.39 _ZGVnN2v_atan F
> GLIBC_2.39 _ZGVnN2v_exp10 F
> GLIBC_2.39 _ZGVnN2v_exp2 F
> +GLIBC_2.39 _ZGVnN2v_expm1 F
> GLIBC_2.39 _ZGVnN2v_log10 F
> GLIBC_2.39 _ZGVnN2v_log1p F
> GLIBC_2.39 _ZGVnN2v_log2 F
> @@ -29,6 +30,7 @@ GLIBC_2.39 _ZGVnN4v_asinf F
> GLIBC_2.39 _ZGVnN4v_atanf F
> GLIBC_2.39 _ZGVnN4v_exp10f F
> GLIBC_2.39 _ZGVnN4v_exp2f F
> +GLIBC_2.39 _ZGVnN4v_expm1f F
> GLIBC_2.39 _ZGVnN4v_log10f F
> GLIBC_2.39 _ZGVnN4v_log1pf F
> GLIBC_2.39 _ZGVnN4v_log2f F
> @@ -44,6 +46,8 @@ GLIBC_2.39 _ZGVsMxv_exp10 F
> GLIBC_2.39 _ZGVsMxv_exp10f F
> GLIBC_2.39 _ZGVsMxv_exp2 F
> GLIBC_2.39 _ZGVsMxv_exp2f F
> +GLIBC_2.39 _ZGVsMxv_expm1 F
> +GLIBC_2.39 _ZGVsMxv_expm1f F
> GLIBC_2.39 _ZGVsMxv_log10 F
> GLIBC_2.39 _ZGVsMxv_log10f F
> GLIBC_2.39 _ZGVsMxv_log1p F
> --
> 2.27.0
>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 4/6] aarch64: Add vector implementations of atan2 routines
2023-11-10 18:01 ` Szabolcs Nagy
@ 2023-11-14 8:54 ` Szabolcs Nagy
0 siblings, 0 replies; 14+ messages in thread
From: Szabolcs Nagy @ 2023-11-14 8:54 UTC (permalink / raw)
To: Joe Ramsay, libc-alpha
The 11/10/2023 18:01, Szabolcs Nagy wrote:
> this is OK to commit.
>
> Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
make bench fails now on aarch64, please update
sysdeps/aarch64/fpu/scripts/bench_libmvec_advsimd.py
sysdeps/aarch64/fpu/scripts/bench_libmvec_sve.py
scripts to handle two arg functions.
.../benchtests/bench-float-advsimd-atan2.c:25:95: error: excess elements in array initializer [-Werror]
25 | {{-0x1.11003707ad71cp+0, -0x1.1f03ac14205d9p+1, -0x1.88539918864c9p+2, -0x1.4101316c89f72p+1, -0x1.3b7fb80733407p+0, 0x1.029dee7ae42b8p+2, 0x1.77fde1eeb3fa9p+1, 0x1.16a53d1f1be7ep+3}, 0},
| ^
.../benchtests/bench-float-advsimd-atan2.c:25:95: note: (near initialization for ‘in0[0].arg0’)
^ permalink raw reply [flat|nested] 14+ messages in thread
end of thread, other threads:[~2023-11-14 8:55 UTC | newest]
Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-11-03 12:12 [PATCH 1/6] aarch64: Add vector implementations of asin routines Joe Ramsay
2023-11-03 12:12 ` [PATCH 2/6] aarch64: Add vector implementations of acos routines Joe Ramsay
2023-11-10 17:53 ` Szabolcs Nagy
2023-11-03 12:12 ` [PATCH 3/6] aarch64: Add vector implementations of atan routines Joe Ramsay
2023-11-10 17:54 ` Szabolcs Nagy
2023-11-03 12:12 ` [PATCH 4/6] aarch64: Add vector implementations of atan2 routines Joe Ramsay
2023-11-10 18:01 ` Szabolcs Nagy
2023-11-14 8:54 ` Szabolcs Nagy
2023-11-03 12:12 ` [PATCH 5/6] aarch64: Add vector implementations of log1p routines Joe Ramsay
2023-11-10 18:06 ` Szabolcs Nagy
2023-11-03 12:12 ` [PATCH 6/6] aarch64: Add vector implementations of expm1 routines Joe Ramsay
2023-11-06 13:31 ` Adhemerval Zanella Netto
2023-11-10 18:11 ` Szabolcs Nagy
2023-11-10 17:51 ` [PATCH 1/6] aarch64: Add vector implementations of asin routines Szabolcs Nagy
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).