[glibc] aarch64: Add vector implementations of log routines

public inbox for glibc-cvs@sourceware.org
help / color / mirror / Atom feed

* [glibc] aarch64: Add vector implementations of log routines
@ 2023-06-30  8:05 Szabolcs Nagy
  0 siblings, 0 replies; only message in thread
From: Szabolcs Nagy @ 2023-06-30  8:05 UTC (permalink / raw)
  To: glibc-cvs

https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=78c01a5cbeb6717ffa2d4d66bb90ac5c39bd81a9

commit 78c01a5cbeb6717ffa2d4d66bb90ac5c39bd81a9
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date:   Wed Jun 28 12:19:38 2023 +0100

    aarch64: Add vector implementations of log routines
    
    Optimised implementations for single and double precision, Advanced
    SIMD and SVE, copied from Arm Optimized Routines. Log lookup table
    added as HIDDEN symbol to allow it to be shared between AdvSIMD and
    SVE variants.
    
    As previously, data tables are used via a barrier to prevent
    overly aggressive constant inlining. Special-case handlers are
    marked NOINLINE to avoid incurring the penalty of switching call
    standards unnecessarily.
    
    Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>

Diff:
---
 sysdeps/aarch64/fpu/Makefile                       |   4 +-
 sysdeps/aarch64/fpu/Versions                       |   4 +
 sysdeps/aarch64/fpu/bits/math-vector.h             |   4 +
 sysdeps/aarch64/fpu/log_advsimd.c                  | 105 +++++++++++++
 sysdeps/aarch64/fpu/log_sve.c                      |  80 ++++++++++
 sysdeps/aarch64/fpu/logf_advsimd.c                 |  81 ++++++++++
 sysdeps/aarch64/fpu/logf_sve.c                     |  87 +++++++++++
 sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c |   1 +
 sysdeps/aarch64/fpu/test-double-sve-wrappers.c     |   1 +
 sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c  |   1 +
 sysdeps/aarch64/fpu/test-float-sve-wrappers.c      |   1 +
 sysdeps/aarch64/fpu/v_log_data.c                   | 173 +++++++++++++++++++++
 sysdeps/aarch64/fpu/vecmath_config.h               |  10 ++
 sysdeps/aarch64/libm-test-ulps                     |   8 +
 sysdeps/unix/sysv/linux/aarch64/libmvec.abilist    |   4 +
 15 files changed, 563 insertions(+), 1 deletion(-)

diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index 9ceea35148..cc90c4cb75 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -1,4 +1,5 @@
 libmvec-supported-funcs = cos \
+                          log \
                           sin
 
 float-advsimd-funcs = $(libmvec-supported-funcs)
@@ -10,7 +11,8 @@ ifeq ($(subdir),mathvec)
 libmvec-support = $(addsuffix f_advsimd,$(float-advsimd-funcs)) \
                   $(addsuffix _advsimd,$(double-advsimd-funcs)) \
                   $(addsuffix f_sve,$(float-sve-funcs)) \
-                  $(addsuffix _sve,$(double-sve-funcs))
+                  $(addsuffix _sve,$(double-sve-funcs)) \
+                  v_log_data
 endif
 
 sve-cflags = -march=armv8-a+sve
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index d26b3968a9..902446f40d 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -1,11 +1,15 @@
 libmvec {
   GLIBC_2.38 {
     _ZGVnN2v_cos;
+    _ZGVnN2v_log;
     _ZGVnN2v_sin;
     _ZGVnN4v_cosf;
+    _ZGVnN4v_logf;
     _ZGVnN4v_sinf;
     _ZGVsMxv_cos;
     _ZGVsMxv_cosf;
+    _ZGVsMxv_log;
+    _ZGVsMxv_logf;
     _ZGVsMxv_sin;
     _ZGVsMxv_sinf;
   }
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index ad9c9945e8..70c737338e 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -50,9 +50,11 @@ typedef __SVBool_t __sv_bool_t;
 #  define __vpcs __attribute__ ((__aarch64_vector_pcs__))
 
 __vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
 
 __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t);
 __vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t);
 
 #  undef __ADVSIMD_VEC_MATH_SUPPORTED
@@ -61,9 +63,11 @@ __vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t);
 #ifdef __SVE_VEC_MATH_SUPPORTED
 
 __sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_logf (__sv_f32_t, __sv_bool_t);
 __sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t);
 
 __sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_log (__sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxv_sin (__sv_f64_t, __sv_bool_t);
 
 #  undef __SVE_VEC_MATH_SUPPORTED
diff --git a/sysdeps/aarch64/fpu/log_advsimd.c b/sysdeps/aarch64/fpu/log_advsimd.c
new file mode 100644
index 0000000000..434737f2a9
--- /dev/null
+++ b/sysdeps/aarch64/fpu/log_advsimd.c
@@ -0,0 +1,105 @@
+/* Double-precision vector (Advanced SIMD) log function.
+
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "v_math.h"
+
+static const struct data
+{
+  float64x2_t poly[5];
+  float64x2_t ln2;
+  uint64x2_t min_norm, special_bound, sign_exp_mask;
+} data = {
+  /* Worst-case error: 1.17 + 0.5 ulp.
+     Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ].  */
+  .poly = { V2 (-0x1.ffffffffffff7p-2), V2 (0x1.55555555170d4p-2),
+	    V2 (-0x1.0000000399c27p-2), V2 (0x1.999b2e90e94cap-3),
+	    V2 (-0x1.554e550bd501ep-3) },
+  .ln2 = V2 (0x1.62e42fefa39efp-1),
+  .min_norm = V2 (0x0010000000000000),
+  .special_bound = V2 (0x7fe0000000000000), /* asuint64(inf) - min_norm.  */
+  .sign_exp_mask = V2 (0xfff0000000000000)
+};
+
+#define A(i) d->poly[i]
+#define N (1 << V_LOG_TABLE_BITS)
+#define IndexMask (N - 1)
+#define Off v_u64 (0x3fe6900900000000)
+
+struct entry
+{
+  float64x2_t invc;
+  float64x2_t logc;
+};
+
+static inline struct entry
+lookup (uint64x2_t i)
+{
+  /* Since N is a power of 2, n % N = n & (N - 1).  */
+  struct entry e;
+  e.invc[0] = __v_log_data.invc[i[0] & IndexMask];
+  e.logc[0] = __v_log_data.logc[i[0] & IndexMask];
+  e.invc[1] = __v_log_data.invc[i[1] & IndexMask];
+  e.logc[1] = __v_log_data.logc[i[1] & IndexMask];
+  return e;
+}
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
+{
+  return v_call_f64 (log, x, y, cmp);
+}
+
+float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+  float64x2_t z, r, r2, p, y, kd, hi;
+  uint64x2_t ix, iz, tmp, cmp;
+  int64x2_t k;
+  struct entry e;
+
+  ix = vreinterpretq_u64_f64 (x);
+  cmp = vcgeq_u64 (vsubq_u64 (ix, d->min_norm), d->special_bound);
+
+  /* x = 2^k z; where z is in range [Off,2*Off) and exact.
+     The range is split into N subintervals.
+     The ith subinterval contains z and c is near its center.  */
+  tmp = vsubq_u64 (ix, Off);
+  k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift.  */
+  iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask));
+  z = vreinterpretq_f64_u64 (iz);
+  e = lookup (vshrq_n_u64 (tmp, 52 - V_LOG_TABLE_BITS));
+
+  /* log(x) = log1p(z/c-1) + log(c) + k*Ln2.  */
+  r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+  kd = vcvtq_f64_s64 (k);
+
+  /* hi = r + log(c) + k*Ln2.  */
+  hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2);
+  /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi.  */
+  r2 = vmulq_f64 (r, r);
+  y = vfmaq_f64 (A (2), A (3), r);
+  p = vfmaq_f64 (A (0), A (1), r);
+  y = vfmaq_f64 (y, A (4), r2);
+  y = vfmaq_f64 (p, y, r2);
+  y = vfmaq_f64 (hi, y, r2);
+
+  if (__glibc_unlikely (v_any_u64 (cmp)))
+    return special_case (x, y, cmp);
+  return y;
+}
diff --git a/sysdeps/aarch64/fpu/log_sve.c b/sysdeps/aarch64/fpu/log_sve.c
new file mode 100644
index 0000000000..93c4f1c018
--- /dev/null
+++ b/sysdeps/aarch64/fpu/log_sve.c
@@ -0,0 +1,80 @@
+/* Double-precision vector (SVE) log function.
+
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "sv_math.h"
+
+#define P(i) sv_f64 (__v_log_data.poly[i])
+#define N (1 << V_LOG_TABLE_BITS)
+#define Off (0x3fe6900900000000)
+#define MaxTop (0x7ff)
+#define MinTop (0x001)
+#define ThreshTop (0x7fe) /* MaxTop - MinTop.  */
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp)
+{
+  return sv_call_f64 (log, x, y, cmp);
+}
+
+/* SVE port of AdvSIMD log algorithm.
+   Maximum measured error is 2.17 ulp:
+   SV_NAME_D1 (log)(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2
+					 want 0x1.ffffff1cca045p-2.  */
+svfloat64_t SV_NAME_D1 (log) (svfloat64_t x, const svbool_t pg)
+{
+  svuint64_t ix = svreinterpret_u64_f64 (x);
+  svuint64_t top = svlsr_n_u64_x (pg, ix, 52);
+  svbool_t cmp
+      = svcmpge_u64 (pg, svsub_n_u64_x (pg, top, MinTop), sv_u64 (ThreshTop));
+
+  /* x = 2^k z; where z is in range [Off,2*Off) and exact.
+     The range is split into N subintervals.
+     The ith subinterval contains z and c is near its center.  */
+  svuint64_t tmp = svsub_n_u64_x (pg, ix, Off);
+  /* Equivalent to (tmp >> (52 - V_LOG_TABLE_BITS)) % N, since N is a power
+     of 2.  */
+  svuint64_t i = svand_n_u64_x (
+      pg, svlsr_n_u64_x (pg, tmp, (52 - V_LOG_TABLE_BITS)), N - 1);
+  svint64_t k = svasr_n_s64_x (pg, svreinterpret_s64_u64 (tmp),
+			       52); /* Arithmetic shift.  */
+  svuint64_t iz
+      = svsub_u64_x (pg, ix, svand_n_u64_x (pg, tmp, 0xfffULL << 52));
+  svfloat64_t z = svreinterpret_f64_u64 (iz);
+  /* Lookup in 2 global lists (length N).  */
+  svfloat64_t invc = svld1_gather_u64index_f64 (pg, __v_log_data.invc, i);
+  svfloat64_t logc = svld1_gather_u64index_f64 (pg, __v_log_data.logc, i);
+
+  /* log(x) = log1p(z/c-1) + log(c) + k*Ln2.  */
+  svfloat64_t r = svmad_n_f64_x (pg, invc, z, -1);
+  svfloat64_t kd = svcvt_f64_s64_x (pg, k);
+  /* hi = r + log(c) + k*Ln2.  */
+  svfloat64_t hi
+      = svmla_n_f64_x (pg, svadd_f64_x (pg, logc, r), kd, __v_log_data.ln2);
+  /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi.  */
+  svfloat64_t r2 = svmul_f64_x (pg, r, r);
+  svfloat64_t y = svmla_f64_x (pg, P (2), r, P (3));
+  svfloat64_t p = svmla_f64_x (pg, P (0), r, P (1));
+  y = svmla_f64_x (pg, y, r2, P (4));
+  y = svmla_f64_x (pg, p, r2, y);
+  y = svmla_f64_x (pg, hi, r2, y);
+
+  if (__glibc_unlikely (svptest_any (pg, cmp)))
+    return special_case (x, y, cmp);
+  return y;
+}
diff --git a/sysdeps/aarch64/fpu/logf_advsimd.c b/sysdeps/aarch64/fpu/logf_advsimd.c
new file mode 100644
index 0000000000..375ad28b9f
--- /dev/null
+++ b/sysdeps/aarch64/fpu/logf_advsimd.c
@@ -0,0 +1,81 @@
+/* Single-precision vector (Advanced SIMD) log function.
+
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "v_math.h"
+
+static const struct data
+{
+  float32x4_t poly[7];
+  float32x4_t ln2, tiny_bound;
+  uint32x4_t min_norm, special_bound, off, mantissa_mask;
+} data = {
+  /* 3.34 ulp error.  */
+  .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f),
+	    V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f),
+	    V4 (-0x1.ffffc8p-2f) },
+  .ln2 = V4 (0x1.62e43p-1f),
+  .tiny_bound = V4 (0x1p-126),
+  .min_norm = V4 (0x00800000),
+  .special_bound = V4 (0x7f000000), /* asuint32(inf) - min_norm.  */
+  .off = V4 (0x3f2aaaab),	    /* 0.666667.  */
+  .mantissa_mask = V4 (0x007fffff)
+};
+
+#define P(i) d->poly[7 - i]
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
+{
+  /* Fall back to scalar code.  */
+  return v_call_f32 (logf, x, y, cmp);
+}
+
+float32x4_t VPCS_ATTR V_NAME_F1 (log) (float32x4_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+  float32x4_t n, p, q, r, r2, y;
+  uint32x4_t u, cmp;
+
+  u = vreinterpretq_u32_f32 (x);
+  cmp = vcgeq_u32 (vsubq_u32 (u, d->min_norm), d->special_bound);
+
+  /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3.  */
+  u = vsubq_u32 (u, d->off);
+  n = vcvtq_f32_s32 (
+      vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend.  */
+  u = vandq_u32 (u, d->mantissa_mask);
+  u = vaddq_u32 (u, d->off);
+  r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
+
+  /* y = log(1+r) + n*ln2.  */
+  r2 = vmulq_f32 (r, r);
+  /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))).  */
+  p = vfmaq_f32 (P (5), P (6), r);
+  q = vfmaq_f32 (P (3), P (4), r);
+  y = vfmaq_f32 (P (1), P (2), r);
+  p = vfmaq_f32 (p, P (7), r2);
+  q = vfmaq_f32 (q, p, r2);
+  y = vfmaq_f32 (y, q, r2);
+  p = vfmaq_f32 (r, d->ln2, n);
+  y = vfmaq_f32 (p, y, r2);
+
+  if (__glibc_unlikely (v_any_u32 (cmp)))
+    return special_case (x, y, cmp);
+  return y;
+}
diff --git a/sysdeps/aarch64/fpu/logf_sve.c b/sysdeps/aarch64/fpu/logf_sve.c
new file mode 100644
index 0000000000..46c6e7c461
--- /dev/null
+++ b/sysdeps/aarch64/fpu/logf_sve.c
@@ -0,0 +1,87 @@
+/* Single-precision vector (SVE) log function.
+
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "sv_math.h"
+
+static const struct data
+{
+  float poly_0135[4];
+  float poly_246[3];
+  float ln2;
+} data = {
+  .poly_0135 = {
+    /* Coefficients copied from the AdvSIMD routine in math/, then rearranged so
+       that coeffs 0, 1, 3 and 5 can be loaded as a single quad-word, hence used
+       with _lane variant of MLA intrinsic.  */
+    -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, 0x1.961348p-3f, 0x1.555d7cp-2f
+  },
+  .poly_246 = { -0x1.4f9934p-3f, -0x1.00187cp-2f, -0x1.ffffc8p-2f },
+  .ln2 = 0x1.62e43p-1f
+};
+
+#define Min (0x00800000)
+#define Max (0x7f800000)
+#define Thresh (0x7f000000) /* Max - Min.  */
+#define Mask (0x007fffff)
+#define Off (0x3f2aaaab) /* 0.666667.  */
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp)
+{
+  return sv_call_f32 (logf, x, y, cmp);
+}
+
+/* Optimised implementation of SVE logf, using the same algorithm and
+   polynomial as the AdvSIMD routine. Maximum error is 3.34 ULPs:
+   SV_NAME_F1 (log)(0x1.557298p+0) got 0x1.26edecp-2
+				  want 0x1.26ede6p-2.  */
+svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  svuint32_t u = svreinterpret_u32_f32 (x);
+  svbool_t cmp = svcmpge_n_u32 (pg, svsub_n_u32_x (pg, u, Min), Thresh);
+
+  /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3.  */
+  u = svsub_n_u32_x (pg, u, Off);
+  svfloat32_t n
+      = svcvt_f32_s32_x (pg, svasr_n_s32_x (pg, svreinterpret_s32_u32 (u),
+					    23)); /* Sign-extend.  */
+  u = svand_n_u32_x (pg, u, Mask);
+  u = svadd_n_u32_x (pg, u, Off);
+  svfloat32_t r = svsub_n_f32_x (pg, svreinterpret_f32_u32 (u), 1.0f);
+
+  /* y = log(1+r) + n*ln2.  */
+  svfloat32_t r2 = svmul_f32_x (pg, r, r);
+  /* n*ln2 + r + r2*(P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1 + r2*P0))).  */
+  svfloat32_t p_0135 = svld1rq_f32 (svptrue_b32 (), &d->poly_0135[0]);
+  svfloat32_t p = svmla_lane_f32 (sv_f32 (d->poly_246[0]), r, p_0135, 1);
+  svfloat32_t q = svmla_lane_f32 (sv_f32 (d->poly_246[1]), r, p_0135, 2);
+  svfloat32_t y = svmla_lane_f32 (sv_f32 (d->poly_246[2]), r, p_0135, 3);
+  p = svmla_lane_f32 (p, r2, p_0135, 0);
+
+  q = svmla_f32_x (pg, q, r2, p);
+  y = svmla_f32_x (pg, y, r2, q);
+  p = svmla_n_f32_x (pg, r, n, d->ln2);
+  y = svmla_f32_x (pg, p, r2, y);
+
+  if (__glibc_unlikely (svptest_any (pg, cmp)))
+    return special_case (x, y, cmp);
+  return y;
+}
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index 4af97a25a2..c5f6fcd7c4 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -24,4 +24,5 @@
 #define VEC_TYPE float64x2_t
 
 VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos)
+VPCS_VECTOR_WRAPPER (log_advsimd, _ZGVnN2v_log)
 VPCS_VECTOR_WRAPPER (sin_advsimd, _ZGVnN2v_sin)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index 64c790adc5..d5e2ec6dc5 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -33,4 +33,5 @@
   }
 
 SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos)
+SVE_VECTOR_WRAPPER (log_sve, _ZGVsMxv_log)
 SVE_VECTOR_WRAPPER (sin_sve, _ZGVsMxv_sin)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index 50e776b952..c240738837 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -24,4 +24,5 @@
 #define VEC_TYPE float32x4_t
 
 VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf)
+VPCS_VECTOR_WRAPPER (logf_advsimd, _ZGVnN4v_logf)
 VPCS_VECTOR_WRAPPER (sinf_advsimd, _ZGVnN4v_sinf)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index 7355032929..5a06b75857 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -33,4 +33,5 @@
   }
 
 SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf)
+SVE_VECTOR_WRAPPER (logf_sve, _ZGVsMxv_logf)
 SVE_VECTOR_WRAPPER (sinf_sve, _ZGVsMxv_sinf)
diff --git a/sysdeps/aarch64/fpu/v_log_data.c b/sysdeps/aarch64/fpu/v_log_data.c
new file mode 100644
index 0000000000..6fd6f43695
--- /dev/null
+++ b/sysdeps/aarch64/fpu/v_log_data.c
@@ -0,0 +1,173 @@
+/* Lookup table for double-precision log(x) vector function.
+
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "vecmath_config.h"
+
+const struct v_log_data __v_log_data = {
+  /* Worst-case error: 1.17 + 0.5 ulp.
+     Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ].  */
+  .poly = { -0x1.ffffffffffff7p-2, 0x1.55555555170d4p-2, -0x1.0000000399c27p-2,
+	    0x1.999b2e90e94cap-3, -0x1.554e550bd501ep-3 },
+  .ln2 = 0x1.62e42fefa39efp-1,
+  /* Algorithm:
+
+	x = 2^k z
+	log(x) = k ln2 + log(c) + poly(z/c - 1)
+
+     where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,
+     N=128) and log(c) and 1/c for the ith subinterval comes from two lookup
+     tables:
+
+	invc[i] = 1/c
+	logc[i] = (double)log(c)
+
+     where c is near the center of the subinterval and is chosen by trying
+     several floating point invc candidates around 1/center and selecting one
+     for which the error in (double)log(c) is minimized (< 0x1p-74), except the
+     subinterval that contains 1 and the previous one got tweaked to avoid
+     cancellation.  */
+  .invc = { 0x1.6a133d0dec120p+0, 0x1.6815f2f3e42edp+0,
+	    0x1.661e39be1ac9ep+0, 0x1.642bfa30ac371p+0,
+	    0x1.623f1d916f323p+0, 0x1.60578da220f65p+0,
+	    0x1.5e75349dea571p+0, 0x1.5c97fd387a75ap+0,
+	    0x1.5abfd2981f200p+0, 0x1.58eca051dc99cp+0,
+	    0x1.571e526d9df12p+0, 0x1.5554d555b3fcbp+0,
+	    0x1.539015e2a20cdp+0, 0x1.51d0014ee0164p+0,
+	    0x1.50148538cd9eep+0, 0x1.4e5d8f9f698a1p+0,
+	    0x1.4cab0edca66bep+0, 0x1.4afcf1a9db874p+0,
+	    0x1.495327136e16fp+0, 0x1.47ad9e84af28fp+0,
+	    0x1.460c47b39ae15p+0, 0x1.446f12b278001p+0,
+	    0x1.42d5efdd720ecp+0, 0x1.4140cfe001a0fp+0,
+	    0x1.3fafa3b421f69p+0, 0x1.3e225c9c8ece5p+0,
+	    0x1.3c98ec29a211ap+0, 0x1.3b13442a413fep+0,
+	    0x1.399156baa3c54p+0, 0x1.38131639b4cdbp+0,
+	    0x1.36987540fbf53p+0, 0x1.352166b648f61p+0,
+	    0x1.33adddb3eb575p+0, 0x1.323dcd99fc1d3p+0,
+	    0x1.30d129fefc7d2p+0, 0x1.2f67e6b72fe7dp+0,
+	    0x1.2e01f7cf8b187p+0, 0x1.2c9f518ddc86ep+0,
+	    0x1.2b3fe86e5f413p+0, 0x1.29e3b1211b25cp+0,
+	    0x1.288aa08b373cfp+0, 0x1.2734abcaa8467p+0,
+	    0x1.25e1c82459b81p+0, 0x1.2491eb1ad59c5p+0,
+	    0x1.23450a54048b5p+0, 0x1.21fb1bb09e578p+0,
+	    0x1.20b415346d8f7p+0, 0x1.1f6fed179a1acp+0,
+	    0x1.1e2e99b93c7b3p+0, 0x1.1cf011a7a882ap+0,
+	    0x1.1bb44b97dba5ap+0, 0x1.1a7b3e66cdd4fp+0,
+	    0x1.1944e11dc56cdp+0, 0x1.18112aebb1a6ep+0,
+	    0x1.16e013231b7e9p+0, 0x1.15b1913f156cfp+0,
+	    0x1.14859cdedde13p+0, 0x1.135c2dc68cfa4p+0,
+	    0x1.12353bdb01684p+0, 0x1.1110bf25b85b4p+0,
+	    0x1.0feeafd2f8577p+0, 0x1.0ecf062c51c3bp+0,
+	    0x1.0db1baa076c8bp+0, 0x1.0c96c5bb3048ep+0,
+	    0x1.0b7e20263e070p+0, 0x1.0a67c2acd0ce3p+0,
+	    0x1.0953a6391e982p+0, 0x1.0841c3caea380p+0,
+	    0x1.07321489b13eap+0, 0x1.062491aee9904p+0,
+	    0x1.05193497a7cc5p+0, 0x1.040ff6b5f5e9fp+0,
+	    0x1.0308d19aa6127p+0, 0x1.0203beedb0c67p+0,
+	    0x1.010037d38bcc2p+0, 1.0,
+	    0x1.fc06d493cca10p-1, 0x1.f81e6ac3b918fp-1,
+	    0x1.f44546ef18996p-1, 0x1.f07b10382c84bp-1,
+	    0x1.ecbf7070e59d4p-1, 0x1.e91213f715939p-1,
+	    0x1.e572a9a75f7b7p-1, 0x1.e1e0e2c530207p-1,
+	    0x1.de5c72d8a8be3p-1, 0x1.dae50fa5658ccp-1,
+	    0x1.d77a71145a2dap-1, 0x1.d41c51166623ep-1,
+	    0x1.d0ca6ba0bb29fp-1, 0x1.cd847e8e59681p-1,
+	    0x1.ca4a499693e00p-1, 0x1.c71b8e399e821p-1,
+	    0x1.c3f80faf19077p-1, 0x1.c0df92dc2b0ecp-1,
+	    0x1.bdd1de3cbb542p-1, 0x1.baceb9e1007a3p-1,
+	    0x1.b7d5ef543e55ep-1, 0x1.b4e749977d953p-1,
+	    0x1.b20295155478ep-1, 0x1.af279f8e82be2p-1,
+	    0x1.ac5638197fdf3p-1, 0x1.a98e2f102e087p-1,
+	    0x1.a6cf5606d05c1p-1, 0x1.a4197fc04d746p-1,
+	    0x1.a16c80293dc01p-1, 0x1.9ec82c4dc5bc9p-1,
+	    0x1.9c2c5a491f534p-1, 0x1.9998e1480b618p-1,
+	    0x1.970d9977c6c2dp-1, 0x1.948a5c023d212p-1,
+	    0x1.920f0303d6809p-1, 0x1.8f9b698a98b45p-1,
+	    0x1.8d2f6b81726f6p-1, 0x1.8acae5bb55badp-1,
+	    0x1.886db5d9275b8p-1, 0x1.8617ba567c13cp-1,
+	    0x1.83c8d27487800p-1, 0x1.8180de3c5dbe7p-1,
+	    0x1.7f3fbe71cdb71p-1, 0x1.7d055498071c1p-1,
+	    0x1.7ad182e54f65ap-1, 0x1.78a42c3c90125p-1,
+	    0x1.767d342f76944p-1, 0x1.745c7ef26b00ap-1,
+	    0x1.7241f15769d0fp-1, 0x1.702d70d396e41p-1,
+	    0x1.6e1ee3700cd11p-1, 0x1.6c162fc9cbe02p-1 },
+  .logc = { -0x1.62fe995eb963ap-2, -0x1.5d5a48dad6b67p-2,
+	    -0x1.57bde257d2769p-2, -0x1.52294fbf2af55p-2,
+	    -0x1.4c9c7b598aa38p-2, -0x1.47174fc5ff560p-2,
+	    -0x1.4199b7fa7b5cap-2, -0x1.3c239f48cfb99p-2,
+	    -0x1.36b4f154d2aebp-2, -0x1.314d9a0ff32fbp-2,
+	    -0x1.2bed85cca3cffp-2, -0x1.2694a11421af9p-2,
+	    -0x1.2142d8d014fb2p-2, -0x1.1bf81a2c77776p-2,
+	    -0x1.16b452a39c6a4p-2, -0x1.11776ffa6c67ep-2,
+	    -0x1.0c416035020e0p-2, -0x1.071211aa10fdap-2,
+	    -0x1.01e972e293b1bp-2, -0x1.f98ee587fd434p-3,
+	    -0x1.ef5800ad716fbp-3, -0x1.e52e160484698p-3,
+	    -0x1.db1104b19352ep-3, -0x1.d100ac59e0bd6p-3,
+	    -0x1.c6fced287c3bdp-3, -0x1.bd05a7b317c29p-3,
+	    -0x1.b31abd229164fp-3, -0x1.a93c0edadb0a3p-3,
+	    -0x1.9f697ee30d7ddp-3, -0x1.95a2efa9aa40ap-3,
+	    -0x1.8be843d796044p-3, -0x1.82395ecc477edp-3,
+	    -0x1.7896240966422p-3, -0x1.6efe77aca8c55p-3,
+	    -0x1.65723e117ec5cp-3, -0x1.5bf15c0955706p-3,
+	    -0x1.527bb6c111da1p-3, -0x1.491133c939f8fp-3,
+	    -0x1.3fb1b90c7fc58p-3, -0x1.365d2cc485f8dp-3,
+	    -0x1.2d13758970de7p-3, -0x1.23d47a721fd47p-3,
+	    -0x1.1aa0229f25ec2p-3, -0x1.117655ddebc3bp-3,
+	    -0x1.0856fbf83ab6bp-3, -0x1.fe83fabbaa106p-4,
+	    -0x1.ec6e8507a56cdp-4, -0x1.da6d68c7cc2eap-4,
+	    -0x1.c88078462be0cp-4, -0x1.b6a786a423565p-4,
+	    -0x1.a4e2676ac7f85p-4, -0x1.9330eea777e76p-4,
+	    -0x1.8192f134d5ad9p-4, -0x1.70084464f0538p-4,
+	    -0x1.5e90bdec5cb1fp-4, -0x1.4d2c3433c5536p-4,
+	    -0x1.3bda7e219879ap-4, -0x1.2a9b732d27194p-4,
+	    -0x1.196eeb2b10807p-4, -0x1.0854be8ef8a7ep-4,
+	    -0x1.ee998cb277432p-5, -0x1.ccadb79919fb9p-5,
+	    -0x1.aae5b1d8618b0p-5, -0x1.89413015d7442p-5,
+	    -0x1.67bfe7bf158dep-5, -0x1.46618f83941bep-5,
+	    -0x1.2525df1b0618ap-5, -0x1.040c8e2f77c6ap-5,
+	    -0x1.c62aad39f738ap-6, -0x1.847fe3bdead9cp-6,
+	    -0x1.43183683400acp-6, -0x1.01f31c4e1d544p-6,
+	    -0x1.82201d1e6b69ap-7, -0x1.00dd0f3e1bfd6p-7,
+	    -0x1.ff6fe1feb4e53p-9, 0.0,
+	    0x1.fe91885ec8e20p-8,  0x1.fc516f716296dp-7,
+	    0x1.7bb4dd70a015bp-6,  0x1.f84c99b34b674p-6,
+	    0x1.39f9ce4fb2d71p-5,  0x1.7756c0fd22e78p-5,
+	    0x1.b43ee82db8f3ap-5,  0x1.f0b3fced60034p-5,
+	    0x1.165bd78d4878ep-4,  0x1.3425d2715ebe6p-4,
+	    0x1.51b8bd91b7915p-4,  0x1.6f15632c76a47p-4,
+	    0x1.8c3c88ecbe503p-4,  0x1.a92ef077625dap-4,
+	    0x1.c5ed5745fa006p-4,  0x1.e27876de1c993p-4,
+	    0x1.fed104fce4cdcp-4,  0x1.0d7bd9c17d78bp-3,
+	    0x1.1b76986cef97bp-3,  0x1.295913d24f750p-3,
+	    0x1.37239fa295d17p-3,  0x1.44d68dd78714bp-3,
+	    0x1.52722ebe5d780p-3,  0x1.5ff6d12671f98p-3,
+	    0x1.6d64c2389484bp-3,  0x1.7abc4da40fddap-3,
+	    0x1.87fdbda1e8452p-3,  0x1.95295b06a5f37p-3,
+	    0x1.a23f6d34abbc5p-3,  0x1.af403a28e04f2p-3,
+	    0x1.bc2c06a85721ap-3,  0x1.c903161240163p-3,
+	    0x1.d5c5aa93287ebp-3,  0x1.e274051823fa9p-3,
+	    0x1.ef0e656300c16p-3,  0x1.fb9509f05aa2ap-3,
+	    0x1.04041821f37afp-2,  0x1.0a340a49b3029p-2,
+	    0x1.105a7918a126dp-2,  0x1.1677819812b84p-2,
+	    0x1.1c8b405b40c0ep-2,  0x1.2295d16cfa6b1p-2,
+	    0x1.28975066318a2p-2,  0x1.2e8fd855d86fcp-2,
+	    0x1.347f83d605e59p-2,  0x1.3a666d1244588p-2,
+	    0x1.4044adb6f8ec4p-2,  0x1.461a5f077558cp-2,
+	    0x1.4be799e20b9c8p-2,  0x1.51ac76a6b79dfp-2,
+	    0x1.57690d5744a45p-2,  0x1.5d1d758e45217p-2 }
+};
diff --git a/sysdeps/aarch64/fpu/vecmath_config.h b/sysdeps/aarch64/fpu/vecmath_config.h
index d0bdbb4ae8..0920658a0c 100644
--- a/sysdeps/aarch64/fpu/vecmath_config.h
+++ b/sysdeps/aarch64/fpu/vecmath_config.h
@@ -35,4 +35,14 @@
     __ptr;                                                                    \
   })
 
+#define V_LOG_POLY_ORDER 6
+#define V_LOG_TABLE_BITS 7
+extern const struct v_log_data
+{
+  /* Shared data for vector log and log-derived routines (e.g. asinh).  */
+  double poly[V_LOG_POLY_ORDER - 1];
+  double ln2;
+  double invc[1 << V_LOG_TABLE_BITS];
+  double logc[1 << V_LOG_TABLE_BITS];
+} __v_log_data attribute_hidden;
 #endif
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index 4145662b2d..7bdbf4614d 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -1219,10 +1219,18 @@ double: 3
 float: 3
 ldouble: 1
 
+Function: "log_advsimd":
+double: 1
+float: 3
+
 Function: "log_downward":
 float: 2
 ldouble: 1
 
+Function: "log_sve":
+double: 1
+float: 3
+
 Function: "log_towardzero":
 float: 2
 ldouble: 2
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index a4c564859c..1922191886 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -1,8 +1,12 @@
 GLIBC_2.38 _ZGVnN2v_cos F
+GLIBC_2.38 _ZGVnN2v_log F
 GLIBC_2.38 _ZGVnN2v_sin F
 GLIBC_2.38 _ZGVnN4v_cosf F
+GLIBC_2.38 _ZGVnN4v_logf F
 GLIBC_2.38 _ZGVnN4v_sinf F
 GLIBC_2.38 _ZGVsMxv_cos F
 GLIBC_2.38 _ZGVsMxv_cosf F
+GLIBC_2.38 _ZGVsMxv_log F
+GLIBC_2.38 _ZGVsMxv_logf F
 GLIBC_2.38 _ZGVsMxv_sin F
 GLIBC_2.38 _ZGVsMxv_sinf F

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2023-06-30  8:05 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-06-30  8:05 [glibc] aarch64: Add vector implementations of log routines Szabolcs Nagy

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).