public inbox for glibc-cvs@sourceware.org
help / color / mirror / Atom feed
* [glibc] aarch64: Optimise vecmath logs
@ 2023-10-05 15:57 Szabolcs Nagy
  0 siblings, 0 replies; only message in thread
From: Szabolcs Nagy @ 2023-10-05 15:57 UTC (permalink / raw)
  To: glibc-cvs

https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=5a4b6f8e4b7e2a76c71b713200a80181d745c93d

commit 5a4b6f8e4b7e2a76c71b713200a80181d745c93d
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date:   Wed Oct 4 10:38:57 2023 +0100

    aarch64: Optimise vecmath logs
    
    * Transpose table layout for improved memory access
    * Use half-vector special comparisons for AdvSIMD
    * Improve register use near special-case branches
      - Due to the presence of a function call, the return value would
        get moved out of x0 in order to comply with the PCS. Moving the
        final computation after the branch avoids this.
    
    Also change SVE routines to use overloaded intrinsics for readability.

Diff:
---
 sysdeps/aarch64/fpu/log_advsimd.c    |  36 +++--
 sysdeps/aarch64/fpu/log_sve.c        |  52 ++++---
 sysdeps/aarch64/fpu/logf_advsimd.c   |  26 ++--
 sysdeps/aarch64/fpu/logf_sve.c       |  40 +++---
 sysdeps/aarch64/fpu/v_log_data.c     | 260 +++++++++++++++++------------------
 sysdeps/aarch64/fpu/v_math.h         |  21 +--
 sysdeps/aarch64/fpu/vecmath_config.h |   6 +-
 7 files changed, 226 insertions(+), 215 deletions(-)

diff --git a/sysdeps/aarch64/fpu/log_advsimd.c b/sysdeps/aarch64/fpu/log_advsimd.c
index 434737f2a9..8b32d1cfe1 100644
--- a/sysdeps/aarch64/fpu/log_advsimd.c
+++ b/sysdeps/aarch64/fpu/log_advsimd.c
@@ -21,9 +21,11 @@
 
 static const struct data
 {
+  uint64x2_t min_norm;
+  uint32x4_t special_bound;
   float64x2_t poly[5];
   float64x2_t ln2;
-  uint64x2_t min_norm, special_bound, sign_exp_mask;
+  uint64x2_t sign_exp_mask;
 } data = {
   /* Worst-case error: 1.17 + 0.5 ulp.
      Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ].  */
@@ -32,7 +34,7 @@ static const struct data
 	    V2 (-0x1.554e550bd501ep-3) },
   .ln2 = V2 (0x1.62e42fefa39efp-1),
   .min_norm = V2 (0x0010000000000000),
-  .special_bound = V2 (0x7fe0000000000000), /* asuint64(inf) - min_norm.  */
+  .special_bound = V4 (0x7fe00000), /* (asuint64(inf) - min_norm) >> 32.  */
   .sign_exp_mask = V2 (0xfff0000000000000)
 };
 
@@ -52,29 +54,34 @@ lookup (uint64x2_t i)
 {
   /* Since N is a power of 2, n % N = n & (N - 1).  */
   struct entry e;
-  e.invc[0] = __v_log_data.invc[i[0] & IndexMask];
-  e.logc[0] = __v_log_data.logc[i[0] & IndexMask];
-  e.invc[1] = __v_log_data.invc[i[1] & IndexMask];
-  e.logc[1] = __v_log_data.logc[i[1] & IndexMask];
+  uint64_t i0 = (i[0] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+  uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+  float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
+  float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
+  e.invc = vuzp1q_f64 (e0, e1);
+  e.logc = vuzp2q_f64 (e0, e1);
   return e;
 }
 
 static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
+special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2,
+	      uint32x2_t cmp)
 {
-  return v_call_f64 (log, x, y, cmp);
+  return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (cmp));
 }
 
 float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x)
 {
   const struct data *d = ptr_barrier (&data);
   float64x2_t z, r, r2, p, y, kd, hi;
-  uint64x2_t ix, iz, tmp, cmp;
+  uint64x2_t ix, iz, tmp;
+  uint32x2_t cmp;
   int64x2_t k;
   struct entry e;
 
   ix = vreinterpretq_u64_f64 (x);
-  cmp = vcgeq_u64 (vsubq_u64 (ix, d->min_norm), d->special_bound);
+  cmp = vcge_u32 (vsubhn_u64 (ix, d->min_norm),
+		  vget_low_u32 (d->special_bound));
 
   /* x = 2^k z; where z is in range [Off,2*Off) and exact.
      The range is split into N subintervals.
@@ -83,7 +90,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x)
   k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift.  */
   iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask));
   z = vreinterpretq_f64_u64 (iz);
-  e = lookup (vshrq_n_u64 (tmp, 52 - V_LOG_TABLE_BITS));
+  e = lookup (tmp);
 
   /* log(x) = log1p(z/c-1) + log(c) + k*Ln2.  */
   r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
@@ -97,9 +104,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x)
   p = vfmaq_f64 (A (0), A (1), r);
   y = vfmaq_f64 (y, A (4), r2);
   y = vfmaq_f64 (p, y, r2);
-  y = vfmaq_f64 (hi, y, r2);
 
-  if (__glibc_unlikely (v_any_u64 (cmp)))
-    return special_case (x, y, cmp);
-  return y;
+  if (__glibc_unlikely (v_any_u32h (cmp)))
+    return special_case (x, y, hi, r2, cmp);
+  return vfmaq_f64 (hi, y, r2);
 }
diff --git a/sysdeps/aarch64/fpu/log_sve.c b/sysdeps/aarch64/fpu/log_sve.c
index 93c4f1c018..0c171a4d01 100644
--- a/sysdeps/aarch64/fpu/log_sve.c
+++ b/sysdeps/aarch64/fpu/log_sve.c
@@ -38,43 +38,39 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp)
 					 want 0x1.ffffff1cca045p-2.  */
 svfloat64_t SV_NAME_D1 (log) (svfloat64_t x, const svbool_t pg)
 {
-  svuint64_t ix = svreinterpret_u64_f64 (x);
-  svuint64_t top = svlsr_n_u64_x (pg, ix, 52);
-  svbool_t cmp
-      = svcmpge_u64 (pg, svsub_n_u64_x (pg, top, MinTop), sv_u64 (ThreshTop));
+  svuint64_t ix = svreinterpret_u64 (x);
+  svuint64_t top = svlsr_x (pg, ix, 52);
+  svbool_t cmp = svcmpge (pg, svsub_x (pg, top, MinTop), sv_u64 (ThreshTop));
 
   /* x = 2^k z; where z is in range [Off,2*Off) and exact.
      The range is split into N subintervals.
      The ith subinterval contains z and c is near its center.  */
-  svuint64_t tmp = svsub_n_u64_x (pg, ix, Off);
-  /* Equivalent to (tmp >> (52 - V_LOG_TABLE_BITS)) % N, since N is a power
-     of 2.  */
-  svuint64_t i = svand_n_u64_x (
-      pg, svlsr_n_u64_x (pg, tmp, (52 - V_LOG_TABLE_BITS)), N - 1);
-  svint64_t k = svasr_n_s64_x (pg, svreinterpret_s64_u64 (tmp),
-			       52); /* Arithmetic shift.  */
-  svuint64_t iz
-      = svsub_u64_x (pg, ix, svand_n_u64_x (pg, tmp, 0xfffULL << 52));
-  svfloat64_t z = svreinterpret_f64_u64 (iz);
+  svuint64_t tmp = svsub_x (pg, ix, Off);
+  /* Calculate table index = (tmp >> (52 - V_LOG_TABLE_BITS)) % N.
+     The actual value of i is double this due to table layout.  */
+  svuint64_t i
+      = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)), (N - 1) << 1);
+  svint64_t k
+      = svasr_x (pg, svreinterpret_s64 (tmp), 52); /* Arithmetic shift.  */
+  svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52));
+  svfloat64_t z = svreinterpret_f64 (iz);
   /* Lookup in 2 global lists (length N).  */
-  svfloat64_t invc = svld1_gather_u64index_f64 (pg, __v_log_data.invc, i);
-  svfloat64_t logc = svld1_gather_u64index_f64 (pg, __v_log_data.logc, i);
+  svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i);
+  svfloat64_t logc = svld1_gather_index (pg, &__v_log_data.table[0].logc, i);
 
   /* log(x) = log1p(z/c-1) + log(c) + k*Ln2.  */
-  svfloat64_t r = svmad_n_f64_x (pg, invc, z, -1);
-  svfloat64_t kd = svcvt_f64_s64_x (pg, k);
+  svfloat64_t r = svmad_x (pg, invc, z, -1);
+  svfloat64_t kd = svcvt_f64_x (pg, k);
   /* hi = r + log(c) + k*Ln2.  */
-  svfloat64_t hi
-      = svmla_n_f64_x (pg, svadd_f64_x (pg, logc, r), kd, __v_log_data.ln2);
+  svfloat64_t hi = svmla_x (pg, svadd_x (pg, logc, r), kd, __v_log_data.ln2);
   /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi.  */
-  svfloat64_t r2 = svmul_f64_x (pg, r, r);
-  svfloat64_t y = svmla_f64_x (pg, P (2), r, P (3));
-  svfloat64_t p = svmla_f64_x (pg, P (0), r, P (1));
-  y = svmla_f64_x (pg, y, r2, P (4));
-  y = svmla_f64_x (pg, p, r2, y);
-  y = svmla_f64_x (pg, hi, r2, y);
+  svfloat64_t r2 = svmul_x (pg, r, r);
+  svfloat64_t y = svmla_x (pg, P (2), r, P (3));
+  svfloat64_t p = svmla_x (pg, P (0), r, P (1));
+  y = svmla_x (pg, y, r2, P (4));
+  y = svmla_x (pg, p, r2, y);
 
   if (__glibc_unlikely (svptest_any (pg, cmp)))
-    return special_case (x, y, cmp);
-  return y;
+    return special_case (x, svmla_x (svnot_z (pg, cmp), hi, r2, y), cmp);
+  return svmla_x (pg, hi, r2, y);
 }
diff --git a/sysdeps/aarch64/fpu/logf_advsimd.c b/sysdeps/aarch64/fpu/logf_advsimd.c
index 375ad28b9f..93903c7962 100644
--- a/sysdeps/aarch64/fpu/logf_advsimd.c
+++ b/sysdeps/aarch64/fpu/logf_advsimd.c
@@ -21,9 +21,11 @@
 
 static const struct data
 {
+  uint32x4_t min_norm;
+  uint16x8_t special_bound;
   float32x4_t poly[7];
   float32x4_t ln2, tiny_bound;
-  uint32x4_t min_norm, special_bound, off, mantissa_mask;
+  uint32x4_t off, mantissa_mask;
 } data = {
   /* 3.34 ulp error.  */
   .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f),
@@ -32,28 +34,31 @@ static const struct data
   .ln2 = V4 (0x1.62e43p-1f),
   .tiny_bound = V4 (0x1p-126),
   .min_norm = V4 (0x00800000),
-  .special_bound = V4 (0x7f000000), /* asuint32(inf) - min_norm.  */
-  .off = V4 (0x3f2aaaab),	    /* 0.666667.  */
+  .special_bound = V8 (0x7f00), /* (asuint32(inf) - min_norm) >> 16.  */
+  .off = V4 (0x3f2aaaab),	/* 0.666667.  */
   .mantissa_mask = V4 (0x007fffff)
 };
 
 #define P(i) d->poly[7 - i]
 
 static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
+special_case (float32x4_t x, float32x4_t y, float32x4_t r2, float32x4_t p,
+	      uint16x4_t cmp)
 {
   /* Fall back to scalar code.  */
-  return v_call_f32 (logf, x, y, cmp);
+  return v_call_f32 (logf, x, vfmaq_f32 (p, y, r2), vmovl_u16 (cmp));
 }
 
 float32x4_t VPCS_ATTR V_NAME_F1 (log) (float32x4_t x)
 {
   const struct data *d = ptr_barrier (&data);
   float32x4_t n, p, q, r, r2, y;
-  uint32x4_t u, cmp;
+  uint32x4_t u;
+  uint16x4_t cmp;
 
   u = vreinterpretq_u32_f32 (x);
-  cmp = vcgeq_u32 (vsubq_u32 (u, d->min_norm), d->special_bound);
+  cmp = vcge_u16 (vsubhn_u32 (u, d->min_norm),
+		  vget_low_u16 (d->special_bound));
 
   /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3.  */
   u = vsubq_u32 (u, d->off);
@@ -73,9 +78,8 @@ float32x4_t VPCS_ATTR V_NAME_F1 (log) (float32x4_t x)
   q = vfmaq_f32 (q, p, r2);
   y = vfmaq_f32 (y, q, r2);
   p = vfmaq_f32 (r, d->ln2, n);
-  y = vfmaq_f32 (p, y, r2);
 
-  if (__glibc_unlikely (v_any_u32 (cmp)))
-    return special_case (x, y, cmp);
-  return y;
+  if (__glibc_unlikely (v_any_u16h (cmp)))
+    return special_case (x, y, r2, p, cmp);
+  return vfmaq_f32 (p, y, r2);
 }
diff --git a/sysdeps/aarch64/fpu/logf_sve.c b/sysdeps/aarch64/fpu/logf_sve.c
index 46c6e7c461..c02761188a 100644
--- a/sysdeps/aarch64/fpu/logf_sve.c
+++ b/sysdeps/aarch64/fpu/logf_sve.c
@@ -55,33 +55,31 @@ svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);
 
-  svuint32_t u = svreinterpret_u32_f32 (x);
-  svbool_t cmp = svcmpge_n_u32 (pg, svsub_n_u32_x (pg, u, Min), Thresh);
+  svuint32_t u = svreinterpret_u32 (x);
+  svbool_t cmp = svcmpge (pg, svsub_x (pg, u, Min), Thresh);
 
   /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3.  */
-  u = svsub_n_u32_x (pg, u, Off);
-  svfloat32_t n
-      = svcvt_f32_s32_x (pg, svasr_n_s32_x (pg, svreinterpret_s32_u32 (u),
-					    23)); /* Sign-extend.  */
-  u = svand_n_u32_x (pg, u, Mask);
-  u = svadd_n_u32_x (pg, u, Off);
-  svfloat32_t r = svsub_n_f32_x (pg, svreinterpret_f32_u32 (u), 1.0f);
+  u = svsub_x (pg, u, Off);
+  svfloat32_t n = svcvt_f32_x (
+      pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend.  */
+  u = svand_x (pg, u, Mask);
+  u = svadd_x (pg, u, Off);
+  svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f);
 
   /* y = log(1+r) + n*ln2.  */
-  svfloat32_t r2 = svmul_f32_x (pg, r, r);
+  svfloat32_t r2 = svmul_x (pg, r, r);
   /* n*ln2 + r + r2*(P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1 + r2*P0))).  */
-  svfloat32_t p_0135 = svld1rq_f32 (svptrue_b32 (), &d->poly_0135[0]);
-  svfloat32_t p = svmla_lane_f32 (sv_f32 (d->poly_246[0]), r, p_0135, 1);
-  svfloat32_t q = svmla_lane_f32 (sv_f32 (d->poly_246[1]), r, p_0135, 2);
-  svfloat32_t y = svmla_lane_f32 (sv_f32 (d->poly_246[2]), r, p_0135, 3);
-  p = svmla_lane_f32 (p, r2, p_0135, 0);
+  svfloat32_t p_0135 = svld1rq (svptrue_b32 (), &d->poly_0135[0]);
+  svfloat32_t p = svmla_lane (sv_f32 (d->poly_246[0]), r, p_0135, 1);
+  svfloat32_t q = svmla_lane (sv_f32 (d->poly_246[1]), r, p_0135, 2);
+  svfloat32_t y = svmla_lane (sv_f32 (d->poly_246[2]), r, p_0135, 3);
+  p = svmla_lane (p, r2, p_0135, 0);
 
-  q = svmla_f32_x (pg, q, r2, p);
-  y = svmla_f32_x (pg, y, r2, q);
-  p = svmla_n_f32_x (pg, r, n, d->ln2);
-  y = svmla_f32_x (pg, p, r2, y);
+  q = svmla_x (pg, q, r2, p);
+  y = svmla_x (pg, y, r2, q);
+  p = svmla_x (pg, r, n, d->ln2);
 
   if (__glibc_unlikely (svptest_any (pg, cmp)))
-    return special_case (x, y, cmp);
-  return y;
+    return special_case (x, svmla_x (svnot_z (pg, cmp), p, r2, y), cmp);
+  return svmla_x (pg, p, r2, y);
 }
diff --git a/sysdeps/aarch64/fpu/v_log_data.c b/sysdeps/aarch64/fpu/v_log_data.c
index 6fd6f43695..99506e3cde 100644
--- a/sysdeps/aarch64/fpu/v_log_data.c
+++ b/sysdeps/aarch64/fpu/v_log_data.c
@@ -34,140 +34,140 @@ const struct v_log_data __v_log_data = {
      N=128) and log(c) and 1/c for the ith subinterval comes from two lookup
      tables:
 
-	invc[i] = 1/c
-	logc[i] = (double)log(c)
+	table[i].invc = 1/c
+	table[i].logc = (double)log(c)
 
      where c is near the center of the subinterval and is chosen by trying
      several floating point invc candidates around 1/center and selecting one
      for which the error in (double)log(c) is minimized (< 0x1p-74), except the
      subinterval that contains 1 and the previous one got tweaked to avoid
      cancellation.  */
-  .invc = { 0x1.6a133d0dec120p+0, 0x1.6815f2f3e42edp+0,
-	    0x1.661e39be1ac9ep+0, 0x1.642bfa30ac371p+0,
-	    0x1.623f1d916f323p+0, 0x1.60578da220f65p+0,
-	    0x1.5e75349dea571p+0, 0x1.5c97fd387a75ap+0,
-	    0x1.5abfd2981f200p+0, 0x1.58eca051dc99cp+0,
-	    0x1.571e526d9df12p+0, 0x1.5554d555b3fcbp+0,
-	    0x1.539015e2a20cdp+0, 0x1.51d0014ee0164p+0,
-	    0x1.50148538cd9eep+0, 0x1.4e5d8f9f698a1p+0,
-	    0x1.4cab0edca66bep+0, 0x1.4afcf1a9db874p+0,
-	    0x1.495327136e16fp+0, 0x1.47ad9e84af28fp+0,
-	    0x1.460c47b39ae15p+0, 0x1.446f12b278001p+0,
-	    0x1.42d5efdd720ecp+0, 0x1.4140cfe001a0fp+0,
-	    0x1.3fafa3b421f69p+0, 0x1.3e225c9c8ece5p+0,
-	    0x1.3c98ec29a211ap+0, 0x1.3b13442a413fep+0,
-	    0x1.399156baa3c54p+0, 0x1.38131639b4cdbp+0,
-	    0x1.36987540fbf53p+0, 0x1.352166b648f61p+0,
-	    0x1.33adddb3eb575p+0, 0x1.323dcd99fc1d3p+0,
-	    0x1.30d129fefc7d2p+0, 0x1.2f67e6b72fe7dp+0,
-	    0x1.2e01f7cf8b187p+0, 0x1.2c9f518ddc86ep+0,
-	    0x1.2b3fe86e5f413p+0, 0x1.29e3b1211b25cp+0,
-	    0x1.288aa08b373cfp+0, 0x1.2734abcaa8467p+0,
-	    0x1.25e1c82459b81p+0, 0x1.2491eb1ad59c5p+0,
-	    0x1.23450a54048b5p+0, 0x1.21fb1bb09e578p+0,
-	    0x1.20b415346d8f7p+0, 0x1.1f6fed179a1acp+0,
-	    0x1.1e2e99b93c7b3p+0, 0x1.1cf011a7a882ap+0,
-	    0x1.1bb44b97dba5ap+0, 0x1.1a7b3e66cdd4fp+0,
-	    0x1.1944e11dc56cdp+0, 0x1.18112aebb1a6ep+0,
-	    0x1.16e013231b7e9p+0, 0x1.15b1913f156cfp+0,
-	    0x1.14859cdedde13p+0, 0x1.135c2dc68cfa4p+0,
-	    0x1.12353bdb01684p+0, 0x1.1110bf25b85b4p+0,
-	    0x1.0feeafd2f8577p+0, 0x1.0ecf062c51c3bp+0,
-	    0x1.0db1baa076c8bp+0, 0x1.0c96c5bb3048ep+0,
-	    0x1.0b7e20263e070p+0, 0x1.0a67c2acd0ce3p+0,
-	    0x1.0953a6391e982p+0, 0x1.0841c3caea380p+0,
-	    0x1.07321489b13eap+0, 0x1.062491aee9904p+0,
-	    0x1.05193497a7cc5p+0, 0x1.040ff6b5f5e9fp+0,
-	    0x1.0308d19aa6127p+0, 0x1.0203beedb0c67p+0,
-	    0x1.010037d38bcc2p+0, 1.0,
-	    0x1.fc06d493cca10p-1, 0x1.f81e6ac3b918fp-1,
-	    0x1.f44546ef18996p-1, 0x1.f07b10382c84bp-1,
-	    0x1.ecbf7070e59d4p-1, 0x1.e91213f715939p-1,
-	    0x1.e572a9a75f7b7p-1, 0x1.e1e0e2c530207p-1,
-	    0x1.de5c72d8a8be3p-1, 0x1.dae50fa5658ccp-1,
-	    0x1.d77a71145a2dap-1, 0x1.d41c51166623ep-1,
-	    0x1.d0ca6ba0bb29fp-1, 0x1.cd847e8e59681p-1,
-	    0x1.ca4a499693e00p-1, 0x1.c71b8e399e821p-1,
-	    0x1.c3f80faf19077p-1, 0x1.c0df92dc2b0ecp-1,
-	    0x1.bdd1de3cbb542p-1, 0x1.baceb9e1007a3p-1,
-	    0x1.b7d5ef543e55ep-1, 0x1.b4e749977d953p-1,
-	    0x1.b20295155478ep-1, 0x1.af279f8e82be2p-1,
-	    0x1.ac5638197fdf3p-1, 0x1.a98e2f102e087p-1,
-	    0x1.a6cf5606d05c1p-1, 0x1.a4197fc04d746p-1,
-	    0x1.a16c80293dc01p-1, 0x1.9ec82c4dc5bc9p-1,
-	    0x1.9c2c5a491f534p-1, 0x1.9998e1480b618p-1,
-	    0x1.970d9977c6c2dp-1, 0x1.948a5c023d212p-1,
-	    0x1.920f0303d6809p-1, 0x1.8f9b698a98b45p-1,
-	    0x1.8d2f6b81726f6p-1, 0x1.8acae5bb55badp-1,
-	    0x1.886db5d9275b8p-1, 0x1.8617ba567c13cp-1,
-	    0x1.83c8d27487800p-1, 0x1.8180de3c5dbe7p-1,
-	    0x1.7f3fbe71cdb71p-1, 0x1.7d055498071c1p-1,
-	    0x1.7ad182e54f65ap-1, 0x1.78a42c3c90125p-1,
-	    0x1.767d342f76944p-1, 0x1.745c7ef26b00ap-1,
-	    0x1.7241f15769d0fp-1, 0x1.702d70d396e41p-1,
-	    0x1.6e1ee3700cd11p-1, 0x1.6c162fc9cbe02p-1 },
-  .logc = { -0x1.62fe995eb963ap-2, -0x1.5d5a48dad6b67p-2,
-	    -0x1.57bde257d2769p-2, -0x1.52294fbf2af55p-2,
-	    -0x1.4c9c7b598aa38p-2, -0x1.47174fc5ff560p-2,
-	    -0x1.4199b7fa7b5cap-2, -0x1.3c239f48cfb99p-2,
-	    -0x1.36b4f154d2aebp-2, -0x1.314d9a0ff32fbp-2,
-	    -0x1.2bed85cca3cffp-2, -0x1.2694a11421af9p-2,
-	    -0x1.2142d8d014fb2p-2, -0x1.1bf81a2c77776p-2,
-	    -0x1.16b452a39c6a4p-2, -0x1.11776ffa6c67ep-2,
-	    -0x1.0c416035020e0p-2, -0x1.071211aa10fdap-2,
-	    -0x1.01e972e293b1bp-2, -0x1.f98ee587fd434p-3,
-	    -0x1.ef5800ad716fbp-3, -0x1.e52e160484698p-3,
-	    -0x1.db1104b19352ep-3, -0x1.d100ac59e0bd6p-3,
-	    -0x1.c6fced287c3bdp-3, -0x1.bd05a7b317c29p-3,
-	    -0x1.b31abd229164fp-3, -0x1.a93c0edadb0a3p-3,
-	    -0x1.9f697ee30d7ddp-3, -0x1.95a2efa9aa40ap-3,
-	    -0x1.8be843d796044p-3, -0x1.82395ecc477edp-3,
-	    -0x1.7896240966422p-3, -0x1.6efe77aca8c55p-3,
-	    -0x1.65723e117ec5cp-3, -0x1.5bf15c0955706p-3,
-	    -0x1.527bb6c111da1p-3, -0x1.491133c939f8fp-3,
-	    -0x1.3fb1b90c7fc58p-3, -0x1.365d2cc485f8dp-3,
-	    -0x1.2d13758970de7p-3, -0x1.23d47a721fd47p-3,
-	    -0x1.1aa0229f25ec2p-3, -0x1.117655ddebc3bp-3,
-	    -0x1.0856fbf83ab6bp-3, -0x1.fe83fabbaa106p-4,
-	    -0x1.ec6e8507a56cdp-4, -0x1.da6d68c7cc2eap-4,
-	    -0x1.c88078462be0cp-4, -0x1.b6a786a423565p-4,
-	    -0x1.a4e2676ac7f85p-4, -0x1.9330eea777e76p-4,
-	    -0x1.8192f134d5ad9p-4, -0x1.70084464f0538p-4,
-	    -0x1.5e90bdec5cb1fp-4, -0x1.4d2c3433c5536p-4,
-	    -0x1.3bda7e219879ap-4, -0x1.2a9b732d27194p-4,
-	    -0x1.196eeb2b10807p-4, -0x1.0854be8ef8a7ep-4,
-	    -0x1.ee998cb277432p-5, -0x1.ccadb79919fb9p-5,
-	    -0x1.aae5b1d8618b0p-5, -0x1.89413015d7442p-5,
-	    -0x1.67bfe7bf158dep-5, -0x1.46618f83941bep-5,
-	    -0x1.2525df1b0618ap-5, -0x1.040c8e2f77c6ap-5,
-	    -0x1.c62aad39f738ap-6, -0x1.847fe3bdead9cp-6,
-	    -0x1.43183683400acp-6, -0x1.01f31c4e1d544p-6,
-	    -0x1.82201d1e6b69ap-7, -0x1.00dd0f3e1bfd6p-7,
-	    -0x1.ff6fe1feb4e53p-9, 0.0,
-	    0x1.fe91885ec8e20p-8,  0x1.fc516f716296dp-7,
-	    0x1.7bb4dd70a015bp-6,  0x1.f84c99b34b674p-6,
-	    0x1.39f9ce4fb2d71p-5,  0x1.7756c0fd22e78p-5,
-	    0x1.b43ee82db8f3ap-5,  0x1.f0b3fced60034p-5,
-	    0x1.165bd78d4878ep-4,  0x1.3425d2715ebe6p-4,
-	    0x1.51b8bd91b7915p-4,  0x1.6f15632c76a47p-4,
-	    0x1.8c3c88ecbe503p-4,  0x1.a92ef077625dap-4,
-	    0x1.c5ed5745fa006p-4,  0x1.e27876de1c993p-4,
-	    0x1.fed104fce4cdcp-4,  0x1.0d7bd9c17d78bp-3,
-	    0x1.1b76986cef97bp-3,  0x1.295913d24f750p-3,
-	    0x1.37239fa295d17p-3,  0x1.44d68dd78714bp-3,
-	    0x1.52722ebe5d780p-3,  0x1.5ff6d12671f98p-3,
-	    0x1.6d64c2389484bp-3,  0x1.7abc4da40fddap-3,
-	    0x1.87fdbda1e8452p-3,  0x1.95295b06a5f37p-3,
-	    0x1.a23f6d34abbc5p-3,  0x1.af403a28e04f2p-3,
-	    0x1.bc2c06a85721ap-3,  0x1.c903161240163p-3,
-	    0x1.d5c5aa93287ebp-3,  0x1.e274051823fa9p-3,
-	    0x1.ef0e656300c16p-3,  0x1.fb9509f05aa2ap-3,
-	    0x1.04041821f37afp-2,  0x1.0a340a49b3029p-2,
-	    0x1.105a7918a126dp-2,  0x1.1677819812b84p-2,
-	    0x1.1c8b405b40c0ep-2,  0x1.2295d16cfa6b1p-2,
-	    0x1.28975066318a2p-2,  0x1.2e8fd855d86fcp-2,
-	    0x1.347f83d605e59p-2,  0x1.3a666d1244588p-2,
-	    0x1.4044adb6f8ec4p-2,  0x1.461a5f077558cp-2,
-	    0x1.4be799e20b9c8p-2,  0x1.51ac76a6b79dfp-2,
-	    0x1.57690d5744a45p-2,  0x1.5d1d758e45217p-2 }
+  .table = { { 0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2 },
+	     { 0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2 },
+	     { 0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2 },
+	     { 0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2 },
+	     { 0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2 },
+	     { 0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2 },
+	     { 0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2 },
+	     { 0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2 },
+	     { 0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2 },
+	     { 0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2 },
+	     { 0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2 },
+	     { 0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2 },
+	     { 0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2 },
+	     { 0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2 },
+	     { 0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2 },
+	     { 0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2 },
+	     { 0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2 },
+	     { 0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2 },
+	     { 0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2 },
+	     { 0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3 },
+	     { 0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3 },
+	     { 0x1.446f12b278001p+0, -0x1.e52e160484698p-3 },
+	     { 0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3 },
+	     { 0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3 },
+	     { 0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3 },
+	     { 0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3 },
+	     { 0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3 },
+	     { 0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3 },
+	     { 0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3 },
+	     { 0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3 },
+	     { 0x1.36987540fbf53p+0, -0x1.8be843d796044p-3 },
+	     { 0x1.352166b648f61p+0, -0x1.82395ecc477edp-3 },
+	     { 0x1.33adddb3eb575p+0, -0x1.7896240966422p-3 },
+	     { 0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3 },
+	     { 0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3 },
+	     { 0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3 },
+	     { 0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3 },
+	     { 0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3 },
+	     { 0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3 },
+	     { 0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3 },
+	     { 0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3 },
+	     { 0x1.2734abcaa8467p+0, -0x1.23d47a721fd47p-3 },
+	     { 0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3 },
+	     { 0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3 },
+	     { 0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3 },
+	     { 0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4 },
+	     { 0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4 },
+	     { 0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4 },
+	     { 0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4 },
+	     { 0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4 },
+	     { 0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4 },
+	     { 0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4 },
+	     { 0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4 },
+	     { 0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4 },
+	     { 0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4 },
+	     { 0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4 },
+	     { 0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4 },
+	     { 0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4 },
+	     { 0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4 },
+	     { 0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4 },
+	     { 0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5 },
+	     { 0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5 },
+	     { 0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5 },
+	     { 0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5 },
+	     { 0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5 },
+	     { 0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5 },
+	     { 0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5 },
+	     { 0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5 },
+	     { 0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6 },
+	     { 0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6 },
+	     { 0x1.05193497a7cc5p+0, -0x1.43183683400acp-6 },
+	     { 0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6 },
+	     { 0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7 },
+	     { 0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7 },
+	     { 0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9 },
+	     { 1.0, 0.0 },
+	     { 0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8 },
+	     { 0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7 },
+	     { 0x1.f44546ef18996p-1, 0x1.7bb4dd70a015bp-6 },
+	     { 0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6 },
+	     { 0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5 },
+	     { 0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5 },
+	     { 0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5 },
+	     { 0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5 },
+	     { 0x1.de5c72d8a8be3p-1, 0x1.165bd78d4878ep-4 },
+	     { 0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4 },
+	     { 0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4 },
+	     { 0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4 },
+	     { 0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4 },
+	     { 0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4 },
+	     { 0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4 },
+	     { 0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4 },
+	     { 0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4 },
+	     { 0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3 },
+	     { 0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3 },
+	     { 0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3 },
+	     { 0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3 },
+	     { 0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3 },
+	     { 0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3 },
+	     { 0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3 },
+	     { 0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3 },
+	     { 0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3 },
+	     { 0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3 },
+	     { 0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3 },
+	     { 0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3 },
+	     { 0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3 },
+	     { 0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3 },
+	     { 0x1.9998e1480b618p-1, 0x1.c903161240163p-3 },
+	     { 0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3 },
+	     { 0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3 },
+	     { 0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3 },
+	     { 0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3 },
+	     { 0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2 },
+	     { 0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2 },
+	     { 0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2 },
+	     { 0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2 },
+	     { 0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2 },
+	     { 0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2 },
+	     { 0x1.7f3fbe71cdb71p-1, 0x1.28975066318a2p-2 },
+	     { 0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2 },
+	     { 0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2 },
+	     { 0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2 },
+	     { 0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2 },
+	     { 0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2 },
+	     { 0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2 },
+	     { 0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2 },
+	     { 0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2 },
+	     { 0x1.6c162fc9cbe02p-1, 0x1.5d1d758e45217p-2 } }
 };
diff --git a/sysdeps/aarch64/fpu/v_math.h b/sysdeps/aarch64/fpu/v_math.h
index 43efd8f99d..cfc87f8dd0 100644
--- a/sysdeps/aarch64/fpu/v_math.h
+++ b/sysdeps/aarch64/fpu/v_math.h
@@ -30,15 +30,15 @@
 #define V_NAME_D2(fun) _ZGVnN2vv_##fun
 
 /* Shorthand helpers for declaring constants.  */
-#define V2(x)                                                                  \
-  {                                                                            \
-    x, x                                                                       \
-  }
+#define V2(X) { X, X }
+#define V4(X) { X, X, X, X }
+#define V8(X) { X, X, X, X, X, X, X, X }
 
-#define V4(x)                                                                  \
-  {                                                                            \
-    x, x, x, x                                                                 \
-  }
+static inline int
+v_any_u16h (uint16x4_t x)
+{
+  return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0;
+}
 
 static inline float32x4_t
 v_f32 (float x)
@@ -63,6 +63,11 @@ v_any_u32 (uint32x4_t x)
   /* assume elements in x are either 0 or -1u.  */
   return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0;
 }
+static inline int
+v_any_u32h (uint32x2_t x)
+{
+  return vget_lane_u64 (vreinterpret_u64_u32 (x), 0) != 0;
+}
 static inline float32x4_t
 v_lookup_f32 (const float *tab, uint32x4_t idx)
 {
diff --git a/sysdeps/aarch64/fpu/vecmath_config.h b/sysdeps/aarch64/fpu/vecmath_config.h
index e7d30b477f..0abfd8b701 100644
--- a/sysdeps/aarch64/fpu/vecmath_config.h
+++ b/sysdeps/aarch64/fpu/vecmath_config.h
@@ -42,8 +42,10 @@ extern const struct v_log_data
   /* Shared data for vector log and log-derived routines (e.g. asinh).  */
   double poly[V_LOG_POLY_ORDER - 1];
   double ln2;
-  double invc[1 << V_LOG_TABLE_BITS];
-  double logc[1 << V_LOG_TABLE_BITS];
+  struct
+  {
+    double invc, logc;
+  } table[1 << V_LOG_TABLE_BITS];
 } __v_log_data attribute_hidden;
 
 #define V_EXP_TABLE_BITS 7

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2023-10-05 15:57 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-10-05 15:57 [glibc] aarch64: Optimise vecmath logs Szabolcs Nagy

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).