public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* [PATCH] x86-64: Impelment __ieee754_remainder with static rounding
@ 2021-10-15  2:52 Wangyang Guo
  2021-10-18 18:00 ` Joseph Myers
  0 siblings, 1 reply; 6+ messages in thread
From: Wangyang Guo @ 2021-10-15  2:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: Wangyang Guo

__ieee754_remainder sets and resets MXCSR register to use a specific
rounding mode.  Save, set and restore MXCSR register has overhead.
Use AVX512 static rounding to override MXCSR-based rounding control
for floating point instructions with rounding semantics.  It can
improve the performance of 2 indicators in bench-math-inlines:
* remainder_test1.normal: +18.9%
* remainder_test2.normal: +19.4%

Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
---
 sysdeps/ieee754/dbl-64/e_remainder.c          | 105 +++++++++++-------
 .../ieee754/dbl-64/remainder-round-dbl-64.h   |  42 +++++++
 sysdeps/x86_64/fpu/multiarch/Makefile         |   4 +
 .../x86_64/fpu/multiarch/e_remainder-avx512.c |  20 ++++
 sysdeps/x86_64/fpu/multiarch/e_remainder.c    |  32 ++++++
 sysdeps/x86_64/fpu/multiarch/ifunc-avx512f.h  |  32 ++++++
 .../fpu/multiarch/remainder-round-dbl-64.h    |  66 +++++++++++
 7 files changed, 261 insertions(+), 40 deletions(-)
 create mode 100644 sysdeps/ieee754/dbl-64/remainder-round-dbl-64.h
 create mode 100644 sysdeps/x86_64/fpu/multiarch/e_remainder-avx512.c
 create mode 100644 sysdeps/x86_64/fpu/multiarch/e_remainder.c
 create mode 100644 sysdeps/x86_64/fpu/multiarch/ifunc-avx512f.h
 create mode 100644 sysdeps/x86_64/fpu/multiarch/remainder-round-dbl-64.h

diff --git a/sysdeps/ieee754/dbl-64/e_remainder.c b/sysdeps/ieee754/dbl-64/e_remainder.c
index d076a37168..257e41ae7a 100644
--- a/sysdeps/ieee754/dbl-64/e_remainder.c
+++ b/sysdeps/ieee754/dbl-64/e_remainder.c
@@ -36,6 +36,7 @@
 #include <math_private.h>
 #include <fenv_private.h>
 #include <libm-alias-finite.h>
+#include <remainder-round-dbl-64.h>
 
 /**************************************************************************/
 /* An ultimate remainder routine. Given two IEEE double machine numbers x */
@@ -44,9 +45,8 @@
 double
 __ieee754_remainder (double x, double y)
 {
-  double z, d, xx;
   int4 kx, ky, n, nn, n1, m1, l;
-  mynumber u, t, w = { { 0, 0 } }, v = { { 0, 0 } }, ww = { { 0, 0 } }, r;
+  mynumber u, t;
   u.x = x;
   t.x = y;
   kx = u.i[HIGH_HALF] & 0x7fffffff; /* no sign  for x*/
@@ -55,65 +55,88 @@ __ieee754_remainder (double x, double y)
   /*------ |x| < 2^1023  and   2^-970 < |y| < 2^1024 ------------------*/
   if (kx < 0x7fe00000 && ky < 0x7ff00000 && ky >= 0x03500000)
     {
-      SET_RESTORE_ROUND_NOEX (FE_TONEAREST);
+      SET_ENV (FE_TONEAREST);
+      MYNUMBER_R_TYPE r;
+      MYNUMBER_R_DECL_VAL (u_r, u);
+      MYNUMBER_R_DECL_VAL (t_r, t);
+      MYNUMBER_R_DECL_ZERO (w);
+      MYNUMBER_R_DECL_ZERO (v);
+      MYNUMBER_R_DECL_ZERO (ww);
+      DOUBLE_R_TYPE z_r, d_r, xx;
+      DOUBLE_R_DECL_VAL (x_r, x);
+
       if (kx + 0x00100000 < ky)
 	return x;
       if ((kx - 0x01500000) < ky)
 	{
-	  z = x / t.x;
-	  v.i[HIGH_HALF] = t.i[HIGH_HALF];
-	  d = (z + big.x) - big.x;
-	  xx = (x - d * v.x) - d * (t.x - v.x);
-	  if (d - z != 0.5 && d - z != -0.5)
-	    return (xx != 0) ? xx : ((x > 0) ? ZERO.x : nZERO.x);
+	  z_r = DIV_RN (x_r, t_r.x);
+	  COPY_HIGH_HALF (v, t_r);
+	  d_r = ROUND_TO_ZERO (z_r);
+	  xx = FNMADD_RN (d_r, SUB_RN (t_r.x, v.x),
+			  FNMADD_RN (d_r, v.x, x_r));
+	  if (IS_NEQ (ABS (SUB_RN (d_r, z_r)), 0.5))
+	    return (IS_NEQ (xx, 0)
+		    ? TO_FP64 (xx)
+		    : ((x > 0) ? ZERO.x : nZERO.x));
 	  else
 	    {
-	      if (fabs (xx) > 0.5 * t.x)
-		return (z > d) ? xx - t.x : xx + t.x;
+	      if (IS_GT (ABS (xx), MUL_RN (TO_V (0.5), t_r.x)))
+		return (IS_GT (z_r, d_r)
+			? TO_FP64 (SUB_RN (xx, t_r.x))
+			: TO_FP64 (ADD_RN (xx, t_r.x)));
 	      else
-		return xx;
+		return TO_FP64 (xx);
 	    }
 	} /*    (kx<(ky+0x01500000))         */
       else
 	{
-	  r.x = 1.0 / t.x;
-	  n = t.i[HIGH_HALF];
+	  r.x = DIV_RN (TO_V (1.0), t_r.x);
+	  n = GET_HIGH_HALF (t_r);
 	  nn = (n & 0x7ff00000) + 0x01400000;
-	  w.i[HIGH_HALF] = n;
-	  ww.x = t.x - w.x;
+	  SET_HIGH_HALF (w, n);
+	  ww.x = SUB_RN (t_r.x, w.x);
 	  l = (kx - nn) & 0xfff00000;
-	  n1 = ww.i[HIGH_HALF];
-	  m1 = r.i[HIGH_HALF];
+	  n1 = GET_HIGH_HALF (ww);
+	  m1 = GET_HIGH_HALF (r);
 	  while (l > 0)
 	    {
-	      r.i[HIGH_HALF] = m1 - l;
-	      z = u.x * r.x;
-	      w.i[HIGH_HALF] = n + l;
-	      ww.i[HIGH_HALF] = (n1) ? n1 + l : n1;
-	      d = (z + big.x) - big.x;
-	      u.x = (u.x - d * w.x) - d * ww.x;
-	      l = (u.i[HIGH_HALF] & 0x7ff00000) - nn;
+	      SET_HIGH_HALF (r, (m1 - l));
+	      z_r = MUL_RN (u_r.x, r.x);
+	      SET_HIGH_HALF (w, (n + l));
+	      SET_HIGH_HALF (ww, ((n1) ? n1 + l: n1));
+	      d_r = ROUND_TO_ZERO (z_r);
+	      u_r.x = SUB_RN (FNMADD_RN (d_r, w.x, u_r.x),
+			      MUL_RN (d_r, ww.x));
+	      l = (GET_HIGH_HALF (u_r) & 0x7ff00000) - nn;
 	    }
-	  r.i[HIGH_HALF] = m1;
-	  w.i[HIGH_HALF] = n;
-	  ww.i[HIGH_HALF] = n1;
-	  z = u.x * r.x;
-	  d = (z + big.x) - big.x;
-	  u.x = (u.x - d * w.x) - d * ww.x;
-	  if (fabs (u.x) < 0.5 * t.x)
-	    return (u.x != 0) ? u.x : ((x > 0) ? ZERO.x : nZERO.x);
-	  else
-	  if (fabs (u.x) > 0.5 * t.x)
-	    return (d > z) ? u.x + t.x : u.x - t.x;
+	  SET_HIGH_HALF (r, m1);
+	  SET_HIGH_HALF (w, n);
+	  SET_HIGH_HALF (ww, n1);
+	  z_r = MUL_RN (u_r.x, r.x);
+	  d_r = ROUND_TO_ZERO (z_r);
+	  u_r.x = SUB_RN (FNMADD_RN (d_r, w.x, u_r.x),
+			  MUL_RN (d_r, ww.x));
+	  if (IS_LT (ABS (u_r.x), MUL_RN (TO_V (0.5), t_r.x)))
+	    return (IS_NEQ (u_r.x, 0)
+		    ? TO_FP64 (u_r.x)
+		    : ((x > 0) ? ZERO.x : nZERO.x));
 	  else
-	    {
-	      z = u.x / t.x; d = (z + big.x) - big.x;
-              return ((u.x - d * w.x) - d * ww.x);
-	    }
+	    if (IS_GT (ABS (u_r.x), MUL_RN (TO_V (0.5), t_r.x)))
+	      return (IS_GT (d_r, z_r)
+		      ? TO_FP64 (ADD_RN (u_r.x, t_r.x))
+		      : TO_FP64 (SUB_RN (u_r.x, t_r.x)));
+	    else
+	      {
+		z_r = DIV_RN (u_r.x, t_r.x);
+		d_r = ROUND_TO_ZERO (z_r);
+		return TO_FP64 (SUB_RN (FNMADD_RN (d_r, w.x, u_r.x),
+					MUL_RN (d_r, ww.x)));
+	      }
 	}
     } /*   (kx<0x7fe00000&&ky<0x7ff00000&&ky>=0x03500000)     */
   else
     {
+      double z, d;
       if (kx < 0x7fe00000 && ky < 0x7ff00000 && (ky > 0 || t.i[LOW_HALF] != 0))
 	{
 	  y = fabs (y) * t128.x;
@@ -150,4 +173,6 @@ __ieee754_remainder (double x, double y)
 	}
     }
 }
+#ifndef __ieee754_remainder
 libm_alias_finite (__ieee754_remainder, __remainder)
+#endif
diff --git a/sysdeps/ieee754/dbl-64/remainder-round-dbl-64.h b/sysdeps/ieee754/dbl-64/remainder-round-dbl-64.h
new file mode 100644
index 0000000000..2fc2558cb8
--- /dev/null
+++ b/sysdeps/ieee754/dbl-64/remainder-round-dbl-64.h
@@ -0,0 +1,42 @@
+/* Used by remainder functions.  Generic version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define SET_ENV(RM) SET_RESTORE_ROUND_NOEX (RM)
+
+#define DOUBLE_R_TYPE               double
+#define DOUBLE_R_DECL_VAL(v_r, v)   double v_r = (v)
+#define MYNUMBER_R_TYPE             mynumber
+#define MYNUMBER_R_DECL_VAL(v_r, v) mynumber v_r = (v)
+#define MYNUMBER_R_DECL_ZERO(v_r)   mynumber v_r = { { 0, 0 } }
+
+#define COPY_HIGH_HALF(a, b)        (a).i[HIGH_HALF] = (b).i[HIGH_HALF]
+#define SET_HIGH_HALF(a, b)         (a).i[HIGH_HALF] = (b)
+#define GET_HIGH_HALF(a)            (a).i[HIGH_HALF]
+
+#define ROUND_TO_ZERO(a)            (((a) + big.x) - big.x)
+#define ADD_RN(a, b)                ((a) + (b))
+#define SUB_RN(a, b)                ((a) - (b))
+#define MUL_RN(a, b)                ((a) * (b))
+#define DIV_RN(a, b)                ((a) / (b))
+#define FNMADD_RN(a, b, c)          ((c) - (a) * (b))
+#define IS_NEQ(a, b)                ((a) != (b))
+#define IS_LT(a, b)                 ((a) < (b))
+#define IS_GT(a, b)                 ((a) > (b))
+#define TO_FP64(a)                  ((double) (a))
+#define TO_V(a)                     ((double) (a))
+#define ABS(a)                      (fabs (a))
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index d425ffd6d3..06c61241a0 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -56,6 +56,10 @@ CFLAGS-e_log-avx.c = -msse2avx -DSSE2AVX
 CFLAGS-s_atan-avx.c = -msse2avx -DSSE2AVX
 CFLAGS-s_sin-avx.c = -msse2avx -DSSE2AVX
 CFLAGS-s_tan-avx.c = -msse2avx -DSSE2AVX
+
+libm-sysdep_routines += e_remainder-avx512
+
+CFLAGS-e_remainder-avx512.c = -mavx512f
 endif
 
 ifeq ($(subdir),mathvec)
diff --git a/sysdeps/x86_64/fpu/multiarch/e_remainder-avx512.c b/sysdeps/x86_64/fpu/multiarch/e_remainder-avx512.c
new file mode 100644
index 0000000000..5852a19794
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/e_remainder-avx512.c
@@ -0,0 +1,20 @@
+/* AVX512F version of IEEE 754 remainder.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define __ieee754_remainder __ieee754_remainder_avx512f
+#include <sysdeps/ieee754/dbl-64/e_remainder.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/e_remainder.c b/sysdeps/x86_64/fpu/multiarch/e_remainder.c
new file mode 100644
index 0000000000..e1462438e3
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/e_remainder.c
@@ -0,0 +1,32 @@
+/* Multiple versions of IEEE 754 remainder.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <math.h>
+#include <libm-alias-finite.h>
+
+extern double __redirect_ieee754_remainder (double, double);
+
+#define SYMBOL_NAME ieee754_remainder
+#include "ifunc-avx512f.h"
+
+libc_ifunc_redirected (__redirect_ieee754_remainder,
+		       __ieee754_remainder, IFUNC_SELECTOR ());
+libm_alias_finite (__ieee754_remainder, __remainder)
+
+#define __ieee754_remainder __ieee754_remainder_sse2
+#include <sysdeps/ieee754/dbl-64/e_remainder.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/ifunc-avx512f.h b/sysdeps/x86_64/fpu/multiarch/ifunc-avx512f.h
new file mode 100644
index 0000000000..1f7b1d1ce6
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/ifunc-avx512f.h
@@ -0,0 +1,32 @@
+/* Common definition for ifunc selections optimized with AVX512F.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+#include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512f) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features* cpu_features = __get_cpu_features ();
+
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F))
+    return OPTIMIZE (avx512f);
+
+  return OPTIMIZE (sse2);
+}
diff --git a/sysdeps/x86_64/fpu/multiarch/remainder-round-dbl-64.h b/sysdeps/x86_64/fpu/multiarch/remainder-round-dbl-64.h
new file mode 100644
index 0000000000..74301ee5f0
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/remainder-round-dbl-64.h
@@ -0,0 +1,66 @@
+/* Used by remainder functions.  AVX512F version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifdef __AVX512F__
+# include <x86intrin.h>
+
+typedef union MM_Number
+{
+  __m128d x;
+  __m128i i;
+} MM_Number;
+
+# define AVX512_RN (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)
+# define SET_ENV(RM)
+
+# define DOUBLE_R_TYPE               __m128d
+# define DOUBLE_R_DECL_VAL(v_r, v)   __m128d v_r = _mm_set_sd (v)
+# define MYNUMBER_R_TYPE             MM_Number
+# define MYNUMBER_R_DECL_VAL(v_r, v) \
+  MM_Number v_r = { __extension__ (__m128d) { (v).x, 0.0 } }
+# define MYNUMBER_R_DECL_ZERO(v_r) \
+  MM_Number v_r = { __extension__ (__m128d) { 0.0, 0.0 } }
+
+# define COPY_HIGH_HALF(a, b) \
+  (a).i = _mm_blend_epi32 ((a).i, (b).i, 0x1)
+# define SET_HIGH_HALF(a, b) \
+  (a).i = _mm_insert_epi32 ((a).i, b, 0x1)
+# define GET_HIGH_HALF(a) _mm_extract_epi32 ((a).i, 0x1)
+
+# define ROUND_TO_ZERO(a) _mm_round_pd ((a), AVX512_RN)
+# define ADD_RN(a, b) _mm_add_round_sd ((a), (b), AVX512_RN)
+# define SUB_RN(a, b) _mm_sub_round_sd ((a), (b), AVX512_RN)
+# define MUL_RN(a, b) _mm_mul_round_sd ((a), (b), AVX512_RN)
+# define DIV_RN(a, b) _mm_div_round_sd ((a), (b), AVX512_RN)
+# define FNMADD_RN(a, b, c) _mm_fnmadd_round_sd ((a), (b), (c), AVX512_RN)
+
+/* b is not a vector data type */
+# define IS_NEQ(a, b) (_mm_cvtsd_f64 (a) != b)
+/* b is a vector data type */
+# define IS_LT(a, b) _mm_cmp_sd_mask ((a), (b), _CMP_LT_OS)
+/* b is a vector data type */
+# define IS_GT(a, b) _mm_cmp_sd_mask ((a), (b), _CMP_GT_OS)
+# define TO_FP64(a) _mm_cvtsd_f64 (a)
+# define TO_V(a) _mm_set_sd (a)
+# define ABS(a) \
+  ((__m128d) _mm_and_si128 ((__m128i) (a), \
+			    _mm_cvtsi64_si128 (0x7fffffffffffffffll)))
+
+#else
+# include <sysdeps/ieee754/dbl-64/remainder-round-dbl-64.h>
+#endif
-- 
2.28.0


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2021-10-22  8:07 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-15  2:52 [PATCH] x86-64: Impelment __ieee754_remainder with static rounding Wangyang Guo
2021-10-18 18:00 ` Joseph Myers
2021-10-18 21:55   ` Andrew Waterman
2021-10-22  6:07   ` Guo, Wangyang
2021-10-22  7:15     ` Paul Zimmermann
2021-10-22  8:06       ` Guo, Wangyang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).