public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* Emit square root using the Newton series
@ 2016-03-17 22:50 Evandro Menezes
  2016-03-24 20:30 ` [AArch64] " Evandro Menezes
  0 siblings, 1 reply; 38+ messages in thread
From: Evandro Menezes @ 2016-03-17 22:50 UTC (permalink / raw)
  To: GCC Patches
  Cc: James Greenhalgh, Wilco Dijkstra, Andrew Pinski, philipp.tomsich,
	Benedikt Huber

[-- Attachment #1: Type: text/plain, Size: 1123 bytes --]

    2016-03-16  Evandro Menezes <e.menezes@samsung.com>
                 Wilco Dijkstra  <wilco.dijkstra@arm.com>

    gcc/
         * config/aarch64/aarch64-tuning-flags.def
         (AARCH64_EXTRA_TUNE_APPROX_SQRT_{SF,DF}): New tuning macros.
         * config/aarch64/aarch64-protos.h
         (aarch64_emit_approx_rsqrt): Replace with
         "aarch64_emit_approx_sqrt".
         (AARCH64_EXTRA_TUNE_APPROX_SQRT): New macro.
         * config/aarch64/aarch64.c
         (exynosm1_tunings): Use the new macro.
         (aarch64_emit_approx_sqrt): Define new function.
         * config/aarch64/aarch64.md
         (rsqrt<mode>2): Use new function instead.
         (sqrt<mode>2): New expansion and insn definitions.
         * config/aarch64/aarch64-simd.md: Likewise.
         * config/aarch64/aarch64.opt
         (mlow-precision-recip-sqrt): Expand option description.
         * doc/invoke.texi (mlow-precision-recip-sqrt): Likewise.


This patch refactors the function that emits the reciprocal square root
approximation so that it can also emit the square root approximation.

Feedback is welcome.

Thank you,

-- 
Evandro Menezes


[-- Attachment #2: 0001-Emit-square-root-using-the-Newton-series.patch --]
[-- Type: text/x-patch, Size: 11309 bytes --]

From 8d00622b90fa414df605011446ac058efe867cf6 Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Thu, 17 Mar 2016 17:39:55 -0500
Subject: [PATCH] Emit square root using the Newton series

2016-03-17  Evandro Menezes  <e.menezes@samsung.com>
            Wilco Dijkstra  <wilco.dijkstra@arm.com>

gcc/
	* config/aarch64/aarch64-tuning-flags.def
	(AARCH64_EXTRA_TUNE_APPROX_SQRT_{SF,DF}): New tuning macros.
	* config/aarch64/aarch64-protos.h
	(aarch64_emit_approx_rsqrt): Replace with "aarch64_emit_approx_sqrt".
	(AARCH64_EXTRA_TUNE_APPROX_SQRT): New macro.
	* config/aarch64/aarch64.c
	(exynosm1_tunings): Use the new macro.
	(aarch64_emit_approx_sqrt): Define new function.
	* config/aarch64/aarch64.md
	(rsqrt<mode>2): Use new function instead.
	(sqrt<mode>2): New expansion and insn definitions.
	* config/aarch64/aarch64-simd.md: Likewise.
	* config/aarch64/aarch64.opt
	(mlow-precision-recip-sqrt): Expand option description.
	* doc/invoke.texi (mlow-precision-recip-sqrt): Likewise.
---
 gcc/config/aarch64/aarch64-protos.h         |  5 +-
 gcc/config/aarch64/aarch64-simd.md          | 27 +++++++-
 gcc/config/aarch64/aarch64-tuning-flags.def |  3 +-
 gcc/config/aarch64/aarch64.c                | 97 +++++++++++++++++++++++------
 gcc/config/aarch64/aarch64.md               | 25 +++++++-
 gcc/config/aarch64/aarch64.opt              |  4 +-
 gcc/doc/invoke.texi                         |  9 +--
 7 files changed, 139 insertions(+), 31 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index dced209..3f3ae1c 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -263,6 +263,9 @@ enum aarch64_extra_tuning_flags
 };
 #undef AARCH64_EXTRA_TUNING_OPTION
 
+#define AARCH64_EXTRA_TUNE_APPROX_SQRT \
+  (AARCH64_EXTRA_TUNE_APPROX_SQRT_DF | AARCH64_EXTRA_TUNE_APPROX_SQRT_SF)
+
 extern struct tune_params aarch64_tune_params;
 
 HOST_WIDE_INT aarch64_initial_elimination_offset (unsigned, unsigned);
@@ -361,7 +364,7 @@ void aarch64_register_pragmas (void);
 void aarch64_relayout_simd_types (void);
 void aarch64_reset_previous_fndecl (void);
 void aarch64_save_restore_target_globals (tree);
-void aarch64_emit_approx_rsqrt (rtx, rtx);
+void aarch64_emit_approx_sqrt (rtx, rtx, bool);
 
 /* Initialize builtins for SIMD intrinsics.  */
 void init_aarch64_simd_builtins (void);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index bd73bce..31191bb 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -405,7 +405,7 @@
 		     UNSPEC_RSQRT))]
   "TARGET_SIMD"
 {
-  aarch64_emit_approx_rsqrt (operands[0], operands[1]);
+  aarch64_emit_approx_sqrt (operands[0], operands[1], true);
   DONE;
 })
 
@@ -4307,7 +4307,30 @@
 
 ;; sqrt
 
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+  [(set (match_operand:VDQF 0 "register_operand")
+	(sqrt:VDQF (match_operand:VDQF 1 "register_operand")))]
+  "TARGET_SIMD"
+{
+  machine_mode mode = GET_MODE_INNER (GET_MODE (operands[1]));
+
+  if (flag_finite_math_only
+      && !flag_trapping_math
+      && flag_unsafe_math_optimizations
+      && !optimize_function_for_size_p (cfun)
+      && ((mode == SFmode
+           && (aarch64_tune_params.extra_tuning_flags
+               & AARCH64_EXTRA_TUNE_APPROX_SQRT_SF))
+          || (mode == DFmode
+              && (aarch64_tune_params.extra_tuning_flags
+                  & AARCH64_EXTRA_TUNE_APPROX_SQRT_DF))))
+    {
+      aarch64_emit_approx_sqrt (operands[0], operands[1], false);
+      DONE;
+    }
+})
+
+(define_insn "*sqrt<mode>2"
   [(set (match_operand:VDQF 0 "register_operand" "=w")
         (sqrt:VDQF (match_operand:VDQF 1 "register_operand" "w")))]
   "TARGET_SIMD"
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index 7e45a0c..725a79c 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -30,4 +30,5 @@
 
 AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
 AARCH64_EXTRA_TUNING_OPTION ("approx_rsqrt", APPROX_RSQRT)
-
+AARCH64_EXTRA_TUNING_OPTION ("approx_sqrt", APPROX_SQRT_DF)
+AARCH64_EXTRA_TUNING_OPTION ("approx_sqrtf", APPROX_SQRT_SF)
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index ed0daa5..04f5633 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -38,6 +38,7 @@
 #include "recog.h"
 #include "diagnostic.h"
 #include "insn-attr.h"
+#include "insn-flags.h"
 #include "alias.h"
 #include "fold-const.h"
 #include "stor-layout.h"
@@ -7498,46 +7499,102 @@ get_rsqrts_type (machine_mode mode)
   }
 }
 
-/* Emit instruction sequence to compute the reciprocal square root using the
-   Newton-Raphson series.  Iterate over the series twice for SF
-   and thrice for DF.  */
+/* Emit instruction sequence to compute either the approximate square root
+   or its approximate reciprocal.  */
 
 void
-aarch64_emit_approx_rsqrt (rtx dst, rtx src)
+aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
 {
   machine_mode mode = GET_MODE (src);
-  gcc_assert (
-    mode == SFmode || mode == V2SFmode || mode == V4SFmode
-	|| mode == DFmode || mode == V2DFmode);
+  machine_mode mmsk;
+
+  gcc_assert (GET_MODE_INNER (mode) == SFmode
+              || GET_MODE_INNER (mode) == DFmode);
 
   rtx xsrc = gen_reg_rtx (mode);
   emit_move_insn (xsrc, src);
-  rtx x0 = gen_reg_rtx (mode);
 
-  emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
+  rtx xcc, xne, xmsk;
+  bool scalar = !VECTOR_MODE_P (mode);
+  if (!recp)
+    {
+      if (scalar)
+	{
+	  /* Compare argument with 0.0 and set the CC.  */
+	  xcc = aarch64_gen_compare_reg (NE, xsrc, CONST0_RTX (mode));
+	  xne = gen_rtx_NE (VOIDmode, xcc, const0_rtx);
+	}
+      else
+	{
+	  /* Compare the argument with 0.0 and create a vector mask.  */
+	  mmsk = mode_for_vector (int_mode_for_mode (GET_MODE_INNER (mode)),
+				  GET_MODE_NUNITS (mode));
+	  xmsk = gen_reg_rtx (mmsk);
+	  switch (mode)
+	  {
+	    case V2SFmode:
+	      emit_insn (gen_aarch64_cmeqv2sf (xmsk, xsrc, CONST0_RTX (mode)));
+	      break;
 
-  bool double_mode = (mode == DFmode || mode == V2DFmode);
+	    case V4SFmode:
+	      emit_insn (gen_aarch64_cmeqv4sf (xmsk, xsrc, CONST0_RTX (mode)));
+	      break;
 
-  int iterations = double_mode ? 3 : 2;
+	    case V2DFmode:
+	      emit_insn (gen_aarch64_cmeqv2df (xmsk, xsrc, CONST0_RTX (mode)));
+	      break;
 
-  /* Optionally iterate over the series one less time than otherwise.  */
+	    default:
+	      gcc_unreachable ();
+	  }
+	}
+    }
+
+  /* Estimate the approximate reciprocal square root.  */
+  rtx xdst = gen_reg_rtx (mode);
+  emit_insn ((*get_rsqrte_type (mode)) (xdst, xsrc));
+
+  /* Iterate over the series twice for SF and thrice for DF.  */
+  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
+
+  /* Optionally iterate over the series once less for faster performance
+     while sacrificing the accuracy.  */
   if (flag_mrecip_low_precision_sqrt)
     iterations--;
 
-  for (int i = 0; i < iterations; ++i)
+  /* Iterate over the series.  */
+  while (iterations--)
     {
-      rtx x1 = gen_reg_rtx (mode);
       rtx x2 = gen_reg_rtx (mode);
-      rtx x3 = gen_reg_rtx (mode);
-      emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
+      emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
+
+      rtx x1 = gen_reg_rtx (mode);
+      emit_insn ((*get_rsqrts_type (mode)) (x1, xsrc, x2));
 
-      emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
+      emit_set_insn (xdst, gen_rtx_MULT (mode, x1, xdst));
+    }
+
+  if (!recp)
+    {
+      /* Qualify the final estimate for the approximate reciprocal square root
+	 when the argument is 0.0.  */
+      if (scalar)
+	/* Conditionally set the final estimate to 0.0.  */
+	emit_set_insn (xdst, gen_rtx_IF_THEN_ELSE (mode, xne, xdst, xsrc));
+      else
+	{
+	  /* Mask off any final vector element estimate to 0.0.  */
+	  rtx xtmp = gen_reg_rtx (mmsk);
+	  emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
+					    gen_rtx_SUBREG (mmsk, xdst, 0)));
+	  emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
+	}
 
-      emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
-      x0 = x1;
+      /* Calculate the approximate square root.  */
+      emit_set_insn (xdst, gen_rtx_MULT (mode, xsrc, xdst));
     }
 
-  emit_move_insn (dst, x0);
+  emit_move_insn (dst, xdst);
 }
 
 /* Return the number of instructions that can be issued per cycle.  */
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 68676c9..71725e7 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4665,7 +4665,30 @@
   [(set_attr "type" "ffarith<s>")]
 )
 
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+  [(set (match_operand:GPF 0 "register_operand")
+        (sqrt:GPF (match_operand:GPF 1 "register_operand")))]
+  "TARGET_SIMD"
+{
+  machine_mode mode = GET_MODE_INNER (GET_MODE (operands[1]));
+
+  if (flag_finite_math_only
+      && !flag_trapping_math
+      && flag_unsafe_math_optimizations
+      && !optimize_function_for_size_p (cfun)
+      && ((mode == SFmode
+           && (aarch64_tune_params.extra_tuning_flags
+               & AARCH64_EXTRA_TUNE_APPROX_SQRT_SF))
+          || (mode == DFmode
+              && (aarch64_tune_params.extra_tuning_flags
+                  & AARCH64_EXTRA_TUNE_APPROX_SQRT_DF))))
+    {
+      aarch64_emit_approx_sqrt (operands[0], operands[1], false);
+      DONE;
+    }
+})
+
+(define_insn "*sqrt<mode>2"
   [(set (match_operand:GPF 0 "register_operand" "=w")
         (sqrt:GPF (match_operand:GPF 1 "register_operand" "w")))]
   "TARGET_FLOAT"
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index c637ff4..c5e7fc9 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -151,5 +151,5 @@ PC relative literal loads.
 
 mlow-precision-recip-sqrt
 Common Var(flag_mrecip_low_precision_sqrt) Optimization
-When calculating the reciprocal square root approximation,
-uses one less step than otherwise, thus reducing latency and precision.
+When calculating the approximate square root or its approximate reciprocal,
+use one less step than otherwise, thus reducing latency and precision.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 99ac11b..d48c29b 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -12903,10 +12903,11 @@ corresponding flag to the linker.
 @item -mno-low-precision-recip-sqrt
 @opindex -mlow-precision-recip-sqrt
 @opindex -mno-low-precision-recip-sqrt
-When calculating the reciprocal square root approximation,
-uses one less step than otherwise, thus reducing latency and precision.
-This is only relevant if @option{-ffast-math} enables the reciprocal square root
-approximation, which in turn depends on the target processor.
+When calculating the approximate square root or its approximate reciprocal,
+use one less step than otherwise, thus reducing latency and precision.
+This is only relevant if @option{-ffast-math} enables
+the approximate square root or its approximate reciprocal,
+which in turn depends on the target processor.
 
 @item -march=@var{name}
 @opindex march
-- 
1.9.1


^ permalink raw reply	[flat|nested] 38+ messages in thread
[parent not found: <AM3PR08MB00886499882773F3C8B9F71D83B30@AM3PR08MB0088.eurprd08.prod.outlook.com>]
* [AArch64] Emit square root using the Newton series
@ 2015-12-08 21:35 Evandro Menezes
  2015-12-09 14:05 ` Marcus Shawcroft
                   ` (2 more replies)
  0 siblings, 3 replies; 38+ messages in thread
From: Evandro Menezes @ 2015-12-08 21:35 UTC (permalink / raw)
  To: GCC Patches, Marcus Shawcroft, James Greenhalgh, Andrew Pinski,
	Benedikt Huber, philipp.tomsich

[-- Attachment #1: Type: text/plain, Size: 1041 bytes --]

    Emit square root using the Newton series

    2015-12-03  Evandro Menezes  <e.menezes@samsung.com>

    gcc/
             * config/aarch64/aarch64-protos.h (aarch64_emit_swsqrt):
             Declare new function.
             * config/aarch64/aarch64-simd.md (sqrt<mode>2): New
             expansion and insn definitions.
             * config/aarch64/aarch64-tuning-flags.def
             (AARCH64_EXTRA_TUNE_FAST_SQRT): New tuning macro.
             * config/aarch64/aarch64.c (aarch64_emit_swsqrt): Define
             new function.
             * config/aarch64/aarch64.md (sqrt<mode>2): New expansion
             and insn definitions.
             * config/aarch64/aarch64.opt (mlow-precision-recip-sqrt):
             Expand option description.
             * doc/invoke.texi (mlow-precision-recip-sqrt): Likewise.

This patch extends the earlier patch, which added support for computing
x^(-1/2) using the Newton series, by adding support for x^(1/2) as well.

Is it OK at this point of stage 3?

Thank you,

-- 
Evandro Menezes


[-- Attachment #2: 0001-Emit-square-root-using-the-Newton-series.patch --]
[-- Type: text/x-patch, Size: 7350 bytes --]

From f173dace7b4137f8868a1a6ef9cdbbeefa92ffde Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Thu, 3 Dec 2015 15:25:07 -0600
Subject: [PATCH] Emit square root using the Newton series

2015-12-03  Evandro Menezes  <e.menezes@samsung.com>

gcc/
	* config/aarch64/aarch64-protos.h (aarch64_emit_swsqrt): Declare new
	function.
	* config/aarch64/aarch64-simd.md (sqrt<mode>2): New expansion and
	insn definitions.
	* config/aarch64/aarch64-tuning-flags.def
	(AARCH64_EXTRA_TUNE_FAST_SQRT): New tuning macro.
	* config/aarch64/aarch64.c (aarch64_emit_swsqrt): Define new function.
	* config/aarch64/aarch64.md (sqrt<mode>2): New expansion and insn
	definitions.
	* config/aarch64/aarch64.opt (mlow-precision-recip-sqrt): Expand option
	description.
	* doc/invoke.texi (mlow-precision-recip-sqrt): Likewise.
---
 gcc/config/aarch64/aarch64-protos.h         |  1 +
 gcc/config/aarch64/aarch64-simd.md          | 18 +++++++++++++++++-
 gcc/config/aarch64/aarch64-tuning-flags.def |  2 +-
 gcc/config/aarch64/aarch64.c                | 25 +++++++++++++++++++++++--
 gcc/config/aarch64/aarch64.md               | 18 +++++++++++++++++-
 gcc/config/aarch64/aarch64.opt              |  2 +-
 gcc/doc/invoke.texi                         | 13 ++++++-------
 7 files changed, 66 insertions(+), 13 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 1e0fb4e..7fe6074 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -356,6 +356,7 @@ void aarch64_relayout_simd_types (void);
 void aarch64_reset_previous_fndecl (void);
 
 void aarch64_emit_swrsqrt (rtx, rtx);
+void aarch64_emit_swsqrt (rtx, rtx);
 
 /* Initialize builtins for SIMD intrinsics.  */
 void init_aarch64_simd_builtins (void);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 030a101..f6d2da4 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4280,7 +4280,23 @@
 
 ;; sqrt
 
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+  [(set (match_operand:VDQF 0 "register_operand")
+	(sqrt:VDQF (match_operand:VDQF 1 "register_operand")))]
+  "TARGET_SIMD"
+{
+  if ((AARCH64_EXTRA_TUNE_FAST_SQRT & aarch64_tune_params.extra_tuning_flags)
+      && !optimize_function_for_size_p (cfun)
+      && flag_finite_math_only
+      && !flag_trapping_math
+      && flag_unsafe_math_optimizations)
+    {
+      aarch64_emit_swsqrt (operands[0], operands[1]);
+      DONE;
+    }
+})
+
+(define_insn "*sqrt<mode>2"
   [(set (match_operand:VDQF 0 "register_operand" "=w")
         (sqrt:VDQF (match_operand:VDQF 1 "register_operand" "w")))]
   "TARGET_SIMD"
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index 6f7dbce..11c6c9a 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -30,4 +30,4 @@
 
 AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
 AARCH64_EXTRA_TUNING_OPTION ("recip_sqrt", RECIP_SQRT)
-
+AARCH64_EXTRA_TUNING_OPTION ("fast_sqrt", FAST_SQRT)
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index ae4cfb3..3b58c35 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -533,8 +533,9 @@ static const struct tune_params exynosm1_tunings =
   2,	/* min_div_recip_mul_df.  */
   48,	/* max_case_values.  */
   64,	/* cache_line_size.  */
-  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
+  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_RECIP_SQRT
+   | AARCH64_EXTRA_TUNE_FAST_SQRT)	/* tune_flags.  */
 };
 
 static const struct tune_params thunderx_tunings =
@@ -7515,6 +7516,26 @@ aarch64_emit_swrsqrt (rtx dst, rtx src)
   emit_move_insn (dst, x0);
 }
 
+/* Emit instruction sequence to compute the approximate square root.  */
+
+void
+aarch64_emit_swsqrt (rtx dst, rtx src)
+{
+  machine_mode mode = GET_MODE (src);
+  gcc_assert (mode == SFmode || mode == V2SFmode || mode == V4SFmode
+	      || mode == DFmode || mode == V2DFmode);
+
+  rtx xsrc = gen_reg_rtx (mode);
+  emit_move_insn (xsrc, src);
+
+  rtx xdst = gen_reg_rtx (mode);
+
+  /* Calculate the approximate square root by multiplying the original operand
+     by its approximate reciprocal square root.  */
+  aarch64_emit_swrsqrt (xdst, xsrc);
+  emit_set_insn (dst, gen_rtx_MULT (mode, xdst, src));
+}
+
 /* Return the number of instructions that can be issued per cycle.  */
 static int
 aarch64_sched_issue_rate (void)
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index d9fe1ae..d5930b9 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4534,7 +4534,23 @@
   [(set_attr "type" "ffarith<s>")]
 )
 
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+  [(set (match_operand:GPF 0 "register_operand")
+	(sqrt:GPF (match_operand:GPF 1 "register_operand")))]
+  "TARGET_SIMD"
+{
+  if ((AARCH64_EXTRA_TUNE_FAST_SQRT & aarch64_tune_params.extra_tuning_flags)
+      && !optimize_function_for_size_p (cfun)
+      && flag_finite_math_only
+      && !flag_trapping_math
+      && flag_unsafe_math_optimizations)
+    {
+      aarch64_emit_swsqrt (operands[0], operands[1]);
+      DONE;
+    }
+})
+
+(define_insn "*sqrt<mode>2"
   [(set (match_operand:GPF 0 "register_operand" "=w")
         (sqrt:GPF (match_operand:GPF 1 "register_operand" "w")))]
   "TARGET_FLOAT"
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index a0fbfd42..d02c5e8 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -151,5 +151,5 @@ PC relative literal loads.
 
 mlow-precision-recip-sqrt
 Common Var(flag_mrecip_low_precision_sqrt) Optimization
-When calculating a sqrt approximation, run fewer steps.
+Calculate the square-root or its reciprocal approximation in fewer steps.
 This reduces precision, but can result in faster computation.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 5ab565c..f4a47a6 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -6141,7 +6141,7 @@ is usable even in freestanding environments.
 @opindex fsanitize-coverage=trace-pc
 Enable coverage-guided fuzzing code instrumentation.
 Inserts call to __sanitizer_cov_trace_pc into every basic block.
-
+-
 @item -fcheck-pointer-bounds
 @opindex fcheck-pointer-bounds
 @opindex fno-check-pointer-bounds
@@ -12561,12 +12561,11 @@ corresponding flag to the linker.
 @item -mno-low-precision-recip-sqrt
 @opindex -mlow-precision-recip-sqrt
 @opindex -mno-low-precision-recip-sqrt
-The square root estimate uses two steps instead of three for double-precision,
-and one step instead of two for single-precision.
-Thus reducing latency and precision.
-This is only relevant if @option{-ffast-math} activates
-reciprocal square root estimate instructions.
-Which in turn depends on the target processor.
+The square root and its reciprocal approximation use one step less than
+otherwise, thus reducing latency and precision.
+This is only relevant if @option{-ffast-math} enables
+the square root or its reciprocal approximation,
+which in turn depends on the target processor.
 
 @item -march=@var{name}
 @opindex march
-- 
1.9.1


^ permalink raw reply	[flat|nested] 38+ messages in thread

end of thread, other threads:[~2016-04-27 15:45 UTC | newest]

Thread overview: 38+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-03-17 22:50 Emit square root using the Newton series Evandro Menezes
2016-03-24 20:30 ` [AArch64] " Evandro Menezes
2016-04-01 22:45   ` Evandro Menezes
2016-04-04 16:32     ` Evandro Menezes
     [not found]       ` <DB3PR08MB008902F0F0AFA3B1F1C91511839E0@DB3PR08MB0089.eurprd08.prod.outlook.com>
2016-04-05 22:30         ` Evandro Menezes
2016-04-12 18:15           ` Evandro Menezes
2016-04-21 18:44             ` Evandro Menezes
2016-04-27 14:24             ` James Greenhalgh
2016-04-27 15:45               ` Evandro Menezes
     [not found] <AM3PR08MB00886499882773F3C8B9F71D83B30@AM3PR08MB0088.eurprd08.prod.outlook.com>
     [not found] ` <011d01d17a26$31b3ade0$951b09a0$@samsung.com>
2016-03-10 16:52   ` Wilco Dijkstra
2016-03-10 16:58     ` Evandro Menezes
2016-03-10 19:10       ` Wilco Dijkstra
2016-03-10 22:15         ` Evandro Menezes
2016-03-11  1:06           ` Wilco Dijkstra
2016-03-14 16:39             ` Evandro Menezes
2016-03-14 19:13               ` Wilco Dijkstra
2016-03-16 21:44             ` Evandro Menezes
  -- strict thread matches above, loose matches on Subject: below --
2015-12-08 21:35 Evandro Menezes
2015-12-09 14:05 ` Marcus Shawcroft
2015-12-09 16:31   ` Evandro Menezes
2015-12-09 16:52 ` Kyrill Tkachov
2015-12-09 16:59   ` Evandro Menezes
2015-12-09 17:03     ` Kyrill Tkachov
2015-12-09 17:16       ` Kyrill Tkachov
2015-12-09 18:50         ` Evandro Menezes
2015-12-10 10:30           ` Kyrill Tkachov
2016-02-23  0:50             ` Evandro Menezes
2016-02-26 15:00               ` James Greenhalgh
2016-02-26 23:42                 ` Evandro Menezes
2016-02-26 23:46                   ` Evandro Menezes
2016-02-16 20:56 ` Evandro Menezes
2016-03-04  0:22   ` Evandro Menezes
2016-03-08 22:08     ` Evandro Menezes
2016-03-08 22:18       ` Evandro Menezes
2016-03-08 22:20         ` Evandro Menezes
2016-03-16 19:45       ` Evandro Menezes
2016-03-17 14:55         ` James Greenhalgh
2016-03-17 16:25           ` Evandro Menezes

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).