public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [AArch64] Emit division using the Newton series
@ 2016-03-17 21:14 Evandro Menezes
  2016-03-23 16:24 ` Evandro Menezes
  2016-03-23 16:25 ` Evandro Menezes
  0 siblings, 2 replies; 16+ messages in thread
From: Evandro Menezes @ 2016-03-17 21:14 UTC (permalink / raw)
  To: GCC Patches; +Cc: James Greenhalgh, Wilco Dijkstra, Andrew Pinski

[-- Attachment #1: Type: text/plain, Size: 899 bytes --]

         Emit division using the Newton series

         2016-03-17  Evandro Menezes  <e.menezes@samsung.com>

         gcc/
             * config/aarch64/aarch64-tuning-flags.def
             (AARCH64_EXTRA_TUNE_APPROX_DIV_{SF,DF}): New tuning macros.
             * config/aarch64/aarch64-protos.h
             (AARCH64_EXTRA_TUNE_APPROX_DIV): New macro.
             (aarch64_emit_approx_div): Declare new function.
             * config/aarch64/aarch64.c
             (aarch64_emit_approx_div): Define new function.
             * config/aarch64/aarch64.md ("div<mode>3"): New expansion.
             * config/aarch64/aarch64-simd.md ("div<mode>3"): Likewise.

This patch implements FP division by an approximation using the Newton 
series.

With this patch, DF division is sped up by over 100%, while SF division
shows no speedup, on both A57 and M1.

Feedback welcome.

Thank you,

-- 
Evandro Menezes


[-- Attachment #2: 0001-Emit-division-using-the-Newton-series.patch --]
[-- Type: text/x-patch, Size: 7998 bytes --]

From 750bd4f64cea8787eb077b7537cc7d8dceafac57 Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Thu, 17 Mar 2016 14:44:55 -0500
Subject: [PATCH] Emit division using the Newton series

2016-03-17  Evandro Menezes  <e.menezes@samsung.com>

gcc/
	* config/aarch64/aarch64-tuning-flags.def
	(AARCH64_EXTRA_TUNE_APPROX_DIV_{SF,DF}): New tuning macros.
	* config/aarch64/aarch64-protos.h
	(AARCH64_EXTRA_TUNE_APPROX_DIV): New macro.
	(aarch64_emit_approx_div): Declare new function.
	* config/aarch64/aarch64.c
	(aarch64_emit_approx_div): Define new function.
	* config/aarch64/aarch64.md ("div<mode>3"): New expansion.
	* config/aarch64/aarch64-simd.md ("div<mode>3"): Likewise.
---
 gcc/config/aarch64/aarch64-protos.h         |  4 ++
 gcc/config/aarch64/aarch64-simd.md          | 26 ++++++++++-
 gcc/config/aarch64/aarch64-tuning-flags.def |  3 +-
 gcc/config/aarch64/aarch64.c                | 67 ++++++++++++++++++++++++++++-
 gcc/config/aarch64/aarch64.md               | 31 +++++++++++--
 5 files changed, 124 insertions(+), 7 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index dced209..847a282 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -263,6 +263,9 @@ enum aarch64_extra_tuning_flags
 };
 #undef AARCH64_EXTRA_TUNING_OPTION
 
+#define AARCH64_EXTRA_TUNE_APPROX_DIV \
+        (AARCH64_EXTRA_TUNE_APPROX_DIV_DF | AARCH64_EXTRA_TUNE_APPROX_DIV_SF)
+
 extern struct tune_params aarch64_tune_params;
 
 HOST_WIDE_INT aarch64_initial_elimination_offset (unsigned, unsigned);
@@ -362,6 +365,7 @@ void aarch64_relayout_simd_types (void);
 void aarch64_reset_previous_fndecl (void);
 void aarch64_save_restore_target_globals (tree);
 void aarch64_emit_approx_rsqrt (rtx, rtx);
+void aarch64_emit_approx_div (rtx, rtx, rtx);
 
 /* Initialize builtins for SIMD intrinsics.  */
 void init_aarch64_simd_builtins (void);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index bd73bce..f1e53be 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1509,7 +1509,31 @@
   [(set_attr "type" "neon_fp_mul_<Vetype><q>")]
 )
 
-(define_insn "div<mode>3"
+(define_expand "div<mode>3"
+ [(set (match_operand:VDQF 0 "register_operand" "=w")
+       (div:VDQF (match_operand:VDQF 1 "register_operand" "w")
+		 (match_operand:VDQF 2 "register_operand" "w")))]
+ "TARGET_SIMD"
+{
+  machine_mode mode = GET_MODE_INNER (GET_MODE (operands[1]));
+  /* Approximate only under relaxed FP flags, for speed, if tuned for MODE.  */
+  if (flag_finite_math_only
+      && !flag_trapping_math
+      && flag_unsafe_math_optimizations
+      && !optimize_function_for_size_p (cfun)
+      && ((mode == SFmode
+           && (aarch64_tune_params.extra_tuning_flags
+               & AARCH64_EXTRA_TUNE_APPROX_DIV_SF))
+          || (mode == DFmode
+              && (aarch64_tune_params.extra_tuning_flags
+                  & AARCH64_EXTRA_TUNE_APPROX_DIV_DF))))
+    {
+      aarch64_emit_approx_div (operands[0], operands[1], operands[2]);
+      DONE;
+    }
+})
+
+(define_insn "*div<mode>3"
  [(set (match_operand:VDQF 0 "register_operand" "=w")
        (div:VDQF (match_operand:VDQF 1 "register_operand" "w")
 		 (match_operand:VDQF 2 "register_operand" "w")))]
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index 7e45a0c..ececdc1 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -30,4 +30,5 @@
 
 AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
 AARCH64_EXTRA_TUNING_OPTION ("approx_rsqrt", APPROX_RSQRT)
-
+AARCH64_EXTRA_TUNING_OPTION ("approx_div", APPROX_DIV_DF)
+AARCH64_EXTRA_TUNING_OPTION ("approx_divf", APPROX_DIV_SF)
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 12e498d..97af0c0 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -538,7 +538,8 @@ static const struct tune_params exynosm1_tunings =
   48,	/* max_case_values.  */
   64,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_APPROX_DIV
+   | AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags.  */
 };
 
 static const struct tune_params thunderx_tunings =
@@ -7540,6 +7541,70 @@ aarch64_emit_approx_rsqrt (rtx dst, rtx src)
   emit_move_insn (dst, x0);
 }
 
+/* Emit insns that set QUO to a Newton-series approximation of NUM / DIV.  */
+
+void
+aarch64_emit_approx_div (rtx quo, rtx num, rtx div)
+{
+  machine_mode mode = GET_MODE (quo);
+  gcc_assert (GET_MODE_INNER (mode) == SFmode
+              || GET_MODE_INNER (mode) == DFmode);
+  /* Copy both operands into fresh registers before using them.  */
+  rtx xnum = gen_reg_rtx (mode);
+  emit_move_insn (xnum, num);
+
+  rtx xdiv = gen_reg_rtx (mode);
+  emit_move_insn (xdiv, div);
+
+  /* Seed XRCP with the FRECPE estimate of the reciprocal of XDIV.  */
+  rtx xrcp = gen_reg_rtx (mode);
+  switch (mode)
+    {
+    case SFmode:
+      emit_insn (gen_aarch64_frecpesf (xrcp, xdiv)); break;
+    case V2SFmode:
+      emit_insn (gen_aarch64_frecpev2sf (xrcp, xdiv)); break;
+    case V4SFmode:
+      emit_insn (gen_aarch64_frecpev4sf (xrcp, xdiv)); break;
+    case DFmode:
+      emit_insn (gen_aarch64_frecpedf (xrcp, xdiv)); break;
+    case V2DFmode:
+      emit_insn (gen_aarch64_frecpev2df (xrcp, xdiv)); break;
+    default:
+      gcc_unreachable ();
+    }
+
+  /* Refine XRCP: two series iterations for SF elements, three for DF.  */
+  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
+  while (iterations--)
+    {
+      rtx xtmp = gen_reg_rtx (mode);
+      /* XTMP gets the FRECPS step for XRCP and XDIV; it then scales XRCP.  */
+      switch (mode)
+        {
+	    case SFmode:
+	      emit_insn (gen_aarch64_frecpssf (xtmp, xrcp, xdiv)); break;
+	    case V2SFmode:
+	      emit_insn (gen_aarch64_frecpsv2sf (xtmp, xrcp, xdiv)); break;
+	    case V4SFmode:
+	      emit_insn (gen_aarch64_frecpsv4sf (xtmp, xrcp, xdiv)); break;
+	    case DFmode:
+	      emit_insn (gen_aarch64_frecpsdf (xtmp, xrcp, xdiv)); break;
+	    case V2DFmode:
+	      emit_insn (gen_aarch64_frecpsv2df (xtmp, xrcp, xdiv)); break;
+        default:
+          gcc_unreachable ();
+        }
+
+      emit_set_insn (xrcp, gen_rtx_MULT (mode, xtmp, xrcp));
+    }
+  /* The quotient is the numerator times the refined reciprocal.  */
+  rtx xquo = gen_reg_rtx (mode);
+  emit_set_insn (xquo, gen_rtx_MULT (mode, xnum, xrcp));
+
+  emit_move_insn (quo, xquo);
+}
+
 /* Return the number of instructions that can be issued per cycle.  */
 static int
 aarch64_sched_issue_rate (void)
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 68676c9..b5d61db 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4647,11 +4647,34 @@
   [(set_attr "type" "fmul<s>")]
 )
 
-(define_insn "div<mode>3"
+(define_expand "div<mode>3"
+ [(set (match_operand:GPF 0 "register_operand" "=w")
+       (div:GPF (match_operand:GPF 1 "register_operand" "w")
+		(match_operand:GPF 2 "register_operand" "w")))]
+ "TARGET_FLOAT"
+{
+  machine_mode mode = GET_MODE_INNER (GET_MODE (operands[1]));
+  /* FDIV needs only TARGET_FLOAT; presumably FRECPE/FRECPS need TARGET_SIMD.  */
+  if (TARGET_SIMD
+      && flag_finite_math_only && !flag_trapping_math
+      && flag_unsafe_math_optimizations
+      && !optimize_function_for_size_p (cfun)
+      && ((mode == SFmode
+           && (aarch64_tune_params.extra_tuning_flags
+               & AARCH64_EXTRA_TUNE_APPROX_DIV_SF))
+          || (mode == DFmode
+              && (aarch64_tune_params.extra_tuning_flags
+                  & AARCH64_EXTRA_TUNE_APPROX_DIV_DF))))
+    {
+      aarch64_emit_approx_div (operands[0], operands[1], operands[2]);
+      DONE;
+    }
+})
+
+(define_insn "*div<mode>3"
   [(set (match_operand:GPF 0 "register_operand" "=w")
-        (div:GPF
-         (match_operand:GPF 1 "register_operand" "w")
-         (match_operand:GPF 2 "register_operand" "w")))]
+        (div:GPF (match_operand:GPF 1 "register_operand" "w")
+	         (match_operand:GPF 2 "register_operand" "w")))]
   "TARGET_FLOAT"
   "fdiv\\t%<s>0, %<s>1, %<s>2"
   [(set_attr "type" "fdiv<s>")]
-- 
1.9.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2016-04-27 15:43 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-03-17 21:14 [AArch64] Emit division using the Newton series Evandro Menezes
2016-03-23 16:24 ` Evandro Menezes
2016-03-23 16:25 ` Evandro Menezes
2016-03-31 22:23   ` Evandro Menezes
2016-04-01 13:58     ` Wilco Dijkstra
2016-04-01 19:47       ` Evandro Menezes
2016-04-01 21:22         ` Wilco Dijkstra
2016-04-01 21:56           ` Evandro Menezes
2016-04-01 22:46             ` Wilco Dijkstra
2016-04-01 22:52               ` Evandro Menezes
2016-04-04 19:06                 ` Evandro Menezes
2016-04-12 18:15                   ` Evandro Menezes
2016-04-21 18:44                     ` Evandro Menezes
2016-04-27 14:16                   ` James Greenhalgh
2016-04-27 14:44                     ` Wilco Dijkstra
2016-04-27 15:43                     ` Evandro Menezes

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).