From cbc2b62f7df5c3e2fef2a24157b1bdd1a6de191b Mon Sep 17 00:00:00 2001
From: Evandro Menezes
Date: Mon, 4 Apr 2016 14:02:24 -0500
Subject: [PATCH 3/3] Emit division using the Newton series

2016-04-04  Evandro Menezes
	    Wilco Dijkstra

gcc/
	* config/aarch64/aarch64-protos.h (tune_params): Add new member
	"approx_div_modes".
	(aarch64_emit_approx_div): Declare new function.
	* config/aarch64/aarch64.c (generic_tunings): New member
	"approx_div_modes".
	(cortexa35_tunings): Likewise.
	(cortexa53_tunings): Likewise.
	(cortexa57_tunings): Likewise.
	(cortexa72_tunings): Likewise.
	(exynosm1_tunings): Likewise.
	(thunderx_tunings): Likewise.
	(xgene1_tunings): Likewise.
	(aarch64_emit_approx_div): Define new function.
	* config/aarch64/aarch64.md ("div<mode>3"): New expansion.
	* config/aarch64/aarch64-simd.md ("div<mode>3"): Likewise.
	* config/aarch64/aarch64.opt (-mlow-precision-div): Add new option.
	* doc/invoke.texi (-mlow-precision-div): Describe new option.
---
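Notes (not part of the commit message): the expansion added below follows the
usual Newton series for the reciprocal, x' = x * (2 - d * x), where FRECPE
supplies the initial estimate and FRECPS computes the correction factor
2 - d * x.  As a rough C model of the sequence that aarch64_emit_approx_div
below arranges -- an illustrative sketch only, with a hypothetical estimate()
standing in for FRECPE, not the generated code -- consider:

/* Illustrative sketch of the FRECPE/FRECPS/FMUL sequence emitted by
   aarch64_emit_approx_div.  */
#include <math.h>
#include <stdio.h>

/* Hypothetical stand-in for FRECPE: 1/d rounded to roughly 8 significant
   bits, so that the refinement steps below have real work to do.  */
static double
estimate (double d)
{
  int e;
  double m = frexp (1.0 / d, &e);               /* 1/d == m * 2^e */
  return ldexp (nearbyint (ldexp (m, 8)), e - 8);
}

/* ITERATIONS is 2 for SFmode and 3 for DFmode, one less with
   -mlow-precision-div.  */
static double
newton_div (double n, double d, int iterations)
{
  double x = estimate (d);                      /* x0 ~= 1/d         (FRECPE) */
  double t = 0.0;

  while (iterations--)
    {
      t = 2.0 - d * x;                          /* correction        (FRECPS) */
      if (iterations > 0)
        x = x * t;                              /* x' = x * t        (FMUL) */
    }

  x = x * n;                                    /* n/d ~= n * 1/d    (FMUL) */
  return x * t;                                 /* final refinement  (FMUL) */
}

int
main (void)
{
  printf ("%.17g\n", newton_div (1.0, 3.0, 3)); /* ~0.33333333333333331 */
  return 0;
}

Each step roughly doubles the number of correct bits in the estimate, which is
why the series is run twice for SFmode and thrice for DFmode, and why
-mlow-precision-div drops one step at the cost of accuracy.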
 gcc/config/aarch64/aarch64-protos.h |  2 +
 gcc/config/aarch64/aarch64-simd.md  | 14 +++++-
 gcc/config/aarch64/aarch64.c        | 85 +++++++++++++++++++++++++++++++++++++
 gcc/config/aarch64/aarch64.md       | 19 +++++++--
 gcc/config/aarch64/aarch64.opt      |  5 +++
 gcc/doc/invoke.texi                 | 10 +++++
 6 files changed, 130 insertions(+), 5 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 85ad796..649faf7 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -244,6 +244,7 @@ struct tune_params
   } autoprefetcher_model;
 
   unsigned int extra_tuning_flags;
+  unsigned int approx_div_modes;
   unsigned int approx_sqrt_modes;
   unsigned int approx_rsqrt_modes;
 };
@@ -390,6 +391,7 @@ void aarch64_relayout_simd_types (void);
 void aarch64_reset_previous_fndecl (void);
 void aarch64_save_restore_target_globals (tree);
 bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
+bool aarch64_emit_approx_div (rtx, rtx, rtx);
 
 /* Initialize builtins for SIMD intrinsics.  */
 void init_aarch64_simd_builtins (void);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 47ccb18..7e99e16 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1509,7 +1509,19 @@
   [(set_attr "type" "neon_fp_mul_<Vetype><q>")]
 )
 
-(define_insn "div<mode>3"
+(define_expand "div<mode>3"
+ [(set (match_operand:VDQF 0 "register_operand")
+       (div:VDQF (match_operand:VDQF 1 "general_operand")
+		 (match_operand:VDQF 2 "register_operand")))]
+ "TARGET_SIMD"
+{
+  if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
+    DONE;
+
+  operands[1] = force_reg (<MODE>mode, operands[1]);
+})
+
+(define_insn "*div<mode>3"
  [(set (match_operand:VDQF 0 "register_operand" "=w")
        (div:VDQF (match_operand:VDQF 1 "register_operand" "w")
 		 (match_operand:VDQF 2 "register_operand" "w")))]
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 4af2175..74310e8 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -417,6 +417,7 @@ static const struct tune_params generic_tunings =
   0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  (AARCH64_APPROX_NONE),	/* approx_div_modes.  */
   (AARCH64_APPROX_NONE),	/* approx_sqrt_modes.  */
   (AARCH64_APPROX_NONE)	/* approx_rsqrt_modes.  */
 };
@@ -444,6 +445,7 @@ static const struct tune_params cortexa35_tunings =
   0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  (AARCH64_APPROX_NONE),	/* approx_div_modes.  */
   (AARCH64_APPROX_NONE),	/* approx_sqrt_modes.  */
   (AARCH64_APPROX_NONE)	/* approx_rsqrt_modes.  */
 };
@@ -471,6 +473,7 @@ static const struct tune_params cortexa53_tunings =
   0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  (AARCH64_APPROX_NONE),	/* approx_div_modes.  */
   (AARCH64_APPROX_NONE),	/* approx_sqrt_modes.  */
   (AARCH64_APPROX_NONE)	/* approx_rsqrt_modes.  */
 };
@@ -498,6 +501,7 @@ static const struct tune_params cortexa57_tunings =
   0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS),	/* tune_flags.  */
+  (AARCH64_APPROX_NONE),	/* approx_div_modes.  */
   (AARCH64_APPROX_NONE),	/* approx_sqrt_modes.  */
   (AARCH64_APPROX_NONE)	/* approx_rsqrt_modes.  */
 };
@@ -525,6 +529,7 @@ static const struct tune_params cortexa72_tunings =
   0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  (AARCH64_APPROX_NONE),	/* approx_div_modes.  */
   (AARCH64_APPROX_NONE),	/* approx_sqrt_modes.  */
   (AARCH64_APPROX_NONE)	/* approx_rsqrt_modes.  */
 };
@@ -551,6 +556,7 @@ static const struct tune_params exynosm1_tunings =
   64,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  (AARCH64_APPROX_NONE),	/* approx_div_modes.  */
   (AARCH64_APPROX_ALL),	/* approx_sqrt_modes.  */
   (AARCH64_APPROX_ALL)	/* approx_rsqrt_modes.  */
 };
@@ -577,6 +583,7 @@ static const struct tune_params thunderx_tunings =
   0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  (AARCH64_APPROX_NONE),	/* approx_div_modes.  */
   (AARCH64_APPROX_NONE),	/* approx_sqrt_modes.  */
   (AARCH64_APPROX_NONE)	/* approx_rsqrt_modes.  */
 };
@@ -603,6 +610,7 @@ static const struct tune_params xgene1_tunings =
   0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  (AARCH64_APPROX_NONE),	/* approx_div_modes.  */
   (AARCH64_APPROX_NONE),	/* approx_sqrt_modes.  */
   (AARCH64_APPROX_ALL)	/* approx_rsqrt_modes.  */
 };
@@ -7604,6 +7612,83 @@ aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
   return true;
 }
 
+/* Emit the instruction sequence to compute the approximation for a division.  */
+
+bool
+aarch64_emit_approx_div (rtx quo, rtx num, rtx div)
+{
+  machine_mode mode = GET_MODE (quo);
+
+  if (!flag_finite_math_only
+      || flag_trapping_math
+      || !flag_unsafe_math_optimizations
+      || optimize_function_for_size_p (cfun)
+      || !(flag_mlow_precision_div
+	   || (aarch64_tune_params.approx_div_modes & AARCH64_APPROX_MODE (mode))))
+    return false;
+
+  /* Estimate the approximate reciprocal.  */
+  rtx xrcp = gen_reg_rtx (mode);
+  switch (mode)
+    {
+    case SFmode:
+      emit_insn (gen_aarch64_frecpesf (xrcp, div)); break;
+    case V2SFmode:
+      emit_insn (gen_aarch64_frecpev2sf (xrcp, div)); break;
+    case V4SFmode:
+      emit_insn (gen_aarch64_frecpev4sf (xrcp, div)); break;
+    case DFmode:
+      emit_insn (gen_aarch64_frecpedf (xrcp, div)); break;
+    case V2DFmode:
+      emit_insn (gen_aarch64_frecpev2df (xrcp, div)); break;
+    default:
+      gcc_unreachable ();
+    }
+
+  /* Iterate over the series twice for SF and thrice for DF.  */
+  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
+
+  /* Optionally iterate over the series once less for faster performance,
+     while sacrificing the accuracy.  */
+  if (flag_mlow_precision_div)
+    iterations--;
+
+  /* Iterate over the series to calculate the approximate reciprocal.  */
+  rtx xtmp = gen_reg_rtx (mode);
+  while (iterations--)
+    {
+      switch (mode)
+	{
+	case SFmode:
+	  emit_insn (gen_aarch64_frecpssf (xtmp, xrcp, div)); break;
+	case V2SFmode:
+	  emit_insn (gen_aarch64_frecpsv2sf (xtmp, xrcp, div)); break;
+	case V4SFmode:
+	  emit_insn (gen_aarch64_frecpsv4sf (xtmp, xrcp, div)); break;
+	case DFmode:
+	  emit_insn (gen_aarch64_frecpsdf (xtmp, xrcp, div)); break;
+	case V2DFmode:
+	  emit_insn (gen_aarch64_frecpsv2df (xtmp, xrcp, div)); break;
+	default:
+	  gcc_unreachable ();
+	}
+
+      if (iterations > 0)
+	emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
+    }
+
+  if (num != CONST1_RTX (mode))
+    {
+      /* Calculate the approximate division.  */
+      rtx xnum = force_reg (mode, num);
+      emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
+    }
+
+  /* Return the approximation.  */
+  emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
+  return true;
+}
+
 /* Return the number of instructions that can be issued per cycle.  */
 static int
 aarch64_sched_issue_rate (void)
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 43fa318..b42ce1a 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4647,11 +4647,22 @@
   [(set_attr "type" "fmul<s>")]
 )
 
-(define_insn "div<mode>3"
+(define_expand "div<mode>3"
+  [(set (match_operand:GPF 0 "register_operand")
+        (div:GPF (match_operand:GPF 1 "general_operand")
+                 (match_operand:GPF 2 "register_operand")))]
+  "TARGET_SIMD"
+{
+  if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
+    DONE;
+
+  operands[1] = force_reg (<MODE>mode, operands[1]);
+})
+
+(define_insn "*div<mode>3"
   [(set (match_operand:GPF 0 "register_operand" "=w")
-        (div:GPF
-         (match_operand:GPF 1 "register_operand" "w")
-         (match_operand:GPF 2 "register_operand" "w")))]
+        (div:GPF (match_operand:GPF 1 "register_operand" "w")
+                 (match_operand:GPF 2 "register_operand" "w")))]
   "TARGET_FLOAT"
   "fdiv\\t%<s>0, %<s>1, %<s>2"
   [(set_attr "type" "fdiv<s>")]
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index ffd5540..760bd50 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -158,3 +158,8 @@ mlow-precision-sqrt
 Common Var(flag_mlow_precision_sqrt) Optimization
 When calculating the approximate square root,
 use one less step than otherwise, thus reducing latency and precision.
+
+mlow-precision-div
+Common Var(flag_mlow_precision_div) Optimization
+When calculating the approximate division,
+use one less step than otherwise, thus reducing latency and precision.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 01c3e87..8d33997 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -574,6 +574,7 @@ Objective-C and Objective-C++ Dialects}.
 -mfix-cortex-a53-843419 -mno-fix-cortex-a53-843419 @gol
 -mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol
 -mlow-precision-sqrt -mno-low-precision-sqrt@gol
+-mlow-precision-div -mno-low-precision-div @gol
 -march=@var{name} -mcpu=@var{name} -mtune=@var{name}}
 
 @emph{Adapteva Epiphany Options}
@@ -12931,6 +12932,15 @@
 uses one less step than otherwise, thus reducing latency and precision.
 This is only relevant if @option{-ffast-math} enables the square root
 approximation.
+@item -mlow-precision-div
+@item -mno-low-precision-div
+@opindex -mlow-precision-div
+@opindex -mno-low-precision-div
+When calculating the division approximation,
+uses one less step than otherwise, thus reducing latency and precision.
+This is only relevant if @option{-ffast-math} enables the division
+approximation.
+
 @item -march=@var{name}
 @opindex march
 Specify the name of the target architecture and, optionally, one or
-- 
1.9.1
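For reference, a sketch of the kind of input that should exercise the new
expansion, assuming a compiler with this patch applied.  Since none of the
tunings above set approx_div_modes, the approximation path is only expected to
trigger when -mlow-precision-div is given explicitly, together with the
-ffast-math subflags that aarch64_emit_approx_div checks
(-funsafe-math-optimizations, -ffinite-math-only, -fno-trapping-math) and when
not optimizing for size; the exact instruction sequence is not guaranteed and
depends on the tuning in effect.

/* div.c -- with the patch applied, compiling with, e.g.,
     gcc -O2 -ffast-math -mlow-precision-div -S div.c
   is expected to expand these divisions through aarch64_emit_approx_div
   (FRECPE, FRECPS and FMUL) instead of a single FDIV.  */

float
div_sf (float n, float d)
{
  return n / d;         /* SFmode: 2 steps, 1 with -mlow-precision-div.  */
}

double
div_df (double n, double d)
{
  return n / d;         /* DFmode: 3 steps, 2 with -mlow-precision-div.  */
}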