[PATCH 3/3][AArch64] Emit division using the Newton series

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PATCH 3/3][AArch64] Emit division using the Newton series
@ 2016-04-27 21:16 Evandro Menezes
  2016-05-25 18:57 ` James Greenhalgh
  0 siblings, 1 reply; 10+ messages in thread
From: Evandro Menezes @ 2016-04-27 21:16 UTC (permalink / raw)
  To: GCC Patches, James Greenhalgh, Wilco Dijkstra, Andrew Pinski,
	Philipp Tomsich, Benedikt Huber

[-- Attachment #1: Type: text/plain, Size: 903 bytes --]

    gcc/
         * config/aarch64/aarch64-protos.h
         (tune_params): Add new member "approx_div_modes".
         (aarch64_emit_approx_div): Declare new function.
         * config/aarch64/aarch64.c
         (generic_tunings): New member "approx_div_modes".
         (cortexa35_tunings): Likewise.
         (cortexa53_tunings): Likewise.
         (cortexa57_tunings): Likewise.
         (cortexa72_tunings): Likewise.
         (exynosm1_tunings): Likewise.
         (thunderx_tunings): Likewise.
         (xgene1_tunings): Likewise.
         (aarch64_emit_approx_div): Define new function.
         * config/aarch64/aarch64.md ("div<mode>3"): New expansion.
         * config/aarch64/aarch64-simd.md ("div<mode>3"): Likewise.
         * config/aarch64/aarch64.opt (-mlow-precision-div): Add new option.
         * doc/invoke.texi (-mlow-precision-div): Describe new option.


-- 
Evandro Menezes


[-- Attachment #2: 0003-AArch64-Emit-division-using-the-Newton-series.patch --]
[-- Type: text/x-patch, Size: 11568 bytes --]

From 0bdd18af83a82377dd6b954c4e64904f6022a2b2 Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Mon, 4 Apr 2016 14:02:24 -0500
Subject: [PATCH 3/3] [AArch64] Emit division using the Newton series

2016-04-04  Evandro Menezes  <e.menezes@samsung.com>
            Wilco Dijkstra  <Wilco.Dijkstra@arm.com>

gcc/
	* config/aarch64/aarch64-protos.h
	(tune_params): Add new member "approx_div_modes".
	(aarch64_emit_approx_div): Declare new function.
	* config/aarch64/aarch64.c
	(generic_tunings): New member "approx_div_modes".
	(cortexa35_tunings): Likewise.
	(cortexa53_tunings): Likewise.
	(cortexa57_tunings): Likewise.
	(cortexa72_tunings): Likewise.
	(exynosm1_tunings): Likewise.
	(thunderx_tunings): Likewise.
	(xgene1_tunings): Likewise.
	(aarch64_emit_approx_div): Define new function.
	* config/aarch64/aarch64.md ("div<mode>3"): New expansion.
	* config/aarch64/aarch64-simd.md ("div<mode>3"): Likewise.
	* config/aarch64/aarch64.opt (-mlow-precision-div): Add new option.
	* doc/invoke.texi (-mlow-precision-div): Describe new option.
---
 gcc/config/aarch64/aarch64-protos.h |  2 +
 gcc/config/aarch64/aarch64-simd.md  | 14 +++++-
 gcc/config/aarch64/aarch64.c        | 85 +++++++++++++++++++++++++++++++++++++
 gcc/config/aarch64/aarch64.md       | 19 +++++++--
 gcc/config/aarch64/aarch64.opt      |  5 +++
 gcc/doc/invoke.texi                 | 10 +++++
 6 files changed, 130 insertions(+), 5 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 437f6af..ce7d147 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -244,6 +244,7 @@ struct tune_params
   } autoprefetcher_model;
 
   unsigned int extra_tuning_flags;
+  unsigned int approx_div_modes;
   unsigned int approx_sqrt_modes;
   unsigned int approx_rsqrt_modes;
 };
@@ -398,6 +399,7 @@ void aarch64_relayout_simd_types (void);
 void aarch64_reset_previous_fndecl (void);
 void aarch64_save_restore_target_globals (tree);
 bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
+bool aarch64_emit_approx_div (rtx, rtx, rtx);
 
 /* Initialize builtins for SIMD intrinsics.  */
 void init_aarch64_simd_builtins (void);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 47ccb18..7e99e16 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1509,7 +1509,19 @@
   [(set_attr "type" "neon_fp_mul_<Vetype><q>")]
 )
 
-(define_insn "div<mode>3"
+(define_expand "div<mode>3"
+ [(set (match_operand:VDQF 0 "register_operand")
+       (div:VDQF (match_operand:VDQF 1 "general_operand")
+		 (match_operand:VDQF 2 "register_operand")))]
+ "TARGET_SIMD"
+{
+  if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
+    DONE;
+
+  operands[1] = force_reg (<MODE>mode, operands[1]);
+})
+
+(define_insn "*div<mode>3"
  [(set (match_operand:VDQF 0 "register_operand" "=w")
        (div:VDQF (match_operand:VDQF 1 "register_operand" "w")
 		 (match_operand:VDQF 2 "register_operand" "w")))]
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 589871b..d3e73bf 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -417,6 +417,7 @@ static const struct tune_params generic_tunings =
   0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  (AARCH64_APPROX_NONE),	/* approx_div_modes.  */
   (AARCH64_APPROX_NONE),	/* approx_sqrt_modes.  */
   (AARCH64_APPROX_NONE)	/* approx_rsqrt_modes.  */
 };
@@ -444,6 +445,7 @@ static const struct tune_params cortexa35_tunings =
   0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  (AARCH64_APPROX_NONE),	/* approx_div_modes.  */
   (AARCH64_APPROX_NONE),	/* approx_sqrt_modes.  */
   (AARCH64_APPROX_NONE)	/* approx_rsqrt_modes.  */
 };
@@ -471,6 +473,7 @@ static const struct tune_params cortexa53_tunings =
   0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  (AARCH64_APPROX_NONE),	/* approx_div_modes.  */
   (AARCH64_APPROX_NONE),	/* approx_sqrt_modes.  */
   (AARCH64_APPROX_NONE)	/* approx_rsqrt_modes.  */
 };
@@ -498,6 +501,7 @@ static const struct tune_params cortexa57_tunings =
   0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS),	/* tune_flags.  */
+  (AARCH64_APPROX_NONE),	/* approx_div_modes.  */
   (AARCH64_APPROX_NONE),	/* approx_sqrt_modes.  */
   (AARCH64_APPROX_NONE)	/* approx_rsqrt_modes.  */
 };
@@ -525,6 +529,7 @@ static const struct tune_params cortexa72_tunings =
   0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  (AARCH64_APPROX_NONE),	/* approx_div_modes.  */
   (AARCH64_APPROX_NONE),	/* approx_sqrt_modes.  */
   (AARCH64_APPROX_NONE)	/* approx_rsqrt_modes.  */
 };
@@ -551,6 +556,7 @@ static const struct tune_params exynosm1_tunings =
   64,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
+  (AARCH64_APPROX_NONE), /* approx_div_modes.  */
   (AARCH64_APPROX_ALL), /* approx_sqrt_modes.  */
   (AARCH64_APPROX_ALL) /* approx_rsqrt_modes.  */
 };
@@ -577,6 +583,7 @@ static const struct tune_params thunderx_tunings =
   0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  (AARCH64_APPROX_NONE),	/* approx_div_modes.  */
   (AARCH64_APPROX_NONE),	/* approx_sqrt_modes.  */
   (AARCH64_APPROX_NONE)	/* approx_rsqrt_modes.  */
 };
@@ -603,6 +610,7 @@ static const struct tune_params xgene1_tunings =
   0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  (AARCH64_APPROX_NONE),	/* approx_div_modes.  */
   (AARCH64_APPROX_NONE),	/* approx_sqrt_modes.  */
   (AARCH64_APPROX_ALL)	/* approx_rsqrt_modes.  */
 };
@@ -7604,6 +7612,83 @@ aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
   return true;
 }
 
+/* Emit the instruction sequence to compute the approximation for a division.  */
+
+bool
+aarch64_emit_approx_div (rtx quo, rtx num, rtx div)
+{
+  machine_mode mode = GET_MODE (quo);
+
+  if (!flag_finite_math_only
+      || flag_trapping_math
+      || !flag_unsafe_math_optimizations
+      || optimize_function_for_size_p (cfun)
+      || !(flag_mlow_precision_div
+	   || (aarch64_tune_params.approx_div_modes & AARCH64_APPROX_MODE (mode))))
+    return false;
+
+  /* Estimate the approximate reciprocal.  */
+  rtx xrcp = gen_reg_rtx (mode);
+  switch (mode)
+    {
+      case SFmode:
+	emit_insn (gen_aarch64_frecpesf (xrcp, div)); break;
+      case V2SFmode:
+	emit_insn (gen_aarch64_frecpev2sf (xrcp, div)); break;
+      case V4SFmode:
+	emit_insn (gen_aarch64_frecpev4sf (xrcp, div)); break;
+      case DFmode:
+	emit_insn (gen_aarch64_frecpedf (xrcp, div)); break;
+      case V2DFmode:
+	emit_insn (gen_aarch64_frecpev2df (xrcp, div)); break;
+      default:
+	gcc_unreachable ();
+    }
+
+  /* Iterate over the series twice for SF and thrice for DF.  */
+  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
+
+  /* Optionally iterate over the series once less for faster performance,
+     while sacrificing the accuracy.  */
+  if (flag_mlow_precision_div)
+    iterations--;
+
+  /* Iterate over the series to calculate the approximate reciprocal.  */
+  rtx xtmp = gen_reg_rtx (mode);
+  while (iterations--)
+    {
+      switch (mode)
+        {
+	  case SFmode:
+	    emit_insn (gen_aarch64_frecpssf (xtmp, xrcp, div)); break;
+	  case V2SFmode:
+	    emit_insn (gen_aarch64_frecpsv2sf (xtmp, xrcp, div)); break;
+	  case V4SFmode:
+	    emit_insn (gen_aarch64_frecpsv4sf (xtmp, xrcp, div)); break;
+	  case DFmode:
+	    emit_insn (gen_aarch64_frecpsdf (xtmp, xrcp, div)); break;
+	  case V2DFmode:
+	    emit_insn (gen_aarch64_frecpsv2df (xtmp, xrcp, div)); break;
+	  default:
+	    gcc_unreachable ();
+        }
+
+      if (iterations > 0)
+	emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
+    }
+
+  if (num != CONST1_RTX (mode))
+    {
+      /* Calculate the approximate division.  */
+      rtx xnum = force_reg (mode, num);
+      emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
+    }
+
+  /* Return the approximation.  */
+  emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
+  return true;
+}
+
 /* Return the number of instructions that can be issued per cycle.  */
 static int
 aarch64_sched_issue_rate (void)
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index aab3e00..a248f06 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4665,11 +4665,22 @@
   [(set_attr "type" "fmul<s>")]
 )
 
-(define_insn "div<mode>3"
+(define_expand "div<mode>3"
+ [(set (match_operand:GPF 0 "register_operand")
+       (div:GPF (match_operand:GPF 1 "general_operand")
+		(match_operand:GPF 2 "register_operand")))]
+ "TARGET_SIMD"
+{
+  if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
+    DONE;
+
+  operands[1] = force_reg (<MODE>mode, operands[1]);
+})
+
+(define_insn "*div<mode>3"
   [(set (match_operand:GPF 0 "register_operand" "=w")
-        (div:GPF
-         (match_operand:GPF 1 "register_operand" "w")
-         (match_operand:GPF 2 "register_operand" "w")))]
+        (div:GPF (match_operand:GPF 1 "register_operand" "w")
+	         (match_operand:GPF 2 "register_operand" "w")))]
   "TARGET_FLOAT"
   "fdiv\\t%<s>0, %<s>1, %<s>2"
   [(set_attr "type" "fdiv<s>")]
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index ffd5540..760bd50 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -158,3 +158,8 @@ mlow-precision-sqrt
 Common Var(flag_mlow_precision_sqrt) Optimization
 When calculating the approximate square root,
 use one less step than otherwise, thus reducing latency and precision.
+
+mlow-precision-div
+Common Var(flag_mlow_precision_div) Optimization
+When calculating the approximate division,
+use one less step than otherwise, thus reducing latency and precision.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 76b7a5c..5769ca2 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -575,6 +575,7 @@ Objective-C and Objective-C++ Dialects}.
 -mfix-cortex-a53-843419  -mno-fix-cortex-a53-843419 @gol
 -mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol
 -mlow-precision-sqrt -mno-low-precision-sqrt@gol
+-mlow-precision-div -mno-low-precision-div @gol
 -march=@var{name}  -mcpu=@var{name}  -mtune=@var{name}}
 
 @emph{Adapteva Epiphany Options}
@@ -12951,6 +12952,15 @@ uses one less step than otherwise, thus reducing latency and precision.
 This is only relevant if @option{-ffast-math} enables the square root
 approximation.
 
+@item -mlow-precision-div
+@itemx -mno-low-precision-div
+@opindex mlow-precision-div
+@opindex mno-low-precision-div
+When calculating the division approximation,
+uses one less step than otherwise, thus reducing latency and precision.
+This is only relevant if @option{-ffast-math} enables the division
+approximation.
+
 @item -march=@var{name}
 @opindex march
 Specify the name of the target architecture and, optionally, one or
-- 
2.6.3


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 3/3][AArch64] Emit division using the Newton series
  2016-04-27 21:16 [PATCH 3/3][AArch64] Emit division using the Newton series Evandro Menezes
@ 2016-05-25 18:57 ` James Greenhalgh
  2016-05-28 13:53   ` Evandro Menezes
  0 siblings, 1 reply; 10+ messages in thread
From: James Greenhalgh @ 2016-05-25 18:57 UTC (permalink / raw)
  To: Evandro Menezes
  Cc: GCC Patches, Wilco Dijkstra, Andrew Pinski, Philipp Tomsich,
	Benedikt Huber, nd

On Wed, Apr 27, 2016 at 04:15:53PM -0500, Evandro Menezes wrote:
>    gcc/
>         * config/aarch64/aarch64-protos.h
>         (tune_params): Add new member "approx_div_modes".
>         (aarch64_emit_approx_div): Declare new function.
>         * config/aarch64/aarch64.c
>         (generic_tunings): New member "approx_div_modes".
>         (cortexa35_tunings): Likewise.
>         (cortexa53_tunings): Likewise.
>         (cortexa57_tunings): Likewise.
>         (cortexa72_tunings): Likewise.
>         (exynosm1_tunings): Likewise.
>         (thunderx_tunings): Likewise.
>         (xgene1_tunings): Likewise.
>         (aarch64_emit_approx_div): Define new function.
>         * config/aarch64/aarch64.md ("div<mode>3"): New expansion.
>         * config/aarch64/aarch64-simd.md ("div<mode>3"): Likewise.
>         * config/aarch64/aarch64.opt (-mlow-precision-div): Add new option.
>         * doc/invoke.texi (-mlow-precision-div): Describe new option.

My comments from the other two patches around using a structure to
group up the tuning flags and whether we really want the new option
apply here too.

This code has no consumers by default and is only used for
-mlow-precision-div. Is this option likely to be useful to our users in
practice? It might all be more palatable under something like the rs6000's
-mrecip=opt .

> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 47ccb18..7e99e16 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -1509,7 +1509,19 @@
>    [(set_attr "type" "neon_fp_mul_<Vetype><q>")]
>  )
>  
> -(define_insn "div<mode>3"
> +(define_expand "div<mode>3"
> + [(set (match_operand:VDQF 0 "register_operand")
> +       (div:VDQF (match_operand:VDQF 1 "general_operand")

What does this relaxation to general_operand give you?

> +		 (match_operand:VDQF 2 "register_operand")))]
> + "TARGET_SIMD"
> +{
> +  if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
> +    DONE;
> +
> +  operands[1] = force_reg (<MODE>mode, operands[1]);

...other than the need to do this (sorry if I've missed something obvious).

> +})
> +
> +(define_insn "*div<mode>3"
>   [(set (match_operand:VDQF 0 "register_operand" "=w")
>         (div:VDQF (match_operand:VDQF 1 "register_operand" "w")
>  		 (match_operand:VDQF 2 "register_operand" "w")))]
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 589871b..d3e73bf 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -7604,6 +7612,83 @@ aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
>    return true;
>  }
>  
> +/* Emit the instruction sequence to compute the approximation for a division.  */

Long line, missing details on what the return type means and the meaning of
arguments.

> +
> +bool
> +aarch64_emit_approx_div (rtx quo, rtx num, rtx div)

DIV is ambiguous (divisor, or the RTX or the division itself?) "DIVISOR" is
not much more typing and is clear.

> +{
> +  machine_mode mode = GET_MODE (quo);
> +
> +  if (!flag_finite_math_only
> +      || flag_trapping_math
> +      || !flag_unsafe_math_optimizations
> +      || optimize_function_for_size_p (cfun)
> +      || !(flag_mlow_precision_div
> +	   || (aarch64_tune_params.approx_div_modes & AARCH64_APPROX_MODE (mode))))

Long line.

> +    return false;
> +
> +  /* Estimate the approximate reciprocal.  */
> +  rtx xrcp = gen_reg_rtx (mode);
> +  switch (mode)
> +    {
> +      case SFmode:
> +	emit_insn (gen_aarch64_frecpesf (xrcp, div)); break;
> +      case V2SFmode:
> +	emit_insn (gen_aarch64_frecpev2sf (xrcp, div)); break;
> +      case V4SFmode:
> +	emit_insn (gen_aarch64_frecpev4sf (xrcp, div)); break;
> +      case DFmode:
> +	emit_insn (gen_aarch64_frecpedf (xrcp, div)); break;
> +      case V2DFmode:
> +	emit_insn (gen_aarch64_frecpev2df (xrcp, div)); break;
> +      default:
> +	gcc_unreachable ();
> +    }

Factor this to get_recpe_type or similar (as was done for get_rsqrts_type).

> +
> +  /* Iterate over the series twice for SF and thrice for DF.  */
> +  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
> +
> +  /* Optionally iterate over the series once less for faster performance,
> +     while sacrificing the accuracy.  */
> +  if (flag_mlow_precision_div)
> +    iterations--;
> +
> +  /* Iterate over the series to calculate the approximate reciprocal.  */
> +  rtx xtmp = gen_reg_rtx (mode);
> +  while (iterations--)
> +    {
> +      switch (mode)
> +        {
> +	  case SFmode:
> +	    emit_insn (gen_aarch64_frecpssf (xtmp, xrcp, div)); break;
> +	  case V2SFmode:
> +	    emit_insn (gen_aarch64_frecpsv2sf (xtmp, xrcp, div)); break;
> +	  case V4SFmode:
> +	    emit_insn (gen_aarch64_frecpsv4sf (xtmp, xrcp, div)); break;
> +	  case DFmode:
> +	    emit_insn (gen_aarch64_frecpsdf (xtmp, xrcp, div)); break;
> +	  case V2DFmode:
> +	    emit_insn (gen_aarch64_frecpsv2df (xtmp, xrcp, div)); break;
> +	  default:
> +	    gcc_unreachable ();
> +        }
> +
> +      if (iterations > 0)
> +	emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
> +    }
> +
> +  if (num != CONST1_RTX (mode))
> +    {
> +      /* Calculate the approximate division.  */
> +      rtx xnum = force_reg (mode, num);
> +      emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
> +    }
> +
> +  /* Return the approximation.  */
> +  emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
> +  return true;
> +}
> +
>  /* Return the number of instructions that can be issued per cycle.  */
>  static int
>  aarch64_sched_issue_rate (void)
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index aab3e00..a248f06 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -4665,11 +4665,22 @@
>    [(set_attr "type" "fmul<s>")]
>  )
>  
> -(define_insn "div<mode>3"
> +(define_expand "div<mode>3"
> + [(set (match_operand:GPF 0 "register_operand")
> +       (div:GPF (match_operand:GPF 1 "general_operand")
> +		(match_operand:GPF 2 "register_operand")))]
> + "TARGET_SIMD"
> +{
> +  if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
> +    DONE;
> +
> +  operands[1] = force_reg (<MODE>mode, operands[1]);
> +})
> +

Same comment as above regarding general_operand.

> +(define_insn "*div<mode>3"
>    [(set (match_operand:GPF 0 "register_operand" "=w")
> -        (div:GPF
> -         (match_operand:GPF 1 "register_operand" "w")
> -         (match_operand:GPF 2 "register_operand" "w")))]
> +        (div:GPF (match_operand:GPF 1 "register_operand" "w")
> +	         (match_operand:GPF 2 "register_operand" "w")))]
>    "TARGET_FLOAT"
>    "fdiv\\t%<s>0, %<s>1, %<s>2"
>    [(set_attr "type" "fdiv<s>")]

Thanks,
James

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 3/3][AArch64] Emit division using the Newton series
  2016-05-25 18:57 ` James Greenhalgh
@ 2016-05-28 13:53   ` Evandro Menezes
  2016-05-31 10:54     ` James Greenhalgh
  0 siblings, 1 reply; 10+ messages in thread
From: Evandro Menezes @ 2016-05-28 13:53 UTC (permalink / raw)
  To: James Greenhalgh
  Cc: GCC Patches, Wilco Dijkstra, Andrew Pinski, Philipp Tomsich,
	Benedikt Huber, nd

[-- Attachment #1: Type: text/plain, Size: 7426 bytes --]

On 05/25/16 11:16, James Greenhalgh wrote:
> On Wed, Apr 27, 2016 at 04:15:53PM -0500, Evandro Menezes wrote:
>>     gcc/
>>          * config/aarch64/aarch64-protos.h
>>          (tune_params): Add new member "approx_div_modes".
>>          (aarch64_emit_approx_div): Declare new function.
>>          * config/aarch64/aarch64.c
>>          (generic_tunings): New member "approx_div_modes".
>>          (cortexa35_tunings): Likewise.
>>          (cortexa53_tunings): Likewise.
>>          (cortexa57_tunings): Likewise.
>>          (cortexa72_tunings): Likewise.
>>          (exynosm1_tunings): Likewise.
>>          (thunderx_tunings): Likewise.
>>          (xgene1_tunings): Likewise.
>>          (aarch64_emit_approx_div): Define new function.
>>          * config/aarch64/aarch64.md ("div<mode>3"): New expansion.
>>          * config/aarch64/aarch64-simd.md ("div<mode>3"): Likewise.
>>          * config/aarch64/aarch64.opt (-mlow-precision-div): Add new option.
>>          * doc/invoke.texi (-mlow-precision-div): Describe new option.
> My comments from the other two patches around using a structure to
> group up the tuning flags and whether we really want the new option
> apply here too.
>
> This code has no consumers by default and is only used for
> -mlow-precision-div. Is this option likely to be useful to our users in
> practice? It might all be more palatable under something like the rs6000's
> -mrecip=opt .

I agree.  OK as a follow up?

>> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
>> index 47ccb18..7e99e16 100644
>> --- a/gcc/config/aarch64/aarch64-simd.md
>> +++ b/gcc/config/aarch64/aarch64-simd.md
>> @@ -1509,7 +1509,19 @@
>>     [(set_attr "type" "neon_fp_mul_<Vetype><q>")]
>>   )
>>   
>> -(define_insn "div<mode>3"
>> +(define_expand "div<mode>3"
>> + [(set (match_operand:VDQF 0 "register_operand")
>> +       (div:VDQF (match_operand:VDQF 1 "general_operand")
> What does this relaxation to general_operand give you?

Hold that thought...

>> +		 (match_operand:VDQF 2 "register_operand")))]
>> + "TARGET_SIMD"
>> +{
>> +  if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
>> +    DONE;
>> +
>> +  operands[1] = force_reg (<MODE>mode, operands[1]);
> ...other than the need to do this (sorry if I've missed something obvious).

Hold on...

>> +})
>> +
>> +(define_insn "*div<mode>3"
>>    [(set (match_operand:VDQF 0 "register_operand" "=w")
>>          (div:VDQF (match_operand:VDQF 1 "register_operand" "w")
>>   		 (match_operand:VDQF 2 "register_operand" "w")))]
>> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
>> index 589871b..d3e73bf 100644
>> --- a/gcc/config/aarch64/aarch64.c
>> +++ b/gcc/config/aarch64/aarch64.c
>> @@ -7604,6 +7612,83 @@ aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
>>     return true;
>>   }
>>   
>> +/* Emit the instruction sequence to compute the approximation for a division.  */
> Long line, missing details on what the return type means and the meaning of
> arguments.

OK

>> +
>> +bool
>> +aarch64_emit_approx_div (rtx quo, rtx num, rtx div)
> DIV is ambiguous (divisor, or the RTX or the division itself?) "DIVISOR" is
> not much more typing and is clear.

I renamed it to DEN, to make it clear that it is the denominator.

>> +{
>> +  machine_mode mode = GET_MODE (quo);
>> +
>> +  if (!flag_finite_math_only
>> +      || flag_trapping_math
>> +      || !flag_unsafe_math_optimizations
>> +      || optimize_function_for_size_p (cfun)
>> +      || !(flag_mlow_precision_div
>> +	   || (aarch64_tune_params.approx_div_modes & AARCH64_APPROX_MODE (mode))))
> Long line.

OK

>> +    return false;
>> +
>> +  /* Estimate the approximate reciprocal.  */
>> +  rtx xrcp = gen_reg_rtx (mode);
>> +  switch (mode)
>> +    {
>> +      case SFmode:
>> +	emit_insn (gen_aarch64_frecpesf (xrcp, div)); break;
>> +      case V2SFmode:
>> +	emit_insn (gen_aarch64_frecpev2sf (xrcp, div)); break;
>> +      case V4SFmode:
>> +	emit_insn (gen_aarch64_frecpev4sf (xrcp, div)); break;
>> +      case DFmode:
>> +	emit_insn (gen_aarch64_frecpedf (xrcp, div)); break;
>> +      case V2DFmode:
>> +	emit_insn (gen_aarch64_frecpev2df (xrcp, div)); break;
>> +      default:
>> +	gcc_unreachable ();
>> +    }
> Factor this to get_recpe_type or similar (as was done for get_rsqrts_type).

OK

>> +
>> +  /* Iterate over the series twice for SF and thrice for DF.  */
>> +  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
>> +
>> +  /* Optionally iterate over the series once less for faster performance,
>> +     while sacrificing the accuracy.  */
>> +  if (flag_mlow_precision_div)
>> +    iterations--;
>> +
>> +  /* Iterate over the series to calculate the approximate reciprocal.  */
>> +  rtx xtmp = gen_reg_rtx (mode);
>> +  while (iterations--)
>> +    {
>> +      switch (mode)
>> +        {
>> +	  case SFmode:
>> +	    emit_insn (gen_aarch64_frecpssf (xtmp, xrcp, div)); break;
>> +	  case V2SFmode:
>> +	    emit_insn (gen_aarch64_frecpsv2sf (xtmp, xrcp, div)); break;
>> +	  case V4SFmode:
>> +	    emit_insn (gen_aarch64_frecpsv4sf (xtmp, xrcp, div)); break;
>> +	  case DFmode:
>> +	    emit_insn (gen_aarch64_frecpsdf (xtmp, xrcp, div)); break;
>> +	  case V2DFmode:
>> +	    emit_insn (gen_aarch64_frecpsv2df (xtmp, xrcp, div)); break;
>> +	  default:
>> +	    gcc_unreachable ();
>> +        }
>> +
>> +      if (iterations > 0)
>> +	emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
>> +    }
>> +
>> +  if (num != CONST1_RTX (mode))
>> +    {
>> +      /* Calculate the approximate division.  */
>> +      rtx xnum = force_reg (mode, num);
>> +      emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
>> +    }

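About that relaxation: as you can see here, the series approximates the 
reciprocal of the denominator.  If the numerator is the constant 1.0, the 
multiplication by the numerator is skipped, so a register can be spared: 
the result is ready and the numerator is not needed.  The expander can 
only see that constant if the predicate lets it through instead of 
forcing it into a register first, hence general_operand.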

>> +
>> +  /* Return the approximation.  */
>> +  emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
>> +  return true;
>> +}
>> +
>>   /* Return the number of instructions that can be issued per cycle.  */
>>   static int
>>   aarch64_sched_issue_rate (void)
>> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
>> index aab3e00..a248f06 100644
>> --- a/gcc/config/aarch64/aarch64.md
>> +++ b/gcc/config/aarch64/aarch64.md
>> @@ -4665,11 +4665,22 @@
>>     [(set_attr "type" "fmul<s>")]
>>   )
>>   
>> -(define_insn "div<mode>3"
>> +(define_expand "div<mode>3"
>> + [(set (match_operand:GPF 0 "register_operand")
>> +       (div:GPF (match_operand:GPF 1 "general_operand")
>> +		(match_operand:GPF 2 "register_operand")))]
>> + "TARGET_SIMD"
>> +{
>> +  if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
>> +    DONE;
>> +
>> +  operands[1] = force_reg (<MODE>mode, operands[1]);
>> +})
>> +
> Same comment as above regarding general_operand.

I hope that I answered this question above.

>> +(define_insn "*div<mode>3"
>>     [(set (match_operand:GPF 0 "register_operand" "=w")
>> -        (div:GPF
>> -         (match_operand:GPF 1 "register_operand" "w")
>> -         (match_operand:GPF 2 "register_operand" "w")))]
>> +        (div:GPF (match_operand:GPF 1 "register_operand" "w")
>> +	         (match_operand:GPF 2 "register_operand" "w")))]
>>     "TARGET_FLOAT"
>>     "fdiv\\t%<s>0, %<s>1, %<s>2"
>>     [(set_attr "type" "fdiv<s>")]

Thank you,

-- 
Evandro Menezes


[-- Attachment #2: 0003-AArch64-Emit-division-using-the-Newton-series.patch --]
[-- Type: text/x-patch, Size: 9791 bytes --]

From a7d49bfa27cd3ae325a49092707bec0cdb659bb5 Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Mon, 4 Apr 2016 14:02:24 -0500
Subject: [PATCH 3/3] [AArch64] Emit division using the Newton series

2016-04-04  Evandro Menezes  <e.menezes@samsung.com>
            Wilco Dijkstra  <Wilco.Dijkstra@arm.com>

gcc/
	* config/aarch64/aarch64-protos.h
	(cpu_approx_modes): Add new member "division".
	(aarch64_emit_approx_div): Declare new function.
	* config/aarch64/aarch64.c
	(generic_approx_modes): New member "division".
	(exynosm1_approx_modes): Likewise.
	(xgene1_approx_modes): Likewise.
	(aarch64_emit_approx_div): Define new function.
	* config/aarch64/aarch64.md ("div<mode>3"): New expansion.
	* config/aarch64/aarch64-simd.md ("div<mode>3"): Likewise.
	* config/aarch64/aarch64.opt (-mlow-precision-div): Add new option.
	* doc/invoke.texi (-mlow-precision-div): Describe new option.
---
 gcc/config/aarch64/aarch64-protos.h |  2 +
 gcc/config/aarch64/aarch64-simd.md  | 14 +++++-
 gcc/config/aarch64/aarch64.c        | 92 +++++++++++++++++++++++++++++++++++++
 gcc/config/aarch64/aarch64.md       | 19 ++++++--
 gcc/config/aarch64/aarch64.opt      |  5 ++
 gcc/doc/invoke.texi                 | 10 ++++
 6 files changed, 137 insertions(+), 5 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 2f407fd..3d10b00 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -192,6 +192,7 @@ struct cpu_branch_cost
 /* Allowed modes for approximations.  */
 struct cpu_approx_modes
 {
+  const unsigned int division;		/* Division.  */
   const unsigned int sqrt;		/* Square root.  */
   const unsigned int recip_sqrt;	/* Reciprocal square root.  */
 };
@@ -390,6 +391,7 @@ void aarch64_relayout_simd_types (void);
 void aarch64_reset_previous_fndecl (void);
 void aarch64_save_restore_target_globals (tree);
 bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
+bool aarch64_emit_approx_div (rtx, rtx, rtx);
 
 /* Initialize builtins for SIMD intrinsics.  */
 void init_aarch64_simd_builtins (void);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 47ccb18..7e99e16 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1509,7 +1509,19 @@
   [(set_attr "type" "neon_fp_mul_<Vetype><q>")]
 )
 
-(define_insn "div<mode>3"
+(define_expand "div<mode>3"
+ [(set (match_operand:VDQF 0 "register_operand")
+       (div:VDQF (match_operand:VDQF 1 "general_operand")
+		 (match_operand:VDQF 2 "register_operand")))]
+ "TARGET_SIMD"
+{
+  if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
+    DONE;
+
+  operands[1] = force_reg (<MODE>mode, operands[1]);
+})
+
+(define_insn "*div<mode>3"
  [(set (match_operand:VDQF 0 "register_operand" "=w")
        (div:VDQF (match_operand:VDQF 1 "register_operand" "w")
 		 (match_operand:VDQF 2 "register_operand" "w")))]
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index d55ca34..34f8faf 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -397,6 +397,7 @@ static const struct cpu_branch_cost cortexa57_branch_cost =
 /* Generic approximation modes.  */
 static const cpu_approx_modes generic_approx_modes =
 {
+  AARCH64_APPROX_NONE,	/* division  */
   AARCH64_APPROX_NONE,	/* sqrt  */
   AARCH64_APPROX_NONE	/* recip_sqrt  */
 };
@@ -404,6 +405,7 @@ static const cpu_approx_modes generic_approx_modes =
 /* Approximation modes for Exynos M1.  */
 static const cpu_approx_modes exynosm1_approx_modes =
 {
+  AARCH64_APPROX_NONE,	/* division  */
   AARCH64_APPROX_ALL,	/* sqrt  */
   AARCH64_APPROX_ALL	/* recip_sqrt  */
 };
@@ -411,6 +413,7 @@ static const cpu_approx_modes exynosm1_approx_modes =
 /* Approximation modes for Xgene1.  */
 static const cpu_approx_modes xgene1_approx_modes =
 {
+  AARCH64_APPROX_NONE,	/* division  */
   AARCH64_APPROX_NONE,	/* sqrt  */
   AARCH64_APPROX_ALL	/* recip_sqrt  */
 };
@@ -7619,6 +7622,95 @@ aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
   return true;
 }
 
+typedef rtx (*recpe_type) (rtx, rtx);
+
+/* Return the reciprocal initial estimate insn generator for MODE.  */
+
+static recpe_type
+get_recpe_type (machine_mode mode)
+{
+  switch (mode)
+    {
+    case SFmode:   return gen_aarch64_frecpesf;
+    case V2SFmode: return gen_aarch64_frecpev2sf;
+    case V4SFmode: return gen_aarch64_frecpev4sf;
+    case DFmode:   return gen_aarch64_frecpedf;
+    case V2DFmode: return gen_aarch64_frecpev2df;
+    default:       gcc_unreachable ();
+    }
+}
+
+typedef rtx (*recps_type) (rtx, rtx, rtx);
+
+/* Return the reciprocal series step insn generator for MODE.  */
+
+static recps_type
+get_recps_type (machine_mode mode)
+{
+  switch (mode)
+    {
+    case SFmode:   return gen_aarch64_frecpssf;
+    case V2SFmode: return gen_aarch64_frecpsv2sf;
+    case V4SFmode: return gen_aarch64_frecpsv4sf;
+    case DFmode:   return gen_aarch64_frecpsdf;
+    case V2DFmode: return gen_aarch64_frecpsv2df;
+    default:       gcc_unreachable ();
+    }
+}
+
+/* Emit the instruction sequence to compute the approximation for the division
+   of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
+
+bool
+aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
+{
+  machine_mode mode = GET_MODE (quo);
+  bool use_approx_division_p = (flag_mlow_precision_div
+			        || (aarch64_tune_params.approx_modes->division
+				    & AARCH64_APPROX_MODE (mode)));
+
+  if (!flag_finite_math_only
+      || flag_trapping_math
+      || !flag_unsafe_math_optimizations
+      || optimize_function_for_size_p (cfun)
+      || !use_approx_division_p)
+    return false;
+
+  /* Estimate the approximate reciprocal.  */
+  rtx xrcp = gen_reg_rtx (mode);
+  emit_insn ((*get_recpe_type (mode)) (xrcp, den));
+
+  /* Iterate over the series twice for SF and thrice for DF.  */
+  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
+
+  /* Optionally iterate over the series once less for faster performance,
+     while sacrificing the accuracy.  */
+  if (flag_mlow_precision_div)
+    iterations--;
+
+  /* Refine XRCP; the factor from the last step is left in XTMP, unapplied.  */
+  rtx xtmp = gen_reg_rtx (mode);
+  while (iterations--)
+    {
+      emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
+
+      if (iterations > 0)
+	emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
+    }
+
+  if (num != CONST1_RTX (mode))
+    {
+      /* As the approximate reciprocal of DEN is already calculated, only
+	 calculate the approximate division when NUM is not 1.0.  */
+      rtx xnum = force_reg (mode, num);
+      emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
+    }
+
+  /* Finalize the approximation by applying the factor left in XTMP.  */
+  emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
+  return true;
+}
+
 /* Return the number of instructions that can be issued per cycle.  */
 static int
 aarch64_sched_issue_rate (void)
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index aab3e00..a248f06 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4665,11 +4665,22 @@
   [(set_attr "type" "fmul<s>")]
 )
 
-(define_insn "div<mode>3"
+(define_expand "div<mode>3"
+ [(set (match_operand:GPF 0 "register_operand")
+       (div:GPF (match_operand:GPF 1 "general_operand")
+		(match_operand:GPF 2 "register_operand")))]
+ "TARGET_FLOAT"
+{
+  if (TARGET_SIMD  /* Approximate only with SIMD; FDIV needs just FP.  */
+      && aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
+    DONE;
+  operands[1] = force_reg (<MODE>mode, operands[1]);
+})
+
+(define_insn "*div<mode>3"
   [(set (match_operand:GPF 0 "register_operand" "=w")
-        (div:GPF
-         (match_operand:GPF 1 "register_operand" "w")
-         (match_operand:GPF 2 "register_operand" "w")))]
+        (div:GPF (match_operand:GPF 1 "register_operand" "w")
+	         (match_operand:GPF 2 "register_operand" "w")))]
   "TARGET_FLOAT"
   "fdiv\\t%<s>0, %<s>1, %<s>2"
   [(set_attr "type" "fdiv<s>")]
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index ffd5540..760bd50 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -158,3 +158,8 @@ mlow-precision-sqrt
 Common Var(flag_mlow_precision_sqrt) Optimization
 When calculating the approximate square root,
 use one less step than otherwise, thus reducing latency and precision.
+
+mlow-precision-div
+Common Var(flag_mlow_precision_div) Optimization
+When calculating the approximate division,
+use one less step than otherwise, thus reducing latency and precision.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 76b7a5c..5769ca2 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -575,6 +575,7 @@ Objective-C and Objective-C++ Dialects}.
 -mfix-cortex-a53-843419  -mno-fix-cortex-a53-843419 @gol
 -mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol
 -mlow-precision-sqrt -mno-low-precision-sqrt@gol
+-mlow-precision-div -mno-low-precision-div @gol
 -march=@var{name}  -mcpu=@var{name}  -mtune=@var{name}}
 
 @emph{Adapteva Epiphany Options}
@@ -12951,6 +12952,15 @@ uses one less step than otherwise, thus reducing latency and precision.
 This is only relevant if @option{-ffast-math} enables the square root
 approximation.
 
+@item -mlow-precision-div
+@itemx -mno-low-precision-div
+@opindex mlow-precision-div
+@opindex mno-low-precision-div
+When calculating the division approximation,
+uses one less step than otherwise, thus reducing latency and precision.
+This is only relevant if @option{-ffast-math} enables the division
+approximation.
+
 @item -march=@var{name}
 @opindex march
 Specify the name of the target architecture and, optionally, one or
-- 
2.6.3


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 3/3][AArch64] Emit division using the Newton series
  2016-05-28 13:53   ` Evandro Menezes
@ 2016-05-31 10:54     ` James Greenhalgh
  2016-05-31 21:31       ` Evandro Menezes
  0 siblings, 1 reply; 10+ messages in thread
From: James Greenhalgh @ 2016-05-31 10:54 UTC (permalink / raw)
  To: Evandro Menezes
  Cc: GCC Patches, Wilco Dijkstra, Andrew Pinski, Philipp Tomsich,
	Benedikt Huber, nd

On Fri, May 27, 2016 at 05:57:30PM -0500, Evandro Menezes wrote:
> On 05/25/16 11:16, James Greenhalgh wrote:
> >On Wed, Apr 27, 2016 at 04:15:53PM -0500, Evandro Menezes wrote:
> >>    gcc/
> >>         * config/aarch64/aarch64-protos.h
> >>         (tune_params): Add new member "approx_div_modes".
> >>         (aarch64_emit_approx_div): Declare new function.
> >>         * config/aarch64/aarch64.c
> >>         (generic_tunings): New member "approx_div_modes".
> >>         (cortexa35_tunings): Likewise.
> >>         (cortexa53_tunings): Likewise.
> >>         (cortexa57_tunings): Likewise.
> >>         (cortexa72_tunings): Likewise.
> >>         (exynosm1_tunings): Likewise.
> >>         (thunderx_tunings): Likewise.
> >>         (xgene1_tunings): Likewise.
> >>         (aarch64_emit_approx_div): Define new function.
> >>         * config/aarch64/aarch64.md ("div<mode>3"): New expansion.
> >>         * config/aarch64/aarch64-simd.md ("div<mode>3"): Likewise.
> >>         * config/aarch64/aarch64.opt (-mlow-precision-div): Add new option.
> >>         * doc/invoke.texi (-mlow-precision-div): Describe new option.
> >My comments from the other two patches around using a structure to
> >group up the tuning flags and whether we really want the new option
> >apply here too.
> >
> >This code has no consumers by default and is only used for
> >-mlow-precision-div. Is this option likely to be useful to our users in
> >practice? It might all be more palatable under something like the rs6000's
> >-mrecip=opt .
> 
> I agree.  OK as a follow up?

Works for me.

> >>diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> >>index 47ccb18..7e99e16 100644
> >>--- a/gcc/config/aarch64/aarch64-simd.md
> >>+++ b/gcc/config/aarch64/aarch64-simd.md
> >>@@ -1509,7 +1509,19 @@
> >>    [(set_attr "type" "neon_fp_mul_<Vetype><q>")]
> >>  )
> >>-(define_insn "div<mode>3"
> >>+(define_expand "div<mode>3"
> >>+ [(set (match_operand:VDQF 0 "register_operand")
> >>+       (div:VDQF (match_operand:VDQF 1 "general_operand")
> >What does this relaxation to general_operand give you?
> 
> Hold that thought...
> 
> >>+		 (match_operand:VDQF 2 "register_operand")))]
> >>+ "TARGET_SIMD"
> >>+{
> >>+  if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
> >>+    DONE;
> >>+
> >>+  operands[1] = force_reg (<MODE>mode, operands[1]);
> >...other than the need to do this (sorry if I've missed something obvious).
> 
> Hold on...
> 
> >>+  if (num != CONST1_RTX (mode))
> >>+    {
> >>+      /* Calculate the approximate division.  */
> >>+      rtx xnum = force_reg (mode, num);
> >>+      emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
> >>+    }
> 
> About that relaxation, as you can see here, since the series
> approximates the reciprocal of the denominator, if the numerator is
> 1.0, a register can be spared, as the result is ready and the
> numerator is not needed.

But, in the case that the multiplication is by 1, can we not rely on the
other optimization machinery eliminating it? I mean, I see the optimization
that this enables for you, but can't you rely on future passes to do the
cleanup, and save yourself the few lines of special casing?

> +/* Emit the instruction sequence to compute the approximation for the division
> +   of NUM by DEN and return whether the sequence was emitted or not.  */

Needs a brief mention of what we use QUO for :).

> +
> +bool
> +aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
> +{
> +  machine_mode mode = GET_MODE (quo);
> +  bool use_approx_division_p = (flag_mlow_precision_div
> +			        || (aarch64_tune_params.approx_modes->division
> +				    & AARCH64_APPROX_MODE (mode)));
> +
> +  if (!flag_finite_math_only
> +      || flag_trapping_math
> +      || !flag_unsafe_math_optimizations
> +      || optimize_function_for_size_p (cfun)
> +      || !use_approx_division_p)
> +    return false;
> +
> +  /* Estimate the approximate reciprocal.  */
> +  rtx xrcp = gen_reg_rtx (mode);
> +  emit_insn ((*get_recpe_type (mode)) (xrcp, den));
> +
> +  /* Iterate over the series twice for SF and thrice for DF.  */
> +  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
> +
> +  /* Optionally iterate over the series once less for faster performance,
> +     while sacrificing the accuracy.  */
> +  if (flag_mlow_precision_div)
> +    iterations--;
> +
> +  /* Iterate over the series to calculate the approximate reciprocal.  */
> +  rtx xtmp = gen_reg_rtx (mode);
> +  while (iterations--)
> +    {
> +      emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
> +
> +      if (iterations > 0)
> +	emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
> +    }
> +
> +  if (num != CONST1_RTX (mode))
> +    {
> +      /* As the approximate reciprocal of the denominator is already calculated,
> +         only calculate the approximate division when the numerator is not 1.0.  */

Long lines.

> +      rtx xnum = force_reg (mode, num);
> +      emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
> +    }
> +
> +  /* Finalize the approximation.  */
> +  emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
> +  return true;
> +}
> +
>  /* Return the number of instructions that can be issued per cycle.  */
>  static int
>  aarch64_sched_issue_rate (void)

Thanks,
James

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 3/3][AArch64] Emit division using the Newton series
  2016-05-31 10:54     ` James Greenhalgh
@ 2016-05-31 21:31       ` Evandro Menezes
  2016-06-03 21:50         ` Evandro Menezes
  0 siblings, 1 reply; 10+ messages in thread
From: Evandro Menezes @ 2016-05-31 21:31 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 5696 bytes --]

On 05/31/16 04:27, James Greenhalgh wrote:
> On Fri, May 27, 2016 at 05:57:30PM -0500, Evandro Menezes wrote:
>> On 05/25/16 11:16, James Greenhalgh wrote:
>>> On Wed, Apr 27, 2016 at 04:15:53PM -0500, Evandro Menezes wrote:
>>>>     gcc/
>>>>          * config/aarch64/aarch64-protos.h
>>>>          (tune_params): Add new member "approx_div_modes".
>>>>          (aarch64_emit_approx_div): Declare new function.
>>>>          * config/aarch64/aarch64.c
>>>>          (generic_tunings): New member "approx_div_modes".
>>>>          (cortexa35_tunings): Likewise.
>>>>          (cortexa53_tunings): Likewise.
>>>>          (cortexa57_tunings): Likewise.
>>>>          (cortexa72_tunings): Likewise.
>>>>          (exynosm1_tunings): Likewise.
>>>>          (thunderx_tunings): Likewise.
>>>>          (xgene1_tunings): Likewise.
>>>>          (aarch64_emit_approx_div): Define new function.
>>>>          * config/aarch64/aarch64.md ("div<mode>3"): New expansion.
>>>>          * config/aarch64/aarch64-simd.md ("div<mode>3"): Likewise.
>>>>          * config/aarch64/aarch64.opt (-mlow-precision-div): Add new option.
>>>>          * doc/invoke.texi (-mlow-precision-div): Describe new option.
>>> My comments from the other two patches around using a structure to
>>> group up the tuning flags and whether we really want the new option
>>> apply here too.
>>>
>>> This code has no consumers by default and is only used for
>>> -mlow-precision-div. Is this option likely to be useful to our users in
>>> practice? It might all be more palatable under something like the rs6000's
>>> -mrecip=opt .
>> I agree.  OK as a follow up?
> Works for me.
>
>>>> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
>>>> index 47ccb18..7e99e16 100644
>>>> --- a/gcc/config/aarch64/aarch64-simd.md
>>>> +++ b/gcc/config/aarch64/aarch64-simd.md
>>>> @@ -1509,7 +1509,19 @@
>>>>     [(set_attr "type" "neon_fp_mul_<Vetype><q>")]
>>>>   )
>>>> -(define_insn "div<mode>3"
>>>> +(define_expand "div<mode>3"
>>>> + [(set (match_operand:VDQF 0 "register_operand")
>>>> +       (div:VDQF (match_operand:VDQF 1 "general_operand")
>>> What does this relaxation to general_operand give you?
>> Hold that thought...
>>
>>>> +		 (match_operand:VDQF 2 "register_operand")))]
>>>> + "TARGET_SIMD"
>>>> +{
>>>> +  if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
>>>> +    DONE;
>>>> +
>>>> +  operands[1] = force_reg (<MODE>mode, operands[1]);
>>> ...other than the need to do this (sorry if I've missed something obvious).
>> Hold on...
>>
>>>> +  if (num != CONST1_RTX (mode))
>>>> +    {
>>>> +      /* Calculate the approximate division.  */
>>>> +      rtx xnum = force_reg (mode, num);
>>>> +      emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
>>>> +    }
>> About that relaxation, as you can see here, since the series
>> approximates the reciprocal of the denominator, if the numerator is
>> 1.0, a register can be spared, as the result is ready and the
>> numerator is not needed.
> But, in the case that the multiplication is by 1, can we not rely on the
> other optimization machinery eliminating it? I mean, I see the optimization
> that this enables for you, but can't you rely on future passes to do the
> cleanup, and save yourself the few lines of special casing?

I prefer to not rely on subsequent passes, as, though it may work now, 
that may change in the future.  Methinks that it's a simple enough test 
to achieve better control of the resulting code.

>> +/* Emit the instruction sequence to compute the approximation for the division
>> +   of NUM by DEN and return whether the sequence was emitted or not.  */
> Needs a brief mention of what we use QUO for :).

OK

>> +
>> +bool
>> +aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
>> +{
>> +  machine_mode mode = GET_MODE (quo);
>> +  bool use_approx_division_p = (flag_mlow_precision_div
>> +			        || (aarch64_tune_params.approx_modes->division
>> +				    & AARCH64_APPROX_MODE (mode)));
>> +
>> +  if (!flag_finite_math_only
>> +      || flag_trapping_math
>> +      || !flag_unsafe_math_optimizations
>> +      || optimize_function_for_size_p (cfun)
>> +      || !use_approx_division_p)
>> +    return false;
>> +
>> +  /* Estimate the approximate reciprocal.  */
>> +  rtx xrcp = gen_reg_rtx (mode);
>> +  emit_insn ((*get_recpe_type (mode)) (xrcp, den));
>> +
>> +  /* Iterate over the series twice for SF and thrice for DF.  */
>> +  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
>> +
>> +  /* Optionally iterate over the series once less for faster performance,
>> +     while sacrificing the accuracy.  */
>> +  if (flag_mlow_precision_div)
>> +    iterations--;
>> +
>> +  /* Iterate over the series to calculate the approximate reciprocal.  */
>> +  rtx xtmp = gen_reg_rtx (mode);
>> +  while (iterations--)
>> +    {
>> +      emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
>> +
>> +      if (iterations > 0)
>> +	emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
>> +    }
>> +
>> +  if (num != CONST1_RTX (mode))
>> +    {
>> +      /* As the approximate reciprocal of the denominator is already calculated,
>> +         only calculate the approximate division when the numerator is not 1.0.  */
> Long lines.

OK

>> +      rtx xnum = force_reg (mode, num);
>> +      emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
>> +    }
>> +
>> +  /* Finalize the approximation.  */
>> +  emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
>> +  return true;
>> +}
>> +
>>   /* Return the number of instructions that can be issued per cycle.  */
>>   static int
>>   aarch64_sched_issue_rate (void)

Thank you,

-- 
Evandro Menezes


[-- Attachment #2: 0003-AArch64-Emit-division-using-the-Newton-series.patch --]
[-- Type: text/x-patch, Size: 9770 bytes --]

From e598b0df2e43f2e64254a0e1ddec608fd78025b1 Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Mon, 4 Apr 2016 14:02:24 -0500
Subject: [PATCH 3/3] [AArch64] Emit division using the Newton series

2016-04-04  Evandro Menezes  <e.menezes@samsung.com>
            Wilco Dijkstra  <Wilco.Dijkstra@arm.com>

gcc/
	* config/aarch64/aarch64-protos.h
	(cpu_approx_modes): Add new member "division".
	(aarch64_emit_approx_div): Declare new function.
	* config/aarch64/aarch64.c
	(generic_approx_modes): New member "division".
	(exynosm1_approx_modes): Likewise.
	(xgene1_approx_modes): Likewise.
	(aarch64_emit_approx_div): Define new function.
	* config/aarch64/aarch64.md ("div<mode>3"): New expansion.
	* config/aarch64/aarch64-simd.md ("div<mode>3"): Likewise.
	* config/aarch64/aarch64.opt (-mlow-precision-div): Add new option.
	* doc/invoke.texi (-mlow-precision-div): Describe new option.
---
 gcc/config/aarch64/aarch64-protos.h |  2 +
 gcc/config/aarch64/aarch64-simd.md  | 14 +++++-
 gcc/config/aarch64/aarch64.c        | 92 +++++++++++++++++++++++++++++++++++++
 gcc/config/aarch64/aarch64.md       | 19 ++++++--
 gcc/config/aarch64/aarch64.opt      |  5 ++
 gcc/doc/invoke.texi                 | 10 ++++
 6 files changed, 137 insertions(+), 5 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index fae558a..4b8e607 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -192,6 +192,7 @@ struct cpu_branch_cost
 /* Allowed modes for approximations.  */
 struct cpu_approx_modes
 {
+  const unsigned int division;		/* Division.  */
   const unsigned int sqrt;		/* Square root.  */
   const unsigned int recip_sqrt;	/* Reciprocal square root.  */
 };
@@ -387,6 +388,7 @@ void aarch64_relayout_simd_types (void);
 void aarch64_reset_previous_fndecl (void);
 void aarch64_save_restore_target_globals (tree);
 bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
+bool aarch64_emit_approx_div (rtx, rtx, rtx);
 
 /* Initialize builtins for SIMD intrinsics.  */
 void init_aarch64_simd_builtins (void);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 883e2ed..96a62ac 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1509,7 +1509,19 @@
   [(set_attr "type" "neon_fp_mul_<Vetype><q>")]
 )
 
-(define_insn "div<mode>3"
+(define_expand "div<mode>3"
+ [(set (match_operand:VDQF 0 "register_operand")
+       (div:VDQF (match_operand:VDQF 1 "general_operand")
+		 (match_operand:VDQF 2 "register_operand")))]
+ "TARGET_SIMD"
+{
+  if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
+    DONE;
+
+  operands[1] = force_reg (<MODE>mode, operands[1]);
+})
+
+(define_insn "*div<mode>3"
  [(set (match_operand:VDQF 0 "register_operand" "=w")
        (div:VDQF (match_operand:VDQF 1 "register_operand" "w")
 		 (match_operand:VDQF 2 "register_operand" "w")))]
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index abf983a..f7d7239 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -397,6 +397,7 @@ static const struct cpu_branch_cost cortexa57_branch_cost =
 /* Generic approximation modes.  */
 static const cpu_approx_modes generic_approx_modes =
 {
+  AARCH64_APPROX_NONE,	/* division  */
   AARCH64_APPROX_NONE,	/* sqrt  */
   AARCH64_APPROX_NONE	/* recip_sqrt  */
 };
@@ -404,6 +405,7 @@ static const cpu_approx_modes generic_approx_modes =
 /* Approximation modes for Exynos M1.  */
 static const cpu_approx_modes exynosm1_approx_modes =
 {
+  AARCH64_APPROX_NONE,	/* division  */
   AARCH64_APPROX_ALL,	/* sqrt  */
   AARCH64_APPROX_ALL	/* recip_sqrt  */
 };
@@ -411,6 +413,7 @@ static const cpu_approx_modes exynosm1_approx_modes =
 /* Approximation modes for Xgene1.  */
 static const cpu_approx_modes xgene1_approx_modes =
 {
+  AARCH64_APPROX_NONE,	/* division  */
   AARCH64_APPROX_NONE,	/* sqrt  */
   AARCH64_APPROX_ALL	/* recip_sqrt  */
 };
@@ -7486,6 +7489,95 @@ aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
   return true;
 }
 
+typedef rtx (*recpe_type) (rtx, rtx);
+
+/* Return the reciprocal initial estimate insn generator for MODE.  */
+
+static recpe_type
+get_recpe_type (machine_mode mode)
+{
+  switch (mode)
+    {
+    case SFmode:   return gen_aarch64_frecpesf;
+    case V2SFmode: return gen_aarch64_frecpev2sf;
+    case V4SFmode: return gen_aarch64_frecpev4sf;
+    case DFmode:   return gen_aarch64_frecpedf;
+    case V2DFmode: return gen_aarch64_frecpev2df;
+    default:       gcc_unreachable ();
+    }
+}
+
+typedef rtx (*recps_type) (rtx, rtx, rtx);
+
+/* Return the reciprocal series step insn generator for MODE.  */
+
+static recps_type
+get_recps_type (machine_mode mode)
+{
+  switch (mode)
+    {
+    case SFmode:   return gen_aarch64_frecpssf;
+    case V2SFmode: return gen_aarch64_frecpsv2sf;
+    case V4SFmode: return gen_aarch64_frecpsv4sf;
+    case DFmode:   return gen_aarch64_frecpsdf;
+    case V2DFmode: return gen_aarch64_frecpsv2df;
+    default:       gcc_unreachable ();
+    }
+}
+
+/* Emit the instruction sequence to compute the approximation for the division
+   of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
+
+bool
+aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
+{
+  machine_mode mode = GET_MODE (quo);
+  bool use_approx_division_p = (flag_mlow_precision_div
+			        || (aarch64_tune_params.approx_modes->division
+				    & AARCH64_APPROX_MODE (mode)));
+
+  if (!flag_finite_math_only
+      || flag_trapping_math
+      || !flag_unsafe_math_optimizations
+      || optimize_function_for_size_p (cfun)
+      || !use_approx_division_p)
+    return false;
+
+  /* Estimate the approximate reciprocal.  */
+  rtx xrcp = gen_reg_rtx (mode);
+  emit_insn ((*get_recpe_type (mode)) (xrcp, den));
+
+  /* Iterate over the series twice for SF and thrice for DF.  */
+  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
+
+  /* Optionally iterate over the series once less for faster performance,
+     while sacrificing the accuracy.  */
+  if (flag_mlow_precision_div)
+    iterations--;
+
+  /* Refine XRCP; the factor from the last step is left in XTMP, unapplied.  */
+  rtx xtmp = gen_reg_rtx (mode);
+  while (iterations--)
+    {
+      emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
+
+      if (iterations > 0)
+	emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
+    }
+
+  if (num != CONST1_RTX (mode))
+    {
+      /* As the approximate reciprocal of DEN is already calculated, only
+	 calculate the approximate division when NUM is not 1.0.  */
+      rtx xnum = force_reg (mode, num);
+      emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
+    }
+
+  /* Finalize the approximation by applying the factor left in XTMP.  */
+  emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
+  return true;
+}
+
 /* Return the number of instructions that can be issued per cycle.  */
 static int
 aarch64_sched_issue_rate (void)
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 14ca9a1..74b99b8 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4674,11 +4674,22 @@
   [(set_attr "type" "fmul<s>")]
 )
 
-(define_insn "div<mode>3"
+(define_expand "div<mode>3"
+ [(set (match_operand:GPF 0 "register_operand")
+       (div:GPF (match_operand:GPF 1 "general_operand")
+		(match_operand:GPF 2 "register_operand")))]
+ "TARGET_FLOAT"
+{
+  if (TARGET_SIMD  /* Approximate only with SIMD; FDIV needs just FP.  */
+      && aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
+    DONE;
+  operands[1] = force_reg (<MODE>mode, operands[1]);
+})
+
+(define_insn "*div<mode>3"
   [(set (match_operand:GPF 0 "register_operand" "=w")
-        (div:GPF
-         (match_operand:GPF 1 "register_operand" "w")
-         (match_operand:GPF 2 "register_operand" "w")))]
+        (div:GPF (match_operand:GPF 1 "register_operand" "w")
+	         (match_operand:GPF 2 "register_operand" "w")))]
   "TARGET_FLOAT"
   "fdiv\\t%<s>0, %<s>1, %<s>2"
   [(set_attr "type" "fdiv<s>")]
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index ffd5540..760bd50 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -158,3 +158,8 @@ mlow-precision-sqrt
 Common Var(flag_mlow_precision_sqrt) Optimization
 When calculating the approximate square root,
 use one less step than otherwise, thus reducing latency and precision.
+
+mlow-precision-div
+Common Var(flag_mlow_precision_div) Optimization
+When calculating the approximate division,
+use one less step than otherwise, thus reducing latency and precision.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 0e27837..6d657998 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -577,6 +577,7 @@ Objective-C and Objective-C++ Dialects}.
 -mfix-cortex-a53-843419  -mno-fix-cortex-a53-843419 @gol
 -mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol
 -mlow-precision-sqrt -mno-low-precision-sqrt@gol
+-mlow-precision-div -mno-low-precision-div @gol
 -march=@var{name}  -mcpu=@var{name}  -mtune=@var{name}}
 
 @emph{Adapteva Epiphany Options}
@@ -13010,6 +13011,15 @@ uses one less step than otherwise, thus reducing latency and precision.
 This is only relevant if @option{-ffast-math} enables the square root
 approximation.
 
+@item -mlow-precision-div
+@itemx -mno-low-precision-div
+@opindex mlow-precision-div
+@opindex mno-low-precision-div
+When calculating the division approximation,
+uses one less step than otherwise, thus reducing latency and precision.
+This is only relevant if @option{-ffast-math} enables the division
+approximation.
+
 @item -march=@var{name}
 @opindex march
 Specify the name of the target architecture and, optionally, one or
-- 
2.6.3


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 3/3][AArch64] Emit division using the Newton series
  2016-05-31 21:31       ` Evandro Menezes
@ 2016-06-03 21:50         ` Evandro Menezes
  2016-06-13 10:15           ` James Greenhalgh
  0 siblings, 1 reply; 10+ messages in thread
From: Evandro Menezes @ 2016-06-03 21:50 UTC (permalink / raw)
  To: gcc-patches, Wilco Dijkstra, Andrew Pinski, James Greenhalgh,
	philipp.tomsich, Benedikt Huber, nd

[-- Attachment #1: Type: text/plain, Size: 44 bytes --]

Rebasing the patch...

-- 
Evandro Menezes


[-- Attachment #2: 0003-AArch64-Emit-division-using-the-Newton-series.patch --]
[-- Type: text/x-patch, Size: 9929 bytes --]

From d791090aae6a29fa94d8fc10894ee1053b05bcc2 Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Mon, 4 Apr 2016 14:02:24 -0500
Subject: [PATCH 3/3] [AArch64] Emit division using the Newton series

2016-04-04  Evandro Menezes  <e.menezes@samsung.com>
            Wilco Dijkstra  <Wilco.Dijkstra@arm.com>

gcc/
	* config/aarch64/aarch64-protos.h
	(cpu_approx_modes): Add new member "division".
	(aarch64_emit_approx_div): Declare new function.
	* config/aarch64/aarch64.c
	(generic_approx_modes): New member "division".
	(exynosm1_approx_modes): Likewise.
	(xgene1_approx_modes): Likewise.
	(aarch64_emit_approx_div): Define new function.
	* config/aarch64/aarch64.md ("div<mode>3"): New expansion.
	* config/aarch64/aarch64-simd.md ("div<mode>3"): Likewise.
	* config/aarch64/aarch64.opt (-mlow-precision-div): Add new option.
	* doc/invoke.texi (-mlow-precision-div): Describe new option.
---
 gcc/config/aarch64/aarch64-protos.h |  2 +
 gcc/config/aarch64/aarch64-simd.md  | 14 +++++-
 gcc/config/aarch64/aarch64.c        | 92 +++++++++++++++++++++++++++++++++++++
 gcc/config/aarch64/aarch64.md       | 19 ++++++--
 gcc/config/aarch64/aarch64.opt      |  6 +++
 gcc/doc/invoke.texi                 | 10 ++++
 6 files changed, 138 insertions(+), 5 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index eb33118..3e0a0a3 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -192,6 +192,7 @@ struct cpu_branch_cost
 /* Allowed modes for approximations.  */
 struct cpu_approx_modes
 {
+  const unsigned int division;		/* Division.  */
   const unsigned int sqrt;		/* Square root.  */
   const unsigned int recip_sqrt;	/* Reciprocal square root.  */
 };
@@ -303,6 +304,7 @@ int aarch64_branch_cost (bool, bool);
 enum aarch64_symbol_type aarch64_classify_symbolic_expression (rtx);
 bool aarch64_const_vec_all_same_int_p (rtx, HOST_WIDE_INT);
 bool aarch64_constant_address_p (rtx);
+bool aarch64_emit_approx_div (rtx, rtx, rtx);
 bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
 bool aarch64_expand_movmem (rtx *);
 bool aarch64_float_const_zero_rtx_p (rtx);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 2a5c665..a244a27 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1509,7 +1509,19 @@
   [(set_attr "type" "neon_fp_mul_<Vetype><q>")]
 )
 
-(define_insn "div<mode>3"
+(define_expand "div<mode>3"
+ [(set (match_operand:VDQF 0 "register_operand")
+       (div:VDQF (match_operand:VDQF 1 "general_operand")
+		 (match_operand:VDQF 2 "register_operand")))]
+ "TARGET_SIMD"
+{
+  if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
+    DONE;
+
+  operands[1] = force_reg (<MODE>mode, operands[1]);
+})
+
+(define_insn "*div<mode>3"
  [(set (match_operand:VDQF 0 "register_operand" "=w")
        (div:VDQF (match_operand:VDQF 1 "register_operand" "w")
 		 (match_operand:VDQF 2 "register_operand" "w")))]
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index ca6035d..7b85a85 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -396,6 +396,7 @@ static const struct cpu_branch_cost cortexa57_branch_cost =
 /* Generic approximation modes.  */
 static const cpu_approx_modes generic_approx_modes =
 {
+  AARCH64_APPROX_NONE,	/* division  */
   AARCH64_APPROX_NONE,	/* sqrt  */
   AARCH64_APPROX_NONE	/* recip_sqrt  */
 };
@@ -403,6 +404,7 @@ static const cpu_approx_modes generic_approx_modes =
 /* Approximation modes for Exynos M1.  */
 static const cpu_approx_modes exynosm1_approx_modes =
 {
+  AARCH64_APPROX_NONE,	/* division  */
   AARCH64_APPROX_ALL,	/* sqrt  */
   AARCH64_APPROX_ALL	/* recip_sqrt  */
 };
@@ -410,6 +412,7 @@ static const cpu_approx_modes exynosm1_approx_modes =
 /* Approximation modes for X-Gene 1.  */
 static const cpu_approx_modes xgene1_approx_modes =
 {
+  AARCH64_APPROX_NONE,	/* division  */
   AARCH64_APPROX_NONE,	/* sqrt  */
   AARCH64_APPROX_ALL	/* recip_sqrt  */
 };
@@ -7487,6 +7490,95 @@ aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
   return true;
 }
 
+typedef rtx (*recpe_type) (rtx, rtx);
+
+/* Select reciprocal initial estimate insn depending on machine mode.  */
+
+static recpe_type
+get_recpe_type (machine_mode mode)
+{
+  switch (mode)
+  {
+    case SFmode:   return (gen_aarch64_frecpesf);
+    case V2SFmode: return (gen_aarch64_frecpev2sf);
+    case V4SFmode: return (gen_aarch64_frecpev4sf);
+    case DFmode:   return (gen_aarch64_frecpedf);
+    case V2DFmode: return (gen_aarch64_frecpev2df);
+    default:       gcc_unreachable ();
+  }
+}
+
+typedef rtx (*recps_type) (rtx, rtx, rtx);
+
+/* Select reciprocal series step insn depending on machine mode.  */
+
+static recps_type
+get_recps_type (machine_mode mode)
+{
+  switch (mode)
+  {
+    case SFmode:   return (gen_aarch64_frecpssf);
+    case V2SFmode: return (gen_aarch64_frecpsv2sf);
+    case V4SFmode: return (gen_aarch64_frecpsv4sf);
+    case DFmode:   return (gen_aarch64_frecpsdf);
+    case V2DFmode: return (gen_aarch64_frecpsv2df);
+    default:       gcc_unreachable ();
+  }
+}
+
+/* Emit the instruction sequence to compute the approximation for the division
+   of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
+
+bool
+aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
+{
+  machine_mode mode = GET_MODE (quo);
+  bool use_approx_division_p = (flag_mlow_precision_div
+			        || (aarch64_tune_params.approx_modes->division
+				    & AARCH64_APPROX_MODE (mode)));
+
+  if (!flag_finite_math_only
+      || flag_trapping_math
+      || !flag_unsafe_math_optimizations
+      || optimize_function_for_size_p (cfun)
+      || !use_approx_division_p)
+    return false;
+
+  /* Estimate the approximate reciprocal.  */
+  rtx xrcp = gen_reg_rtx (mode);
+  emit_insn ((*get_recpe_type (mode)) (xrcp, den));
+
+  /* Iterate over the series twice for SF and thrice for DF.  */
+  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
+
+  /* Optionally iterate over the series once less for faster performance,
+     while sacrificing the accuracy.  */
+  if (flag_mlow_precision_div)
+    iterations--;
+
+  /* Iterate over the series to calculate the approximate reciprocal.  */
+  rtx xtmp = gen_reg_rtx (mode);
+  while (iterations--)
+    {
+      emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
+
+      if (iterations > 0)
+	emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
+    }
+
+  if (num != CONST1_RTX (mode))
+    {
+      /* As the approximate reciprocal of DEN is already calculated, only
+	 calculate the approximate division when NUM is not 1.0.  */
+      rtx xnum = force_reg (mode, num);
+      emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
+    }
+
+  /* Finalize the approximation.  */
+  emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
+  return true;
+}
+
 /* Return the number of instructions that can be issued per cycle.  */
 static int
 aarch64_sched_issue_rate (void)
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index ba7d606..fbc6225 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4674,11 +4674,22 @@
   [(set_attr "type" "fmul<s>")]
 )
 
-(define_insn "div<mode>3"
+(define_expand "div<mode>3"
+ [(set (match_operand:GPF 0 "register_operand")
+       (div:GPF (match_operand:GPF 1 "general_operand")
+		(match_operand:GPF 2 "register_operand")))]
+ "TARGET_SIMD"
+{
+  if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
+    DONE;
+
+  operands[1] = force_reg (<MODE>mode, operands[1]);
+})
+
+(define_insn "*div<mode>3"
   [(set (match_operand:GPF 0 "register_operand" "=w")
-        (div:GPF
-         (match_operand:GPF 1 "register_operand" "w")
-         (match_operand:GPF 2 "register_operand" "w")))]
+        (div:GPF (match_operand:GPF 1 "register_operand" "w")
+	         (match_operand:GPF 2 "register_operand" "w")))]
   "TARGET_FLOAT"
   "fdiv\\t%<s>0, %<s>1, %<s>2"
   [(set_attr "type" "fdiv<s>")]
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index 3c4e7ae..bf6b475 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -161,3 +161,9 @@ Enable the square root approximation.  Enabling this reduces
 precision of square root results to about 16 bits for
 single precision and to 32 bits for double precision.
 If enabled, it implies -mlow-precision-recip-sqrt.
+
+mlow-precision-div
+Common Var(flag_mlow_precision_div) Optimization
+Enable the division approximation.  Enabling this reduces
+precision of division results to about 16 bits for
+single precision and to 32 bits for double precision.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 73a3fb8..4d7bcb7 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -577,6 +577,7 @@ Objective-C and Objective-C++ Dialects}.
 -mfix-cortex-a53-843419  -mno-fix-cortex-a53-843419 @gol
 -mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol
 -mlow-precision-sqrt -mno-low-precision-sqrt@gol
+-mlow-precision-div -mno-low-precision-div @gol
 -march=@var{name}  -mcpu=@var{name}  -mtune=@var{name}}
 
 @emph{Adapteva Epiphany Options}
@@ -13032,6 +13033,15 @@ precision of square root results to about 16 bits for
 single precision and to 32 bits for double precision.
 If enabled, it implies @option{-mlow-precision-recip-sqrt}.
 
+@item -mlow-precision-div
+@itemx -mno-low-precision-div
+@opindex mlow-precision-div
+@opindex mno-low-precision-div
+When calculating the division approximation,
+use one less step than otherwise, thus reducing latency and precision.
+This is only relevant if @option{-ffast-math} enables the division
+approximation.
+
 @item -march=@var{name}
 @opindex march
 Specify the name of the target architecture and, optionally, one or
-- 
2.6.3


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 3/3][AArch64] Emit division using the Newton series
  2016-06-03 21:50         ` Evandro Menezes
@ 2016-06-13 10:15           ` James Greenhalgh
  2016-06-13 19:06             ` Evandro Menezes
  0 siblings, 1 reply; 10+ messages in thread
From: James Greenhalgh @ 2016-06-13 10:15 UTC (permalink / raw)
  To: Evandro Menezes
  Cc: gcc-patches, Wilco Dijkstra, Andrew Pinski, philipp.tomsich,
	Benedikt Huber, nd

On Fri, Jun 03, 2016 at 04:50:24PM -0500, Evandro Menezes wrote:
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index 73a3fb8..4d7bcb7 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -577,6 +577,7 @@ Objective-C and Objective-C++ Dialects}.
>  -mfix-cortex-a53-843419  -mno-fix-cortex-a53-843419 @gol
>  -mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol
>  -mlow-precision-sqrt -mno-low-precision-sqrt@gol
> +-mlow-precision-div -mno-low-precision-div @gol
>  -march=@var{name}  -mcpu=@var{name}  -mtune=@var{name}}
>  
>  @emph{Adapteva Epiphany Options}
> @@ -13032,6 +13033,15 @@ precision of square root results to about 16 bits for
>  single precision and to 32 bits for double precision.
>  If enabled, it implies @option{-mlow-precision-recip-sqrt}.
>  
> +@item -mlow-precision-div
> +@item -mno-low-precision-div
> +@opindex -mlow-precision-div
> +@opindex -mno-low-precision-div
> +When calculating the division approximation,
> +uses one less step than otherwise, thus reducing latency and precision.

s/uses/use/

Otherwise, this is ok for trunk.

Thanks for your patience on this patch series.

Thanks,
James

> From d791090aae6a29fa94d8fc10894ee1053b05bcc2 Mon Sep 17 00:00:00 2001
> From: Evandro Menezes <e.menezes@samsung.com>
> Date: Mon, 4 Apr 2016 14:02:24 -0500
> Subject: [PATCH 3/3] [AArch64] Emit division using the Newton series
> 
> 2016-04-04  Evandro Menezes  <e.menezes@samsung.com>
>             Wilco Dijkstra  <Wilco.Dijkstra@arm.com>
> 
> gcc/
> 	* config/aarch64/aarch64-protos.h
> 	(cpu_approx_modes): Add new member "division".
> 	(aarch64_emit_approx_div): Declare new function.
> 	* config/aarch64/aarch64.c
> 	(generic_approx_modes): New member "division".
> 	(exynosm1_approx_modes): Likewise.
> 	(xgene1_approx_modes): Likewise.
> 	(aarch64_emit_approx_div): Define new function.
> 	* config/aarch64/aarch64.md ("div<mode>3"): New expansion.
> 	* config/aarch64/aarch64-simd.md ("div<mode>3"): Likewise.
> 	* config/aarch64/aarch64.opt (-mlow-precision-div): Add new option.
> 	* doc/invoke.texi (-mlow-precision-div): Describe new option.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 3/3][AArch64] Emit division using the Newton series
  2016-06-13 10:15           ` James Greenhalgh
@ 2016-06-13 19:06             ` Evandro Menezes
  2016-06-14  8:29               ` Christophe Lyon
  0 siblings, 1 reply; 10+ messages in thread
From: Evandro Menezes @ 2016-06-13 19:06 UTC (permalink / raw)
  To: James Greenhalgh
  Cc: gcc-patches, Wilco Dijkstra, Andrew Pinski, philipp.tomsich,
	Benedikt Huber, nd

On 06/13/16 05:15, James Greenhalgh wrote:
> Thanks for your patience on this patch series. 

Just checked the series in.

Thank y'all for your assistance and patience.

Cheers,

-- 
Evandro Menezes

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 3/3][AArch64] Emit division using the Newton series
  2016-06-13 19:06             ` Evandro Menezes
@ 2016-06-14  8:29               ` Christophe Lyon
  2016-06-14 15:59                 ` Evandro Menezes
  0 siblings, 1 reply; 10+ messages in thread
From: Christophe Lyon @ 2016-06-14  8:29 UTC (permalink / raw)
  To: Evandro Menezes
  Cc: James Greenhalgh, gcc-patches, Wilco Dijkstra, Andrew Pinski,
	philipp.tomsich, Benedikt Huber, nd

On 13 June 2016 at 21:06, Evandro Menezes <e.menezes@samsung.com> wrote:
> On 06/13/16 05:15, James Greenhalgh wrote:
>>
>> Thanks for your patience on this patch series.
>
>
> Just checked the series in.
>
Hi Evandro,
If I'm not mistaken, it looks like you forgot to update the ChangeLog
files in your commits.

Christophe

> Thank y'all for your assistance and patience.
>
> Cheers,
>
> --
> Evandro Menezes
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 3/3][AArch64] Emit division using the Newton series
  2016-06-14  8:29               ` Christophe Lyon
@ 2016-06-14 15:59                 ` Evandro Menezes
  0 siblings, 0 replies; 10+ messages in thread
From: Evandro Menezes @ 2016-06-14 15:59 UTC (permalink / raw)
  To: Christophe Lyon
  Cc: James Greenhalgh, gcc-patches, Wilco Dijkstra, Andrew Pinski,
	philipp.tomsich, Benedikt Huber, nd

On 06/14/16 03:28, Christophe Lyon wrote:
> On 13 June 2016 at 21:06, Evandro Menezes <e.menezes@samsung.com> wrote:
>> On 06/13/16 05:15, James Greenhalgh wrote:
>>> Thanks for your patience on this patch series.
>>
>> Just checked the series in.
>>
> If I'm not mistaken, it looks like you forgot to update the ChangeLog
> files in your commits.

Oh, @#$%!  Sorry about that.  Will update the ChangeLog accordingly.

Thank you,

-- 
Evandro Menezes

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2016-06-14 15:59 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-04-27 21:16 [PATCH 3/3][AArch64] Emit division using the Newton series Evandro Menezes
2016-05-25 18:57 ` James Greenhalgh
2016-05-28 13:53   ` Evandro Menezes
2016-05-31 10:54     ` James Greenhalgh
2016-05-31 21:31       ` Evandro Menezes
2016-06-03 21:50         ` Evandro Menezes
2016-06-13 10:15           ` James Greenhalgh
2016-06-13 19:06             ` Evandro Menezes
2016-06-14  8:29               ` Christophe Lyon
2016-06-14 15:59                 ` Evandro Menezes

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).