From cbc2b62f7df5c3e2fef2a24157b1bdd1a6de191b Mon Sep 17 00:00:00 2001
From: Evandro Menezes
Date: Mon, 4 Apr 2016 14:02:24 -0500
Subject: [PATCH 3/3] Emit division using the Newton series

2016-04-04  Evandro Menezes
	    Wilco Dijkstra

gcc/
	* config/aarch64/aarch64-protos.h (tune_params): Add new member
	"approx_div_modes".
	(aarch64_emit_approx_div): Declare new function.
	* config/aarch64/aarch64.c (generic_tunings): New member
	"approx_div_modes".
	(cortexa35_tunings): Likewise.
	(cortexa53_tunings): Likewise.
	(cortexa57_tunings): Likewise.
	(cortexa72_tunings): Likewise.
	(exynosm1_tunings): Likewise.
	(thunderx_tunings): Likewise.
	(xgene1_tunings): Likewise.
	(aarch64_emit_approx_div): Define new function.
	* config/aarch64/aarch64.md ("div<mode>3"): New expansion.
	* config/aarch64/aarch64-simd.md ("div<mode>3"): Likewise.
	* config/aarch64/aarch64.opt (-mlow-precision-div): Add new option.
	* doc/invoke.texi (-mlow-precision-div): Describe new option.
---
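Notes (not part of the commit message): the expansion added below follows the
usual Newton series for the reciprocal, x' = x * (2 - d * x), where FRECPE
supplies the initial estimate and FRECPS computes the correction factor
2 - d * x.  As a rough C model of the sequence that aarch64_emit_approx_div
below arranges -- an illustrative sketch only, with a hypothetical estimate()
standing in for FRECPE, not the generated code -- consider:

/* Illustrative sketch of the FRECPE/FRECPS/FMUL sequence emitted by
   aarch64_emit_approx_div.  */
#include <math.h>
#include <stdio.h>

/* Hypothetical stand-in for FRECPE: 1/d rounded to roughly 8 significant
   bits, so that the refinement steps below have real work to do.  */
static double
estimate (double d)
{
  int e;
  double m = frexp (1.0 / d, &e);               /* 1/d == m * 2^e */
  return ldexp (nearbyint (ldexp (m, 8)), e - 8);
}

/* ITERATIONS is 2 for SFmode and 3 for DFmode, one less with
   -mlow-precision-div.  */
static double
newton_div (double n, double d, int iterations)
{
  double x = estimate (d);                      /* x0 ~= 1/d         (FRECPE) */
  double t = 0.0;

  while (iterations--)
    {
      t = 2.0 - d * x;                          /* correction        (FRECPS) */
      if (iterations > 0)
        x = x * t;                              /* x' = x * t        (FMUL) */
    }

  x = x * n;                                    /* n/d ~= n * 1/d    (FMUL) */
  return x * t;                                 /* final refinement  (FMUL) */
}

int
main (void)
{
  printf ("%.17g\n", newton_div (1.0, 3.0, 3)); /* ~0.33333333333333331 */
  return 0;
}

Each step roughly doubles the number of correct bits in the estimate, which is
why the series is run twice for SFmode and thrice for DFmode, and why
-mlow-precision-div drops one step at the cost of accuracy.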
 gcc/config/aarch64/aarch64-protos.h |  2 +
 gcc/config/aarch64/aarch64-simd.md  | 14 +++++-
 gcc/config/aarch64/aarch64.c        | 85 +++++++++++++++++++++++++++++++++++++
 gcc/config/aarch64/aarch64.md       | 19 +++++++--
 gcc/config/aarch64/aarch64.opt      |  5 +++
 gcc/doc/invoke.texi                 | 10 +++++
 6 files changed, 130 insertions(+), 5 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 85ad796..649faf7 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -244,6 +244,7 @@ struct tune_params
   } autoprefetcher_model;
 
   unsigned int extra_tuning_flags;
+  unsigned int approx_div_modes;
   unsigned int approx_sqrt_modes;
   unsigned int approx_rsqrt_modes;
 };
@@ -390,6 +391,7 @@ void aarch64_relayout_simd_types (void);
 void aarch64_reset_previous_fndecl (void);
 void aarch64_save_restore_target_globals (tree);
 bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
+bool aarch64_emit_approx_div (rtx, rtx, rtx);
 
 /* Initialize builtins for SIMD intrinsics.  */
 void init_aarch64_simd_builtins (void);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 47ccb18..7e99e16 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1509,7 +1509,19 @@
   [(set_attr "type" "neon_fp_mul_<Vetype><q>")]
 )
 
-(define_insn "div<mode>3"
+(define_expand "div<mode>3"
+ [(set (match_operand:VDQF 0 "register_operand")
+       (div:VDQF (match_operand:VDQF 1 "general_operand")
+		 (match_operand:VDQF 2 "register_operand")))]
+ "TARGET_SIMD"
+{
+  if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
+    DONE;
+
+  operands[1] = force_reg (<MODE>mode, operands[1]);
+})
+
+(define_insn "*div<mode>3"
  [(set (match_operand:VDQF 0 "register_operand" "=w")
        (div:VDQF (match_operand:VDQF 1 "register_operand" "w")
 		 (match_operand:VDQF 2 "register_operand" "w")))]
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 4af2175..74310e8 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -417,6 +417,7 @@ static const struct tune_params generic_tunings =
   0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  (AARCH64_APPROX_NONE),	/* approx_div_modes.  */
   (AARCH64_APPROX_NONE),	/* approx_sqrt_modes.  */
   (AARCH64_APPROX_NONE)	/* approx_rsqrt_modes.  */
 };
@@ -444,6 +445,7 @@ static const struct tune_params cortexa35_tunings =
   0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  (AARCH64_APPROX_NONE),	/* approx_div_modes.  */
   (AARCH64_APPROX_NONE),	/* approx_sqrt_modes.  */
   (AARCH64_APPROX_NONE)	/* approx_rsqrt_modes.  */
 };
@@ -471,6 +473,7 @@ static const struct tune_params cortexa53_tunings =
   0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  (AARCH64_APPROX_NONE),	/* approx_div_modes.  */
   (AARCH64_APPROX_NONE),	/* approx_sqrt_modes.  */
   (AARCH64_APPROX_NONE)	/* approx_rsqrt_modes.  */
 };
@@ -498,6 +501,7 @@ static const struct tune_params cortexa57_tunings =
   0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS),	/* tune_flags.  */
+  (AARCH64_APPROX_NONE),	/* approx_div_modes.  */
   (AARCH64_APPROX_NONE),	/* approx_sqrt_modes.  */
   (AARCH64_APPROX_NONE)	/* approx_rsqrt_modes.  */
 };
@@ -525,6 +529,7 @@ static const struct tune_params cortexa72_tunings =
   0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  (AARCH64_APPROX_NONE),	/* approx_div_modes.  */
   (AARCH64_APPROX_NONE),	/* approx_sqrt_modes.  */
   (AARCH64_APPROX_NONE)	/* approx_rsqrt_modes.  */
 };
@@ -551,6 +556,7 @@ static const struct tune_params exynosm1_tunings =
   64,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  (AARCH64_APPROX_NONE),	/* approx_div_modes.  */
   (AARCH64_APPROX_ALL),	/* approx_sqrt_modes.  */
   (AARCH64_APPROX_ALL)	/* approx_rsqrt_modes.  */
 };
@@ -577,6 +583,7 @@ static const struct tune_params thunderx_tunings =
   0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  (AARCH64_APPROX_NONE),	/* approx_div_modes.  */
   (AARCH64_APPROX_NONE),	/* approx_sqrt_modes.  */
   (AARCH64_APPROX_NONE)	/* approx_rsqrt_modes.  */
 };
@@ -603,6 +610,7 @@ static const struct tune_params xgene1_tunings =
   0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  (AARCH64_APPROX_NONE),	/* approx_div_modes.  */
   (AARCH64_APPROX_NONE),	/* approx_sqrt_modes.  */
   (AARCH64_APPROX_ALL)	/* approx_rsqrt_modes.  */
 };
@@ -7604,6 +7612,83 @@ aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
   return true;
 }
 
+/* Emit the instruction sequence to compute the approximation for a division.  */
+
+bool
+aarch64_emit_approx_div (rtx quo, rtx num, rtx div)
+{
+  machine_mode mode = GET_MODE (quo);
+
+  if (!flag_finite_math_only
+      || flag_trapping_math
+      || !flag_unsafe_math_optimizations
+      || optimize_function_for_size_p (cfun)
+      || !(flag_mlow_precision_div
+	   || (aarch64_tune_params.approx_div_modes & AARCH64_APPROX_MODE (mode))))
+    return false;
+
+  /* Estimate the approximate reciprocal.  */
+  rtx xrcp = gen_reg_rtx (mode);
+  switch (mode)
+    {
+    case SFmode:
+      emit_insn (gen_aarch64_frecpesf (xrcp, div)); break;
+    case V2SFmode:
+      emit_insn (gen_aarch64_frecpev2sf (xrcp, div)); break;
+    case V4SFmode:
+      emit_insn (gen_aarch64_frecpev4sf (xrcp, div)); break;
+    case DFmode:
+      emit_insn (gen_aarch64_frecpedf (xrcp, div)); break;
+    case V2DFmode:
+      emit_insn (gen_aarch64_frecpev2df (xrcp, div)); break;
+    default:
+      gcc_unreachable ();
+    }
+
+  /* Iterate over the series twice for SF and thrice for DF.  */
+  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
+
+  /* Optionally iterate over the series once less for faster performance,
+     while sacrificing the accuracy.  */
+  if (flag_mlow_precision_div)
+    iterations--;
+
+  /* Iterate over the series to calculate the approximate reciprocal.  */
+  rtx xtmp = gen_reg_rtx (mode);
+  while (iterations--)
+    {
+      switch (mode)
+	{
+	case SFmode:
+	  emit_insn (gen_aarch64_frecpssf (xtmp, xrcp, div)); break;
+	case V2SFmode:
+	  emit_insn (gen_aarch64_frecpsv2sf (xtmp, xrcp, div)); break;
+	case V4SFmode:
+	  emit_insn (gen_aarch64_frecpsv4sf (xtmp, xrcp, div)); break;
+	case DFmode:
+	  emit_insn (gen_aarch64_frecpsdf (xtmp, xrcp, div)); break;
+	case V2DFmode:
+	  emit_insn (gen_aarch64_frecpsv2df (xtmp, xrcp, div)); break;
+	default:
+	  gcc_unreachable ();
+	}
+
+      if (iterations > 0)
+	emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
+    }
+
+  if (num != CONST1_RTX (mode))
+    {
+      /* Calculate the approximate division.  */
+      rtx xnum = force_reg (mode, num);
+      emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
+    }
+
+  /* Return the approximation.  */
+  emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
+  return true;
+}
+
 /* Return the number of instructions that can be issued per cycle.  */
 static int
 aarch64_sched_issue_rate (void)
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 43fa318..b42ce1a 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4647,11 +4647,22 @@
   [(set_attr "type" "fmul<s>")]
 )
 
-(define_insn "div<mode>3"
+(define_expand "div<mode>3"
+  [(set (match_operand:GPF 0 "register_operand")
+        (div:GPF (match_operand:GPF 1 "general_operand")
+                 (match_operand:GPF 2 "register_operand")))]
+  "TARGET_SIMD"
+{
+  if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
+    DONE;
+
+  operands[1] = force_reg (<MODE>mode, operands[1]);
+})
+
+(define_insn "*div<mode>3"
   [(set (match_operand:GPF 0 "register_operand" "=w")
-        (div:GPF
-         (match_operand:GPF 1 "register_operand" "w")
-         (match_operand:GPF 2 "register_operand" "w")))]
+        (div:GPF (match_operand:GPF 1 "register_operand" "w")
+                 (match_operand:GPF 2 "register_operand" "w")))]
   "TARGET_FLOAT"
   "fdiv\\t%<s>0, %<s>1, %<s>2"
   [(set_attr "type" "fdiv<s>")]
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index ffd5540..760bd50 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -158,3 +158,8 @@ mlow-precision-sqrt
 Common Var(flag_mlow_precision_sqrt) Optimization
 When calculating the approximate square root,
 use one less step than otherwise, thus reducing latency and precision.
+
+mlow-precision-div
+Common Var(flag_mlow_precision_div) Optimization
+When calculating the approximate division,
+use one less step than otherwise, thus reducing latency and precision.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 01c3e87..8d33997 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -574,6 +574,7 @@ Objective-C and Objective-C++ Dialects}.
 -mfix-cortex-a53-843419 -mno-fix-cortex-a53-843419 @gol
 -mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol
 -mlow-precision-sqrt -mno-low-precision-sqrt@gol
+-mlow-precision-div -mno-low-precision-div @gol
 -march=@var{name} -mcpu=@var{name} -mtune=@var{name}}
 
 @emph{Adapteva Epiphany Options}
@@ -12931,6 +12932,15 @@
 uses one less step than otherwise, thus reducing latency and precision.
 This is only relevant if @option{-ffast-math} enables the square root
 approximation.
+@item -mlow-precision-div
+@item -mno-low-precision-div
+@opindex -mlow-precision-div
+@opindex -mno-low-precision-div
+When calculating the division approximation,
+uses one less step than otherwise, thus reducing latency and precision.
+This is only relevant if @option{-ffast-math} enables the division
+approximation.
+
 @item -march=@var{name}
 @opindex march
 Specify the name of the target architecture and, optionally, one or
-- 
1.9.1
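For reference, a sketch of the kind of input that should exercise the new
expansion, assuming a compiler with this patch applied.  Since none of the
tunings above set approx_div_modes, the approximation path is only expected to
trigger when -mlow-precision-div is given explicitly, together with the
-ffast-math subflags that aarch64_emit_approx_div checks
(-funsafe-math-optimizations, -ffinite-math-only, -fno-trapping-math) and when
not optimizing for size; the exact instruction sequence is not guaranteed and
depends on the tuning in effect.

/* div.c -- with the patch applied, compiling with, e.g.,
     gcc -O2 -ffast-math -mlow-precision-div -S div.c
   is expected to expand these divisions through aarch64_emit_approx_div
   (FRECPE, FRECPS and FMUL) instead of a single FDIV.  */

float
div_sf (float n, float d)
{
  return n / d;         /* SFmode: 2 steps, 1 with -mlow-precision-div.  */
}

double
div_df (double n, double d)
{
  return n / d;         /* DFmode: 3 steps, 2 with -mlow-precision-div.  */
}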