From 750bd4f64cea8787eb077b7537cc7d8dceafac57 Mon Sep 17 00:00:00 2001
From: Evandro Menezes
Date: Thu, 17 Mar 2016 14:44:55 -0500
Subject: [PATCH] Emit division using the Newton series

2016-03-17  Evandro Menezes

gcc/
	* config/aarch64/aarch64-tuning-flags.def
	(AARCH64_EXTRA_TUNE_APPROX_DIV_{SF,DF}): New tuning macros.
	* config/aarch64/aarch64-protos.h
	(AARCH64_EXTRA_TUNE_APPROX_DIV): New macro.
	(aarch64_emit_approx_div): Declare new function.
	* config/aarch64/aarch64.c
	(exynosm1_tunings): Enable AARCH64_EXTRA_TUNE_APPROX_DIV.
	(aarch64_emit_approx_div): Define new function.
	* config/aarch64/aarch64.md ("div<mode>3"): New expansion.
	* config/aarch64/aarch64-simd.md ("div<mode>3"): Likewise.
---
 gcc/config/aarch64/aarch64-protos.h         |  4 ++
 gcc/config/aarch64/aarch64-simd.md          | 27 +++++++++++-
 gcc/config/aarch64/aarch64-tuning-flags.def |  3 +-
 gcc/config/aarch64/aarch64.c                | 73 ++++++++++++++++++++++++++++-
 gcc/config/aarch64/aarch64.md               | 34 ++++++++++++--
 5 files changed, 134 insertions(+), 7 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index dced209..847a282 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -263,6 +263,9 @@ enum aarch64_extra_tuning_flags
 };
 #undef AARCH64_EXTRA_TUNING_OPTION
 
+#define AARCH64_EXTRA_TUNE_APPROX_DIV \
+  (AARCH64_EXTRA_TUNE_APPROX_DIV_DF | AARCH64_EXTRA_TUNE_APPROX_DIV_SF)
+
 extern struct tune_params aarch64_tune_params;
 
 HOST_WIDE_INT aarch64_initial_elimination_offset (unsigned, unsigned);
@@ -362,6 +365,7 @@ void aarch64_relayout_simd_types (void);
 void aarch64_reset_previous_fndecl (void);
 void aarch64_save_restore_target_globals (tree);
 void aarch64_emit_approx_rsqrt (rtx, rtx);
+void aarch64_emit_approx_div (rtx, rtx, rtx);
 
 /* Initialize builtins for SIMD intrinsics.  */
 void init_aarch64_simd_builtins (void);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index bd73bce..f1e53be 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1509,7 +1509,32 @@
   [(set_attr "type" "neon_fp_mul_<Vetype><q>")]
 )
 
-(define_insn "div<mode>3"
+;; Expand vector FP division, optionally as a Newton series approximation.
+(define_expand "div<mode>3"
+  [(set (match_operand:VDQF 0 "register_operand")
+        (div:VDQF (match_operand:VDQF 1 "register_operand")
+                  (match_operand:VDQF 2 "register_operand")))]
+  "TARGET_SIMD"
+{
+  machine_mode mode = GET_MODE_INNER (GET_MODE (operands[1]));
+
+  if (flag_finite_math_only
+      && !flag_trapping_math
+      && flag_unsafe_math_optimizations
+      && !optimize_function_for_size_p (cfun)
+      && ((mode == SFmode
+           && (aarch64_tune_params.extra_tuning_flags
+               & AARCH64_EXTRA_TUNE_APPROX_DIV_SF))
+          || (mode == DFmode
+              && (aarch64_tune_params.extra_tuning_flags
+                  & AARCH64_EXTRA_TUNE_APPROX_DIV_DF))))
+    {
+      aarch64_emit_approx_div (operands[0], operands[1], operands[2]);
+      DONE;
+    }
+})
+
+(define_insn "*div<mode>3"
  [(set (match_operand:VDQF 0 "register_operand" "=w")
        (div:VDQF (match_operand:VDQF 1 "register_operand" "w")
                  (match_operand:VDQF 2 "register_operand" "w")))]
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index 7e45a0c..ececdc1 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -30,4 +30,5 @@
 
 AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
 AARCH64_EXTRA_TUNING_OPTION ("approx_rsqrt", APPROX_RSQRT)
-
+AARCH64_EXTRA_TUNING_OPTION ("approx_div", APPROX_DIV_DF)
+AARCH64_EXTRA_TUNING_OPTION ("approx_divf", APPROX_DIV_SF)
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 12e498d..97af0c0 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -538,7 +538,8 @@ static const struct tune_params exynosm1_tunings =
   48,   /* max_case_values.  */
   64,   /* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_APPROX_DIV
+   | AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags.  */
 };
 
 static const struct tune_params thunderx_tunings =
@@ -7540,6 +7541,76 @@ aarch64_emit_approx_rsqrt (rtx dst, rtx src)
   emit_move_insn (dst, x0);
 }
 
+/* Emit the instruction sequence that computes QUO = NUM / DIV approximately,
+   for scalar or vector SF/DF modes: an FRECPE reciprocal estimate of DIV is
+   refined with Newton series steps and then multiplied by NUM.  The result
+   is only an approximation, so the expanders call this solely under
+   finite-only, non-trapping, unsafe math.  */
+
+void
+aarch64_emit_approx_div (rtx quo, rtx num, rtx div)
+{
+  machine_mode mode = GET_MODE (quo);
+  gcc_assert (GET_MODE_INNER (mode) == SFmode
+              || GET_MODE_INNER (mode) == DFmode);
+
+  rtx xnum = gen_reg_rtx (mode);
+  emit_move_insn (xnum, num);
+
+  rtx xdiv = gen_reg_rtx (mode);
+  emit_move_insn (xdiv, div);
+
+  /* Estimate the approximate reciprocal.  */
+  rtx xrcp = gen_reg_rtx (mode);
+  switch (mode)
+    {
+      case SFmode:
+        emit_insn (gen_aarch64_frecpesf (xrcp, xdiv)); break;
+      case V2SFmode:
+        emit_insn (gen_aarch64_frecpev2sf (xrcp, xdiv)); break;
+      case V4SFmode:
+        emit_insn (gen_aarch64_frecpev4sf (xrcp, xdiv)); break;
+      case DFmode:
+        emit_insn (gen_aarch64_frecpedf (xrcp, xdiv)); break;
+      case V2DFmode:
+        emit_insn (gen_aarch64_frecpev2df (xrcp, xdiv)); break;
+      default:
+        gcc_unreachable ();
+    }
+
+  /* Iterate over the series twice for SF and thrice for DF.  */
+  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
+  while (iterations--)
+    {
+      rtx xtmp = gen_reg_rtx (mode);
+
+      /* FRECPS computes 2 - XRCP * XDIV, the correction factor of the
+         step XRCP' = XRCP * (2 - XDIV * XRCP).  */
+      switch (mode)
+        {
+          case SFmode:
+            emit_insn (gen_aarch64_frecpssf (xtmp, xrcp, xdiv)); break;
+          case V2SFmode:
+            emit_insn (gen_aarch64_frecpsv2sf (xtmp, xrcp, xdiv)); break;
+          case V4SFmode:
+            emit_insn (gen_aarch64_frecpsv4sf (xtmp, xrcp, xdiv)); break;
+          case DFmode:
+            emit_insn (gen_aarch64_frecpsdf (xtmp, xrcp, xdiv)); break;
+          case V2DFmode:
+            emit_insn (gen_aarch64_frecpsv2df (xtmp, xrcp, xdiv)); break;
+          default:
+            gcc_unreachable ();
+        }
+
+      emit_set_insn (xrcp, gen_rtx_MULT (mode, xtmp, xrcp));
+    }
+
+  rtx xquo = gen_reg_rtx (mode);
+  emit_set_insn (xquo, gen_rtx_MULT (mode, xnum, xrcp));
+
+  emit_move_insn (quo, xquo);
+}
+
 /* Return the number of instructions that can be issued per cycle.  */
 static int
 aarch64_sched_issue_rate (void)
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 68676c9..b5d61db 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4647,11 +4647,37 @@
   [(set_attr "type" "fmul<s>")]
 )
 
-(define_insn "div<mode>3"
+;; Expand scalar FP division, optionally as a Newton series approximation.
+(define_expand "div<mode>3"
+  [(set (match_operand:GPF 0 "register_operand")
+        (div:GPF (match_operand:GPF 1 "register_operand")
+                 (match_operand:GPF 2 "register_operand")))]
+  "TARGET_FLOAT"
+{
+  machine_mode mode = GET_MODE_INNER (GET_MODE (operands[1]));
+
+  /* The reciprocal estimate and step patterns require TARGET_SIMD.  */
+  if (TARGET_SIMD
+      && flag_finite_math_only
+      && !flag_trapping_math
+      && flag_unsafe_math_optimizations
+      && !optimize_function_for_size_p (cfun)
+      && ((mode == SFmode
+           && (aarch64_tune_params.extra_tuning_flags
+               & AARCH64_EXTRA_TUNE_APPROX_DIV_SF))
+          || (mode == DFmode
+              && (aarch64_tune_params.extra_tuning_flags
+                  & AARCH64_EXTRA_TUNE_APPROX_DIV_DF))))
+    {
+      aarch64_emit_approx_div (operands[0], operands[1], operands[2]);
+      DONE;
+    }
+})
+
+(define_insn "*div<mode>3"
   [(set (match_operand:GPF 0 "register_operand" "=w")
-        (div:GPF
-         (match_operand:GPF 1 "register_operand" "w")
-         (match_operand:GPF 2 "register_operand" "w")))]
+        (div:GPF (match_operand:GPF 1 "register_operand" "w")
+                 (match_operand:GPF 2 "register_operand" "w")))]
   "TARGET_FLOAT"
   "fdiv\\t%<s>0, %<s>1, %<s>2"
   [(set_attr "type" "fdiv<s>")]
-- 
1.9.1