From 4f61f722f744339650a48aa034906dd685110ae2 Mon Sep 17 00:00:00 2001 From: Evandro Menezes Date: Tue, 8 Mar 2016 15:06:03 -0600 Subject: [PATCH] Emit square root using the Newton series gcc/ * config/aarch64/aarch64-tuning-flags.def (AARCH64_EXTRA_TUNE_APPROX_SQRT_{DF,SF}): New tuning macros. * config/aarch64/aarch64-protos.h (AARCH64_EXTRA_TUNE_APPROX_SQRT): New macro. (aarch64_emit_approx_sqrt): Declare new function. * config/aarch64/aarch64.c (exynosm1_tunings): Use the new tuning flag for SFmode. (aarch64_emit_approx_rsqrt): Check the inner mode instead of listing each mode. (aarch64_emit_approx_sqrt): Define new function. * config/aarch64/aarch64.md (sqrt*2): New expansion and insn definitions. * config/aarch64/aarch64-simd.md (sqrt*2): Likewise. * config/aarch64/aarch64.opt (mlow-precision-recip-sqrt): Expand option description. * doc/invoke.texi (mlow-precision-recip-sqrt): Likewise. --- gcc/config/aarch64/aarch64-protos.h | 3 +++ gcc/config/aarch64/aarch64-simd.md | 25 ++++++++++++++++++++- gcc/config/aarch64/aarch64-tuning-flags.def | 3 ++- gcc/config/aarch64/aarch64.c | 35 ++++++++++++++++++++++++----- gcc/config/aarch64/aarch64.md | 25 ++++++++++++++++++++- gcc/config/aarch64/aarch64.opt | 4 ++-- gcc/doc/invoke.texi | 9 ++++---- 7 files changed, 89 insertions(+), 15 deletions(-) diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index ee3505c..3f7e76b 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -265,6 +265,8 @@ enum aarch64_extra_tuning_flags #define AARCH64_EXTRA_TUNE_APPROX_RSQRT \ (AARCH64_EXTRA_TUNE_APPROX_RSQRT_DF | AARCH64_EXTRA_TUNE_APPROX_RSQRT_SF) +#define AARCH64_EXTRA_TUNE_APPROX_SQRT \ + (AARCH64_EXTRA_TUNE_APPROX_SQRT_DF | AARCH64_EXTRA_TUNE_APPROX_SQRT_SF) extern struct tune_params aarch64_tune_params; @@ -364,6 +366,7 @@ void aarch64_register_pragmas (void); void aarch64_relayout_simd_types (void); void aarch64_reset_previous_fndecl (void); void aarch64_emit_approx_rsqrt (rtx, rtx); +void aarch64_emit_approx_sqrt (rtx, rtx); /* Initialize builtins for SIMD intrinsics. 
*/ void init_aarch64_simd_builtins (void); diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index bd73bce..afeca5a 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -4307,7 +4307,30 @@ ;; sqrt -(define_insn "sqrt<mode>2" +(define_expand "sqrt<mode>2" + [(set (match_operand:VDQF 0 "register_operand") + (sqrt:VDQF (match_operand:VDQF 1 "register_operand")))] + "TARGET_SIMD" +{ + machine_mode mode = GET_MODE_INNER (GET_MODE (operands[1])); + + if (flag_finite_math_only + && !flag_trapping_math + && flag_unsafe_math_optimizations + && !optimize_function_for_size_p (cfun) + && ((mode == SFmode + && (aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_APPROX_SQRT_SF)) + || (mode == DFmode + && (aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_APPROX_SQRT_DF)))) + { + aarch64_emit_approx_sqrt (operands[0], operands[1]); + DONE; + } +}) + +(define_insn "*sqrt<mode>2" [(set (match_operand:VDQF 0 "register_operand" "=w") (sqrt:VDQF (match_operand:VDQF 1 "register_operand" "w")))] "TARGET_SIMD" diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def index 57d9588..b4421b1 100644 --- a/gcc/config/aarch64/aarch64-tuning-flags.def +++ b/gcc/config/aarch64/aarch64-tuning-flags.def @@ -31,4 +31,5 @@ AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS) AARCH64_EXTRA_TUNING_OPTION ("approx_rsqrt", APPROX_RSQRT_DF) AARCH64_EXTRA_TUNING_OPTION ("approx_rsqrtf", APPROX_RSQRT_SF) - +AARCH64_EXTRA_TUNING_OPTION ("approx_sqrt", APPROX_SQRT_DF) +AARCH64_EXTRA_TUNING_OPTION ("approx_sqrtf", APPROX_SQRT_SF) diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 39a1a47..5e5dc5f 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -538,7 +538,8 @@ static const struct tune_params exynosm1_tunings = 48, /* max_case_values. */ 64, /* cache_line_size. 
*/ tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags. */ + (AARCH64_EXTRA_TUNE_APPROX_SQRT_SF + | AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags. */ }; static const struct tune_params thunderx_tunings = @@ -7537,9 +7538,8 @@ void aarch64_emit_approx_rsqrt (rtx dst, rtx src) { machine_mode mode = GET_MODE (src); - gcc_assert ( - mode == SFmode || mode == V2SFmode || mode == V4SFmode - || mode == DFmode || mode == V2DFmode); + gcc_assert (GET_MODE_INNER (mode) == SFmode + || GET_MODE_INNER (mode) == DFmode); rtx xsrc = gen_reg_rtx (mode); emit_move_insn (xsrc, src); @@ -7547,8 +7547,7 @@ aarch64_emit_approx_rsqrt (rtx dst, rtx src) emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc)); - bool double_mode = (mode == DFmode || mode == V2DFmode); - + bool double_mode = (GET_MODE_INNER (mode) == DFmode); int iterations = double_mode ? 3 : 2; /* Optionally iterate over the series one less time than otherwise. */ @@ -7571,6 +7570,30 @@ aarch64_emit_approx_rsqrt (rtx dst, rtx src) emit_move_insn (dst, x0); } +/* Emit insns storing in DST the approximate square root of SRC. */ + +void +aarch64_emit_approx_sqrt (rtx dst, rtx src) +{ + machine_mode mode = GET_MODE (src); + gcc_assert (GET_MODE_INNER (mode) == SFmode + || GET_MODE_INNER (mode) == DFmode); + + rtx xsrc = gen_reg_rtx (mode); + emit_move_insn (xsrc, src); + + /* Calculate the approximate square root by multiplying the approximate + reciprocal square root... */ + rtx xrsqrt = gen_reg_rtx (mode); + aarch64_emit_approx_rsqrt (xrsqrt, xsrc); + + /* ... by the original value. NOTE(review): check SRC == 0.0 (Inf * 0). */ + rtx xsqrt = gen_reg_rtx (mode); + emit_set_insn (xsqrt, gen_rtx_MULT (mode, xrsqrt, xsrc)); + + emit_move_insn (dst, xsqrt); +} + /* Return the number of instructions that can be issued per cycle. 
*/ static int aarch64_sched_issue_rate (void) diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 68676c9..bd9947a 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -4665,7 +4665,30 @@ [(set_attr "type" "ffarith")] ) -(define_insn "sqrt<mode>2" +(define_expand "sqrt<mode>2" + [(set (match_operand:GPF 0 "register_operand") + (sqrt:GPF (match_operand:GPF 1 "register_operand")))] + "TARGET_FLOAT" +{ + machine_mode mode = GET_MODE_INNER (GET_MODE (operands[1])); + + if (TARGET_SIMD && flag_finite_math_only + && !flag_trapping_math + && flag_unsafe_math_optimizations + && !optimize_function_for_size_p (cfun) + && ((mode == SFmode + && (aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_APPROX_SQRT_SF)) + || (mode == DFmode + && (aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_APPROX_SQRT_DF)))) + { + aarch64_emit_approx_sqrt (operands[0], operands[1]); + DONE; + } +}) + +(define_insn "*sqrt<mode>2" [(set (match_operand:GPF 0 "register_operand" "=w") (sqrt:GPF (match_operand:GPF 1 "register_operand" "w")))] "TARGET_FLOAT" diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt index 49ef0c6..8bb12d6 100644 --- a/gcc/config/aarch64/aarch64.opt +++ b/gcc/config/aarch64/aarch64.opt @@ -151,5 +151,5 @@ PC relative literal loads. mlow-precision-recip-sqrt Common Var(flag_mrecip_low_precision_sqrt) Optimization -When calculating the reciprocal square root approximation, -uses one less step than otherwise, thus reducing latency and precision. +When calculating the approximate square root or its approximate reciprocal, +use one less step than otherwise, thus reducing latency and precision. diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 62c70d5..24ad1f3 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -12887,10 +12887,11 @@ corresponding flag to the linker. 
@item -mno-low-precision-recip-sqrt @opindex -mlow-precision-recip-sqrt @opindex -mno-low-precision-recip-sqrt -When calculating the reciprocal square root approximation, -uses one less step than otherwise, thus reducing latency and precision. -This is only relevant if @option{-ffast-math} enables the reciprocal square root -approximation, which in turn depends on the target processor. +When calculating the approximate square root or its approximate reciprocal, +use one less step than otherwise, thus reducing latency and precision. +This is only relevant if @option{-ffast-math} enables +the approximate square root or its approximate reciprocal, +which in turn depends on the target processor. @item -march=@var{name} @opindex march -- 2.6.3