From f173dace7b4137f8868a1a6ef9cdbbeefa92ffde Mon Sep 17 00:00:00 2001 From: Evandro Menezes Date: Thu, 3 Dec 2015 15:25:07 -0600 Subject: [PATCH] Emit square root using the Newton series 2015-12-03 Evandro Menezes gcc/ * config/aarch64/aarch64-protos.h (aarch64_emit_swsqrt): Declare new function. * config/aarch64/aarch64-simd.md (sqrt2): New expansion and insn definitions. * config/aarch64/aarch64-tuning-flags.def (AARCH64_EXTRA_TUNE_FAST_SQRT): New tuning macro. * config/aarch64/aarch64.c (aarch64_emit_swsqrt): Define new function. * config/aarch64/aarch64.md (sqrt2): New expansion and insn definitions. * config/aarch64/aarch64.opt (mlow-precision-recip-sqrt): Expand option description. * doc/invoke.texi (mlow-precision-recip-sqrt): Likewise. --- gcc/config/aarch64/aarch64-protos.h | 1 + gcc/config/aarch64/aarch64-simd.md | 18 +++++++++++++++++- gcc/config/aarch64/aarch64-tuning-flags.def | 2 +- gcc/config/aarch64/aarch64.c | 25 +++++++++++++++++++++++-- gcc/config/aarch64/aarch64.md | 18 +++++++++++++++++- gcc/config/aarch64/aarch64.opt | 2 +- gcc/doc/invoke.texi | 13 ++++++------- 7 files changed, 66 insertions(+), 13 deletions(-) diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 1e0fb4e..7fe6074 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -356,6 +356,7 @@ void aarch64_relayout_simd_types (void); void aarch64_reset_previous_fndecl (void); void aarch64_emit_swrsqrt (rtx, rtx); +void aarch64_emit_swsqrt (rtx, rtx); /* Initialize builtins for SIMD intrinsics. */ void init_aarch64_simd_builtins (void); diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 030a101..f6d2da4 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -4280,7 +4280,23 @@ ;; sqrt -(define_insn "sqrt2" +(define_expand "sqrt2" + [(set (match_operand:VDQF 0 "register_operand") + (sqrt:VDQF (match_operand:VDQF 1 "register_operand")))] + "TARGET_SIMD" +{ + if ((AARCH64_EXTRA_TUNE_FAST_SQRT & aarch64_tune_params.extra_tuning_flags) + && !optimize_function_for_size_p (cfun) + && flag_finite_math_only + && !flag_trapping_math + && flag_unsafe_math_optimizations) + { + aarch64_emit_swsqrt (operands[0], operands[1]); + DONE; + } +}) + +(define_insn "*sqrt2" [(set (match_operand:VDQF 0 "register_operand" "=w") (sqrt:VDQF (match_operand:VDQF 1 "register_operand" "w")))] "TARGET_SIMD" diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def index 6f7dbce..11c6c9a 100644 --- a/gcc/config/aarch64/aarch64-tuning-flags.def +++ b/gcc/config/aarch64/aarch64-tuning-flags.def @@ -30,4 +30,4 @@ AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS) AARCH64_EXTRA_TUNING_OPTION ("recip_sqrt", RECIP_SQRT) - +AARCH64_EXTRA_TUNING_OPTION ("fast_sqrt", FAST_SQRT) diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index ae4cfb3..3b58c35 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -533,8 +533,9 @@ static const struct tune_params exynosm1_tunings = 2, /* min_div_recip_mul_df. */ 48, /* max_case_values. */ 64, /* cache_line_size. */ - tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */ + tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ + (AARCH64_EXTRA_TUNE_RECIP_SQRT + | AARCH64_EXTRA_TUNE_FAST_SQRT) /* tune_flags. */ }; static const struct tune_params thunderx_tunings = @@ -7515,6 +7516,26 @@ aarch64_emit_swrsqrt (rtx dst, rtx src) emit_move_insn (dst, x0); } +/* Emit instruction sequence to compute the approximate square root. */ + +void +aarch64_emit_swsqrt (rtx dst, rtx src) +{ + machine_mode mode = GET_MODE (src); + gcc_assert (mode == SFmode || mode == V2SFmode || mode == V4SFmode + || mode == DFmode || mode == V2DFmode); + + rtx xsrc = gen_reg_rtx (mode); + emit_move_insn (xsrc, src); + + rtx xdst = gen_reg_rtx (mode); + + /* Calculate the approximate square root by multiplying the original operand + by its approximate reciprocal square root. */ + aarch64_emit_swrsqrt (xdst, xsrc); + emit_set_insn (dst, gen_rtx_MULT (mode, xdst, src)); +} + /* Return the number of instructions that can be issued per cycle. */ static int aarch64_sched_issue_rate (void) diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index d9fe1ae..d5930b9 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -4534,7 +4534,23 @@ [(set_attr "type" "ffarith")] ) -(define_insn "sqrt2" +(define_expand "sqrt2" + [(set (match_operand:GPF 0 "register_operand") + (sqrt:GPF (match_operand:GPF 1 "register_operand")))] + "TARGET_SIMD" +{ + if ((AARCH64_EXTRA_TUNE_FAST_SQRT & aarch64_tune_params.extra_tuning_flags) + && !optimize_function_for_size_p (cfun) + && flag_finite_math_only + && !flag_trapping_math + && flag_unsafe_math_optimizations) + { + aarch64_emit_swsqrt (operands[0], operands[1]); + DONE; + } +}) + +(define_insn "*sqrt2" [(set (match_operand:GPF 0 "register_operand" "=w") (sqrt:GPF (match_operand:GPF 1 "register_operand" "w")))] "TARGET_FLOAT" diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt index a0fbfd42..d02c5e8 100644 --- a/gcc/config/aarch64/aarch64.opt +++ b/gcc/config/aarch64/aarch64.opt @@ -151,5 +151,5 @@ PC relative literal loads. mlow-precision-recip-sqrt Common Var(flag_mrecip_low_precision_sqrt) Optimization -When calculating a sqrt approximation, run fewer steps. +Calculate the square-root or its reciprocal approximation in fewer steps. This reduces precision, but can result in faster computation. diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 5ab565c..f4a47a6 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -6141,7 +6141,7 @@ is usable even in freestanding environments. @opindex fsanitize-coverage=trace-pc Enable coverage-guided fuzzing code instrumentation. Inserts call to __sanitizer_cov_trace_pc into every basic block. - +- @item -fcheck-pointer-bounds @opindex fcheck-pointer-bounds @opindex fno-check-pointer-bounds @@ -12561,12 +12561,11 @@ corresponding flag to the linker. @item -mno-low-precision-recip-sqrt @opindex -mlow-precision-recip-sqrt @opindex -mno-low-precision-recip-sqrt -The square root estimate uses two steps instead of three for double-precision, -and one step instead of two for single-precision. -Thus reducing latency and precision. -This is only relevant if @option{-ffast-math} activates -reciprocal square root estimate instructions. -Which in turn depends on the target processor. +The square root and its reciprocal approximation use one step less than +otherwise, thus reducing latency and precision. +This is only relevant if @option{-ffast-math} enables +the square root or its reciprocal approximation, +which in turn depends on the target processor. @item -march=@var{name} @opindex march -- 1.9.1