From 2cb6c0f35bbdc3b4cc6f88c61a50f3fbb168ec99 Mon Sep 17 00:00:00 2001 From: Evandro Menezes Date: Thu, 3 Mar 2016 18:13:46 -0600 Subject: [PATCH 1/3] [AArch64] Add more choices for the reciprocal square root approximation Allow a target to prefer this approximation depending on the mode of the operation. 2016-03-03 Evandro Menezes gcc/ * config/aarch64/aarch64-protos.h (AARCH64_APPROX_MODE): New macro. (AARCH64_APPROX_{NONE,SP,DP,DFORM,QFORM,SCALAR,VECTOR,ALL}): Likewise. (tune_params): New member "approx_rsqrt_modes". * config/aarch64/aarch64-tuning-flags.def (AARCH64_EXTRA_TUNE_APPROX_RSQRT): Remove macro. * config/aarch64/aarch64.c (generic_tunings): New member "approx_rsqrt_modes". (cortexa35_tunings): Likewise. (cortexa53_tunings): Likewise. (cortexa57_tunings): Likewise. (cortexa72_tunings): Likewise. (exynosm1_tunings): Likewise. (thunderx_tunings): Likewise. (xgene1_tunings): Likewise. (use_rsqrt_p): New argument for the mode and use new member from "tune_params". (aarch64_builtin_reciprocal): Derive mode from builtin. (aarch64_optab_supported_p): New argument for the mode. * doc/invoke.texi (-mlow-precision-recip-sqrt): Reword description. --- gcc/config/aarch64/aarch64-protos.h | 27 ++++++++++++++++++++ gcc/config/aarch64/aarch64-tuning-flags.def | 2 -- gcc/config/aarch64/aarch64.c | 39 ++++++++++++++++++----------- gcc/doc/invoke.texi | 2 +- 4 files changed, 53 insertions(+), 17 deletions(-) diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index f22a31c..50f1d24 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -178,6 +178,32 @@ struct cpu_branch_cost const int unpredictable; /* Unpredictable branch or optimizing for speed. */ }; +/* Control approximate alternatives to certain FP operators. */ +#define AARCH64_APPROX_MODE(MODE) \ + ((MIN_MODE_FLOAT <= (MODE) && (MODE) <= MAX_MODE_FLOAT) \ + ?
(1U << ((MODE) - MIN_MODE_FLOAT)) \ + : (MIN_MODE_VECTOR_FLOAT <= (MODE) && (MODE) <= MAX_MODE_VECTOR_FLOAT) \ + ? (1U << ((MODE) - MIN_MODE_VECTOR_FLOAT \ + + MAX_MODE_FLOAT - MIN_MODE_FLOAT + 1)) \ + : (0)) +#define AARCH64_APPROX_NONE (0) +#define AARCH64_APPROX_SP (AARCH64_APPROX_MODE (SFmode) \ + | AARCH64_APPROX_MODE (V2SFmode) \ + | AARCH64_APPROX_MODE (V4SFmode)) +#define AARCH64_APPROX_DP (AARCH64_APPROX_MODE (DFmode) \ + | AARCH64_APPROX_MODE (V2DFmode)) +#define AARCH64_APPROX_DFORM (AARCH64_APPROX_MODE (SFmode) \ + | AARCH64_APPROX_MODE (DFmode) \ + | AARCH64_APPROX_MODE (V2SFmode)) +#define AARCH64_APPROX_QFORM (AARCH64_APPROX_MODE (V4SFmode) \ + | AARCH64_APPROX_MODE (V2DFmode)) +#define AARCH64_APPROX_SCALAR (AARCH64_APPROX_MODE (SFmode) \ + | AARCH64_APPROX_MODE (DFmode)) +#define AARCH64_APPROX_VECTOR (AARCH64_APPROX_MODE (V2SFmode) \ + | AARCH64_APPROX_MODE (V4SFmode) \ + | AARCH64_APPROX_MODE (V2DFmode)) +#define AARCH64_APPROX_ALL (~0U) /* Every mode bit set. */ + struct tune_params { const struct cpu_cost_table *insn_extra_cost; @@ -218,6 +244,7 @@ struct tune_params } autoprefetcher_model; unsigned int extra_tuning_flags; + unsigned int approx_rsqrt_modes; }; #define AARCH64_FUSION_PAIR(x, name) \ diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def index 7e45a0c..048c2a3 100644 --- a/gcc/config/aarch64/aarch64-tuning-flags.def +++ b/gcc/config/aarch64/aarch64-tuning-flags.def @@ -29,5 +29,3 @@ AARCH64_TUNE_ to give an enum name.
*/ AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS) -AARCH64_EXTRA_TUNING_OPTION ("approx_rsqrt", APPROX_RSQRT) - diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 9995494..68381bf 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -38,6 +38,7 @@ #include "recog.h" #include "diagnostic.h" #include "insn-attr.h" +#include "insn-modes.h" #include "alias.h" #include "fold-const.h" #include "stor-layout.h" @@ -414,7 +415,8 @@ static const struct tune_params generic_tunings = 0, /* max_case_values. */ 0, /* cache_line_size. */ tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */ + (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + (AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */ }; static const struct tune_params cortexa35_tunings = @@ -439,7 +441,8 @@ static const struct tune_params cortexa35_tunings = 0, /* max_case_values. */ 0, /* cache_line_size. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */ + (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + (AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */ }; static const struct tune_params cortexa53_tunings = @@ -464,7 +467,8 @@ static const struct tune_params cortexa53_tunings = 0, /* max_case_values. */ 0, /* cache_line_size. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */ + (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + (AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */ }; static const struct tune_params cortexa57_tunings = @@ -489,7 +493,8 @@ static const struct tune_params cortexa57_tunings = 0, /* max_case_values. */ 0, /* cache_line_size. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */ + (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */ + (AARCH64_APPROX_NONE) /* approx_rsqrt_modes. 
*/ }; static const struct tune_params cortexa72_tunings = @@ -514,7 +519,8 @@ static const struct tune_params cortexa72_tunings = 0, /* max_case_values. */ 0, /* cache_line_size. */ tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */ + (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + (AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */ }; static const struct tune_params exynosm1_tunings = @@ -538,7 +544,8 @@ static const struct tune_params exynosm1_tunings = 48, /* max_case_values. */ 64, /* cache_line_size. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags. */ + (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + (AARCH64_APPROX_ALL) /* approx_rsqrt_modes. */ }; static const struct tune_params thunderx_tunings = @@ -562,7 +569,8 @@ static const struct tune_params thunderx_tunings = 0, /* max_case_values. */ 0, /* cache_line_size. */ tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */ + (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + (AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */ }; static const struct tune_params xgene1_tunings = @@ -586,7 +594,8 @@ static const struct tune_params xgene1_tunings = 0, /* max_case_values. */ 0, /* cache_line_size. */ tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags. */ + (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + (AARCH64_APPROX_ALL) /* approx_rsqrt_modes. */ }; /* Support for fine-grained override of the tuning structures. */ @@ -7452,12 +7461,12 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED, to optimize 1.0/sqrt. 
*/ static bool -use_rsqrt_p (void) +use_rsqrt_p (machine_mode mode) { return (!flag_trapping_math && flag_unsafe_math_optimizations - && ((aarch64_tune_params.extra_tuning_flags - & AARCH64_EXTRA_TUNE_APPROX_RSQRT) + && ((aarch64_tune_params.approx_rsqrt_modes + & AARCH64_APPROX_MODE (mode)) || flag_mrecip_low_precision_sqrt)); } @@ -7467,7 +7476,9 @@ use_rsqrt_p (void) static tree aarch64_builtin_reciprocal (tree fndecl) { - if (!use_rsqrt_p ()) + machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl)); + + if (!use_rsqrt_p (mode)) return NULL_TREE; return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl)); } @@ -13889,13 +13900,13 @@ aarch64_promoted_type (const_tree t) /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */ static bool -aarch64_optab_supported_p (int op, machine_mode, machine_mode, +aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode, optimization_type opt_type) { switch (op) { case rsqrt_optab: - return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (); + return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1); default: return true; diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index f1ac257..4340b08 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -12939,7 +12939,7 @@ corresponding flag to the linker. When calculating the reciprocal square root approximation, uses one less step than otherwise, thus reducing latency and precision. This is only relevant if @option{-ffast-math} enables the reciprocal square root -approximation, which in turn depends on the target processor. +approximation. @item -march=@var{name} @opindex march -- 2.6.3