public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] 2015-10-02  Benedikt Huber  <benedikt.huber@theobroma-systems.com>      Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
  2015-10-03 13:15 [PATCH v6][aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math Benedikt Huber
@ 2015-10-03 13:15 ` Benedikt Huber
  2015-10-09 11:45   ` James Greenhalgh
  2015-10-08 18:45 ` [PATCH v6][aarch64] " Evandro Menezes
  1 sibling, 1 reply; 23+ messages in thread
From: Benedikt Huber @ 2015-10-03 13:15 UTC (permalink / raw)
  To: gcc-patches
  Cc: philipp.tomsich, Venkataramanan.Kumar, pinskia, e.menezes,
	kyrylo.tkachov, Richard.Earnshaw, marcus.shawcroft,
	Benedikt Huber

	* config/aarch64/aarch64-builtins.c: Builtins for rsqrt and
	rsqrtf.
	* config/aarch64/aarch64-protos.h: Declare.
	* config/aarch64/aarch64-simd.md: Matching expressions for
	frsqrte and frsqrts.
	* config/aarch64/aarch64-tuning-flags.def: Added
	RECIP_SQRT.
	* config/aarch64/aarch64.c: New functions. Emit rsqrt
	estimation code in fast math mode.
	* config/aarch64/aarch64.md: Added enum entries.
	* config/aarch64/aarch64.opt: Added option
	-mlow-precision-recip-sqrt.
	* testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c: Assembly scans
	for frsqrte and frsqrts.
	* testsuite/gcc.target/aarch64/rsqrt_1.c: Functional tests for rsqrt.

Signed-off-by: Philipp Tomsich <philipp.tomsich@theobroma-systems.com>
---
 gcc/ChangeLog                                      |  19 ++++
 gcc/config/aarch64/aarch64-builtins.c              | 112 ++++++++++++++++++++
 gcc/config/aarch64/aarch64-protos.h                |   3 +
 gcc/config/aarch64/aarch64-simd.md                 |  27 +++++
 gcc/config/aarch64/aarch64-tuning-flags.def        |   1 +
 gcc/config/aarch64/aarch64.c                       | 115 ++++++++++++++++++++-
 gcc/config/aarch64/aarch64.md                      |   3 +
 gcc/config/aarch64/aarch64.opt                     |   4 +
 gcc/doc/invoke.texi                                |  12 +++
 .../gcc.target/aarch64/rsqrt-asm-check_1.c         |  65 ++++++++++++
 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c         | 111 ++++++++++++++++++++
 11 files changed, 467 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index c8200db..7226f29 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,22 @@
+2015-10-02  Benedikt Huber  <benedikt.huber@theobroma-systems.com>
+	    Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
+
+	* config/aarch64/aarch64-builtins.c: Builtins for rsqrt and
+	rsqrtf.
+	* config/aarch64/aarch64-protos.h: Declare.
+	* config/aarch64/aarch64-simd.md: Matching expressions for
+	frsqrte and frsqrts.
+	* config/aarch64/aarch64-tuning-flags.def: Added
+	RECIP_SQRT.
+	* config/aarch64/aarch64.c: New functions.  Emit rsqrt
+	estimation code in fast math mode.
+	* config/aarch64/aarch64.md: Added enum entries.
+	* config/aarch64/aarch64.opt: Added option
+	-mlow-precision-recip-sqrt.
+	* testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c: Assembly scans
+	for frsqrte and frsqrts.
+	* testsuite/gcc.target/aarch64/rsqrt_1.c: Functional tests for rsqrt.
+
 2015-10-01  Lynn Boger  <laboger@linux.vnet.ibm.com>
 
 	PR target/66870
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 80916a9..29cfbf5 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -344,6 +344,11 @@ enum aarch64_builtins
   AARCH64_BUILTIN_GET_FPSR,
   AARCH64_BUILTIN_SET_FPSR,
 
+  AARCH64_BUILTIN_RSQRT_DF,
+  AARCH64_BUILTIN_RSQRT_SF,
+  AARCH64_BUILTIN_RSQRT_V2DF,
+  AARCH64_BUILTIN_RSQRT_V2SF,
+  AARCH64_BUILTIN_RSQRT_V4SF,
   AARCH64_SIMD_BUILTIN_BASE,
   AARCH64_SIMD_BUILTIN_LANE_CHECK,
 #include "aarch64-simd-builtins.def"
@@ -842,6 +847,46 @@ aarch64_init_crc32_builtins ()
     }
 }
 
+/* Add builtins for reciprocal square root.  */
+
+void
+aarch64_add_builtin_rsqrt (void)
+{
+  tree fndecl = NULL;
+  tree ftype = NULL;
+
+  tree V2SF_type_node = build_vector_type (float_type_node, 2);
+  tree V2DF_type_node = build_vector_type (double_type_node, 2);
+  tree V4SF_type_node = build_vector_type (float_type_node, 4);
+
+  ftype = build_function_type_list (double_type_node, double_type_node,
+				    NULL_TREE);
+  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_df",
+    ftype, AARCH64_BUILTIN_RSQRT_DF, BUILT_IN_MD, NULL, NULL_TREE);
+  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF] = fndecl;
+
+  ftype = build_function_type_list (float_type_node, float_type_node,
+				    NULL_TREE);
+  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_sf",
+    ftype, AARCH64_BUILTIN_RSQRT_SF, BUILT_IN_MD, NULL, NULL_TREE);
+  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF] = fndecl;
+
+  ftype = build_function_type_list (V2DF_type_node, V2DF_type_node, NULL_TREE);
+  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v2df",
+    ftype, AARCH64_BUILTIN_RSQRT_V2DF, BUILT_IN_MD, NULL, NULL_TREE);
+  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF] = fndecl;
+
+  ftype = build_function_type_list (V2SF_type_node, V2SF_type_node, NULL_TREE);
+  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v2sf",
+    ftype, AARCH64_BUILTIN_RSQRT_V2SF, BUILT_IN_MD, NULL, NULL_TREE);
+  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF] = fndecl;
+
+  ftype = build_function_type_list (V4SF_type_node, V4SF_type_node, NULL_TREE);
+  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v4sf",
+    ftype, AARCH64_BUILTIN_RSQRT_V4SF, BUILT_IN_MD, NULL, NULL_TREE);
+  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF] = fndecl;
+}
+
 void
 aarch64_init_builtins (void)
 {
@@ -873,6 +918,7 @@ aarch64_init_builtins (void)
     aarch64_init_simd_builtins ();
 
   aarch64_init_crc32_builtins ();
+  aarch64_add_builtin_rsqrt ();
 }
 
 tree
@@ -1136,6 +1182,41 @@ aarch64_crc32_expand_builtin (int fcode, tree exp, rtx target)
   return target;
 }
 
+/* Function to expand reciprocal square root builtins.  */
+
+static rtx
+aarch64_expand_builtin_rsqrt (int fcode, tree exp, rtx target)
+{
+  rtx pat;
+  tree arg0 = CALL_EXPR_ARG (exp, 0);
+  rtx op0 = expand_normal (arg0);
+
+  enum insn_code c;
+
+  switch (fcode)
+    {
+      case AARCH64_BUILTIN_RSQRT_DF:
+	c = CODE_FOR_rsqrt_df2; break;
+      case AARCH64_BUILTIN_RSQRT_SF:
+	c = CODE_FOR_rsqrt_sf2; break;
+      case AARCH64_BUILTIN_RSQRT_V2DF:
+	c = CODE_FOR_rsqrt_v2df2; break;
+      case AARCH64_BUILTIN_RSQRT_V2SF:
+	c = CODE_FOR_rsqrt_v2sf2; break;
+      case AARCH64_BUILTIN_RSQRT_V4SF:
+	c = CODE_FOR_rsqrt_v4sf2; break;
+	  default: gcc_unreachable ();
+    }
+
+  if (!target)
+    target = gen_reg_rtx (GET_MODE (op0));
+
+  pat = GEN_FCN (c) (target, op0);
+  emit_insn (pat);
+
+  return target;
+}
+
 /* Expand an expression EXP that calls a built-in function,
    with result going to TARGET if that's convenient.  */
 rtx
@@ -1183,6 +1264,13 @@ aarch64_expand_builtin (tree exp,
   else if (fcode >= AARCH64_CRC32_BUILTIN_BASE && fcode <= AARCH64_CRC32_BUILTIN_MAX)
     return aarch64_crc32_expand_builtin (fcode, exp, target);
 
+  if (fcode == AARCH64_BUILTIN_RSQRT_DF
+      || fcode == AARCH64_BUILTIN_RSQRT_SF
+      || fcode == AARCH64_BUILTIN_RSQRT_V2DF
+      || fcode == AARCH64_BUILTIN_RSQRT_V2SF
+      || fcode == AARCH64_BUILTIN_RSQRT_V4SF)
+    return aarch64_expand_builtin_rsqrt (fcode, exp, target);
+
   gcc_unreachable ();
 }
 
@@ -1340,6 +1428,30 @@ aarch64_builtin_vectorized_function (tree fndecl, tree type_out, tree type_in)
   return NULL_TREE;
 }
 
+/* Return builtin for reciprocal square root.  */
+
+tree
+aarch64_builtin_rsqrt (unsigned int fn, bool md_fn)
+{
+  if (md_fn)
+    {
+      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2df)
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF];
+      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2sf)
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF];
+      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv4sf)
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF];
+    }
+  else
+    {
+      if (fn == BUILT_IN_SQRT)
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF];
+      if (fn == BUILT_IN_SQRTF)
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF];
+    }
+  return NULL_TREE;
+}
+
 #undef VAR1
 #define VAR1(T, N, MAP, A) \
   case AARCH64_SIMD_BUILTIN_##T##_##N##A:
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index baaf1bd..455b1da 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -332,6 +332,8 @@ void aarch64_register_pragmas (void);
 void aarch64_relayout_simd_types (void);
 void aarch64_reset_previous_fndecl (void);
 
+void aarch64_emit_swrsqrt (rtx, rtx);
+
 /* Initialize builtins for SIMD intrinsics.  */
 void init_aarch64_simd_builtins (void);
 
@@ -400,4 +402,5 @@ int aarch64_ccmp_mode_to_code (enum machine_mode mode);
 bool extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset);
 bool aarch64_operands_ok_for_ldpstp (rtx *, bool, enum machine_mode);
 bool aarch64_operands_adjust_ok_for_ldpstp (rtx *, bool, enum machine_mode);
+tree aarch64_builtin_rsqrt (unsigned int fn, bool md_fn);
 #endif /* GCC_AARCH64_PROTOS_H */
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 541faf9..d48ad3b 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -354,6 +354,33 @@
   [(set_attr "type" "neon_fp_mul_d_scalar_q")]
 )
 
+(define_insn "rsqrte_<mode>2"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
+		     UNSPEC_RSQRTE))]
+  "TARGET_SIMD"
+  "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
+  [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")])
+
+(define_insn "rsqrts_<mode>3"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")
+	       (match_operand:VALLF 2 "register_operand" "w")]
+		     UNSPEC_RSQRTS))]
+  "TARGET_SIMD"
+  "frsqrts\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
+  [(set_attr "type" "neon_fp_rsqrts_<Vetype><q>")])
+
+(define_expand "rsqrt_<mode>2"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
+		     UNSPEC_RSQRT))]
+  "TARGET_SIMD"
+{
+  aarch64_emit_swrsqrt (operands[0], operands[1]);
+  DONE;
+})
+
 (define_insn "*aarch64_mul3_elt_to_64v2df"
   [(set (match_operand:DF 0 "register_operand" "=w")
      (mult:DF
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index 628386b..6f7dbce 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -29,4 +29,5 @@
      AARCH64_TUNE_ to give an enum name. */
 
 AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
+AARCH64_EXTRA_TUNING_OPTION ("recip_sqrt", RECIP_SQRT)
 
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 034da7c..5ddfa5d 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -408,7 +408,8 @@ static const struct tune_params cortexa57_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
-  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS)	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
+   | AARCH64_EXTRA_TUNE_RECIP_SQRT)	/* tune_flags.  */
 };
 
 static const struct tune_params cortexa72_tunings =
@@ -472,7 +473,7 @@ static const struct tune_params xgene1_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
-  (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_RECIP_SQRT)	/* tune_flags.  */
 };
 
 /* Support for fine-grained override of the tuning structures.  */
@@ -7009,6 +7010,107 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
   return aarch64_tune_params.memmov_cost;
 }
 
+/* Return the rsqrt builtin for FN, or NULL_TREE if trapping or safe math is
+   in effect, we optimize for size, or the tuning lacks RECIP_SQRT.  */
+
+static tree
+aarch64_builtin_reciprocal (unsigned int fn,
+			    bool md_fn,
+			    bool)
+{
+  if (flag_trapping_math
+      || !flag_unsafe_math_optimizations
+      || optimize_size
+      || !(aarch64_tune_params.extra_tuning_flags
+	    & AARCH64_EXTRA_TUNE_RECIP_SQRT))
+  {
+    return NULL_TREE;
+  }
+
+  return aarch64_builtin_rsqrt (fn, md_fn);
+}
+
+typedef rtx (*rsqrte_type) (rtx, rtx);
+
+/* Return the generator of the reciprocal square root initial estimate
+   (frsqrte) insn for MODE.  Only SF, DF, V2SF, V4SF and V2DF are valid.  */
+
+static rsqrte_type
+get_rsqrte_type (enum machine_mode mode)
+{
+  switch (mode)
+  {
+    case DFmode:   return gen_rsqrte_df2;
+    case SFmode:   return gen_rsqrte_sf2;
+    case V2DFmode: return gen_rsqrte_v2df2;
+    case V2SFmode: return gen_rsqrte_v2sf2;
+    case V4SFmode: return gen_rsqrte_v4sf2;
+    default: gcc_unreachable ();
+  }
+}
+
+typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
+
+/* Return the generator of the reciprocal square root Newton-Raphson step
+   (frsqrts) insn for MODE.  Only SF, DF, V2SF, V4SF and V2DF are valid.  */
+
+static rsqrts_type
+get_rsqrts_type (enum machine_mode mode)
+{
+  switch (mode)
+  {
+    case DFmode:   return gen_rsqrts_df3;
+    case SFmode:   return gen_rsqrts_sf3;
+    case V2DFmode: return gen_rsqrts_v2df3;
+    case V2SFmode: return gen_rsqrts_v2sf3;
+    case V4SFmode: return gen_rsqrts_v4sf3;
+    default: gcc_unreachable ();
+  }
+}
+
+/* Emit instruction sequence to compute reciprocal square root.  Use two
+   Newton-Raphson steps for single precision and three for double
+   precision; one step fewer with -mlow-precision-recip-sqrt.  */
+
+void
+aarch64_emit_swrsqrt (rtx dst, rtx src)
+{
+  enum machine_mode mode = GET_MODE (src);
+  gcc_assert (
+    mode == SFmode || mode == V2SFmode || mode == V4SFmode ||
+    mode == DFmode || mode == V2DFmode);
+
+  rtx xsrc = gen_reg_rtx (mode);
+  emit_move_insn (xsrc, src);
+  rtx x0 = gen_reg_rtx (mode);
+
+  emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
+
+  bool double_mode = (mode == DFmode || mode == V2DFmode);
+
+  int iterations = 2;
+  if (double_mode)
+    iterations = 3;
+
+  if (flag_mrecip_low_precision_sqrt)
+    iterations--;
+
+  for (int i = 0; i < iterations; ++i)
+    {
+      rtx x1 = gen_reg_rtx (mode);
+      rtx x2 = gen_reg_rtx (mode);
+      rtx x3 = gen_reg_rtx (mode);
+      emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
+
+      emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
+
+      emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
+      x0 = x1;
+    }
+
+  emit_move_insn (dst, x0);
+}
+
 /* Return the number of instructions that can be issued per cycle.  */
 static int
 aarch64_sched_issue_rate (void)
@@ -13387,6 +13489,9 @@ aarch64_promoted_type (const_tree t)
 #undef TARGET_BUILD_BUILTIN_VA_LIST
 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
 
+#undef TARGET_BUILTIN_DECL
+#define TARGET_BUILTIN_DECL aarch64_builtin_decl
+
 #undef TARGET_CALLEE_COPIES
 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
 
@@ -13418,9 +13523,6 @@ aarch64_promoted_type (const_tree t)
 #undef TARGET_CLASS_MAX_NREGS
 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
 
-#undef TARGET_BUILTIN_DECL
-#define TARGET_BUILTIN_DECL aarch64_builtin_decl
-
 #undef  TARGET_EXPAND_BUILTIN
 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
 
@@ -13561,6 +13663,9 @@ aarch64_promoted_type (const_tree t)
 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
 
+#undef TARGET_BUILTIN_RECIPROCAL
+#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
+
 #undef TARGET_VECTOR_MODE_SUPPORTED_P
 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index c3cd58d..51c2b87 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -126,6 +126,9 @@
     UNSPEC_VSTRUCTDUMMY
     UNSPEC_SP_SET
     UNSPEC_SP_TEST
+    UNSPEC_RSQRT
+    UNSPEC_RSQRTE
+    UNSPEC_RSQRTS
 ])
 
 (define_c_enum "unspecv" [
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index a1ce58d..00084ea 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -148,3 +148,7 @@ Enum(aarch64_abi) String(lp64) Value(AARCH64_ABI_LP64)
 mpc-relative-literal-loads
 Target Report Save Var(nopcrelative_literal_loads) Init(2) Save
 PC relative literal loads.
+
+mlow-precision-recip-sqrt
+Common Var(flag_mrecip_low_precision_sqrt) Optimization
+Run fewer approximation steps to reduce latency and precision.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index c19be78..8b45837 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -519,6 +519,7 @@ Objective-C and Objective-C++ Dialects}.
 -mtls-size=@var{size} @gol
 -mfix-cortex-a53-835769  -mno-fix-cortex-a53-835769 @gol
 -mfix-cortex-a53-843419  -mno-fix-cortex-a53-843419 @gol
+-mlow-precision-recip-sqrt  -mno-low-precision-recip-sqrt @gol
 -march=@var{name}  -mcpu=@var{name}  -mtune=@var{name}}
 
 @emph{Adapteva Epiphany Options}
@@ -12445,6 +12446,17 @@ Enable or disable the workaround for the ARM Cortex-A53 erratum number 843419.
 This erratum workaround is made at link time and this will only pass the
 corresponding flag to the linker.
 
+@item -mlow-precision-recip-sqrt
+@itemx -mno-low-precision-recip-sqrt
+@opindex mlow-precision-recip-sqrt
+@opindex mno-low-precision-recip-sqrt
+The reciprocal square root estimate uses two refinement steps instead of
+three for double-precision, and one step instead of two for
+single-precision, thus reducing both latency and precision.
+This is only relevant if @option{-ffast-math} activates
+reciprocal square root estimate instructions,
+which in turn depends on the CPU core.
+
 @item -march=@var{name}
 @opindex march
 Specify the name of the target architecture, optionally suffixed by one or
diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c
new file mode 100644
index 0000000..9f17990
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c
@@ -0,0 +1,65 @@
+/* Test for the recip_sqrt tuning
+   ensuring the correct instructions are generated.  */
+/* { dg-do compile } */
+/* { dg-options "-O3 --std=c99 --save-temps -fverbose-asm -funsafe-math-optimizations -fno-math-errno" } */
+
+#define sqrt_float   __builtin_sqrtf
+#define sqrt_double  __builtin_sqrt
+
+#define TESTTYPE(TYPE) \
+typedef struct { \
+  TYPE a; \
+  TYPE b; \
+  TYPE c; \
+  TYPE d; \
+} s4_##TYPE; \
+\
+typedef struct { \
+  TYPE a; \
+  TYPE b; \
+} s2_##TYPE; \
+\
+s4_##TYPE \
+rsqrtv4_##TYPE (s4_##TYPE i) \
+{ \
+  s4_##TYPE o; \
+  o.a = 1.0 / sqrt_##TYPE (i.a); \
+  o.b = 1.0 / sqrt_##TYPE (i.b); \
+  o.c = 1.0 / sqrt_##TYPE (i.c); \
+  o.d = 1.0 / sqrt_##TYPE (i.d); \
+  return o; \
+} \
+\
+s2_##TYPE \
+rsqrtv2_##TYPE (s2_##TYPE i) \
+{ \
+  s2_##TYPE o; \
+  o.a = 1.0 / sqrt_##TYPE (i.a); \
+  o.b = 1.0 / sqrt_##TYPE (i.b); \
+  return o; \
+} \
+\
+TYPE \
+rsqrt_##TYPE (TYPE i) \
+{ \
+  return 1.0 / sqrt_##TYPE (i); \
+} \
+
+TESTTYPE (double)
+TESTTYPE (float)
+
+/* { dg-final { scan-assembler-times "frsqrte\\td\[0-9\]+, d\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 3 } } */
+
+/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2d, v\[0-9\]+.2d" 3 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2d, v\[0-9\]+.2d, v\[0-9\]+.2d" 9 } } */
+
+
+/* { dg-final { scan-assembler-times "frsqrte\\ts\[0-9\]+, s\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\ts\[0-9\]+, s\[0-9\]+, s\[0-9\]+" 2 } } */
+
+/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.4s, v\[0-9\]+.4s" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.4s, v\[0-9\]+.4s, v\[0-9\]+.4s" 2 } } */
+
+/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2s, v\[0-9\]+.2s" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2s, v\[0-9\]+.2s, v\[0-9\]+.2s" 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt_1.c b/gcc/testsuite/gcc.target/aarch64/rsqrt_1.c
new file mode 100644
index 0000000..624f9b2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/rsqrt_1.c
@@ -0,0 +1,111 @@
+/* Test for the recip_sqrt tuning
+   ensuring functionality and sufficient accuracy.  */
+/* { dg-do run } */
+/* { dg-options "-O3 --std=c99 --save-temps -fverbose-asm -funsafe-math-optimizations -fno-math-errno" } */
+
+#define PI    3.141592653589793
+#define SQRT2 1.4142135623730951
+
+#define PI_4 0.7853981633974483
+#define SQRT1_2 0.7071067811865475
+
+/* 2^25+1, float has 24 significand bits
+ *       according to Single-precision floating-point format.  */
+#define TESTA8_FLT 33554433
+/* 2^54+1, double has 53 significand bits
+ *       according to Double-precision floating-point format.  */
+#define TESTA8_DBL 18014398509481985
+
+#define EPSILON_double __DBL_EPSILON__
+#define EPSILON_float __FLT_EPSILON__
+#define ABS_double __builtin_fabs
+#define ABS_float __builtin_fabsf
+#define SQRT_double __builtin_sqrt
+#define SQRT_float __builtin_sqrtf
+#define ISNAN_double __builtin_isnan
+#define ISNAN_float __builtin_isnanf
+
+extern void abort (void);
+
+#define TESTTYPE(TYPE) \
+TYPE \
+rsqrt_##TYPE (TYPE a) \
+{ \
+  return 1.0/SQRT_##TYPE (a); \
+} \
+\
+int \
+equals_##TYPE (TYPE a, TYPE b) \
+{ \
+  return (a == b || \
+   (ISNAN_##TYPE (a) && ISNAN_##TYPE (b)) || \
+   (ABS_##TYPE (a - b) < EPSILON_##TYPE)); \
+} \
+\
+void \
+t_##TYPE (TYPE a, TYPE result) \
+{ \
+  TYPE r = rsqrt_##TYPE (a); \
+  if (!equals_##TYPE (r, result)) \
+  { \
+    abort (); \
+  } \
+} \
+
+TESTTYPE (double)
+TESTTYPE (float)
+
+int
+main ()
+{
+  double nan = __builtin_nan ("");
+  double inf = __builtin_inf ();
+  float nanf = __builtin_nanf ("");
+  float inff = __builtin_inff ();
+
+  t_double (1.0/256, 0X1.00000000000000P+4);
+  t_double (1.0, 0X1.00000000000000P+0);
+  t_double (-1.0, nan);
+  t_double (11.0, 0X1.34BF63D1568260P-2);
+  t_double (0.0,  inf);
+  t_double (inf, 0X0.00000000000000P+0);
+  t_double (nan, nan);
+  t_double (-nan, -nan);
+  t_double (__DBL_MAX__, 0X1.00000000000010P-512);
+  t_double (__DBL_MIN__, 0X1.00000000000000P+511);
+  t_double (PI, 0X1.20DD750429B6D0P-1);
+  t_double (PI_4, 0X1.20DD750429B6D0P+0);
+  t_double (SQRT2, 0X1.AE89F995AD3AE0P-1);
+  t_double (SQRT1_2, 0X1.306FE0A31B7150P+0);
+  t_double (-PI, nan);
+  t_double (-SQRT2, nan);
+  t_double (TESTA8_DBL, 0X1.00000000000000P-27);
+
+  t_float (1.0/256, 0X1.00000000000000P+4);
+  t_float (1.0, 0X1.00000000000000P+0);
+  t_float (-1.0, nanf);
+  t_float (11.0, 0X1.34BF6400000000P-2);
+  t_float (0.0,  inff);
+  t_float (inff, 0X0.00000000000000P+0);
+  t_float (nanf, nanf);
+  t_float (-nanf, -nanf);
+  t_float (__FLT_MAX__, 0X1.00000200000000P-64);
+  t_float (__FLT_MIN__, 0X1.00000000000000P+63);
+  t_float (PI, 0X1.20DD7400000000P-1);
+  t_float (PI_4, 0X1.20DD7400000000P+0);
+  t_float (SQRT2, 0X1.AE89FA00000000P-1);
+  t_float (SQRT1_2, 0X1.306FE000000000P+0);
+  t_float (-PI, nanf);
+  t_float (-SQRT2, nanf);
+  t_float (TESTA8_FLT, 0X1.6A09E600000000P-13);
+
+//   With -ffast-math these return positive INF.
+//   t_double (-0.0, -inf);
+//   t_float (-0.0, -inff);
+
+//   The reason here is that -ffast-math flushes to zero.
+//   t_double  (__DBL_MIN__/256, 0X1.00000000000000P+515);
+//   t_float (__FLT_MIN__/256, 0X1.00000000000000P+67);
+
+  return 0;
+}
-- 
1.9.1

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH v6][aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math
@ 2015-10-03 13:15 Benedikt Huber
  2015-10-03 13:15 ` [PATCH] 2015-10-02 Benedikt Huber <benedikt.huber@theobroma-systems.com> Philipp Tomsich <philipp.tomsich@theobroma-systems.com> Benedikt Huber
  2015-10-08 18:45 ` [PATCH v6][aarch64] " Evandro Menezes
  0 siblings, 2 replies; 23+ messages in thread
From: Benedikt Huber @ 2015-10-03 13:15 UTC (permalink / raw)
  To: gcc-patches
  Cc: philipp.tomsich, Venkataramanan.Kumar, pinskia, e.menezes,
	kyrylo.tkachov, Richard.Earnshaw, marcus.shawcroft,
	Benedikt Huber

This sixth revision of the patch:
 * Cleans up style issues.
 * Makes test conform to standards.

Ok for check in.

Benedikt Huber (1):
  2015-10-02  Benedikt Huber  <benedikt.huber@theobroma-systems.com>    
    	    Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>

 gcc/ChangeLog                                      |  19 ++++
 gcc/config/aarch64/aarch64-builtins.c              | 112 ++++++++++++++++++++
 gcc/config/aarch64/aarch64-protos.h                |   3 +
 gcc/config/aarch64/aarch64-simd.md                 |  27 +++++
 gcc/config/aarch64/aarch64-tuning-flags.def        |   1 +
 gcc/config/aarch64/aarch64.c                       | 115 ++++++++++++++++++++-
 gcc/config/aarch64/aarch64.md                      |   3 +
 gcc/config/aarch64/aarch64.opt                     |   4 +
 gcc/doc/invoke.texi                                |  12 +++
 .../gcc.target/aarch64/rsqrt-asm-check_1.c         |  65 ++++++++++++
 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c         | 111 ++++++++++++++++++++
 11 files changed, 467 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c

-- 
1.9.1

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v6][aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math
  2015-10-03 13:15 [PATCH v6][aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math Benedikt Huber
  2015-10-03 13:15 ` [PATCH] 2015-10-02 Benedikt Huber <benedikt.huber@theobroma-systems.com> Philipp Tomsich <philipp.tomsich@theobroma-systems.com> Benedikt Huber
@ 2015-10-08 18:45 ` Evandro Menezes
  1 sibling, 0 replies; 23+ messages in thread
From: Evandro Menezes @ 2015-10-08 18:45 UTC (permalink / raw)
  To: Benedikt Huber, gcc-patches
  Cc: philipp.tomsich, Venkataramanan.Kumar, pinskia, kyrylo.tkachov,
	Richard.Earnshaw, marcus.shawcroft

LGTM

-- 
Evandro Menezes

On 01/02/1970 09:27 PM, Benedikt Huber wrote:
> This sixth revision of the patch:
>   * Cleans up style issues.
>   * Makes test conform to standards.
>
> Ok for check in.
>
> Benedikt Huber (1):
>    2015-10-02  Benedikt Huber  <benedikt.huber@theobroma-systems.com>
>      	    Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
>
>   gcc/ChangeLog                                      |  19 ++++
>   gcc/config/aarch64/aarch64-builtins.c              | 112 ++++++++++++++++++++
>   gcc/config/aarch64/aarch64-protos.h                |   3 +
>   gcc/config/aarch64/aarch64-simd.md                 |  27 +++++
>   gcc/config/aarch64/aarch64-tuning-flags.def        |   1 +
>   gcc/config/aarch64/aarch64.c                       | 115 ++++++++++++++++++++-
>   gcc/config/aarch64/aarch64.md                      |   3 +
>   gcc/config/aarch64/aarch64.opt                     |   4 +
>   gcc/doc/invoke.texi                                |  12 +++
>   .../gcc.target/aarch64/rsqrt-asm-check_1.c         |  65 ++++++++++++
>   gcc/testsuite/gcc.target/aarch64/rsqrt_1.c         | 111 ++++++++++++++++++++
>   11 files changed, 467 insertions(+), 5 deletions(-)
>   create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c
>   create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c
>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] 2015-10-02  Benedikt Huber <benedikt.huber@theobroma-systems.com>      Philipp Tomsich <philipp.tomsich@theobroma-systems.com>
  2015-10-03 13:15 ` [PATCH] 2015-10-02 Benedikt Huber <benedikt.huber@theobroma-systems.com> Philipp Tomsich <philipp.tomsich@theobroma-systems.com> Benedikt Huber
@ 2015-10-09 11:45   ` James Greenhalgh
  2015-10-15 22:04     ` [PATCH v7][aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math Benedikt Huber
  0 siblings, 1 reply; 23+ messages in thread
From: James Greenhalgh @ 2015-10-09 11:45 UTC (permalink / raw)
  To: Benedikt Huber
  Cc: gcc-patches, philipp.tomsich, Venkataramanan.Kumar, pinskia,
	e.menezes, Kyrylo Tkachov, Richard.Earnshaw, marcus.shawcroft

On Sat, Jan 03, 1970 at 03:27:04AM +0000, Benedikt Huber wrote:
>         * config/aarch64/aarch64-builtins.c: Builtins for rsqrt and
>         rsqrtf.
>         * config/aarch64/aarch64-protos.h: Declare.
>         * config/aarch64/aarch64-simd.md: Matching expressions for
>         frsqrte and frsqrts.
>         * config/aarch64/aarch64-tuning-flags.def: Added
>         MRECIP_DEFAULT_ENABLED.
>         * config/aarch64/aarch64.c: New functions. Emit rsqrt
>         estimation code in fast math mode.
>         * config/aarch64/aarch64.md: Added enum entries.
>         * config/aarch64/aarch64.opt: Added options -mrecip and
>         -mlow-precision-recip-sqrt.
>         * testsuite/gcc.target/aarch64/rsqrt-asm-check.c: Assembly scans
>         for frsqrte and frsqrts
>         * testsuite/gcc.target/aarch64/rsqrt.c: Functional tests for rsqrt.


Hi,

Thanks for this latest revision, I have some structural/refactoring
comments, and I think I've spotted a bug. Otherwise this is getting
close to ready.

Some more comments in line.

(As an aside, I find this style of patch submission to be very
 difficult to follow, as it misses my mail filters and does not keep the
 in-reply-to header correctly across patch revisions).

>  2015-10-01  Lynn Boger  <laboger@linux.vnet.ibm.com>
> 
>         PR target/66870
> diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
> index 80916a9..29cfbf5 100644
> --- a/gcc/config/aarch64/aarch64-builtins.c
> +++ b/gcc/config/aarch64/aarch64-builtins.c
> @@ -344,6 +344,11 @@ enum aarch64_builtins
>    AARCH64_BUILTIN_GET_FPSR,
>    AARCH64_BUILTIN_SET_FPSR,
> 
> +  AARCH64_BUILTIN_RSQRT_DF,
> +  AARCH64_BUILTIN_RSQRT_SF,
> +  AARCH64_BUILTIN_RSQRT_V2DF,
> +  AARCH64_BUILTIN_RSQRT_V2SF,
> +  AARCH64_BUILTIN_RSQRT_V4SF,
>    AARCH64_SIMD_BUILTIN_BASE,
>    AARCH64_SIMD_BUILTIN_LANE_CHECK,
>  #include "aarch64-simd-builtins.def"
> @@ -842,6 +847,46 @@ aarch64_init_crc32_builtins ()
>      }
>  }
> 
> +/* Add builtins for reciprocal square root.  */
> +
> +void
> +aarch64_add_builtin_rsqrt (void)
> +{
> +  tree fndecl = NULL;
> +  tree ftype = NULL;
> +
> +  tree V2SF_type_node = build_vector_type (float_type_node, 2);
> +  tree V2DF_type_node = build_vector_type (double_type_node, 2);
> +  tree V4SF_type_node = build_vector_type (float_type_node, 4);
> +
> +  ftype = build_function_type_list (double_type_node, double_type_node,
> +                                   NULL_TREE);
> +  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_df",
> +    ftype, AARCH64_BUILTIN_RSQRT_DF, BUILT_IN_MD, NULL, NULL_TREE);
> +  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF] = fndecl;
> +
> +  ftype = build_function_type_list (float_type_node, float_type_node,
> +                                   NULL_TREE);
> +  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_sf",
> +    ftype, AARCH64_BUILTIN_RSQRT_SF, BUILT_IN_MD, NULL, NULL_TREE);
> +  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF] = fndecl;
> +
> +  ftype = build_function_type_list (V2DF_type_node, V2DF_type_node, NULL_TREE);
> +  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v2df",
> +    ftype, AARCH64_BUILTIN_RSQRT_V2DF, BUILT_IN_MD, NULL, NULL_TREE);
> +  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF] = fndecl;
> +
> +  ftype = build_function_type_list (V2SF_type_node, V2SF_type_node, NULL_TREE);
> +  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v2sf",
> +    ftype, AARCH64_BUILTIN_RSQRT_V2SF, BUILT_IN_MD, NULL, NULL_TREE);
> +  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF] = fndecl;
> +
> +  ftype = build_function_type_list (V4SF_type_node, V4SF_type_node, NULL_TREE);
> +  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v4sf",
> +    ftype, AARCH64_BUILTIN_RSQRT_V4SF, BUILT_IN_MD, NULL, NULL_TREE);
> +  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF] = fndecl;

Given that this is all so mechanical, I'd have a preference towards
refactoring this to loop over some structured data. Something like:

  {AARCH64_BUILTIN_RSQRT_SF, float_type_node, "__builtin_aarch64_rsqrt_sf"},
  {AARCH64_BUILTIN_RSQRT_DF, double_type_node, "__builtin_aarch64_rsqrt_df"},
  etc.

>  void
>  aarch64_init_builtins (void)
>  {
> @@ -873,6 +918,7 @@ aarch64_init_builtins (void)
>      aarch64_init_simd_builtins ();
> 
>    aarch64_init_crc32_builtins ();
> +  aarch64_add_builtin_rsqrt ();

Very minor nit, other functions use "init", you use "add".

>  }
> 
>  tree
> @@ -1136,6 +1182,41 @@ aarch64_crc32_expand_builtin (int fcode, tree exp, rtx target)
>    return target;
>  }
> 
> +/* Function to expand reciprocal square root builtins.  */
> +
> +static rtx
> +aarch64_expand_builtin_rsqrt (int fcode, tree exp, rtx target)
> +{
> +  rtx pat;
> +  tree arg0 = CALL_EXPR_ARG (exp, 0);
> +  rtx op0 = expand_normal (arg0);
> +
> +  enum insn_code c;
> +
> +  switch (fcode)
> +    {
> +      case AARCH64_BUILTIN_RSQRT_DF:
> +       c = CODE_FOR_rsqrt_df2; break;
> +      case AARCH64_BUILTIN_RSQRT_SF:
> +       c = CODE_FOR_rsqrt_sf2; break;
> +      case AARCH64_BUILTIN_RSQRT_V2DF:
> +       c = CODE_FOR_rsqrt_v2df2; break;
> +      case AARCH64_BUILTIN_RSQRT_V2SF:
> +       c = CODE_FOR_rsqrt_v2sf2; break;
> +      case AARCH64_BUILTIN_RSQRT_V4SF:
> +       c = CODE_FOR_rsqrt_v4sf2; break;
> +         default: gcc_unreachable ();
> +    }

Formatting looks off for the "default" case.

> +
> +  if (!target)
> +    target = gen_reg_rtx (GET_MODE (op0));
> +
> +  pat = GEN_FCN (c) (target, op0);
> +  emit_insn (pat);
> +
> +  return target;


Could we rewrite the above using function pointers and gen functions as
you do elsewhere in the patch:

  rtx (*gen) (rtx, rtx);
  switch (fcode)
    {
      case AARCH64_BUILTIN_RSQRT_DF:
	gen = gen_rsqrt_df2;
	break;
      case AARCH64_BUILTIN_RSQRT_SF:
	gen = gen_rsqrt_sf2;
	break;
       <...>
    }
  emit_insn (gen (target, op0));
  
>  /* Expand an expression EXP that calls a built-in function,
>     with result going to TARGET if that's convenient.  */
>  rtx
> @@ -1183,6 +1264,13 @@ aarch64_expand_builtin (tree exp,
>    else if (fcode >= AARCH64_CRC32_BUILTIN_BASE && fcode <= AARCH64_CRC32_BUILTIN_MAX)
>      return aarch64_crc32_expand_builtin (fcode, exp, target);
> 
> +  if (fcode == AARCH64_BUILTIN_RSQRT_DF
> +      || fcode == AARCH64_BUILTIN_RSQRT_SF
> +      || fcode == AARCH64_BUILTIN_RSQRT_V2DF
> +      || fcode == AARCH64_BUILTIN_RSQRT_V2SF
> +      || fcode == AARCH64_BUILTIN_RSQRT_V4SF)
> +    return aarch64_expand_builtin_rsqrt (fcode, exp, target);
> +
>    gcc_unreachable ();
>  }
> 
> @@ -1340,6 +1428,30 @@ aarch64_builtin_vectorized_function (tree fndecl, tree type_out, tree type_in)
>    return NULL_TREE;
>  }
> 
> +/* Return builtin for reciprocal square root.  */
> +
> +tree
> +aarch64_builtin_rsqrt (unsigned int fn, bool md_fn)
> +{
> +  if (md_fn)
> +    {
> +      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2df)
> +       return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF];
> +      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2sf)
> +       return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF];
> +      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv4sf)
> +       return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF];
> +    }
> +  else
> +    {
> +      if (fn == BUILT_IN_SQRT)
> +       return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF];
> +      if (fn == BUILT_IN_SQRTF)
> +       return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF];
> +    }
> +  return NULL_TREE;
> +}
> +
>  #undef VAR1
>  #define VAR1(T, N, MAP, A) \
>    case AARCH64_SIMD_BUILTIN_##T##_##N##A:
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index baaf1bd..455b1da 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -332,6 +332,8 @@ void aarch64_register_pragmas (void);
>  void aarch64_relayout_simd_types (void);
>  void aarch64_reset_previous_fndecl (void);
> 
> +void aarch64_emit_swrsqrt (rtx, rtx);
> +
>  /* Initialize builtins for SIMD intrinsics.  */
>  void init_aarch64_simd_builtins (void);
> 
> @@ -400,4 +402,5 @@ int aarch64_ccmp_mode_to_code (enum machine_mode mode);
>  bool extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset);
>  bool aarch64_operands_ok_for_ldpstp (rtx *, bool, enum machine_mode);
>  bool aarch64_operands_adjust_ok_for_ldpstp (rtx *, bool, enum machine_mode);
> +tree aarch64_builtin_rsqrt (unsigned int fn, bool md_fn);

It is a losing battle, but at some point this file was in alphabetical
order, first by type then by name. If we could keep to that, that would
be good.

>  #endif /* GCC_AARCH64_PROTOS_H */
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 541faf9..d48ad3b 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -354,6 +354,33 @@
>    [(set_attr "type" "neon_fp_mul_d_scalar_q")]
>  )
> 
> +(define_insn "rsqrte_<mode>2"

As this is not a standard pattern name, keep it namespaced as
aarch64_rsqrte<mode>2.

> +  [(set (match_operand:VALLF 0 "register_operand" "=w")
> +       (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
> +                    UNSPEC_RSQRTE))]
> +  "TARGET_SIMD"
> +  "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
> +  [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")])
> +
> +(define_insn "rsqrts_<mode>3"

Likewise.

> +  [(set (match_operand:VALLF 0 "register_operand" "=w")
> +       (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")
> +              (match_operand:VALLF 2 "register_operand" "w")]
> +                    UNSPEC_RSQRTS))]
> +  "TARGET_SIMD"
> +  "frsqrts\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
> +  [(set_attr "type" "neon_fp_rsqrts_<Vetype><q>")])
> +
> +(define_expand "rsqrt_<mode>2"

Likewise.

> +  [(set (match_operand:VALLF 0 "register_operand" "=w")
> +       (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
> +                    UNSPEC_RSQRT))]
> +  "TARGET_SIMD"
> +{
> +  aarch64_emit_swrsqrt (operands[0], operands[1]);
> +  DONE;
> +})
> +
>  (define_insn "*aarch64_mul3_elt_to_64v2df"
>    [(set (match_operand:DF 0 "register_operand" "=w")
>       (mult:DF
> diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
> index 628386b..6f7dbce 100644
> --- a/gcc/config/aarch64/aarch64-tuning-flags.def
> +++ b/gcc/config/aarch64/aarch64-tuning-flags.def
> @@ -29,4 +29,5 @@
>       AARCH64_TUNE_ to give an enum name. */
> 
>  AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
> +AARCH64_EXTRA_TUNING_OPTION ("recip_sqrt", RECIP_SQRT)
> 
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 034da7c..5ddfa5d 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -408,7 +408,8 @@ static const struct tune_params cortexa57_tunings =
>    1,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> -  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags.  */
> +  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
> +   | AARCH64_EXTRA_TUNE_RECIP_SQRT)    /* tune_flags.  */
>  };
> 
>  static const struct tune_params cortexa72_tunings =
> @@ -472,7 +473,7 @@ static const struct tune_params xgene1_tunings =
>    1,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> -  (AARCH64_EXTRA_TUNE_NONE)    /* tune_flags.  */
> +  (AARCH64_EXTRA_TUNE_RECIP_SQRT)      /* tune_flags.  */
>  };
> 
>  /* Support for fine-grained override of the tuning structures.  */
> @@ -7009,6 +7010,107 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
>    return aarch64_tune_params.memmov_cost;
>  }
> 
> +/* Function to decide when to use
> +   reciprocal square root builtins.  */
> +
> +static tree
> +aarch64_builtin_reciprocal (unsigned int fn,
> +                           bool md_fn,
> +                           bool)
> +{
> +  if (flag_trapping_math
> +      || !flag_unsafe_math_optimizations
> +      || optimize_size
> +      || (aarch64_tune_params.extra_tuning_flags
> +          & AARCH64_EXTRA_TUNE_RECIP_SQRT))

I've checked a number of times, but this condition still looks backwards
to me. As far as I can see, this says not to do the transform if

      (aarch64_tune_params.extra_tuning_flags
          & AARCH64_EXTRA_TUNE_RECIP_SQRT))

But it is Friday, so forgive me if I'm wrong.

> +  {
> +    return NULL_TREE;
> +  }
> +
> +  return aarch64_builtin_rsqrt (fn, md_fn);
> +}
> +
> +typedef rtx (*rsqrte_type) (rtx, rtx);
> +
> +/* Select reciprocal square root initial estimate
> +   insn depending on machine mode.  */
> +
> +rsqrte_type
> +get_rsqrte_type (enum machine_mode mode)
> +{
> +  switch (mode)
> +  {
> +    case DFmode:   return gen_rsqrte_df2;
> +    case SFmode:   return gen_rsqrte_sf2;
> +    case V2DFmode: return gen_rsqrte_v2df2;
> +    case V2SFmode: return gen_rsqrte_v2sf2;
> +    case V4SFmode: return gen_rsqrte_v4sf2;
> +    default: gcc_unreachable ();
> +  }
> +}
> +
> +typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
> +
> +/* Select reciprocal square root Newton-Raphson step
> +   insn depending on machine mode.  */
> +
> +rsqrts_type
> +get_rsqrts_type (enum machine_mode mode)
> +{
> +  switch (mode)
> +  {
> +    case DFmode:   return gen_rsqrts_df3;
> +    case SFmode:   return gen_rsqrts_sf3;
> +    case V2DFmode: return gen_rsqrts_v2df3;
> +    case V2SFmode: return gen_rsqrts_v2sf3;
> +    case V4SFmode: return gen_rsqrts_v4sf3;
> +    default: gcc_unreachable ();
> +  }
> +}
> +
> +/* Emit instruction sequence to compute
> +   reciprocal square root.  Use two Newton-Raphson steps
> +   for single precision and three for double precision.  */
> +
> +void
> +aarch64_emit_swrsqrt (rtx dst, rtx src)
> +{
> +  enum machine_mode mode = GET_MODE (src);
> +  gcc_assert (
> +    mode == SFmode || mode == V2SFmode || mode == V4SFmode ||
> +    mode == DFmode || mode == V2DFmode);

Split before the operator:

   mode == SFmode || mode == V2SFmode || mode == V4SFmode
   || mode == DFmode || mode == V2DFmode);

> +
> +  rtx xsrc = gen_reg_rtx (mode);
> +  emit_move_insn (xsrc, src);
> +  rtx x0 = gen_reg_rtx (mode);
> +
> +  emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
> +
> +  bool double_mode = (mode == DFmode || mode == V2DFmode);
> +
> +  int iterations = 2;
> +  if (double_mode)
> +    iterations = 3;

Personal preference:

  int iterations = double_mode ? 3 : 2;

> +
> +  if (flag_mrecip_low_precision_sqrt)
> +    iterations--;
> +
> +  for (int i = 0; i < iterations; ++i)
> +    {
> +      rtx x1 = gen_reg_rtx (mode);
> +      rtx x2 = gen_reg_rtx (mode);
> +      rtx x3 = gen_reg_rtx (mode);
> +      emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
> +
> +      emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
> +
> +      emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
> +      x0 = x1;
> +    }
> +
> +  emit_move_insn (dst, x0);
> +}
> +
>  /* Return the number of instructions that can be issued per cycle.  */
>  static int
>  aarch64_sched_issue_rate (void)
> @@ -13387,6 +13489,9 @@ aarch64_promoted_type (const_tree t)
>  #undef TARGET_BUILD_BUILTIN_VA_LIST
>  #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
> 
> +#undef TARGET_BUILTIN_DECL
> +#define TARGET_BUILTIN_DECL aarch64_builtin_decl
> +

Unrelated change?

>  #undef TARGET_CALLEE_COPIES
>  #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
> 
> @@ -13418,9 +13523,6 @@ aarch64_promoted_type (const_tree t)
>  #undef TARGET_CLASS_MAX_NREGS
>  #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
> 
> -#undef TARGET_BUILTIN_DECL
> -#define TARGET_BUILTIN_DECL aarch64_builtin_decl
> -
>  #undef  TARGET_EXPAND_BUILTIN
>  #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
> 
> @@ -13561,6 +13663,9 @@ aarch64_promoted_type (const_tree t)
>  #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
>  #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
> 
> +#undef TARGET_BUILTIN_RECIPROCAL
> +#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
> +

I think Marcus asked for this to be in alphabetical order in the v5
review.

>  #undef TARGET_VECTOR_MODE_SUPPORTED_P
>  #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
> 
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index c3cd58d..51c2b87 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -126,6 +126,9 @@
>      UNSPEC_VSTRUCTDUMMY
>      UNSPEC_SP_SET
>      UNSPEC_SP_TEST
> +    UNSPEC_RSQRT
> +    UNSPEC_RSQRTE
> +    UNSPEC_RSQRTS
>  ])
> 
>  (define_c_enum "unspecv" [
> diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
> index a1ce58d..00084ea 100644
> --- a/gcc/config/aarch64/aarch64.opt
> +++ b/gcc/config/aarch64/aarch64.opt
> @@ -148,3 +148,7 @@ Enum(aarch64_abi) String(lp64) Value(AARCH64_ABI_LP64)
>  mpc-relative-literal-loads
>  Target Report Save Var(nopcrelative_literal_loads) Init(2) Save
>  PC relative literal loads.
> +
> +mlow-precision-recip-sqrt
> +Common Var(flag_mrecip_low_precision_sqrt) Optimization
> +Run fewer approximation steps to reduce latency and precision.

Don't make a definite claim about latency here.

  When calculating a sqrt approximation, run fewer steps.  This reduces
  precision, but can result in faster computation.

> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index c19be78..8b45837 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -519,6 +519,7 @@ Objective-C and Objective-C++ Dialects}.
>  -mtls-size=@var{size} @gol
>  -mfix-cortex-a53-835769  -mno-fix-cortex-a53-835769 @gol
>  -mfix-cortex-a53-843419  -mno-fix-cortex-a53-843419 @gol
> +-mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol
>  -march=@var{name}  -mcpu=@var{name}  -mtune=@var{name}}
> 
>  @emph{Adapteva Epiphany Options}
> @@ -12445,6 +12446,17 @@ Enable or disable the workaround for the ARM Cortex-A53 erratum number 843419.
>  This erratum workaround is made at link time and this will only pass the
>  corresponding flag to the linker.
> 
> +@item -mlow-precision-recip-sqrt
> +@item -mno-low-precision-recip-sqrt
> +@opindex -mlow-precision-recip-sqrt
> +@opindex -mno-low-precision-recip-sqrt
> +The square root estimate uses two steps instead of three for double-precision,
> +and one step instead of two for single-precision.
> +Thus reducing latency and precision.
> +This is only relevant if @option{-ffast-math} activates
> +reciprocal square root estimate instructions.
> +Which in turn depends on the CPU core.

As above. To be consistent with the other documentation,
s/CPU core/target processor/

> +
>  @item -march=@var{name}
>  @opindex march
>  Specify the name of the target architecture, optionally suffixed by one or
> diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c
> new file mode 100644
> index 0000000..9f17990
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c
> @@ -0,0 +1,65 @@
> +/* Test for the recip_sqrt tuning
> +   ensuring the correct instructions are generated.  */
> +/* { dg-do compile } */
> +/* { dg-options "-O3 --std=c99 --save-temps -fverbose-asm -funsafe-math-optimizations -fno-math-errno" } */

Presumably this testcase needs guarding or otherwise tweaking to make sure
it only runs for targets which want to use the estimate expansion?
Additionally, a test showing the opposite - that a target which does
not want the expansion doesn't get it - would be useful.

> +
> +#define sqrt_float   __builtin_sqrtf
> +#define sqrt_double  __builtin_sqrt
> +
> +#define TESTTYPE(TYPE) \
> +typedef struct { \
> +  TYPE a; \
> +  TYPE b; \
> +  TYPE c; \
> +  TYPE d; \
> +} s4_##TYPE; \
> +\
> +typedef struct { \
> +  TYPE a; \
> +  TYPE b; \
> +} s2_##TYPE; \
> +\
> +s4_##TYPE \
> +rsqrtv4_##TYPE (s4_##TYPE i) \
> +{ \
> +  s4_##TYPE o; \
> +  o.a = 1.0 / sqrt_##TYPE (i.a); \
> +  o.b = 1.0 / sqrt_##TYPE (i.b); \
> +  o.c = 1.0 / sqrt_##TYPE (i.c); \
> +  o.d = 1.0 / sqrt_##TYPE (i.d); \
> +  return o; \
> +} \
> +\
> +s2_##TYPE \
> +rsqrtv2_##TYPE (s2_##TYPE i) \
> +{ \
> +  s2_##TYPE o; \
> +  o.a = 1.0 / sqrt_##TYPE (i.a); \
> +  o.b = 1.0 / sqrt_##TYPE (i.b); \
> +  return o; \
> +} \
> +\
> +TYPE \
> +rsqrt_##TYPE (TYPE i) \
> +{ \
> +  return 1.0 / sqrt_##TYPE (i); \
> +} \
> +
> +TESTTYPE (double)
> +TESTTYPE (float)
> +
> +/* { dg-final { scan-assembler-times "frsqrte\\td\[0-9\]+, d\[0-9\]+" 1 } } */
> +/* { dg-final { scan-assembler-times "frsqrts\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 3 } } */
> +
> +/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2d, v\[0-9\]+.2d" 3 } } */
> +/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2d, v\[0-9\]+.2d, v\[0-9\]+.2d" 9 } } */
> +
> +
> +/* { dg-final { scan-assembler-times "frsqrte\\ts\[0-9\]+, s\[0-9\]+" 1 } } */
> +/* { dg-final { scan-assembler-times "frsqrts\\ts\[0-9\]+, s\[0-9\]+, s\[0-9\]+" 2 } } */
> +
> +/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.4s, v\[0-9\]+.4s" 1 } } */
> +/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.4s, v\[0-9\]+.4s, v\[0-9\]+.4s" 2 } } */
> +
> +/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2s, v\[0-9\]+.2s" 1 } } */
> +/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2s, v\[0-9\]+.2s, v\[0-9\]+.2s" 2 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt_1.c b/gcc/testsuite/gcc.target/aarch64/rsqrt_1.c
> new file mode 100644
> index 0000000..624f9b2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/rsqrt_1.c
> @@ -0,0 +1,111 @@
> +/* Test for the recip_sqrt tuning
> +   ensuring functionality and sufficient accuracy.  */
> +/* { dg-do run } */
> +/* { dg-options "-O3 --std=c99 --save-temps -fverbose-asm -funsafe-math-optimizations -fno-math-errno" } */

Likewise.

> +
> +#define PI    3.141592653589793
> +#define SQRT2 1.4142135623730951
> +
> +#define PI_4 0.7853981633974483
> +#define SQRT1_2 0.7071067811865475
> +
> +/* 2^25+1, float has 24 significand bits
> + *       according to Single-precision floating-point format.  */
> +#define TESTA8_FLT 33554433
> +/* 2^54+1, double has 53 significand bits
> + *       according to Double-precision floating-point format.  */
> +#define TESTA8_DBL 18014398509481985
> +
> +#define EPSILON_double __DBL_EPSILON__
> +#define EPSILON_float __FLT_EPSILON__
> +#define ABS_double __builtin_fabs
> +#define ABS_float __builtin_fabsf
> +#define SQRT_double __builtin_sqrt
> +#define SQRT_float __builtin_sqrtf
> +#define ISNAN_double __builtin_isnan
> +#define ISNAN_float __builtin_isnanf
> +
> +extern void abort (void);
> +
> +#define TESTTYPE(TYPE) \
> +TYPE \
> +rsqrt_##TYPE (TYPE a) \
> +{ \
> +  return 1.0/SQRT_##TYPE (a); \
> +} \
> +\
> +int \
> +equals_##TYPE (TYPE a, TYPE b) \
> +{ \
> +  return (a == b || \
> +   (ISNAN_##TYPE (a) && ISNAN_##TYPE (b)) || \
> +   (ABS_##TYPE (a - b) < EPSILON_##TYPE)); \
> +} \
> +\
> +void \
> +t_##TYPE (TYPE a, TYPE result) \
> +{ \
> +  TYPE r = rsqrt_##TYPE (a); \
> +  if (!equals_##TYPE (r, result)) \
> +  { \
> +    abort (); \
> +  } \
> +} \
> +
> +TESTTYPE (double)
> +TESTTYPE (float)
> +
> +int
> +main ()
> +{
> +  double nan = __builtin_nan ("");
> +  double inf = __builtin_inf ();
> +  float nanf = __builtin_nanf ("");
> +  float inff = __builtin_inff ();
> +
> +  t_double (1.0/256, 0X1.00000000000000P+4);
> +  t_double (1.0, 0X1.00000000000000P+0);
> +  t_double (-1.0, nan);
> +  t_double (11.0, 0X1.34BF63D1568260P-2);
> +  t_double (0.0,  inf);
> +  t_double (inf, 0X0.00000000000000P+0);
> +  t_double (nan, nan);
> +  t_double (-nan, -nan);
> +  t_double (__DBL_MAX__, 0X1.00000000000010P-512);
> +  t_double (__DBL_MIN__, 0X1.00000000000000P+511);
> +  t_double (PI, 0X1.20DD750429B6D0P-1);
> +  t_double (PI_4, 0X1.20DD750429B6D0P+0);
> +  t_double (SQRT2, 0X1.AE89F995AD3AE0P-1);
> +  t_double (SQRT1_2, 0X1.306FE0A31B7150P+0);
> +  t_double (-PI, nan);
> +  t_double (-SQRT2, nan);
> +  t_double (TESTA8_DBL, 0X1.00000000000000P-27);
> +
> +  t_float (1.0/256, 0X1.00000000000000P+4);
> +  t_float (1.0, 0X1.00000000000000P+0);
> +  t_float (-1.0, nanf);
> +  t_float (11.0, 0X1.34BF6400000000P-2);
> +  t_float (0.0,  inff);
> +  t_float (inff, 0X0.00000000000000P+0);
> +  t_float (nanf, nanf);
> +  t_float (-nanf, -nanf);
> +  t_float (__FLT_MAX__, 0X1.00000200000000P-64);
> +  t_float (__FLT_MIN__, 0X1.00000000000000P+63);
> +  t_float (PI, 0X1.20DD7400000000P-1);
> +  t_float (PI_4, 0X1.20DD7400000000P+0);
> +  t_float (SQRT2, 0X1.AE89FA00000000P-1);
> +  t_float (SQRT1_2, 0X1.306FE000000000P+0);
> +  t_float (-PI, nanf);
> +  t_float (-SQRT2, nanf);
> +  t_float (TESTA8_FLT, 0X1.6A09E600000000P-13);
> +
> +//   With -ffast-math these return positive INF.
> +//   t_double (-0.0, -inf);
> +//   t_float (-0.0, -inff);
> +
> +//   The reason here is that -ffast-math flushes to zero.
> +//   t_double  (__DBL_MIN__/256, 0X1.00000000000000P+515);
> +//   t_float (__FLT_MIN__/256, 0X1.00000000000000P+67);
> +
> +  return 0;
> +}


Thanks,
James 

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH] 2015-10-15  Benedikt Huber  <benedikt.huber@theobroma-systems.com>      Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
  2015-10-15 22:04     ` [PATCH v7][aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math Benedikt Huber
@ 2015-10-15 22:04       ` Benedikt Huber
  2015-10-16  9:31         ` Marcus Shawcroft
  2015-10-16 12:41         ` Oleg Endo
  0 siblings, 2 replies; 23+ messages in thread
From: Benedikt Huber @ 2015-10-15 22:04 UTC (permalink / raw)
  To: gcc-patches
  Cc: philipp.tomsich, Venkataramanan.Kumar, pinskia, e.menezes,
	kyrylo.tkachov, Richard.Earnshaw, marcus.shawcroft,
	james.greenhalgh, Benedikt Huber

	* config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf.
	* config/aarch64/aarch64-protos.h: Declare.
	* config/aarch64/aarch64-simd.md: Matching expressions for frsqrte and
	frsqrts.
	* config/aarch64/aarch64-tuning-flags.def: Added recip_sqrt.
	* config/aarch64/aarch64.c: New functions. Emit rsqrt estimation code when
	applicable.
	* config/aarch64/aarch64.md: Added enum entries.
	* config/aarch64/aarch64.opt: Added option -mlow-precision-recip-sqrt.
	* testsuite/gcc.target/aarch64/rsqrt-asm-check-common.h: Common macros for
	assembly checks.
	* testsuite/gcc.target/aarch64/rsqrt-asm-check-negative_1.c: Make sure
	frsqrts and frsqrte are not emitted.
	* testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c: Make sure frsqrts and
	frsqrte are emitted.
	* testsuite/gcc.target/aarch64/rsqrt_1.c: Functional tests for rsqrt.

Signed-off-by: Philipp Tomsich <philipp.tomsich@theobroma-systems.com>
---
 gcc/ChangeLog                                      |  20 ++++
 gcc/config/aarch64/aarch64-builtins.c              | 114 +++++++++++++++++++++
 gcc/config/aarch64/aarch64-protos.h                |   4 +
 gcc/config/aarch64/aarch64-simd.md                 |  27 +++++
 gcc/config/aarch64/aarch64-tuning-flags.def        |   1 +
 gcc/config/aarch64/aarch64.c                       | 107 ++++++++++++++++++-
 gcc/config/aarch64/aarch64.md                      |   3 +
 gcc/config/aarch64/aarch64.opt                     |   5 +
 gcc/doc/invoke.texi                                |  12 +++
 .../gcc.target/aarch64/rsqrt-asm-check-common.h    |  42 ++++++++
 .../aarch64/rsqrt-asm-check-negative_1.c           |  12 +++
 .../gcc.target/aarch64/rsqrt-asm-check_1.c         |  25 +++++
 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c         | 111 ++++++++++++++++++++
 13 files changed, 481 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check-common.h
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check-negative_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index a865043..be096c6 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,23 @@
+2015-10-15  Benedikt Huber  <benedikt.huber@theobroma-systems.com>
+	    Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
+
+	* config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf.
+	* config/aarch64/aarch64-protos.h: Declare.
+	* config/aarch64/aarch64-simd.md: Matching expressions for frsqrte and
+	frsqrts.
+	* config/aarch64/aarch64-tuning-flags.def: Added recip_sqrt.
+	* config/aarch64/aarch64.c: New functions. Emit rsqrt estimation code when
+	applicable.
+	* config/aarch64/aarch64.md: Added enum entries.
+	* config/aarch64/aarch64.opt: Added option -mlow-precision-recip-sqrt.
+	* testsuite/gcc.target/aarch64/rsqrt-asm-check-common.h: Common macros for
+	assembly checks.
+	* testsuite/gcc.target/aarch64/rsqrt-asm-check-negative_1.c: Make sure
+	frsqrts and frsqrte are not emitted.
+	* testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c: Make sure frsqrts and
+	frsqrte are emitted.
+	* testsuite/gcc.target/aarch64/rsqrt_1.c: Functional tests for rsqrt.
+
 2015-10-11  Jan Hubicka  <hubicka@ucw.cz>
 
 	revert:
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 716ed6e..e5e62ac 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -344,6 +344,11 @@ enum aarch64_builtins
   AARCH64_BUILTIN_GET_FPSR,
   AARCH64_BUILTIN_SET_FPSR,
 
+  AARCH64_BUILTIN_RSQRT_DF,
+  AARCH64_BUILTIN_RSQRT_SF,
+  AARCH64_BUILTIN_RSQRT_V2DF,
+  AARCH64_BUILTIN_RSQRT_V2SF,
+  AARCH64_BUILTIN_RSQRT_V4SF,
   AARCH64_SIMD_BUILTIN_BASE,
   AARCH64_SIMD_BUILTIN_LANE_CHECK,
 #include "aarch64-simd-builtins.def"
@@ -842,6 +847,45 @@ aarch64_init_crc32_builtins ()
     }
 }
 
+/* Add builtins for reciprocal square root.  */
+
+void
+aarch64_init_builtin_rsqrt (void)
+{
+  tree fndecl = NULL;
+  tree ftype = NULL;
+
+  tree V2SF_type_node = build_vector_type (float_type_node, 2);
+  tree V2DF_type_node = build_vector_type (double_type_node, 2);
+  tree V4SF_type_node = build_vector_type (float_type_node, 4);
+
+  typedef struct
+  {
+    tree type_node;
+    const char *builtin_name;
+    int function_code;
+  } builtin_decls_data;
+
+  builtin_decls_data bdda[] = {
+    {double_type_node, "__builtin_aarch64_rsqrt_df", AARCH64_BUILTIN_RSQRT_DF},
+    {float_type_node, "__builtin_aarch64_rsqrt_sf", AARCH64_BUILTIN_RSQRT_SF},
+    {V2DF_type_node, "__builtin_aarch64_rsqrt_v2df", AARCH64_BUILTIN_RSQRT_V2DF},
+    {V2SF_type_node, "__builtin_aarch64_rsqrt_v2sf", AARCH64_BUILTIN_RSQRT_V2SF},
+    {V4SF_type_node, "__builtin_aarch64_rsqrt_v4sf", AARCH64_BUILTIN_RSQRT_V4SF}
+  };
+
+  builtin_decls_data *bdd = bdda;
+  builtin_decls_data *bdd_end = bdd + (sizeof (bdda) / sizeof (builtin_decls_data));
+
+  for (; bdd < bdd_end; bdd++)
+  {
+    ftype = build_function_type_list (bdd->type_node, bdd->type_node, NULL_TREE);
+    fndecl = add_builtin_function (bdd->builtin_name,
+      ftype, bdd->function_code, BUILT_IN_MD, NULL, NULL_TREE);
+    aarch64_builtin_decls[bdd->function_code] = fndecl;
+  }
+}
+
 void
 aarch64_init_builtins (void)
 {
@@ -873,6 +917,7 @@ aarch64_init_builtins (void)
     aarch64_init_simd_builtins ();
 
   aarch64_init_crc32_builtins ();
+  aarch64_init_builtin_rsqrt ();
 }
 
 tree
@@ -1136,6 +1181,44 @@ aarch64_crc32_expand_builtin (int fcode, tree exp, rtx target)
   return target;
 }
 
+/* Expand the rsqrt builtin FCODE for call EXP into TARGET (or a new reg).  */
+
+static rtx
+aarch64_expand_builtin_rsqrt (int fcode, tree exp, rtx target)
+{
+  tree arg0 = CALL_EXPR_ARG (exp, 0);
+  rtx op0 = expand_normal (arg0);
+
+  rtx (*gen) (rtx, rtx);
+  /* Select the expander generator for the builtin's mode.  */
+  switch (fcode)
+    {
+      case AARCH64_BUILTIN_RSQRT_DF:
+	gen = gen_aarch64_rsqrt_df2;
+	break;
+      case AARCH64_BUILTIN_RSQRT_SF:
+	gen = gen_aarch64_rsqrt_sf2;
+	break;
+      case AARCH64_BUILTIN_RSQRT_V2DF:
+	gen = gen_aarch64_rsqrt_v2df2;
+	break;
+      case AARCH64_BUILTIN_RSQRT_V2SF:
+	gen = gen_aarch64_rsqrt_v2sf2;
+	break;
+      case AARCH64_BUILTIN_RSQRT_V4SF:
+	gen = gen_aarch64_rsqrt_v4sf2;
+	break;
+      default: gcc_unreachable ();
+    }
+  /* No TARGET was supplied: put the result in a new register.  */
+  if (!target)
+    target = gen_reg_rtx (GET_MODE (op0));
+
+  emit_insn (gen (target, op0));
+
+  return target;
+}
+
 /* Expand an expression EXP that calls a built-in function,
    with result going to TARGET if that's convenient.  */
 rtx
@@ -1183,6 +1266,13 @@ aarch64_expand_builtin (tree exp,
   else if (fcode >= AARCH64_CRC32_BUILTIN_BASE && fcode <= AARCH64_CRC32_BUILTIN_MAX)
     return aarch64_crc32_expand_builtin (fcode, exp, target);
 
+  if (fcode == AARCH64_BUILTIN_RSQRT_DF
+      || fcode == AARCH64_BUILTIN_RSQRT_SF
+      || fcode == AARCH64_BUILTIN_RSQRT_V2DF
+      || fcode == AARCH64_BUILTIN_RSQRT_V2SF
+      || fcode == AARCH64_BUILTIN_RSQRT_V4SF)
+    return aarch64_expand_builtin_rsqrt (fcode, exp, target);
+
   gcc_unreachable ();
 }
 
@@ -1340,6 +1430,30 @@ aarch64_builtin_vectorized_function (tree fndecl, tree type_out, tree type_in)
   return NULL_TREE;
 }
 
+/* Return the rsqrt builtin decl matching sqrt builtin FN, else NULL_TREE.  */
+
+tree
+aarch64_builtin_rsqrt (unsigned int fn, bool md_fn)
+{
+  if (md_fn)
+    {
+      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2df)
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF];
+      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2sf)
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF];
+      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv4sf)
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF];
+    }
+  else
+    {
+      if (fn == BUILT_IN_SQRT)
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF];
+      if (fn == BUILT_IN_SQRTF)
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF];
+    }
+  return NULL_TREE;
+}
+
 #undef VAR1
 #define VAR1(T, N, MAP, A) \
   case AARCH64_SIMD_BUILTIN_##T##_##N##A:
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index baaf1bd..0420248 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -332,6 +332,8 @@ void aarch64_register_pragmas (void);
 void aarch64_relayout_simd_types (void);
 void aarch64_reset_previous_fndecl (void);
 
+void aarch64_emit_swrsqrt (rtx, rtx);
+
 /* Initialize builtins for SIMD intrinsics.  */
 void init_aarch64_simd_builtins (void);
 
@@ -383,6 +385,8 @@ rtx aarch64_expand_builtin (tree exp,
 			    int ignore ATTRIBUTE_UNUSED);
 tree aarch64_builtin_decl (unsigned, bool ATTRIBUTE_UNUSED);
 
+tree aarch64_builtin_rsqrt (unsigned int fn, bool md_fn);
+
 tree
 aarch64_builtin_vectorized_function (tree fndecl,
 				     tree type_out,
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 167277e..8c359cb 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -354,6 +354,33 @@
   [(set_attr "type" "neon_fp_mul_d_scalar_q")]
 )
 
+(define_insn "aarch64_rsqrte_<mode>2"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
+		     UNSPEC_RSQRTE))]
+  "TARGET_SIMD"
+  "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
+  [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")])
+
+(define_insn "aarch64_rsqrts_<mode>3"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")
+	       (match_operand:VALLF 2 "register_operand" "w")]
+		     UNSPEC_RSQRTS))]
+  "TARGET_SIMD"
+  "frsqrts\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
+  [(set_attr "type" "neon_fp_rsqrts_<Vetype><q>")])
+
+(define_expand "aarch64_rsqrt_<mode>2"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
+		     UNSPEC_RSQRT))]
+  "TARGET_SIMD"
+{
+  aarch64_emit_swrsqrt (operands[0], operands[1]);
+  DONE;
+})
+
 (define_insn "*aarch64_mul3_elt_to_64v2df"
   [(set (match_operand:DF 0 "register_operand" "=w")
      (mult:DF
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index 628386b..6f7dbce 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -29,4 +29,5 @@
      AARCH64_TUNE_ to give an enum name. */
 
 AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
+AARCH64_EXTRA_TUNING_OPTION ("recip_sqrt", RECIP_SQRT)
 
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 5130e37..387d744 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -408,7 +408,8 @@ static const struct tune_params cortexa57_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
-  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS)	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
+   | AARCH64_EXTRA_TUNE_RECIP_SQRT)	/* tune_flags.  */
 };
 
 static const struct tune_params cortexa72_tunings =
@@ -472,7 +473,7 @@ static const struct tune_params xgene1_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
-  (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_RECIP_SQRT)	/* tune_flags.  */
 };
 
 /* Support for fine-grained override of the tuning structures.  */
@@ -7005,6 +7006,105 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
   return aarch64_tune_params.memmov_cost;
 }
 
+/* Implement TARGET_BUILTIN_RECIPROCAL.  Return the rsqrt builtin for FN, or
+   NULL_TREE when the flags or the tuning do not allow the approximation.  */
+
+static tree
+aarch64_builtin_reciprocal (unsigned int fn,
+			    bool md_fn,
+			    bool)
+{
+  if (flag_trapping_math
+      || !flag_unsafe_math_optimizations
+      || optimize_size
+      || ! (aarch64_tune_params.extra_tuning_flags
+	   & AARCH64_EXTRA_TUNE_RECIP_SQRT))
+  {
+    return NULL_TREE;
+  }
+
+  return aarch64_builtin_rsqrt (fn, md_fn);
+}
+
+typedef rtx (*rsqrte_type) (rtx, rtx);
+
+/* Return the generator of the rsqrt initial-estimate (FRSQRTE) insn for
+   MODE; only the five scalar and vector FP modes below are handled.  */
+
+rsqrte_type
+get_rsqrte_type (enum machine_mode mode)
+{
+  switch (mode)
+  {
+    case DFmode:   return gen_aarch64_rsqrte_df2;
+    case SFmode:   return gen_aarch64_rsqrte_sf2;
+    case V2DFmode: return gen_aarch64_rsqrte_v2df2;
+    case V2SFmode: return gen_aarch64_rsqrte_v2sf2;
+    case V4SFmode: return gen_aarch64_rsqrte_v4sf2;
+    default: gcc_unreachable ();
+  }
+}
+
+typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
+
+/* Return the generator of the rsqrt Newton-Raphson step (FRSQRTS) insn for
+   MODE; only the five scalar and vector FP modes below are handled.  */
+
+rsqrts_type
+get_rsqrts_type (enum machine_mode mode)
+{
+  switch (mode)
+  {
+    case DFmode:   return gen_aarch64_rsqrts_df3;
+    case SFmode:   return gen_aarch64_rsqrts_sf3;
+    case V2DFmode: return gen_aarch64_rsqrts_v2df3;
+    case V2SFmode: return gen_aarch64_rsqrts_v2sf3;
+    case V4SFmode: return gen_aarch64_rsqrts_v4sf3;
+    default: gcc_unreachable ();
+  }
+}
+
+/* Emit an instruction sequence computing the reciprocal square root of SRC
+   into DST: an FRSQRTE estimate refined by two (SF) or three (DF)
+   Newton-Raphson steps, one fewer with -mlow-precision-recip-sqrt.  */
+
+void
+aarch64_emit_swrsqrt (rtx dst, rtx src)
+{
+  enum machine_mode mode = GET_MODE (src);
+  gcc_assert (
+    mode == SFmode || mode == V2SFmode || mode == V4SFmode
+	|| mode == DFmode || mode == V2DFmode);
+
+  rtx xsrc = gen_reg_rtx (mode);
+  emit_move_insn (xsrc, src);
+  rtx x0 = gen_reg_rtx (mode);
+
+  emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
+
+  bool double_mode = (mode == DFmode || mode == V2DFmode);
+
+  int iterations = double_mode ? 3 : 2;
+  /* The low-precision option drops one refinement step.  */
+  if (flag_mrecip_low_precision_sqrt)
+    iterations--;
+  /* Each step replaces X0 by X0 * FRSQRTS (XSRC, X0 * X0).  */
+  for (int i = 0; i < iterations; ++i)
+    {
+      rtx x1 = gen_reg_rtx (mode);
+      rtx x2 = gen_reg_rtx (mode);
+      rtx x3 = gen_reg_rtx (mode);
+      emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
+
+      emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
+
+      emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
+      x0 = x1;
+    }
+
+  emit_move_insn (dst, x0);
+}
+
 /* Return the number of instructions that can be issued per cycle.  */
 static int
 aarch64_sched_issue_rate (void)
@@ -13343,6 +13443,9 @@ aarch64_promoted_type (const_tree t)
 #undef TARGET_BUILTIN_DECL
 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
 
+#undef TARGET_BUILTIN_RECIPROCAL
+#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
+
 #undef  TARGET_EXPAND_BUILTIN
 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 208f58f..48421d8 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -126,6 +126,9 @@
     UNSPEC_VSTRUCTDUMMY
     UNSPEC_SP_SET
     UNSPEC_SP_TEST
+    UNSPEC_RSQRT
+    UNSPEC_RSQRTE
+    UNSPEC_RSQRTS
 ])
 
 (define_c_enum "unspecv" [
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index a1ce58d..e5691be 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -148,3 +148,8 @@ Enum(aarch64_abi) String(lp64) Value(AARCH64_ABI_LP64)
 mpc-relative-literal-loads
 Target Report Save Var(nopcrelative_literal_loads) Init(2) Save
 PC relative literal loads.
+
+mlow-precision-recip-sqrt
+Target Var(flag_mrecip_low_precision_sqrt) Optimization
+When calculating the reciprocal square root approximation, run fewer steps.
+This reduces precision, but can result in faster computation.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 9e028fc..c883b87 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -520,6 +520,7 @@ Objective-C and Objective-C++ Dialects}.
 -mtls-size=@var{size} @gol
 -mfix-cortex-a53-835769  -mno-fix-cortex-a53-835769 @gol
 -mfix-cortex-a53-843419  -mno-fix-cortex-a53-843419 @gol
+-mlow-precision-recip-sqrt  -mno-low-precision-recip-sqrt @gol
 -march=@var{name}  -mcpu=@var{name}  -mtune=@var{name}}
 
 @emph{Adapteva Epiphany Options}
@@ -12478,6 +12479,17 @@ Enable or disable the workaround for the ARM Cortex-A53 erratum number 843419.
 This erratum workaround is made at link time and this will only pass the
 corresponding flag to the linker.
 
+@item -mlow-precision-recip-sqrt
+@itemx -mno-low-precision-recip-sqrt
+@opindex mlow-precision-recip-sqrt
+@opindex mno-low-precision-recip-sqrt
+When estimating the reciprocal square root, use two Newton-Raphson steps
+instead of three for double precision, and one step instead of two for
+single precision.  This reduces both latency and precision.
+This option is only relevant if @option{-ffast-math} enables the
+reciprocal square root estimate instructions, which in turn depends on
+the target processor.
+
 @item -march=@var{name}
 @opindex march
 Specify the name of the target architecture, optionally suffixed by one or
diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check-common.h b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check-common.h
new file mode 100644
index 0000000..8a851e1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check-common.h
@@ -0,0 +1,42 @@
+#define sqrt_float   __builtin_sqrtf
+#define sqrt_double  __builtin_sqrt
+
+#define TESTTYPE(TYPE) \
+typedef struct { \
+  TYPE a; \
+  TYPE b; \
+  TYPE c; \
+  TYPE d; \
+} s4_##TYPE; \
+\
+typedef struct { \
+  TYPE a; \
+  TYPE b; \
+} s2_##TYPE; \
+\
+s4_##TYPE \
+rsqrtv4_##TYPE (s4_##TYPE i) \
+{ \
+  s4_##TYPE o; \
+  o.a = 1.0 / sqrt_##TYPE (i.a); \
+  o.b = 1.0 / sqrt_##TYPE (i.b); \
+  o.c = 1.0 / sqrt_##TYPE (i.c); \
+  o.d = 1.0 / sqrt_##TYPE (i.d); \
+  return o; \
+} \
+\
+s2_##TYPE \
+rsqrtv2_##TYPE (s2_##TYPE i) \
+{ \
+  s2_##TYPE o; \
+  o.a = 1.0 / sqrt_##TYPE (i.a); \
+  o.b = 1.0 / sqrt_##TYPE (i.b); \
+  return o; \
+} \
+\
+TYPE \
+rsqrt_##TYPE (TYPE i) \
+{ \
+  return 1.0 / sqrt_##TYPE (i); \
+} \
+
diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check-negative_1.c b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check-negative_1.c
new file mode 100644
index 0000000..58fe7f5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check-negative_1.c
@@ -0,0 +1,12 @@
+/* Test for the recip_sqrt tuning
+   ensuring the correct instructions are generated.  */
+/* { dg-do compile } */
+/* { dg-options "-O3 --std=c99 --save-temps -fverbose-asm -funsafe-math-optimizations -fno-math-errno -mtune=generic -mcpu=generic" } */
+
+#include "rsqrt-asm-check-common.h"
+
+TESTTYPE (double)
+TESTTYPE (float)
+
+/* { dg-final { scan-assembler-times "frsqrte" 0 } } */
+/* { dg-final { scan-assembler-times "frsqrts" 0 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c
new file mode 100644
index 0000000..72bf233
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c
@@ -0,0 +1,25 @@
+/* Test for the recip_sqrt tuning
+   ensuring the correct instructions are generated.  */
+/* { dg-do compile } */
+/* { dg-options "-O3 --std=c99 --save-temps -fverbose-asm -funsafe-math-optimizations -fno-math-errno -mtune=generic -mcpu=generic -moverride=tune=recip_sqrt" } */
+
+#include "rsqrt-asm-check-common.h"
+
+TESTTYPE (double)
+TESTTYPE (float)
+
+/* { dg-final { scan-assembler-times "frsqrte\\td\[0-9\]+, d\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 3 } } */
+
+/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2d, v\[0-9\]+.2d" 3 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2d, v\[0-9\]+.2d, v\[0-9\]+.2d" 9 } } */
+
+
+/* { dg-final { scan-assembler-times "frsqrte\\ts\[0-9\]+, s\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\ts\[0-9\]+, s\[0-9\]+, s\[0-9\]+" 2 } } */
+
+/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.4s, v\[0-9\]+.4s" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.4s, v\[0-9\]+.4s, v\[0-9\]+.4s" 2 } } */
+
+/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2s, v\[0-9\]+.2s" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2s, v\[0-9\]+.2s, v\[0-9\]+.2s" 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt_1.c b/gcc/testsuite/gcc.target/aarch64/rsqrt_1.c
new file mode 100644
index 0000000..15d495d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/rsqrt_1.c
@@ -0,0 +1,111 @@
+/* Test for the recip_sqrt tuning
+   ensuring functionality and sufficient accuracy.  */
+/* { dg-do run } */
+/* { dg-options "-O3 --std=c99 --save-temps -fverbose-asm -funsafe-math-optimizations -fno-math-errno -mtune=generic -mcpu=generic -moverride=tune=recip_sqrt" } */
+
+#define PI    3.141592653589793
+#define SQRT2 1.4142135623730951
+
+#define PI_4 0.7853981633974483
+#define SQRT1_2 0.7071067811865475
+
+/* 2^25+1 is not exactly representable: float has 24 significand bits
+   according to the single-precision floating-point format.  */
+#define TESTA8_FLT 33554433
+/* 2^54+1 is not exactly representable: double has 53 significand bits
+   according to the double-precision floating-point format.  */
+#define TESTA8_DBL 18014398509481985
+
+#define EPSILON_double __DBL_EPSILON__
+#define EPSILON_float __FLT_EPSILON__
+#define ABS_double __builtin_fabs
+#define ABS_float __builtin_fabsf
+#define SQRT_double __builtin_sqrt
+#define SQRT_float __builtin_sqrtf
+#define ISNAN_double __builtin_isnan
+#define ISNAN_float __builtin_isnanf
+
+extern void abort (void);
+
+#define TESTTYPE(TYPE) \
+TYPE \
+rsqrt_##TYPE (TYPE a) \
+{ \
+  return 1.0/SQRT_##TYPE (a); \
+} \
+\
+int \
+equals_##TYPE (TYPE a, TYPE b) \
+{ \
+  return (a == b || \
+   (ISNAN_##TYPE (a) && ISNAN_##TYPE (b)) || \
+   (ABS_##TYPE (a - b) < EPSILON_##TYPE)); \
+} \
+\
+void \
+t_##TYPE (TYPE a, TYPE result) \
+{ \
+  TYPE r = rsqrt_##TYPE (a); \
+  if (!equals_##TYPE (r, result)) \
+  { \
+    abort (); \
+  } \
+} \
+
+TESTTYPE (double)
+TESTTYPE (float)
+
+int
+main ()
+{
+  double nan = __builtin_nan ("");
+  double inf = __builtin_inf ();
+  float nanf = __builtin_nanf ("");
+  float inff = __builtin_inff ();
+  /* Each call is t_TYPE (input, expected value of 1/sqrt (input)).  */
+  t_double (1.0/256, 0X1.00000000000000P+4);
+  t_double (1.0, 0X1.00000000000000P+0);
+  t_double (-1.0, nan);
+  t_double (11.0, 0X1.34BF63D1568260P-2);
+  t_double (0.0,  inf);
+  t_double (inf, 0X0.00000000000000P+0);
+  t_double (nan, nan);
+  t_double (-nan, -nan);
+  t_double (__DBL_MAX__, 0X1.00000000000010P-512);
+  t_double (__DBL_MIN__, 0X1.00000000000000P+511);
+  t_double (PI, 0X1.20DD750429B6D0P-1);
+  t_double (PI_4, 0X1.20DD750429B6D0P+0);
+  t_double (SQRT2, 0X1.AE89F995AD3AE0P-1);
+  t_double (SQRT1_2, 0X1.306FE0A31B7150P+0);
+  t_double (-PI, nan);
+  t_double (-SQRT2, nan);
+  t_double (TESTA8_DBL, 0X1.00000000000000P-27);
+
+  t_float (1.0/256, 0X1.00000000000000P+4);
+  t_float (1.0, 0X1.00000000000000P+0);
+  t_float (-1.0, nanf);
+  t_float (11.0, 0X1.34BF6400000000P-2);
+  t_float (0.0,  inff);
+  t_float (inff, 0X0.00000000000000P+0);
+  t_float (nanf, nanf);
+  t_float (-nanf, -nanf);
+  t_float (__FLT_MAX__, 0X1.00000200000000P-64);
+  t_float (__FLT_MIN__, 0X1.00000000000000P+63);
+  t_float (PI, 0X1.20DD7400000000P-1);
+  t_float (PI_4, 0X1.20DD7400000000P+0);
+  t_float (SQRT2, 0X1.AE89FA00000000P-1);
+  t_float (SQRT1_2, 0X1.306FE000000000P+0);
+  t_float (-PI, nanf);
+  t_float (-SQRT2, nanf);
+  t_float (TESTA8_FLT, 0X1.6A09E600000000P-13);
+
+  /* Disabled because with -ffast-math these return positive INF:
+       t_double (-0.0, -inf);
+       t_float (-0.0, -inff);  */
+
+  /* Disabled because -ffast-math flushes these inputs to zero:
+       t_double  (__DBL_MIN__/256, 0X1.00000000000000P+515);
+       t_float (__FLT_MIN__/256, 0X1.00000000000000P+67);  */
+
+  return 0;
+}
-- 
1.9.1

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH v7][aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math
  2015-10-09 11:45   ` James Greenhalgh
@ 2015-10-15 22:04     ` Benedikt Huber
  2015-10-15 22:04       ` [PATCH] 2015-10-15 Benedikt Huber <benedikt.huber@theobroma-systems.com> Philipp Tomsich <philipp.tomsich@theobroma-systems.com> Benedikt Huber
  0 siblings, 1 reply; 23+ messages in thread
From: Benedikt Huber @ 2015-10-15 22:04 UTC (permalink / raw)
  To: gcc-patches
  Cc: philipp.tomsich, Venkataramanan.Kumar, pinskia, e.menezes,
	kyrylo.tkachov, Richard.Earnshaw, marcus.shawcroft,
	james.greenhalgh, Benedikt Huber

This seventh revision of the patch:
 * Cleans up style issues.
 * Corrects a bug in a condition.
 * Improves the testing code.

Ok for check in.


Benedikt Huber (1):
  2015-10-15  Benedikt Huber  <benedikt.huber@theobroma-systems.com>    
    	    Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>

 gcc/ChangeLog                                      |  20 ++++
 gcc/config/aarch64/aarch64-builtins.c              | 114 +++++++++++++++++++++
 gcc/config/aarch64/aarch64-protos.h                |   4 +
 gcc/config/aarch64/aarch64-simd.md                 |  27 +++++
 gcc/config/aarch64/aarch64-tuning-flags.def        |   1 +
 gcc/config/aarch64/aarch64.c                       | 107 ++++++++++++++++++-
 gcc/config/aarch64/aarch64.md                      |   3 +
 gcc/config/aarch64/aarch64.opt                     |   5 +
 gcc/doc/invoke.texi                                |  12 +++
 .../gcc.target/aarch64/rsqrt-asm-check-common.h    |  42 ++++++++
 .../aarch64/rsqrt-asm-check-negative_1.c           |  12 +++
 .../gcc.target/aarch64/rsqrt-asm-check_1.c         |  25 +++++
 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c         | 111 ++++++++++++++++++++
 13 files changed, 481 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check-common.h
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check-negative_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c

-- 
1.9.1

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] 2015-10-15 Benedikt Huber <benedikt.huber@theobroma-systems.com> Philipp Tomsich <philipp.tomsich@theobroma-systems.com>
  2015-10-15 22:04       ` [PATCH] 2015-10-15 Benedikt Huber <benedikt.huber@theobroma-systems.com> Philipp Tomsich <philipp.tomsich@theobroma-systems.com> Benedikt Huber
@ 2015-10-16  9:31         ` Marcus Shawcroft
  2015-10-16 13:59           ` [PATCH v8][aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math Benedikt Huber
  2015-10-16 12:41         ` Oleg Endo
  1 sibling, 1 reply; 23+ messages in thread
From: Marcus Shawcroft @ 2015-10-16  9:31 UTC (permalink / raw)
  To: Benedikt Huber
  Cc: gcc-patches, philipp.tomsich, Venkataramanan.Kumar, pinskia,
	e.menezes, Kyrill Tkachov, Richard.Earnshaw, James Greenhalgh

Hi,

 A few more style nits:

> +  builtin_decls_data bdda[] = {

New line before  {

> +    {double_type_node, "__builtin_aarch64_rsqrt_df", AARCH64_BUILTIN_RSQRT_DF},

Space after {
Space  before }

> +void aarch64_emit_swrsqrt (rtx, rtx);
> +

> +tree aarch64_builtin_rsqrt (unsigned int fn, bool md_fn);
> +

Drop the formal argument names as you did in the first declaration.

See my previous comment w.r.t the naming of new test cases in
gcc.target/aarch64, at least the following still need s/-/_/

> diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check-common.h b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check-common.h
> diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check-negative_1.c b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check-negative_1.c
> diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c

> +//   With -ffast-math these return positive INF.
> +//   t_double (-0.0, -inf);
> +//   t_float (-0.0, -inff);
> +
> +//   The reason here is that -ffast-math flushes to zero.
> +//   t_double  (__DBL_MIN__/256, 0X1.00000000000000P+515);
> +//   t_float (__FLT_MIN__/256, 0X1.00000000000000P+67);

Comment consistently with the rest of the backend, i.e. use /* */.

Thanks
/Marcus

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] 2015-10-15  Benedikt Huber  <benedikt.huber@theobroma-systems.com>      Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
  2015-10-15 22:04       ` [PATCH] 2015-10-15 Benedikt Huber <benedikt.huber@theobroma-systems.com> Philipp Tomsich <philipp.tomsich@theobroma-systems.com> Benedikt Huber
  2015-10-16  9:31         ` Marcus Shawcroft
@ 2015-10-16 12:41         ` Oleg Endo
  2015-10-16 14:36           ` Benedikt Huber
  1 sibling, 1 reply; 23+ messages in thread
From: Oleg Endo @ 2015-10-16 12:41 UTC (permalink / raw)
  To: Benedikt Huber
  Cc: gcc-patches, philipp.tomsich, Venkataramanan.Kumar, pinskia,
	e.menezes, kyrylo.tkachov, Richard.Earnshaw, marcus.shawcroft,
	james.greenhalgh

On Thu, 2015-10-15 at 22:03 +0000, Benedikt Huber wrote:
>  
> +/* Add builtins for reciprocal square root.  */
> +
> +void
> +aarch64_init_builtin_rsqrt (void)
> +{
> +  tree fndecl = NULL;
> +  tree ftype = NULL;
> +
> +  tree V2SF_type_node = build_vector_type (float_type_node, 2);
> +  tree V2DF_type_node = build_vector_type (double_type_node, 2);
> +  tree V4SF_type_node = build_vector_type (float_type_node, 4);
> +
> +  typedef struct
> +  {
> +    tree type_node;
> +    const char *builtin_name;
> +    int function_code;
> +  } builtin_decls_data;

There is an ongoing effort to remove all the unnecessary typedef struct
and enum etc stuff.  Please try not to add more of it.

Cheers,
Oleg

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH] 2015-10-15  Benedikt Huber  <benedikt.huber@theobroma-systems.com>      Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
  2015-10-16 13:59           ` [PATCH v8][aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math Benedikt Huber
@ 2015-10-16 13:58             ` Benedikt Huber
  2015-10-16 14:31               ` Marcus Shawcroft
  0 siblings, 1 reply; 23+ messages in thread
From: Benedikt Huber @ 2015-10-16 13:58 UTC (permalink / raw)
  To: gcc-patches
  Cc: philipp.tomsich, Venkataramanan.Kumar, pinskia, e.menezes,
	kyrylo.tkachov, Richard.Earnshaw, marcus.shawcroft,
	james.greenhalgh, Benedikt Huber

	* config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf.
	* config/aarch64/aarch64-protos.h: Declare.
	* config/aarch64/aarch64-simd.md: Matching expressions for frsqrte and
	frsqrts.
	* config/aarch64/aarch64-tuning-flags.def: Added recip_sqrt.
	* config/aarch64/aarch64.c: New functions. Emit rsqrt estimation code when
	applicable.
	* config/aarch64/aarch64.md: Added enum entries.
	* config/aarch64/aarch64.opt: Added option -mlow-precision-recip-sqrt.
	* testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h: Common macros for
	assembly checks.
	* testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c: Make sure
	frsqrts and frsqrte are not emitted.
	* testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c: Make sure frsqrts and
	frsqrte are emitted.
	* testsuite/gcc.target/aarch64/rsqrt_1.c: Functional tests for rsqrt.

Signed-off-by: Philipp Tomsich <philipp.tomsich@theobroma-systems.com>
---
 gcc/ChangeLog                                      |  20 ++++
 gcc/config/aarch64/aarch64-builtins.c              | 115 +++++++++++++++++++++
 gcc/config/aarch64/aarch64-protos.h                |   4 +
 gcc/config/aarch64/aarch64-simd.md                 |  27 +++++
 gcc/config/aarch64/aarch64-tuning-flags.def        |   1 +
 gcc/config/aarch64/aarch64.c                       | 107 ++++++++++++++++++-
 gcc/config/aarch64/aarch64.md                      |   3 +
 gcc/config/aarch64/aarch64.opt                     |   5 +
 gcc/doc/invoke.texi                                |  12 +++
 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c         | 111 ++++++++++++++++++++
 .../gcc.target/aarch64/rsqrt_asm_check_1.c         |  25 +++++
 .../gcc.target/aarch64/rsqrt_asm_check_common.h    |  42 ++++++++
 .../aarch64/rsqrt_asm_check_negative_1.c           |  12 +++
 13 files changed, 482 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 30860c4..2abe832 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,23 @@
+2015-10-15  Benedikt Huber  <benedikt.huber@theobroma-systems.com>
+	    Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
+
+	* config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf.
+	* config/aarch64/aarch64-protos.h: Declare.
+	* config/aarch64/aarch64-simd.md: Matching expressions for frsqrte and
+	frsqrts.
+	* config/aarch64/aarch64-tuning-flags.def: Added recip_sqrt.
+	* config/aarch64/aarch64.c: New functions. Emit rsqrt estimation code when
+	applicable.
+	* config/aarch64/aarch64.md: Added enum entries.
+	* config/aarch64/aarch64.opt: Added option -mlow-precision-recip-sqrt.
+	* testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h: Common macros for
+	assembly checks.
+	* testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c: Make sure
+	frsqrts and frsqrte are not emitted.
+	* testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c: Make sure frsqrts and
+	frsqrte are emitted.
+	* testsuite/gcc.target/aarch64/rsqrt_1.c: Functional tests for rsqrt.
+
 2015-10-14  Uros Bizjak  <ubizjak@gmail.com>
 
 	* config/mips/mips.h (MIPS_STACK_ALIGN): Implement using
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 716ed6e..0fb19a4 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -344,6 +344,11 @@ enum aarch64_builtins
   AARCH64_BUILTIN_GET_FPSR,
   AARCH64_BUILTIN_SET_FPSR,
 
+  AARCH64_BUILTIN_RSQRT_DF,
+  AARCH64_BUILTIN_RSQRT_SF,
+  AARCH64_BUILTIN_RSQRT_V2DF,
+  AARCH64_BUILTIN_RSQRT_V2SF,
+  AARCH64_BUILTIN_RSQRT_V4SF,
   AARCH64_SIMD_BUILTIN_BASE,
   AARCH64_SIMD_BUILTIN_LANE_CHECK,
 #include "aarch64-simd-builtins.def"
@@ -842,6 +847,46 @@ aarch64_init_crc32_builtins ()
     }
 }
 
+/* Add the DF, SF, V2DF, V2SF and V4SF reciprocal square root builtins.  */
+
+static void
+aarch64_init_builtin_rsqrt (void)
+{
+  tree V2SF_type_node = build_vector_type (float_type_node, 2);
+  tree V2DF_type_node = build_vector_type (double_type_node, 2);
+  tree V4SF_type_node = build_vector_type (float_type_node, 4);
+
+  struct builtin_decls_data
+  {
+    tree type_node;
+    const char *builtin_name;
+    int function_code;
+  };
+
+  builtin_decls_data bdda[] =
+  {
+    { double_type_node, "__builtin_aarch64_rsqrt_df", AARCH64_BUILTIN_RSQRT_DF },
+    { float_type_node, "__builtin_aarch64_rsqrt_sf", AARCH64_BUILTIN_RSQRT_SF },
+    { V2DF_type_node, "__builtin_aarch64_rsqrt_v2df", AARCH64_BUILTIN_RSQRT_V2DF },
+    { V2SF_type_node, "__builtin_aarch64_rsqrt_v2sf", AARCH64_BUILTIN_RSQRT_V2SF },
+    { V4SF_type_node, "__builtin_aarch64_rsqrt_v4sf", AARCH64_BUILTIN_RSQRT_V4SF }
+  };
+
+  builtin_decls_data *bdd = bdda;
+  builtin_decls_data *bdd_end = bdd + sizeof (bdda) / sizeof (bdda[0]);
+
+  /* Declare each builtin as TYPE -> TYPE and record its decl by code.  */
+  for (; bdd < bdd_end; bdd++)
+    {
+      tree ftype = build_function_type_list (bdd->type_node, bdd->type_node,
+					     NULL_TREE);
+      tree fndecl = add_builtin_function (bdd->builtin_name, ftype,
+					  bdd->function_code, BUILT_IN_MD,
+					  NULL, NULL_TREE);
+      aarch64_builtin_decls[bdd->function_code] = fndecl;
+    }
+}
+
 void
 aarch64_init_builtins (void)
 {
@@ -873,6 +918,7 @@ aarch64_init_builtins (void)
     aarch64_init_simd_builtins ();
 
   aarch64_init_crc32_builtins ();
+  aarch64_init_builtin_rsqrt ();
 }
 
 tree
@@ -1136,6 +1182,44 @@ aarch64_crc32_expand_builtin (int fcode, tree exp, rtx target)
   return target;
 }
 
+/* Expand the rsqrt builtin FCODE for call EXP into TARGET (or a new reg).  */
+
+static rtx
+aarch64_expand_builtin_rsqrt (int fcode, tree exp, rtx target)
+{
+  tree arg0 = CALL_EXPR_ARG (exp, 0);
+  rtx op0 = expand_normal (arg0);
+
+  rtx (*gen) (rtx, rtx);
+  /* Select the expander generator for the builtin's mode.  */
+  switch (fcode)
+    {
+      case AARCH64_BUILTIN_RSQRT_DF:
+	gen = gen_aarch64_rsqrt_df2;
+	break;
+      case AARCH64_BUILTIN_RSQRT_SF:
+	gen = gen_aarch64_rsqrt_sf2;
+	break;
+      case AARCH64_BUILTIN_RSQRT_V2DF:
+	gen = gen_aarch64_rsqrt_v2df2;
+	break;
+      case AARCH64_BUILTIN_RSQRT_V2SF:
+	gen = gen_aarch64_rsqrt_v2sf2;
+	break;
+      case AARCH64_BUILTIN_RSQRT_V4SF:
+	gen = gen_aarch64_rsqrt_v4sf2;
+	break;
+      default: gcc_unreachable ();
+    }
+  /* No TARGET was supplied: put the result in a new register.  */
+  if (!target)
+    target = gen_reg_rtx (GET_MODE (op0));
+
+  emit_insn (gen (target, op0));
+
+  return target;
+}
+
 /* Expand an expression EXP that calls a built-in function,
    with result going to TARGET if that's convenient.  */
 rtx
@@ -1183,6 +1267,13 @@ aarch64_expand_builtin (tree exp,
   else if (fcode >= AARCH64_CRC32_BUILTIN_BASE && fcode <= AARCH64_CRC32_BUILTIN_MAX)
     return aarch64_crc32_expand_builtin (fcode, exp, target);
 
+  if (fcode == AARCH64_BUILTIN_RSQRT_DF
+      || fcode == AARCH64_BUILTIN_RSQRT_SF
+      || fcode == AARCH64_BUILTIN_RSQRT_V2DF
+      || fcode == AARCH64_BUILTIN_RSQRT_V2SF
+      || fcode == AARCH64_BUILTIN_RSQRT_V4SF)
+    return aarch64_expand_builtin_rsqrt (fcode, exp, target);
+
   gcc_unreachable ();
 }
 
@@ -1340,6 +1431,30 @@ aarch64_builtin_vectorized_function (tree fndecl, tree type_out, tree type_in)
   return NULL_TREE;
 }
 
+/* Return the rsqrt builtin decl matching sqrt function FN, or NULL_TREE.  */
+
+tree
+aarch64_builtin_rsqrt (unsigned int fn, bool md_fn)
+{
+  if (md_fn)  /* FN is an AARCH64_SIMD_BUILTIN_* (vector sqrt) code.  */
+    {
+      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2df)
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF];
+      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2sf)
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF];
+      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv4sf)
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF];
+    }
+  else  /* FN is a generic BUILT_IN_* (scalar sqrt) code.  */
+    {
+      if (fn == BUILT_IN_SQRT)
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF];
+      if (fn == BUILT_IN_SQRTF)
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF];
+    }
+  return NULL_TREE;
+}
+
 #undef VAR1
 #define VAR1(T, N, MAP, A) \
   case AARCH64_SIMD_BUILTIN_##T##_##N##A:
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index baaf1bd..00775db 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -332,6 +332,8 @@ void aarch64_register_pragmas (void);
 void aarch64_relayout_simd_types (void);
 void aarch64_reset_previous_fndecl (void);
 
+void aarch64_emit_swrsqrt (rtx, rtx);
+
 /* Initialize builtins for SIMD intrinsics.  */
 void init_aarch64_simd_builtins (void);
 
@@ -383,6 +385,8 @@ rtx aarch64_expand_builtin (tree exp,
 			    int ignore ATTRIBUTE_UNUSED);
 tree aarch64_builtin_decl (unsigned, bool ATTRIBUTE_UNUSED);
 
+tree aarch64_builtin_rsqrt (unsigned int, bool);
+
 tree
 aarch64_builtin_vectorized_function (tree fndecl,
 				     tree type_out,
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 167277e..8c359cb 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -354,6 +354,33 @@
   [(set_attr "type" "neon_fp_mul_d_scalar_q")]
 )
 
+(define_insn "aarch64_rsqrte_<mode>2"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
+		     UNSPEC_RSQRTE))]
+  "TARGET_SIMD"
+  "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
+  [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")])
+
+(define_insn "aarch64_rsqrts_<mode>3"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")
+	       (match_operand:VALLF 2 "register_operand" "w")]
+		     UNSPEC_RSQRTS))]
+  "TARGET_SIMD"
+  "frsqrts\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
+  [(set_attr "type" "neon_fp_rsqrts_<Vetype><q>")])
+
+(define_expand "aarch64_rsqrt_<mode>2"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
+		     UNSPEC_RSQRT))]
+  "TARGET_SIMD"
+{
+  aarch64_emit_swrsqrt (operands[0], operands[1]);
+  DONE;
+})
+
 (define_insn "*aarch64_mul3_elt_to_64v2df"
   [(set (match_operand:DF 0 "register_operand" "=w")
      (mult:DF
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index 628386b..6f7dbce 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -29,4 +29,5 @@
      AARCH64_TUNE_ to give an enum name. */
 
 AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
+AARCH64_EXTRA_TUNING_OPTION ("recip_sqrt", RECIP_SQRT)
 
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 5130e37..387d744 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -408,7 +408,8 @@ static const struct tune_params cortexa57_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
-  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS)	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
+   | AARCH64_EXTRA_TUNE_RECIP_SQRT)	/* tune_flags.  */
 };
 
 static const struct tune_params cortexa72_tunings =
@@ -472,7 +473,7 @@ static const struct tune_params xgene1_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
-  (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_RECIP_SQRT)	/* tune_flags.  */
 };
 
 /* Support for fine-grained override of the tuning structures.  */
@@ -7005,6 +7006,105 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
   return aarch64_tune_params.memmov_cost;
 }
 
+/* Implement TARGET_BUILTIN_RECIPROCAL.  Return the rsqrt builtin for sqrt
+   function FN, or NULL_TREE when the approximation must not be used.  */
+
+static tree
+aarch64_builtin_reciprocal (unsigned int fn,
+			    bool md_fn,
+			    bool)
+{
+  if (flag_trapping_math
+      || !flag_unsafe_math_optimizations
+      || optimize_size
+      || ! (aarch64_tune_params.extra_tuning_flags
+	   & AARCH64_EXTRA_TUNE_RECIP_SQRT))  /* Per-core tuning opt-in.  */
+  {
+    return NULL_TREE;
+  }
+
+  return aarch64_builtin_rsqrt (fn, md_fn);
+}
+
+typedef rtx (*rsqrte_type) (rtx, rtx);
+
+/* Return the generator of the reciprocal square root initial estimate insn
+   (frsqrte) for MODE.  Static: file-local helper with no prototype.  */
+
+static rsqrte_type
+get_rsqrte_type (enum machine_mode mode)
+{
+  switch (mode)
+  {
+    case DFmode:   return gen_aarch64_rsqrte_df2;
+    case SFmode:   return gen_aarch64_rsqrte_sf2;
+    case V2DFmode: return gen_aarch64_rsqrte_v2df2;
+    case V2SFmode: return gen_aarch64_rsqrte_v2sf2;
+    case V4SFmode: return gen_aarch64_rsqrte_v4sf2;
+    default: gcc_unreachable ();  /* Callers assert a supported mode.  */
+  }
+}
+
+typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
+
+/* Return the generator of the reciprocal square root Newton-Raphson step
+   insn (frsqrts) for MODE.  Static: file-local helper with no prototype.  */
+
+static rsqrts_type
+get_rsqrts_type (enum machine_mode mode)
+{
+  switch (mode)
+  {
+    case DFmode:   return gen_aarch64_rsqrts_df3;
+    case SFmode:   return gen_aarch64_rsqrts_sf3;
+    case V2DFmode: return gen_aarch64_rsqrts_v2df3;
+    case V2SFmode: return gen_aarch64_rsqrts_v2sf3;
+    case V4SFmode: return gen_aarch64_rsqrts_v4sf3;
+    default: gcc_unreachable ();  /* Callers assert a supported mode.  */
+  }
+}
+
+/* Emit an instruction sequence computing 1/sqrt (SRC) into DST: an initial
+   estimate refined by Newton-Raphson steps, two for single precision and
+   three for double, or one fewer with -mlow-precision-recip-sqrt.  */
+
+void
+aarch64_emit_swrsqrt (rtx dst, rtx src)
+{
+  enum machine_mode mode = GET_MODE (src);
+  gcc_assert (
+    mode == SFmode || mode == V2SFmode || mode == V4SFmode
+	|| mode == DFmode || mode == V2DFmode);
+
+  rtx xsrc = gen_reg_rtx (mode);
+  emit_move_insn (xsrc, src);
+  rtx x0 = gen_reg_rtx (mode);
+
+  emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));  /* x0 = estimate.  */
+
+  bool double_mode = (mode == DFmode || mode == V2DFmode);
+
+  int iterations = double_mode ? 3 : 2;
+
+  if (flag_mrecip_low_precision_sqrt)  /* -mlow-precision-recip-sqrt.  */
+    iterations--;
+
+  for (int i = 0; i < iterations; ++i)
+    {
+      rtx x1 = gen_reg_rtx (mode);
+      rtx x2 = gen_reg_rtx (mode);
+      rtx x3 = gen_reg_rtx (mode);
+      emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));  /* x2 = x0 * x0.  */
+
+      emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));  /* x3 = step.  */
+
+      emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));  /* x1 = x0 * x3.  */
+      x0 = x1;  /* The refined estimate feeds the next step.  */
+    }
+
+  emit_move_insn (dst, x0);
+}
+
 /* Return the number of instructions that can be issued per cycle.  */
 static int
 aarch64_sched_issue_rate (void)
@@ -13343,6 +13443,9 @@ aarch64_promoted_type (const_tree t)
 #undef TARGET_BUILTIN_DECL
 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
 
+#undef TARGET_BUILTIN_RECIPROCAL
+#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
+
 #undef  TARGET_EXPAND_BUILTIN
 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 208f58f..48421d8 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -126,6 +126,9 @@
     UNSPEC_VSTRUCTDUMMY
     UNSPEC_SP_SET
     UNSPEC_SP_TEST
+    UNSPEC_RSQRT
+    UNSPEC_RSQRTE
+    UNSPEC_RSQRTS
 ])
 
 (define_c_enum "unspecv" [
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index a1ce58d..e5691be 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -148,3 +148,8 @@ Enum(aarch64_abi) String(lp64) Value(AARCH64_ABI_LP64)
 mpc-relative-literal-loads
 Target Report Save Var(nopcrelative_literal_loads) Init(2) Save
 PC relative literal loads.
+
+mlow-precision-recip-sqrt
+Common Var(flag_mrecip_low_precision_sqrt) Optimization
+When calculating the reciprocal square root approximation, run one fewer
+step.  This reduces precision, but can result in faster computation.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 54e9f12..15e5a6d 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -520,6 +520,7 @@ Objective-C and Objective-C++ Dialects}.
 -mtls-size=@var{size} @gol
 -mfix-cortex-a53-835769  -mno-fix-cortex-a53-835769 @gol
 -mfix-cortex-a53-843419  -mno-fix-cortex-a53-843419 @gol
+-mlow-precision-recip-sqrt  -mno-low-precision-recip-sqrt @gol
 -march=@var{name}  -mcpu=@var{name}  -mtune=@var{name}}
 
 @emph{Adapteva Epiphany Options}
@@ -12482,6 +12483,17 @@ Enable or disable the workaround for the ARM Cortex-A53 erratum number 843419.
 This erratum workaround is made at link time and this will only pass the
 corresponding flag to the linker.
 
+@item -mlow-precision-recip-sqrt
+@itemx -mno-low-precision-recip-sqrt
+@opindex mlow-precision-recip-sqrt
+@opindex mno-low-precision-recip-sqrt
+Use fewer Newton-Raphson steps when approximating the reciprocal square
+root: two instead of three for double precision, and one instead of two
+for single precision.  This reduces both latency and precision.
+This option only has an effect if @option{-ffast-math} activates the
+reciprocal square root approximation, which in turn depends on the
+target processor.
+
 @item -march=@var{name}
 @opindex march
 Specify the name of the target architecture, optionally suffixed by one or
diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt_1.c b/gcc/testsuite/gcc.target/aarch64/rsqrt_1.c
new file mode 100644
index 0000000..d235be8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/rsqrt_1.c
@@ -0,0 +1,111 @@
+/* Test for the recip_sqrt tuning
+   ensuring functionality and sufficient accuracy.  */
+/* { dg-do run } */
+/* { dg-options "-O3 --std=c99 --save-temps -fverbose-asm -funsafe-math-optimizations -fno-math-errno -mtune=generic -mcpu=generic -moverride=tune=recip_sqrt" } */
+
+#define PI    3.141592653589793
+#define SQRT2 1.4142135623730951
+
+#define PI_4 0.7853981633974483
+#define SQRT1_2 0.7071067811865475
+
+/* 2^25+1, float has 24 significand bits
+         according to Single-precision floating-point format.  */
+#define TESTA8_FLT 33554433
+/* 2^54+1, double has 53 significand bits
+         according to Double-precision floating-point format.  */
+#define TESTA8_DBL 18014398509481985
+
+#define EPSILON_double __DBL_EPSILON__
+#define EPSILON_float __FLT_EPSILON__
+#define ABS_double __builtin_fabs
+#define ABS_float __builtin_fabsf
+#define SQRT_double __builtin_sqrt
+#define SQRT_float __builtin_sqrtf
+#define ISNAN_double __builtin_isnan
+#define ISNAN_float __builtin_isnanf
+
+extern void abort (void);
+
+#define TESTTYPE(TYPE) \
+TYPE \
+rsqrt_##TYPE (TYPE a) \
+{ \
+  return 1.0/SQRT_##TYPE (a); \
+} \
+\
+int \
+equals_##TYPE (TYPE a, TYPE b) \
+{ \
+  return (a == b || \
+   (ISNAN_##TYPE (a) && ISNAN_##TYPE (b)) || \
+   (ABS_##TYPE (a - b) < EPSILON_##TYPE)); \
+} \
+\
+void \
+t_##TYPE (TYPE a, TYPE result) \
+{ \
+  TYPE r = rsqrt_##TYPE (a); \
+  if (!equals_##TYPE (r, result)) \
+  { \
+    abort (); \
+  } \
+} \
+
+TESTTYPE (double)
+TESTTYPE (float)
+
+int
+main ()
+{
+  double nan = __builtin_nan ("");
+  double inf = __builtin_inf ();
+  float nanf = __builtin_nanf ("");
+  float inff = __builtin_inff ();
+
+  t_double (1.0/256, 0X1.00000000000000P+4);
+  t_double (1.0, 0X1.00000000000000P+0);
+  t_double (-1.0, nan);
+  t_double (11.0, 0X1.34BF63D1568260P-2);
+  t_double (0.0,  inf);
+  t_double (inf, 0X0.00000000000000P+0);
+  t_double (nan, nan);
+  t_double (-nan, -nan);
+  t_double (__DBL_MAX__, 0X1.00000000000010P-512);
+  t_double (__DBL_MIN__, 0X1.00000000000000P+511);
+  t_double (PI, 0X1.20DD750429B6D0P-1);
+  t_double (PI_4, 0X1.20DD750429B6D0P+0);
+  t_double (SQRT2, 0X1.AE89F995AD3AE0P-1);
+  t_double (SQRT1_2, 0X1.306FE0A31B7150P+0);
+  t_double (-PI, nan);
+  t_double (-SQRT2, nan);
+  t_double (TESTA8_DBL, 0X1.00000000000000P-27);
+
+  t_float (1.0/256, 0X1.00000000000000P+4);
+  t_float (1.0, 0X1.00000000000000P+0);
+  t_float (-1.0, nanf);
+  t_float (11.0, 0X1.34BF6400000000P-2);
+  t_float (0.0,  inff);
+  t_float (inff, 0X0.00000000000000P+0);
+  t_float (nanf, nanf);
+  t_float (-nanf, -nanf);
+  t_float (__FLT_MAX__, 0X1.00000200000000P-64);
+  t_float (__FLT_MIN__, 0X1.00000000000000P+63);
+  t_float (PI, 0X1.20DD7400000000P-1);
+  t_float (PI_4, 0X1.20DD7400000000P+0);
+  t_float (SQRT2, 0X1.AE89FA00000000P-1);
+  t_float (SQRT1_2, 0X1.306FE000000000P+0);
+  t_float (-PI, nanf);
+  t_float (-SQRT2, nanf);
+  t_float (TESTA8_FLT, 0X1.6A09E600000000P-13);
+
+/*   With -ffast-math these return positive INF.  */
+/*   t_double (-0.0, -inf);  */
+/*   t_float (-0.0, -inff);  */
+
+/*   The reason here is that -ffast-math flushes to zero.  */
+/*   t_double  (__DBL_MIN__/256, 0X1.00000000000000P+515);  */
+/*   t_float (__FLT_MIN__/256, 0X1.00000000000000P+67);  */
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c b/gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c
new file mode 100644
index 0000000..b838ed3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c
@@ -0,0 +1,25 @@
+/* Test for the recip_sqrt tuning
+   ensuring the correct instructions are generated.  */
+/* { dg-do compile } */
+/* { dg-options "-O3 --std=c99 --save-temps -fverbose-asm -funsafe-math-optimizations -fno-math-errno -mtune=generic -mcpu=generic -moverride=tune=recip_sqrt" } */
+
+#include "rsqrt_asm_check_common.h"
+
+TESTTYPE (double)
+TESTTYPE (float)
+
+/* { dg-final { scan-assembler-times "frsqrte\\td\[0-9\]+, d\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 3 } } */
+
+/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2d, v\[0-9\]+.2d" 3 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2d, v\[0-9\]+.2d, v\[0-9\]+.2d" 9 } } */
+
+
+/* { dg-final { scan-assembler-times "frsqrte\\ts\[0-9\]+, s\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\ts\[0-9\]+, s\[0-9\]+, s\[0-9\]+" 2 } } */
+
+/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.4s, v\[0-9\]+.4s" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.4s, v\[0-9\]+.4s, v\[0-9\]+.4s" 2 } } */
+
+/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2s, v\[0-9\]+.2s" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2s, v\[0-9\]+.2s, v\[0-9\]+.2s" 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h b/gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h
new file mode 100644
index 0000000..8a851e1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h
@@ -0,0 +1,42 @@
+#define sqrt_float   __builtin_sqrtf
+#define sqrt_double  __builtin_sqrt
+
+#define TESTTYPE(TYPE) \
+typedef struct { \
+  TYPE a; \
+  TYPE b; \
+  TYPE c; \
+  TYPE d; \
+} s4_##TYPE; \
+\
+typedef struct { \
+  TYPE a; \
+  TYPE b; \
+} s2_##TYPE; \
+\
+s4_##TYPE \
+rsqrtv4_##TYPE (s4_##TYPE i) \
+{ \
+  s4_##TYPE o; \
+  o.a = 1.0 / sqrt_##TYPE (i.a); \
+  o.b = 1.0 / sqrt_##TYPE (i.b); \
+  o.c = 1.0 / sqrt_##TYPE (i.c); \
+  o.d = 1.0 / sqrt_##TYPE (i.d); \
+  return o; \
+} \
+\
+s2_##TYPE \
+rsqrtv2_##TYPE (s2_##TYPE i) \
+{ \
+  s2_##TYPE o; \
+  o.a = 1.0 / sqrt_##TYPE (i.a); \
+  o.b = 1.0 / sqrt_##TYPE (i.b); \
+  return o; \
+} \
+\
+TYPE \
+rsqrt_##TYPE (TYPE i) \
+{ \
+  return 1.0 / sqrt_##TYPE (i); \
+} \
+
diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c b/gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c
new file mode 100644
index 0000000..b76cc9c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c
@@ -0,0 +1,12 @@
+/* Test that without the recip_sqrt tuning
+   no frsqrte/frsqrts instructions are generated.  */
+/* { dg-do compile } */
+/* { dg-options "-O3 --std=c99 --save-temps -fverbose-asm -funsafe-math-optimizations -fno-math-errno -mtune=generic -mcpu=generic" } */
+
+#include "rsqrt_asm_check_common.h"
+
+TESTTYPE (double)
+TESTTYPE (float)
+
+/* { dg-final { scan-assembler-times "frsqrte" 0 } } */
+/* { dg-final { scan-assembler-times "frsqrts" 0 } } */
-- 
1.9.1

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH v8][aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math
  2015-10-16  9:31         ` Marcus Shawcroft
@ 2015-10-16 13:59           ` Benedikt Huber
  2015-10-16 13:58             ` [PATCH] 2015-10-15 Benedikt Huber <benedikt.huber@theobroma-systems.com> Philipp Tomsich <philipp.tomsich@theobroma-systems.com> Benedikt Huber
  0 siblings, 1 reply; 23+ messages in thread
From: Benedikt Huber @ 2015-10-16 13:59 UTC (permalink / raw)
  To: gcc-patches
  Cc: philipp.tomsich, Venkataramanan.Kumar, pinskia, e.menezes,
	kyrylo.tkachov, Richard.Earnshaw, marcus.shawcroft,
	james.greenhalgh, Benedikt Huber

This eighth revision of the patch:
 * Style improvements.

Ok for check in.


Benedikt Huber (1):
  2015-10-15  Benedikt Huber  <benedikt.huber@theobroma-systems.com>    
    	    Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>

 gcc/ChangeLog                                      |  20 ++++
 gcc/config/aarch64/aarch64-builtins.c              | 115 +++++++++++++++++++++
 gcc/config/aarch64/aarch64-protos.h                |   4 +
 gcc/config/aarch64/aarch64-simd.md                 |  27 +++++
 gcc/config/aarch64/aarch64-tuning-flags.def        |   1 +
 gcc/config/aarch64/aarch64.c                       | 107 ++++++++++++++++++-
 gcc/config/aarch64/aarch64.md                      |   3 +
 gcc/config/aarch64/aarch64.opt                     |   5 +
 gcc/doc/invoke.texi                                |  12 +++
 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c         | 111 ++++++++++++++++++++
 .../gcc.target/aarch64/rsqrt_asm_check_1.c         |  25 +++++
 .../gcc.target/aarch64/rsqrt_asm_check_common.h    |  42 ++++++++
 .../aarch64/rsqrt_asm_check_negative_1.c           |  12 +++
 13 files changed, 482 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c

-- 
1.9.1

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] 2015-10-15 Benedikt Huber <benedikt.huber@theobroma-systems.com> Philipp Tomsich <philipp.tomsich@theobroma-systems.com>
  2015-10-16 13:58             ` [PATCH] 2015-10-15 Benedikt Huber <benedikt.huber@theobroma-systems.com> Philipp Tomsich <philipp.tomsich@theobroma-systems.com> Benedikt Huber
@ 2015-10-16 14:31               ` Marcus Shawcroft
  0 siblings, 0 replies; 23+ messages in thread
From: Marcus Shawcroft @ 2015-10-16 14:31 UTC (permalink / raw)
  To: Benedikt Huber
  Cc: gcc-patches, philipp.tomsich, Venkataramanan.Kumar, pinskia,
	e.menezes, Kyrill Tkachov, Richard.Earnshaw, James Greenhalgh

On 16 October 2015 at 14:59, Benedikt Huber
<benedikt.huber@theobroma-systems.com> wrote:

> +  typedef struct
> +  {
> +    tree type_node;
> +    const char *builtin_name;
> +    int function_code;
> +  } builtin_decls_data;

Please address Oleg's comment.

Cheers
/Marcus

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] 2015-10-15  Benedikt Huber  <benedikt.huber@theobroma-systems.com>      Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
  2015-10-16 12:41         ` Oleg Endo
@ 2015-10-16 14:36           ` Benedikt Huber
  2015-10-16 15:03             ` Marcus Shawcroft
  0 siblings, 1 reply; 23+ messages in thread
From: Benedikt Huber @ 2015-10-16 14:36 UTC (permalink / raw)
  To: Oleg Endo
  Cc: gcc-patches, Dr. Philipp Tomsich, Richard.Earnshaw,
	marcus.shawcroft, james.greenhalgh

[-- Attachment #1: Type: text/plain, Size: 1229 bytes --]

I introduced this in revision 7 due to a request from James Greenhalgh.
https://gcc.gnu.org/ml/gcc-patches/2015-10/msg00963.html

> Given that this is all so mechanical, I'd have a preference towards
> refactoring this to loop over some structured data.

Do you mean, that I should get rid of the typedef and leave the struct without it?
Or should I completely drop the struct?

> On 16 Oct 2015, at 14:37, Oleg Endo <oleg.endo@t-online.de> wrote:
> 
> On Thu, 2015-10-15 at 22:03 +0000, Benedikt Huber wrote:
>> 
>> +/* Add builtins for reciprocal square root.  */
>> +
>> +void
>> +aarch64_init_builtin_rsqrt (void)
>> +{
>> +  tree fndecl = NULL;
>> +  tree ftype = NULL;
>> +
>> +  tree V2SF_type_node = build_vector_type (float_type_node, 2);
>> +  tree V2DF_type_node = build_vector_type (double_type_node, 2);
>> +  tree V4SF_type_node = build_vector_type (float_type_node, 4);
>> +
>> +  typedef struct
>> +  {
>> +    tree type_node;
>> +    const char *builtin_name;
>> +    int function_code;
>> +  } builtin_decls_data;
> 
> There is an ongoing effort to remove all the unnecessary typedef struct
> and enum etc stuff.  Please try not to add more of it.
> 
> Cheers,
> Oleg
> 


[-- Attachment #2: Message signed with OpenPGP using GPGMail --]
[-- Type: application/pgp-signature, Size: 496 bytes --]

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] 2015-10-15 Benedikt Huber <benedikt.huber@theobroma-systems.com> Philipp Tomsich <philipp.tomsich@theobroma-systems.com>
  2015-10-16 14:36           ` Benedikt Huber
@ 2015-10-16 15:03             ` Marcus Shawcroft
  2015-10-16 16:36               ` [PATCH v9][aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math Benedikt Huber
  2015-10-17  1:14               ` Oleg Endo
  0 siblings, 2 replies; 23+ messages in thread
From: Marcus Shawcroft @ 2015-10-16 15:03 UTC (permalink / raw)
  To: Benedikt Huber
  Cc: Oleg Endo, gcc-patches, Dr. Philipp Tomsich, Richard.Earnshaw,
	James Greenhalgh

On 16 October 2015 at 15:31, Benedikt Huber
<benedikt.huber@theobroma-systems.com> wrote:
> I introduced this in revision 7 due to a request from James Greenhalgh.
> https://gcc.gnu.org/ml/gcc-patches/2015-10/msg00963.html
>
>> Given that this is all so mechanical, I'd have a preference towards
>> refactoring this to loop over some structured data.
>
> Do you mean, that I should get rid of the typedef and leave the struct without it?
> Or should I completely drop the struct?

The use of the struct is fine, we are being discouraged from using
unnecessary typedefs.  Just rewrite it as:

 struct builtin_decls_data
   {
   ...
   };

The references to the typedef'd name don't need to be modified.
Cheers
/Marcus

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH v9][aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math
  2015-10-16 15:03             ` Marcus Shawcroft
@ 2015-10-16 16:36               ` Benedikt Huber
  2015-10-16 16:37                 ` [PATCH] 2015-10-15 Benedikt Huber <benedikt.huber@theobroma-systems.com> Philipp Tomsich <philipp.tomsich@theobroma-systems.com> Benedikt Huber
  2015-10-17  1:14               ` Oleg Endo
  1 sibling, 1 reply; 23+ messages in thread
From: Benedikt Huber @ 2015-10-16 16:36 UTC (permalink / raw)
  To: gcc-patches
  Cc: philipp.tomsich, Venkataramanan.Kumar, pinskia, e.menezes,
	kyrylo.tkachov, Richard.Earnshaw, marcus.shawcroft,
	james.greenhalgh, Benedikt Huber

This ninth revision of the patch:
 * Removes unnecessary typedef.

Ok for check in.


Benedikt Huber (1):
  2015-10-15  Benedikt Huber  <benedikt.huber@theobroma-systems.com>    
    	    Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>

 gcc/ChangeLog                                      |  20 ++++
 gcc/config/aarch64/aarch64-builtins.c              | 115 +++++++++++++++++++++
 gcc/config/aarch64/aarch64-protos.h                |   4 +
 gcc/config/aarch64/aarch64-simd.md                 |  27 +++++
 gcc/config/aarch64/aarch64-tuning-flags.def        |   1 +
 gcc/config/aarch64/aarch64.c                       | 107 ++++++++++++++++++-
 gcc/config/aarch64/aarch64.md                      |   3 +
 gcc/config/aarch64/aarch64.opt                     |   5 +
 gcc/doc/invoke.texi                                |  12 +++
 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c         | 111 ++++++++++++++++++++
 .../gcc.target/aarch64/rsqrt_asm_check_1.c         |  25 +++++
 .../gcc.target/aarch64/rsqrt_asm_check_common.h    |  42 ++++++++
 .../aarch64/rsqrt_asm_check_negative_1.c           |  12 +++
 13 files changed, 482 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c

-- 
1.9.1

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH] 2015-10-15  Benedikt Huber  <benedikt.huber@theobroma-systems.com>      Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
  2015-10-16 16:36               ` [PATCH v9][aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math Benedikt Huber
@ 2015-10-16 16:37                 ` Benedikt Huber
  0 siblings, 0 replies; 23+ messages in thread
From: Benedikt Huber @ 2015-10-16 16:37 UTC (permalink / raw)
  To: gcc-patches
  Cc: philipp.tomsich, Venkataramanan.Kumar, pinskia, e.menezes,
	kyrylo.tkachov, Richard.Earnshaw, marcus.shawcroft,
	james.greenhalgh, Benedikt Huber

	* config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf.
	* config/aarch64/aarch64-protos.h: Declare.
	* config/aarch64/aarch64-simd.md: Matching expressions for frsqrte and
	frsqrts.
	* config/aarch64/aarch64-tuning-flags.def: Added recip_sqrt.
	* config/aarch64/aarch64.c: New functions. Emit rsqrt estimation code when
	applicable.
	* config/aarch64/aarch64.md: Added enum entries.
	* config/aarch64/aarch64.opt: Added option -mlow-precision-recip-sqrt.
	* testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h: Common macros for
	assembly checks.
	* testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c: Make sure
	frsqrts and frsqrte are not emitted.
	* testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c: Make sure frsqrts and
	frsqrte are emitted.
	* testsuite/gcc.target/aarch64/rsqrt_1.c: Functional tests for rsqrt.

Signed-off-by: Philipp Tomsich <philipp.tomsich@theobroma-systems.com>
---
 gcc/ChangeLog                                      |  20 ++++
 gcc/config/aarch64/aarch64-builtins.c              | 115 +++++++++++++++++++++
 gcc/config/aarch64/aarch64-protos.h                |   4 +
 gcc/config/aarch64/aarch64-simd.md                 |  27 +++++
 gcc/config/aarch64/aarch64-tuning-flags.def        |   1 +
 gcc/config/aarch64/aarch64.c                       | 107 ++++++++++++++++++-
 gcc/config/aarch64/aarch64.md                      |   3 +
 gcc/config/aarch64/aarch64.opt                     |   5 +
 gcc/doc/invoke.texi                                |  12 +++
 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c         | 111 ++++++++++++++++++++
 .../gcc.target/aarch64/rsqrt_asm_check_1.c         |  25 +++++
 .../gcc.target/aarch64/rsqrt_asm_check_common.h    |  42 ++++++++
 .../aarch64/rsqrt_asm_check_negative_1.c           |  12 +++
 13 files changed, 482 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 30860c4..2abe832 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,23 @@
+2015-10-15  Benedikt Huber  <benedikt.huber@theobroma-systems.com>
+	    Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
+
+	* config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf.
+	* config/aarch64/aarch64-protos.h: Declare.
+	* config/aarch64/aarch64-simd.md: Matching expressions for frsqrte and
+	frsqrts.
+	* config/aarch64/aarch64-tuning-flags.def: Added recip_sqrt.
+	* config/aarch64/aarch64.c: New functions. Emit rsqrt estimation code when
+	applicable.
+	* config/aarch64/aarch64.md: Added enum entries.
+	* config/aarch64/aarch64.opt: Added option -mlow-precision-recip-sqrt.
+	* testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h: Common macros for
+	assembly checks.
+	* testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c: Make sure
+	frsqrts and frsqrte are not emitted.
+	* testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c: Make sure frsqrts and
+	frsqrte are emitted.
+	* testsuite/gcc.target/aarch64/rsqrt_1.c: Functional tests for rsqrt.
+
 2015-10-14  Uros Bizjak  <ubizjak@gmail.com>
 
 	* config/mips/mips.h (MIPS_STACK_ALIGN): Implement using
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 716ed6e..44e841a 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -344,6 +344,11 @@ enum aarch64_builtins
   AARCH64_BUILTIN_GET_FPSR,
   AARCH64_BUILTIN_SET_FPSR,
 
+  AARCH64_BUILTIN_RSQRT_DF,
+  AARCH64_BUILTIN_RSQRT_SF,
+  AARCH64_BUILTIN_RSQRT_V2DF,
+  AARCH64_BUILTIN_RSQRT_V2SF,
+  AARCH64_BUILTIN_RSQRT_V4SF,
   AARCH64_SIMD_BUILTIN_BASE,
   AARCH64_SIMD_BUILTIN_LANE_CHECK,
 #include "aarch64-simd-builtins.def"
@@ -842,6 +847,46 @@ aarch64_init_crc32_builtins ()
     }
 }
 
+/* Register the scalar and vector reciprocal square root builtins.  */
+
+static void
+aarch64_init_builtin_rsqrt (void)
+{
+  tree fndecl = NULL_TREE;
+  tree ftype = NULL_TREE;
+
+  tree V2SF_type_node = build_vector_type (float_type_node, 2);
+  tree V2DF_type_node = build_vector_type (double_type_node, 2);
+  tree V4SF_type_node = build_vector_type (float_type_node, 4);
+
+  struct builtin_decls_data  /* One row per builtin to register.  */
+  {
+    tree type_node;  /* Used as both argument and return type.  */
+    const char *builtin_name;
+    int function_code;  /* Index into aarch64_builtin_decls.  */
+  };
+
+  builtin_decls_data bdda[] =
+  {
+    { double_type_node, "__builtin_aarch64_rsqrt_df", AARCH64_BUILTIN_RSQRT_DF },
+    { float_type_node, "__builtin_aarch64_rsqrt_sf", AARCH64_BUILTIN_RSQRT_SF },
+    { V2DF_type_node, "__builtin_aarch64_rsqrt_v2df", AARCH64_BUILTIN_RSQRT_V2DF },
+    { V2SF_type_node, "__builtin_aarch64_rsqrt_v2sf", AARCH64_BUILTIN_RSQRT_V2SF },
+    { V4SF_type_node, "__builtin_aarch64_rsqrt_v4sf", AARCH64_BUILTIN_RSQRT_V4SF }
+  };
+
+  builtin_decls_data *bdd = bdda;
+  builtin_decls_data *bdd_end = bdd + (sizeof (bdda) / sizeof (builtin_decls_data));
+
+  for (; bdd < bdd_end; bdd++)  /* Declare each builtin as TYPE f (TYPE).  */
+  {
+    ftype = build_function_type_list (bdd->type_node, bdd->type_node, NULL_TREE);
+    fndecl = add_builtin_function (bdd->builtin_name,
+      ftype, bdd->function_code, BUILT_IN_MD, NULL, NULL_TREE);
+    aarch64_builtin_decls[bdd->function_code] = fndecl;
+  }
+}
+
 void
 aarch64_init_builtins (void)
 {
@@ -873,6 +918,7 @@ aarch64_init_builtins (void)
     aarch64_init_simd_builtins ();
 
   aarch64_init_crc32_builtins ();
+  aarch64_init_builtin_rsqrt ();
 }
 
 tree
@@ -1136,6 +1182,44 @@ aarch64_crc32_expand_builtin (int fcode, tree exp, rtx target)
   return target;
 }
 
+/* Expand call EXP to the rsqrt builtin FCODE; return the result rtx.  */
+
+static rtx
+aarch64_expand_builtin_rsqrt (int fcode, tree exp, rtx target)
+{
+  tree arg0 = CALL_EXPR_ARG (exp, 0);
+  rtx op0 = expand_normal (arg0);
+
+  rtx (*gen) (rtx, rtx);  /* Expander for FCODE's mode.  */
+
+  switch (fcode)  /* Pick the aarch64_rsqrt_<mode>2 expander.  */
+    {
+      case AARCH64_BUILTIN_RSQRT_DF:
+	gen = gen_aarch64_rsqrt_df2;
+	break;
+      case AARCH64_BUILTIN_RSQRT_SF:
+	gen = gen_aarch64_rsqrt_sf2;
+	break;
+      case AARCH64_BUILTIN_RSQRT_V2DF:
+	gen = gen_aarch64_rsqrt_v2df2;
+	break;
+      case AARCH64_BUILTIN_RSQRT_V2SF:
+	gen = gen_aarch64_rsqrt_v2sf2;
+	break;
+      case AARCH64_BUILTIN_RSQRT_V4SF:
+	gen = gen_aarch64_rsqrt_v4sf2;
+	break;
+      default: gcc_unreachable ();
+    }
+
+  if (!target)  /* No destination supplied: use a new register.  */
+    target = gen_reg_rtx (GET_MODE (op0));
+
+  emit_insn (gen (target, op0));
+
+  return target;
+}
+
 /* Expand an expression EXP that calls a built-in function,
    with result going to TARGET if that's convenient.  */
 rtx
@@ -1183,6 +1267,13 @@ aarch64_expand_builtin (tree exp,
   else if (fcode >= AARCH64_CRC32_BUILTIN_BASE && fcode <= AARCH64_CRC32_BUILTIN_MAX)
     return aarch64_crc32_expand_builtin (fcode, exp, target);
 
+  if (fcode == AARCH64_BUILTIN_RSQRT_DF
+      || fcode == AARCH64_BUILTIN_RSQRT_SF
+      || fcode == AARCH64_BUILTIN_RSQRT_V2DF
+      || fcode == AARCH64_BUILTIN_RSQRT_V2SF
+      || fcode == AARCH64_BUILTIN_RSQRT_V4SF)
+    return aarch64_expand_builtin_rsqrt (fcode, exp, target);
+
   gcc_unreachable ();
 }
 
@@ -1340,6 +1431,30 @@ aarch64_builtin_vectorized_function (tree fndecl, tree type_out, tree type_in)
   return NULL_TREE;
 }
 
+/* Return the rsqrt builtin decl matching sqrt code FN, or NULL_TREE.  */
+
+tree
+aarch64_builtin_rsqrt (unsigned int fn, bool md_fn)
+{
+  if (md_fn)	/* FN is compared against AARCH64_SIMD_BUILTIN_* sqrt codes.  */
+    {
+      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2df)
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF];
+      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2sf)
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF];
+      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv4sf)
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF];
+    }
+  else	/* FN is compared against the generic BUILT_IN_SQRT{,F} codes.  */
+    {
+      if (fn == BUILT_IN_SQRT)
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF];
+      if (fn == BUILT_IN_SQRTF)
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF];
+    }
+  return NULL_TREE;	/* No rsqrt replacement for this function code.  */
+}
+
 #undef VAR1
 #define VAR1(T, N, MAP, A) \
   case AARCH64_SIMD_BUILTIN_##T##_##N##A:
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index baaf1bd..00775db 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -332,6 +332,8 @@ void aarch64_register_pragmas (void);
 void aarch64_relayout_simd_types (void);
 void aarch64_reset_previous_fndecl (void);
 
+void aarch64_emit_swrsqrt (rtx, rtx);
+
 /* Initialize builtins for SIMD intrinsics.  */
 void init_aarch64_simd_builtins (void);
 
@@ -383,6 +385,8 @@ rtx aarch64_expand_builtin (tree exp,
 			    int ignore ATTRIBUTE_UNUSED);
 tree aarch64_builtin_decl (unsigned, bool ATTRIBUTE_UNUSED);
 
+tree aarch64_builtin_rsqrt (unsigned int, bool);
+
 tree
 aarch64_builtin_vectorized_function (tree fndecl,
 				     tree type_out,
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 167277e..8c359cb 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -354,6 +354,33 @@
   [(set_attr "type" "neon_fp_mul_d_scalar_q")]
 )
 
+(define_insn "aarch64_rsqrte_<mode>2"	;; Reciprocal sqrt initial estimate.
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
+		     UNSPEC_RSQRTE))]
+  "TARGET_SIMD"
+  "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
+  [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")])
+
+(define_insn "aarch64_rsqrts_<mode>3"	;; Reciprocal sqrt Newton-Raphson step.
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")
+	       (match_operand:VALLF 2 "register_operand" "w")]
+		     UNSPEC_RSQRTS))]
+  "TARGET_SIMD"
+  "frsqrts\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
+  [(set_attr "type" "neon_fp_rsqrts_<Vetype><q>")])
+
+(define_expand "aarch64_rsqrt_<mode>2"	;; Full rsqrt; used by the rsqrt builtins.
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
+		     UNSPEC_RSQRT))]
+  "TARGET_SIMD"
+{
+  aarch64_emit_swrsqrt (operands[0], operands[1]);  ;; Emits the whole sequence.
+  DONE;
+})
+
 (define_insn "*aarch64_mul3_elt_to_64v2df"
   [(set (match_operand:DF 0 "register_operand" "=w")
      (mult:DF
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index 628386b..6f7dbce 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -29,4 +29,5 @@
      AARCH64_TUNE_ to give an enum name. */
 
 AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
+AARCH64_EXTRA_TUNING_OPTION ("recip_sqrt", RECIP_SQRT)
 
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 5130e37..387d744 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -408,7 +408,8 @@ static const struct tune_params cortexa57_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
-  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS)	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
+   | AARCH64_EXTRA_TUNE_RECIP_SQRT)	/* tune_flags.  */
 };
 
 static const struct tune_params cortexa72_tunings =
@@ -472,7 +473,7 @@ static const struct tune_params xgene1_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
-  (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_RECIP_SQRT)	/* tune_flags.  */
 };
 
 /* Support for fine-grained override of the tuning structures.  */
@@ -7005,6 +7006,105 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
   return aarch64_tune_params.memmov_cost;
 }
 
+/* Implement TARGET_BUILTIN_RECIPROCAL.  Return the rsqrt builtin for sqrt
+   function FN (looked up according to MD_FN), or NULL_TREE to decline.  */
+
+static tree
+aarch64_builtin_reciprocal (unsigned int fn,
+			    bool md_fn,
+			    bool)	/* Third hook argument is not used here.  */
+{
+  if (flag_trapping_math		/* Decline if trapping math is on,  */
+      || !flag_unsafe_math_optimizations	/* unsafe math opts are off,  */
+      || optimize_size			/* we are optimizing for size,  */
+      || ! (aarch64_tune_params.extra_tuning_flags
+	   & AARCH64_EXTRA_TUNE_RECIP_SQRT))	/* or tuning lacks recip_sqrt.  */
+  {
+    return NULL_TREE;
+  }
+
+  return aarch64_builtin_rsqrt (fn, md_fn);
+}
+
+typedef rtx (*rsqrte_type) (rtx, rtx);
+
+/* Return the generator of the reciprocal square root initial estimate
+   (frsqrte) insn for MODE; MODE must be one of the five modes below.  */
+
+rsqrte_type
+get_rsqrte_type (machine_mode mode)
+{
+  switch (mode)
+  {
+    case DFmode:   return gen_aarch64_rsqrte_df2;
+    case SFmode:   return gen_aarch64_rsqrte_sf2;
+    case V2DFmode: return gen_aarch64_rsqrte_v2df2;
+    case V2SFmode: return gen_aarch64_rsqrte_v2sf2;
+    case V4SFmode: return gen_aarch64_rsqrte_v4sf2;
+    default: gcc_unreachable ();	/* No frsqrte pattern for other modes.  */
+  }
+}
+
+typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
+
+/* Return the generator of the reciprocal square root Newton-Raphson step
+   (frsqrts) insn for MODE; MODE must be one of the five modes below.  */
+
+rsqrts_type
+get_rsqrts_type (machine_mode mode)
+{
+  switch (mode)
+  {
+    case DFmode:   return gen_aarch64_rsqrts_df3;
+    case SFmode:   return gen_aarch64_rsqrts_sf3;
+    case V2DFmode: return gen_aarch64_rsqrts_v2df3;
+    case V2SFmode: return gen_aarch64_rsqrts_v2sf3;
+    case V4SFmode: return gen_aarch64_rsqrts_v4sf3;
+    default: gcc_unreachable ();	/* No frsqrts pattern for other modes.  */
+  }
+}
+
+/* Emit the instruction sequence computing the reciprocal square root of SRC
+   into DST: an frsqrte estimate refined by three Newton-Raphson steps for
+   double and two for single precision, one fewer in low-precision mode.  */
+
+void
+aarch64_emit_swrsqrt (rtx dst, rtx src)
+{
+  machine_mode mode = GET_MODE (src);
+  gcc_assert (mode == SFmode || mode == V2SFmode || mode == V4SFmode
+	      || mode == DFmode || mode == V2DFmode);
+
+  /* Copy SRC to a fresh register; it is read again by every step.  */
+  rtx xsrc = gen_reg_rtx (mode);
+  emit_move_insn (xsrc, src);
+  rtx x0 = gen_reg_rtx (mode);
+
+  emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));	/* x0 = estimate.  */
+
+  bool double_mode = (mode == DFmode || mode == V2DFmode);
+
+  int iterations = double_mode ? 3 : 2;
+
+  if (flag_mrecip_low_precision_sqrt)
+    iterations--;	/* -mlow-precision-recip-sqrt: one step fewer.  */
+
+  for (int i = 0; i < iterations; ++i)
+    {
+      rtx x1 = gen_reg_rtx (mode);
+      rtx x2 = gen_reg_rtx (mode);
+      rtx x3 = gen_reg_rtx (mode);
+      emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));	/* x2 = x0 * x0.  */
+
+      emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));  /* frsqrts step.  */
+
+      emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));	/* x1 = x0 * x3.  */
+      x0 = x1;	/* The refined value feeds the next iteration.  */
+    }
+
+  emit_move_insn (dst, x0);
+}
+
 /* Return the number of instructions that can be issued per cycle.  */
 static int
 aarch64_sched_issue_rate (void)
@@ -13343,6 +13443,9 @@ aarch64_promoted_type (const_tree t)
 #undef TARGET_BUILTIN_DECL
 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
 
+#undef TARGET_BUILTIN_RECIPROCAL
+#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
+
 #undef  TARGET_EXPAND_BUILTIN
 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 208f58f..48421d8 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -126,6 +126,9 @@
     UNSPEC_VSTRUCTDUMMY
     UNSPEC_SP_SET
     UNSPEC_SP_TEST
+    UNSPEC_RSQRT
+    UNSPEC_RSQRTE
+    UNSPEC_RSQRTS
 ])
 
 (define_c_enum "unspecv" [
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index a1ce58d..e5691be 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -148,3 +148,8 @@ Enum(aarch64_abi) String(lp64) Value(AARCH64_ABI_LP64)
 mpc-relative-literal-loads
 Target Report Save Var(nopcrelative_literal_loads) Init(2) Save
 PC relative literal loads.
+
+mlow-precision-recip-sqrt
+Common Var(flag_mrecip_low_precision_sqrt) Optimization
+When calculating the reciprocal square root approximation, run one step fewer.
+This reduces precision, but can result in faster computation.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 54e9f12..15e5a6d 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -520,6 +520,7 @@ Objective-C and Objective-C++ Dialects}.
 -mtls-size=@var{size} @gol
 -mfix-cortex-a53-835769  -mno-fix-cortex-a53-835769 @gol
 -mfix-cortex-a53-843419  -mno-fix-cortex-a53-843419 @gol
+-mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt @gol
 -march=@var{name}  -mcpu=@var{name}  -mtune=@var{name}}
 
 @emph{Adapteva Epiphany Options}
@@ -12482,6 +12483,17 @@ Enable or disable the workaround for the ARM Cortex-A53 erratum number 843419.
 This erratum workaround is made at link time and this will only pass the
 corresponding flag to the linker.
 
+@item -mlow-precision-recip-sqrt
+@itemx -mno-low-precision-recip-sqrt
+@opindex mlow-precision-recip-sqrt
+@opindex mno-low-precision-recip-sqrt
+When computing the reciprocal square root estimate, use two refinement steps
+instead of three for double precision, and one step instead of two for
+single precision, thus reducing both latency and precision.
+This is only relevant if @option{-ffast-math} activates the
+reciprocal square root estimate instructions,
+which in turn depends on the target processor.
+
 @item -march=@var{name}
 @opindex march
 Specify the name of the target architecture, optionally suffixed by one or
diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt_1.c b/gcc/testsuite/gcc.target/aarch64/rsqrt_1.c
new file mode 100644
index 0000000..d235be8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/rsqrt_1.c
@@ -0,0 +1,111 @@
+/* Test for the recip_sqrt tuning
+   ensuring functionality and sufficient accuracy.  */
+/* { dg-do run } */
+/* { dg-options "-O3 --std=c99 --save-temps -fverbose-asm -funsafe-math-optimizations -fno-math-errno -mtune=generic -mcpu=generic -moverride=tune=recip_sqrt" } */
+
+#define PI    3.141592653589793
+#define SQRT2 1.4142135623730951
+
+#define PI_4 0.7853981633974483
+#define SQRT1_2 0.7071067811865475
+
+/* 2^25+1, float has 24 significand bits
+         according to Single-precision floating-point format.  */
+#define TESTA8_FLT 33554433
+/* 2^54+1, double has 53 significand bits
+         according to Double-precision floating-point format.  */
+#define TESTA8_DBL 18014398509481985
+
+#define EPSILON_double __DBL_EPSILON__
+#define EPSILON_float __FLT_EPSILON__
+#define ABS_double __builtin_fabs
+#define ABS_float __builtin_fabsf
+#define SQRT_double __builtin_sqrt
+#define SQRT_float __builtin_sqrtf
+#define ISNAN_double __builtin_isnan
+#define ISNAN_float __builtin_isnanf
+
+extern void abort (void);
+
+#define TESTTYPE(TYPE) /* Define rsqrt_, equals_ and t_ helpers for TYPE.  */ \
+TYPE \
+rsqrt_##TYPE (TYPE a) \
+{ \
+  return 1.0/SQRT_##TYPE (a); /* The 1/sqrt expression under test.  */ \
+} \
+/* Nonzero if A and B are equal, both NaN, or differ by less than epsilon.  */ \
+int \
+equals_##TYPE (TYPE a, TYPE b) \
+{ \
+  return (a == b || \
+   (ISNAN_##TYPE (a) && ISNAN_##TYPE (b)) || \
+   (ABS_##TYPE (a - b) < EPSILON_##TYPE)); /* Absolute, not relative.  */ \
+} \
+/* Abort unless rsqrt_TYPE (A) matches the expected RESULT.  */ \
+void \
+t_##TYPE (TYPE a, TYPE result) \
+{ \
+  TYPE r = rsqrt_##TYPE (a); \
+  if (!equals_##TYPE (r, result)) \
+  { \
+    abort (); \
+  } \
+} \
+
+TESTTYPE (double)
+TESTTYPE (float)
+
+int
+main ()
+{
+  double nan = __builtin_nan ("");
+  double inf = __builtin_inf ();
+  float nanf = __builtin_nanf ("");
+  float inff = __builtin_inff ();
+
+  t_double (1.0/256, 0X1.00000000000000P+4);
+  t_double (1.0, 0X1.00000000000000P+0);
+  t_double (-1.0, nan);	/* Negative input: NaN expected.  */
+  t_double (11.0, 0X1.34BF63D1568260P-2);
+  t_double (0.0,  inf);
+  t_double (inf, 0X0.00000000000000P+0);
+  t_double (nan, nan);
+  t_double (-nan, -nan);
+  t_double (__DBL_MAX__, 0X1.00000000000010P-512);
+  t_double (__DBL_MIN__, 0X1.00000000000000P+511);
+  t_double (PI, 0X1.20DD750429B6D0P-1);
+  t_double (PI_4, 0X1.20DD750429B6D0P+0);
+  t_double (SQRT2, 0X1.AE89F995AD3AE0P-1);
+  t_double (SQRT1_2, 0X1.306FE0A31B7150P+0);
+  t_double (-PI, nan);
+  t_double (-SQRT2, nan);
+  t_double (TESTA8_DBL, 0X1.00000000000000P-27);
+
+  t_float (1.0/256, 0X1.00000000000000P+4);
+  t_float (1.0, 0X1.00000000000000P+0);
+  t_float (-1.0, nanf);	/* Negative input: NaN expected.  */
+  t_float (11.0, 0X1.34BF6400000000P-2);
+  t_float (0.0,  inff);
+  t_float (inff, 0X0.00000000000000P+0);
+  t_float (nanf, nanf);
+  t_float (-nanf, -nanf);
+  t_float (__FLT_MAX__, 0X1.00000200000000P-64);
+  t_float (__FLT_MIN__, 0X1.00000000000000P+63);
+  t_float (PI, 0X1.20DD7400000000P-1);
+  t_float (PI_4, 0X1.20DD7400000000P+0);
+  t_float (SQRT2, 0X1.AE89FA00000000P-1);
+  t_float (SQRT1_2, 0X1.306FE000000000P+0);
+  t_float (-PI, nanf);
+  t_float (-SQRT2, nanf);
+  t_float (TESTA8_FLT, 0X1.6A09E600000000P-13);
+
+/*   Disabled: with -ffast-math these return positive INF, not -INF.  */
+/*   t_double (-0.0, -inf);  */
+/*   t_float (-0.0, -inff);  */
+
+/*   Disabled: the reason here is that -ffast-math flushes to zero.  */
+/*   t_double  (__DBL_MIN__/256, 0X1.00000000000000P+515);  */
+/*   t_float (__FLT_MIN__/256, 0X1.00000000000000P+67);  */
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c b/gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c
new file mode 100644
index 0000000..b838ed3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c
@@ -0,0 +1,25 @@
+/* Test for the recip_sqrt tuning
+   ensuring the correct instructions are generated.  */
+/* { dg-do compile } */
+/* { dg-options "-O3 --std=c99 --save-temps -fverbose-asm -funsafe-math-optimizations -fno-math-errno -mtune=generic -mcpu=generic -moverride=tune=recip_sqrt" } */
+
+#include "rsqrt_asm_check_common.h"
+
+TESTTYPE (double)
+TESTTYPE (float)
+
+/* { dg-final { scan-assembler-times "frsqrte\\td\[0-9\]+, d\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 3 } } */
+
+/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2d, v\[0-9\]+.2d" 3 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2d, v\[0-9\]+.2d, v\[0-9\]+.2d" 9 } } */
+
+
+/* { dg-final { scan-assembler-times "frsqrte\\ts\[0-9\]+, s\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\ts\[0-9\]+, s\[0-9\]+, s\[0-9\]+" 2 } } */
+
+/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.4s, v\[0-9\]+.4s" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.4s, v\[0-9\]+.4s, v\[0-9\]+.4s" 2 } } */
+
+/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2s, v\[0-9\]+.2s" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2s, v\[0-9\]+.2s, v\[0-9\]+.2s" 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h b/gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h
new file mode 100644
index 0000000..8a851e1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h
@@ -0,0 +1,42 @@
+#define sqrt_float   __builtin_sqrtf
+#define sqrt_double  __builtin_sqrt
+
+#define TESTTYPE(TYPE) /* Define 4-, 2- and 1-element rsqrt tests for TYPE.  */ \
+typedef struct { \
+  TYPE a; \
+  TYPE b; \
+  TYPE c; \
+  TYPE d; \
+} s4_##TYPE; \
+/* Two-element aggregate of TYPE.  */ \
+typedef struct { \
+  TYPE a; \
+  TYPE b; \
+} s2_##TYPE; \
+/* Four independent 1/sqrt computations, one per member.  */ \
+s4_##TYPE \
+rsqrtv4_##TYPE (s4_##TYPE i) \
+{ \
+  s4_##TYPE o; \
+  o.a = 1.0 / sqrt_##TYPE (i.a); \
+  o.b = 1.0 / sqrt_##TYPE (i.b); \
+  o.c = 1.0 / sqrt_##TYPE (i.c); \
+  o.d = 1.0 / sqrt_##TYPE (i.d); \
+  return o; \
+} \
+/* Two independent 1/sqrt computations, one per member.  */ \
+s2_##TYPE \
+rsqrtv2_##TYPE (s2_##TYPE i) \
+{ \
+  s2_##TYPE o; \
+  o.a = 1.0 / sqrt_##TYPE (i.a); \
+  o.b = 1.0 / sqrt_##TYPE (i.b); \
+  return o; \
+} \
+/* Scalar 1/sqrt.  */ \
+TYPE \
+rsqrt_##TYPE (TYPE i) \
+{ \
+  return 1.0 / sqrt_##TYPE (i); \
+} \
+
diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c b/gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c
new file mode 100644
index 0000000..b76cc9c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c
@@ -0,0 +1,12 @@
+/* Test for the recip_sqrt tuning
+   ensuring the correct instructions are generated.  */
+/* { dg-do compile } */
+/* { dg-options "-O3 --std=c99 --save-temps -fverbose-asm -funsafe-math-optimizations -fno-math-errno -mtune=generic -mcpu=generic" } */
+
+#include "rsqrt_asm_check_common.h"
+
+TESTTYPE (double)
+TESTTYPE (float)
+
+/* { dg-final { scan-assembler-times "frsqrte" 0 } } */
+/* { dg-final { scan-assembler-times "frsqrts" 0 } } */
-- 
1.9.1

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] 2015-10-15 Benedikt Huber <benedikt.huber@theobroma-systems.com> Philipp Tomsich <philipp.tomsich@theobroma-systems.com>
  2015-10-16 15:03             ` Marcus Shawcroft
  2015-10-16 16:36               ` [PATCH v9][aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math Benedikt Huber
@ 2015-10-17  1:14               ` Oleg Endo
  2015-10-19 14:18                 ` [PATCH v10][aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math Benedikt Huber
  1 sibling, 1 reply; 23+ messages in thread
From: Oleg Endo @ 2015-10-17  1:14 UTC (permalink / raw)
  To: Marcus Shawcroft
  Cc: Benedikt Huber, gcc-patches, Dr. Philipp Tomsich,
	Richard.Earnshaw, James Greenhalgh

On Fri, 2015-10-16 at 15:47 +0100, Marcus Shawcroft wrote:
> On 16 October 2015 at 15:31, Benedikt Huber
> <benedikt.huber@theobroma-systems.com> wrote:
> > I introduced this in revision 7 due to a request from James Greenhalgh.
> > https://gcc.gnu.org/ml/gcc-patches/2015-10/msg00963.html
> >
> >> Given that this is all so mechanical, I'd have a preference towards
> >> refactoring this to loop over some structured data.
> >
> > Do you mean, that I should get rid of the typedef and leave the struct without it?
> > Or should I completely drop the struct?
> 
> The use of the struct is fine, we are being discouraged from using
> unnecessary typedefs.  Just rewrite it as:
> 
>  struct builtin_decls_data
>    {
>    ...
>    };
> 

Yes, that's what I meant.  Also things like 

> +void
> +aarch64_emit_swrsqrt (rtx dst, rtx src)
> +{
> +  enum machine_mode mode = GET_MODE (src);
> +  gcc_assert (
> +    mode == SFmode || mode == V2SFmode || mode == V4SFmode
> +       || mode == DFmode || mode == V2DFmode);

Instead of "enum machine_mode" use "machine_mode".  Similarly, for 
"struct bleh*" use "bleh*".  Although it seems there are no occurrences
of the latter in your patch.

Cheers,
Oleg

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH v10][aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math
  2015-10-17  1:14               ` Oleg Endo
@ 2015-10-19 14:18                 ` Benedikt Huber
  2015-10-19 14:29                   ` [PATCH] 2015-10-19 Benedikt Huber <benedikt.huber@theobroma-systems.com> Philipp Tomsich <philipp.tomsich@theobroma-systems.com> Benedikt Huber
  2015-10-19 14:31                   ` [PATCH v10][aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math Bernd Schmidt
  0 siblings, 2 replies; 23+ messages in thread
From: Benedikt Huber @ 2015-10-19 14:18 UTC (permalink / raw)
  To: gcc-patches
  Cc: philipp.tomsich, Venkataramanan.Kumar, pinskia, e.menezes,
	kyrylo.tkachov, Richard.Earnshaw, marcus.shawcroft,
	james.greenhalgh, oleg.endo, Benedikt Huber

This tenth revision of the patch:
 * Removes the unnecessary "enum" keyword in front of machine_mode.

Ok for check in.


Benedikt Huber (1):
  2015-10-19  Benedikt Huber  <benedikt.huber@theobroma-systems.com>    
    	    Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>

 gcc/ChangeLog                                      |  20 ++++
 gcc/config/aarch64/aarch64-builtins.c              | 115 +++++++++++++++++++++
 gcc/config/aarch64/aarch64-protos.h                |   4 +
 gcc/config/aarch64/aarch64-simd.md                 |  27 +++++
 gcc/config/aarch64/aarch64-tuning-flags.def        |   1 +
 gcc/config/aarch64/aarch64.c                       | 107 ++++++++++++++++++-
 gcc/config/aarch64/aarch64.md                      |   3 +
 gcc/config/aarch64/aarch64.opt                     |   5 +
 gcc/doc/invoke.texi                                |  12 +++
 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c         | 111 ++++++++++++++++++++
 .../gcc.target/aarch64/rsqrt_asm_check_1.c         |  25 +++++
 .../gcc.target/aarch64/rsqrt_asm_check_common.h    |  42 ++++++++
 .../aarch64/rsqrt_asm_check_negative_1.c           |  12 +++
 13 files changed, 482 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c

-- 
1.9.1

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH] 2015-10-19  Benedikt Huber  <benedikt.huber@theobroma-systems.com>      Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
  2015-10-19 14:18                 ` [PATCH v10][aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math Benedikt Huber
@ 2015-10-19 14:29                   ` Benedikt Huber
  2015-10-20 13:40                     ` Marcus Shawcroft
  2015-10-19 14:31                   ` [PATCH v10][aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math Bernd Schmidt
  1 sibling, 1 reply; 23+ messages in thread
From: Benedikt Huber @ 2015-10-19 14:29 UTC (permalink / raw)
  To: gcc-patches
  Cc: philipp.tomsich, Venkataramanan.Kumar, pinskia, e.menezes,
	kyrylo.tkachov, Richard.Earnshaw, marcus.shawcroft,
	james.greenhalgh, oleg.endo, Benedikt Huber

	* config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf.
	* config/aarch64/aarch64-protos.h: Declare.
	* config/aarch64/aarch64-simd.md: Matching expressions for frsqrte and
	frsqrts.
	* config/aarch64/aarch64-tuning-flags.def: Added recip_sqrt.
	* config/aarch64/aarch64.c: New functions. Emit rsqrt estimation code when
	applicable.
	* config/aarch64/aarch64.md: Added enum entries.
	* config/aarch64/aarch64.opt: Added option -mlow-precision-recip-sqrt.
	* testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h: Common macros for
	assembly checks.
	* testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c: Make sure
	frsqrts and frsqrte are not emitted.
	* testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c: Make sure frsqrts and
	frsqrte are emitted.
	* testsuite/gcc.target/aarch64/rsqrt_1.c: Functional tests for rsqrt.

Signed-off-by: Philipp Tomsich <philipp.tomsich@theobroma-systems.com>
---
 gcc/ChangeLog                                      |  20 ++++
 gcc/config/aarch64/aarch64-builtins.c              | 115 +++++++++++++++++++++
 gcc/config/aarch64/aarch64-protos.h                |   4 +
 gcc/config/aarch64/aarch64-simd.md                 |  27 +++++
 gcc/config/aarch64/aarch64-tuning-flags.def        |   1 +
 gcc/config/aarch64/aarch64.c                       | 107 ++++++++++++++++++-
 gcc/config/aarch64/aarch64.md                      |   3 +
 gcc/config/aarch64/aarch64.opt                     |   5 +
 gcc/doc/invoke.texi                                |  12 +++
 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c         | 111 ++++++++++++++++++++
 .../gcc.target/aarch64/rsqrt_asm_check_1.c         |  25 +++++
 .../gcc.target/aarch64/rsqrt_asm_check_common.h    |  42 ++++++++
 .../aarch64/rsqrt_asm_check_negative_1.c           |  12 +++
 13 files changed, 482 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index f39753d..596c9c3 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,23 @@
+2015-10-19  Benedikt Huber  <benedikt.huber@theobroma-systems.com>
+	    Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
+
+	* config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf.
+	* config/aarch64/aarch64-protos.h: Declare.
+	* config/aarch64/aarch64-simd.md: Matching expressions for frsqrte and
+	frsqrts.
+	* config/aarch64/aarch64-tuning-flags.def: Added recip_sqrt.
+	* config/aarch64/aarch64.c: New functions. Emit rsqrt estimation code when
+	applicable.
+	* config/aarch64/aarch64.md: Added enum entries.
+	* config/aarch64/aarch64.opt: Added option -mlow-precision-recip-sqrt.
+	* testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h: Common macros for
+	assembly checks.
+	* testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c: Make sure
+	frsqrts and frsqrte are not emitted.
+	* testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c: Make sure frsqrts and
+	frsqrte are emitted.
+	* testsuite/gcc.target/aarch64/rsqrt_1.c: Functional tests for rsqrt.
+
 2015-10-16  Trevor Saunders  <tbsaunde+gcc@tbsaunde.org>
 
 	* lra-constraints.c (add_next_usage_insn): Change argument type
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index a1998ed..6b4208f 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -324,6 +324,11 @@ enum aarch64_builtins
   AARCH64_BUILTIN_GET_FPSR,
   AARCH64_BUILTIN_SET_FPSR,
 
+  AARCH64_BUILTIN_RSQRT_DF,
+  AARCH64_BUILTIN_RSQRT_SF,
+  AARCH64_BUILTIN_RSQRT_V2DF,
+  AARCH64_BUILTIN_RSQRT_V2SF,
+  AARCH64_BUILTIN_RSQRT_V4SF,
   AARCH64_SIMD_BUILTIN_BASE,
   AARCH64_SIMD_BUILTIN_LANE_CHECK,
 #include "aarch64-simd-builtins.def"
@@ -822,6 +827,46 @@ aarch64_init_crc32_builtins ()
     }
 }
 
+/* Register the scalar and vector __builtin_aarch64_rsqrt_* builtins.  */
+
+void
+aarch64_init_builtin_rsqrt (void)
+{
+  tree fndecl = NULL;
+  tree ftype = NULL;
+
+  tree V2SF_type_node = build_vector_type (float_type_node, 2);
+  tree V2DF_type_node = build_vector_type (double_type_node, 2);
+  tree V4SF_type_node = build_vector_type (float_type_node, 4);
+
+  struct builtin_decls_data
+  {
+    tree type_node;	/* Used as both argument and return type.  */
+    const char *builtin_name;	/* Name passed to add_builtin_function.  */
+    int function_code;	/* Function code and aarch64_builtin_decls index.  */
+  };
+
+  builtin_decls_data bdda[] =
+  {
+    { double_type_node, "__builtin_aarch64_rsqrt_df", AARCH64_BUILTIN_RSQRT_DF },
+    { float_type_node, "__builtin_aarch64_rsqrt_sf", AARCH64_BUILTIN_RSQRT_SF },
+    { V2DF_type_node, "__builtin_aarch64_rsqrt_v2df", AARCH64_BUILTIN_RSQRT_V2DF },
+    { V2SF_type_node, "__builtin_aarch64_rsqrt_v2sf", AARCH64_BUILTIN_RSQRT_V2SF },
+    { V4SF_type_node, "__builtin_aarch64_rsqrt_v4sf", AARCH64_BUILTIN_RSQRT_V4SF }
+  };
+
+  builtin_decls_data *bdd = bdda;
+  builtin_decls_data *bdd_end = bdd + (sizeof (bdda) / sizeof (builtin_decls_data));
+
+  for (; bdd < bdd_end; bdd++)	/* One builtin per table entry.  */
+  {
+    ftype = build_function_type_list (bdd->type_node, bdd->type_node, NULL_TREE);
+    fndecl = add_builtin_function (bdd->builtin_name,
+      ftype, bdd->function_code, BUILT_IN_MD, NULL, NULL_TREE);
+    aarch64_builtin_decls[bdd->function_code] = fndecl;	/* Record the decl.  */
+  }
+}
+
 void
 aarch64_init_builtins (void)
 {
@@ -853,6 +898,7 @@ aarch64_init_builtins (void)
     aarch64_init_simd_builtins ();
 
   aarch64_init_crc32_builtins ();
+  aarch64_init_builtin_rsqrt ();
 }
 
 tree
@@ -1116,6 +1162,44 @@ aarch64_crc32_expand_builtin (int fcode, tree exp, rtx target)
   return target;
 }
 
+/* Expand a call EXP to the rsqrt builtin FCODE; result goes in TARGET.  */
+
+static rtx
+aarch64_expand_builtin_rsqrt (int fcode, tree exp, rtx target)
+{
+  tree arg0 = CALL_EXPR_ARG (exp, 0);
+  rtx op0 = expand_normal (arg0);	/* RTL for the single argument.  */
+
+  rtx (*gen) (rtx, rtx);	/* Generator of the aarch64_rsqrt_<mode>2 expander.  */
+
+  switch (fcode)
+    {
+      case AARCH64_BUILTIN_RSQRT_DF:
+	gen = gen_aarch64_rsqrt_df2;
+	break;
+      case AARCH64_BUILTIN_RSQRT_SF:
+	gen = gen_aarch64_rsqrt_sf2;
+	break;
+      case AARCH64_BUILTIN_RSQRT_V2DF:
+	gen = gen_aarch64_rsqrt_v2df2;
+	break;
+      case AARCH64_BUILTIN_RSQRT_V2SF:
+	gen = gen_aarch64_rsqrt_v2sf2;
+	break;
+      case AARCH64_BUILTIN_RSQRT_V4SF:
+	gen = gen_aarch64_rsqrt_v4sf2;
+	break;
+      default: gcc_unreachable ();	/* FCODE is not an rsqrt builtin.  */
+    }
+
+  if (!target)
+    target = gen_reg_rtx (GET_MODE (op0));	/* New register in OP0's mode.  */
+
+  emit_insn (gen (target, op0));
+
+  return target;
+}
+
 /* Expand an expression EXP that calls a built-in function,
    with result going to TARGET if that's convenient.  */
 rtx
@@ -1163,6 +1247,13 @@ aarch64_expand_builtin (tree exp,
   else if (fcode >= AARCH64_CRC32_BUILTIN_BASE && fcode <= AARCH64_CRC32_BUILTIN_MAX)
     return aarch64_crc32_expand_builtin (fcode, exp, target);
 
+  if (fcode == AARCH64_BUILTIN_RSQRT_DF
+      || fcode == AARCH64_BUILTIN_RSQRT_SF
+      || fcode == AARCH64_BUILTIN_RSQRT_V2DF
+      || fcode == AARCH64_BUILTIN_RSQRT_V2SF
+      || fcode == AARCH64_BUILTIN_RSQRT_V4SF)
+    return aarch64_expand_builtin_rsqrt (fcode, exp, target);
+
   gcc_unreachable ();
 }
 
@@ -1320,6 +1411,30 @@ aarch64_builtin_vectorized_function (tree fndecl, tree type_out, tree type_in)
   return NULL_TREE;
 }
 
+/* Return the rsqrt builtin decl matching sqrt code FN, or NULL_TREE.  */
+
+tree
+aarch64_builtin_rsqrt (unsigned int fn, bool md_fn)
+{
+  if (md_fn)	/* FN is compared against AARCH64_SIMD_BUILTIN_* sqrt codes.  */
+    {
+      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2df)
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF];
+      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2sf)
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF];
+      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv4sf)
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF];
+    }
+  else	/* FN is compared against the generic BUILT_IN_SQRT{,F} codes.  */
+    {
+      if (fn == BUILT_IN_SQRT)
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF];
+      if (fn == BUILT_IN_SQRTF)
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF];
+    }
+  return NULL_TREE;	/* No rsqrt replacement for this function code.  */
+}
+
 #undef VAR1
 #define VAR1(T, N, MAP, A) \
   case AARCH64_SIMD_BUILTIN_##T##_##N##A:
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index baaf1bd..00775db 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -332,6 +332,8 @@ void aarch64_register_pragmas (void);
 void aarch64_relayout_simd_types (void);
 void aarch64_reset_previous_fndecl (void);
 
+void aarch64_emit_swrsqrt (rtx, rtx);
+
 /* Initialize builtins for SIMD intrinsics.  */
 void init_aarch64_simd_builtins (void);
 
@@ -383,6 +385,8 @@ rtx aarch64_expand_builtin (tree exp,
 			    int ignore ATTRIBUTE_UNUSED);
 tree aarch64_builtin_decl (unsigned, bool ATTRIBUTE_UNUSED);
 
+tree aarch64_builtin_rsqrt (unsigned int, bool);
+
 tree
 aarch64_builtin_vectorized_function (tree fndecl,
 				     tree type_out,
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 167277e..8c359cb 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -354,6 +354,33 @@
   [(set_attr "type" "neon_fp_mul_d_scalar_q")]
 )
 
+(define_insn "aarch64_rsqrte_<mode>2"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
+		     UNSPEC_RSQRTE))]
+  "TARGET_SIMD"
+  "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
+  [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")])
+
+(define_insn "aarch64_rsqrts_<mode>3"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")
+	       (match_operand:VALLF 2 "register_operand" "w")]
+		     UNSPEC_RSQRTS))]
+  "TARGET_SIMD"
+  "frsqrts\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
+  [(set_attr "type" "neon_fp_rsqrts_<Vetype><q>")])
+
+(define_expand "aarch64_rsqrt_<mode>2"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
+		     UNSPEC_RSQRT))]
+  "TARGET_SIMD"
+{
+  aarch64_emit_swrsqrt (operands[0], operands[1]);
+  DONE;
+})
+
 (define_insn "*aarch64_mul3_elt_to_64v2df"
   [(set (match_operand:DF 0 "register_operand" "=w")
      (mult:DF
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index 628386b..6f7dbce 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -29,4 +29,5 @@
      AARCH64_TUNE_ to give an enum name. */
 
 AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
+AARCH64_EXTRA_TUNING_OPTION ("recip_sqrt", RECIP_SQRT)
 
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index aba5b56..a7904df 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -392,7 +392,8 @@ static const struct tune_params cortexa57_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
-  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS)	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
+   | AARCH64_EXTRA_TUNE_RECIP_SQRT)	/* tune_flags.  */
 };
 
 static const struct tune_params cortexa72_tunings =
@@ -456,7 +457,7 @@ static const struct tune_params xgene1_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
-  (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_RECIP_SQRT)	/* tune_flags.  */
 };
 
 /* Support for fine-grained override of the tuning structures.  */
@@ -6989,6 +6990,105 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
   return aarch64_tune_params.memmov_cost;
 }
 
+/* Implement TARGET_BUILTIN_RECIPROCAL: return the rsqrt builtin that
+   replaces sqrt function FN, or NULL_TREE to keep the original.  */
+
+static tree
+aarch64_builtin_reciprocal (unsigned int fn,
+			    bool md_fn,
+			    bool)
+{
+  bool tuned = (aarch64_tune_params.extra_tuning_flags
+		& AARCH64_EXTRA_TUNE_RECIP_SQRT) != 0;
+  /* Needs unsafe, non-trapping math; skipped when optimizing for size.  */
+  if (!tuned
+      || optimize_size
+      || flag_trapping_math
+      || !flag_unsafe_math_optimizations)
+    return NULL_TREE;
+
+  return aarch64_builtin_rsqrt (fn, md_fn);
+}
+
+typedef rtx (*rsqrte_type) (rtx, rtx);
+
+/* Return the generator function for the reciprocal square root initial
+   estimate (frsqrte) insn of MODE; other modes are unreachable.  */
+
+static rsqrte_type
+get_rsqrte_type (machine_mode mode)
+{
+  switch (mode)
+    {
+    case DFmode:   return gen_aarch64_rsqrte_df2;
+    case SFmode:   return gen_aarch64_rsqrte_sf2;
+    case V2DFmode: return gen_aarch64_rsqrte_v2df2;
+    case V2SFmode: return gen_aarch64_rsqrte_v2sf2;
+    case V4SFmode: return gen_aarch64_rsqrte_v4sf2;
+    default: gcc_unreachable ();
+    }
+}
+
+typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
+
+/* Return the generator function for the reciprocal square root
+   Newton-Raphson step (frsqrts) insn of MODE; other modes are unreachable.  */
+
+static rsqrts_type
+get_rsqrts_type (machine_mode mode)
+{
+  switch (mode)
+    {
+    case DFmode:   return gen_aarch64_rsqrts_df3;
+    case SFmode:   return gen_aarch64_rsqrts_sf3;
+    case V2DFmode: return gen_aarch64_rsqrts_v2df3;
+    case V2SFmode: return gen_aarch64_rsqrts_v2sf3;
+    case V4SFmode: return gen_aarch64_rsqrts_v4sf3;
+    default: gcc_unreachable ();
+    }
+}
+
+/* Emit insns setting DST to an approximate reciprocal square root of SRC:
+   an initial estimate refined by Newton-Raphson steps, three for double
+   precision, two for single, one fewer if -mlow-precision-recip-sqrt.  */
+
+void
+aarch64_emit_swrsqrt (rtx dst, rtx src)
+{
+  machine_mode mode = GET_MODE (src);
+  gcc_assert (
+    mode == SFmode || mode == V2SFmode || mode == V4SFmode
+	|| mode == DFmode || mode == V2DFmode);
+
+  rtx xsrc = gen_reg_rtx (mode);
+  emit_move_insn (xsrc, src);
+  rtx x0 = gen_reg_rtx (mode);
+  /* Seed x0 with the initial estimate insn for MODE.  */
+  emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
+
+  bool double_mode = (mode == DFmode || mode == V2DFmode);
+
+  int iterations = double_mode ? 3 : 2;
+  /* The low-precision option trades one refinement step for speed.  */
+  if (flag_mrecip_low_precision_sqrt)
+    iterations--;
+
+  for (int i = 0; i < iterations; ++i)
+    {
+      rtx x1 = gen_reg_rtx (mode);
+      rtx x2 = gen_reg_rtx (mode);
+      rtx x3 = gen_reg_rtx (mode);
+      emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
+      /* x3 = step insn for MODE applied to xsrc and x0 squared.  */
+      emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
+      /* The next estimate is the current one scaled by x3.  */
+      emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
+      x0 = x1;
+    }
+
+  emit_move_insn (dst, x0);
+}
+
 /* Return the number of instructions that can be issued per cycle.  */
 static int
 aarch64_sched_issue_rate (void)
@@ -13327,6 +13427,9 @@ aarch64_promoted_type (const_tree t)
 #undef TARGET_BUILTIN_DECL
 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
 
+#undef TARGET_BUILTIN_RECIPROCAL
+#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
+
 #undef  TARGET_EXPAND_BUILTIN
 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 208f58f..48421d8 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -126,6 +126,9 @@
     UNSPEC_VSTRUCTDUMMY
     UNSPEC_SP_SET
     UNSPEC_SP_TEST
+    UNSPEC_RSQRT
+    UNSPEC_RSQRTE
+    UNSPEC_RSQRTS
 ])
 
 (define_c_enum "unspecv" [
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index a1ce58d..e5691be 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -148,3 +148,8 @@ Enum(aarch64_abi) String(lp64) Value(AARCH64_ABI_LP64)
 mpc-relative-literal-loads
 Target Report Save Var(nopcrelative_literal_loads) Init(2) Save
 PC relative literal loads.
+
+mlow-precision-recip-sqrt
+Common Var(flag_mrecip_low_precision_sqrt) Optimization
+When calculating the reciprocal square root approximation, run fewer steps.
+This reduces precision, but can result in faster computation.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 54e9f12..15e5a6d 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -520,6 +520,7 @@ Objective-C and Objective-C++ Dialects}.
 -mtls-size=@var{size} @gol
 -mfix-cortex-a53-835769  -mno-fix-cortex-a53-835769 @gol
 -mfix-cortex-a53-843419  -mno-fix-cortex-a53-843419 @gol
+-mlow-precision-recip-sqrt  -mno-low-precision-recip-sqrt @gol
 -march=@var{name}  -mcpu=@var{name}  -mtune=@var{name}}
 
 @emph{Adapteva Epiphany Options}
@@ -12482,6 +12483,17 @@ Enable or disable the workaround for the ARM Cortex-A53 erratum number 843419.
 This erratum workaround is made at link time and this will only pass the
 corresponding flag to the linker.
 
+@item -mlow-precision-recip-sqrt
+@itemx -mno-low-precision-recip-sqrt
+@opindex mlow-precision-recip-sqrt
+@opindex mno-low-precision-recip-sqrt
+When calculating the reciprocal square root approximation, use two
+refinement steps instead of three for double precision and one instead
+of two for single precision, thus reducing both latency and precision.
+This is only relevant if @option{-ffast-math} activates the reciprocal
+square root estimate instructions, which in turn depends on the target
+processor.
+
 @item -march=@var{name}
 @opindex march
 Specify the name of the target architecture, optionally suffixed by one or
diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt_1.c b/gcc/testsuite/gcc.target/aarch64/rsqrt_1.c
new file mode 100644
index 0000000..d235be8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/rsqrt_1.c
@@ -0,0 +1,111 @@
+/* Test for the recip_sqrt tuning
+   ensuring functionality and sufficient accuracy.  */
+/* { dg-do run } */
+/* { dg-options "-O3 --std=c99 --save-temps -fverbose-asm -funsafe-math-optimizations -fno-math-errno -mtune=generic -mcpu=generic -moverride=tune=recip_sqrt" } */
+
+#define PI    3.141592653589793
+#define SQRT2 1.4142135623730951
+
+#define PI_4 0.7853981633974483
+#define SQRT1_2 0.7071067811865475
+
+/* 2^25+1, float has 24 significand bits
+         according to Single-precision floating-point format.  */
+#define TESTA8_FLT 33554433
+/* 2^54+1, double has 53 significand bits
+         according to Double-precision floating-point format.  */
+#define TESTA8_DBL 18014398509481985
+
+#define EPSILON_double __DBL_EPSILON__
+#define EPSILON_float __FLT_EPSILON__
+#define ABS_double __builtin_fabs
+#define ABS_float __builtin_fabsf
+#define SQRT_double __builtin_sqrt
+#define SQRT_float __builtin_sqrtf
+#define ISNAN_double __builtin_isnan
+#define ISNAN_float __builtin_isnanf
+
+extern void abort (void);
+
+#define TESTTYPE(TYPE) \
+TYPE \
+rsqrt_##TYPE (TYPE a) \
+{ \
+  return 1.0/SQRT_##TYPE (a); \
+} \
+\
+int \
+equals_##TYPE (TYPE a, TYPE b) \
+{ \
+  return (a == b || \
+   (ISNAN_##TYPE (a) && ISNAN_##TYPE (b)) || \
+   (ABS_##TYPE (a - b) < EPSILON_##TYPE)); \
+} \
+\
+void \
+t_##TYPE (TYPE a, TYPE result) \
+{ \
+  TYPE r = rsqrt_##TYPE (a); \
+  if (!equals_##TYPE (r, result)) \
+  { \
+    abort (); \
+  } \
+} \
+
+TESTTYPE (double)
+TESTTYPE (float)
+
+int
+main ()
+{
+  double nan = __builtin_nan ("");
+  double inf = __builtin_inf ();
+  float nanf = __builtin_nanf ("");
+  float inff = __builtin_inff ();
+
+  t_double (1.0/256, 0X1.00000000000000P+4);
+  t_double (1.0, 0X1.00000000000000P+0);
+  t_double (-1.0, nan);
+  t_double (11.0, 0X1.34BF63D1568260P-2);
+  t_double (0.0,  inf);
+  t_double (inf, 0X0.00000000000000P+0);
+  t_double (nan, nan);
+  t_double (-nan, -nan);
+  t_double (__DBL_MAX__, 0X1.00000000000010P-512);
+  t_double (__DBL_MIN__, 0X1.00000000000000P+511);
+  t_double (PI, 0X1.20DD750429B6D0P-1);
+  t_double (PI_4, 0X1.20DD750429B6D0P+0);
+  t_double (SQRT2, 0X1.AE89F995AD3AE0P-1);
+  t_double (SQRT1_2, 0X1.306FE0A31B7150P+0);
+  t_double (-PI, nan);
+  t_double (-SQRT2, nan);
+  t_double (TESTA8_DBL, 0X1.00000000000000P-27);
+
+  t_float (1.0/256, 0X1.00000000000000P+4);
+  t_float (1.0, 0X1.00000000000000P+0);
+  t_float (-1.0, nanf);
+  t_float (11.0, 0X1.34BF6400000000P-2);
+  t_float (0.0,  inff);
+  t_float (inff, 0X0.00000000000000P+0);
+  t_float (nanf, nanf);
+  t_float (-nanf, -nanf);
+  t_float (__FLT_MAX__, 0X1.00000200000000P-64);
+  t_float (__FLT_MIN__, 0X1.00000000000000P+63);
+  t_float (PI, 0X1.20DD7400000000P-1);
+  t_float (PI_4, 0X1.20DD7400000000P+0);
+  t_float (SQRT2, 0X1.AE89FA00000000P-1);
+  t_float (SQRT1_2, 0X1.306FE000000000P+0);
+  t_float (-PI, nanf);
+  t_float (-SQRT2, nanf);
+  t_float (TESTA8_FLT, 0X1.6A09E600000000P-13);
+
+/*   With -ffast-math these return positive INF.  */
+/*   t_double (-0.0, -inf);  */
+/*   t_float (-0.0, -inff);  */
+
+/*   The reason here is that -ffast-math flushes to zero.  */
+/*   t_double  (__DBL_MIN__/256, 0X1.00000000000000P+515);  */
+/*   t_float (__FLT_MIN__/256, 0X1.00000000000000P+67);  */
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c b/gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c
new file mode 100644
index 0000000..b838ed3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c
@@ -0,0 +1,25 @@
+/* Test for the recip_sqrt tuning
+   ensuring the correct instructions are generated.  */
+/* { dg-do compile } */
+/* { dg-options "-O3 --std=c99 --save-temps -fverbose-asm -funsafe-math-optimizations -fno-math-errno -mtune=generic -mcpu=generic -moverride=tune=recip_sqrt" } */
+
+#include "rsqrt_asm_check_common.h"
+
+TESTTYPE (double)
+TESTTYPE (float)
+
+/* { dg-final { scan-assembler-times "frsqrte\\td\[0-9\]+, d\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 3 } } */
+
+/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2d, v\[0-9\]+.2d" 3 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2d, v\[0-9\]+.2d, v\[0-9\]+.2d" 9 } } */
+
+
+/* { dg-final { scan-assembler-times "frsqrte\\ts\[0-9\]+, s\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\ts\[0-9\]+, s\[0-9\]+, s\[0-9\]+" 2 } } */
+
+/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.4s, v\[0-9\]+.4s" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.4s, v\[0-9\]+.4s, v\[0-9\]+.4s" 2 } } */
+
+/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2s, v\[0-9\]+.2s" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2s, v\[0-9\]+.2s, v\[0-9\]+.2s" 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h b/gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h
new file mode 100644
index 0000000..8a851e1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h
@@ -0,0 +1,42 @@
+#define sqrt_float   __builtin_sqrtf
+#define sqrt_double  __builtin_sqrt
+
+#define TESTTYPE(TYPE) \
+typedef struct { \
+  TYPE a; \
+  TYPE b; \
+  TYPE c; \
+  TYPE d; \
+} s4_##TYPE; \
+\
+typedef struct { \
+  TYPE a; \
+  TYPE b; \
+} s2_##TYPE; \
+\
+s4_##TYPE \
+rsqrtv4_##TYPE (s4_##TYPE i) \
+{ \
+  s4_##TYPE o; \
+  o.a = 1.0 / sqrt_##TYPE (i.a); \
+  o.b = 1.0 / sqrt_##TYPE (i.b); \
+  o.c = 1.0 / sqrt_##TYPE (i.c); \
+  o.d = 1.0 / sqrt_##TYPE (i.d); \
+  return o; \
+} \
+\
+s2_##TYPE \
+rsqrtv2_##TYPE (s2_##TYPE i) \
+{ \
+  s2_##TYPE o; \
+  o.a = 1.0 / sqrt_##TYPE (i.a); \
+  o.b = 1.0 / sqrt_##TYPE (i.b); \
+  return o; \
+} \
+\
+TYPE \
+rsqrt_##TYPE (TYPE i) \
+{ \
+  return 1.0 / sqrt_##TYPE (i); \
+} \
+
diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c b/gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c
new file mode 100644
index 0000000..b76cc9c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c
@@ -0,0 +1,12 @@
+/* Test that, without the recip_sqrt tuning,
+   no frsqrte or frsqrts instructions are generated.  */
+/* { dg-do compile } */
+/* { dg-options "-O3 --std=c99 --save-temps -fverbose-asm -funsafe-math-optimizations -fno-math-errno -mtune=generic -mcpu=generic" } */
+
+#include "rsqrt_asm_check_common.h"
+
+TESTTYPE (double)
+TESTTYPE (float)
+
+/* { dg-final { scan-assembler-times "frsqrte" 0 } } */
+/* { dg-final { scan-assembler-times "frsqrts" 0 } } */
-- 
1.9.1

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v10][aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math
  2015-10-19 14:18                 ` [PATCH v10][aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math Benedikt Huber
  2015-10-19 14:29                   ` [PATCH] 2015-10-19 Benedikt Huber <benedikt.huber@theobroma-systems.com> Philipp Tomsich <philipp.tomsich@theobroma-systems.com> Benedikt Huber
@ 2015-10-19 14:31                   ` Bernd Schmidt
  1 sibling, 0 replies; 23+ messages in thread
From: Bernd Schmidt @ 2015-10-19 14:31 UTC (permalink / raw)
  To: Benedikt Huber, gcc-patches
  Cc: philipp.tomsich, Venkataramanan.Kumar, pinskia, e.menezes,
	kyrylo.tkachov, Richard.Earnshaw, marcus.shawcroft,
	james.greenhalgh, oleg.endo

On 01/04/1970 01:02 AM, Benedikt Huber wrote:
> This tenth revision of the patch:
>   * Removes unnecessary enum.

Please fix your clock.


Bernd

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] 2015-10-19 Benedikt Huber <benedikt.huber@theobroma-systems.com> Philipp Tomsich <philipp.tomsich@theobroma-systems.com>
  2015-10-19 14:29                   ` [PATCH] 2015-10-19 Benedikt Huber <benedikt.huber@theobroma-systems.com> Philipp Tomsich <philipp.tomsich@theobroma-systems.com> Benedikt Huber
@ 2015-10-20 13:40                     ` Marcus Shawcroft
  2015-10-22 10:13                       ` Benedikt Huber
  0 siblings, 1 reply; 23+ messages in thread
From: Marcus Shawcroft @ 2015-10-20 13:40 UTC (permalink / raw)
  To: Benedikt Huber
  Cc: gcc-patches, philipp.tomsich, Venkataramanan.Kumar, pinskia,
	e.menezes, Kyrill Tkachov, Richard.Earnshaw, James Greenhalgh,
	oleg.endo

On 4 January 1970 at 00:02, Benedikt Huber
<benedikt.huber@theobroma-systems.com> wrote:
>         * config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf.
>         * config/aarch64/aarch64-protos.h: Declare.
>         * config/aarch64/aarch64-simd.md: Matching expressions for frsqrte and
>         frsqrts.
>         * config/aarch64/aarch64-tuning-flags.def: Added recip_sqrt.
>         * config/aarch64/aarch64.c: New functions. Emit rsqrt estimation code when
>         applicable.
>         * config/aarch64/aarch64.md: Added enum entries.
>         * config/aarch64/aarch64.opt: Added option -mlow-precision-recip-sqrt.
>         * testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h: Common macros for
>         assembly checks.
>         * testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c: Make sure
>         frsqrts and frsqrte are not emitted.
>         * testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c: Make sure frsqrts and
>         frsqrte are emitted.
>         * testsuite/gcc.target/aarch64/rsqrt_1.c: Functional tests for rsqrt.

OK /Marcus

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] 2015-10-19 Benedikt Huber <benedikt.huber@theobroma-systems.com> Philipp Tomsich <philipp.tomsich@theobroma-systems.com>
  2015-10-20 13:40                     ` Marcus Shawcroft
@ 2015-10-22 10:13                       ` Benedikt Huber
  2015-10-23 11:07                         ` Bernd Schmidt
  0 siblings, 1 reply; 23+ messages in thread
From: Benedikt Huber @ 2015-10-22 10:13 UTC (permalink / raw)
  To: Marcus Shawcroft; +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1542 bytes --]

https://gcc.gnu.org/contribute.html states

"If you do not have write access and a patch of yours has been approved, but not committed, please advise the approver of that fact. You may want to point out lack of write access in your initial submission, too."

Should I apply for svn write access? And if so, who would approve that?

> On 20 Oct 2015, at 15:34, Marcus Shawcroft <marcus.shawcroft@gmail.com> wrote:
> 
> On 4 January 1970 at 00:02, Benedikt Huber
> <benedikt.huber@theobroma-systems.com> wrote:
>>        * config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf.
>>        * config/aarch64/aarch64-protos.h: Declare.
>>        * config/aarch64/aarch64-simd.md: Matching expressions for frsqrte and
>>        frsqrts.
>>        * config/aarch64/aarch64-tuning-flags.def: Added recip_sqrt.
>>        * config/aarch64/aarch64.c: New functions. Emit rsqrt estimation code when
>>        applicable.
>>        * config/aarch64/aarch64.md: Added enum entries.
>>        * config/aarch64/aarch64.opt: Added option -mlow-precision-recip-sqrt.
>>        * testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h: Common macros for
>>        assembly checks.
>>        * testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c: Make sure
>>        frsqrts and frsqrte are not emitted.
>>        * testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c: Make sure frsqrts and
>>        frsqrte are emitted.
>>        * testsuite/gcc.target/aarch64/rsqrt_1.c: Functional tests for rsqrt.
> 
> OK /Marcus


[-- Attachment #2: Message signed with OpenPGP using GPGMail --]
[-- Type: application/pgp-signature, Size: 496 bytes --]

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] 2015-10-19 Benedikt Huber <benedikt.huber@theobroma-systems.com> Philipp Tomsich <philipp.tomsich@theobroma-systems.com>
  2015-10-22 10:13                       ` Benedikt Huber
@ 2015-10-23 11:07                         ` Bernd Schmidt
  2015-10-23 11:13                           ` Benedikt Huber
  0 siblings, 1 reply; 23+ messages in thread
From: Bernd Schmidt @ 2015-10-23 11:07 UTC (permalink / raw)
  To: Benedikt Huber, Marcus Shawcroft; +Cc: gcc-patches

On 10/22/2015 12:13 PM, Benedikt Huber wrote:
> https://gcc.gnu.org/contribute.html states
>
> "If you do not have write access and a patch of yours has been approved, but not committed, please advise the approver of that fact. You may want to point out lack of write access in your initial submission, too."
>
> Should I apply for svn write access? And if so, who would approve that?

I guess the first question would be, have you done the copyright assignment?


Bernd

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] 2015-10-19 Benedikt Huber <benedikt.huber@theobroma-systems.com> Philipp Tomsich <philipp.tomsich@theobroma-systems.com>
  2015-10-23 11:07                         ` Bernd Schmidt
@ 2015-10-23 11:13                           ` Benedikt Huber
  0 siblings, 0 replies; 23+ messages in thread
From: Benedikt Huber @ 2015-10-23 11:13 UTC (permalink / raw)
  To: Bernd Schmidt; +Cc: Marcus Shawcroft, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 114 bytes --]


> I guess the first question would be, have you done the copyright assignment?
Yes, I have already done that.

[-- Attachment #2: Message signed with OpenPGP using GPGMail --]
[-- Type: application/pgp-signature, Size: 496 bytes --]

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2015-10-23 11:08 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-10-03 13:15 [PATCH v6][aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math Benedikt Huber
2015-10-03 13:15 ` [PATCH] 2015-10-02 Benedikt Huber <benedikt.huber@theobroma-systems.com> Philipp Tomsich <philipp.tomsich@theobroma-systems.com> Benedikt Huber
2015-10-09 11:45   ` James Greenhalgh
2015-10-15 22:04     ` [PATCH v7][aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math Benedikt Huber
2015-10-15 22:04       ` [PATCH] 2015-10-15 Benedikt Huber <benedikt.huber@theobroma-systems.com> Philipp Tomsich <philipp.tomsich@theobroma-systems.com> Benedikt Huber
2015-10-16  9:31         ` Marcus Shawcroft
2015-10-16 13:59           ` [PATCH v8][aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math Benedikt Huber
2015-10-16 13:58             ` [PATCH] 2015-10-15 Benedikt Huber <benedikt.huber@theobroma-systems.com> Philipp Tomsich <philipp.tomsich@theobroma-systems.com> Benedikt Huber
2015-10-16 14:31               ` Marcus Shawcroft
2015-10-16 12:41         ` Oleg Endo
2015-10-16 14:36           ` Benedikt Huber
2015-10-16 15:03             ` Marcus Shawcroft
2015-10-16 16:36               ` [PATCH v9][aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math Benedikt Huber
2015-10-16 16:37                 ` [PATCH] 2015-10-15 Benedikt Huber <benedikt.huber@theobroma-systems.com> Philipp Tomsich <philipp.tomsich@theobroma-systems.com> Benedikt Huber
2015-10-17  1:14               ` Oleg Endo
2015-10-19 14:18                 ` [PATCH v10][aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math Benedikt Huber
2015-10-19 14:29                   ` [PATCH] 2015-10-19 Benedikt Huber <benedikt.huber@theobroma-systems.com> Philipp Tomsich <philipp.tomsich@theobroma-systems.com> Benedikt Huber
2015-10-20 13:40                     ` Marcus Shawcroft
2015-10-22 10:13                       ` Benedikt Huber
2015-10-23 11:07                         ` Bernd Schmidt
2015-10-23 11:13                           ` Benedikt Huber
2015-10-19 14:31                   ` [PATCH v10][aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math Bernd Schmidt
2015-10-08 18:45 ` [PATCH v6][aarch64] " Evandro Menezes

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).