public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [committed] amdgcn: HardFP divide
@ 2023-04-18 11:24 Andrew Stubbs
  0 siblings, 0 replies; only message in thread
From: Andrew Stubbs @ 2023-04-18 11:24 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 333 bytes --]

This patch switches amdgcn from using softfp for division to using the 
hardware support. There isn't a single division instruction, but there 
is an instruction sequence that gives the necessary accuracy.

This implementation also allows fully vectorized division, so gives good 
performance improvements for many use cases.

Andrew

[-- Attachment #2: 230418-hardfp-divide.patch --]
[-- Type: text/plain, Size: 13592 bytes --]

amdgcn: HardFP divide

Implement FP division using hardware instructions. This replaces both the
softfp library calls and the inaccurate -ffast-math division we had previously.

The GCN architecture does not have a single divide instruction, but it does
have a number of support instructions designed to make multiply-by-reciprocal
sufficiently accurate for non-fast-math usage.

gcc/ChangeLog:

	* config/gcn/gcn-valu.md (SV_SFDF): New iterator.
	(SV_FP): New iterator.
	(scalar_mode, SCALAR_MODE): Add identity mappings for scalar modes.
	(recip<mode>2): Unify the two patterns using SV_FP.
	(div_scale<mode><exec_vcc>): New insn.
	(div_fmas<mode><exec>): New insn.
	(div_fixup<mode><exec>): New insn.
	(div<mode>3): Unify the two expanders and rewrite using hardfp.
	* config/gcn/gcn.cc (gcn_md_reorg): Support "vccwait" attribute.
	* config/gcn/gcn.md (unspec): Add UNSPEC_DIV_SCALE, UNSPEC_DIV_FMAS,
	and UNSPEC_DIV_FIXUP.
	(vccwait): New attribute.

gcc/testsuite/ChangeLog:

	* gcc.target/gcn/fpdiv.c: Remove the -ffast-math requirement.

diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md
index e3d6d65e9a9..4a40a9d8d4c 100644
--- a/gcc/config/gcn/gcn-valu.md
+++ b/gcc/config/gcn/gcn-valu.md
@@ -15,6 +15,7 @@
 ;; <http://www.gnu.org/licenses/>.
 
 ;; {{{ Vector iterators
+; SV iterators include both scalar and vector modes.
 
 ; Vector modes for specific types
 (define_mode_iterator V_QI
@@ -126,6 +127,15 @@ (define_mode_iterator V_INT_noHI
 		       V32SI V32DI
 		       V64SI V64DI])
 
+(define_mode_iterator SV_SFDF
+		      [SF DF
+		       V2SF V2DF
+		       V4SF V4DF
+		       V8SF V8DF
+		       V16SF V16DF
+		       V32SF V32DF
+		       V64SF V64DF])
+
 ; All of above
 (define_mode_iterator V_ALL
 		      [V2QI V2HI V2HF V2SI V2SF V2DI V2DF
@@ -156,9 +166,19 @@ (define_mode_iterator V_FP
 		       V16HF V16SF V16DF
 		       V32HF V32SF V32DF
 		       V64HF V64SF V64DF])
+(define_mode_iterator SV_FP
+		      [HF SF DF
+		       V2HF V2SF V2DF
+		       V4HF V4SF V4DF
+		       V8HF V8SF V8DF
+		       V16HF V16SF V16DF
+		       V32HF V32SF V32DF
+		       V64HF V64SF V64DF])
 
 (define_mode_attr scalar_mode
-  [(V2QI "qi") (V2HI "hi") (V2SI "si")
+  [(QI "qi") (HI "hi") (SI "si")
+   (HF "hf") (SF "sf") (DI "di") (DF "df")
+   (V2QI "qi") (V2HI "hi") (V2SI "si")
    (V2HF "hf") (V2SF "sf") (V2DI "di") (V2DF "df")
    (V4QI "qi") (V4HI "hi") (V4SI "si")
    (V4HF "hf") (V4SF "sf") (V4DI "di") (V4DF "df")
@@ -172,7 +192,9 @@ (define_mode_attr scalar_mode
    (V64HF "hf") (V64SF "sf") (V64DI "di") (V64DF "df")])
 
 (define_mode_attr SCALAR_MODE
-  [(V2QI "QI") (V2HI "HI") (V2SI "SI")
+  [(QI "QI") (HI "HI") (SI "SI")
+   (HF "HF") (SF "SF") (DI "DI") (DF "DF")
+   (V2QI "QI") (V2HI "HI") (V2SI "SI")
    (V2HF "HF") (V2SF "SF") (V2DI "DI") (V2DF "DF")
    (V4QI "QI") (V4HI "HI") (V4SI "SI")
    (V4HF "HF") (V4SF "SF") (V4DI "DI") (V4DF "DF")
@@ -3188,113 +3210,124 @@ (define_insn "fms<mode>4_negop2"
 ;; {{{ FP division
 
 (define_insn "recip<mode>2<exec>"
-  [(set (match_operand:V_FP 0 "register_operand"  "=  v")
-	(unspec:V_FP
-	  [(match_operand:V_FP 1 "gcn_alu_operand" "vSvB")]
+  [(set (match_operand:SV_FP 0 "register_operand"  "=  v")
+	(unspec:SV_FP
+	  [(match_operand:SV_FP 1 "gcn_alu_operand" "vSvB")]
 	  UNSPEC_RCP))]
   ""
   "v_rcp%i0\t%0, %1"
   [(set_attr "type" "vop1")
    (set_attr "length" "8")])
 
-(define_insn "recip<mode>2"
-  [(set (match_operand:FP 0 "register_operand"	 "=  v")
-	(unspec:FP
-	  [(match_operand:FP 1 "gcn_alu_operand" "vSvB")]
-	  UNSPEC_RCP))]
-  ""
-  "v_rcp%i0\t%0, %1"
-  [(set_attr "type" "vop1")
+;; v_div_scale takes a numerator (op2) and denominator (op1) and returns the
+;; one that matches op3 adjusted for best results in reciprocal division.
+;; It also emits a VCC mask that is intended for input to v_div_fmas.
+;; The caller is expected to call this twice, once for each input.  The output
+;; VCC is the same in both cases, so the caller may discard one.
+(define_insn "div_scale<mode><exec_vcc>"
+  [(set (match_operand:SV_SFDF 0 "register_operand"   "=v")
+	(unspec:SV_SFDF
+	  [(match_operand:SV_SFDF 1 "gcn_alu_operand" "v")
+	   (match_operand:SV_SFDF 2 "gcn_alu_operand" "v")
+	   (match_operand:SV_SFDF 3 "gcn_alu_operand" "v")]
+	  UNSPEC_DIV_SCALE))
+   (set (match_operand:DI 4 "register_operand"        "=SvcV")
+	(unspec:DI
+	  [(match_dup 1) (match_dup 2) (match_dup 3)]
+	  UNSPEC_DIV_SCALE))]
+  ""
+  "v_div_scale%i0\t%0, %4, %3, %1, %2"
+  [(set_attr "type" "vop3b")
    (set_attr "length" "8")])
 
-;; Do division via a = b * 1/c
-;; The v_rcp_* instructions are not sufficiently accurate on their own,
-;; so we use 2 v_fma_* instructions to do one round of Newton-Raphson
-;; which the ISA manual says is enough to improve the reciprocal accuracy.
-;;
-;; FIXME: This does not handle denormals, NaNs, division-by-zero etc.
+;; v_div_fmas is "FMA and Scale" that uses the VCC output from v_div_scale
+;; to conditionally scale the output of the whole division operation.
+;; This is necessary to counter the adjustments made by v_div_scale and
+;; replaces the last FMA instruction of the Newton-Raphson algorithm.
+(define_insn "div_fmas<mode><exec>"
+  [(set (match_operand:SV_SFDF 0 "register_operand"       "=v")
+	(unspec:SV_SFDF
+	  [(plus:SV_SFDF
+	     (mult:SV_SFDF
+	       (match_operand:SV_SFDF 1 "gcn_alu_operand" "v")
+	       (match_operand:SV_SFDF 2 "gcn_alu_operand" "v"))
+	     (match_operand:SV_SFDF 3 "gcn_alu_operand"   "v"))
+	   (match_operand:DI 4 "register_operand"         "cV")]
+	  UNSPEC_DIV_FMAS))]
+  ""
+  "v_div_fmas%i0\t%0, %1, %2, %3; %4"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")
+   (set_attr "vccwait" "5")])
+
+;; v_div_fixup takes the inputs and outputs of a division operation already
+;; completed and cleans up the floating-point sign bit, infinity, underflow,
+;; overflow, and NaN status.  It will also emit any FP exceptions.
+;; op1: quotient,  op2: denominator,  op3: numerator
+(define_insn "div_fixup<mode><exec>"
+  [(set (match_operand:SV_FP 0 "register_operand"    "=v")
+	(unspec:SV_FP
+	  [(match_operand:SV_FP 1 "register_operand" "v")
+	   (match_operand:SV_FP 2 "gcn_alu_operand"  "v")
+	   (match_operand:SV_FP 3 "gcn_alu_operand"  "v")]
+	  UNSPEC_DIV_FIXUP))]
+  ""
+  "v_div_fixup%i0\t%0, %1, %2, %3"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
 
 (define_expand "div<mode>3"
-  [(match_operand:V_FP 0 "gcn_valu_dst_operand")
-   (match_operand:V_FP 1 "gcn_valu_src0_operand")
-   (match_operand:V_FP 2 "gcn_valu_src0_operand")]
-  "flag_reciprocal_math"
+  [(match_operand:SV_SFDF 0 "register_operand")
+   (match_operand:SV_SFDF 1 "gcn_alu_operand")
+   (match_operand:SV_SFDF 2 "gcn_alu_operand")]
+  ""
   {
+    rtx numerator = operands[1];
+    rtx denominator = operands[2];
+
+    /* Scale the inputs if they are close to the FP limits.
+       This will be reversed later.  */
+    rtx vcc = gen_reg_rtx (DImode);
+    rtx discardedvcc = gen_reg_rtx (DImode);
+    rtx scaled_numerator = gen_reg_rtx (<MODE>mode);
+    rtx scaled_denominator = gen_reg_rtx (<MODE>mode);
+    emit_insn (gen_div_scale<mode> (scaled_denominator,
+				    denominator, numerator,
+				    denominator, discardedvcc));
+    emit_insn (gen_div_scale<mode> (scaled_numerator,
+				    denominator, numerator,
+				    numerator, vcc));
+
+    /* Find the reciprocal of the denominator, and use Newton-Raphson to
+       improve the accuracy over the basic hardware instruction.  */
     rtx one = gcn_vec_constant (<MODE>mode,
 		  const_double_from_real_value (dconst1, <SCALAR_MODE>mode));
     rtx initrcp = gen_reg_rtx (<MODE>mode);
-    rtx fma = gen_reg_rtx (<MODE>mode);
-    rtx rcp;
-    rtx num = operands[1], denom = operands[2];
-
-    bool is_rcp = (GET_CODE (num) == CONST_VECTOR
-		   && real_identical
-		        (CONST_DOUBLE_REAL_VALUE
-			  (CONST_VECTOR_ELT (num, 0)), &dconstm1));
-
-    if (is_rcp)
-      rcp = operands[0];
-    else
-      rcp = gen_reg_rtx (<MODE>mode);
-
-    emit_insn (gen_recip<mode>2 (initrcp, denom));
-    emit_insn (gen_fma<mode>4_negop2 (fma, initrcp, denom, one));
-    emit_insn (gen_fma<mode>4 (rcp, fma, initrcp, initrcp));
-
-    if (!is_rcp)
-      {
-	rtx div_est = gen_reg_rtx (<MODE>mode);
-	rtx fma2 = gen_reg_rtx (<MODE>mode);
-	rtx fma3 = gen_reg_rtx (<MODE>mode);
-	rtx fma4 = gen_reg_rtx (<MODE>mode);
-	emit_insn (gen_mul<mode>3 (div_est, num, rcp));
-	emit_insn (gen_fma<mode>4_negop2 (fma2, div_est, denom, num));
-	emit_insn (gen_fma<mode>4 (fma3, fma2, rcp, div_est));
-	emit_insn (gen_fma<mode>4_negop2 (fma4, fma3, denom, num));
-	emit_insn (gen_fma<mode>4 (operands[0], fma4, rcp, fma3));
-      }
-
-    DONE;
-  })
-
-(define_expand "div<mode>3"
-  [(match_operand:FP 0 "gcn_valu_dst_operand")
-   (match_operand:FP 1 "gcn_valu_src0_operand")
-   (match_operand:FP 2 "gcn_valu_src0_operand")]
-  "flag_reciprocal_math"
-  {
-    rtx one = const_double_from_real_value (dconst1, <MODE>mode);
-    rtx initrcp = gen_reg_rtx (<MODE>mode);
-    rtx fma = gen_reg_rtx (<MODE>mode);
-    rtx rcp;
-    rtx num = operands[1], denom = operands[2];
-
-    bool is_rcp = (GET_CODE (operands[1]) == CONST_DOUBLE
-		   && real_identical (CONST_DOUBLE_REAL_VALUE (operands[1]),
-				      &dconstm1));
-
-    if (is_rcp)
-      rcp = operands[0];
-    else
-      rcp = gen_reg_rtx (<MODE>mode);
-
-    emit_insn (gen_recip<mode>2 (initrcp, denom));
-    emit_insn (gen_fma<mode>4_negop2 (fma, initrcp, denom, one));
-    emit_insn (gen_fma<mode>4 (rcp, fma, initrcp, initrcp));
-
-    if (!is_rcp)
-      {
-	rtx div_est = gen_reg_rtx (<MODE>mode);
-	rtx fma2 = gen_reg_rtx (<MODE>mode);
-	rtx fma3 = gen_reg_rtx (<MODE>mode);
-	rtx fma4 = gen_reg_rtx (<MODE>mode);
-	emit_insn (gen_mul<mode>3 (div_est, num, rcp));
-	emit_insn (gen_fma<mode>4_negop2 (fma2, div_est, denom, num));
-	emit_insn (gen_fma<mode>4 (fma3, fma2, rcp, div_est));
-	emit_insn (gen_fma<mode>4_negop2 (fma4, fma3, denom, num));
-	emit_insn (gen_fma<mode>4 (operands[0], fma4, rcp, fma3));
-      }
-
+    rtx fma1 = gen_reg_rtx (<MODE>mode);
+    rtx rcp = gen_reg_rtx (<MODE>mode);
+    emit_insn (gen_recip<mode>2 (initrcp, scaled_denominator));
+    emit_insn (gen_fma<mode>4_negop2 (fma1, initrcp, scaled_denominator, one));
+    emit_insn (gen_fma<mode>4 (rcp, fma1, initrcp, initrcp));
+
+    /* Do the division "a/b" via "a*1/b" and use Newton-Raphson to improve
+       the accuracy.  The "div_fmas" instruction reverses any scaling
+       performed by "div_scale", above.  */
+    rtx div_est = gen_reg_rtx (<MODE>mode);
+    rtx fma2 = gen_reg_rtx (<MODE>mode);
+    rtx fma3 = gen_reg_rtx (<MODE>mode);
+    rtx fma4 = gen_reg_rtx (<MODE>mode);
+    rtx fmas = gen_reg_rtx (<MODE>mode);
+    emit_insn (gen_mul<mode>3 (div_est, scaled_numerator, rcp));
+    emit_insn (gen_fma<mode>4_negop2 (fma2, div_est, scaled_denominator,
+				      scaled_numerator));
+    emit_insn (gen_fma<mode>4 (fma3, fma2, rcp, div_est));
+    emit_insn (gen_fma<mode>4_negop2 (fma4, fma3, scaled_denominator,
+				      scaled_numerator));
+    emit_insn (gen_div_fmas<mode> (fmas, fma4, rcp, fma3, vcc));
+
+    /* Finally, use "div_fixup" to get the details right and find errors.  */
+    emit_insn (gen_div_fixup<mode> (operands[0], fmas, denominator,
+				    numerator));
     DONE;
   })
 
diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc
index a7d278cd2f8..5608d85a1a0 100644
--- a/gcc/config/gcn/gcn.cc
+++ b/gcc/config/gcn/gcn.cc
@@ -5840,6 +5840,7 @@ gcn_md_reorg (void)
       attr_type itype = get_attr_type (insn);
       attr_unit iunit = get_attr_unit (insn);
       attr_delayeduse idelayeduse = get_attr_delayeduse (insn);
+      int ivccwait = get_attr_vccwait (insn);
       HARD_REG_SET ireads, iwrites;
       CLEAR_HARD_REG_SET (ireads);
       CLEAR_HARD_REG_SET (iwrites);
@@ -5917,6 +5918,14 @@ gcn_md_reorg (void)
 	      && ((hard_reg_set_intersect_p
 		   (prev_insn->reads, iwrites))))
 	    nops_rqd = 1 - prev_insn->age;
+
+	  /* An instruction that requires VCC must not follow a write to VCC
+	     too closely.  */
+	  if (prev_insn->age < ivccwait
+	      && (hard_reg_set_intersect_p
+		  (prev_insn->writes,
+		   reg_class_contents[(int)VCC_CONDITIONAL_REG])))
+	    nops_rqd = ivccwait - prev_insn->age;
 	}
 
       /* Insert the required number of NOPs.  */
diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
index c90303c54b5..7065acf402b 100644
--- a/gcc/config/gcn/gcn.md
+++ b/gcc/config/gcn/gcn.md
@@ -90,7 +90,8 @@ (define_c_enum "unspec" [
   UNSPEC_RCP
   UNSPEC_FLBIT_INT
   UNSPEC_FLOOR UNSPEC_CEIL UNSPEC_SIN UNSPEC_COS UNSPEC_EXP2 UNSPEC_LOG2
-  UNSPEC_LDEXP UNSPEC_FREXP_EXP UNSPEC_FREXP_MANT])
+  UNSPEC_LDEXP UNSPEC_FREXP_EXP UNSPEC_FREXP_MANT
+  UNSPEC_DIV_SCALE UNSPEC_DIV_FMAS UNSPEC_DIV_FIXUP])
 
 ;; }}}
 ;; {{{ Attributes
@@ -302,6 +303,11 @@ (define_attr "laneselect" "yes,no" (const_string "no"))
 
 (define_attr "delayeduse" "yes,no" (const_string "no"))
 
+; Identify instructions that require "Manually Inserted Wait State" if
+; a previous instruction writes to VCC.  The number gives the number of NOPs.
+
+(define_attr "vccwait" "" (const_int 0))
+
 ;; }}}
 ;; {{{ Iterators useful across the wole machine description
 
diff --git a/gcc/testsuite/gcc.target/gcn/fpdiv.c b/gcc/testsuite/gcc.target/gcn/fpdiv.c
index 7125b6f6ba0..936d39cf98e 100644
--- a/gcc/testsuite/gcc.target/gcn/fpdiv.c
+++ b/gcc/testsuite/gcc.target/gcn/fpdiv.c
@@ -1,5 +1,4 @@
 /* { dg-do run } */
-/* { dg-options "-ffast-math" } */
 
 #include <stdio.h>
 #include <stdlib.h>

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2023-04-18 11:24 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-04-18 11:24 [committed] amdgcn: HardFP divide Andrew Stubbs

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).