From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <ams@sourceware.org>
Received: by sourceware.org (Postfix, from userid 1461)
	id A89433858C53; Wed, 26 Apr 2023 12:47:49 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org A89433858C53
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1682513269;
	bh=OUGvjq5XbMrgKzxnJT6w1KQV17EuCNfkeodVGTrKKMo=;
	h=From:To:Subject:Date:From;
	b=YaW4JCMCMJhDL1wtU9nkieSsK4BHOCOLQFEtJpYQY+smGYKGLKMflaMFiz8RsMlrp
	 FO6L4h7Tgq/H99iIrr3msaXlEYuPuE3N18pKDDrdLEzJa22LypWUudQWdXrI7nZAcA
	 E4CwOGxJVTQI6q4pwGnzat1w28p5g/59DkNtBnlQ=
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Content-Type: text/plain; charset="utf-8"
From: Andrew Stubbs <ams@gcc.gnu.org>
To: gcc-cvs@gcc.gnu.org
Subject: [gcc r13-7253] amdgcn: HardFP divide
X-Act-Checkin: gcc
X-Git-Author: Andrew Stubbs <ams@codesourcery.com>
X-Git-Refname: refs/heads/releases/gcc-13
X-Git-Oldrev: df49e4602882eabe0642699fb71a70f6e120e263
X-Git-Newrev: 3411f75ea1cd3d98f77367fc4b0a54871978645d
Message-Id: <20230426124749.A89433858C53@sourceware.org>
Date: Wed, 26 Apr 2023 12:47:49 +0000 (GMT)
List-Id: <gcc-cvs.sourceware.org>

https://gcc.gnu.org/g:3411f75ea1cd3d98f77367fc4b0a54871978645d

commit r13-7253-g3411f75ea1cd3d98f77367fc4b0a54871978645d
Author: Andrew Stubbs <ams@codesourcery.com>
Date:   Fri Apr 14 17:05:15 2023 +0100

    amdgcn: HardFP divide
    
    Implement FP division using hardware instructions. This replaces both the
    softfp library calls, and the --fast-math inaccurate divsion we had previously.
    
    The GCN architecture does not have a single divide instruction, but it does
    have a number of support instructions designed to make multiply-by-reciprocal
    sufficiently accurate for non-fast-math usage.
    
    gcc/ChangeLog:
    
            * config/gcn/gcn-valu.md (SV_SFDF): New iterator.
            (SV_FP): New iterator.
            (scalar_mode, SCALAR_MODE): Add identity mappings for scalar modes.
            (recip<mode>2): Unify the two patterns using SV_FP.
            (div_scale<mode><exec_vcc>): New insn.
            (div_fmas<mode><exec>): New insn.
            (div_fixup<mode><exec>): New insn.
            (div<mode>3): Unify the two expanders and rewrite using hardfp.
            * config/gcn/gcn.cc (gcn_md_reorg): Support "vccwait" attribute.
            * config/gcn/gcn.md (unspec): Add UNSPEC_DIV_SCALE, UNSPEC_DIV_FMAS,
            and UNSPEC_DIV_FIXUP.
            (vccwait): New attribute.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/gcn/fpdiv.c: Remove the -ffast-math requirement.
    
    (cherry picked from commit cfdc45f73c56ad051a53576a4e88675ced2660d4)

Diff:
---
 gcc/config/gcn/gcn-valu.md           | 223 ++++++++++++++++++++---------------
 gcc/config/gcn/gcn.cc                |   9 ++
 gcc/config/gcn/gcn.md                |   8 +-
 gcc/testsuite/gcc.target/gcn/fpdiv.c |   1 -
 4 files changed, 144 insertions(+), 97 deletions(-)
diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md
index e3d6d65e9a9..4a40a9d8d4c 100644
--- a/gcc/config/gcn/gcn-valu.md
+++ b/gcc/config/gcn/gcn-valu.md
@@ -15,6 +15,7 @@
 ;; <http://www.gnu.org/licenses/>.
 
 ;; {{{ Vector iterators
+; SV iterators include both scalar and vector modes.
 
 ; Vector modes for specific types
 (define_mode_iterator V_QI
@@ -126,6 +127,15 @@
 		       V32SI V32DI
 		       V64SI V64DI])
 
+(define_mode_iterator SV_SFDF
+		      [SF DF
+		       V2SF V2DF
+		       V4SF V4DF
+		       V8SF V8DF
+		       V16SF V16DF
+		       V32SF V32DF
+		       V64SF V64DF])
+
 ; All of above
 (define_mode_iterator V_ALL
 		      [V2QI V2HI V2HF V2SI V2SF V2DI V2DF
@@ -156,9 +166,19 @@
 		       V16HF V16SF V16DF
 		       V32HF V32SF V32DF
 		       V64HF V64SF V64DF])
+(define_mode_iterator SV_FP
+		      [HF SF DF
+		       V2HF V2SF V2DF
+		       V4HF V4SF V4DF
+		       V8HF V8SF V8DF
+		       V16HF V16SF V16DF
+		       V32HF V32SF V32DF
+		       V64HF V64SF V64DF])
 
 (define_mode_attr scalar_mode
-  [(V2QI "qi") (V2HI "hi") (V2SI "si")
+  [(QI "qi") (HI "hi") (SI "si")
+   (HF "hf") (SF "sf") (DI "di") (DF "df")
+   (V2QI "qi") (V2HI "hi") (V2SI "si")
    (V2HF "hf") (V2SF "sf") (V2DI "di") (V2DF "df")
    (V4QI "qi") (V4HI "hi") (V4SI "si")
    (V4HF "hf") (V4SF "sf") (V4DI "di") (V4DF "df")
@@ -172,7 +192,9 @@
    (V64HF "hf") (V64SF "sf") (V64DI "di") (V64DF "df")])
 
 (define_mode_attr SCALAR_MODE
-  [(V2QI "QI") (V2HI "HI") (V2SI "SI")
+  [(QI "QI") (HI "HI") (SI "SI")
+   (HF "HF") (SF "SF") (DI "DI") (DF "DF")
+   (V2QI "QI") (V2HI "HI") (V2SI "SI")
    (V2HF "HF") (V2SF "SF") (V2DI "DI") (V2DF "DF")
    (V4QI "QI") (V4HI "HI") (V4SI "SI")
    (V4HF "HF") (V4SF "SF") (V4DI "DI") (V4DF "DF")
@@ -3188,113 +3210,124 @@
 ;; {{{ FP division
 
 (define_insn "recip<mode>2<exec>"
-  [(set (match_operand:V_FP 0 "register_operand"  "=  v")
-	(unspec:V_FP
-	  [(match_operand:V_FP 1 "gcn_alu_operand" "vSvB")]
+  [(set (match_operand:SV_FP 0 "register_operand"  "=  v")
+	(unspec:SV_FP
+	  [(match_operand:SV_FP 1 "gcn_alu_operand" "vSvB")]
 	  UNSPEC_RCP))]
   ""
   "v_rcp%i0\t%0, %1"
   [(set_attr "type" "vop1")
    (set_attr "length" "8")])
 
-(define_insn "recip<mode>2"
-  [(set (match_operand:FP 0 "register_operand"	 "=  v")
-	(unspec:FP
-	  [(match_operand:FP 1 "gcn_alu_operand" "vSvB")]
-	  UNSPEC_RCP))]
-  ""
-  "v_rcp%i0\t%0, %1"
-  [(set_attr "type" "vop1")
+;; v_div_scale takes a numerator (op2) and denominator (op1) and returns the
+;; one that matches op3 adjusted for best results in reciprocal division.
+;; It also emits a VCC mask that is intended for input to v_div_fmas.
+;; The caller is expected to call this twice, once for each input.  The output
+;; VCC is the same in both cases, so the caller may discard one.
+(define_insn "div_scale<mode><exec_vcc>"
+  [(set (match_operand:SV_SFDF 0 "register_operand"   "=v")
+	(unspec:SV_SFDF
+	  [(match_operand:SV_SFDF 1 "gcn_alu_operand" "v")
+	   (match_operand:SV_SFDF 2 "gcn_alu_operand" "v")
+	   (match_operand:SV_SFDF 3 "gcn_alu_operand" "v")]
+	  UNSPEC_DIV_SCALE))
+   (set (match_operand:DI 4 "register_operand"        "=SvcV")
+	(unspec:DI
+	  [(match_dup 1) (match_dup 2) (match_dup 3)]
+	  UNSPEC_DIV_SCALE))]
+  ""
+  "v_div_scale%i0\t%0, %4, %3, %1, %2"
+  [(set_attr "type" "vop3b")
    (set_attr "length" "8")])
 
-;; Do division via a = b * 1/c
-;; The v_rcp_* instructions are not sufficiently accurate on their own,
-;; so we use 2 v_fma_* instructions to do one round of Newton-Raphson
-;; which the ISA manual says is enough to improve the reciprocal accuracy.
-;;
-;; FIXME: This does not handle denormals, NaNs, division-by-zero etc.
+;; v_div_fmas is "FMA and Scale" that uses the VCC output from v_div_scale
+;; to conditionally scale the output of the whole division operation.
+;; This is necessary to counter the adjustments made by v_div_scale and
+;; replaces the last FMA instruction of the Newton Raphson algorithm.
+(define_insn "div_fmas<mode><exec>"
+  [(set (match_operand:SV_SFDF 0 "register_operand"       "=v")
+	(unspec:SV_SFDF
+	  [(plus:SV_SFDF
+	     (mult:SV_SFDF
+	       (match_operand:SV_SFDF 1 "gcn_alu_operand" "v")
+	       (match_operand:SV_SFDF 2 "gcn_alu_operand" "v"))
+	     (match_operand:SV_SFDF 3 "gcn_alu_operand"   "v"))
+	   (match_operand:DI 4 "register_operand"         "cV")]
+	  UNSPEC_DIV_FMAS))]
+  ""
+  "v_div_fmas%i0\t%0, %1, %2, %3; %4"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")
+   (set_attr "vccwait" "5")])
+
+;; v_div_fixup takes the inputs and outputs of a division operation already
+;; completed and cleans up the floating-point sign bit, infinity, underflow,
+;; overflow, and NaN status.  It will also emit any FP exceptions.
+;; op1: quotient,  op2: denominator,  op3: numerator
+(define_insn "div_fixup<mode><exec>"
+  [(set (match_operand:SV_FP 0 "register_operand"    "=v")
+	(unspec:SV_FP
+	  [(match_operand:SV_FP 1 "register_operand" "v")
+	   (match_operand:SV_FP 2 "gcn_alu_operand"  "v")
+	   (match_operand:SV_FP 3 "gcn_alu_operand"  "v")]
+	  UNSPEC_DIV_FIXUP))]
+  ""
+  "v_div_fixup%i0\t%0, %1, %2, %3"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
 
 (define_expand "div<mode>3"
-  [(match_operand:V_FP 0 "gcn_valu_dst_operand")
-   (match_operand:V_FP 1 "gcn_valu_src0_operand")
-   (match_operand:V_FP 2 "gcn_valu_src0_operand")]
-  "flag_reciprocal_math"
+  [(match_operand:SV_SFDF 0 "register_operand")
+   (match_operand:SV_SFDF 1 "gcn_alu_operand")
+   (match_operand:SV_SFDF 2 "gcn_alu_operand")]
+  ""
   {
+    rtx numerator = operands[1];
+    rtx denominator = operands[2];
+
+    /* Scale the inputs if they are close to the FP limits.
+       This will be reversed later.  */
+    rtx vcc = gen_reg_rtx (DImode);
+    rtx discardedvcc = gen_reg_rtx (DImode);
+    rtx scaled_numerator = gen_reg_rtx (<MODE>mode);
+    rtx scaled_denominator = gen_reg_rtx (<MODE>mode);
+    emit_insn (gen_div_scale<mode> (scaled_denominator,
+				    denominator, numerator,
+				    denominator, discardedvcc));
+    emit_insn (gen_div_scale<mode> (scaled_numerator,
+				    denominator, numerator,
+				    numerator, vcc));
+
+    /* Find the reciprocal of the denominator, and use Newton-Raphson to
+       improve the accuracy over the basic hardware instruction.  */
     rtx one = gcn_vec_constant (<MODE>mode,
 		  const_double_from_real_value (dconst1, <SCALAR_MODE>mode));
     rtx initrcp = gen_reg_rtx (<MODE>mode);
-    rtx fma = gen_reg_rtx (<MODE>mode);
-    rtx rcp;
-    rtx num = operands[1], denom = operands[2];
-
-    bool is_rcp = (GET_CODE (num) == CONST_VECTOR
-		   && real_identical
-		        (CONST_DOUBLE_REAL_VALUE
-			  (CONST_VECTOR_ELT (num, 0)), &dconstm1));
-
-    if (is_rcp)
-      rcp = operands[0];
-    else
-      rcp = gen_reg_rtx (<MODE>mode);
-
-    emit_insn (gen_recip<mode>2 (initrcp, denom));
-    emit_insn (gen_fma<mode>4_negop2 (fma, initrcp, denom, one));
-    emit_insn (gen_fma<mode>4 (rcp, fma, initrcp, initrcp));
-
-    if (!is_rcp)
-      {
-	rtx div_est = gen_reg_rtx (<MODE>mode);
-	rtx fma2 = gen_reg_rtx (<MODE>mode);
-	rtx fma3 = gen_reg_rtx (<MODE>mode);
-	rtx fma4 = gen_reg_rtx (<MODE>mode);
-	emit_insn (gen_mul<mode>3 (div_est, num, rcp));
-	emit_insn (gen_fma<mode>4_negop2 (fma2, div_est, denom, num));
-	emit_insn (gen_fma<mode>4 (fma3, fma2, rcp, div_est));
-	emit_insn (gen_fma<mode>4_negop2 (fma4, fma3, denom, num));
-	emit_insn (gen_fma<mode>4 (operands[0], fma4, rcp, fma3));
-      }
-
-    DONE;
-  })
-
-(define_expand "div<mode>3"
-  [(match_operand:FP 0 "gcn_valu_dst_operand")
-   (match_operand:FP 1 "gcn_valu_src0_operand")
-   (match_operand:FP 2 "gcn_valu_src0_operand")]
-  "flag_reciprocal_math"
-  {
-    rtx one = const_double_from_real_value (dconst1, <MODE>mode);
-    rtx initrcp = gen_reg_rtx (<MODE>mode);
-    rtx fma = gen_reg_rtx (<MODE>mode);
-    rtx rcp;
-    rtx num = operands[1], denom = operands[2];
-
-    bool is_rcp = (GET_CODE (operands[1]) == CONST_DOUBLE
-		   && real_identical (CONST_DOUBLE_REAL_VALUE (operands[1]),
-				      &dconstm1));
-
-    if (is_rcp)
-      rcp = operands[0];
-    else
-      rcp = gen_reg_rtx (<MODE>mode);
-
-    emit_insn (gen_recip<mode>2 (initrcp, denom));
-    emit_insn (gen_fma<mode>4_negop2 (fma, initrcp, denom, one));
-    emit_insn (gen_fma<mode>4 (rcp, fma, initrcp, initrcp));
-
-    if (!is_rcp)
-      {
-	rtx div_est = gen_reg_rtx (<MODE>mode);
-	rtx fma2 = gen_reg_rtx (<MODE>mode);
-	rtx fma3 = gen_reg_rtx (<MODE>mode);
-	rtx fma4 = gen_reg_rtx (<MODE>mode);
-	emit_insn (gen_mul<mode>3 (div_est, num, rcp));
-	emit_insn (gen_fma<mode>4_negop2 (fma2, div_est, denom, num));
-	emit_insn (gen_fma<mode>4 (fma3, fma2, rcp, div_est));
-	emit_insn (gen_fma<mode>4_negop2 (fma4, fma3, denom, num));
-	emit_insn (gen_fma<mode>4 (operands[0], fma4, rcp, fma3));
-      }
-
+    rtx fma1 = gen_reg_rtx (<MODE>mode);
+    rtx rcp = gen_reg_rtx (<MODE>mode);
+    emit_insn (gen_recip<mode>2 (initrcp, scaled_denominator));
+    emit_insn (gen_fma<mode>4_negop2 (fma1, initrcp, scaled_denominator, one));
+    emit_insn (gen_fma<mode>4 (rcp, fma1, initrcp, initrcp));
+
+    /* Do the division "a/b" via "a*1/b" and use Newton-Raphson to improve
+       the accuracy.  The "div_fmas" instruction reverses any scaling
+       performed by "div_scale", above.  */
+    rtx div_est = gen_reg_rtx (<MODE>mode);
+    rtx fma2 = gen_reg_rtx (<MODE>mode);
+    rtx fma3 = gen_reg_rtx (<MODE>mode);
+    rtx fma4 = gen_reg_rtx (<MODE>mode);
+    rtx fmas = gen_reg_rtx (<MODE>mode);
+    emit_insn (gen_mul<mode>3 (div_est, scaled_numerator, rcp));
+    emit_insn (gen_fma<mode>4_negop2 (fma2, div_est, scaled_denominator,
+				      scaled_numerator));
+    emit_insn (gen_fma<mode>4 (fma3, fma2, rcp, div_est));
+    emit_insn (gen_fma<mode>4_negop2 (fma4, fma3, scaled_denominator,
+				      scaled_numerator));
+    emit_insn (gen_div_fmas<mode> (fmas, fma4, rcp, fma3, vcc));
+
+    /* Finally, use "div_fixup" to get the details right and find errors.  */
+    emit_insn (gen_div_fixup<mode> (operands[0], fmas, denominator,
+				    numerator));
     DONE;
   })
 
diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc
index a7d278cd2f8..5608d85a1a0 100644
--- a/gcc/config/gcn/gcn.cc
+++ b/gcc/config/gcn/gcn.cc
@@ -5840,6 +5840,7 @@ gcn_md_reorg (void)
       attr_type itype = get_attr_type (insn);
       attr_unit iunit = get_attr_unit (insn);
       attr_delayeduse idelayeduse = get_attr_delayeduse (insn);
+      int ivccwait = get_attr_vccwait (insn);
       HARD_REG_SET ireads, iwrites;
       CLEAR_HARD_REG_SET (ireads);
       CLEAR_HARD_REG_SET (iwrites);
@@ -5917,6 +5918,14 @@ gcn_md_reorg (void)
 	      && ((hard_reg_set_intersect_p
 		   (prev_insn->reads, iwrites))))
 	    nops_rqd = 1 - prev_insn->age;
+
+	  /* Instruction that requires VCC is not written too close before
+	     using it.  */
+	  if (prev_insn->age < ivccwait
+	      && (hard_reg_set_intersect_p
+		  (prev_insn->writes,
+		   reg_class_contents[(int)VCC_CONDITIONAL_REG])))
+	    nops_rqd = ivccwait - prev_insn->age;
 	}
 
       /* Insert the required number of NOPs.  */
diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
index c90303c54b5..7065acf402b 100644
--- a/gcc/config/gcn/gcn.md
+++ b/gcc/config/gcn/gcn.md
@@ -90,7 +90,8 @@
   UNSPEC_RCP
   UNSPEC_FLBIT_INT
   UNSPEC_FLOOR UNSPEC_CEIL UNSPEC_SIN UNSPEC_COS UNSPEC_EXP2 UNSPEC_LOG2
-  UNSPEC_LDEXP UNSPEC_FREXP_EXP UNSPEC_FREXP_MANT])
+  UNSPEC_LDEXP UNSPEC_FREXP_EXP UNSPEC_FREXP_MANT
+  UNSPEC_DIV_SCALE UNSPEC_DIV_FMAS UNSPEC_DIV_FIXUP])
 
 ;; }}}
 ;; {{{ Attributes
@@ -302,6 +303,11 @@
 
 (define_attr "delayeduse" "yes,no" (const_string "no"))
 
+; Identify instructions that require "Manually Inserted Wait State" if
+; a previous instruction writes to VCC.  The number gives the number of NOPs.
+
+(define_attr "vccwait" "" (const_int 0))
+
 ;; }}}
 ;; {{{ Iterators useful across the wole machine description
 
diff --git a/gcc/testsuite/gcc.target/gcn/fpdiv.c b/gcc/testsuite/gcc.target/gcn/fpdiv.c
index 7125b6f6ba0..936d39cf98e 100644
--- a/gcc/testsuite/gcc.target/gcn/fpdiv.c
+++ b/gcc/testsuite/gcc.target/gcn/fpdiv.c
@@ -1,5 +1,4 @@
 /* { dg-do run } */
-/* { dg-options "-ffast-math" } */
 
 #include <stdio.h>
 #include <stdlib.h>