* Patch to change IA64 division code
From: Steve Ellcey @ 2007-02-21 21:37 UTC (permalink / raw)
To: gcc-patches, wilson
The current implementation of floating point division on IA64 is
sub-optimal in that it is done with a post-reload split that doesn't
allow for very good instruction scheduling. This patch expands the
division sequence earlier and thus allows for better instruction
scheduling. I did some testing with SPEC2000 fp and got some
improvements with this change. Ignoring art, which had a lot of
variation (both good and bad) when I ran it, I got a less than 1%
slowdown in 3 tests and a less than 1% speedup in 7 tests. 173.applu
sped up by a little over 1%, 200.sixtrack by a little less than 4%,
and 301.apsi by a little more than 4%. I got similar results on HP-UX
in LP64 mode and slightly better results on HP-UX in ILP32 mode.
I did one SPEC2006 fp run on HP-UX in LP64 mode and had 3 tests with
less than 1% slowdown and 6 with less than 1% speedup. bwaves sped up
by 6%, lbm by 4%, and zeusmp, gromacs, wrf, and sphinx3 sped up by 1 to
2%.
This patch only converts the maximum throughput versions of single and
double precision floating point division. If it is approved I will
commit to converting the minimum latency versions to this same setup
as well, but I would like to get this patch approved as an intermediate
step and to make sure the overall approach is acceptable.
Tested on IA64 HP-UX and Linux with no regressions.
OK to check in?
Steve Ellcey
sje@cup.hp.com
2007-02-21 Steve Ellcey <sje@cup.hp.com>
* config/ia64/ia64.h (HARD_REGNO_NREGS): Handle RFmode.
(HARD_REGNO_MODE_OK): Ditto.
(MODES_TIEABLE_P): Ditto.
(HARD_REGNO_CALLER_SAVE_MODE): Ditto.
(CLASS_MAX_NREGS): Ditto.
* config/ia64/ia64.c (ia64_print_operand_address): Add R format.
(rtx_needs_barrier): Add UNSPEC_NOP_CONVERT case.
* config/ia64/ia64.md (UNSPEC_NOP_CONVERT): New.
(divsf3_internal_thr): Removed.
(divdf3_internal_thr): Removed.
* config/ia64/div.md: New file.
Index: config/ia64/ia64.h
===================================================================
--- config/ia64/ia64.h (revision 122189)
+++ config/ia64/ia64.h (working copy)
@@ -1,5 +1,5 @@
/* Definitions of target machine GNU compiler. IA-64 version.
- Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006
+ Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007
Free Software Foundation, Inc.
Contributed by James E. Wilson <wilson@cygnus.com> and
David Mosberger <davidm@hpl.hp.com>.
@@ -642,6 +642,7 @@ while (0)
: PR_REGNO_P (REGNO) && (MODE) == BImode ? 2 \
: PR_REGNO_P (REGNO) && (MODE) == CCImode ? 1 \
: FR_REGNO_P (REGNO) && (MODE) == XFmode ? 1 \
+ : FR_REGNO_P (REGNO) && (MODE) == RFmode ? 1 \
: FR_REGNO_P (REGNO) && (MODE) == XCmode ? 2 \
: (GET_MODE_SIZE (MODE) + UNITS_PER_WORD - 1) / UNITS_PER_WORD)
@@ -657,7 +658,7 @@ while (0)
: PR_REGNO_P (REGNO) ? \
(MODE) == BImode || GET_MODE_CLASS (MODE) == MODE_CC \
: GR_REGNO_P (REGNO) ? \
- (MODE) != CCImode && (MODE) != XFmode && (MODE) != XCmode \
+ (MODE) != CCImode && (MODE) != XFmode && (MODE) != XCmode && (MODE) != RFmode \
: AR_REGNO_P (REGNO) ? (MODE) == DImode \
: BR_REGNO_P (REGNO) ? (MODE) == DImode \
: 0)
@@ -674,15 +675,15 @@ while (0)
we can't tie it with any other modes. */
#define MODES_TIEABLE_P(MODE1, MODE2) \
(GET_MODE_CLASS (MODE1) == GET_MODE_CLASS (MODE2) \
- && ((((MODE1) == XFmode) || ((MODE1) == XCmode)) \
- == (((MODE2) == XFmode) || ((MODE2) == XCmode))) \
+ && ((((MODE1) == XFmode) || ((MODE1) == XCmode) || ((MODE1) == RFmode)) \
+ == (((MODE2) == XFmode) || ((MODE2) == XCmode) || ((MODE2) == RFmode))) \
&& (((MODE1) == BImode) == ((MODE2) == BImode)))
/* Specify the modes required to caller save a given hard regno.
We need to ensure floating pt regs are not saved as DImode. */
#define HARD_REGNO_CALLER_SAVE_MODE(REGNO, NREGS, MODE) \
- ((FR_REGNO_P (REGNO) && (NREGS) == 1) ? XFmode \
+ ((FR_REGNO_P (REGNO) && (NREGS) == 1) ? RFmode \
: choose_hard_reg_mode ((REGNO), (NREGS), false))
\f
/* Handling Leaf Functions */
@@ -896,6 +897,7 @@ enum reg_class
#define CLASS_MAX_NREGS(CLASS, MODE) \
((MODE) == BImode && (CLASS) == PR_REGS ? 2 \
: (((CLASS) == FR_REGS || (CLASS) == FP_REGS) && (MODE) == XFmode) ? 1 \
+ : (((CLASS) == FR_REGS || (CLASS) == FP_REGS) && (MODE) == RFmode) ? 1 \
: (((CLASS) == FR_REGS || (CLASS) == FP_REGS) && (MODE) == XCmode) ? 2 \
: (GET_MODE_SIZE (MODE) + UNITS_PER_WORD - 1) / UNITS_PER_WORD)
Index: config/ia64/ia64.c
===================================================================
--- config/ia64/ia64.c (revision 122189)
+++ config/ia64/ia64.c (working copy)
@@ -1,5 +1,5 @@
/* Definitions of target machine for GNU compiler.
- Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006
+ Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007
Free Software Foundation, Inc.
Contributed by James E. Wilson <wilson@cygnus.com> and
David Mosberger <davidm@hpl.hp.com>.
@@ -4508,6 +4508,7 @@ ia64_print_operand_address (FILE * strea
O Append .acq for volatile load.
P Postincrement of a MEM.
Q Append .rel for volatile store.
+ R Print .s .d or nothing for a single, double or no truncation.
S Shift amount for shladd instruction.
T Print an 8-bit sign extended number (K) as a 32-bit unsigned number
for Intel assembler.
@@ -4648,6 +4649,17 @@ ia64_print_operand (FILE * file, rtx x,
fputs(".rel", file);
return;
+ case 'R':
+ if (x == CONST0_RTX (GET_MODE (x)))
+ fputs(".s", file);
+ else if (x == CONST1_RTX (GET_MODE (x)))
+ fputs(".d", file);
+ else if (x == CONST2_RTX (GET_MODE (x)))
+ ;
+ else
+ output_operand_lossage ("invalid %%R value");
+ return;
+
case 'S':
fprintf (file, "%d", exact_log2 (INTVAL (x)));
return;
@@ -5793,6 +5805,7 @@ rtx_needs_barrier (rtx x, struct reg_fla
case UNSPEC_LDSA:
case UNSPEC_CHKACLR:
case UNSPEC_CHKS:
+ case UNSPEC_NOP_CONVERT:
need_barrier = rtx_needs_barrier (XVECEXP (x, 0, 0), flags, pred);
break;
Index: config/ia64/ia64.md
===================================================================
--- config/ia64/ia64.md (revision 122189)
+++ config/ia64/ia64.md (working copy)
@@ -81,6 +81,7 @@ (define_constants
(UNSPEC_SHRP 29)
(UNSPEC_COPYSIGN 30)
(UNSPEC_VECT_EXTR 31)
+ (UNSPEC_NOP_CONVERT 32)
(UNSPEC_LDA 40)
(UNSPEC_LDS 41)
(UNSPEC_LDSA 42)
@@ -3108,60 +3109,6 @@ (define_insn_and_split "divsf3_internal_
}
[(set_attr "predicable" "no")])
-(define_insn_and_split "divsf3_internal_thr"
- [(set (match_operand:SF 0 "fr_register_operand" "=&f")
- (div:SF (match_operand:SF 1 "fr_register_operand" "f")
- (match_operand:SF 2 "fr_register_operand" "f")))
- (clobber (match_scratch:XF 3 "=&f"))
- (clobber (match_scratch:XF 4 "=f"))
- (clobber (match_scratch:BI 5 "=c"))]
- "TARGET_INLINE_FLOAT_DIV == INL_MAX_THR"
- "#"
- "&& reload_completed"
- [(parallel [(set (match_dup 6) (div:XF (const_int 1) (match_dup 8)))
- (set (match_dup 5) (unspec:BI [(match_dup 7) (match_dup 8)]
- UNSPEC_FR_RECIP_APPROX))
- (use (const_int 0))])
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 3)
- (minus:XF (match_dup 10)
- (mult:XF (match_dup 8) (match_dup 6))))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 3)
- (plus:XF (mult:XF (match_dup 3) (match_dup 3))
- (match_dup 3)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 6)
- (plus:XF (mult:XF (match_dup 3) (match_dup 6))
- (match_dup 6)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 9)
- (float_truncate:SF
- (mult:XF (match_dup 7) (match_dup 6))))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 4)
- (minus:XF (match_dup 7)
- (mult:XF (match_dup 8) (match_dup 3))))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (set (match_dup 0)
- (float_truncate:SF
- (plus:XF (mult:XF (match_dup 4) (match_dup 6))
- (match_dup 3)))))
- ]
-{
- operands[6] = gen_rtx_REG (XFmode, REGNO (operands[0]));
- operands[7] = gen_rtx_REG (XFmode, REGNO (operands[1]));
- operands[8] = gen_rtx_REG (XFmode, REGNO (operands[2]));
- operands[9] = gen_rtx_REG (SFmode, REGNO (operands[3]));
- operands[10] = CONST1_RTX (XFmode);
-}
- [(set_attr "predicable" "no")])
-
;; Inline square root.
(define_insn "*sqrt_approx"
@@ -3614,72 +3561,6 @@ (define_insn_and_split "divdf3_internal_
}
[(set_attr "predicable" "no")])
-(define_insn_and_split "divdf3_internal_thr"
- [(set (match_operand:DF 0 "fr_register_operand" "=&f")
- (div:DF (match_operand:DF 1 "fr_register_operand" "f")
- (match_operand:DF 2 "fr_register_operand" "f")))
- (clobber (match_scratch:XF 3 "=&f"))
- (clobber (match_scratch:DF 4 "=f"))
- (clobber (match_scratch:BI 5 "=c"))]
- "TARGET_INLINE_FLOAT_DIV == INL_MAX_THR"
- "#"
- "&& reload_completed"
- [(parallel [(set (match_dup 6) (div:XF (const_int 1) (match_dup 8)))
- (set (match_dup 5) (unspec:BI [(match_dup 7) (match_dup 8)]
- UNSPEC_FR_RECIP_APPROX))
- (use (const_int 0))])
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 3)
- (minus:XF (match_dup 10)
- (mult:XF (match_dup 8) (match_dup 6))))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 6)
- (plus:XF (mult:XF (match_dup 3) (match_dup 6))
- (match_dup 6)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 3)
- (mult:XF (match_dup 3) (match_dup 3)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 6)
- (plus:XF (mult:XF (match_dup 3) (match_dup 6))
- (match_dup 6)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 3)
- (mult:XF (match_dup 3) (match_dup 3)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 6)
- (plus:XF (mult:XF (match_dup 3) (match_dup 6))
- (match_dup 6)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 9)
- (float_truncate:DF
- (mult:XF (match_dup 7) (match_dup 6))))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 4)
- (minus:DF (match_dup 1)
- (mult:DF (match_dup 2) (match_dup 9))))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (set (match_dup 0)
- (plus:DF (mult:DF (match_dup 4) (match_dup 0))
- (match_dup 9))))
- ]
-{
- operands[6] = gen_rtx_REG (XFmode, REGNO (operands[0]));
- operands[7] = gen_rtx_REG (XFmode, REGNO (operands[1]));
- operands[8] = gen_rtx_REG (XFmode, REGNO (operands[2]));
- operands[9] = gen_rtx_REG (DFmode, REGNO (operands[3]));
- operands[10] = CONST1_RTX (XFmode);
-}
- [(set_attr "predicable" "no")])
-
;; Inline square root.
(define_expand "sqrtdf2"
@@ -6540,3 +6421,5 @@ (define_insn "ip_value"
(include "vect.md")
;; Atomic operations
(include "sync.md")
+;; New division operations
+(include "div.md")
Index: config/ia64/div.md
===================================================================
--- config/ia64/div.md (revision 0)
+++ config/ia64/div.md (revision 0)
@@ -0,0 +1,575 @@
+;; IA-64 machine description for inline division operations.
+;; Copyright (C) 2007
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING. If not, write to
+;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+;; Boston, MA 02110-1301, USA.
+
+
+;; For the internal conditional math routines:
+
+;; _a versions are when we want output to be op 0 if predicate is false.
+;; _b versions are when we don't care about output if predicate is false.
+
+;; operand 0 is always the result
+;; operand 1 is always the predicate
+;; operand 2, 3, and sometimes 4 are the input values.
+;; operand 4 or 5 is the floating point status register to use.
+;; operand 5 or 6 is the rounding to do. (0 = single, 1 = double, 2 = none)
+;;
+;; addrf3_cond_[ab] - F0 = F2 + F3
+;; subrf3_cond_[ab] - F0 = F2 - F3
+;; mulrf3_cond_[ab] - F0 = F2 * F3
+;; nmulrf3_cond_[ab] - F0 = - (F2 * F3)
+;; m1addrf4_cond_[ab] - F0 = (F2 * F3) + F4
+;; m1subrf4_cond_[ab] - F0 = (F2 * F3) - F4
+;; m2addrf4_cond_[ab] - F0 = F2 + (F3 * F4)
+;; m2subrf4_cond_[ab] - F0 = F2 - (F3 * F4)
+
+;; Basic plus/minus/mult operations
+
+(define_insn "*addrf3_cond_a"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (plus:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG"))
+ (match_operand:RF 4 "fr_register_operand" "0")))
+ (use (match_operand:SI 5 "const_int_operand" ""))
+ (use (match_operand:SI 6 "const_int_operand" ""))]
+ ""
+ "(%1) fadd%R6.s%5 %0 = %F2, %F3"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "*addrf3_cond_b"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (plus:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG"))
+ (const_int 0)))
+ (use (match_operand:SI 4 "const_int_operand" ""))
+ (use (match_operand:SI 5 "const_int_operand" ""))]
+ ""
+ "(%1) fadd%R5.s%4 %0 = %F2, %F3"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "*subrf3_cond_a"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (minus:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG"))
+ (match_operand:RF 4 "fr_register_operand" "0")))
+ (use (match_operand:SI 5 "const_int_operand" ""))
+ (use (match_operand:SI 6 "const_int_operand" ""))]
+ ""
+ "(%1) fsub%R6.s%5 %0 = %F2, %F3"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "*subrf3_cond_b"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (minus:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG"))
+ (const_int 0)))
+ (use (match_operand:SI 4 "const_int_operand" ""))
+ (use (match_operand:SI 5 "const_int_operand" ""))]
+ ""
+ "(%1) fsub%R5.s%4 %0 = %F2, %F3"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "*mulrf3_cond_a"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (mult:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG"))
+ (match_operand:RF 4 "fr_register_operand" "0")))
+ (use (match_operand:SI 5 "const_int_operand" ""))
+ (use (match_operand:SI 6 "const_int_operand" ""))]
+ ""
+ "(%1) fmpy%R6.s%5 %0 = %F2, %F3"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "*mulrf3_cond_b"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (mult:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG"))
+ (const_int 0)))
+ (use (match_operand:SI 4 "const_int_operand" ""))
+ (use (match_operand:SI 5 "const_int_operand" ""))]
+ ""
+ "(%1) fmpy%R5.s%4 %0 = %F2, %F3"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+;; neg-mult operations
+
+(define_insn "*nmulrf3_cond_a"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (neg:RF (mult:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG")))
+ (match_operand:RF 4 "fr_register_operand" "0")))
+ (use (match_operand:SI 5 "const_int_operand" ""))
+ (use (match_operand:SI 6 "const_int_operand" ""))]
+ ""
+ "(%1) fnmpy%R6.s%5 %0 = %F2, %F3"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "*nmulrf3_cond_b"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (neg:RF (mult:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG")))
+ (const_int 0)))
+ (use (match_operand:SI 4 "const_int_operand" ""))
+ (use (match_operand:SI 5 "const_int_operand" ""))]
+ ""
+ "(%1) fnmpy%R5.s%4 %0 = %F2, %F3"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+;; add-mult/sub-mult operations (mult as op1)
+
+(define_insn "*m1addrf4_cond_a"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (plus:RF
+ (mult:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG"))
+ (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG"))
+ (match_operand:RF 5 "fr_register_operand" "0")))
+ (use (match_operand:SI 6 "const_int_operand" ""))
+ (use (match_operand:SI 7 "const_int_operand" ""))]
+ ""
+ "(%1) fma%R7.s%6 %0 = %F2, %F3, %F4"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "*m1addrf4_cond_b"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (plus:RF
+ (mult:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG"))
+ (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG"))
+ (const_int 0)))
+ (use (match_operand:SI 5 "const_int_operand" ""))
+ (use (match_operand:SI 6 "const_int_operand" ""))]
+ ""
+ "(%1) fma%R6.s%5 %0 = %F2, %F3, %F4"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "*m1subrf4_cond_a"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (minus:RF
+ (mult:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG"))
+ (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG"))
+ (match_operand:RF 5 "fr_register_operand" "0")))
+ (use (match_operand:SI 6 "const_int_operand" ""))
+ (use (match_operand:SI 7 "const_int_operand" ""))]
+ ""
+ "(%1) fms%R7.s%6 %0 = %F2, %F3, %F4"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "*m1subrf4_cond_b"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (minus:RF
+ (mult:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG"))
+ (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG"))
+ (const_int 0)))
+ (use (match_operand:SI 5 "const_int_operand" ""))
+ (use (match_operand:SI 6 "const_int_operand" ""))]
+ ""
+ "(%1) fms%R6.s%5 %0 = %F2, %F3, %F4"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+;; add-mult/sub-mult operations (mult as op2)
+
+(define_insn "*m2addrf4_cond_a"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (plus:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (mult:RF
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG")))
+ (match_operand:RF 5 "fr_register_operand" "0")))
+ (use (match_operand:SI 6 "const_int_operand" ""))
+ (use (match_operand:SI 7 "const_int_operand" ""))]
+ ""
+ "(%1) fma%R7.s%6 %0 = %F3, %F4, %F2"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "*m2addrf4_cond_b"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (plus:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (mult:RF
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG")))
+ (const_int 0)))
+ (use (match_operand:SI 5 "const_int_operand" ""))
+ (use (match_operand:SI 6 "const_int_operand" ""))]
+ ""
+ "(%1) fma%R6.s%5 %0 = %F3, %F4, %F2"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "*m2subrf4_cond_a"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (minus:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (mult:RF
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG")))
+ (match_operand:RF 5 "fr_register_operand" "0")))
+ (use (match_operand:SI 6 "const_int_operand" ""))
+ (use (match_operand:SI 7 "const_int_operand" ""))]
+ ""
+ "(%1) fnma%R7.s%6 %0 = %F3, %F4, %F2"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "*m2subrf4_cond_b"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (minus:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (mult:RF
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG")))
+ (const_int 0)))
+ (use (match_operand:SI 5 "const_int_operand" ""))
+ (use (match_operand:SI 6 "const_int_operand" ""))]
+ ""
+ "(%1) fnma%R6.s%5 %0 = %F3, %F4, %F2"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+;; Conversions to/from RF and SF/DF/XF
+
+(define_mode_macro SDX_F [SF DF XF])
+
+(define_insn "*mov_trunc<mode>rf"
+ [(set (match_operand:SDX_F 0 "fr_register_operand" "=f")
+ (unspec:SDX_F [(match_operand:RF 1 "fr_register_operand" "f")]
+ UNSPEC_NOP_CONVERT))]
+ ""
+ "#"
+ [(set_attr "itanium_class" "fmisc")
+ (set_attr "predicable" "yes")])
+
+
+(define_insn "*mov_extendrf<mode>"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (unspec:RF [(match_operand:SDX_F 1 "fr_register_operand" "f")]
+ UNSPEC_NOP_CONVERT))]
+ ""
+ "#"
+ [(set_attr "itanium_class" "fmisc")
+ (set_attr "predicable" "yes")])
+
+(define_split
+ [(set (match_operand:SDX_F 0 "fr_register_operand" "")
+ (unspec:SDX_F [(match_operand:RF 1 "fr_register_operand" "")]
+ UNSPEC_NOP_CONVERT))]
+ "reload_completed"
+ [(set (match_dup 0) (match_dup 2))]
+{
+ operands[2] = gen_rtx_REG (<MODE>mode, REGNO (operands[1]));
+})
+
+(define_split
+ [(set (match_operand:RF 0 "fr_register_operand" "")
+ (unspec:RF [(match_operand:SDX_F 1 "fr_register_operand" "")]
+ UNSPEC_NOP_CONVERT))]
+ "reload_completed"
+ [(set (match_dup 0) (match_dup 2))]
+{
+ operands[2] = gen_rtx_REG (RFmode, REGNO (operands[1]));
+})
+
+;; Reciprocal approximation
+
+(define_insn "*recip_approx_rf"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (div:RF (match_operand:RF 2 "fr_register_operand" "f")
+ (match_operand:RF 3 "fr_register_operand" "f")))
+ (set (match_operand:BI 1 "register_operand" "=c")
+ (unspec:BI [(match_dup 2) (match_dup 3)] UNSPEC_FR_RECIP_APPROX))
+ (use (match_operand:SI 4 "const_int_operand" ""))]
+ ""
+ "frcpa.s%4 %0, %1 = %2, %3"
+ [(set_attr "itanium_class" "fmisc")
+ (set_attr "predicable" "no")])
+
+;; Single precision floating point division (high throughput)
+;; The algorithm:
+;; y = 1 / b OP3 = 1 / OP5
+;; e = 1 - (b * y) OP6 = OP15 - (OP5 * OP3)
+;; y1 = y + (y * e) OP7 = OP3 + (OP3 * OP6)
+;; y2 = y + (y1 * e) OP8 = OP3 + (OP7 * OP6)
+;; q = single(a * y2) OP9 = single(OP4 * OP8)
+;; r = a - (q * b) OP10 = OP4 - (OP9 * OP5)
+;; Q = single (q + (r * y2)) OP3 = single (OP9 + (OP10 * OP8))
+
+(define_expand "divsf3_internal_thr"
+ [
+
+;; Empty conversions to put inputs into RFmode
+
+ (set (match_dup 4)
+ (unspec:RF [(match_operand:SF 1 "fr_register_operand" "f")]
+ UNSPEC_NOP_CONVERT))
+ (set (match_dup 5)
+ (unspec:RF [(match_operand:SF 2 "fr_register_operand" "f")]
+ UNSPEC_NOP_CONVERT))
+
+;; y = 1 / b OP3 = 1 / OP5
+ (parallel [(set (match_dup 3) (div:RF (match_dup 4) (match_dup 5)))
+ (set (match_dup 11) (unspec:BI [(match_dup 4) (match_dup 5)] UNSPEC_FR_RECIP_APPROX))
+ (use (match_dup 12))])
+
+;; e = 1 - (b * y) OP6 = OP15 - (OP5 * OP3)
+ (parallel [(set (match_dup 6)
+ (if_then_else:RF (ne:RF (match_dup 11) (const_int 0))
+ (minus:RF (match_dup 15)
+ (mult:RF (match_dup 5) (match_dup 3)))
+ (const_int 0)))
+ (use (match_dup 13)) (use (match_dup 14))])
+;; y1 = y + (y * e) OP7 = OP3 + (OP3 * OP6)
+ (parallel [(set (match_dup 7)
+ (if_then_else:RF (ne:RF (match_dup 11) (const_int 0))
+ (plus:RF (match_dup 3)
+ (mult:RF (match_dup 3) (match_dup 6)))
+ (const_int 0)))
+ (use (match_dup 13)) (use (match_dup 14))])
+;; y2 = y + (y1 * e) OP8 = OP3 + (OP7 * OP6)
+ (parallel [(set (match_dup 8)
+ (if_then_else:RF (ne:RF (match_dup 11) (const_int 0))
+ (plus:RF (match_dup 3)
+ (mult:RF (match_dup 7) (match_dup 6)))
+ (const_int 0)))
+ (use (match_dup 13)) (use (match_dup 14))])
+;; q = single(a * y2) OP9 = single(OP4 * OP8)
+ (parallel [(set (match_dup 9)
+ (if_then_else:RF (ne:RF (match_dup 11) (const_int 0))
+ (mult:RF (match_dup 4) (match_dup 8))
+ (const_int 0)))
+ (use (match_dup 13)) (use (match_dup 12))])
+;; r = a - (q * b) OP10 = OP4 - (OP9 * OP5)
+ (parallel [(set (match_dup 10)
+ (if_then_else:RF (ne:RF (match_dup 11) (const_int 0))
+ (minus:RF (match_dup 4)
+ (mult:RF (match_dup 9) (match_dup 5)))
+ (const_int 0)))
+ (use (match_dup 13)) (use (match_dup 14))])
+;; Q = single (q + (r * y2)) OP3 = single (OP9 + (OP10 * OP8))
+ (parallel [(set (match_dup 3)
+ (if_then_else:RF (ne:RF (match_dup 11) (const_int 0))
+ (plus:RF (match_dup 9)
+ (mult:RF (match_dup 10) (match_dup 8)))
+ (match_dup 3)))
+ (use (match_dup 12)) (use (match_dup 12))])
+
+ (set (match_operand:SF 0 "fr_register_operand" "=f")
+ (unspec:SF [(match_dup 3)] UNSPEC_NOP_CONVERT))
+ ]
+ ""
+{
+ operands[3] = gen_reg_rtx (RFmode);
+ operands[4] = gen_reg_rtx (RFmode);
+ operands[5] = gen_reg_rtx (RFmode);
+ operands[6] = gen_reg_rtx (RFmode);
+ operands[7] = gen_reg_rtx (RFmode);
+ operands[8] = gen_reg_rtx (RFmode);
+ operands[9] = gen_reg_rtx (RFmode);
+ operands[10] = gen_reg_rtx (RFmode);
+ operands[11] = gen_reg_rtx (BImode);
+ operands[12] = CONST0_RTX (SImode);
+ operands[13] = CONST1_RTX (SImode);
+ operands[14] = CONST2_RTX (SImode);
+ operands[15] = CONST1_RTX (RFmode);
+})
+
+
+;; Double precision floating point division (high throughput)
+;; The algorithm:
+;; y = 1 / b OP3 = 1 / OP5
+;; e = 1 - (b * y) OP6 = OP18 - (OP5 * OP3)
+;; y1 = y + (y * e) OP7 = OP3 + (OP3 * OP6)
+;; e1 = e * e OP8 = OP6 * OP6
+;; y2 = y1 + (y1 * e1) OP9 = OP7 + (OP7 * OP8)
+;; e2 = e1 * e1 OP10 = OP8 * OP8
+;; y3 = y2 + (y2 * e2) OP11 = OP9 + (OP9 * OP10)
+;; q = double (a * y3) OP12 = double (OP4 * OP11)
+;; r = a - (b * q) OP13 = OP4 - (OP5 * OP12)
+;; Q = double (q + (r * y3) OP3 = double (OP12 + (OP13 * OP11))
+
+(define_expand "divdf3_internal_thr"
+ [
+
+;; Empty conversions to put inputs into RFmode
+
+ (set (match_dup 4)
+ (unspec:RF [(match_operand:DF 1 "fr_register_operand" "f")]
+ UNSPEC_NOP_CONVERT))
+ (set (match_dup 5)
+ (unspec:RF [(match_operand:DF 2 "fr_register_operand" "f")]
+ UNSPEC_NOP_CONVERT))
+
+;; y = 1 / b OP3 = 1 / OP5
+ (parallel [(set (match_dup 3) (div:RF (match_dup 4) (match_dup 5)))
+ (set (match_dup 14) (unspec:BI [(match_dup 4) (match_dup 5)] UNSPEC_FR_RECIP_APPROX))
+ (use (match_dup 15))])
+
+;; e = 1 - (b * y) OP6 = OP18 - (OP5 * OP3)
+ (parallel [(set (match_dup 6)
+ (if_then_else:RF (ne:RF (match_dup 14) (const_int 0))
+ (minus:RF (match_dup 18)
+ (mult:RF (match_dup 5) (match_dup 3)))
+ (const_int 0)))
+ (use (match_dup 16)) (use (match_dup 17))])
+
+;; y1 = y + (y * e) OP7 = OP3 + (OP3 * OP6)
+ (parallel [(set (match_dup 7)
+ (if_then_else:RF (ne:RF (match_dup 14) (const_int 0))
+ (plus:RF (match_dup 3)
+ (mult:RF (match_dup 3) (match_dup 6)))
+ (const_int 0)))
+ (use (match_dup 16)) (use (match_dup 17))])
+
+;; e1 = e * e OP8 = OP6 * OP6
+ (parallel [(set (match_dup 8)
+ (if_then_else:RF (ne:RF (match_dup 14) (const_int 0))
+ (mult:RF (match_dup 6) (match_dup 6))
+ (const_int 0)))
+ (use (match_dup 16)) (use (match_dup 17))])
+
+;; y2 = y1 + (y1 * e1) OP9 = OP7 + (OP7 * OP8)
+ (parallel [(set (match_dup 9)
+ (if_then_else:RF (ne:RF (match_dup 14) (const_int 0))
+ (plus:RF (match_dup 7)
+ (mult:RF (match_dup 7) (match_dup 8)))
+ (const_int 0)))
+ (use (match_dup 16)) (use (match_dup 17))])
+
+;; e2 = e1 * e1 OP10 = OP8 * OP8
+ (parallel [(set (match_dup 10)
+ (if_then_else:RF (ne:RF (match_dup 14) (const_int 0))
+ (mult:RF (match_dup 8) (match_dup 8))
+ (const_int 0)))
+ (use (match_dup 16)) (use (match_dup 17))])
+
+;; y3 = y2 + (y2 * e2) OP11 = OP9 + (OP9 * OP10)
+ (parallel [(set (match_dup 11)
+ (if_then_else:RF (ne:RF (match_dup 14) (const_int 0))
+ (plus:RF (match_dup 9)
+ (mult:RF (match_dup 9) (match_dup 10)))
+ (const_int 0)))
+ (use (match_dup 16)) (use (match_dup 17))])
+
+;; q = double (a * y3) OP12 = double (OP4 * OP11)
+ (parallel [(set (match_dup 12)
+ (if_then_else:RF (ne:RF (match_dup 14) (const_int 0))
+ (mult:RF (match_dup 4) (match_dup 11))
+ (const_int 0)))
+ (use (match_dup 16)) (use (match_dup 16))])
+
+;; r = a - (b * q) OP13 = OP4 - (OP5 * OP12)
+ (parallel [(set (match_dup 13)
+ (if_then_else:RF (ne:RF (match_dup 14) (const_int 0))
+ (minus:RF (match_dup 4)
+ (mult:RF (match_dup 5) (match_dup 12)))
+ (const_int 0)))
+ (use (match_dup 16)) (use (match_dup 17))])
+
+;; Q = double (q + (r * y3)) OP3 = double (OP12 + (OP13 * OP11))
+ (parallel [(set (match_dup 3)
+ (if_then_else:RF (ne:RF (match_dup 14) (const_int 0))
+ (plus:RF (match_dup 12)
+ (mult:RF (match_dup 13) (match_dup 11)))
+ (match_dup 3)))
+ (use (match_dup 15)) (use (match_dup 16))])
+
+;; Do an 'empty' conversion back to SFmode
+
+ (set (match_operand:DF 0 "fr_register_operand" "=f")
+ (unspec:DF [(match_dup 3)] UNSPEC_NOP_CONVERT))
+ ]
+""
+{
+ operands[3] = gen_reg_rtx (RFmode);
+ operands[4] = gen_reg_rtx (RFmode);
+ operands[5] = gen_reg_rtx (RFmode);
+ operands[6] = gen_reg_rtx (RFmode);
+ operands[7] = gen_reg_rtx (RFmode);
+ operands[8] = gen_reg_rtx (RFmode);
+ operands[9] = gen_reg_rtx (RFmode);
+ operands[10] = gen_reg_rtx (RFmode);
+ operands[11] = gen_reg_rtx (RFmode);
+ operands[12] = gen_reg_rtx (RFmode);
+ operands[13] = gen_reg_rtx (RFmode);
+ operands[14] = gen_reg_rtx (BImode);
+ operands[15] = CONST0_RTX (SImode);
+ operands[16] = CONST1_RTX (SImode);
+ operands[17] = CONST2_RTX (SImode);
+ operands[18] = CONST1_RTX (RFmode);
+})
* Re: Patch to change IA64 division code
From: Jim Wilson @ 2007-03-20 3:33 UTC (permalink / raw)
To: sje; +Cc: gcc-patches, wilson
On Wed, 2007-02-21 at 09:16 -0800, Steve Ellcey wrote:
> The current implementation of floating point division on IA64 is
> sub-optimal in that it is done with a post-reload split that doesn't
> allow for very good instruction scheduling. This patch allows for the
> expansion of the division sequence earlier and thus allows for better
> instruction scheduling.
This seems pretty reasonable.
It isn't clear why you have the _a and _b variants in the div.md file.
Especially since they generate identical code. This looks like
unnecessary duplication.
Using (const_int 0) in a pattern that expects an FP number looks wrong
to me. Better to use CONST0_RTX (RFmode) which will be a const_double I
think.
If we really need both patterns, then it would be simpler to have a
common pattern. You can use fr_reg_or_0_operand to match either an fr
reg or a constant 0. This would require using a proper FP constant for
0.
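For instance (just sketching the idea, not tested), the merged
pattern's else arm could be something like

  (match_operand:RF 4 "fr_reg_or_0_operand" "0,U")

and the expanders would then pass CONST0_RTX (RFmode), rather than
(const_int 0), for the case where we don't care about the result when
the predicate is false.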
It isn't clear why you have UNSPEC_NOP_CONVERT stuff to convert to/from
RFmode. Especially since you just split them after reload into a nop
move. I presume there is a reason for this. A comment to explain this
reason should be added so the code makes more sense.
Similarly, it isn't clear why UNSPEC_NOP_CONVERT requires an rtx
barrier, when it is going to go away later.
The long divide expanders include a lot of explicit rtl. You could
alternatively call gen expander functions. So instead of having

+ (parallel [(set (match_dup 6)
+ (if_then_else:RF (ne:RF (match_dup 14) (const_int 0))
+ (minus:RF (match_dup 18)
+ (mult:RF (match_dup 5) (match_dup 3)))
+ (const_int 0)))
+ (use (match_dup 16)) (use (match_dup 17))])

You can do this

gen_subdf3_cond_b (operands[6], operands[14], operands[18],
operands[5], operands[3], operands[16], operands[17]);

This is smaller, and might be more readable, though it is a matter of
taste and isn't really that much better than what you have.

An advantage of my suggestion is that if you have to change the rtl for
the subdf3_cond_b pattern, then you only have to fix it once. Whereas,
with your code, you have to fix it in 3 places: where it is defined, and
in the two copies in the divide expanders. This gets worse when you add
more divide expanders, as then you will have even more copies, which is
an argument against your method. This is how udivsi3 works, for
instance, if you want to look at an example.

Another advantage of my suggestion is that you can give meaningful names
to the operands. So instead of operands[16] you can have round_double,
which is more readable and might help reduce errors. You can also give
the intermediate operands names like "y" and "e" to match the comments
in your rtl expanders.

Another obscure consideration here is that patterns with a large number
of operands will increase the size of some internal arrays and
structures, which may make gcc run slower. See for instance
MAX_DUP_OPERANDS in insn-config.h, created by genconfig. I have 14 in
my copy, which is due to the divdf_internal_lat pattern. This number
will be even higher with your patch. But this is not a problem if we
call gen* functions to expand the rtl. I can accept either alternative
here, I just wanted you to consider the options.
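To make the naming point concrete, here is a rough sketch of that same
call with named pseudos instead of operand numbers (e, cond, one, b and
y below stand for what are currently operands 6, 14, 18, 5 and 3 of the
expander; this is only an illustration, not tested code):

  rtx e = gen_reg_rtx (RFmode);          /* e = 1 - (b * y)            */
  rtx one = CONST1_RTX (RFmode);         /* RFmode constant 1.0        */
  rtx status1 = CONST1_RTX (SImode);     /* alternate FP status field  */
  rtx round_none = CONST2_RTX (SImode);  /* no result truncation       */

  emit_insn (gen_subdf3_cond_b (e, cond, one, b, y,
                                status1, round_none));

which reads much closer to the algorithm comments than the match_dup
version does.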
It isn't clear why you changed HARD_REGNO_CALLER_SAVE_MODE from XFmode
to RFmode. This is just going to add useless conversions to/from RFmode
that will later be replaced with nop moves. So why bother? I am
concerned that this might hurt performance.
It does appear that there might be a problem if we need to save/restore
one of these RFmode regs across a call. In that case, the only save
insn is a fr_spill and fr_restore, but your code doesn't expand into
that anywhere. Maybe this is why you have the unspec nop moves? But if
so, that means the HARD_REGNO_CALLER_SAVE_MODE change is
unnecessary, because we won't have any RFmode regs live across a call.
At the end of the DFmode divide expander, you have a comment
+;; Do an 'empty' conversion back to SFmode
which should obviously be DFmode instead of SFmode. But curiously there
is no similar comment in the SFmode divide expander.
--
Jim Wilson, GNU Tools Support, http://www.specifix.com
* Re: Patch to change IA64 division code
From: Steve Ellcey @ 2007-03-20 17:12 UTC (permalink / raw)
To: wilson; +Cc: gcc-patches
> This seems pretty reasonable.
>
> It isn't clear why you have the _a and _b variants in the div.md file.
> Especially since they generate identical code. This looks like
> unnecessary duplication.
>
> Using (const_int 0) in a pattern that expects an FP number looks wrong
> to me. Better to use CONST0_RTX (RFmode) which will be a const_double I
> think.
>
> If we really need both patterns, then it would be simpler to have a
> common pattern. You can use fr_reg_or_0_operand to match either an fr
> reg or a constant 0. This would require using a proper FP constant for
> 0.
I think we can get rid of the duplicate patterns by using CONST0_RTX
(RFmode) and fr_reg_or_0_operand. I will work on making this change.
> It isn't clear why you have UNSPEC_NOP_CONVERT stuff to convert to/from
> RFmode. Especially since you just split them after reload into a nop
> move. I presume there is a reason for this. A comment to explain this
> reason should be added so the code makes more sense.
Since all the predicated math instructions expect RFmode inputs and
generate RFmode outputs, there needs to be some way to convert the
original SFmode/DFmode inputs into RFmode (and the final result back
into SFmode or DFmode). Otherwise the predicated math instructions
would need to handle SFmode, DFmode, and RFmode inputs and outputs. I
will add some comments for this.
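Roughly, the idea is that each divide expander brackets the RFmode
arithmetic with these no-op conversions, something like this (a sketch
of the shape only, written here as gen calls for brevity; the patch as
posted emits the equivalent unspec rtl directly):

  rtx a = gen_reg_rtx (RFmode);
  rtx q_res = gen_reg_rtx (RFmode);

  /* 'Convert' the SFmode input into an RFmode pseudo; after reload the
     split turns this into a plain register move, so no code is
     generated for it.  */
  emit_insn (gen_mov_extendrfsf (a, operands[1]));

  /* ... all of the predicated RFmode arithmetic, producing q_res ... */

  /* 'Convert' the RFmode result back to SFmode the same way.  */
  emit_insn (gen_mov_truncsfrf (operands[0], q_res));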
> Similarly, it isn't clear why UNSPEC_NOP_CONVERT requires an rtx
> barrier, when it is going to go away later.
I think this is wrong and I shouldn't have the barrier.
> The long divide expanders include a lot of explicit rtl. You could
> alternatively call gen expander functions. So instead of having
> + (parallel [(set (match_dup 6)
> + (if_then_else:RF (ne:RF (match_dup 14) (const_int 0))
> + (minus:RF (match_dup 18)
> + (mult:RF (match_dup 5) (match_dup 3)))
> + (const_int 0)))
> + (use (match_dup 16)) (use (match_dup 17))])
> You can do this
> gen_subdf3_cond_b (operands[6], operands[14], operands[18],
> operands[5], operands[3], operands[16], operands[17]);
> This is smaller, and might be more readable, though it is a matter of
> taste and isn't really that much better than what you have. An
> advantage of my suggestion is that if you have to change the rtl for the
> subdf3_cond_b pattern, then you only have to fix it once. Whereas, with
> your code, you have to fix it in 3 places, where it is defined, and in
> the two copies in the divide expanders. This gets worse when you add
> more divide expanders, as then you will have even more copies. This is
> an argument against using your method. This is how udivsi3 works for
> instance if you want to look at an example. Another advantage of my
> suggestion is that you can give meaningful names to the operands. So
> instead of operands[16] you can have round_double which is more readable
> and might help reduce errors. You can also give the intermediate
> operands names like "y" and "e" to match the comments in your rtl
> expanders. Another obscure consideration here is that patterns with a
> large number of operands will increase the size of some internal arrays
> and structures, which may make gcc run slower. See for instance
> MAX_DUP_OPERANDS in insn-config.h, created by genconfig. I have 14 in
> my copy, which is due to the divdf_internal_lat pattern. This number
> will be even higher with your patch. But this is not a problem if we
> call gen* functions to expand the rtl. I can accept either alternative
> here, I just wanted you to consider the options.
I have tried writing the expanders both ways and never had a strong
feeling about one vs. the other, but the idea of being able to use more
meaningful variable names does sound like a good idea. I will consider
this some more.
> It isn't clear why you changed HARD_REGNO_CALLER_SAVE_MODE from XFmode
> to RFmode. This is just going to add useless conversions to/from RFmode
> that will later be replaced with nop moves. So why bother? I am
> concerned that this might hurt performance.
>
> It does appear that there might be a problem if we need to save/restore
> one of these RFmode regs across a call. In that case, the only save
> insn is a fr_spill and fr_restore, but your code doesn't expand into
> that anywhere. Maybe this is why you have the unspec nop moves? But if
> so, that that means the HARD_REGNO_CALLER_SAVE_MODE change is
> unnecessary, because we won't have any RFmode regs live across a call.
What I ran into (a while back, so I hope I remember it correctly) was
that, since the division code sequence was now expanded earlier (before
reload), one of the registers being used wound up getting spilled by
reload (I think). If you spill and fill in XFmode you do not get
exactly the same value back (80 bits vs. 82 bits); if you spill and
fill in RFmode you always get the exact original register contents
back. This was affecting the division sequence, where we don't do
intermediate rounding to SFmode or DFmode (or XFmode) during the
operations.
> At the end of the DFmode divide expander, you have a comment
> +;; Do an 'empty' conversion back to SFmode
> which should obviously be DFmode instead of SFmode. But curiously there
> is no similar comment in the SFmode divide expander.
I will work on making these changes and prepare a new patch in a few
weeks.
Steve Ellcey
sje@cup.hp.com
* Re: Patch to change IA64 division code
From: Steve Ellcey @ 2007-03-27 17:33 UTC (permalink / raw)
To: wilson; +Cc: gcc-patches
Jim,
Here is a new version of my division change. I got rid of the _a and _b
variants and changed the division sequences to use gen_* calls. This
makes div.md smaller and easier to read. The code sequence generated is
unchanged.
I am still a bit concerned about HARD_REGNO_CALLER_SAVE_MODE. I changed
it back to XFmode and got no regressions, but it still seems like
RFmode is the 'right' mode to use, since it will save and restore a
register without losing any information, while XFmode will lose two
bits of precision. This should only matter if we save and restore an FP
register in the middle of a division code sequence. A long time ago I
thought I saw that happen, but I cannot reproduce it with the current
compiler.
Retested with no regressions. OK to check in?
2007-02-27 Steve Ellcey <sje@cup.hp.com>
* config/ia64/ia64.h (HARD_REGNO_NREGS): Handle RFmode.
(HARD_REGNO_MODE_OK): Ditto.
(MODES_TIEABLE_P): Ditto.
(HARD_REGNO_CALLER_SAVE_MODE): Ditto.
(CLASS_MAX_NREGS): Ditto.
* config/ia64/ia64.c (ia64_print_operand_address): Add R format.
(rtx_needs_barrier): Add UNSPEC_NOP_CONVERT case.
* config/ia64/ia64.md (UNSPEC_NOP_CONVERT): New.
(divsf3_internal_thr): Removed.
(divdf3_internal_thr): Removed.
* config/ia64/div.md: New file.
Index: config/ia64/ia64.h
===================================================================
--- config/ia64/ia64.h (revision 123090)
+++ config/ia64/ia64.h (working copy)
@@ -642,6 +642,7 @@ while (0)
: PR_REGNO_P (REGNO) && (MODE) == BImode ? 2 \
: PR_REGNO_P (REGNO) && (MODE) == CCImode ? 1 \
: FR_REGNO_P (REGNO) && (MODE) == XFmode ? 1 \
+ : FR_REGNO_P (REGNO) && (MODE) == RFmode ? 1 \
: FR_REGNO_P (REGNO) && (MODE) == XCmode ? 2 \
: (GET_MODE_SIZE (MODE) + UNITS_PER_WORD - 1) / UNITS_PER_WORD)
@@ -657,7 +658,7 @@ while (0)
: PR_REGNO_P (REGNO) ? \
(MODE) == BImode || GET_MODE_CLASS (MODE) == MODE_CC \
: GR_REGNO_P (REGNO) ? \
- (MODE) != CCImode && (MODE) != XFmode && (MODE) != XCmode \
+ (MODE) != CCImode && (MODE) != XFmode && (MODE) != XCmode && (MODE) != RFmode \
: AR_REGNO_P (REGNO) ? (MODE) == DImode \
: BR_REGNO_P (REGNO) ? (MODE) == DImode \
: 0)
@@ -674,8 +675,8 @@ while (0)
we can't tie it with any other modes. */
#define MODES_TIEABLE_P(MODE1, MODE2) \
(GET_MODE_CLASS (MODE1) == GET_MODE_CLASS (MODE2) \
- && ((((MODE1) == XFmode) || ((MODE1) == XCmode)) \
- == (((MODE2) == XFmode) || ((MODE2) == XCmode))) \
+ && ((((MODE1) == XFmode) || ((MODE1) == XCmode) || ((MODE1) == RFmode)) \
+ == (((MODE2) == XFmode) || ((MODE2) == XCmode) || ((MODE2) == RFmode))) \
&& (((MODE1) == BImode) == ((MODE2) == BImode)))
/* Specify the modes required to caller save a given hard regno.
@@ -896,6 +897,7 @@ enum reg_class
#define CLASS_MAX_NREGS(CLASS, MODE) \
((MODE) == BImode && (CLASS) == PR_REGS ? 2 \
: (((CLASS) == FR_REGS || (CLASS) == FP_REGS) && (MODE) == XFmode) ? 1 \
+ : (((CLASS) == FR_REGS || (CLASS) == FP_REGS) && (MODE) == RFmode) ? 1 \
: (((CLASS) == FR_REGS || (CLASS) == FP_REGS) && (MODE) == XCmode) ? 2 \
: (GET_MODE_SIZE (MODE) + UNITS_PER_WORD - 1) / UNITS_PER_WORD)
Index: config/ia64/ia64.c
===================================================================
--- config/ia64/ia64.c (revision 123090)
+++ config/ia64/ia64.c (working copy)
@@ -4503,6 +4503,7 @@ ia64_print_operand_address (FILE * strea
O Append .acq for volatile load.
P Postincrement of a MEM.
Q Append .rel for volatile store.
+ R Print .s .d or nothing for a single, double or no truncation.
S Shift amount for shladd instruction.
T Print an 8-bit sign extended number (K) as a 32-bit unsigned number
for Intel assembler.
@@ -4643,6 +4644,17 @@ ia64_print_operand (FILE * file, rtx x,
fputs(".rel", file);
return;
+ case 'R':
+ if (x == CONST0_RTX (GET_MODE (x)))
+ fputs(".s", file);
+ else if (x == CONST1_RTX (GET_MODE (x)))
+ fputs(".d", file);
+ else if (x == CONST2_RTX (GET_MODE (x)))
+ ;
+ else
+ output_operand_lossage ("invalid %%R value");
+ return;
+
case 'S':
fprintf (file, "%d", exact_log2 (INTVAL (x)));
return;
@@ -5762,6 +5774,7 @@ rtx_needs_barrier (rtx x, struct reg_fla
case UNSPEC_BSP_VALUE:
case UNSPEC_FLUSHRS:
case UNSPEC_BUNDLE_SELECTOR:
+ case UNSPEC_NOP_CONVERT:
break;
case UNSPEC_GR_SPILL:
Index: config/ia64/ia64.md
===================================================================
--- config/ia64/ia64.md (revision 123090)
+++ config/ia64/ia64.md (working copy)
@@ -81,6 +81,7 @@ (define_constants
(UNSPEC_SHRP 29)
(UNSPEC_COPYSIGN 30)
(UNSPEC_VECT_EXTR 31)
+ (UNSPEC_NOP_CONVERT 32)
(UNSPEC_LDA 40)
(UNSPEC_LDS 41)
(UNSPEC_LDSA 42)
@@ -3108,60 +3109,6 @@ (define_insn_and_split "divsf3_internal_
}
[(set_attr "predicable" "no")])
-(define_insn_and_split "divsf3_internal_thr"
- [(set (match_operand:SF 0 "fr_register_operand" "=&f")
- (div:SF (match_operand:SF 1 "fr_register_operand" "f")
- (match_operand:SF 2 "fr_register_operand" "f")))
- (clobber (match_scratch:XF 3 "=&f"))
- (clobber (match_scratch:XF 4 "=f"))
- (clobber (match_scratch:BI 5 "=c"))]
- "TARGET_INLINE_FLOAT_DIV == INL_MAX_THR"
- "#"
- "&& reload_completed"
- [(parallel [(set (match_dup 6) (div:XF (const_int 1) (match_dup 8)))
- (set (match_dup 5) (unspec:BI [(match_dup 7) (match_dup 8)]
- UNSPEC_FR_RECIP_APPROX))
- (use (const_int 0))])
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 3)
- (minus:XF (match_dup 10)
- (mult:XF (match_dup 8) (match_dup 6))))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 3)
- (plus:XF (mult:XF (match_dup 3) (match_dup 3))
- (match_dup 3)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 6)
- (plus:XF (mult:XF (match_dup 3) (match_dup 6))
- (match_dup 6)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 9)
- (float_truncate:SF
- (mult:XF (match_dup 7) (match_dup 6))))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 4)
- (minus:XF (match_dup 7)
- (mult:XF (match_dup 8) (match_dup 3))))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (set (match_dup 0)
- (float_truncate:SF
- (plus:XF (mult:XF (match_dup 4) (match_dup 6))
- (match_dup 3)))))
- ]
-{
- operands[6] = gen_rtx_REG (XFmode, REGNO (operands[0]));
- operands[7] = gen_rtx_REG (XFmode, REGNO (operands[1]));
- operands[8] = gen_rtx_REG (XFmode, REGNO (operands[2]));
- operands[9] = gen_rtx_REG (SFmode, REGNO (operands[3]));
- operands[10] = CONST1_RTX (XFmode);
-}
- [(set_attr "predicable" "no")])
-
;; Inline square root.
(define_insn "*sqrt_approx"
@@ -3614,72 +3561,6 @@ (define_insn_and_split "divdf3_internal_
}
[(set_attr "predicable" "no")])
-(define_insn_and_split "divdf3_internal_thr"
- [(set (match_operand:DF 0 "fr_register_operand" "=&f")
- (div:DF (match_operand:DF 1 "fr_register_operand" "f")
- (match_operand:DF 2 "fr_register_operand" "f")))
- (clobber (match_scratch:XF 3 "=&f"))
- (clobber (match_scratch:DF 4 "=f"))
- (clobber (match_scratch:BI 5 "=c"))]
- "TARGET_INLINE_FLOAT_DIV == INL_MAX_THR"
- "#"
- "&& reload_completed"
- [(parallel [(set (match_dup 6) (div:XF (const_int 1) (match_dup 8)))
- (set (match_dup 5) (unspec:BI [(match_dup 7) (match_dup 8)]
- UNSPEC_FR_RECIP_APPROX))
- (use (const_int 0))])
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 3)
- (minus:XF (match_dup 10)
- (mult:XF (match_dup 8) (match_dup 6))))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 6)
- (plus:XF (mult:XF (match_dup 3) (match_dup 6))
- (match_dup 6)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 3)
- (mult:XF (match_dup 3) (match_dup 3)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 6)
- (plus:XF (mult:XF (match_dup 3) (match_dup 6))
- (match_dup 6)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 3)
- (mult:XF (match_dup 3) (match_dup 3)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 6)
- (plus:XF (mult:XF (match_dup 3) (match_dup 6))
- (match_dup 6)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 9)
- (float_truncate:DF
- (mult:XF (match_dup 7) (match_dup 6))))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 4)
- (minus:DF (match_dup 1)
- (mult:DF (match_dup 2) (match_dup 9))))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (set (match_dup 0)
- (plus:DF (mult:DF (match_dup 4) (match_dup 0))
- (match_dup 9))))
- ]
-{
- operands[6] = gen_rtx_REG (XFmode, REGNO (operands[0]));
- operands[7] = gen_rtx_REG (XFmode, REGNO (operands[1]));
- operands[8] = gen_rtx_REG (XFmode, REGNO (operands[2]));
- operands[9] = gen_rtx_REG (DFmode, REGNO (operands[3]));
- operands[10] = CONST1_RTX (XFmode);
-}
- [(set_attr "predicable" "no")])
-
;; Inline square root.
(define_expand "sqrtdf2"
@@ -6540,3 +6421,5 @@ (define_insn "ip_value"
(include "vect.md")
;; Atomic operations
(include "sync.md")
+;; New division operations
+(include "div.md")
Index: config/ia64/div.md
===================================================================
--- config/ia64/div.md (revision 0)
+++ config/ia64/div.md (revision 0)
@@ -0,0 +1,317 @@
+
+;; For the internal conditional math routines:
+
+;; operand 0 is always the result
+;; operand 1 is always the predicate
+;; operand 2, 3, and sometimes 4 are the input values.
+;; operand 4 or 5 is the floating point status register to use.
+;; operand 5 or 6 is the rounding to do. (0 = single, 1 = double, 2 = none)
+;;
+;; addrf3_cond - F0 = F2 + F3
+;; subrf3_cond - F0 = F2 - F3
+;; mulrf3_cond - F0 = F2 * F3
+;; nmulrf3_cond - F0 = - (F2 * F3)
+;; m1addrf4_cond - F0 = (F2 * F3) + F4
+;; m1subrf4_cond - F0 = (F2 * F3) - F4
+;; m2addrf4_cond - F0 = F2 + (F3 * F4)
+;; m2subrf4_cond - F0 = F2 - (F3 * F4)
+
+;; Basic plus/minus/mult operations
+
+(define_insn "addrf3_cond"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f,f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c,c")
+ (const_int 0))
+ (plus:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG,fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG,fG"))
+ (match_operand:RF 4 "fr_reg_or_0_operand" "0,U")))
+ (use (match_operand:SI 5 "const_int_operand" ""))
+ (use (match_operand:SI 6 "const_int_operand" ""))]
+ ""
+ "(%1) fadd%R6.s%5 %0 = %F2, %F3"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "subrf3_cond"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f,f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c,c")
+ (const_int 0))
+ (minus:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG,fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG,fG"))
+ (match_operand:RF 4 "fr_reg_or_0_operand" "0,U")))
+ (use (match_operand:SI 5 "const_int_operand" ""))
+ (use (match_operand:SI 6 "const_int_operand" ""))]
+ ""
+ "(%1) fsub%R6.s%5 %0 = %F2, %F3"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "mulrf3_cond"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f,f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c,c")
+ (const_int 0))
+ (mult:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG,fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG,fG"))
+ (match_operand:RF 4 "fr_reg_or_0_operand" "0,U")))
+ (use (match_operand:SI 5 "const_int_operand" ""))
+ (use (match_operand:SI 6 "const_int_operand" ""))]
+ ""
+ "(%1) fmpy%R6.s%5 %0 = %F2, %F3"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+;; neg-mult operation
+
+(define_insn "nmulrf3_cond"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f,f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c,c")
+ (const_int 0))
+ (neg:RF (mult:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG,fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG,fG")))
+ (match_operand:RF 4 "fr_reg_or_0_operand" "0,U")))
+ (use (match_operand:SI 5 "const_int_operand" ""))
+ (use (match_operand:SI 6 "const_int_operand" ""))]
+ ""
+ "(%1) fnmpy%R6.s%5 %0 = %F2, %F3"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+;; add-mult/sub-mult operations (mult as op1)
+
+(define_insn "m1addrf4_cond"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f,f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c,c")
+ (const_int 0))
+ (plus:RF
+ (mult:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG,fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG,fG"))
+ (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG,fG"))
+ (match_operand:RF 5 "fr_reg_or_0_operand" "0,U")))
+ (use (match_operand:SI 6 "const_int_operand" ""))
+ (use (match_operand:SI 7 "const_int_operand" ""))]
+ ""
+ "(%1) fma%R7.s%6 %0 = %F2, %F3, %F4"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "m1subrf4_cond"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f,f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c,c")
+ (const_int 0))
+ (minus:RF
+ (mult:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG,fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG,fG"))
+ (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG,fG"))
+ (match_operand:RF 5 "fr_reg_or_0_operand" "0,U")))
+ (use (match_operand:SI 6 "const_int_operand" ""))
+ (use (match_operand:SI 7 "const_int_operand" ""))]
+ ""
+ "(%1) fms%R7.s%6 %0 = %F2, %F3, %F4"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+;; add-mult/sub-mult operations (mult as op2)
+
+(define_insn "m2addrf4_cond"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f,f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c,c")
+ (const_int 0))
+ (plus:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG,fG")
+ (mult:RF
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG,fG")
+ (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG,fG")))
+ (match_operand:RF 5 "fr_reg_or_0_operand" "0,U")))
+ (use (match_operand:SI 6 "const_int_operand" ""))
+ (use (match_operand:SI 7 "const_int_operand" ""))]
+ ""
+ "(%1) fma%R7.s%6 %0 = %F3, %F4, %F2"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "m2subrf4_cond"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f,f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c,c")
+ (const_int 0))
+ (minus:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG,fG")
+ (mult:RF
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG,fG")
+ (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG,fG")))
+ (match_operand:RF 5 "fr_reg_or_0_operand" "0,U")))
+ (use (match_operand:SI 6 "const_int_operand" ""))
+ (use (match_operand:SI 7 "const_int_operand" ""))]
+ ""
+ "(%1) fnma%R7.s%6 %0 = %F3, %F4, %F2"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+;; Conversions to/from RF and SF/DF/XF
+;; These conversions should not generate any code but make it possible
+;; for all the instructions used to implement floating point division
+;; to be written for RFmode only and to not have to handle multiple
+;; modes or to have to handle a register in more than one mode.
+
+(define_mode_macro SDX_F [SF DF XF])
+
+(define_insn "mov_extendrf<mode>"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (unspec:RF [(match_operand:SDX_F 1 "fr_register_operand" "f")]
+ UNSPEC_NOP_CONVERT))]
+ ""
+ "#"
+ [(set_attr "itanium_class" "fmisc")
+ (set_attr "predicable" "yes")])
+
+(define_split
+ [(set (match_operand:RF 0 "fr_register_operand" "")
+ (unspec:RF [(match_operand:SDX_F 1 "fr_register_operand" "")]
+ UNSPEC_NOP_CONVERT))]
+ "reload_completed"
+ [(set (match_dup 0) (match_dup 2))]
+{
+ operands[2] = gen_rtx_REG (RFmode, REGNO (operands[1]));
+})
+
+
+(define_insn "mov_trunc<mode>rf"
+ [(set (match_operand:SDX_F 0 "fr_register_operand" "=f")
+ (unspec:SDX_F [(match_operand:RF 1 "fr_register_operand" "f")]
+ UNSPEC_NOP_CONVERT))]
+ ""
+ "#"
+ [(set_attr "itanium_class" "fmisc")
+ (set_attr "predicable" "yes")])
+
+(define_split
+ [(set (match_operand:SDX_F 0 "fr_register_operand" "")
+ (unspec:SDX_F [(match_operand:RF 1 "fr_register_operand" "")]
+ UNSPEC_NOP_CONVERT))]
+ "reload_completed"
+ [(set (match_dup 0) (match_dup 2))]
+{
+ operands[2] = gen_rtx_REG (<MODE>mode, REGNO (operands[1]));
+})
+
+;; Reciprocal approximation
+
+(define_insn "recip_approx_rf"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (div:RF (match_operand:RF 1 "fr_register_operand" "f")
+ (match_operand:RF 2 "fr_register_operand" "f")))
+ (set (match_operand:BI 3 "register_operand" "=c")
+ (unspec:BI [(match_dup 1) (match_dup 2)] UNSPEC_FR_RECIP_APPROX))
+ (use (match_operand:SI 4 "const_int_operand" ""))]
+ ""
+ "frcpa.s%4 %0, %3 = %1, %2"
+ [(set_attr "itanium_class" "fmisc")
+ (set_attr "predicable" "no")])
+
+;; Single precision floating point division (maximum throughput algorithm).
+
+(define_expand "divsf3_internal_thr"
+ [(set (match_operand:SF 0 "fr_register_operand" "")
+ (div:SF (match_operand:SF 1 "fr_register_operand" "")
+ (match_operand:SF 2 "fr_register_operand" "")))]
+ "TARGET_INLINE_FLOAT_DIV"
+{
+ rtx y = gen_reg_rtx (RFmode);
+ rtx a = gen_reg_rtx (RFmode);
+ rtx b = gen_reg_rtx (RFmode);
+ rtx e = gen_reg_rtx (RFmode);
+ rtx y1 = gen_reg_rtx (RFmode);
+ rtx y2 = gen_reg_rtx (RFmode);
+ rtx q = gen_reg_rtx (RFmode);
+ rtx r = gen_reg_rtx (RFmode);
+ rtx q_res = gen_reg_rtx (RFmode);
+ rtx cond = gen_reg_rtx (BImode);
+ rtx zero = CONST0_RTX (RFmode);
+ rtx one = CONST1_RTX (RFmode);
+ rtx status0 = CONST0_RTX (SImode);
+ rtx status1 = CONST1_RTX (SImode);
+ rtx trunc_sgl = CONST0_RTX (SImode);
+ rtx trunc_off = CONST2_RTX (SImode);
+
+ /* Empty conversions to put inputs into RFmode. */
+ emit_insn (gen_mov_extendrfsf (a, operands[1]));
+ emit_insn (gen_mov_extendrfsf (b, operands[2]));
+ /* y = 1 / b */
+ emit_insn (gen_recip_approx_rf (y, a, b, cond, status0));
+ /* e = 1 - (b * y) */
+ emit_insn (gen_m2subrf4_cond (e, cond, one, b, y, zero, status1, trunc_off));
+ /* y1 = y + (y * e) */
+ emit_insn (gen_m2addrf4_cond (y1, cond, y, y, e, zero, status1, trunc_off));
+ /* y2 = y + (y1 * e) */
+ emit_insn (gen_m2addrf4_cond (y2, cond, y, y1, e, zero, status1, trunc_off));
+ /* q = single(a * y2) */
+ emit_insn (gen_mulrf3_cond (q, cond, a, y2, zero, status1, trunc_sgl));
+ /* r = a - (q * b) */
+ emit_insn (gen_m2subrf4_cond (r, cond, a, q, b, zero, status1, trunc_off));
+ /* Q = single (q + (r * y2)) */
+ emit_insn (gen_m2addrf4_cond (q_res, cond, q, r, y2, y, status0, trunc_sgl));
+ /* Conversion back into SFmode. */
+ emit_insn (gen_mov_truncsfrf (operands[0], q_res));
+ DONE;
+})
+
+
+;; Double precision floating point division (maximum throughput algorithm).
+
+(define_expand "divdf3_internal_thr"
+ [(set (match_operand:DF 0 "fr_register_operand" "")
+ (div:DF (match_operand:DF 1 "fr_register_operand" "")
+ (match_operand:DF 2 "fr_register_operand" "")))]
+ "TARGET_INLINE_FLOAT_DIV"
+{
+ rtx q_res = gen_reg_rtx (RFmode);
+ rtx a = gen_reg_rtx (RFmode);
+ rtx b = gen_reg_rtx (RFmode);
+ rtx y = gen_reg_rtx (RFmode);
+ rtx e = gen_reg_rtx (RFmode);
+ rtx y1 = gen_reg_rtx (RFmode);
+ rtx e1 = gen_reg_rtx (RFmode);
+ rtx y2 = gen_reg_rtx (RFmode);
+ rtx e2 = gen_reg_rtx (RFmode);
+ rtx y3 = gen_reg_rtx (RFmode);
+ rtx q = gen_reg_rtx (RFmode);
+ rtx r = gen_reg_rtx (RFmode);
+ rtx cond = gen_reg_rtx (BImode);
+ rtx zero = CONST0_RTX (RFmode);
+ rtx one = CONST1_RTX (RFmode);
+ rtx status0 = CONST0_RTX (SImode);
+ rtx status1 = CONST1_RTX (SImode);
+ rtx trunc_dbl = CONST1_RTX (SImode);
+ rtx trunc_off = CONST2_RTX (SImode);
+ /* Empty conversions to put inputs into RFmode */
+ emit_insn (gen_mov_extendrfdf (a, operands[1]));
+ emit_insn (gen_mov_extendrfdf (b, operands[2]));
+ /* y = 1 / b */
+ emit_insn (gen_recip_approx_rf (y, a, b, cond, status0));
+ /* e = 1 - (b * y) */
+ emit_insn (gen_m2subrf4_cond (e, cond, one, b, y, zero, status1, trunc_off));
+ /* y1 = y + (y * e) */
+ emit_insn (gen_m2addrf4_cond (y1, cond, y, y, e, zero, status1, trunc_off));
+ /* e1 = e * e */
+ emit_insn (gen_mulrf3_cond (e1, cond, e, e, zero, status1, trunc_off));
+ /* y2 = y1 + (y1 * e1) */
+ emit_insn (gen_m2addrf4_cond (y2, cond, y1, y1, e1, zero, status1, trunc_off));
+ /* e2 = e1 * e1 */
+ emit_insn (gen_mulrf3_cond (e2, cond, e1, e1, zero, status1, trunc_off));
+ /* y3 = y2 + (y2 * e2) */
+ emit_insn (gen_m2addrf4_cond (y3, cond, y2, y2, e2, zero, status1, trunc_off));
+ /* q = double (a * y3) */
+ emit_insn (gen_mulrf3_cond (q, cond, a, y3, zero, status1, trunc_dbl));
+ /* r = a - (b * q) */
+ emit_insn (gen_m2subrf4_cond (r, cond, a, b, q, zero, status1, trunc_off));
+ /* Q = double (q + (r * y3)) */
+ emit_insn (gen_m2addrf4_cond (q_res, cond, q, r, y3, y, status0, trunc_dbl));
+ /* Conversion back into DFmode */
+ emit_insn (gen_mov_truncdfrf (operands[0], q_res));
+ DONE;
+})
* Re: Patch to change IA64 division code
2007-03-27 17:33 ` Steve Ellcey
@ 2007-07-03 1:41 ` Jim Wilson
2007-07-10 21:58 ` Steve Ellcey
0 siblings, 1 reply; 7+ messages in thread
From: Jim Wilson @ 2007-07-03 1:41 UTC (permalink / raw)
To: Steve Ellcey; +Cc: gcc-patches
On Tue, 2007-03-27 at 10:21 -0700, Steve Ellcey wrote:
> Here is a new version of my division change. I got rid of the _a and _b
> variants and changed the division sequences to use gen_* calls. This
> makes div.md smaller and easier to read. The code sequence generated is
> unchanged.
Yes, this is looking much nicer.
I noticed that you are using 'U' to match FP constant 0 in the combined
_a/_b patterns. However, 'U' is documented to be for a vector constant
0, and this isn't a vector constant. So either we need to change the
comments for 'U', or we need to add a new letter. There are two letters
reserved for FP constants, G and H, and we currently only use G. So I
think it would be better to add a new constraint H that is for FP
constant 0. (Or alternatively, we could modify G which is currently
both 0.0 and 1.0, and have G match 0.0 and H match 1.0, but that would
be more work.)
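Concretely, all the new constraint would take is a one-line match_test against
CONST0_RTX; roughly something like this (untested sketch, just to illustrate
what I mean):

(define_constraint "H"
  "Floating-point constant 0.0"
  (and (match_code "const_double")
       (match_test "op == CONST0_RTX (mode)")))

and then the combined _a/_b patterns would use "0,H" where they currently
use "0,U".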
I should have been a bit more explicit when I asked about the
UNSPEC_NOP_CONVERT. I don't understand why you aren't using the
standard float_extend and float_truncate operators. You should only add
an unspec if you have some actual need for one, and I don't
see one here. The comments you added say that you are just doing a FP
conversion, so just use the standard FP conversion RTL operators. It is
OK to have an FP conversion that later splits into a simple move
instruction. rs6000 for instance already has instances of that. If use
of the standard conversion operators fails for some reason, then it
would be good to have that reason listed in the comments. The use of an
UNSPEC unnecessarily here could result in nop move instructions that
won't get optimized away.
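What I have in mind is roughly this (untested sketch; presumably you would
keep your SDX_F mode macro and name the patterns per the usual
extend<mode>rf2 / truncrf<mode>2 convention):

(define_insn "extendsfrf2"
  [(set (match_operand:RF 0 "fr_register_operand" "=f")
        (float_extend:RF (match_operand:SF 1 "fr_register_operand" "f")))]
  ""
  "#"
  [(set_attr "itanium_class" "fmisc")
   (set_attr "predicable" "yes")])

(define_split
  [(set (match_operand:RF 0 "fr_register_operand" "")
        (float_extend:RF (match_operand:SF 1 "fr_register_operand" "")))]
  "reload_completed"
  [(set (match_dup 0) (match_dup 2))]
{
  /* SF and RF values live in the same FP registers, so the "conversion"
     is just the same hard register viewed in RFmode.  */
  operands[2] = gen_rtx_REG (RFmode, REGNO (operands[1]));
})

with the obvious float_truncate counterpart going the other way.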
> I am still a bit concerned about HARD_REGNO_CALLER_SAVE_MODE.
FYI you still have the ChangeLog entry for this change, but it isn't in
this version of the patch.
You raise a good point here. I had to look at the
HARD_REGNO_CALLER_SAVE_MODE code to remind myself how it works. I think
it is OK to use RFmode here as you had it originally. Since we already
have RFmode support via the __fpreg builtin type, this probably should
have been fixed earlier.
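In other words, I think keeping the macro in the form you originally had it
should be fine, i.e. something like:

#define HARD_REGNO_CALLER_SAVE_MODE(REGNO, NREGS, MODE) \
  ((FR_REGNO_P (REGNO) && (NREGS) == 1) ? RFmode \
   : choose_hard_reg_mode ((REGNO), (NREGS), false))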
--
Jim Wilson, GNU Tools Support, http://www.specifix.com
* Re: Patch to change IA64 division code
2007-07-03 1:41 ` Jim Wilson
@ 2007-07-10 21:58 ` Steve Ellcey
2007-07-25 18:13 ` Steve Ellcey
0 siblings, 1 reply; 7+ messages in thread
From: Steve Ellcey @ 2007-07-10 21:58 UTC (permalink / raw)
To: wilson; +Cc: gcc-patches
> I noticed that you are using 'U' to match FP constant 0 in the combined
> _a/_b patterns. However, 'U' is documented to be for a vector constant
> 0, and this isn't a vector constant. So either we need to change the
> comments for 'U', or we need to add a new letter. There are two letters
> reserved for FP constants, G and H, and we currently only use G. So I
> think it would be better to add a new constraint H that is for FP
> constant 0. (Or alternatively, we could modify G which is currently
> both 0.0 and 1.0, and have G match 0.0 and H match 1.0, but that would
> be more work.)
OK, I added 'H' to be the FP constant 0. I hadn't noticed that 'U' is
for vectors only. There is a comment about this in contraints.md, and
it should probably be cleaned up, but that can be done in a seperate
patch.
> I should have been a bit more explicit when I asked about the
> UNSPEC_NOP_CONVERT. I don't understand why you aren't using the
> standard float_extend and float_truncate operators. You should only add
> an unspec if you have some actual need for one, and I don't
> see one here. The comments you added say that you are just doing a FP
> conversion, so just use the standard FP conversion RTL operators. It is
> OK to have an FP conversion that later splits into a simple move
> instruction. rs6000 for instance already has instances of that. If use
> of the standard conversion operators fails for some reason, then it
> would be good to have that reason listed in the comments. The use of an
> UNSPEC unnecessarily here could result in nop move instructions that
> won't get optimized away.
OK, I used the regular conversion operator and it seems to work fine so
I made that change.
> > I am still a bit concerned about HARD_REGNO_CALLER_SAVE_MODE.
>
> You raise a good point here. I had to look at the
> HARD_REGNO_CALLER_SAVE_MODE code to remind myself how it works. I think
> it is OK to use RFmode here as you had it originally. Since we already
> have RFmode support via the __fpregs builtin type, this probably should
> have been fixed earlier.
OK, I changed this back to use RFmode. Here is the redone patch. OK
to checkin?
Steve Ellcey
sje@cup.hp.com
2007-07-10 Steve Ellcey <sje@cup.hp.com>
* config/ia64/ia64.h (HARD_REGNO_NREGS): Handle RFmode.
(HARD_REGNO_MODE_OK): Ditto.
(MODES_TIEABLE_P): Ditto.
(HARD_REGNO_CALLER_SAVE_MODE): Ditto.
(CLASS_MAX_NREGS): Ditto.
* config/ia64/ia64.c (ia64_print_operand_address): Add R format.
* config/ia64/ia64.md (divsf3_internal_thr): Removed.
(divdf3_internal_thr): Removed.
* config/ia64/div.md: New file.
* config/ia64/constraints.md: Add H constraint.
Index: config/ia64/ia64.h
===================================================================
--- config/ia64/ia64.h (revision 126495)
+++ config/ia64/ia64.h (working copy)
@@ -645,6 +645,7 @@ while (0)
: PR_REGNO_P (REGNO) && (MODE) == BImode ? 2 \
: PR_REGNO_P (REGNO) && (MODE) == CCImode ? 1 \
: FR_REGNO_P (REGNO) && (MODE) == XFmode ? 1 \
+ : FR_REGNO_P (REGNO) && (MODE) == RFmode ? 1 \
: FR_REGNO_P (REGNO) && (MODE) == XCmode ? 2 \
: (GET_MODE_SIZE (MODE) + UNITS_PER_WORD - 1) / UNITS_PER_WORD)
@@ -660,7 +661,7 @@ while (0)
: PR_REGNO_P (REGNO) ? \
(MODE) == BImode || GET_MODE_CLASS (MODE) == MODE_CC \
: GR_REGNO_P (REGNO) ? \
- (MODE) != CCImode && (MODE) != XFmode && (MODE) != XCmode \
+ (MODE) != CCImode && (MODE) != XFmode && (MODE) != XCmode && (MODE) != RFmode \
: AR_REGNO_P (REGNO) ? (MODE) == DImode \
: BR_REGNO_P (REGNO) ? (MODE) == DImode \
: 0)
@@ -677,15 +678,15 @@ while (0)
we can't tie it with any other modes. */
#define MODES_TIEABLE_P(MODE1, MODE2) \
(GET_MODE_CLASS (MODE1) == GET_MODE_CLASS (MODE2) \
- && ((((MODE1) == XFmode) || ((MODE1) == XCmode)) \
- == (((MODE2) == XFmode) || ((MODE2) == XCmode))) \
+ && ((((MODE1) == XFmode) || ((MODE1) == XCmode) || ((MODE1) == RFmode)) \
+ == (((MODE2) == XFmode) || ((MODE2) == XCmode) || ((MODE2) == RFmode))) \
&& (((MODE1) == BImode) == ((MODE2) == BImode)))
/* Specify the modes required to caller save a given hard regno.
We need to ensure floating pt regs are not saved as DImode. */
#define HARD_REGNO_CALLER_SAVE_MODE(REGNO, NREGS, MODE) \
- ((FR_REGNO_P (REGNO) && (NREGS) == 1) ? XFmode \
+ ((FR_REGNO_P (REGNO) && (NREGS) == 1) ? RFmode \
: choose_hard_reg_mode ((REGNO), (NREGS), false))
\f
/* Handling Leaf Functions */
@@ -883,6 +884,7 @@ enum reg_class
#define CLASS_MAX_NREGS(CLASS, MODE) \
((MODE) == BImode && (CLASS) == PR_REGS ? 2 \
: (((CLASS) == FR_REGS || (CLASS) == FP_REGS) && (MODE) == XFmode) ? 1 \
+ : (((CLASS) == FR_REGS || (CLASS) == FP_REGS) && (MODE) == RFmode) ? 1 \
: (((CLASS) == FR_REGS || (CLASS) == FP_REGS) && (MODE) == XCmode) ? 2 \
: (GET_MODE_SIZE (MODE) + UNITS_PER_WORD - 1) / UNITS_PER_WORD)
Index: config/ia64/ia64.c
===================================================================
--- config/ia64/ia64.c (revision 126495)
+++ config/ia64/ia64.c (working copy)
@@ -4496,6 +4496,7 @@ ia64_print_operand_address (FILE * strea
O Append .acq for volatile load.
P Postincrement of a MEM.
Q Append .rel for volatile store.
+ R Print .s .d or nothing for a single, double or no truncation.
S Shift amount for shladd instruction.
T Print an 8-bit sign extended number (K) as a 32-bit unsigned number
for Intel assembler.
@@ -4634,6 +4635,17 @@ ia64_print_operand (FILE * file, rtx x,
case 'Q':
if (MEM_VOLATILE_P (x))
fputs(".rel", file);
+ return;
+
+ case 'R':
+ if (x == CONST0_RTX (GET_MODE (x)))
+ fputs(".s", file);
+ else if (x == CONST1_RTX (GET_MODE (x)))
+ fputs(".d", file);
+ else if (x == CONST2_RTX (GET_MODE (x)))
+ ;
+ else
+ output_operand_lossage ("invalid %%R value");
return;
case 'S':
Index: config/ia64/ia64.md
===================================================================
--- config/ia64/ia64.md (revision 126495)
+++ config/ia64/ia64.md (working copy)
@@ -3109,60 +3109,6 @@ (define_insn_and_split "divsf3_internal_
}
[(set_attr "predicable" "no")])
-(define_insn_and_split "divsf3_internal_thr"
- [(set (match_operand:SF 0 "fr_register_operand" "=&f")
- (div:SF (match_operand:SF 1 "fr_register_operand" "f")
- (match_operand:SF 2 "fr_register_operand" "f")))
- (clobber (match_scratch:XF 3 "=&f"))
- (clobber (match_scratch:XF 4 "=f"))
- (clobber (match_scratch:BI 5 "=c"))]
- "TARGET_INLINE_FLOAT_DIV == INL_MAX_THR"
- "#"
- "&& reload_completed"
- [(parallel [(set (match_dup 6) (div:XF (const_int 1) (match_dup 8)))
- (set (match_dup 5) (unspec:BI [(match_dup 7) (match_dup 8)]
- UNSPEC_FR_RECIP_APPROX))
- (use (const_int 0))])
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 3)
- (minus:XF (match_dup 10)
- (mult:XF (match_dup 8) (match_dup 6))))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 3)
- (plus:XF (mult:XF (match_dup 3) (match_dup 3))
- (match_dup 3)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 6)
- (plus:XF (mult:XF (match_dup 3) (match_dup 6))
- (match_dup 6)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 9)
- (float_truncate:SF
- (mult:XF (match_dup 7) (match_dup 6))))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 4)
- (minus:XF (match_dup 7)
- (mult:XF (match_dup 8) (match_dup 3))))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (set (match_dup 0)
- (float_truncate:SF
- (plus:XF (mult:XF (match_dup 4) (match_dup 6))
- (match_dup 3)))))
- ]
-{
- operands[6] = gen_rtx_REG (XFmode, REGNO (operands[0]));
- operands[7] = gen_rtx_REG (XFmode, REGNO (operands[1]));
- operands[8] = gen_rtx_REG (XFmode, REGNO (operands[2]));
- operands[9] = gen_rtx_REG (SFmode, REGNO (operands[3]));
- operands[10] = CONST1_RTX (XFmode);
-}
- [(set_attr "predicable" "no")])
-
;; Inline square root.
(define_insn "*sqrt_approx"
@@ -3615,72 +3561,6 @@ (define_insn_and_split "divdf3_internal_
}
[(set_attr "predicable" "no")])
-(define_insn_and_split "divdf3_internal_thr"
- [(set (match_operand:DF 0 "fr_register_operand" "=&f")
- (div:DF (match_operand:DF 1 "fr_register_operand" "f")
- (match_operand:DF 2 "fr_register_operand" "f")))
- (clobber (match_scratch:XF 3 "=&f"))
- (clobber (match_scratch:DF 4 "=f"))
- (clobber (match_scratch:BI 5 "=c"))]
- "TARGET_INLINE_FLOAT_DIV == INL_MAX_THR"
- "#"
- "&& reload_completed"
- [(parallel [(set (match_dup 6) (div:XF (const_int 1) (match_dup 8)))
- (set (match_dup 5) (unspec:BI [(match_dup 7) (match_dup 8)]
- UNSPEC_FR_RECIP_APPROX))
- (use (const_int 0))])
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 3)
- (minus:XF (match_dup 10)
- (mult:XF (match_dup 8) (match_dup 6))))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 6)
- (plus:XF (mult:XF (match_dup 3) (match_dup 6))
- (match_dup 6)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 3)
- (mult:XF (match_dup 3) (match_dup 3)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 6)
- (plus:XF (mult:XF (match_dup 3) (match_dup 6))
- (match_dup 6)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 3)
- (mult:XF (match_dup 3) (match_dup 3)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 6)
- (plus:XF (mult:XF (match_dup 3) (match_dup 6))
- (match_dup 6)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 9)
- (float_truncate:DF
- (mult:XF (match_dup 7) (match_dup 6))))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 4)
- (minus:DF (match_dup 1)
- (mult:DF (match_dup 2) (match_dup 9))))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (set (match_dup 0)
- (plus:DF (mult:DF (match_dup 4) (match_dup 0))
- (match_dup 9))))
- ]
-{
- operands[6] = gen_rtx_REG (XFmode, REGNO (operands[0]));
- operands[7] = gen_rtx_REG (XFmode, REGNO (operands[1]));
- operands[8] = gen_rtx_REG (XFmode, REGNO (operands[2]));
- operands[9] = gen_rtx_REG (DFmode, REGNO (operands[3]));
- operands[10] = CONST1_RTX (XFmode);
-}
- [(set_attr "predicable" "no")])
-
;; Inline square root.
(define_expand "sqrtdf2"
@@ -6541,3 +6421,5 @@ (define_insn "ip_value"
(include "vect.md")
;; Atomic operations
(include "sync.md")
+;; New division operations
+(include "div.md")
Index: config/ia64/div.md
===================================================================
--- config/ia64/div.md (revision 0)
+++ config/ia64/div.md (revision 0)
@@ -0,0 +1,313 @@
+
+;; For the internal conditional math routines:
+
+;; operand 0 is always the result
+;; operand 1 is always the predicate
+;; operand 2, 3, and sometimes 4 are the input values.
+;; operand 4 or 5 is the floating point status register to use.
+;; operand 5 or 6 is the rounding to do. (0 = single, 1 = double, 2 = none)
+;;
+;; addrf3_cond - F0 = F2 + F3
+;; subrf3_cond - F0 = F2 - F3
+;; mulrf3_cond - F0 = F2 * F3
+;; nmulrf3_cond - F0 = - (F2 * F3)
+;; m1addrf4_cond - F0 = (F2 * F3) + F4
+;; m1subrf4_cond - F0 = (F2 * F3) - F4
+;; m2addrf4_cond - F0 = F2 + (F3 * F4)
+;; m2subrf4_cond - F0 = F2 - (F3 * F4)
+
+;; Basic plus/minus/mult operations
+
+(define_insn "addrf3_cond"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f,f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c,c")
+ (const_int 0))
+ (plus:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG,fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG,fG"))
+ (match_operand:RF 4 "fr_reg_or_0_operand" "0,H")))
+ (use (match_operand:SI 5 "const_int_operand" ""))
+ (use (match_operand:SI 6 "const_int_operand" ""))]
+ ""
+ "(%1) fadd%R6.s%5 %0 = %F2, %F3"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "subrf3_cond"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f,f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c,c")
+ (const_int 0))
+ (minus:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG,fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG,fG"))
+ (match_operand:RF 4 "fr_reg_or_0_operand" "0,H")))
+ (use (match_operand:SI 5 "const_int_operand" ""))
+ (use (match_operand:SI 6 "const_int_operand" ""))]
+ ""
+ "(%1) fsub%R6.s%5 %0 = %F2, %F3"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "mulrf3_cond"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f,f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c,c")
+ (const_int 0))
+ (mult:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG,fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG,fG"))
+ (match_operand:RF 4 "fr_reg_or_0_operand" "0,H")))
+ (use (match_operand:SI 5 "const_int_operand" ""))
+ (use (match_operand:SI 6 "const_int_operand" ""))]
+ ""
+ "(%1) fmpy%R6.s%5 %0 = %F2, %F3"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+;; neg-mult operation
+
+(define_insn "nmulrf3_cond"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f,f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c,c")
+ (const_int 0))
+ (neg:RF (mult:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG,fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG,fG")))
+ (match_operand:RF 4 "fr_reg_or_0_operand" "0,H")))
+ (use (match_operand:SI 5 "const_int_operand" ""))
+ (use (match_operand:SI 6 "const_int_operand" ""))]
+ ""
+ "(%1) fnmpy%R6.s%5 %0 = %F2, %F3"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+;; add-mult/sub-mult operations (mult as op1)
+
+(define_insn "m1addrf4_cond"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f,f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c,c")
+ (const_int 0))
+ (plus:RF
+ (mult:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG,fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG,fG"))
+ (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG,fG"))
+ (match_operand:RF 5 "fr_reg_or_0_operand" "0,H")))
+ (use (match_operand:SI 6 "const_int_operand" ""))
+ (use (match_operand:SI 7 "const_int_operand" ""))]
+ ""
+ "(%1) fma%R7.s%6 %0 = %F2, %F3, %F4"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "m1subrf4_cond"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f,f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c,c")
+ (const_int 0))
+ (minus:RF
+ (mult:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG,fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG,fG"))
+ (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG,fG"))
+ (match_operand:RF 5 "fr_reg_or_0_operand" "0,H")))
+ (use (match_operand:SI 6 "const_int_operand" ""))
+ (use (match_operand:SI 7 "const_int_operand" ""))]
+ ""
+ "(%1) fms%R7.s%6 %0 = %F2, %F3, %F4"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+;; add-mult/sub-mult operations (mult as op2)
+
+(define_insn "m2addrf4_cond"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f,f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c,c")
+ (const_int 0))
+ (plus:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG,fG")
+ (mult:RF
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG,fG")
+ (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG,fG")))
+ (match_operand:RF 5 "fr_reg_or_0_operand" "0,H")))
+ (use (match_operand:SI 6 "const_int_operand" ""))
+ (use (match_operand:SI 7 "const_int_operand" ""))]
+ ""
+ "(%1) fma%R7.s%6 %0 = %F3, %F4, %F2"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "m2subrf4_cond"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f,f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c,c")
+ (const_int 0))
+ (minus:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG,fG")
+ (mult:RF
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG,fG")
+ (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG,fG")))
+ (match_operand:RF 5 "fr_reg_or_0_operand" "0,H")))
+ (use (match_operand:SI 6 "const_int_operand" ""))
+ (use (match_operand:SI 7 "const_int_operand" ""))]
+ ""
+ "(%1) fnma%R7.s%6 %0 = %F3, %F4, %F2"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+;; Conversions to/from RF and SF/DF/XF
+;; These conversions should not generate any code but make it possible
+;; for all the instructions used to implement floating point division
+;; to be written for RFmode only and to not have to handle multiple
+;; modes or to have to handle a register in more than one mode.
+
+(define_mode_macro SDX_F [SF DF XF])
+
+(define_insn "extend<mode>rf2"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (float_extend:RF (match_operand:SDX_F 1 "fr_register_operand" "f")))]
+ ""
+ "#"
+ [(set_attr "itanium_class" "fmisc")
+ (set_attr "predicable" "yes")])
+
+(define_split
+ [(set (match_operand:RF 0 "fr_register_operand" "")
+ (float_extend:RF (match_operand:SDX_F 1 "fr_register_operand" "")))]
+ "reload_completed"
+ [(set (match_dup 0) (match_dup 2))]
+{
+ operands[2] = gen_rtx_REG (RFmode, REGNO (operands[1]));
+})
+
+
+(define_insn "truncrf<mode>2"
+ [(set (match_operand:SDX_F 0 "fr_register_operand" "=f")
+ (float_truncate:SDX_F (match_operand:RF 1 "fr_register_operand" "f")))]
+ ""
+ "#"
+ [(set_attr "itanium_class" "fmisc")
+ (set_attr "predicable" "yes")])
+
+(define_split
+ [(set (match_operand:SDX_F 0 "fr_register_operand" "")
+ (float_truncate:SDX_F (match_operand:RF 1 "fr_register_operand" "")))]
+ "reload_completed"
+ [(set (match_dup 0) (match_dup 2))]
+{
+ operands[2] = gen_rtx_REG (<MODE>mode, REGNO (operands[1]));
+})
+
+;; Reciprocal approximation
+
+(define_insn "recip_approx_rf"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (div:RF (match_operand:RF 1 "fr_register_operand" "f")
+ (match_operand:RF 2 "fr_register_operand" "f")))
+ (set (match_operand:BI 3 "register_operand" "=c")
+ (unspec:BI [(match_dup 1) (match_dup 2)] UNSPEC_FR_RECIP_APPROX))
+ (use (match_operand:SI 4 "const_int_operand" ""))]
+ ""
+ "frcpa.s%4 %0, %3 = %1, %2"
+ [(set_attr "itanium_class" "fmisc")
+ (set_attr "predicable" "no")])
+
+;; Single precision floating point division (maximum throughput algorithm).
+
+(define_expand "divsf3_internal_thr"
+ [(set (match_operand:SF 0 "fr_register_operand" "")
+ (div:SF (match_operand:SF 1 "fr_register_operand" "")
+ (match_operand:SF 2 "fr_register_operand" "")))]
+ "TARGET_INLINE_FLOAT_DIV"
+{
+ rtx y = gen_reg_rtx (RFmode);
+ rtx a = gen_reg_rtx (RFmode);
+ rtx b = gen_reg_rtx (RFmode);
+ rtx e = gen_reg_rtx (RFmode);
+ rtx y1 = gen_reg_rtx (RFmode);
+ rtx y2 = gen_reg_rtx (RFmode);
+ rtx q = gen_reg_rtx (RFmode);
+ rtx r = gen_reg_rtx (RFmode);
+ rtx q_res = gen_reg_rtx (RFmode);
+ rtx cond = gen_reg_rtx (BImode);
+ rtx zero = CONST0_RTX (RFmode);
+ rtx one = CONST1_RTX (RFmode);
+ rtx status0 = CONST0_RTX (SImode);
+ rtx status1 = CONST1_RTX (SImode);
+ rtx trunc_sgl = CONST0_RTX (SImode);
+ rtx trunc_off = CONST2_RTX (SImode);
+
+ /* Empty conversions to put inputs into RFmode. */
+ emit_insn (gen_extendsfrf2 (a, operands[1]));
+ emit_insn (gen_extendsfrf2 (b, operands[2]));
+ /* y = 1 / b */
+ emit_insn (gen_recip_approx_rf (y, a, b, cond, status0));
+ /* e = 1 - (b * y) */
+ emit_insn (gen_m2subrf4_cond (e, cond, one, b, y, zero, status1, trunc_off));
+ /* y1 = y + (y * e) */
+ emit_insn (gen_m2addrf4_cond (y1, cond, y, y, e, zero, status1, trunc_off));
+ /* y2 = y + (y1 * e) */
+ emit_insn (gen_m2addrf4_cond (y2, cond, y, y1, e, zero, status1, trunc_off));
+ /* q = single(a * y2) */
+ emit_insn (gen_mulrf3_cond (q, cond, a, y2, zero, status1, trunc_sgl));
+ /* r = a - (q * b) */
+ emit_insn (gen_m2subrf4_cond (r, cond, a, q, b, zero, status1, trunc_off));
+ /* Q = single (q + (r * y2)) */
+ emit_insn (gen_m2addrf4_cond (q_res, cond, q, r, y2, y, status0, trunc_sgl));
+ /* Conversion back into SFmode. */
+ emit_insn (gen_truncrfsf2 (operands[0], q_res));
+ DONE;
+})
+
+
+;; Double precision floating point division (maximum throughput algorithm).
+
+(define_expand "divdf3_internal_thr"
+ [(set (match_operand:DF 0 "fr_register_operand" "")
+ (div:DF (match_operand:DF 1 "fr_register_operand" "")
+ (match_operand:DF 2 "fr_register_operand" "")))]
+ "TARGET_INLINE_FLOAT_DIV"
+{
+ rtx q_res = gen_reg_rtx (RFmode);
+ rtx a = gen_reg_rtx (RFmode);
+ rtx b = gen_reg_rtx (RFmode);
+ rtx y = gen_reg_rtx (RFmode);
+ rtx e = gen_reg_rtx (RFmode);
+ rtx y1 = gen_reg_rtx (RFmode);
+ rtx e1 = gen_reg_rtx (RFmode);
+ rtx y2 = gen_reg_rtx (RFmode);
+ rtx e2 = gen_reg_rtx (RFmode);
+ rtx y3 = gen_reg_rtx (RFmode);
+ rtx q = gen_reg_rtx (RFmode);
+ rtx r = gen_reg_rtx (RFmode);
+ rtx cond = gen_reg_rtx (BImode);
+ rtx zero = CONST0_RTX (RFmode);
+ rtx one = CONST1_RTX (RFmode);
+ rtx status0 = CONST0_RTX (SImode);
+ rtx status1 = CONST1_RTX (SImode);
+ rtx trunc_dbl = CONST1_RTX (SImode);
+ rtx trunc_off = CONST2_RTX (SImode);
+ /* Empty conversions to put inputs into RFmode */
+ emit_insn (gen_extenddfrf2 (a, operands[1]));
+ emit_insn (gen_extenddfrf2 (b, operands[2]));
+ /* y = 1 / b */
+ emit_insn (gen_recip_approx_rf (y, a, b, cond, status0));
+ /* e = 1 - (b * y) */
+ emit_insn (gen_m2subrf4_cond (e, cond, one, b, y, zero, status1, trunc_off));
+ /* y1 = y + (y * e) */
+ emit_insn (gen_m2addrf4_cond (y1, cond, y, y, e, zero, status1, trunc_off));
+ /* e1 = e * e */
+ emit_insn (gen_mulrf3_cond (e1, cond, e, e, zero, status1, trunc_off));
+ /* y2 = y1 + (y1 * e1) */
+ emit_insn (gen_m2addrf4_cond (y2, cond, y1, y1, e1, zero, status1, trunc_off));
+ /* e2 = e1 * e1 */
+ emit_insn (gen_mulrf3_cond (e2, cond, e1, e1, zero, status1, trunc_off));
+ /* y3 = y2 + (y2 * e2) */
+ emit_insn (gen_m2addrf4_cond (y3, cond, y2, y2, e2, zero, status1, trunc_off));
+ /* q = double (a * y3) */
+ emit_insn (gen_mulrf3_cond (q, cond, a, y3, zero, status1, trunc_dbl));
+ /* r = a - (b * q) */
+ emit_insn (gen_m2subrf4_cond (r, cond, a, b, q, zero, status1, trunc_off));
+ /* Q = double (q + (r * y3)) */
+ emit_insn (gen_m2addrf4_cond (q_res, cond, q, r, y3, y, status0, trunc_dbl));
+ /* Conversion back into DFmode */
+ emit_insn (gen_truncrfdf2 (operands[0], q_res));
+ DONE;
+})
Index: config/ia64/constraints.md
===================================================================
--- config/ia64/constraints.md (revision 126495)
+++ config/ia64/constraints.md (working copy)
@@ -90,6 +90,11 @@ (define_constraint "G"
(and (match_code "const_double")
(match_test "op == CONST0_RTX (mode) || op == CONST1_RTX (mode)")))
+(define_constraint "H"
+ "0.0"
+ (and (match_code "const_double")
+ (match_test "op == CONST0_RTX (mode)")))
+
;; Extra constraints
;; Note that while this accepts mem, it only accepts non-volatile mem,
* Re: Patch to change IA64 division code
2007-07-10 21:58 ` Steve Ellcey
@ 2007-07-25 18:13 ` Steve Ellcey
0 siblings, 0 replies; 7+ messages in thread
From: Steve Ellcey @ 2007-07-25 18:13 UTC (permalink / raw)
To: gcc-patches, wilson
> 2007-07-10 Steve Ellcey <sje@cup.hp.com>
>
> * config/ia64/ia64.h (HARD_REGNO_NREGS): Handle RFmode.
> (HARD_REGNO_MODE_OK): Ditto.
> (MODES_TIEABLE_P): Ditto.
> (HARD_REGNO_CALLER_SAVE_MODE): Ditto.
> (CLASS_MAX_NREGS): Ditto.
> * config/ia64/ia64.c (ia64_print_operand_address): Add R format.
> * config/ia64/ia64.md (divsf3_internal_thr): Removed.
> (divdf3_internal_thr): Removed.
> * config/ia64/div.md: New file.
> * config/ia64/constraints.md: Add H constraint.
FYI: Jim Wilson told me this patch was OK when I saw him at the GCC
Summit so I am going to go ahead and check it in.
Steve Ellcey
sje@cup.hp.com