From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <meissner@sourceware.org>
Received: by sourceware.org (Postfix, from userid 1005)
 id D6550385802C; Wed,  2 Mar 2022 18:12:52 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org D6550385802C
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
From: Michael Meissner <meissner@gcc.gnu.org>
To: gcc-cvs@gcc.gnu.org
Subject: [gcc(refs/users/meissner/heads/work079)] Optimize multiply/add of
 DImode extended to TImode.
X-Act-Checkin: gcc
X-Git-Author: Michael Meissner <meissner@linux.ibm.com>
X-Git-Refname: refs/users/meissner/heads/work079
X-Git-Oldrev: a18d3c65db16ed501a872c1fd99455bad33f732d
X-Git-Newrev: 0696a98ba3b6ef146e5441670818a586e394cbe7
Message-Id: <20220302181252.D6550385802C@sourceware.org>
Date: Wed,  2 Mar 2022 18:12:52 +0000 (GMT)
X-BeenThere: gcc-cvs@gcc.gnu.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Gcc-cvs mailing list <gcc-cvs.gcc.gnu.org>
List-Unsubscribe: <https://gcc.gnu.org/mailman/options/gcc-cvs>,
 <mailto:gcc-cvs-request@gcc.gnu.org?subject=unsubscribe>
List-Archive: <https://gcc.gnu.org/pipermail/gcc-cvs/>
List-Help: <mailto:gcc-cvs-request@gcc.gnu.org?subject=help>
List-Subscribe: <https://gcc.gnu.org/mailman/listinfo/gcc-cvs>,
 <mailto:gcc-cvs-request@gcc.gnu.org?subject=subscribe>
X-List-Received-Date: Wed, 02 Mar 2022 18:12:52 -0000

https://gcc.gnu.org/g:0696a98ba3b6ef146e5441670818a586e394cbe7

commit 0696a98ba3b6ef146e5441670818a586e394cbe7
Author: Michael Meissner <meissner@linux.ibm.com>
Date:   Tue Mar 1 01:33:51 2022 -0500

    Optimize multiply/add of DImode extended to TImode.
    
    On power9 and power10 systems, we have instructions that support
    multiply/add of 64-bit integers converted to 128-bit integers, producing
    128-bit results.  This patch adds support to generate these instructions.
    
    Previously we had define_expands to handle conversion of the 64-bit extend
    to 128-bit and multiply.  This patch changes these define_expands to
    define_insn_and_split and then it provides combiner patterns to generate
    these multiply/add instructions.
    
    2022-03-01   Michael Meissner  <meissner@linux.ibm.com>
    
    gcc/
            PR target/103109
            * config/rs6000/rs6000.md (su_int32): New code attribute.
            (<u>mul<mode><dmode>3): Convert into define_insn_and_split.
            (maddld<mode>4): Add generator function.
            (<u>mulditi3_<u>adddi3): New insn.
            (<u>mulditi3_add_const): New insn.
            (addti3): Convert into define_insn_and_split.
            (subti3): Likewise.

Diff:
---
 gcc/config/rs6000/rs6000.md | 160 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 143 insertions(+), 17 deletions(-)
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index fdfbc6566a5..b5fc1855c35 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -676,6 +676,9 @@
 		       (float		"")
 		       (unsigned_float	"uns")])
 
+(define_code_attr su_int32 [(sign_extend "s32bit_cint_operand")
+			    (zero_extend "c32bit_cint_operand")])
+
 ; Various instructions that come in SI and DI forms.
 ; A generic w/d attribute, for things like cmpw/cmpd.
 (define_mode_attr wd [(QI    "b")
@@ -3199,13 +3202,16 @@
   "mulhw<u> %0,%1,%2"
   [(set_attr "type" "mul")])
 
-(define_expand "<u>mul<mode><dmode>3"
-  [(set (match_operand:<DMODE> 0 "gpc_reg_operand")
+(define_insn_and_split "<u>mul<mode><dmode>3"
+  [(set (match_operand:<DMODE> 0 "gpc_reg_operand" "=&r")
 	(mult:<DMODE> (any_extend:<DMODE>
-			(match_operand:GPR 1 "gpc_reg_operand"))
+		       (match_operand:GPR 1 "gpc_reg_operand" "r"))
 		      (any_extend:<DMODE>
-			(match_operand:GPR 2 "gpc_reg_operand"))))]
+		       (match_operand:GPR 2 "gpc_reg_operand" "r"))))]
   "!(<MODE>mode == SImode && TARGET_POWERPC64)"
+  "#"
+  "&& 1"
+  [(pc)]
 {
   rtx l = gen_reg_rtx (<MODE>mode);
   rtx h = gen_reg_rtx (<MODE>mode);
@@ -3214,9 +3220,10 @@
   emit_move_insn (gen_lowpart (<MODE>mode, operands[0]), l);
   emit_move_insn (gen_highpart (<MODE>mode, operands[0]), h);
   DONE;
-})
+}
+  [(set_attr "length" "8")])
 
-(define_insn "*maddld<mode>4"
+(define_insn "maddld<mode>4"
   [(set (match_operand:GPR 0 "gpc_reg_operand" "=r")
 	(plus:GPR (mult:GPR (match_operand:GPR 1 "gpc_reg_operand" "r")
 			    (match_operand:GPR 2 "gpc_reg_operand" "r"))
@@ -3225,6 +3232,113 @@
   "maddld %0,%1,%2,%3"
   [(set_attr "type" "mul")])
 
+(define_insn_and_split "*<u>mulditi3_<u>adddi3"
+  [(set (match_operand:TI 0 "gpc_reg_operand" "=&r")
+	(plus:TI
+	 (mult:TI
+	  (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r"))
+	  (any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r")))
+	 (any_extend:TI (match_operand:DI 3 "gpc_reg_operand" "r"))))]
+  "TARGET_MADDLD && TARGET_POWERPC64"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  rtx dest = operands[0];
+  rtx dest_hi = gen_highpart (DImode, dest);
+  rtx dest_lo = gen_lowpart (DImode, dest);
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  rtx op3 = operands[3];
+  rtx tmp_hi, tmp_lo;
+
+  if (can_create_pseudo_p ())
+    {
+      tmp_hi = gen_reg_rtx (DImode);
+      tmp_lo = gen_reg_rtx (DImode);
+    }
+  else
+    {
+      tmp_hi = dest_hi;
+      tmp_lo = dest_lo;
+    }
+
+  emit_insn (gen_<u>mulditi3_<u>adddi3_upper (tmp_hi, op1, op2, op3));
+  emit_insn (gen_maddlddi4 (tmp_lo, op1, op2, op3));
+
+  if (can_create_pseudo_p ())
+    {
+      emit_move_insn (dest_hi, tmp_hi);
+      emit_move_insn (dest_lo, tmp_lo);
+    }
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize 128-bit multiply with zero/sign extend and adding a constant.  We
+;; force the constant into a register to generate li, maddhd, and maddld,
+;; instead of mulld, mulhd, addic, and addze.  We can't combine this pattern
+;; with the pattern that handles registers, since constants don't have a sign
+;; or zero extend around them.
+(define_insn_and_split "*<u>mulditi3_add_const"
+  [(set (match_operand:TI 0 "gpc_reg_operand" "=&r")
+	(plus:TI
+	 (mult:TI
+	  (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r"))
+	  (any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r")))
+	 (match_operand 3 "<su_int32>" "r")))]
+  "TARGET_MADDLD && TARGET_POWERPC64
+"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  rtx dest = operands[0];
+  rtx dest_hi = gen_highpart (DImode, dest);
+  rtx dest_lo = gen_lowpart (DImode, dest);
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  rtx op3 = force_reg (DImode, operands[3]);
+  rtx tmp_hi, tmp_lo;
+
+  if (can_create_pseudo_p ())
+    {
+      tmp_hi = gen_reg_rtx (DImode);
+      tmp_lo = gen_reg_rtx (DImode);
+    }
+  else
+    {
+      tmp_hi = dest_hi;
+      tmp_lo = dest_lo;
+    }
+
+  emit_insn (gen_<u>mulditi3_<u>adddi3_upper (tmp_hi, op1, op2, op3));
+  emit_insn (gen_maddlddi4 (tmp_lo, op1, op2, op3));
+
+  if (can_create_pseudo_p ())
+    {
+      emit_move_insn (dest_hi, tmp_hi);
+      emit_move_insn (dest_lo, tmp_lo);
+    }
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+(define_insn "<u>mulditi3_<u>adddi3_upper"
+  [(set (match_operand:DI 0 "gpc_reg_operand" "=r")
+	(truncate:DI
+	 (lshiftrt:TI
+	  (plus:TI
+	   (mult:TI
+	    (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r"))
+	    (any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r")))
+	   (any_extend:TI (match_operand:DI 3 "gpc_reg_operand" "r")))
+	  (const_int 64))))]
+  "TARGET_MADDLD && TARGET_POWERPC64"
+  "maddhd<u> %0,%1,%2,%3"
+  [(set_attr "type" "mul")
+   (set_attr "size" "64")])
+
 (define_insn "udiv<mode>3"
   [(set (match_operand:GPR 0 "gpc_reg_operand" "=r")
         (udiv:GPR (match_operand:GPR 1 "gpc_reg_operand" "r")
@@ -7029,12 +7143,19 @@
 ;; allocator from allocating registers that overlap with the inputs
 ;; (for example, having an input in 7,8 and an output in 6,7).  We
 ;; also allow for the output being the same as one of the inputs.
-
-(define_expand "addti3"
-  [(set (match_operand:TI 0 "gpc_reg_operand")
-	(plus:TI (match_operand:TI 1 "gpc_reg_operand")
-		 (match_operand:TI 2 "reg_or_short_operand")))]
+;;
+;; Addti3/subti3 are define_insn_and_splits instead of define_expand, to allow
+;; for combine to make things like multiply and add with extend operations.
+
+(define_insn_and_split "addti3"
+  [(set (match_operand:TI 0 "gpc_reg_operand" "=&r,r,r")
+	(plus:TI (match_operand:TI 1 "gpc_reg_operand" "r,0,r")
+		 (match_operand:TI 2 "reg_or_short_operand" "rn,r,0")))
+   (clobber (reg:DI CA_REGNO))]
   "TARGET_64BIT"
+  "#"
+  "&& 1"
+  [(pc)]
 {
   rtx lo0 = gen_lowpart (DImode, operands[0]);
   rtx lo1 = gen_lowpart (DImode, operands[1]);
@@ -7051,13 +7172,17 @@
   emit_insn (gen_adddi3_carry (lo0, lo1, lo2));
   emit_insn (gen_adddi3_carry_in (hi0, hi1, hi2));
   DONE;
-})
+}
+  [(set_attr "length" "8")])
 
-(define_expand "subti3"
-  [(set (match_operand:TI 0 "gpc_reg_operand")
-	(minus:TI (match_operand:TI 1 "reg_or_short_operand")
-		  (match_operand:TI 2 "gpc_reg_operand")))]
+(define_insn_and_split "subti3"
+  [(set (match_operand:TI 0 "gpc_reg_operand" "=&r,r,r")
+	(minus:TI (match_operand:TI 1 "reg_or_short_operand" "rn,0,r")
+		  (match_operand:TI 2 "gpc_reg_operand" "r,r,0")))]
   "TARGET_64BIT"
+  "#"
+  "&& 1"
+  [(pc)]
 {
   rtx lo0 = gen_lowpart (DImode, operands[0]);
   rtx lo1 = gen_lowpart (DImode, operands[1]);
@@ -7074,7 +7199,8 @@
   emit_insn (gen_subfdi3_carry (lo0, lo2, lo1));
   emit_insn (gen_subfdi3_carry_in (hi0, hi2, hi1));
   DONE;
-})
+}
+  [(set_attr "length" "8")])
 
 ;; 128-bit logical operations expanders