[gcc(refs/users/meissner/heads/work080)] Optimize multiply/add of DImode extended to TImode.

public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed

* [gcc(refs/users/meissner/heads/work080)] Optimize multiply/add of DImode extended to TImode.
@ 2022-03-07 23:43 Michael Meissner
  0 siblings, 0 replies; 4+ messages in thread
From: Michael Meissner @ 2022-03-07 23:43 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:6fd9ec9b36afa474f6e5aec63e2f4fa98d4365ca

commit 6fd9ec9b36afa474f6e5aec63e2f4fa98d4365ca
Author: Michael Meissner <meissner@linux.ibm.com>
Date:   Mon Mar 7 18:43:26 2022 -0500

    Optimize multiply/add of DImode extended to TImode.
    
    On power9 and power10 systems, we have instructions that support doing
    64-bit integers converted to 128-bit integers and producing 128-bit
    results.  This patch adds support to generate these instructions.
    
    Previously we had define_expands to handle conversion of the 64-bit extend
    to 128-bit and multiply.  This patch changes these define_expands to
    define_insn_and_split and then it provides combiner patterns to generate
    thes multiply/add instructions.
    
    To support using this optimization on power9, we extend the sign extend
    DImode to TImode to also run on power9 (added for PR target/104698).
    
    We add support for doing an unsigned DImode to TImode conversion.  We need
    these conversions to exist on power9 so that the combiner can properly
    combine the extend, multiply, and add instructions.
    
    2022-03-07   Michael Meissner  <meissner@linux.ibm.com>
    
    gcc/
            PR target/103109
            * config/rs6000/rs6000.md (su_int32): New code attribute.
            (<u>mul<mode><dmode>3): Convert from define_expand to
            define_insn_and_split.
            (maddld<mode>4): Add generator function.
            (<u>mulditi3_<u>adddi3): New insn.
            (<u>mulditi3_add_const): New insn.
            (<u>mulditi3_<u>adddi3_upper): New insn.
            (addti3): Convert from define_expand to define_insn_and_split.
            (subti3): Likewise.
            * config/rs6000/vsx.md (extendditi2): Allow on power9 systems.
            Add isa attribute for the stuff that needs power10 support.
            (zero_extendditi2): New insn.
    
    gcc/testsuite/
            PR target/103109
            * gcc.target/powerpc/pr103109.c: New test.

Diff:
---
 gcc/config/rs6000/rs6000.md                 | 166 +++++++++++++++++++++++++---
 gcc/testsuite/gcc.target/powerpc/pr103109.c |  62 +++++++++++
 2 files changed, 211 insertions(+), 17 deletions(-)

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index fdfbc6566a5..da7367ee642 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -676,6 +676,9 @@
 		       (float		"")
 		       (unsigned_float	"uns")])
 
+(define_code_attr su_int32 [(sign_extend "s32bit_cint_operand")
+			    (zero_extend "c32bit_cint_operand")])
+
 ; Various instructions that come in SI and DI forms.
 ; A generic w/d attribute, for things like cmpw/cmpd.
 (define_mode_attr wd [(QI    "b")
@@ -3199,13 +3202,16 @@
   "mulhw<u> %0,%1,%2"
   [(set_attr "type" "mul")])
 
-(define_expand "<u>mul<mode><dmode>3"
-  [(set (match_operand:<DMODE> 0 "gpc_reg_operand")
+(define_insn_and_split "<u>mul<mode><dmode>3"
+  [(set (match_operand:<DMODE> 0 "gpc_reg_operand" "=&r")
 	(mult:<DMODE> (any_extend:<DMODE>
-			(match_operand:GPR 1 "gpc_reg_operand"))
+		       (match_operand:GPR 1 "gpc_reg_operand" "r"))
 		      (any_extend:<DMODE>
-			(match_operand:GPR 2 "gpc_reg_operand"))))]
+		       (match_operand:GPR 2 "gpc_reg_operand" "r"))))]
   "!(<MODE>mode == SImode && TARGET_POWERPC64)"
+  "#"
+  "&& 1"
+  [(pc)]
 {
   rtx l = gen_reg_rtx (<MODE>mode);
   rtx h = gen_reg_rtx (<MODE>mode);
@@ -3214,9 +3220,10 @@
   emit_move_insn (gen_lowpart (<MODE>mode, operands[0]), l);
   emit_move_insn (gen_highpart (<MODE>mode, operands[0]), h);
   DONE;
-})
+}
+  [(set_attr "length" "8")])
 
-(define_insn "*maddld<mode>4"
+(define_insn "maddld<mode>4"
   [(set (match_operand:GPR 0 "gpc_reg_operand" "=r")
 	(plus:GPR (mult:GPR (match_operand:GPR 1 "gpc_reg_operand" "r")
 			    (match_operand:GPR 2 "gpc_reg_operand" "r"))
@@ -3225,6 +3232,115 @@
   "maddld %0,%1,%2,%3"
   [(set_attr "type" "mul")])
 
+(define_insn_and_split "*<u>mulditi3_<u>adddi3"
+  [(set (match_operand:TI 0 "gpc_reg_operand" "=&r")
+	(plus:TI
+	 (mult:TI
+	  (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r"))
+	  (any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r")))
+	 (any_extend:TI (match_operand:DI 3 "gpc_reg_operand" "r"))))]
+  "TARGET_MADDLD && TARGET_POWERPC64"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  rtx dest = operands[0];
+  rtx dest_hi = gen_highpart (DImode, dest);
+  rtx dest_lo = gen_lowpart (DImode, dest);
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  rtx op3 = operands[3];
+  rtx tmp_hi, tmp_lo;
+
+  if (can_create_pseudo_p ())
+    {
+      tmp_hi = gen_reg_rtx (DImode);
+      tmp_lo = gen_reg_rtx (DImode);
+    }
+  else
+    {
+      tmp_hi = dest_hi;
+      tmp_lo = dest_lo;
+    }
+
+  emit_insn (gen_<u>mulditi3_<u>adddi3_upper (tmp_hi, op1, op2, op3));
+  emit_insn (gen_maddlddi4 (tmp_lo, op1, op2, op3));
+
+  if (can_create_pseudo_p ())
+    {
+      emit_move_insn (dest_hi, tmp_hi);
+      emit_move_insn (dest_lo, tmp_lo);
+    }
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize 128-bit multiply with zero/sign extend and adding a constant.  We
+;; force the constant into a register to generate li, maddhd, and maddld,
+;; instead of mulld, mulhd, addic, and addze.  We can't combine this pattern
+;; with the pattern that handles registers, since constants don't have a sign
+;; or zero extend around them.
+(define_insn_and_split "*<u>mulditi3_add_const"
+  [(set (match_operand:TI 0 "gpc_reg_operand" "=&r")
+	(plus:TI
+	 (mult:TI
+	  (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r"))
+	  (any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r")))
+	 (match_operand 3 "<su_int32>" "r")))]
+  "TARGET_MADDLD && TARGET_POWERPC64
+"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  rtx dest = operands[0];
+  rtx dest_hi = gen_highpart (DImode, dest);
+  rtx dest_lo = gen_lowpart (DImode, dest);
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  rtx op3 = force_reg (DImode, operands[3]);
+  rtx tmp_hi, tmp_lo;
+
+  if (can_create_pseudo_p ())
+    {
+      tmp_hi = gen_reg_rtx (DImode);
+      tmp_lo = gen_reg_rtx (DImode);
+    }
+  else
+    {
+      tmp_hi = dest_hi;
+      tmp_lo = dest_lo;
+    }
+
+  emit_insn (gen_<u>mulditi3_<u>adddi3_upper (tmp_hi, op1, op2, op3));
+  emit_insn (gen_maddlddi4 (tmp_lo, op1, op2, op3));
+
+  if (can_create_pseudo_p ())
+    {
+      emit_move_insn (dest_hi, tmp_hi);
+      emit_move_insn (dest_lo, tmp_lo);
+    }
+  DONE;
+}
+  [(set_attr "length" "8")
+   (set_attr "type" "mul")
+   (set_attr "size" "64")])
+
+(define_insn "<u>mulditi3_<u>adddi3_upper"
+  [(set (match_operand:DI 0 "gpc_reg_operand" "=r")
+	(truncate:DI
+	 (lshiftrt:TI
+	  (plus:TI
+	   (mult:TI
+	    (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r"))
+	    (any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r")))
+	   (any_extend:TI (match_operand:DI 3 "gpc_reg_operand" "r")))
+	  (const_int 64))))]
+  "TARGET_MADDLD && TARGET_POWERPC64"
+  "maddhd<u> %0,%1,%2,%3"
+  [(set_attr "type" "mul")
+   (set_attr "size" "64")])
+
 (define_insn "udiv<mode>3"
   [(set (match_operand:GPR 0 "gpc_reg_operand" "=r")
         (udiv:GPR (match_operand:GPR 1 "gpc_reg_operand" "r")
@@ -7029,12 +7145,19 @@
 ;; allocator from allocating registers that overlap with the inputs
 ;; (for example, having an input in 7,8 and an output in 6,7).  We
 ;; also allow for the output being the same as one of the inputs.
-
-(define_expand "addti3"
-  [(set (match_operand:TI 0 "gpc_reg_operand")
-	(plus:TI (match_operand:TI 1 "gpc_reg_operand")
-		 (match_operand:TI 2 "reg_or_short_operand")))]
+;;
+;; Addti3/subti3 are define_insn_and_splits instead of define_expand, to allow
+;; for combine to make things like multiply and add with extend operations.
+
+(define_insn_and_split "addti3"
+  [(set (match_operand:TI 0 "gpc_reg_operand" "=&r,r,r")
+	(plus:TI (match_operand:TI 1 "gpc_reg_operand" "r,0,r")
+		 (match_operand:TI 2 "reg_or_short_operand" "rn,r,0")))
+   (clobber (reg:DI CA_REGNO))]
   "TARGET_64BIT"
+  "#"
+  "&& 1"
+  [(pc)]
 {
   rtx lo0 = gen_lowpart (DImode, operands[0]);
   rtx lo1 = gen_lowpart (DImode, operands[1]);
@@ -7051,13 +7174,19 @@
   emit_insn (gen_adddi3_carry (lo0, lo1, lo2));
   emit_insn (gen_adddi3_carry_in (hi0, hi1, hi2));
   DONE;
-})
+}
+  [(set_attr "length" "8")
+   (set_attr "type" "add")
+   (set_attr "size" "128")])
 
-(define_expand "subti3"
-  [(set (match_operand:TI 0 "gpc_reg_operand")
-	(minus:TI (match_operand:TI 1 "reg_or_short_operand")
-		  (match_operand:TI 2 "gpc_reg_operand")))]
+(define_insn_and_split "subti3"
+  [(set (match_operand:TI 0 "gpc_reg_operand" "=&r,r,r")
+	(minus:TI (match_operand:TI 1 "reg_or_short_operand" "rn,0,r")
+		  (match_operand:TI 2 "gpc_reg_operand" "r,r,0")))]
   "TARGET_64BIT"
+  "#"
+  "&& 1"
+  [(pc)]
 {
   rtx lo0 = gen_lowpart (DImode, operands[0]);
   rtx lo1 = gen_lowpart (DImode, operands[1]);
@@ -7074,7 +7203,10 @@
   emit_insn (gen_subfdi3_carry (lo0, lo2, lo1));
   emit_insn (gen_subfdi3_carry_in (hi0, hi2, hi1));
   DONE;
-})
+}
+  [(set_attr "length" "8")
+   (set_attr "type" "add")
+   (set_attr "size" "128")])
 \f
 ;; 128-bit logical operations expanders
 
diff --git a/gcc/testsuite/gcc.target/powerpc/pr103109.c b/gcc/testsuite/gcc.target/powerpc/pr103109.c
new file mode 100644
index 00000000000..c1a807532d1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr103109.c
@@ -0,0 +1,62 @@
+/* { dg-require-effective-target int128     } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power9 -O2" } */
+
+/* This test makes sure that GCC generates the maddhd, maddhdu, and maddld
+   power9 instructions when doing some forms of 64-bit integers converted to
+   128-bit integers and used with multiply/add operations.  */
+
+__int128_t
+s_mult_add (long long a,
+	    long long b,
+	    long long c)
+{
+  /* maddhd, maddld.  */
+  return ((__int128_t)a * (__int128_t)b) + (__int128_t)c;
+}
+
+/* Test 32-bit constants that are loaded into GPRs instead of doing the
+   mulld/mulhd and then addic/addime or addc/addze.  */
+__int128_t
+s_mult_add_m10 (long long a,
+		long long b)
+{
+  /* maddhd, maddld.  */
+  return ((__int128_t)a * (__int128_t)b) - 10;
+}
+
+__int128_t
+s_mult_add_70000 (long long a,
+		  long long b)
+{
+  /* maddhd, maddld.  */
+  return ((__int128_t)a * (__int128_t)b) + 70000;
+}
+
+__uint128_t
+u_mult_add (unsigned long long a,
+	    unsigned long long b,
+	    unsigned long long c)
+{
+  /* maddhd, maddld.  */
+  return ((__uint128_t)a * (__uint128_t)b) + (__uint128_t)c;
+}
+
+__uint128_t
+u_mult_add_0x80000000 (unsigned long long a,
+		       unsigned long long b)
+{
+  /* maddhd, maddld.  */
+  return ((__uint128_t)a * (__uint128_t)b) + 0x80000000UL;
+}
+
+/* { dg-final { scan-assembler-not   {\maddc\M}     } } */
+/* { dg-final { scan-assembler-not   {\madde\M}     } } */
+/* { dg-final { scan-assembler-not   {\maddid\M}    } } */
+/* { dg-final { scan-assembler-not   {\maddme\M}    } } */
+/* { dg-final { scan-assembler-not   {\maddze\M}    } } */
+/* { dg-final { scan-assembler-not   {\mmulhd\M}    } } */
+/* { dg-final { scan-assembler-not   {\mmulld\M}    } } */
+/* { dg-final { scan-assembler-times {\mmaddhd\M} 3 } } */
+/* { dg-final { scan-assembler-times {\mmaddld\M} 3 } } */
+


^ permalink raw reply	[flat|nested] 4+ messages in thread

* [gcc(refs/users/meissner/heads/work080)] Optimize multiply/add of DImode extended to TImode.
@ 2022-03-08  7:07 Michael Meissner
  0 siblings, 0 replies; 4+ messages in thread
From: Michael Meissner @ 2022-03-08  7:07 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:2e8d0c32a917f6be8374eb4bda0a160933943a60

commit 2e8d0c32a917f6be8374eb4bda0a160933943a60
Author: Michael Meissner <meissner@linux.ibm.com>
Date:   Tue Mar 8 02:06:56 2022 -0500

    Optimize multiply/add of DImode extended to TImode.
    
    On power9 and power10 systems, we have instructions that support doing
    64-bit integers converted to 128-bit integers and producing 128-bit
    results.  This patch adds support to generate these instructions.
    
    Previously we had define_expands to handle conversion of the 64-bit extend
    to 128-bit and multiply.  This patch changes these define_expands to
    define_insn_and_split and then it provides combiner patterns to generate
    thes multiply/add instructions.
    
    To support using this optimization on power9, we extend the sign extend
    DImode to TImode to also run on power9 (added for PR target/104698).
    
    We add support for doing an unsigned DImode to TImode conversion.  We need
    these conversions to exist on power9 so that the combiner can properly
    combine the extend, multiply, and add instructions.
    
    2022-03-08   Michael Meissner  <meissner@linux.ibm.com>
    
    gcc/
            PR target/103109
            * config/rs6000/rs6000.md (su_int32): New code attribute.
            (<u>mul<mode><dmode>3): Convert from define_expand to
            define_insn_and_split.
            (maddld<mode>4): Add generator function.
            (<u>mulditi3_<u>adddi3): New insn.
            (<u>mulditi3_add_const): New insn.
            (<u>mulditi3_<u>adddi3_upper): New insn.
            (addti3): Convert from define_expand to define_insn_and_split.
            (subti3): Likewise.
            * config/rs6000/vsx.md (extendditi2): Allow on power9 systems.
            Add isa attribute for the stuff that needs power10 support.
            (zero_extendditi2): New insn.
    
    gcc/testsuite/
            PR target/103109
            * gcc.target/powerpc/pr103109.c: New test.

Diff:
---
 gcc/config/rs6000/rs6000.md                 | 166 +++++++++++++++++++++++++---
 gcc/config/rs6000/vsx.md                    |  60 +++++++++-
 gcc/testsuite/gcc.target/powerpc/pr103109.c |  62 +++++++++++
 3 files changed, 265 insertions(+), 23 deletions(-)

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index fdfbc6566a5..da7367ee642 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -676,6 +676,9 @@
 		       (float		"")
 		       (unsigned_float	"uns")])
 
+(define_code_attr su_int32 [(sign_extend "s32bit_cint_operand")
+			    (zero_extend "c32bit_cint_operand")])
+
 ; Various instructions that come in SI and DI forms.
 ; A generic w/d attribute, for things like cmpw/cmpd.
 (define_mode_attr wd [(QI    "b")
@@ -3199,13 +3202,16 @@
   "mulhw<u> %0,%1,%2"
   [(set_attr "type" "mul")])
 
-(define_expand "<u>mul<mode><dmode>3"
-  [(set (match_operand:<DMODE> 0 "gpc_reg_operand")
+(define_insn_and_split "<u>mul<mode><dmode>3"
+  [(set (match_operand:<DMODE> 0 "gpc_reg_operand" "=&r")
 	(mult:<DMODE> (any_extend:<DMODE>
-			(match_operand:GPR 1 "gpc_reg_operand"))
+		       (match_operand:GPR 1 "gpc_reg_operand" "r"))
 		      (any_extend:<DMODE>
-			(match_operand:GPR 2 "gpc_reg_operand"))))]
+		       (match_operand:GPR 2 "gpc_reg_operand" "r"))))]
   "!(<MODE>mode == SImode && TARGET_POWERPC64)"
+  "#"
+  "&& 1"
+  [(pc)]
 {
   rtx l = gen_reg_rtx (<MODE>mode);
   rtx h = gen_reg_rtx (<MODE>mode);
@@ -3214,9 +3220,10 @@
   emit_move_insn (gen_lowpart (<MODE>mode, operands[0]), l);
   emit_move_insn (gen_highpart (<MODE>mode, operands[0]), h);
   DONE;
-})
+}
+  [(set_attr "length" "8")])
 
-(define_insn "*maddld<mode>4"
+(define_insn "maddld<mode>4"
   [(set (match_operand:GPR 0 "gpc_reg_operand" "=r")
 	(plus:GPR (mult:GPR (match_operand:GPR 1 "gpc_reg_operand" "r")
 			    (match_operand:GPR 2 "gpc_reg_operand" "r"))
@@ -3225,6 +3232,115 @@
   "maddld %0,%1,%2,%3"
   [(set_attr "type" "mul")])
 
+(define_insn_and_split "*<u>mulditi3_<u>adddi3"
+  [(set (match_operand:TI 0 "gpc_reg_operand" "=&r")
+	(plus:TI
+	 (mult:TI
+	  (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r"))
+	  (any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r")))
+	 (any_extend:TI (match_operand:DI 3 "gpc_reg_operand" "r"))))]
+  "TARGET_MADDLD && TARGET_POWERPC64"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  rtx dest = operands[0];
+  rtx dest_hi = gen_highpart (DImode, dest);
+  rtx dest_lo = gen_lowpart (DImode, dest);
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  rtx op3 = operands[3];
+  rtx tmp_hi, tmp_lo;
+
+  if (can_create_pseudo_p ())
+    {
+      tmp_hi = gen_reg_rtx (DImode);
+      tmp_lo = gen_reg_rtx (DImode);
+    }
+  else
+    {
+      tmp_hi = dest_hi;
+      tmp_lo = dest_lo;
+    }
+
+  emit_insn (gen_<u>mulditi3_<u>adddi3_upper (tmp_hi, op1, op2, op3));
+  emit_insn (gen_maddlddi4 (tmp_lo, op1, op2, op3));
+
+  if (can_create_pseudo_p ())
+    {
+      emit_move_insn (dest_hi, tmp_hi);
+      emit_move_insn (dest_lo, tmp_lo);
+    }
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize 128-bit multiply with zero/sign extend and adding a constant.  We
+;; force the constant into a register to generate li, maddhd, and maddld,
+;; instead of mulld, mulhd, addic, and addze.  We can't combine this pattern
+;; with the pattern that handles registers, since constants don't have a sign
+;; or zero extend around them.
+(define_insn_and_split "*<u>mulditi3_add_const"
+  [(set (match_operand:TI 0 "gpc_reg_operand" "=&r")
+	(plus:TI
+	 (mult:TI
+	  (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r"))
+	  (any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r")))
+	 (match_operand 3 "<su_int32>" "r")))]
+  "TARGET_MADDLD && TARGET_POWERPC64
+"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  rtx dest = operands[0];
+  rtx dest_hi = gen_highpart (DImode, dest);
+  rtx dest_lo = gen_lowpart (DImode, dest);
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  rtx op3 = force_reg (DImode, operands[3]);
+  rtx tmp_hi, tmp_lo;
+
+  if (can_create_pseudo_p ())
+    {
+      tmp_hi = gen_reg_rtx (DImode);
+      tmp_lo = gen_reg_rtx (DImode);
+    }
+  else
+    {
+      tmp_hi = dest_hi;
+      tmp_lo = dest_lo;
+    }
+
+  emit_insn (gen_<u>mulditi3_<u>adddi3_upper (tmp_hi, op1, op2, op3));
+  emit_insn (gen_maddlddi4 (tmp_lo, op1, op2, op3));
+
+  if (can_create_pseudo_p ())
+    {
+      emit_move_insn (dest_hi, tmp_hi);
+      emit_move_insn (dest_lo, tmp_lo);
+    }
+  DONE;
+}
+  [(set_attr "length" "8")
+   (set_attr "type" "mul")
+   (set_attr "size" "64")])
+
+(define_insn "<u>mulditi3_<u>adddi3_upper"
+  [(set (match_operand:DI 0 "gpc_reg_operand" "=r")
+	(truncate:DI
+	 (lshiftrt:TI
+	  (plus:TI
+	   (mult:TI
+	    (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r"))
+	    (any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r")))
+	   (any_extend:TI (match_operand:DI 3 "gpc_reg_operand" "r")))
+	  (const_int 64))))]
+  "TARGET_MADDLD && TARGET_POWERPC64"
+  "maddhd<u> %0,%1,%2,%3"
+  [(set_attr "type" "mul")
+   (set_attr "size" "64")])
+
 (define_insn "udiv<mode>3"
   [(set (match_operand:GPR 0 "gpc_reg_operand" "=r")
         (udiv:GPR (match_operand:GPR 1 "gpc_reg_operand" "r")
@@ -7029,12 +7145,19 @@
 ;; allocator from allocating registers that overlap with the inputs
 ;; (for example, having an input in 7,8 and an output in 6,7).  We
 ;; also allow for the output being the same as one of the inputs.
-
-(define_expand "addti3"
-  [(set (match_operand:TI 0 "gpc_reg_operand")
-	(plus:TI (match_operand:TI 1 "gpc_reg_operand")
-		 (match_operand:TI 2 "reg_or_short_operand")))]
+;;
+;; Addti3/subti3 are define_insn_and_splits instead of define_expand, to allow
+;; for combine to make things like multiply and add with extend operations.
+
+(define_insn_and_split "addti3"
+  [(set (match_operand:TI 0 "gpc_reg_operand" "=&r,r,r")
+	(plus:TI (match_operand:TI 1 "gpc_reg_operand" "r,0,r")
+		 (match_operand:TI 2 "reg_or_short_operand" "rn,r,0")))
+   (clobber (reg:DI CA_REGNO))]
   "TARGET_64BIT"
+  "#"
+  "&& 1"
+  [(pc)]
 {
   rtx lo0 = gen_lowpart (DImode, operands[0]);
   rtx lo1 = gen_lowpart (DImode, operands[1]);
@@ -7051,13 +7174,19 @@
   emit_insn (gen_adddi3_carry (lo0, lo1, lo2));
   emit_insn (gen_adddi3_carry_in (hi0, hi1, hi2));
   DONE;
-})
+}
+  [(set_attr "length" "8")
+   (set_attr "type" "add")
+   (set_attr "size" "128")])
 
-(define_expand "subti3"
-  [(set (match_operand:TI 0 "gpc_reg_operand")
-	(minus:TI (match_operand:TI 1 "reg_or_short_operand")
-		  (match_operand:TI 2 "gpc_reg_operand")))]
+(define_insn_and_split "subti3"
+  [(set (match_operand:TI 0 "gpc_reg_operand" "=&r,r,r")
+	(minus:TI (match_operand:TI 1 "reg_or_short_operand" "rn,0,r")
+		  (match_operand:TI 2 "gpc_reg_operand" "r,r,0")))]
   "TARGET_64BIT"
+  "#"
+  "&& 1"
+  [(pc)]
 {
   rtx lo0 = gen_lowpart (DImode, operands[0]);
   rtx lo1 = gen_lowpart (DImode, operands[1]);
@@ -7074,7 +7203,10 @@
   emit_insn (gen_subfdi3_carry (lo0, lo2, lo1));
   emit_insn (gen_subfdi3_carry_in (hi0, hi2, hi1));
   DONE;
-})
+}
+  [(set_attr "length" "8")
+   (set_attr "type" "add")
+   (set_attr "size" "128")])
 \f
 ;; 128-bit logical operations expanders
 
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index d0fb92f5985..0d254de5aa6 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -5031,11 +5031,14 @@
 ;;
 ;; If the register allocator prefers to use Altivec registers on power10,
 ;; generate the vextsd2q instruction.
+;;
+;; We also need the GPR code for power9 so that we can optimize to use the
+;; multiply-add instructions.
 (define_insn_and_split "extendditi2"
   [(set (match_operand:TI 0 "register_operand" "=r,r,v,v,v")
 	(sign_extend:TI (match_operand:DI 1 "input_operand" "r,m,r,wa,Z")))
    (clobber (reg:DI CA_REGNO))]
-  "TARGET_POWERPC64 && TARGET_POWER10"
+  "TARGET_POWERPC64 && TARGET_MADDLD"
   "#"
   "&& reload_completed"
   [(pc)]
@@ -5052,10 +5055,7 @@
       rtx dest_lo = gen_lowpart (DImode, dest);
 
       emit_move_insn (dest_lo, src);
-      /* In case src is a MEM, we have to use the destination, which is a
-         register, instead of re-using the source.  */
-      rtx src2 = (REG_P (src) || SUBREG_P (src)) ? src : dest_lo;
-      emit_insn (gen_ashrdi3 (dest_hi, src2, GEN_INT (63)));
+      emit_insn (gen_ashrdi3 (dest_hi, dest_lo, GEN_INT (63)));
       DONE;
     }
 
@@ -5082,7 +5082,7 @@
     gcc_unreachable ();
 }
   [(set_attr "length" "8")
-   (set_attr "type" "shift,load,vecmove,vecperm,load")])
+   (set_attr "isa" "p9,p9,p10,p10,p10")])
 
 ;; Sign extend 64-bit value in TI reg, word 1, to 128-bit value in TI reg
 (define_insn "extendditi2_vector"
@@ -5093,6 +5093,54 @@
   "vextsd2q %0,%1"
   [(set_attr "type" "vecexts")])
 
+;; Zero extend DImode to TImode when the result is in GPRs or VSX registers.
+(define_insn_and_split "zero_extendditi2"
+  [(set (match_operand:TI 0 "register_operand" "=r,r,wa,wa,wa")
+	(zero_extend:TI (match_operand:DI 1 "input_operand" "r,m,r,wa,Z")))
+   (clobber (match_scratch:DI 2 "=X,X,r,wa,X"))]
+  "TARGET_POWERPC64 && TARGET_MADDLD"
+  "#"
+  "&& reload_completed"
+  [(pc)]
+{
+  rtx dest = operands[0];
+  rtx src = operands[1];
+  rtx tmp = operands[2];
+  int dest_regno = reg_or_subregno (dest);
+
+  /* Handle conversion to GPR registers.  Load up the low part and then load
+     0 to clear the upper part.  */
+  if (INT_REGNO_P (dest_regno))
+    {
+      rtx dest_hi = gen_highpart (DImode, dest);
+      rtx dest_lo = gen_lowpart (DImode, dest);
+
+      emit_move_insn (dest_lo, src);
+      emit_move_insn (dest_hi, const0_rtx);
+      DONE;
+    }
+
+  /* For conversion to a VSX register, generate either a load rightmost
+     double word instruction, or do a CONCAT operation with the upper word
+     set to 0.  */
+  else if (VSX_REGNO_P (dest_regno))
+    {
+      if (MEM_P (src))
+	emit_insn (gen_vsx_lxvrdx (dest, src));
+      else
+	{
+	  rtx dest_v2di = gen_rtx_REG (V2DImode, dest_regno);
+          emit_move_insn (tmp, const0_rtx);
+	  emit_insn (gen_vsx_concat_v2di (dest_v2di, tmp, src));
+	}
+      DONE;
+    }
+
+  else
+    gcc_unreachable ();
+}
+  [(set_attr "length" "8")
+   (set_attr "isa" "p9,p9,p9v,p9v,p10")])
 \f
 ;; ISA 3.0 Binary Floating-Point Support
 
diff --git a/gcc/testsuite/gcc.target/powerpc/pr103109.c b/gcc/testsuite/gcc.target/powerpc/pr103109.c
new file mode 100644
index 00000000000..7f67816edda
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr103109.c
@@ -0,0 +1,62 @@
+/* { dg-require-effective-target int128     } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* This test makes sure that GCC generates the maddhd, maddhdu, and maddld
+   power9 instructions when doing some forms of 64-bit integers converted to
+   128-bit integers and used with multiply/add operations.  */
+
+__int128_t
+s_mult_add (long long a,
+	    long long b,
+	    long long c)
+{
+  /* maddhd, maddld.  */
+  return ((__int128_t)a * (__int128_t)b) + (__int128_t)c;
+}
+
+/* Test 32-bit constants that are loaded into GPRs instead of doing the
+   mulld/mulhd and then addic/addime or addc/addze.  */
+__int128_t
+s_mult_add_m10 (long long a,
+		long long b)
+{
+  /* maddhd, maddld.  */
+  return ((__int128_t)a * (__int128_t)b) - 10;
+}
+
+__int128_t
+s_mult_add_70000 (long long a,
+		  long long b)
+{
+  /* maddhd, maddld.  */
+  return ((__int128_t)a * (__int128_t)b) + 70000;
+}
+
+__uint128_t
+u_mult_add (unsigned long long a,
+	    unsigned long long b,
+	    unsigned long long c)
+{
+  /* maddhd, maddld.  */
+  return ((__uint128_t)a * (__uint128_t)b) + (__uint128_t)c;
+}
+
+__uint128_t
+u_mult_add_0x80000000 (unsigned long long a,
+		       unsigned long long b)
+{
+  /* maddhd, maddld.  */
+  return ((__uint128_t)a * (__uint128_t)b) + 0x80000000UL;
+}
+
+/* { dg-final { scan-assembler-not   {\maddc\M}     } } */
+/* { dg-final { scan-assembler-not   {\madde\M}     } } */
+/* { dg-final { scan-assembler-not   {\maddid\M}    } } */
+/* { dg-final { scan-assembler-not   {\maddme\M}    } } */
+/* { dg-final { scan-assembler-not   {\maddze\M}    } } */
+/* { dg-final { scan-assembler-not   {\mmulhd\M}    } } */
+/* { dg-final { scan-assembler-not   {\mmulld\M}    } } */
+/* { dg-final { scan-assembler-times {\mmaddhd\M} 3 } } */
+/* { dg-final { scan-assembler-times {\mmaddld\M} 3 } } */
+


^ permalink raw reply	[flat|nested] 4+ messages in thread

* [gcc(refs/users/meissner/heads/work080)] Optimize multiply/add of DImode extended to TImode.
@ 2022-03-08  3:59 Michael Meissner
  0 siblings, 0 replies; 4+ messages in thread
From: Michael Meissner @ 2022-03-08  3:59 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:4a3e2f334d3e8f7a4663c55c8031153731cf76ae

commit 4a3e2f334d3e8f7a4663c55c8031153731cf76ae
Author: Michael Meissner <meissner@linux.ibm.com>
Date:   Mon Mar 7 22:58:51 2022 -0500

    Optimize multiply/add of DImode extended to TImode.
    
    On power9 and power10 systems, we have instructions that support doing
    64-bit integers converted to 128-bit integers and producing 128-bit
    results.  This patch adds support to generate these instructions.
    
    Previously we had define_expands to handle conversion of the 64-bit extend
    to 128-bit and multiply.  This patch changes these define_expands to
    define_insn_and_split and then it provides combiner patterns to generate
    thes multiply/add instructions.
    
    To support using this optimization on power9, we extend the sign extend
    DImode to TImode to also run on power9 (added for PR target/104698).
    
    We add support for doing an unsigned DImode to TImode conversion.  We need
    these conversions to exist on power9 so that the combiner can properly
    combine the extend, multiply, and add instructions.
    
    2022-03-07   Michael Meissner  <meissner@linux.ibm.com>
    
    gcc/
            PR target/103109
            * config/rs6000/rs6000.md (su_int32): New code attribute.
            (<u>mul<mode><dmode>3): Convert from define_expand to
            define_insn_and_split.
            (maddld<mode>4): Add generator function.
            (<u>mulditi3_<u>adddi3): New insn.
            (<u>mulditi3_add_const): New insn.
            (<u>mulditi3_<u>adddi3_upper): New insn.
            (addti3): Convert from define_expand to define_insn_and_split.
            (subti3): Likewise.
            * config/rs6000/vsx.md (extendditi2): Allow on power9 systems.
            Add isa attribute for the stuff that needs power10 support.
            (zero_extendditi2): New insn.
    
    gcc/testsuite/
            PR target/103109
            * gcc.target/powerpc/pr103109.c: New test.

Diff:
---
 gcc/config/rs6000/rs6000.md                 | 166 +++++++++++++++++++++++++---
 gcc/testsuite/gcc.target/powerpc/pr103109.c |  62 +++++++++++
 2 files changed, 211 insertions(+), 17 deletions(-)

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index fdfbc6566a5..da7367ee642 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -676,6 +676,9 @@
 		       (float		"")
 		       (unsigned_float	"uns")])
 
+(define_code_attr su_int32 [(sign_extend "s32bit_cint_operand")
+			    (zero_extend "c32bit_cint_operand")])
+
 ; Various instructions that come in SI and DI forms.
 ; A generic w/d attribute, for things like cmpw/cmpd.
 (define_mode_attr wd [(QI    "b")
@@ -3199,13 +3202,16 @@
   "mulhw<u> %0,%1,%2"
   [(set_attr "type" "mul")])
 
-(define_expand "<u>mul<mode><dmode>3"
-  [(set (match_operand:<DMODE> 0 "gpc_reg_operand")
+(define_insn_and_split "<u>mul<mode><dmode>3"
+  [(set (match_operand:<DMODE> 0 "gpc_reg_operand" "=&r")
 	(mult:<DMODE> (any_extend:<DMODE>
-			(match_operand:GPR 1 "gpc_reg_operand"))
+		       (match_operand:GPR 1 "gpc_reg_operand" "r"))
 		      (any_extend:<DMODE>
-			(match_operand:GPR 2 "gpc_reg_operand"))))]
+		       (match_operand:GPR 2 "gpc_reg_operand" "r"))))]
   "!(<MODE>mode == SImode && TARGET_POWERPC64)"
+  "#"
+  "&& 1"
+  [(pc)]
 {
   rtx l = gen_reg_rtx (<MODE>mode);
   rtx h = gen_reg_rtx (<MODE>mode);
@@ -3214,9 +3220,10 @@
   emit_move_insn (gen_lowpart (<MODE>mode, operands[0]), l);
   emit_move_insn (gen_highpart (<MODE>mode, operands[0]), h);
   DONE;
-})
+}
+  [(set_attr "length" "8")])
 
-(define_insn "*maddld<mode>4"
+(define_insn "maddld<mode>4"
   [(set (match_operand:GPR 0 "gpc_reg_operand" "=r")
 	(plus:GPR (mult:GPR (match_operand:GPR 1 "gpc_reg_operand" "r")
 			    (match_operand:GPR 2 "gpc_reg_operand" "r"))
@@ -3225,6 +3232,115 @@
   "maddld %0,%1,%2,%3"
   [(set_attr "type" "mul")])
 
+(define_insn_and_split "*<u>mulditi3_<u>adddi3"
+  [(set (match_operand:TI 0 "gpc_reg_operand" "=&r")
+	(plus:TI
+	 (mult:TI
+	  (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r"))
+	  (any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r")))
+	 (any_extend:TI (match_operand:DI 3 "gpc_reg_operand" "r"))))]
+  "TARGET_MADDLD && TARGET_POWERPC64"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  rtx dest = operands[0];
+  rtx dest_hi = gen_highpart (DImode, dest);
+  rtx dest_lo = gen_lowpart (DImode, dest);
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  rtx op3 = operands[3];
+  rtx tmp_hi, tmp_lo;
+
+  if (can_create_pseudo_p ())
+    {
+      tmp_hi = gen_reg_rtx (DImode);
+      tmp_lo = gen_reg_rtx (DImode);
+    }
+  else
+    {
+      tmp_hi = dest_hi;
+      tmp_lo = dest_lo;
+    }
+
+  emit_insn (gen_<u>mulditi3_<u>adddi3_upper (tmp_hi, op1, op2, op3));
+  emit_insn (gen_maddlddi4 (tmp_lo, op1, op2, op3));
+
+  if (can_create_pseudo_p ())
+    {
+      emit_move_insn (dest_hi, tmp_hi);
+      emit_move_insn (dest_lo, tmp_lo);
+    }
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize 128-bit multiply with zero/sign extend and adding a constant.  We
+;; force the constant into a register to generate li, maddhd, and maddld,
+;; instead of mulld, mulhd, addic, and addze.  We can't combine this pattern
+;; with the pattern that handles registers, since constants don't have a sign
+;; or zero extend around them.
+(define_insn_and_split "*<u>mulditi3_add_const"
+  [(set (match_operand:TI 0 "gpc_reg_operand" "=&r")
+	(plus:TI
+	 (mult:TI
+	  (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r"))
+	  (any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r")))
+	 (match_operand 3 "<su_int32>" "r")))]
+  "TARGET_MADDLD && TARGET_POWERPC64
+"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  rtx dest = operands[0];
+  rtx dest_hi = gen_highpart (DImode, dest);
+  rtx dest_lo = gen_lowpart (DImode, dest);
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  rtx op3 = force_reg (DImode, operands[3]);
+  rtx tmp_hi, tmp_lo;
+
+  if (can_create_pseudo_p ())
+    {
+      tmp_hi = gen_reg_rtx (DImode);
+      tmp_lo = gen_reg_rtx (DImode);
+    }
+  else
+    {
+      tmp_hi = dest_hi;
+      tmp_lo = dest_lo;
+    }
+
+  emit_insn (gen_<u>mulditi3_<u>adddi3_upper (tmp_hi, op1, op2, op3));
+  emit_insn (gen_maddlddi4 (tmp_lo, op1, op2, op3));
+
+  if (can_create_pseudo_p ())
+    {
+      emit_move_insn (dest_hi, tmp_hi);
+      emit_move_insn (dest_lo, tmp_lo);
+    }
+  DONE;
+}
+  [(set_attr "length" "8")
+   (set_attr "type" "mul")
+   (set_attr "size" "64")])
+
+(define_insn "<u>mulditi3_<u>adddi3_upper"
+  [(set (match_operand:DI 0 "gpc_reg_operand" "=r")
+	(truncate:DI
+	 (lshiftrt:TI
+	  (plus:TI
+	   (mult:TI
+	    (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r"))
+	    (any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r")))
+	   (any_extend:TI (match_operand:DI 3 "gpc_reg_operand" "r")))
+	  (const_int 64))))]
+  "TARGET_MADDLD && TARGET_POWERPC64"
+  "maddhd<u> %0,%1,%2,%3"
+  [(set_attr "type" "mul")
+   (set_attr "size" "64")])
+
 (define_insn "udiv<mode>3"
   [(set (match_operand:GPR 0 "gpc_reg_operand" "=r")
         (udiv:GPR (match_operand:GPR 1 "gpc_reg_operand" "r")
@@ -7029,12 +7145,19 @@
 ;; allocator from allocating registers that overlap with the inputs
 ;; (for example, having an input in 7,8 and an output in 6,7).  We
 ;; also allow for the output being the same as one of the inputs.
-
-(define_expand "addti3"
-  [(set (match_operand:TI 0 "gpc_reg_operand")
-	(plus:TI (match_operand:TI 1 "gpc_reg_operand")
-		 (match_operand:TI 2 "reg_or_short_operand")))]
+;;
+;; Addti3/subti3 are define_insn_and_splits instead of define_expand, to allow
+;; for combine to make things like multiply and add with extend operations.
+
+(define_insn_and_split "addti3"
+  [(set (match_operand:TI 0 "gpc_reg_operand" "=&r,r,r")
+	(plus:TI (match_operand:TI 1 "gpc_reg_operand" "r,0,r")
+		 (match_operand:TI 2 "reg_or_short_operand" "rn,r,0")))
+   (clobber (reg:DI CA_REGNO))]
   "TARGET_64BIT"
+  "#"
+  "&& 1"
+  [(pc)]
 {
   rtx lo0 = gen_lowpart (DImode, operands[0]);
   rtx lo1 = gen_lowpart (DImode, operands[1]);
@@ -7051,13 +7174,19 @@
   emit_insn (gen_adddi3_carry (lo0, lo1, lo2));
   emit_insn (gen_adddi3_carry_in (hi0, hi1, hi2));
   DONE;
-})
+}
+  [(set_attr "length" "8")
+   (set_attr "type" "add")
+   (set_attr "size" "128")])
 
-(define_expand "subti3"
-  [(set (match_operand:TI 0 "gpc_reg_operand")
-	(minus:TI (match_operand:TI 1 "reg_or_short_operand")
-		  (match_operand:TI 2 "gpc_reg_operand")))]
+(define_insn_and_split "subti3"
+  [(set (match_operand:TI 0 "gpc_reg_operand" "=&r,r,r")
+	(minus:TI (match_operand:TI 1 "reg_or_short_operand" "rn,0,r")
+		  (match_operand:TI 2 "gpc_reg_operand" "r,r,0")))]
   "TARGET_64BIT"
+  "#"
+  "&& 1"
+  [(pc)]
 {
   rtx lo0 = gen_lowpart (DImode, operands[0]);
   rtx lo1 = gen_lowpart (DImode, operands[1]);
@@ -7074,7 +7203,10 @@
   emit_insn (gen_subfdi3_carry (lo0, lo2, lo1));
   emit_insn (gen_subfdi3_carry_in (hi0, hi2, hi1));
   DONE;
-})
+}
+  [(set_attr "length" "8")
+   (set_attr "type" "add")
+   (set_attr "size" "128")])
 \f
 ;; 128-bit logical operations expanders
 
diff --git a/gcc/testsuite/gcc.target/powerpc/pr103109.c b/gcc/testsuite/gcc.target/powerpc/pr103109.c
new file mode 100644
index 00000000000..7f67816edda
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr103109.c
@@ -0,0 +1,62 @@
+/* { dg-require-effective-target int128     } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* This test makes sure that GCC generates the maddhd, maddhdu, and maddld
+   power9 instructions when doing some forms of 64-bit integers converted to
+   128-bit integers and used with multiply/add operations.  */
+
+__int128_t
+s_mult_add (long long a,
+	    long long b,
+	    long long c)
+{
+  /* maddhd, maddld.  */
+  return ((__int128_t)a * (__int128_t)b) + (__int128_t)c;
+}
+
+/* Test 32-bit constants that are loaded into GPRs instead of doing the
+   mulld/mulhd and then addic/addime or addc/addze.  */
+__int128_t
+s_mult_add_m10 (long long a,
+		long long b)
+{
+  /* maddhd, maddld.  */
+  return ((__int128_t)a * (__int128_t)b) - 10;
+}
+
+__int128_t
+s_mult_add_70000 (long long a,
+		  long long b)
+{
+  /* maddhd, maddld.  */
+  return ((__int128_t)a * (__int128_t)b) + 70000;
+}
+
+__uint128_t
+u_mult_add (unsigned long long a,
+	    unsigned long long b,
+	    unsigned long long c)
+{
+  /* maddhd, maddld.  */
+  return ((__uint128_t)a * (__uint128_t)b) + (__uint128_t)c;
+}
+
+__uint128_t
+u_mult_add_0x80000000 (unsigned long long a,
+		       unsigned long long b)
+{
+  /* maddhd, maddld.  */
+  return ((__uint128_t)a * (__uint128_t)b) + 0x80000000UL;
+}
+
+/* { dg-final { scan-assembler-not   {\maddc\M}     } } */
+/* { dg-final { scan-assembler-not   {\madde\M}     } } */
+/* { dg-final { scan-assembler-not   {\maddid\M}    } } */
+/* { dg-final { scan-assembler-not   {\maddme\M}    } } */
+/* { dg-final { scan-assembler-not   {\maddze\M}    } } */
+/* { dg-final { scan-assembler-not   {\mmulhd\M}    } } */
+/* { dg-final { scan-assembler-not   {\mmulld\M}    } } */
+/* { dg-final { scan-assembler-times {\mmaddhd\M} 3 } } */
+/* { dg-final { scan-assembler-times {\mmaddld\M} 3 } } */
+


^ permalink raw reply	[flat|nested] 4+ messages in thread

* [gcc(refs/users/meissner/heads/work080)] Optimize multiply/add of DImode extended to TImode.
@ 2022-03-08  3:56 Michael Meissner
  0 siblings, 0 replies; 4+ messages in thread
From: Michael Meissner @ 2022-03-08  3:56 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:642a4445455175feeefa62e17cd2e84c6a4df415

commit 642a4445455175feeefa62e17cd2e84c6a4df415
Author: Michael Meissner <meissner@linux.ibm.com>
Date:   Mon Mar 7 22:56:27 2022 -0500

    Optimize multiply/add of DImode extended to TImode.
    
    On power9 and power10 systems, we have instructions that support doing
    64-bit integers converted to 128-bit integers and producing 128-bit
    results.  This patch adds support to generate these instructions.
    
    Previously we had define_expands to handle conversion of the 64-bit extend
    to 128-bit and multiply.  This patch changes these define_expands to
    define_insn_and_split and then it provides combiner patterns to generate
    thes multiply/add instructions.
    
    To support using this optimization on power9, we extend the sign extend
    DImode to TImode to also run on power9 (added for PR target/104698).
    
    We add support for doing an unsigned DImode to TImode conversion.  We need
    these conversions to exist on power9 so that the combiner can properly
    combine the extend, multiply, and add instructions.
    
    2022-03-07   Michael Meissner  <meissner@linux.ibm.com>
    
    gcc/
            PR target/103109
            * config/rs6000/rs6000.md (su_int32): New code attribute.
            (<u>mul<mode><dmode>3): Convert from define_expand to
            define_insn_and_split.
            (maddld<mode>4): Add generator function.
            (<u>mulditi3_<u>adddi3): New insn.
            (<u>mulditi3_add_const): New insn.
            (<u>mulditi3_<u>adddi3_upper): New insn.
            (addti3): Convert from define_expand to define_insn_and_split.
            (subti3): Likewise.
            * config/rs6000/vsx.md (extendditi2): Allow on power9 systems.
            Add isa attribute for the stuff that needs power10 support.
            (zero_extendditi2): New insn.
    
    gcc/testsuite/
            PR target/103109
            * gcc.target/powerpc/pr103109.c: New test.

Diff:
---
 gcc/config/rs6000/rs6000.md | 166 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 149 insertions(+), 17 deletions(-)

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index fdfbc6566a5..da7367ee642 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -676,6 +676,9 @@
 		       (float		"")
 		       (unsigned_float	"uns")])
 
+(define_code_attr su_int32 [(sign_extend "s32bit_cint_operand")
+			    (zero_extend "c32bit_cint_operand")])
+
 ; Various instructions that come in SI and DI forms.
 ; A generic w/d attribute, for things like cmpw/cmpd.
 (define_mode_attr wd [(QI    "b")
@@ -3199,13 +3202,16 @@
   "mulhw<u> %0,%1,%2"
   [(set_attr "type" "mul")])
 
-(define_expand "<u>mul<mode><dmode>3"
-  [(set (match_operand:<DMODE> 0 "gpc_reg_operand")
+(define_insn_and_split "<u>mul<mode><dmode>3"
+  [(set (match_operand:<DMODE> 0 "gpc_reg_operand" "=&r")
 	(mult:<DMODE> (any_extend:<DMODE>
-			(match_operand:GPR 1 "gpc_reg_operand"))
+		       (match_operand:GPR 1 "gpc_reg_operand" "r"))
 		      (any_extend:<DMODE>
-			(match_operand:GPR 2 "gpc_reg_operand"))))]
+		       (match_operand:GPR 2 "gpc_reg_operand" "r"))))]
   "!(<MODE>mode == SImode && TARGET_POWERPC64)"
+  "#"
+  "&& 1"
+  [(pc)]
 {
   rtx l = gen_reg_rtx (<MODE>mode);
   rtx h = gen_reg_rtx (<MODE>mode);
@@ -3214,9 +3220,10 @@
   emit_move_insn (gen_lowpart (<MODE>mode, operands[0]), l);
   emit_move_insn (gen_highpart (<MODE>mode, operands[0]), h);
   DONE;
-})
+}
+  [(set_attr "length" "8")])
 
-(define_insn "*maddld<mode>4"
+(define_insn "maddld<mode>4"
   [(set (match_operand:GPR 0 "gpc_reg_operand" "=r")
 	(plus:GPR (mult:GPR (match_operand:GPR 1 "gpc_reg_operand" "r")
 			    (match_operand:GPR 2 "gpc_reg_operand" "r"))
@@ -3225,6 +3232,115 @@
   "maddld %0,%1,%2,%3"
   [(set_attr "type" "mul")])
 
+(define_insn_and_split "*<u>mulditi3_<u>adddi3"
+  [(set (match_operand:TI 0 "gpc_reg_operand" "=&r")
+	(plus:TI
+	 (mult:TI
+	  (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r"))
+	  (any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r")))
+	 (any_extend:TI (match_operand:DI 3 "gpc_reg_operand" "r"))))]
+  "TARGET_MADDLD && TARGET_POWERPC64"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  rtx dest = operands[0];
+  rtx dest_hi = gen_highpart (DImode, dest);
+  rtx dest_lo = gen_lowpart (DImode, dest);
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  rtx op3 = operands[3];
+  rtx tmp_hi, tmp_lo;
+
+  if (can_create_pseudo_p ())
+    {
+      tmp_hi = gen_reg_rtx (DImode);
+      tmp_lo = gen_reg_rtx (DImode);
+    }
+  else
+    {
+      tmp_hi = dest_hi;
+      tmp_lo = dest_lo;
+    }
+
+  emit_insn (gen_<u>mulditi3_<u>adddi3_upper (tmp_hi, op1, op2, op3));
+  emit_insn (gen_maddlddi4 (tmp_lo, op1, op2, op3));
+
+  if (can_create_pseudo_p ())
+    {
+      emit_move_insn (dest_hi, tmp_hi);
+      emit_move_insn (dest_lo, tmp_lo);
+    }
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize 128-bit multiply with zero/sign extend and adding a constant.  We
+;; force the constant into a register to generate li, maddhd, and maddld,
+;; instead of mulld, mulhd, addic, and addze.  We can't combine this pattern
+;; with the pattern that handles registers, since constants don't have a sign
+;; or zero extend around them.
+(define_insn_and_split "*<u>mulditi3_add_const"
+  [(set (match_operand:TI 0 "gpc_reg_operand" "=&r")
+	(plus:TI
+	 (mult:TI
+	  (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r"))
+	  (any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r")))
+	 (match_operand 3 "<su_int32>" "r")))]
+  "TARGET_MADDLD && TARGET_POWERPC64
+"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  rtx dest = operands[0];
+  rtx dest_hi = gen_highpart (DImode, dest);
+  rtx dest_lo = gen_lowpart (DImode, dest);
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  rtx op3 = force_reg (DImode, operands[3]);
+  rtx tmp_hi, tmp_lo;
+
+  if (can_create_pseudo_p ())
+    {
+      tmp_hi = gen_reg_rtx (DImode);
+      tmp_lo = gen_reg_rtx (DImode);
+    }
+  else
+    {
+      tmp_hi = dest_hi;
+      tmp_lo = dest_lo;
+    }
+
+  emit_insn (gen_<u>mulditi3_<u>adddi3_upper (tmp_hi, op1, op2, op3));
+  emit_insn (gen_maddlddi4 (tmp_lo, op1, op2, op3));
+
+  if (can_create_pseudo_p ())
+    {
+      emit_move_insn (dest_hi, tmp_hi);
+      emit_move_insn (dest_lo, tmp_lo);
+    }
+  DONE;
+}
+  [(set_attr "length" "8")
+   (set_attr "type" "mul")
+   (set_attr "size" "64")])
+
+(define_insn "<u>mulditi3_<u>adddi3_upper"
+  [(set (match_operand:DI 0 "gpc_reg_operand" "=r")
+	(truncate:DI
+	 (lshiftrt:TI
+	  (plus:TI
+	   (mult:TI
+	    (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r"))
+	    (any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r")))
+	   (any_extend:TI (match_operand:DI 3 "gpc_reg_operand" "r")))
+	  (const_int 64))))]
+  "TARGET_MADDLD && TARGET_POWERPC64"
+  "maddhd<u> %0,%1,%2,%3"
+  [(set_attr "type" "mul")
+   (set_attr "size" "64")])
+
 (define_insn "udiv<mode>3"
   [(set (match_operand:GPR 0 "gpc_reg_operand" "=r")
         (udiv:GPR (match_operand:GPR 1 "gpc_reg_operand" "r")
@@ -7029,12 +7145,19 @@
 ;; allocator from allocating registers that overlap with the inputs
 ;; (for example, having an input in 7,8 and an output in 6,7).  We
 ;; also allow for the output being the same as one of the inputs.
-
-(define_expand "addti3"
-  [(set (match_operand:TI 0 "gpc_reg_operand")
-	(plus:TI (match_operand:TI 1 "gpc_reg_operand")
-		 (match_operand:TI 2 "reg_or_short_operand")))]
+;;
+;; Addti3/subti3 are define_insn_and_splits instead of define_expand, to allow
+;; for combine to make things like multiply and add with extend operations.
+
+(define_insn_and_split "addti3"
+  [(set (match_operand:TI 0 "gpc_reg_operand" "=&r,r,r")
+	(plus:TI (match_operand:TI 1 "gpc_reg_operand" "r,0,r")
+		 (match_operand:TI 2 "reg_or_short_operand" "rn,r,0")))
+   (clobber (reg:DI CA_REGNO))]
   "TARGET_64BIT"
+  "#"
+  "&& 1"
+  [(pc)]
 {
   rtx lo0 = gen_lowpart (DImode, operands[0]);
   rtx lo1 = gen_lowpart (DImode, operands[1]);
@@ -7051,13 +7174,19 @@
   emit_insn (gen_adddi3_carry (lo0, lo1, lo2));
   emit_insn (gen_adddi3_carry_in (hi0, hi1, hi2));
   DONE;
-})
+}
+  [(set_attr "length" "8")
+   (set_attr "type" "add")
+   (set_attr "size" "128")])
 
-(define_expand "subti3"
-  [(set (match_operand:TI 0 "gpc_reg_operand")
-	(minus:TI (match_operand:TI 1 "reg_or_short_operand")
-		  (match_operand:TI 2 "gpc_reg_operand")))]
+(define_insn_and_split "subti3"
+  [(set (match_operand:TI 0 "gpc_reg_operand" "=&r,r,r")
+	(minus:TI (match_operand:TI 1 "reg_or_short_operand" "rn,0,r")
+		  (match_operand:TI 2 "gpc_reg_operand" "r,r,0")))]
   "TARGET_64BIT"
+  "#"
+  "&& 1"
+  [(pc)]
 {
   rtx lo0 = gen_lowpart (DImode, operands[0]);
   rtx lo1 = gen_lowpart (DImode, operands[1]);
@@ -7074,7 +7203,10 @@
   emit_insn (gen_subfdi3_carry (lo0, lo2, lo1));
   emit_insn (gen_subfdi3_carry_in (hi0, hi2, hi1));
   DONE;
-})
+}
+  [(set_attr "length" "8")
+   (set_attr "type" "add")
+   (set_attr "size" "128")])
 \f
 ;; 128-bit logical operations expanders


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2022-03-08  7:07 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-03-07 23:43 [gcc(refs/users/meissner/heads/work080)] Optimize multiply/add of DImode extended to TImode Michael Meissner
2022-03-08  3:56 Michael Meissner
2022-03-08  3:59 Michael Meissner
2022-03-08  7:07 Michael Meissner

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).