public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r13-1826] PR target/91681: zero_extendditi2 pattern for more optimizations on x86.
@ 2022-07-25 16:37 Roger Sayle
  0 siblings, 0 replies; only message in thread
From: Roger Sayle @ 2022-07-25 16:37 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:16aafa3194d4851a07cc204f56a5f0618f77e5d7

commit r13-1826-g16aafa3194d4851a07cc204f56a5f0618f77e5d7
Author: Roger Sayle <roger@nextmovesoftware.com>
Date:   Mon Jul 25 17:33:48 2022 +0100

    PR target/91681: zero_extendditi2 pattern for more optimizations on x86.
    
    Technically, PR target/91681 has already been resolved; we now recognize the
    highpart multiplication at the tree-level, we no longer use the stack, and
    we currently generate the same number of instructions as LLVM.  However, it
    is still possible to do better: the current x86_64 code generated for a
    double word addition of a zero extended operand looks like:
    
            xorl    %r11d, %r11d
            addq    %r10, %rax
            adcq    %r11, %rdx
    
    when it's possible (as LLVM does) to use an immediate constant:
    
            addq    %r10, %rax
            adcq    $0, %rdx
    
    This is implemented by introducing a zero_extendditi2 pattern
    for zero extension from DImode to TImode on TARGET_64BIT, which is
    split after reload.  With zero extension now visible to combine,
    we add two new define_insn_and_split that add/subtract a zero
    extended operand in double word mode.  These apply to both 32-bit
    and 64-bit code generation, to produce adc $0 and sbb $0.
    
    One consequence of this is that these new patterns interfere with
    the optimization that recognizes DW:DI = (HI:SI<<32)+LO:SI as a pair
    of register moves; or, more accurately, the combine splitter no longer
    triggers because we're now converting two instructions into two instructions
    (not three instructions into two instructions).  This is easily
    repaired (and extended to handle TImode) by changing from a pair
    of define_split (that handle operand commutativity) to a set of
    four define_insn_and_split (again to handle operand commutativity).
    
    2022-07-25  Roger Sayle  <roger@nextmovesoftware.com>
                Uroš Bizjak  <ubizjak@gmail.com>
    
    gcc/ChangeLog
            PR target/91681
            * config/i386/i386-expand.cc (split_double_concat): A new helper
            function for setting a double word value from two word values.
            * config/i386/i386-protos.h (split_double_concat): Prototype here.
            * config/i386/i386.md (zero_extendditi2): New define_insn_and_split.
            (*add<dwi>3_doubleword_zext): New define_insn_and_split.
            (*sub<dwi>3_doubleword_zext): New define_insn_and_split.
            (*concat<mode><dwi>3_1): New define_insn_and_split replacing
            previous define_split for implementing DST = (HI<<32)|LO as
            pair of move instructions, setting lopart and hipart.
            (*concat<mode><dwi>3_2): Likewise.
            (*concat<mode><dwi>3_3): Likewise, where HI is zero_extended.
            (*concat<mode><dwi>3_4): Likewise, where HI is zero_extended.
    
    gcc/testsuite/ChangeLog
            PR target/91681
            * g++.target/i386/pr91681.C: New test case (from the PR).
            * gcc.target/i386/pr91681-1.c: New int128 test case.
            * gcc.target/i386/pr91681-2.c: Likewise.
            * gcc.target/i386/pr91681-3.c: Likewise, but for ia32.

Diff:
---
 gcc/config/i386/i386-expand.cc            |  40 +++++++++
 gcc/config/i386/i386-protos.h             |   1 +
 gcc/config/i386/i386.md                   | 140 +++++++++++++++++++++++++-----
 gcc/testsuite/g++.target/i386/pr91681.C   |  20 +++++
 gcc/testsuite/gcc.target/i386/pr91681-1.c |  20 +++++
 gcc/testsuite/gcc.target/i386/pr91681-2.c |  20 +++++
 gcc/testsuite/gcc.target/i386/pr91681-3.c |  16 ++++
 7 files changed, 233 insertions(+), 24 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 40f821e7a11..66d8f28984c 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -165,6 +165,46 @@ split_double_mode (machine_mode mode, rtx operands[],
     }
 }
 
+/* Emit the double word assignment DST = { LO, HI }.  */
+
+void
+split_double_concat (machine_mode mode, rtx dst, rtx lo, rtx hi)
+{
+  rtx dlo, dhi;
+  int deleted_move_count = 0;
+  split_double_mode (mode, &dst, 1, &dlo, &dhi);
+  if (!rtx_equal_p (dlo, hi))
+    {
+      if (!rtx_equal_p (dlo, lo))
+	emit_move_insn (dlo, lo);
+      else
+	deleted_move_count++;
+      if (!rtx_equal_p (dhi, hi))
+	emit_move_insn (dhi, hi);
+      else
+	deleted_move_count++;
+    }
+  else if (!rtx_equal_p (lo, dhi))
+    {
+      if (!rtx_equal_p (dhi, hi))
+	emit_move_insn (dhi, hi);
+      else
+	deleted_move_count++;
+      if (!rtx_equal_p (dlo, lo))
+	emit_move_insn (dlo, lo);
+      else
+	deleted_move_count++;
+    }
+  else if (mode == TImode)
+    emit_insn (gen_swapdi (dlo, dhi));
+  else
+    emit_insn (gen_swapsi (dlo, dhi));
+
+  if (deleted_move_count == 2)
+    emit_note (NOTE_INSN_DELETED);
+}
+
+
 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
    for the target.  */
 
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index cf847751ac5..e27c14ff783 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -85,6 +85,7 @@ extern void print_reg (rtx, int, FILE*);
 extern void ix86_print_operand (FILE *, rtx, int);
 
 extern void split_double_mode (machine_mode, rtx[], int, rtx[], rtx[]);
+extern void split_double_concat (machine_mode, rtx, rtx lo, rtx);
 
 extern const char *output_set_got (rtx, rtx);
 extern const char *output_387_binary_op (rtx_insn *, rtx*);
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 9aaeb695f0f..fab6aed5e42 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -4116,6 +4116,16 @@
 
 ;; Zero extension instructions
 
+(define_insn_and_split "zero_extendditi2"
+  [(set (match_operand:TI 0 "nonimmediate_operand" "=r,o")
+	(zero_extend:TI (match_operand:DI 1 "nonimmediate_operand" "rm,r")))]
+  "TARGET_64BIT"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 3) (match_dup 1))
+   (set (match_dup 4) (const_int 0))]
+  "split_double_mode (TImode, &operands[0], 1, &operands[3], &operands[4]);")
+
 (define_expand "zero_extendsidi2"
   [(set (match_operand:DI 0 "nonimmediate_operand")
 	(zero_extend:DI (match_operand:SI 1 "nonimmediate_operand")))])
@@ -5814,6 +5824,31 @@
     }
 })
 
+(define_insn_and_split "*add<dwi>3_doubleword_zext"
+  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=r,o")
+	(plus:<DWI>
+	  (zero_extend:<DWI>
+	    (match_operand:DWIH 2 "nonimmediate_operand" "rm,r")) 
+	  (match_operand:<DWI> 1 "nonimmediate_operand" "0,0")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (UNKNOWN, <DWI>mode, operands)"
+  "#"
+  "&& reload_completed"
+  [(parallel [(set (reg:CCC FLAGS_REG)
+		   (compare:CCC
+		     (plus:DWIH (match_dup 1) (match_dup 2))
+		     (match_dup 1)))
+	      (set (match_dup 0)
+		   (plus:DWIH (match_dup 1) (match_dup 2)))])
+   (parallel [(set (match_dup 3)
+		   (plus:DWIH
+		     (plus:DWIH
+		       (ltu:DWIH (reg:CC FLAGS_REG) (const_int 0))
+		       (match_dup 4))
+		     (const_int 0)))
+	      (clobber (reg:CC FLAGS_REG))])]
+ "split_double_mode (<DWI>mode, &operands[0], 2, &operands[0], &operands[3]);")
+
 (define_insn "*add<mode>_1"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r,r")
 	(plus:SWI48
@@ -6962,6 +6997,29 @@
     }
 })
 
+(define_insn_and_split "*sub<dwi>3_doubleword_zext"
+  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=r,o")
+	(minus:<DWI>
+	  (match_operand:<DWI> 1 "nonimmediate_operand" "0,0")
+	  (zero_extend:<DWI>
+	    (match_operand:DWIH 2 "nonimmediate_operand" "rm,r"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (UNKNOWN, <DWI>mode, operands)"
+  "#"
+  "&& reload_completed"
+  [(parallel [(set (reg:CC FLAGS_REG)
+		   (compare:CC (match_dup 1) (match_dup 2)))
+	      (set (match_dup 0)
+		   (minus:DWIH (match_dup 1) (match_dup 2)))])
+   (parallel [(set (match_dup 3)
+		   (minus:DWIH
+		     (minus:DWIH
+		       (match_dup 4)
+		       (ltu:DWIH (reg:CC FLAGS_REG) (const_int 0)))
+		     (const_int 0)))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "split_double_mode (<DWI>mode, &operands[0], 2, &operands[0], &operands[3]);")
+
 (define_insn "*sub<mode>_1"
   [(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m,<r>")
 	(minus:SWI
@@ -11111,34 +11169,68 @@
 
 ;; Split DST = (HI<<32)|LO early to minimize register usage.
 (define_code_iterator any_or_plus [plus ior xor])
-(define_split
-  [(set (match_operand:DI 0 "register_operand")
-	(any_or_plus:DI
-	  (ashift:DI (match_operand:DI 1 "register_operand")
-		     (const_int 32))
-	  (zero_extend:DI (match_operand:SI 2 "register_operand"))))]
-  "!TARGET_64BIT"
-  [(set (match_dup 3) (match_dup 4))
-   (set (match_dup 5) (match_dup 2))]
+(define_insn_and_split "*concat<mode><dwi>3_1"
+  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro")
+	(any_or_plus:<DWI>
+	  (ashift:<DWI> (match_operand:<DWI> 1 "register_operand" "r")
+			(match_operand:<DWI> 2 "const_int_operand"))
+	  (zero_extend:<DWI> (match_operand:DWIH 3 "register_operand" "r"))))]
+  "INTVAL (operands[2]) == <MODE_SIZE> * BITS_PER_UNIT"
+  "#"
+  "&& reload_completed"
+  [(clobber (const_int 0))]
 {
-  operands[3] = gen_highpart (SImode, operands[0]);
-  operands[4] = gen_lowpart (SImode, operands[1]);
-  operands[5] = gen_lowpart (SImode, operands[0]);
+  split_double_concat (<DWI>mode, operands[0], operands[3],
+		       gen_lowpart (<MODE>mode, operands[1]));
+  DONE;
 })
 
-(define_split
-  [(set (match_operand:DI 0 "register_operand")
-	(any_or_plus:DI
-	  (zero_extend:DI (match_operand:SI 1 "register_operand"))
-	  (ashift:DI (match_operand:DI 2 "register_operand")
-		     (const_int 32))))]
-  "!TARGET_64BIT"
-  [(set (match_dup 3) (match_dup 4))
-   (set (match_dup 5) (match_dup 1))]
+(define_insn_and_split "*concat<mode><dwi>3_2"
+  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro")
+	(any_or_plus:<DWI>
+	  (zero_extend:<DWI> (match_operand:DWIH 1 "register_operand" "r"))
+	  (ashift:<DWI> (match_operand:<DWI> 2 "register_operand" "r")
+			(match_operand:<DWI> 3 "const_int_operand"))))]
+  "INTVAL (operands[3]) == <MODE_SIZE> * BITS_PER_UNIT"
+  "#"
+  "&& reload_completed"
+  [(clobber (const_int 0))]
+{
+  split_double_concat (<DWI>mode, operands[0], operands[1],
+		       gen_lowpart (<MODE>mode, operands[2]));
+  DONE;
+})
+
+(define_insn_and_split "*concat<mode><dwi>3_3"
+  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro")
+	(any_or_plus:<DWI>
+	  (ashift:<DWI>
+	    (zero_extend:<DWI> (match_operand:DWIH 1 "register_operand" "r"))
+	    (match_operand:<DWI> 2 "const_int_operand"))
+	  (zero_extend:<DWI> (match_operand:DWIH 3 "register_operand" "r"))))]
+  "INTVAL (operands[2]) == <MODE_SIZE> * BITS_PER_UNIT"
+  "#"
+  "&& reload_completed"
+  [(clobber (const_int 0))]
 {
-  operands[3] = gen_highpart (SImode, operands[0]);
-  operands[4] = gen_lowpart (SImode, operands[2]);
-  operands[5] = gen_lowpart (SImode, operands[0]);
+  split_double_concat (<DWI>mode, operands[0], operands[3], operands[1]);
+  DONE;
+})
+
+(define_insn_and_split "*concat<mode><dwi>3_4"
+  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro")
+	(any_or_plus:<DWI>
+	  (zero_extend:<DWI> (match_operand:DWIH 1 "register_operand" "r"))
+	  (ashift:<DWI>
+	    (zero_extend:<DWI> (match_operand:DWIH 2 "register_operand" "r"))
+	    (match_operand:<DWI> 3 "const_int_operand"))))]
+  "INTVAL (operands[3]) == <MODE_SIZE> * BITS_PER_UNIT"
+  "#"
+  "&& reload_completed"
+  [(clobber (const_int 0))]
+{
+  split_double_concat (<DWI>mode, operands[0], operands[1], operands[2]);
+  DONE;
 })
 \f
 ;; Negation instructions
diff --git a/gcc/testsuite/g++.target/i386/pr91681.C b/gcc/testsuite/g++.target/i386/pr91681.C
new file mode 100644
index 00000000000..0271e43ad3b
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr91681.C
@@ -0,0 +1,20 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2" } */
+
+void multiply128x64x2_3 ( 
+    const unsigned long a, 
+    const unsigned long b, 
+    const unsigned long c, 
+    const unsigned long d, 
+    __uint128_t o[2])
+{
+    __uint128_t B0 = (__uint128_t) b * c;
+    __uint128_t B2 = (__uint128_t) a * c;
+    __uint128_t B1 = (__uint128_t) b * d;
+    __uint128_t B3 = (__uint128_t) a * d;
+
+    o[0] = B2 + (B0 >> 64);
+    o[1] = B3 + (B1 >> 64);
+}
+
+/* { dg-final { scan-assembler-not "xor" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr91681-1.c b/gcc/testsuite/gcc.target/i386/pr91681-1.c
new file mode 100644
index 00000000000..ab83cc4b302
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr91681-1.c
@@ -0,0 +1,20 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2" } */
+unsigned __int128 m;
+
+unsigned __int128 foo(unsigned __int128 x, unsigned long long y)
+{
+    return x + y;
+}
+
+void bar(unsigned __int128 x, unsigned long long y)
+{
+    m = x + y;
+}
+
+void baz(unsigned long long y)
+{
+    m += y;
+}
+
+/* { dg-final { scan-assembler-not "xor" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr91681-2.c b/gcc/testsuite/gcc.target/i386/pr91681-2.c
new file mode 100644
index 00000000000..ea52c72ed6b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr91681-2.c
@@ -0,0 +1,20 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2" } */
+unsigned __int128 m;
+
+unsigned __int128 foo(unsigned __int128 x, unsigned long long y)
+{
+    return x - y;
+}
+
+void bar(unsigned __int128 x, unsigned long long y)
+{
+    m = x - y;
+}
+
+void baz(unsigned long long y)
+{
+    m -= y;
+}
+
+/* { dg-final { scan-assembler-not "xor" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr91681-3.c b/gcc/testsuite/gcc.target/i386/pr91681-3.c
new file mode 100644
index 00000000000..22a03c27db2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr91681-3.c
@@ -0,0 +1,16 @@
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-O2" } */
+
+unsigned long long m;
+
+unsigned long long foo(unsigned long long x, unsigned int y)
+{
+    return x - y;
+}
+
+void bar(unsigned long long x, unsigned int y)
+{
+    m = x - y;
+}
+
+/* { dg-final { scan-assembler-not "xor" } } */


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2022-07-25 16:37 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-07-25 16:37 [gcc r13-1826] PR target/91681: zero_extendditi2 pattern for more optimizations on x86 Roger Sayle

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).