public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r13-4435] i386: Improve *concat<mode><dwi>3_{1, 2, 3, 4} patterns [PR107627]
@ 2022-12-01  8:35 Jakub Jelinek
  0 siblings, 0 replies; only message in thread
From: Jakub Jelinek @ 2022-12-01  8:35 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:2c089640279614e34b8712bfb318a9d4fc8ac8fe

commit r13-4435-g2c089640279614e34b8712bfb318a9d4fc8ac8fe
Author: Jakub Jelinek <jakub@redhat.com>
Date:   Thu Dec 1 09:29:23 2022 +0100

    i386: Improve *concat<mode><dwi>3_{1,2,3,4} patterns [PR107627]
    
    On the first testcase we've regressed since 12 at -O2:
    -       movq    8(%rsi), %rax
    -       movq    %rdi, %r8
    -       movq    (%rsi), %rdi
    +       movq    (%rsi), %rax
    +       movq    8(%rsi), %r8
            movl    %edx, %ecx
    -       shrdq   %rdi, %rax
    -       movq    %rax, (%r8)
    +       xorl    %r9d, %r9d
    +       movq    %rax, %rdx
    +       xorl    %eax, %eax
    +       orq     %r8, %rax
    +       orq     %r9, %rdx
    +       shrdq   %rdx, %rax
    +       movq    %rax, (%rdi)
    On the second testcase we've emitted such terrible code
    with the useless xors and ors for a long time.
    For PR91681 the *concat<mode><dwi>3_{1,2,3,4} patterns have been added
    but they allow just register inputs and register or memory offsettable
    output.
    The following patch fixes this by allowing also memory inputs on those
    patterns, because the pattern is then split to 0-2 emit_move_insns or
    one xchg and those can handle loads from memory too just fine.
    So that we don't narrow memory loads (source has 128-bit (or for ia32
    64-bit) load and we would make 64-bit (or for ia32 32-bit) load out of it),
    register_operand -> nonmemory_operand change is done only for operands
    in zero_extend arguments.  o <- m, m or o <- m, r or o <- r, m alternatives
    aren't used, we'd lack registers to perform the moves.  But what is
    in addition to the current ro <- r, r supported are r <- m, r and r <- r, m
    (in that case we just need to be careful about corner cases, see what
    emit_move_insn we'd call and if we wouldn't clobber registers used in m's
    address before loading - split_double_concat handles that now) and
    &r <- m, m (in that case I think the early clobber is the easiest solution).
    
    The first testcase then on 12 -> patched trunk at -O2 changes:
    -       movq    8(%rsi), %rax
    -       movq    %rdi, %r8
    -       movq    (%rsi), %rdi
    +       movq    8(%rsi), %r9
    +       movq    (%rsi), %r10
            movl    %edx, %ecx
    -       shrdq   %rdi, %rax
    -       movq    %rax, (%r8)
    +       movq    %r9, %rax
    +       shrdq   %r10, %rax
    +       movq    %rax, (%rdi)
    so same amount of instructions and second testcase 12 -> patched trunk
    at -O2 -m32:
    -       pushl   %edi
    -       xorl    %edi, %edi
            pushl   %esi
    -       movl    16(%esp), %esi
    +       pushl   %ebx
    +       movl    16(%esp), %eax
            movl    20(%esp), %ecx
    -       movl    (%esi), %eax
    -       movl    4(%esi), %esi
    -       movl    %eax, %edx
    -       movl    $0, %eax
    -       orl     %edi, %edx
    -       orl     %esi, %eax
    -       shrdl   %edx, %eax
            movl    12(%esp), %edx
    +       movl    4(%eax), %ebx
    +       movl    (%eax), %esi
    +       movl    %ebx, %eax
    +       shrdl   %esi, %eax
            movl    %eax, (%edx)
    +       popl    %ebx
            popl    %esi
    -       popl    %edi
    
    BTW, I wonder if we couldn't add additional patterns which would catch
    the case where one of the operands is constant and how does this interact
    with the stv pass in 32-bit mode where I think stv is right after combine,
    so if we match these patterns, perhaps it would be nice to handle them
    in stv (unless they are handled there already).
    
    2022-12-01  Jakub Jelinek  <jakub@redhat.com>
    
            PR target/107627
            * config/i386/i386.md (*concat<mode><dwi>3_1, *concat<mode><dwi>3_2):
            For operands which are zero_extend arguments allow memory if
            output operand is a register.
            (*concat<mode><dwi>3_3, *concat<mode><dwi>3_4): Likewise.  If
            both input operands are memory, use early clobber on output operand.
            * config/i386/i386-expand.cc (split_double_concat): Deal with corner
            cases where one input is memory and the other is not and the address
            of the memory input uses a register we'd overwrite before loading
            the memory into a register.
    
            * gcc.target/i386/pr107627-1.c: New test.
            * gcc.target/i386/pr107627-2.c: New test.

Diff:
---
 gcc/config/i386/i386-expand.cc             | 27 +++++++++++++++++++++++++++
 gcc/config/i386/i386.md                    | 30 ++++++++++++++++++------------
 gcc/testsuite/gcc.target/i386/pr107627-1.c | 22 ++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr107627-2.c | 22 ++++++++++++++++++++++
 4 files changed, 89 insertions(+), 12 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index ec3921bdc62..f20bf81f393 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -173,6 +173,33 @@ split_double_concat (machine_mode mode, rtx dst, rtx lo, rtx hi)
   rtx dlo, dhi;
   int deleted_move_count = 0;
   split_double_mode (mode, &dst, 1, &dlo, &dhi);
+  /* Constraints ensure that if both lo and hi are MEMs, then
+     dst has early-clobber and thus addresses of MEMs don't use
+     dlo/dhi registers.  Otherwise if at least one of li and hi are MEMs,
+     dlo/dhi are registers.  */
+  if (MEM_P (lo)
+      && rtx_equal_p (dlo, hi)
+      && reg_overlap_mentioned_p (dhi, lo))
+    {
+      /* If dlo is same as hi and lo's address uses dhi register,
+	 code below would first emit_move_insn (dhi, hi)
+	 and then emit_move_insn (dlo, lo).  But the former
+	 would invalidate lo's address.  Load into dhi first,
+	 then swap.  */
+      emit_move_insn (dhi, lo);
+      lo = dhi;
+    }
+  else if (MEM_P (hi)
+	   && !MEM_P (lo)
+	   && !rtx_equal_p (dlo, lo)
+	   && reg_overlap_mentioned_p (dlo, hi))
+    {
+      /* In this case, code below would first emit_move_insn (dlo, lo)
+	 and then emit_move_insn (dhi, hi).  But the former would
+	 invalidate hi's address.  Load into dhi first.  */
+      emit_move_insn (dhi, hi);
+      hi = dhi;
+    }
   if (!rtx_equal_p (dlo, hi))
     {
       if (!rtx_equal_p (dlo, lo))
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 9451883396c..e9ca7f3f1ad 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -11396,11 +11396,12 @@
 ;; Split DST = (HI<<32)|LO early to minimize register usage.
 (define_code_iterator any_or_plus [plus ior xor])
 (define_insn_and_split "*concat<mode><dwi>3_1"
-  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro")
+  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro,r")
 	(any_or_plus:<DWI>
-	  (ashift:<DWI> (match_operand:<DWI> 1 "register_operand" "r")
+	  (ashift:<DWI> (match_operand:<DWI> 1 "register_operand" "r,r")
 			(match_operand:<DWI> 2 "const_int_operand"))
-	  (zero_extend:<DWI> (match_operand:DWIH 3 "register_operand" "r"))))]
+	  (zero_extend:<DWI>
+	    (match_operand:DWIH 3 "nonimmediate_operand" "r,m"))))]
   "INTVAL (operands[2]) == <MODE_SIZE> * BITS_PER_UNIT"
   "#"
   "&& reload_completed"
@@ -11412,10 +11413,11 @@
 })
 
 (define_insn_and_split "*concat<mode><dwi>3_2"
-  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro")
+  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro,r")
 	(any_or_plus:<DWI>
-	  (zero_extend:<DWI> (match_operand:DWIH 1 "register_operand" "r"))
-	  (ashift:<DWI> (match_operand:<DWI> 2 "register_operand" "r")
+	  (zero_extend:<DWI>
+	    (match_operand:DWIH 1 "nonimmediate_operand" "r,m"))
+	  (ashift:<DWI> (match_operand:<DWI> 2 "register_operand" "r,r")
 			(match_operand:<DWI> 3 "const_int_operand"))))]
   "INTVAL (operands[3]) == <MODE_SIZE> * BITS_PER_UNIT"
   "#"
@@ -11428,12 +11430,14 @@
 })
 
 (define_insn_and_split "*concat<mode><dwi>3_3"
-  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro")
+  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro,r,r,&r")
 	(any_or_plus:<DWI>
 	  (ashift:<DWI>
-	    (zero_extend:<DWI> (match_operand:DWIH 1 "register_operand" "r"))
+	    (zero_extend:<DWI>
+	      (match_operand:DWIH 1 "nonimmediate_operand" "r,m,r,m"))
 	    (match_operand:<DWI> 2 "const_int_operand"))
-	  (zero_extend:<DWI> (match_operand:DWIH 3 "register_operand" "r"))))]
+	  (zero_extend:<DWI>
+	    (match_operand:DWIH 3 "nonimmediate_operand" "r,r,m,m"))))]
   "INTVAL (operands[2]) == <MODE_SIZE> * BITS_PER_UNIT"
   "#"
   "&& reload_completed"
@@ -11444,11 +11448,13 @@
 })
 
 (define_insn_and_split "*concat<mode><dwi>3_4"
-  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro")
+  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro,r,r,&r")
 	(any_or_plus:<DWI>
-	  (zero_extend:<DWI> (match_operand:DWIH 1 "register_operand" "r"))
+	  (zero_extend:<DWI>
+	    (match_operand:DWIH 1 "nonimmediate_operand" "r,m,r,m"))
 	  (ashift:<DWI>
-	    (zero_extend:<DWI> (match_operand:DWIH 2 "register_operand" "r"))
+	    (zero_extend:<DWI>
+	      (match_operand:DWIH 2 "nonimmediate_operand" "r,r,m,m"))
 	    (match_operand:<DWI> 3 "const_int_operand"))))]
   "INTVAL (operands[3]) == <MODE_SIZE> * BITS_PER_UNIT"
   "#"
diff --git a/gcc/testsuite/gcc.target/i386/pr107627-1.c b/gcc/testsuite/gcc.target/i386/pr107627-1.c
new file mode 100644
index 00000000000..60ec87bdf94
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107627-1.c
@@ -0,0 +1,22 @@
+/* PR target/107627 */
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -masm=att" } */
+/* { dg-final { scan-assembler-not "\torq\t" } } */
+
+static inline unsigned __int128
+foo (unsigned long long x, unsigned long long y)
+{
+  return ((unsigned __int128) x << 64) | y;
+}
+
+static inline unsigned long long
+bar (unsigned long long x, unsigned long long y, unsigned z)
+{
+  return foo (x, y) >> (z % 64);
+}
+
+void
+baz (unsigned long long *x, const unsigned long long *y, unsigned z)
+{
+  x[0] = bar (y[0], y[1], z);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr107627-2.c b/gcc/testsuite/gcc.target/i386/pr107627-2.c
new file mode 100644
index 00000000000..7f96613e846
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107627-2.c
@@ -0,0 +1,22 @@
+/* PR target/107627 */
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-O2 -masm=att" } */
+/* { dg-final { scan-assembler-not "\torl\t" } } */
+
+static inline unsigned long long
+qux (unsigned int x, unsigned int y)
+{
+  return ((unsigned long long) x << 32) | y;
+}
+
+static inline unsigned int
+corge (unsigned int x, unsigned int y, unsigned z)
+{
+  return qux (x, y) >> (z % 32);
+}
+
+void
+garply (unsigned int *x, const unsigned int *y, unsigned z)
+{
+  x[0] = corge (y[0], y[1], z);
+}

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2022-12-01  8:35 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-12-01  8:35 [gcc r13-4435] i386: Improve *concat<mode><dwi>3_{1, 2, 3, 4} patterns [PR107627] Jakub Jelinek

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).