public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] i386: Improve *concat<mode><dwi>3_{1,2,3,4} patterns [PR107627]
@ 2022-12-01  8:09 Jakub Jelinek
  2022-12-01  8:23 ` Uros Bizjak
  2022-12-08  9:48 ` [PATCH] i386: Add *concat<mode><dwi>3_{5,6,7} " Jakub Jelinek
  0 siblings, 2 replies; 4+ messages in thread
From: Jakub Jelinek @ 2022-12-01  8:09 UTC (permalink / raw)
  To: Uros Bizjak, Roger Sayle; +Cc: gcc-patches

Hi!

On the first testcase we've regressed since 12 at -O2:
-       movq    8(%rsi), %rax
-       movq    %rdi, %r8
-       movq    (%rsi), %rdi
+       movq    (%rsi), %rax
+       movq    8(%rsi), %r8
        movl    %edx, %ecx
-       shrdq   %rdi, %rax
-       movq    %rax, (%r8)
+       xorl    %r9d, %r9d
+       movq    %rax, %rdx
+       xorl    %eax, %eax
+       orq     %r8, %rax
+       orq     %r9, %rdx
+       shrdq   %rdx, %rax
+       movq    %rax, (%rdi)
On the second testcase we've emitted such terrible code
with the useless xors and ors for a long time.
For PR91681 the *concat<mode><dwi>3_{1,2,3,4} patterns have been added
but they allow just register inputs and register or memory offsettable
output.
The following patch fixes this by allowing also memory inputs on those
patterns, because the pattern is then split to 0-2 emit_move_insns or
one xchg and those can handle loads from memory too just fine.
So that we don't narrow memory loads (source has 128-bit (or for ia32
64-bit) load and we would make 64-bit (or for ia32 32-bit) load out of it),
register_operand -> nonmemory_operand change is done only for operands
in zero_extend arguments.  o <- m, m or o <- m, r or o <- r, m alternatives
aren't used, we'd lack registers to perform the moves.  But what is
in addition to the current ro <- r, r supported are r <- m, r and r <- r, m
(in that case we just need to be careful about corner cases, see what
emit_move_insn we'd call and if we wouldn't clobber registers used in m's
address before loading - split_double_concat handles that now) and
&r <- m, m (in that case I think the early clobber is the easiest solution).

The first testcase then on 12 -> patched trunk at -O2 changes:
-       movq    8(%rsi), %rax
-       movq    %rdi, %r8
-       movq    (%rsi), %rdi
+       movq    8(%rsi), %r9
+       movq    (%rsi), %r10
        movl    %edx, %ecx
-       shrdq   %rdi, %rax
-       movq    %rax, (%r8)
+       movq    %r9, %rax
+       shrdq   %r10, %rax
+       movq    %rax, (%rdi)
so same amount of instructions and second testcase 12 -> patched trunk
at -O2 -m32:
-	pushl	%edi
-	xorl	%edi, %edi
 	pushl	%esi
-	movl	16(%esp), %esi
+	pushl	%ebx
+	movl	16(%esp), %eax
 	movl	20(%esp), %ecx
-	movl	(%esi), %eax
-	movl	4(%esi), %esi
-	movl	%eax, %edx
-	movl	$0, %eax
-	orl	%edi, %edx
-	orl	%esi, %eax
-	shrdl	%edx, %eax
 	movl	12(%esp), %edx
+	movl	4(%eax), %ebx
+	movl	(%eax), %esi
+	movl	%ebx, %eax
+	shrdl	%esi, %eax
 	movl	%eax, (%edx)
+	popl	%ebx
 	popl	%esi
-	popl	%edi

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

BTW, I wonder if we couldn't add additional patterns which would catch
the case where one of the operands is constant and how does this interact
with the stv pass in 32-bit mode where I think stv is right after combine,
so if we match these patterns, perhaps it would be nice to handle them
in stv (unless they are handled there already).

2022-12-01  Jakub Jelinek  <jakub@redhat.com>

	PR target/107627
	* config/i386/i386.md (*concat<mode><dwi>3_1, *concat<mode><dwi>3_2):
	For operands which are zero_extend arguments allow memory if
	output operand is a register.
	(*concat<mode><dwi>3_3, *concat<mode><dwi>3_4): Likewise.  If
	both input operands are memory, use early clobber on output operand.
	* config/i386/i386-expand.cc (split_double_concat): Deal with corner
	cases where one input is memory and the other is not and the address
	of the memory input uses a register we'd overwrite before loading
	the memory into a register.

	* gcc.target/i386/pr107627-1.c: New test.
	* gcc.target/i386/pr107627-2.c: New test.

--- gcc/config/i386/i386.md.jj	2022-11-28 10:13:17.758656933 +0100
+++ gcc/config/i386/i386.md	2022-11-30 12:11:55.724474793 +0100
@@ -11396,11 +11396,12 @@ (define_insn "*xorqi_ext<mode>_1_cc"
 ;; Split DST = (HI<<32)|LO early to minimize register usage.
 (define_code_iterator any_or_plus [plus ior xor])
 (define_insn_and_split "*concat<mode><dwi>3_1"
-  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro")
+  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro,r")
 	(any_or_plus:<DWI>
-	  (ashift:<DWI> (match_operand:<DWI> 1 "register_operand" "r")
+	  (ashift:<DWI> (match_operand:<DWI> 1 "register_operand" "r,r")
 			(match_operand:<DWI> 2 "const_int_operand"))
-	  (zero_extend:<DWI> (match_operand:DWIH 3 "register_operand" "r"))))]
+	  (zero_extend:<DWI>
+	    (match_operand:DWIH 3 "nonimmediate_operand" "r,m"))))]
   "INTVAL (operands[2]) == <MODE_SIZE> * BITS_PER_UNIT"
   "#"
   "&& reload_completed"
@@ -11412,10 +11413,11 @@ (define_insn_and_split "*concat<mode><dw
 })
 
 (define_insn_and_split "*concat<mode><dwi>3_2"
-  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro")
+  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro,r")
 	(any_or_plus:<DWI>
-	  (zero_extend:<DWI> (match_operand:DWIH 1 "register_operand" "r"))
-	  (ashift:<DWI> (match_operand:<DWI> 2 "register_operand" "r")
+	  (zero_extend:<DWI>
+	    (match_operand:DWIH 1 "nonimmediate_operand" "r,m"))
+	  (ashift:<DWI> (match_operand:<DWI> 2 "register_operand" "r,r")
 			(match_operand:<DWI> 3 "const_int_operand"))))]
   "INTVAL (operands[3]) == <MODE_SIZE> * BITS_PER_UNIT"
   "#"
@@ -11428,12 +11430,14 @@ (define_insn_and_split "*concat<mode><dw
 })
 
 (define_insn_and_split "*concat<mode><dwi>3_3"
-  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro")
+  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro,r,r,&r")
 	(any_or_plus:<DWI>
 	  (ashift:<DWI>
-	    (zero_extend:<DWI> (match_operand:DWIH 1 "register_operand" "r"))
+	    (zero_extend:<DWI>
+	      (match_operand:DWIH 1 "nonimmediate_operand" "r,m,r,m"))
 	    (match_operand:<DWI> 2 "const_int_operand"))
-	  (zero_extend:<DWI> (match_operand:DWIH 3 "register_operand" "r"))))]
+	  (zero_extend:<DWI>
+	    (match_operand:DWIH 3 "nonimmediate_operand" "r,r,m,m"))))]
   "INTVAL (operands[2]) == <MODE_SIZE> * BITS_PER_UNIT"
   "#"
   "&& reload_completed"
@@ -11444,11 +11448,13 @@ (define_insn_and_split "*concat<mode><dw
 })
 
 (define_insn_and_split "*concat<mode><dwi>3_4"
-  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro")
+  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro,r,r,&r")
 	(any_or_plus:<DWI>
-	  (zero_extend:<DWI> (match_operand:DWIH 1 "register_operand" "r"))
+	  (zero_extend:<DWI>
+	    (match_operand:DWIH 1 "nonimmediate_operand" "r,m,r,m"))
 	  (ashift:<DWI>
-	    (zero_extend:<DWI> (match_operand:DWIH 2 "register_operand" "r"))
+	    (zero_extend:<DWI>
+	      (match_operand:DWIH 2 "nonimmediate_operand" "r,r,m,m"))
 	    (match_operand:<DWI> 3 "const_int_operand"))))]
   "INTVAL (operands[3]) == <MODE_SIZE> * BITS_PER_UNIT"
   "#"
--- gcc/config/i386/i386-expand.cc.jj	2022-11-28 10:13:17.703657740 +0100
+++ gcc/config/i386/i386-expand.cc	2022-11-30 13:27:44.851737861 +0100
@@ -173,6 +173,33 @@ split_double_concat (machine_mode mode,
   rtx dlo, dhi;
   int deleted_move_count = 0;
   split_double_mode (mode, &dst, 1, &dlo, &dhi);
+  /* Constraints ensure that if both lo and hi are MEMs, then
+     dst has early-clobber and thus addresses of MEMs don't use
+     dlo/dhi registers.  Otherwise if at least one of li and hi are MEMs,
+     dlo/dhi are registers.  */
+  if (MEM_P (lo)
+      && rtx_equal_p (dlo, hi)
+      && reg_overlap_mentioned_p (dhi, lo))
+    {
+      /* If dlo is same as hi and lo's address uses dhi register,
+	 code below would first emit_move_insn (dhi, hi)
+	 and then emit_move_insn (dlo, lo).  But the former
+	 would invalidate lo's address.  Load into dhi first,
+	 then swap.  */
+      emit_move_insn (dhi, lo);
+      lo = dhi;
+    }
+  else if (MEM_P (hi)
+	   && !MEM_P (lo)
+	   && !rtx_equal_p (dlo, lo)
+	   && reg_overlap_mentioned_p (dlo, hi))
+    {
+      /* In this case, code below would first emit_move_insn (dlo, lo)
+	 and then emit_move_insn (dhi, hi).  But the former would
+	 invalidate hi's address.  Load into dhi first.  */
+      emit_move_insn (dhi, hi);
+      hi = dhi;
+    }
   if (!rtx_equal_p (dlo, hi))
     {
       if (!rtx_equal_p (dlo, lo))
--- gcc/testsuite/gcc.target/i386/pr107627-1.c.jj	2022-11-30 13:52:11.654818924 +0100
+++ gcc/testsuite/gcc.target/i386/pr107627-1.c	2022-11-30 13:53:40.288496872 +0100
@@ -0,0 +1,22 @@
+/* PR target/107627 */
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -masm=att" } */
+/* { dg-final { scan-assembler-not "\torq\t" } } */
+
+static inline unsigned __int128
+foo (unsigned long long x, unsigned long long y)
+{
+  return ((unsigned __int128) x << 64) | y;
+}
+
+static inline unsigned long long
+bar (unsigned long long x, unsigned long long y, unsigned z)
+{
+  return foo (x, y) >> (z % 64);
+}
+
+void
+baz (unsigned long long *x, const unsigned long long *y, unsigned z)
+{
+  x[0] = bar (y[0], y[1], z);
+}
--- gcc/testsuite/gcc.target/i386/pr107627-2.c.jj	2022-11-30 13:52:14.890770658 +0100
+++ gcc/testsuite/gcc.target/i386/pr107627-2.c	2022-11-30 13:53:32.863607618 +0100
@@ -0,0 +1,22 @@
+/* PR target/107627 */
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-O2 -masm=att" } */
+/* { dg-final { scan-assembler-not "\torl\t" } } */
+
+static inline unsigned long long
+qux (unsigned int x, unsigned int y)
+{
+  return ((unsigned long long) x << 32) | y;
+}
+
+static inline unsigned int
+corge (unsigned int x, unsigned int y, unsigned z)
+{
+  return qux (x, y) >> (z % 32);
+}
+
+void
+garply (unsigned int *x, const unsigned int *y, unsigned z)
+{
+  x[0] = corge (y[0], y[1], z);
+}

	Jakub


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] i386: Improve *concat<mode><dwi>3_{1,2,3,4} patterns [PR107627]
  2022-12-01  8:09 [PATCH] i386: Improve *concat<mode><dwi>3_{1,2,3,4} patterns [PR107627] Jakub Jelinek
@ 2022-12-01  8:23 ` Uros Bizjak
  2022-12-08  9:48 ` [PATCH] i386: Add *concat<mode><dwi>3_{5,6,7} " Jakub Jelinek
  1 sibling, 0 replies; 4+ messages in thread
From: Uros Bizjak @ 2022-12-01  8:23 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Roger Sayle, gcc-patches

On Thu, Dec 1, 2022 at 9:10 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> Hi!
>
> On the first testcase we've regressed since 12 at -O2:
> -       movq    8(%rsi), %rax
> -       movq    %rdi, %r8
> -       movq    (%rsi), %rdi
> +       movq    (%rsi), %rax
> +       movq    8(%rsi), %r8
>         movl    %edx, %ecx
> -       shrdq   %rdi, %rax
> -       movq    %rax, (%r8)
> +       xorl    %r9d, %r9d
> +       movq    %rax, %rdx
> +       xorl    %eax, %eax
> +       orq     %r8, %rax
> +       orq     %r9, %rdx
> +       shrdq   %rdx, %rax
> +       movq    %rax, (%rdi)
> On the second testcase we've emitted such terrible code
> with the useless xors and ors for a long time.
> For PR91681 the *concat<mode><dwi>3_{1,2,3,4} patterns have been added
> but they allow just register inputs and register or memory offsettable
> output.
> The following patch fixes this by allowing also memory inputs on those
> patterns, because the pattern is then split to 0-2 emit_move_insns or
> one xchg and those can handle loads from memory too just fine.
> So that we don't narrow memory loads (source has 128-bit (or for ia32
> 64-bit) load and we would make 64-bit (or for ia32 32-bit) load out of it),
> register_operand -> nonmemory_operand change is done only for operands
> in zero_extend arguments.  o <- m, m or o <- m, r or o <- r, m alternatives
> aren't used, we'd lack registers to perform the moves.  But what is
> in addition to the current ro <- r, r supported are r <- m, r and r <- r, m
> (in that case we just need to be careful about corner cases, see what
> emit_move_insn we'd call and if we wouldn't clobber registers used in m's
> address before loading - split_double_concat handles that now) and
> &r <- m, m (in that case I think the early clobber is the easiest solution).
>
> The first testcase then on 12 -> patched trunk at -O2 changes:
> -       movq    8(%rsi), %rax
> -       movq    %rdi, %r8
> -       movq    (%rsi), %rdi
> +       movq    8(%rsi), %r9
> +       movq    (%rsi), %r10
>         movl    %edx, %ecx
> -       shrdq   %rdi, %rax
> -       movq    %rax, (%r8)
> +       movq    %r9, %rax
> +       shrdq   %r10, %rax
> +       movq    %rax, (%rdi)
> so same amount of instructions and second testcase 12 -> patched trunk
> at -O2 -m32:
> -       pushl   %edi
> -       xorl    %edi, %edi
>         pushl   %esi
> -       movl    16(%esp), %esi
> +       pushl   %ebx
> +       movl    16(%esp), %eax
>         movl    20(%esp), %ecx
> -       movl    (%esi), %eax
> -       movl    4(%esi), %esi
> -       movl    %eax, %edx
> -       movl    $0, %eax
> -       orl     %edi, %edx
> -       orl     %esi, %eax
> -       shrdl   %edx, %eax
>         movl    12(%esp), %edx
> +       movl    4(%eax), %ebx
> +       movl    (%eax), %esi
> +       movl    %ebx, %eax
> +       shrdl   %esi, %eax
>         movl    %eax, (%edx)
> +       popl    %ebx
>         popl    %esi
> -       popl    %edi
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> BTW, I wonder if we couldn't add additional patterns which would catch
> the case where one of the operands is constant and how does this interact
> with the stv pass in 32-bit mode where I think stv is right after combine,
> so if we match these patterns, perhaps it would be nice to handle them
> in stv (unless they are handled there already).

IIRC, STV handles only loads of 0 and -1, everything else has to be
loaded from memory. So, it leaves e.g.:

long long a;

long long test (void)
{
  return a + 1;
}

unconverted.

>
> 2022-12-01  Jakub Jelinek  <jakub@redhat.com>
>
>         PR target/107627
>         * config/i386/i386.md (*concat<mode><dwi>3_1, *concat<mode><dwi>3_2):
>         For operands which are zero_extend arguments allow memory if
>         output operand is a register.
>         (*concat<mode><dwi>3_3, *concat<mode><dwi>3_4): Likewise.  If
>         both input operands are memory, use early clobber on output operand.
>         * config/i386/i386-expand.cc (split_double_concat): Deal with corner
>         cases where one input is memory and the other is not and the address
>         of the memory input uses a register we'd overwrite before loading
>         the memory into a register.
>
>         * gcc.target/i386/pr107627-1.c: New test.
>         * gcc.target/i386/pr107627-2.c: New test.

LGTM.

Thanks,
Uros.

>
> --- gcc/config/i386/i386.md.jj  2022-11-28 10:13:17.758656933 +0100
> +++ gcc/config/i386/i386.md     2022-11-30 12:11:55.724474793 +0100
> @@ -11396,11 +11396,12 @@ (define_insn "*xorqi_ext<mode>_1_cc"
>  ;; Split DST = (HI<<32)|LO early to minimize register usage.
>  (define_code_iterator any_or_plus [plus ior xor])
>  (define_insn_and_split "*concat<mode><dwi>3_1"
> -  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro")
> +  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro,r")
>         (any_or_plus:<DWI>
> -         (ashift:<DWI> (match_operand:<DWI> 1 "register_operand" "r")
> +         (ashift:<DWI> (match_operand:<DWI> 1 "register_operand" "r,r")
>                         (match_operand:<DWI> 2 "const_int_operand"))
> -         (zero_extend:<DWI> (match_operand:DWIH 3 "register_operand" "r"))))]
> +         (zero_extend:<DWI>
> +           (match_operand:DWIH 3 "nonimmediate_operand" "r,m"))))]
>    "INTVAL (operands[2]) == <MODE_SIZE> * BITS_PER_UNIT"
>    "#"
>    "&& reload_completed"
> @@ -11412,10 +11413,11 @@ (define_insn_and_split "*concat<mode><dw
>  })
>
>  (define_insn_and_split "*concat<mode><dwi>3_2"
> -  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro")
> +  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro,r")
>         (any_or_plus:<DWI>
> -         (zero_extend:<DWI> (match_operand:DWIH 1 "register_operand" "r"))
> -         (ashift:<DWI> (match_operand:<DWI> 2 "register_operand" "r")
> +         (zero_extend:<DWI>
> +           (match_operand:DWIH 1 "nonimmediate_operand" "r,m"))
> +         (ashift:<DWI> (match_operand:<DWI> 2 "register_operand" "r,r")
>                         (match_operand:<DWI> 3 "const_int_operand"))))]
>    "INTVAL (operands[3]) == <MODE_SIZE> * BITS_PER_UNIT"
>    "#"
> @@ -11428,12 +11430,14 @@ (define_insn_and_split "*concat<mode><dw
>  })
>
>  (define_insn_and_split "*concat<mode><dwi>3_3"
> -  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro")
> +  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro,r,r,&r")
>         (any_or_plus:<DWI>
>           (ashift:<DWI>
> -           (zero_extend:<DWI> (match_operand:DWIH 1 "register_operand" "r"))
> +           (zero_extend:<DWI>
> +             (match_operand:DWIH 1 "nonimmediate_operand" "r,m,r,m"))
>             (match_operand:<DWI> 2 "const_int_operand"))
> -         (zero_extend:<DWI> (match_operand:DWIH 3 "register_operand" "r"))))]
> +         (zero_extend:<DWI>
> +           (match_operand:DWIH 3 "nonimmediate_operand" "r,r,m,m"))))]
>    "INTVAL (operands[2]) == <MODE_SIZE> * BITS_PER_UNIT"
>    "#"
>    "&& reload_completed"
> @@ -11444,11 +11448,13 @@ (define_insn_and_split "*concat<mode><dw
>  })
>
>  (define_insn_and_split "*concat<mode><dwi>3_4"
> -  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro")
> +  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro,r,r,&r")
>         (any_or_plus:<DWI>
> -         (zero_extend:<DWI> (match_operand:DWIH 1 "register_operand" "r"))
> +         (zero_extend:<DWI>
> +           (match_operand:DWIH 1 "nonimmediate_operand" "r,m,r,m"))
>           (ashift:<DWI>
> -           (zero_extend:<DWI> (match_operand:DWIH 2 "register_operand" "r"))
> +           (zero_extend:<DWI>
> +             (match_operand:DWIH 2 "nonimmediate_operand" "r,r,m,m"))
>             (match_operand:<DWI> 3 "const_int_operand"))))]
>    "INTVAL (operands[3]) == <MODE_SIZE> * BITS_PER_UNIT"
>    "#"
> --- gcc/config/i386/i386-expand.cc.jj   2022-11-28 10:13:17.703657740 +0100
> +++ gcc/config/i386/i386-expand.cc      2022-11-30 13:27:44.851737861 +0100
> @@ -173,6 +173,33 @@ split_double_concat (machine_mode mode,
>    rtx dlo, dhi;
>    int deleted_move_count = 0;
>    split_double_mode (mode, &dst, 1, &dlo, &dhi);
> +  /* Constraints ensure that if both lo and hi are MEMs, then
> +     dst has early-clobber and thus addresses of MEMs don't use
> +     dlo/dhi registers.  Otherwise if at least one of li and hi are MEMs,
> +     dlo/dhi are registers.  */
> +  if (MEM_P (lo)
> +      && rtx_equal_p (dlo, hi)
> +      && reg_overlap_mentioned_p (dhi, lo))
> +    {
> +      /* If dlo is same as hi and lo's address uses dhi register,
> +        code below would first emit_move_insn (dhi, hi)
> +        and then emit_move_insn (dlo, lo).  But the former
> +        would invalidate lo's address.  Load into dhi first,
> +        then swap.  */
> +      emit_move_insn (dhi, lo);
> +      lo = dhi;
> +    }
> +  else if (MEM_P (hi)
> +          && !MEM_P (lo)
> +          && !rtx_equal_p (dlo, lo)
> +          && reg_overlap_mentioned_p (dlo, hi))
> +    {
> +      /* In this case, code below would first emit_move_insn (dlo, lo)
> +        and then emit_move_insn (dhi, hi).  But the former would
> +        invalidate hi's address.  Load into dhi first.  */
> +      emit_move_insn (dhi, hi);
> +      hi = dhi;
> +    }
>    if (!rtx_equal_p (dlo, hi))
>      {
>        if (!rtx_equal_p (dlo, lo))
> --- gcc/testsuite/gcc.target/i386/pr107627-1.c.jj       2022-11-30 13:52:11.654818924 +0100
> +++ gcc/testsuite/gcc.target/i386/pr107627-1.c  2022-11-30 13:53:40.288496872 +0100
> @@ -0,0 +1,22 @@
> +/* PR target/107627 */
> +/* { dg-do compile { target int128 } } */
> +/* { dg-options "-O2 -masm=att" } */
> +/* { dg-final { scan-assembler-not "\torq\t" } } */
> +
> +static inline unsigned __int128
> +foo (unsigned long long x, unsigned long long y)
> +{
> +  return ((unsigned __int128) x << 64) | y;
> +}
> +
> +static inline unsigned long long
> +bar (unsigned long long x, unsigned long long y, unsigned z)
> +{
> +  return foo (x, y) >> (z % 64);
> +}
> +
> +void
> +baz (unsigned long long *x, const unsigned long long *y, unsigned z)
> +{
> +  x[0] = bar (y[0], y[1], z);
> +}
> --- gcc/testsuite/gcc.target/i386/pr107627-2.c.jj       2022-11-30 13:52:14.890770658 +0100
> +++ gcc/testsuite/gcc.target/i386/pr107627-2.c  2022-11-30 13:53:32.863607618 +0100
> @@ -0,0 +1,22 @@
> +/* PR target/107627 */
> +/* { dg-do compile { target ia32 } } */
> +/* { dg-options "-O2 -masm=att" } */
> +/* { dg-final { scan-assembler-not "\torl\t" } } */
> +
> +static inline unsigned long long
> +qux (unsigned int x, unsigned int y)
> +{
> +  return ((unsigned long long) x << 32) | y;
> +}
> +
> +static inline unsigned int
> +corge (unsigned int x, unsigned int y, unsigned z)
> +{
> +  return qux (x, y) >> (z % 32);
> +}
> +
> +void
> +garply (unsigned int *x, const unsigned int *y, unsigned z)
> +{
> +  x[0] = corge (y[0], y[1], z);
> +}
>
>         Jakub
>

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH] i386: Add *concat<mode><dwi>3_{5,6,7} patterns [PR107627]
  2022-12-01  8:09 [PATCH] i386: Improve *concat<mode><dwi>3_{1,2,3,4} patterns [PR107627] Jakub Jelinek
  2022-12-01  8:23 ` Uros Bizjak
@ 2022-12-08  9:48 ` Jakub Jelinek
  2022-12-08 10:51   ` Uros Bizjak
  1 sibling, 1 reply; 4+ messages in thread
From: Jakub Jelinek @ 2022-12-08  9:48 UTC (permalink / raw)
  To: Uros Bizjak, Roger Sayle; +Cc: gcc-patches

Hi!

On Thu, Dec 01, 2022 at 09:09:51AM +0100, Jakub Jelinek via Gcc-patches wrote:
> BTW, I wonder if we couldn't add additional patterns which would catch
> the case where one of the operands is constant.

The following patch does add those.
The difference with the patch on the 2 testcases is:
 baz:
-	movq	8(%rsi), %rax
+	movq	8(%rsi), %rsi
+	movq	%rdi, %r8
 	movl	%edx, %ecx
-	xorl	%r8d, %r8d
-	xorl	%edx, %edx
-	movabsq	$-2401053089206453570, %r9
-	orq	%r8, %rax
-	orq	%r9, %rdx
-	shrdq	%rdx, %rax
-	movq	%rax, (%rdi)
+	movabsq	$-2401053089206453570, %rdi
+	movq	%rsi, %rax
+	shrdq	%rdi, %rax
+	movq	%rax, (%r8)
 qux:
-	movq	(%rsi), %rax
+	movq	%rdi, %r8
+	movq	(%rsi), %rdi
 	movl	%edx, %ecx
-	xorl	%r9d, %r9d
-	movabsq	$-2401053089206453570, %r8
-	movq	%rax, %rdx
-	xorl	%eax, %eax
-	orq	%r8, %rax
-	orq	%r9, %rdx
-	shrdq	%rdx, %rax
-	movq	%rax, (%rdi)
+	movabsq	$-2401053089206453570, %rsi
+	movq	%rsi, %rax
+	shrdq	%rdi, %rax
+	movq	%rax, (%r8)
and
 garply:
 	pushl	%esi
-	xorl	%edx, %edx
+	movl	$-559038737, %esi
 	pushl	%ebx
 	movl	16(%esp), %eax
-	orl	$-559038737, %edx
 	movl	20(%esp), %ecx
-	movl	4(%eax), %eax
-	shrdl	%edx, %eax
 	movl	12(%esp), %edx
+	movl	4(%eax), %ebx
+	movl	%ebx, %eax
+	shrdl	%esi, %eax
 fred:
...
 	movl	16(%esp), %eax
+	movl	$-889275714, %ebx
 	movl	20(%esp), %ecx
-	movl	(%eax), %eax
-	movl	%eax, %edx
-	movl	$0, %eax
-	orl	$-889275714, %eax
-	shrdl	%edx, %eax
 	movl	12(%esp), %edx
+	movl	(%eax), %esi
+	movl	%ebx, %eax
+	shrdl	%esi, %eax

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2022-12-08  Jakub Jelinek  <jakub@redhat.com>

	PR target/107627
	* config/i386/i386.md (HALF, half): New mode attributes.
	(*concat<half><mode>3_5, *concat<mode><dwi>3_6,
	*concat<mode><dwi>3_7): New define_insn_and_split patterns.

	* gcc.target/i386/pr107627-3.c: New test.
	* gcc.target/i386/pr107627-4.c: New test.

--- gcc/config/i386/i386.md.jj	2022-12-06 12:16:31.059905487 +0100
+++ gcc/config/i386/i386.md	2022-12-07 15:11:55.297914206 +0100
@@ -1134,6 +1134,10 @@ (define_mode_attr MODE_SIZE [(QI "1") (H
 (define_mode_attr DWI [(QI "HI") (HI "SI") (SI "DI") (DI "TI") (TI "OI")])
 (define_mode_attr dwi [(QI "hi") (HI "si") (SI "di") (DI "ti") (TI "oi")])
 
+;; Half sized integer modes.
+(define_mode_attr HALF [(TI "DI") (DI "SI")])
+(define_mode_attr half [(TI "di") (DI "si")])
+
 ;; LEA mode corresponding to an integer mode
 (define_mode_attr LEAMODE [(QI "SI") (HI "SI") (SI "SI") (DI "DI")])
 
@@ -11464,6 +11468,80 @@ (define_insn_and_split "*concat<mode><dw
   split_double_concat (<DWI>mode, operands[0], operands[1], operands[2]);
   DONE;
 })
+
+(define_insn_and_split "*concat<half><mode>3_5"
+  [(set (match_operand:DWI 0 "nonimmediate_operand" "=ro")
+	(any_or_plus:DWI
+	  (ashift:DWI (match_operand:DWI 1 "register_operand" "r")
+		      (match_operand:DWI 2 "const_int_operand"))
+	  (match_operand:DWI 3 "const_scalar_int_operand")))]
+  "INTVAL (operands[2]) == <MODE_SIZE> * BITS_PER_UNIT / 2
+   && (<MODE>mode == DImode
+       ? CONST_INT_P (operands[3])
+	 && (UINTVAL (operands[3]) & ~GET_MODE_MASK (SImode)) == 0
+       : CONST_INT_P (operands[3])
+       ? INTVAL (operands[3]) >= 0
+       : CONST_WIDE_INT_NUNITS (operands[3]) == 2
+	 && CONST_WIDE_INT_ELT (operands[3], 1) == 0)"
+  "#"
+  "&& reload_completed"
+  [(clobber (const_int 0))]
+{
+  rtx op3 = simplify_subreg (<HALF>mode, operands[3], <MODE>mode, 0);
+  split_double_concat (<MODE>mode, operands[0], op3,
+		       gen_lowpart (<HALF>mode, operands[1]));
+  DONE;
+})
+
+(define_insn_and_split "*concat<mode><dwi>3_6"
+  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro,r")
+	(any_or_plus:<DWI>
+	  (ashift:<DWI>
+	    (zero_extend:<DWI>
+	      (match_operand:DWIH 1 "nonimmediate_operand" "r,m"))
+	    (match_operand:<DWI> 2 "const_int_operand"))
+	  (match_operand:<DWI> 3 "const_scalar_int_operand")))]
+  "INTVAL (operands[2]) == <MODE_SIZE> * BITS_PER_UNIT
+   && (<DWI>mode == DImode
+       ? CONST_INT_P (operands[3])
+	 && (UINTVAL (operands[3]) & ~GET_MODE_MASK (SImode)) == 0
+       : CONST_INT_P (operands[3])
+       ? INTVAL (operands[3]) >= 0
+       : CONST_WIDE_INT_NUNITS (operands[3]) == 2
+	 && CONST_WIDE_INT_ELT (operands[3], 1) == 0)"
+  "#"
+  "&& reload_completed"
+  [(clobber (const_int 0))]
+{
+  rtx op3 = simplify_subreg (<MODE>mode, operands[3], <DWI>mode, 0);
+  split_double_concat (<DWI>mode, operands[0], op3, operands[1]);
+  DONE;
+})
+
+(define_insn_and_split "*concat<mode><dwi>3_7"
+  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro,r")
+	(any_or_plus:<DWI>
+	  (zero_extend:<DWI>
+	    (match_operand:DWIH 1 "nonimmediate_operand" "r,m"))
+	  (match_operand:<DWI> 2 "const_scalar_int_operand")))]
+  "<DWI>mode == DImode
+   ? CONST_INT_P (operands[2])
+     && (UINTVAL (operands[2]) & GET_MODE_MASK (SImode)) == 0
+   : CONST_WIDE_INT_P (operands[2])
+     && CONST_WIDE_INT_NUNITS (operands[2]) == 2
+     && CONST_WIDE_INT_ELT (operands[2], 0) == 0"
+  "#"
+  "&& reload_completed"
+  [(clobber (const_int 0))]
+{
+  rtx op2;
+  if (<DWI>mode == DImode)
+    op2 = gen_int_mode (INTVAL (operands[2]) >> 32, <MODE>mode);
+  else
+    op2 = gen_int_mode (CONST_WIDE_INT_ELT (operands[2], 1), <MODE>mode);
+  split_double_concat (<DWI>mode, operands[0], operands[1], op2);
+  DONE;
+})
 \f
 ;; Negation instructions
 
--- gcc/testsuite/gcc.target/i386/pr107627-3.c.jj	2022-12-07 15:27:56.214095258 +0100
+++ gcc/testsuite/gcc.target/i386/pr107627-3.c	2022-12-07 13:22:44.728247774 +0100
@@ -0,0 +1,28 @@
+/* PR target/107627 */
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -masm=att" } */
+/* { dg-final { scan-assembler-not "\torq\t" } } */
+
+static inline unsigned __int128
+foo (unsigned long long x, unsigned long long y)
+{
+  return ((unsigned __int128) x << 64) | y;
+}
+
+static inline unsigned long long
+bar (unsigned long long x, unsigned long long y, unsigned z)
+{
+  return foo (x, y) >> (z % 64);
+}
+
+void
+baz (unsigned long long *x, const unsigned long long *y, unsigned z)
+{
+  x[0] = bar (0xdeadbeefcafebabeULL, y[1], z);
+}
+
+void
+qux (unsigned long long *x, const unsigned long long *y, unsigned z)
+{
+  x[0] = bar (y[0], 0xdeadbeefcafebabeULL, z);
+}
--- gcc/testsuite/gcc.target/i386/pr107627-4.c.jj	2022-12-07 15:27:59.519047759 +0100
+++ gcc/testsuite/gcc.target/i386/pr107627-4.c	2022-12-07 13:23:28.050621778 +0100
@@ -0,0 +1,28 @@
+/* PR target/107627 */
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-O2 -masm=att" } */
+/* { dg-final { scan-assembler-not "\torl\t" } } */
+
+static inline unsigned long long
+qux (unsigned int x, unsigned int y)
+{
+  return ((unsigned long long) x << 32) | y;
+}
+
+static inline unsigned int
+corge (unsigned int x, unsigned int y, unsigned z)
+{
+  return qux (x, y) >> (z % 32);
+}
+
+void
+garply (unsigned int *x, const unsigned int *y, unsigned z)
+{
+  x[0] = corge (0xdeadbeefU, y[1], z);
+}
+
+void
+fred (unsigned int *x, const unsigned int *y, unsigned z)
+{
+  x[0] = corge (y[0], 0xcafebabeU, z);
+}


	Jakub


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] i386: Add *concat<mode><dwi>3_{5,6,7} patterns [PR107627]
  2022-12-08  9:48 ` [PATCH] i386: Add *concat<mode><dwi>3_{5,6,7} " Jakub Jelinek
@ 2022-12-08 10:51   ` Uros Bizjak
  0 siblings, 0 replies; 4+ messages in thread
From: Uros Bizjak @ 2022-12-08 10:51 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Roger Sayle, gcc-patches

On Thu, Dec 8, 2022 at 10:49 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> Hi!
>
> On Thu, Dec 01, 2022 at 09:09:51AM +0100, Jakub Jelinek via Gcc-patches wrote:
> > BTW, I wonder if we couldn't add additional patterns which would catch
> > the case where one of the operands is constant.
>
> The following patch does add those.
> The difference with the patch on the 2 testcases is:
>  baz:
> -       movq    8(%rsi), %rax
> +       movq    8(%rsi), %rsi
> +       movq    %rdi, %r8
>         movl    %edx, %ecx
> -       xorl    %r8d, %r8d
> -       xorl    %edx, %edx
> -       movabsq $-2401053089206453570, %r9
> -       orq     %r8, %rax
> -       orq     %r9, %rdx
> -       shrdq   %rdx, %rax
> -       movq    %rax, (%rdi)
> +       movabsq $-2401053089206453570, %rdi
> +       movq    %rsi, %rax
> +       shrdq   %rdi, %rax
> +       movq    %rax, (%r8)
>  qux:
> -       movq    (%rsi), %rax
> +       movq    %rdi, %r8
> +       movq    (%rsi), %rdi
>         movl    %edx, %ecx
> -       xorl    %r9d, %r9d
> -       movabsq $-2401053089206453570, %r8
> -       movq    %rax, %rdx
> -       xorl    %eax, %eax
> -       orq     %r8, %rax
> -       orq     %r9, %rdx
> -       shrdq   %rdx, %rax
> -       movq    %rax, (%rdi)
> +       movabsq $-2401053089206453570, %rsi
> +       movq    %rsi, %rax
> +       shrdq   %rdi, %rax
> +       movq    %rax, (%r8)
> and
>  garply:
>         pushl   %esi
> -       xorl    %edx, %edx
> +       movl    $-559038737, %esi
>         pushl   %ebx
>         movl    16(%esp), %eax
> -       orl     $-559038737, %edx
>         movl    20(%esp), %ecx
> -       movl    4(%eax), %eax
> -       shrdl   %edx, %eax
>         movl    12(%esp), %edx
> +       movl    4(%eax), %ebx
> +       movl    %ebx, %eax
> +       shrdl   %esi, %eax
>  fred:
> ...
>         movl    16(%esp), %eax
> +       movl    $-889275714, %ebx
>         movl    20(%esp), %ecx
> -       movl    (%eax), %eax
> -       movl    %eax, %edx
> -       movl    $0, %eax
> -       orl     $-889275714, %eax
> -       shrdl   %edx, %eax
>         movl    12(%esp), %edx
> +       movl    (%eax), %esi
> +       movl    %ebx, %eax
> +       shrdl   %esi, %eax
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2022-12-08  Jakub Jelinek  <jakub@redhat.com>
>
>         PR target/107627
>         * config/i386/i386.md (HALF, half): New mode attributes.
>         (*concat<half><mode>3_5, *concat<mode><dwi>3_6,
>         *concat<mode><dwi>3_7): New define_insn_and_split patterns.
>
>         * gcc.target/i386/pr107627-3.c: New test.
>         * gcc.target/i386/pr107627-4.c: New test.

LGTM.

Thanks,
Uros.

> --- gcc/config/i386/i386.md.jj  2022-12-06 12:16:31.059905487 +0100
> +++ gcc/config/i386/i386.md     2022-12-07 15:11:55.297914206 +0100
> @@ -1134,6 +1134,10 @@ (define_mode_attr MODE_SIZE [(QI "1") (H
>  (define_mode_attr DWI [(QI "HI") (HI "SI") (SI "DI") (DI "TI") (TI "OI")])
>  (define_mode_attr dwi [(QI "hi") (HI "si") (SI "di") (DI "ti") (TI "oi")])
>
> +;; Half sized integer modes.
> +(define_mode_attr HALF [(TI "DI") (DI "SI")])
> +(define_mode_attr half [(TI "di") (DI "si")])
> +
>  ;; LEA mode corresponding to an integer mode
>  (define_mode_attr LEAMODE [(QI "SI") (HI "SI") (SI "SI") (DI "DI")])
>
> @@ -11464,6 +11468,80 @@ (define_insn_and_split "*concat<mode><dw
>    split_double_concat (<DWI>mode, operands[0], operands[1], operands[2]);
>    DONE;
>  })
> +
> +(define_insn_and_split "*concat<half><mode>3_5"
> +  [(set (match_operand:DWI 0 "nonimmediate_operand" "=ro")
> +       (any_or_plus:DWI
> +         (ashift:DWI (match_operand:DWI 1 "register_operand" "r")
> +                     (match_operand:DWI 2 "const_int_operand"))
> +         (match_operand:DWI 3 "const_scalar_int_operand")))]
> +  "INTVAL (operands[2]) == <MODE_SIZE> * BITS_PER_UNIT / 2
> +   && (<MODE>mode == DImode
> +       ? CONST_INT_P (operands[3])
> +        && (UINTVAL (operands[3]) & ~GET_MODE_MASK (SImode)) == 0
> +       : CONST_INT_P (operands[3])
> +       ? INTVAL (operands[3]) >= 0
> +       : CONST_WIDE_INT_NUNITS (operands[3]) == 2
> +        && CONST_WIDE_INT_ELT (operands[3], 1) == 0)"
> +  "#"
> +  "&& reload_completed"
> +  [(clobber (const_int 0))]
> +{
> +  rtx op3 = simplify_subreg (<HALF>mode, operands[3], <MODE>mode, 0);
> +  split_double_concat (<MODE>mode, operands[0], op3,
> +                      gen_lowpart (<HALF>mode, operands[1]));
> +  DONE;
> +})
> +
> +(define_insn_and_split "*concat<mode><dwi>3_6"
> +  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro,r")
> +       (any_or_plus:<DWI>
> +         (ashift:<DWI>
> +           (zero_extend:<DWI>
> +             (match_operand:DWIH 1 "nonimmediate_operand" "r,m"))
> +           (match_operand:<DWI> 2 "const_int_operand"))
> +         (match_operand:<DWI> 3 "const_scalar_int_operand")))]
> +  "INTVAL (operands[2]) == <MODE_SIZE> * BITS_PER_UNIT
> +   && (<DWI>mode == DImode
> +       ? CONST_INT_P (operands[3])
> +        && (UINTVAL (operands[3]) & ~GET_MODE_MASK (SImode)) == 0
> +       : CONST_INT_P (operands[3])
> +       ? INTVAL (operands[3]) >= 0
> +       : CONST_WIDE_INT_NUNITS (operands[3]) == 2
> +        && CONST_WIDE_INT_ELT (operands[3], 1) == 0)"
> +  "#"
> +  "&& reload_completed"
> +  [(clobber (const_int 0))]
> +{
> +  rtx op3 = simplify_subreg (<MODE>mode, operands[3], <DWI>mode, 0);
> +  split_double_concat (<DWI>mode, operands[0], op3, operands[1]);
> +  DONE;
> +})
> +
> +(define_insn_and_split "*concat<mode><dwi>3_7"
> +  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro,r")
> +       (any_or_plus:<DWI>
> +         (zero_extend:<DWI>
> +           (match_operand:DWIH 1 "nonimmediate_operand" "r,m"))
> +         (match_operand:<DWI> 2 "const_scalar_int_operand")))]
> +  "<DWI>mode == DImode
> +   ? CONST_INT_P (operands[2])
> +     && (UINTVAL (operands[2]) & GET_MODE_MASK (SImode)) == 0
> +   : CONST_WIDE_INT_P (operands[2])
> +     && CONST_WIDE_INT_NUNITS (operands[2]) == 2
> +     && CONST_WIDE_INT_ELT (operands[2], 0) == 0"
> +  "#"
> +  "&& reload_completed"
> +  [(clobber (const_int 0))]
> +{
> +  rtx op2;
> +  if (<DWI>mode == DImode)
> +    op2 = gen_int_mode (INTVAL (operands[2]) >> 32, <MODE>mode);
> +  else
> +    op2 = gen_int_mode (CONST_WIDE_INT_ELT (operands[2], 1), <MODE>mode);
> +  split_double_concat (<DWI>mode, operands[0], operands[1], op2);
> +  DONE;
> +})
>
>  ;; Negation instructions
>
> --- gcc/testsuite/gcc.target/i386/pr107627-3.c.jj       2022-12-07 15:27:56.214095258 +0100
> +++ gcc/testsuite/gcc.target/i386/pr107627-3.c  2022-12-07 13:22:44.728247774 +0100
> @@ -0,0 +1,28 @@
> +/* PR target/107627 */
> +/* { dg-do compile { target int128 } } */
> +/* { dg-options "-O2 -masm=att" } */
> +/* { dg-final { scan-assembler-not "\torq\t" } } */
> +
> +static inline unsigned __int128
> +foo (unsigned long long x, unsigned long long y)
> +{
> +  return ((unsigned __int128) x << 64) | y;
> +}
> +
> +static inline unsigned long long
> +bar (unsigned long long x, unsigned long long y, unsigned z)
> +{
> +  return foo (x, y) >> (z % 64);
> +}
> +
> +void
> +baz (unsigned long long *x, const unsigned long long *y, unsigned z)
> +{
> +  x[0] = bar (0xdeadbeefcafebabeULL, y[1], z);
> +}
> +
> +void
> +qux (unsigned long long *x, const unsigned long long *y, unsigned z)
> +{
> +  x[0] = bar (y[0], 0xdeadbeefcafebabeULL, z);
> +}
> --- gcc/testsuite/gcc.target/i386/pr107627-4.c.jj       2022-12-07 15:27:59.519047759 +0100
> +++ gcc/testsuite/gcc.target/i386/pr107627-4.c  2022-12-07 13:23:28.050621778 +0100
> @@ -0,0 +1,28 @@
> +/* PR target/107627 */
> +/* { dg-do compile { target ia32 } } */
> +/* { dg-options "-O2 -masm=att" } */
> +/* { dg-final { scan-assembler-not "\torl\t" } } */
> +
> +static inline unsigned long long
> +qux (unsigned int x, unsigned int y)
> +{
> +  return ((unsigned long long) x << 32) | y;
> +}
> +
> +static inline unsigned int
> +corge (unsigned int x, unsigned int y, unsigned z)
> +{
> +  return qux (x, y) >> (z % 32);
> +}
> +
> +void
> +garply (unsigned int *x, const unsigned int *y, unsigned z)
> +{
> +  x[0] = corge (0xdeadbeefU, y[1], z);
> +}
> +
> +void
> +fred (unsigned int *x, const unsigned int *y, unsigned z)
> +{
> +  x[0] = corge (y[0], 0xcafebabeU, z);
> +}
>
>
>         Jakub
>

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2022-12-08 10:52 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-12-01  8:09 [PATCH] i386: Improve *concat<mode><dwi>3_{1,2,3,4} patterns [PR107627] Jakub Jelinek
2022-12-01  8:23 ` Uros Bizjak
2022-12-08  9:48 ` [PATCH] i386: Add *concat<mode><dwi>3_{5,6,7} " Jakub Jelinek
2022-12-08 10:51   ` Uros Bizjak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).