[PATCH] x86: PR target/103611: Splitter for DST:DI = (HI:SI<<32)|LO:SI.

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PATCH] x86: PR target/103611: Splitter for DST:DI = (HI:SI<<32)|LO:SI.
@ 2021-12-13 14:10 Roger Sayle
  2021-12-14 10:34 ` Richard Sandiford
  2021-12-15  9:01 ` Uros Bizjak
  0 siblings, 2 replies; 3+ messages in thread
From: Roger Sayle @ 2021-12-13 14:10 UTC (permalink / raw)
  To: 'GCC Patches'

[-- Attachment #1: Type: text/plain, Size: 2786 bytes --]


A common idiom is to create a DImode value from the "concat" of two SImode
values, using "(long long)hi << 32 | (long long)lo", where the operation
may be ior, xor or plus.  On x86, with -m32, the high and low parts of
a DImode register are actually different SImode registers (typically %edx
and %eax) so ideally this idiom should reduce to two move instructions
(or optimally, just clever register allocation).

Unfortunately, GCC currently performs the IOR operation above on -m32,
and worse allocates DImode registers (split to SImode register pairs)
for both the zero extended HI and LO values.

Hence, for test1 from the new test case below:

typedef int __v4si __attribute__ ((__vector_size__ (16)));
long long test1(__v4si v) {
  unsigned int loVal = (unsigned int)v[0];
  unsigned int hiVal = (unsigned int)v[1];
  return (long long)(loVal) | ((long long)(hiVal) << 32);
}

we currently generate (with -m32 -O2 -msse4.1):

test1:  subl    $28, %esp
        pextrd  $1, %xmm0, %eax
        pmovzxdq        %xmm0, %xmm1
        movq    %xmm1, 8(%esp)
        movl    %eax, %edx
        movl    8(%esp), %eax
        orl     12(%esp), %edx
        addl    $28, %esp
        orb     $0, %ah
        ret

with this patch we now generate:

test1:  pextrd  $1, %xmm0, %edx
        movd    %xmm0, %eax
        ret

The fix is to recognize and split the idiom (hi<<32)|zext(lo) prior
to register allocation on !TARGET_64BIT, simplifying this sequence to
"highpart(dst) = hi; lowpart(dst) = lo".

The one minor complication is that sse.md's define_insn for
*vec_extractv4si_0_zext_sse4 can sometimes interfere with this
optimization.  It turns out that on !TARGET_64BIT, the zero_extend:DI
following vec_select:SI isn't free, and this insn gets split back
into multiple instructions during later passes, but too late to
be optimized away by this patch/reload.  Hence the last hunk of
this patch is to restrict *vec_extractv4si_0_zext_sse4 to TARGET_64BIT.
Checking PR target/80286, where *vec_extractv4si_0_zext_sse4 was
first added, this seems reasonable (but this patch has been tested
both with and without this last change, if it's consider controversial).

This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without "--target_board='unix{-m32}'"
with no new failures.  OK for mainline?


2021-12-13  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
	PR target/103611
	* config/i386/i386.md (any_or_plus): New code iterator.
	(define_split): Split (HI<<32)|zext(LO) into piece-wise
	move instructions on !TARGET_64BIT.
	* config/i386/sse.md (*vec_extractv4si_0_zext_sse4):
	Restrict to TARGET_64BIT.

gcc/testsuite/ChangeLog
	PR target/103611
	* gcc.target/i386/pr103611-2.c: New test case.


Thanks in advance,
Roger
--


[-- Attachment #2: patchq23b.txt --]
[-- Type: text/plain, Size: 3500 bytes --]

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 9d7d116..8ecf169 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -10620,6 +10620,38 @@
   [(set_attr "isa" "*,nox64")
    (set_attr "type" "alu")
    (set_attr "mode" "QI")])
+
+;; Split DST = (HI<<32)|LO early to minimize register usage.
+(define_code_iterator any_or_plus [plus ior xor])
+(define_split
+  [(set (match_operand:DI 0 "register_operand")
+	(any_or_plus:DI
+	  (ashift:DI (match_operand:DI 1 "register_operand")
+		     (const_int 32))
+	  (zero_extend:DI (match_operand:SI 2 "register_operand"))))]
+  "!TARGET_64BIT"
+  [(set (match_dup 3) (match_dup 4))
+   (set (match_dup 5) (match_dup 2))]
+{
+  operands[3] = gen_highpart (SImode, operands[0]);
+  operands[4] = gen_lowpart (SImode, operands[1]);
+  operands[5] = gen_lowpart (SImode, operands[0]);
+})
+
+(define_split
+  [(set (match_operand:DI 0 "register_operand")
+	(any_or_plus:DI
+	  (zero_extend:DI (match_operand:SI 1 "register_operand"))
+	  (ashift:DI (match_operand:DI 2 "register_operand")
+		     (const_int 32))))]
+  "!TARGET_64BIT"
+  [(set (match_dup 3) (match_dup 4))
+   (set (match_dup 5) (match_dup 1))]
+{
+  operands[3] = gen_highpart (SImode, operands[0]);
+  operands[4] = gen_lowpart (SImode, operands[2]);
+  operands[5] = gen_lowpart (SImode, operands[0]);
+})
 \f
 ;; Negation instructions
 
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 5421fb5..fba0250 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -18700,7 +18700,7 @@
 	  (vec_select:SI
 	    (match_operand:V4SI 1 "register_operand" "v,x,v")
 	    (parallel [(const_int 0)]))))]
-  "TARGET_SSE4_1"
+  "TARGET_64BIT && TARGET_SSE4_1"
   "#"
   [(set_attr "isa" "x64,*,avx512f")
    (set (attr "preferred_for_speed")
diff --git a/gcc/testsuite/gcc.target/i386/pr103611-2.c b/gcc/testsuite/gcc.target/i386/pr103611-2.c
new file mode 100644
index 0000000..1555e99
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103611-2.c
@@ -0,0 +1,43 @@
+/* { dg-do compile } */
+/* { dg-options "-m32 -O2 -msse4" } */
+typedef int __v4si __attribute__ ((__vector_size__ (16)));
+
+long long test1(__v4si v) {
+  unsigned int loVal = (unsigned int)v[0];
+  unsigned int hiVal = (unsigned int)v[1];
+  return (long long)(loVal) | ((long long)(hiVal) << 32);
+}
+
+long long test2(__v4si v) {
+  unsigned int loVal = (unsigned int)v[2];
+  unsigned int hiVal = (unsigned int)v[3];
+  return (long long)(loVal) | ((long long)(hiVal) << 32);
+}
+
+long long test3(__v4si v) {
+  unsigned int loVal = (unsigned int)v[0];
+  unsigned int hiVal = (unsigned int)v[1];
+  return (long long)(loVal) ^ ((long long)(hiVal) << 32);
+}
+
+long long test4(__v4si v) {
+  unsigned int loVal = (unsigned int)v[2];
+  unsigned int hiVal = (unsigned int)v[3];
+  return (long long)(loVal) ^ ((long long)(hiVal) << 32);
+}
+
+long long test5(__v4si v) {
+  unsigned int loVal = (unsigned int)v[0];
+  unsigned int hiVal = (unsigned int)v[1];
+  return (long long)(loVal) + ((long long)(hiVal) << 32);
+}
+
+long long test6(__v4si v) {
+  unsigned int loVal = (unsigned int)v[2];
+  unsigned int hiVal = (unsigned int)v[3];
+  return (long long)(loVal) + ((long long)(hiVal) << 32);
+}
+
+/* { dg-final { scan-assembler-not "\tor" } } */
+/* { dg-final { scan-assembler-not "\txor" } } */
+/* { dg-final { scan-assembler-not "\tadd" } } */

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] x86: PR target/103611: Splitter for DST:DI = (HI:SI<<32)|LO:SI.
  2021-12-13 14:10 [PATCH] x86: PR target/103611: Splitter for DST:DI = (HI:SI<<32)|LO:SI Roger Sayle
@ 2021-12-14 10:34 ` Richard Sandiford
  2021-12-15  9:01 ` Uros Bizjak
  1 sibling, 0 replies; 3+ messages in thread
From: Richard Sandiford @ 2021-12-14 10:34 UTC (permalink / raw)
  To: Roger Sayle; +Cc: 'GCC Patches'

"Roger Sayle" <roger@nextmovesoftware.com> writes:
> A common idiom is to create a DImode value from the "concat" of two SImode
> values, using "(long long)hi << 32 | (long long)lo", where the operation
> may be ior, xor or plus.  On x86, with -m32, the high and low parts of
> a DImode register are actually different SImode registers (typically %edx
> and %eax) so ideally this idiom should reduce to two move instructions
> (or optimally, just clever register allocation).
>
> Unfortunately, GCC currently performs the IOR operation above on -m32,
> and worse allocates DImode registers (split to SImode register pairs)
> for both the zero extended HI and LO values.
>
> Hence, for test1 from the new test case below:
>
> typedef int __v4si __attribute__ ((__vector_size__ (16)));
> long long test1(__v4si v) {
>   unsigned int loVal = (unsigned int)v[0];
>   unsigned int hiVal = (unsigned int)v[1];
>   return (long long)(loVal) | ((long long)(hiVal) << 32);
> }
>
> we currently generate (with -m32 -O2 -msse4.1):
>
> test1:  subl    $28, %esp
>         pextrd  $1, %xmm0, %eax
>         pmovzxdq        %xmm0, %xmm1
>         movq    %xmm1, 8(%esp)
>         movl    %eax, %edx
>         movl    8(%esp), %eax
>         orl     12(%esp), %edx
>         addl    $28, %esp
>         orb     $0, %ah
>         ret
>
> with this patch we now generate:
>
> test1:  pextrd  $1, %xmm0, %edx
>         movd    %xmm0, %eax
>         ret
>
> The fix is to recognize and split the idiom (hi<<32)|zext(lo) prior
> to register allocation on !TARGET_64BIT, simplifying this sequence to
> "highpart(dst) = hi; lowpart(dst) = lo".
>
> The one minor complication is that sse.md's define_insn for
> *vec_extractv4si_0_zext_sse4 can sometimes interfere with this
> optimization.  It turns out that on !TARGET_64BIT, the zero_extend:DI
> following vec_select:SI isn't free, and this insn gets split back
> into multiple instructions during later passes, but too late to
> be optimized away by this patch/reload.  Hence the last hunk of
> this patch is to restrict *vec_extractv4si_0_zext_sse4 to TARGET_64BIT.
> Checking PR target/80286, where *vec_extractv4si_0_zext_sse4 was
> first added, this seems reasonable (but this patch has been tested
> both with and without this last change, if it's consider controversial).
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check, both with and without "--target_board='unix{-m32}'"
> with no new failures.  OK for mainline?

To play devil's advocate: does this optimisation belong in
target-specific code?  It feels pretty generic and could be keyed
off BITS_PER_WORD.

Thanks,
Richard

>
>
> 2021-12-13  Roger Sayle  <roger@nextmovesoftware.com>
>
> gcc/ChangeLog
> 	PR target/103611
> 	* config/i386/i386.md (any_or_plus): New code iterator.
> 	(define_split): Split (HI<<32)|zext(LO) into piece-wise
> 	move instructions on !TARGET_64BIT.
> 	* config/i386/sse.md (*vec_extractv4si_0_zext_sse4):
> 	Restrict to TARGET_64BIT.
>
> gcc/testsuite/ChangeLog
> 	PR target/103611
> 	* gcc.target/i386/pr103611-2.c: New test case.
>
>
> Thanks in advance,
> Roger
> --

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] x86: PR target/103611: Splitter for DST:DI = (HI:SI<<32)|LO:SI.
  2021-12-13 14:10 [PATCH] x86: PR target/103611: Splitter for DST:DI = (HI:SI<<32)|LO:SI Roger Sayle
  2021-12-14 10:34 ` Richard Sandiford
@ 2021-12-15  9:01 ` Uros Bizjak
  1 sibling, 0 replies; 3+ messages in thread
From: Uros Bizjak @ 2021-12-15  9:01 UTC (permalink / raw)
  To: Roger Sayle; +Cc: GCC Patches

On Mon, Dec 13, 2021 at 3:10 PM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> A common idiom is to create a DImode value from the "concat" of two SImode
> values, using "(long long)hi << 32 | (long long)lo", where the operation
> may be ior, xor or plus.  On x86, with -m32, the high and low parts of
> a DImode register are actually different SImode registers (typically %edx
> and %eax) so ideally this idiom should reduce to two move instructions
> (or optimally, just clever register allocation).
>
> Unfortunately, GCC currently performs the IOR operation above on -m32,
> and worse allocates DImode registers (split to SImode register pairs)
> for both the zero extended HI and LO values.
>
> Hence, for test1 from the new test case below:
>
> typedef int __v4si __attribute__ ((__vector_size__ (16)));
> long long test1(__v4si v) {
>   unsigned int loVal = (unsigned int)v[0];
>   unsigned int hiVal = (unsigned int)v[1];
>   return (long long)(loVal) | ((long long)(hiVal) << 32);
> }
>
> we currently generate (with -m32 -O2 -msse4.1):
>
> test1:  subl    $28, %esp
>         pextrd  $1, %xmm0, %eax
>         pmovzxdq        %xmm0, %xmm1
>         movq    %xmm1, 8(%esp)
>         movl    %eax, %edx
>         movl    8(%esp), %eax
>         orl     12(%esp), %edx
>         addl    $28, %esp
>         orb     $0, %ah
>         ret
>
> with this patch we now generate:
>
> test1:  pextrd  $1, %xmm0, %edx
>         movd    %xmm0, %eax
>         ret
>
> The fix is to recognize and split the idiom (hi<<32)|zext(lo) prior
> to register allocation on !TARGET_64BIT, simplifying this sequence to
> "highpart(dst) = hi; lowpart(dst) = lo".
>
> The one minor complication is that sse.md's define_insn for
> *vec_extractv4si_0_zext_sse4 can sometimes interfere with this
> optimization.  It turns out that on !TARGET_64BIT, the zero_extend:DI
> following vec_select:SI isn't free, and this insn gets split back
> into multiple instructions during later passes, but too late to
> be optimized away by this patch/reload.  Hence the last hunk of
> this patch is to restrict *vec_extractv4si_0_zext_sse4 to TARGET_64BIT.
> Checking PR target/80286, where *vec_extractv4si_0_zext_sse4 was
> first added, this seems reasonable (but this patch has been tested
> both with and without this last change, if it's consider controversial).
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check, both with and without "--target_board='unix{-m32}'"
> with no new failures.  OK for mainline?
>
>
> 2021-12-13  Roger Sayle  <roger@nextmovesoftware.com>
>
> gcc/ChangeLog
>         PR target/103611
>         * config/i386/i386.md (any_or_plus): New code iterator.
>         (define_split): Split (HI<<32)|zext(LO) into piece-wise
>         move instructions on !TARGET_64BIT.
>         * config/i386/sse.md (*vec_extractv4si_0_zext_sse4):
>         Restrict to TARGET_64BIT.
>
> gcc/testsuite/ChangeLog
>         PR target/103611
>         * gcc.target/i386/pr103611-2.c: New test case.

OK with *vec_extractv4si_0_zext_sse4 change but please also change isa
attribute to:

  [(set_attr "isa" "*,*,avx512f")

Thanks,
Uros.

>
> Thanks in advance,
> Roger
> --
>

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2021-12-15  9:01 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-12-13 14:10 [PATCH] x86: PR target/103611: Splitter for DST:DI = (HI:SI<<32)|LO:SI Roger Sayle
2021-12-14 10:34 ` Richard Sandiford
2021-12-15  9:01 ` Uros Bizjak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).