RE: [x86 PATCH] Improved V1TI (and V2DI) mode equality/inequality.

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* RE: [x86 PATCH] Improved V1TI (and V2DI) mode equality/inequality.
@ 2022-04-20 18:28 Roger Sayle
  2022-04-21  9:30 ` Uros Bizjak
  0 siblings, 1 reply; 3+ messages in thread
From: Roger Sayle @ 2022-04-20 18:28 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 4026 bytes --]


Doh! ENOPATCH.

> -----Original Message-----
> From: Roger Sayle <roger@nextmovesoftware.com>
> Sent: 20 April 2022 18:50
> To: 'gcc-patches@gcc.gnu.org' <gcc-patches@gcc.gnu.org>
> Subject: [x86 PATCH] Improved V1TI (and V2DI) mode equality/inequality.
> 
> 
> This patch (for when the compiler returns to stage 1) improves support for
> vector equality and inequality of V1TImode vectors, and V2DImode vectors
> with sse2 but not sse4.  Consider the three functions below:
> 
> typedef unsigned int uv4si __attribute__ ((__vector_size__ (16)));
> typedef unsigned long long uv2di __attribute__ ((__vector_size__ (16)));
> typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16)));
> 
> uv4si eq_v4si(uv4si x, uv4si y) { return x == y; }
> uv2di eq_v2di(uv2di x, uv2di y) { return x == y; }
> uv1ti eq_v1ti(uv1ti x, uv1ti y) { return x == y; }
> 
> These all perform vector comparisons of 128bit SSE2 registers, generating
> the result as a vector, where ~0 (all 1 bits) represents true and a zero
> represents false.  eq_v4si is trivially implemented by x86_64's pcmpeqd
> instruction. This patch improves the other two cases:
> 
> For v2di, gcc -O2 currently generates:
> 
>         movq    %xmm0, %rdx
>         movq    %xmm1, %rax
>         movdqa  %xmm0, %xmm2
>         cmpq    %rax, %rdx
>         movhlps %xmm2, %xmm3
>         movhlps %xmm1, %xmm4
>         sete    %al
>         movq    %xmm3, %rdx
>         movzbl  %al, %eax
>         negq    %rax
>         movq    %rax, %xmm0
>         movq    %xmm4, %rax
>         cmpq    %rax, %rdx
>         sete    %al
>         movzbl  %al, %eax
>         negq    %rax
>         movq    %rax, %xmm5
>         punpcklqdq      %xmm5, %xmm0
>         ret
> 
> but with this patch we now generate:
> 
>         pcmpeqd %xmm0, %xmm1
>         pshufd  $177, %xmm1, %xmm0
>         pand    %xmm1, %xmm0
>         ret
> 
> where the results of a V4SI comparison are shuffled and bit-wise ANDed to
> produce the desired result.  There's no change in the code generated for
> "-O2 -msse4" where the compiler generates a single "pcmpeqq" insn.
> 
> For V1TI mode, the results are equally dramatic, where the current -O2
> output looks like:
> 
>         movaps  %xmm0, -40(%rsp)
>         movq    -40(%rsp), %rax
>         movq    -32(%rsp), %rdx
>         movaps  %xmm1, -24(%rsp)
>         movq    -24(%rsp), %rcx
>         movq    -16(%rsp), %rsi
>         xorq    %rcx, %rax
>         xorq    %rsi, %rdx
>         orq     %rdx, %rax
>         sete    %al
>         xorl    %edx, %edx
>         movzbl  %al, %eax
>         negq    %rax
>         adcq    $0, %rdx
>         movq    %rax, %xmm2
>         negq    %rdx
>         movq    %rdx, -40(%rsp)
>         movhps  -40(%rsp), %xmm2
>         movdqa  %xmm2, %xmm0
>         ret
> 
> with this patch we now generate:
> 
>         pcmpeqd %xmm0, %xmm1
>         pshufd  $177, %xmm1, %xmm0
>         pand    %xmm1, %xmm0
>         pshufd  $78, %xmm0, %xmm1
>         pand    %xmm1, %xmm0
>         ret
> 
> performing a V2DI comparison, followed by a shuffle and pand, and with
> -O2 -msse4 we take advantage of SSE4.1's pcmpeqq:
> 
>         pcmpeqq %xmm0, %xmm1
>         pshufd  $78, %xmm1, %xmm0
>         pand    %xmm1, %xmm0
>         ret
> 
> 
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap and
> make -k check, both with and without --target_board=unix{-m32}, with no
> new failures.  Is this OK for when we return to stage 1?
> 
> 
> 2022-04-20  Roger Sayle  <roger@nextmovesoftware.com>
> 
> gcc/ChangeLog
> 	* config/i386/sse.md (vec_cmpeqv2div2di): Enable for TARGET_SSE2.
> 	For !TARGET_SSE4_1, expand as a V4SI vector comparison, followed
> 	by a pshufd and pand.
> 	(vec_cmpeqv1tiv1ti): New define_expand implementing V1TImode
> 	vector equality as a V2DImode vector comparison (see above),
> 	followed by a pshufd and pand.
> 
> gcc/testsuite/ChangeLog
> 	* gcc.target/i386/sse2-v1ti-veq.c: New test case.
> 	* gcc.target/i386/sse2-v1ti-vne.c: New test case.
> 
> 
> Roger
> --


[-- Attachment #2: patchvc.txt --]
[-- Type: text/plain, Size: 3894 bytes --]

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index a852c16..9bc8fb0 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -4379,13 +4379,57 @@
 	(match_operator:V2DI 1 ""
 	  [(match_operand:V2DI 2 "register_operand")
 	   (match_operand:V2DI 3 "vector_operand")]))]
-  "TARGET_SSE4_1"
+  "TARGET_SSE2"
 {
-  bool ok = ix86_expand_int_vec_cmp (operands);
+  bool ok;
+  if (!TARGET_SSE4_1)
+    {
+      rtx ops[4];
+      ops[0] = gen_reg_rtx (V4SImode);
+      ops[2] = force_reg (V4SImode, gen_lowpart (V4SImode, operands[2]));
+      ops[3] = force_reg (V4SImode, gen_lowpart (V4SImode, operands[3]));
+      ops[1] = gen_rtx_fmt_ee (GET_CODE (operands[1]), V4SImode,
+			       ops[2], ops[3]);
+      ok = ix86_expand_int_vec_cmp (ops);
+
+      rtx tmp1 = gen_reg_rtx (V4SImode);
+      emit_insn (gen_sse2_pshufd (tmp1, ops[0], GEN_INT (0xb1)));
+
+      rtx tmp2 = gen_reg_rtx (V4SImode);
+      emit_insn (gen_andv4si3 (tmp2, tmp1, ops[0]));
+
+      emit_move_insn (operands[0], gen_lowpart (V2DImode, tmp2));
+    }
+  else
+    ok = ix86_expand_int_vec_cmp (operands);
   gcc_assert (ok);
   DONE;
 })
 
+(define_expand "vec_cmpeqv1tiv1ti"
+  [(set (match_operand:V1TI 0 "register_operand")
+	(match_operator:V1TI 1 ""
+	  [(match_operand:V1TI 2 "register_operand")
+	   (match_operand:V1TI 3 "vector_operand")]))]
+  "TARGET_SSE2"
+{
+  rtx dst = gen_reg_rtx (V2DImode);
+  rtx op1 = force_reg (V2DImode, gen_lowpart (V2DImode, operands[2]));
+  rtx op2 = force_reg (V2DImode, gen_lowpart (V2DImode, operands[3]));
+  rtx cmp = gen_rtx_fmt_ee (GET_CODE (operands[1]), V2DImode, op1, op2);
+  emit_insn (gen_vec_cmpeqv2div2di (dst, cmp, op1, op2));
+
+  rtx tmp1 = gen_reg_rtx (V4SImode);
+  rtx tmp2 = force_reg (V4SImode, gen_lowpart (V4SImode, dst));
+  emit_insn (gen_sse2_pshufd (tmp1, tmp2, GEN_INT (0x4e)));
+
+  rtx tmp3 = gen_reg_rtx (V4SImode);
+  emit_insn (gen_andv4si3 (tmp3, tmp2, tmp1));
+
+  emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
+  DONE;
+})
+
 (define_expand "vcond<V_512:mode><VF_512:mode>"
   [(set (match_operand:V_512 0 "register_operand")
 	(if_then_else:V_512
diff --git a/gcc/testsuite/gcc.target/i386/sse2-v1ti-veq.c b/gcc/testsuite/gcc.target/i386/sse2-v1ti-veq.c
new file mode 100644
index 0000000..8bbda06
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-v1ti-veq.c
@@ -0,0 +1,12 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -msse2" } */
+typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16)));
+typedef unsigned long long uv2di __attribute__ ((__vector_size__ (16)));
+typedef unsigned int uv4si __attribute__ ((__vector_size__ (16)));
+
+uv1ti eq_v1ti(uv1ti x, uv1ti y) { return x == y; }
+uv2di eq_v2di(uv2di x, uv2di y) { return x == y; }
+uv4si eq_v4si(uv4si x, uv4si y) { return x == y; }
+
+/* { dg-final { scan-assembler-times "pcmpeq" 3 } } */
+/* { dg-final { scan-assembler "pshufd" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-v1ti-vne.c b/gcc/testsuite/gcc.target/i386/sse2-v1ti-vne.c
new file mode 100644
index 0000000..cb47147
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-v1ti-vne.c
@@ -0,0 +1,13 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -msse2" } */
+typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16)));
+typedef unsigned long long uv2di __attribute__ ((__vector_size__ (16)));
+typedef unsigned int uv4si __attribute__ ((__vector_size__ (16)));
+
+uv1ti eq_v1ti(uv1ti x, uv1ti y) { return x != y; }
+uv2di eq_v2di(uv2di x, uv2di y) { return x != y; }
+uv4si eq_v4si(uv4si x, uv4si y) { return x != y; }
+
+/* { dg-final { scan-assembler-times "pcmpeq" 6 } } */
+/* { dg-final { scan-assembler-times "pxor" 3 } } */
+/* { dg-final { scan-assembler "pshufd" } } */

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [x86 PATCH] Improved V1TI (and V2DI) mode equality/inequality.
  2022-04-20 18:28 [x86 PATCH] Improved V1TI (and V2DI) mode equality/inequality Roger Sayle
@ 2022-04-21  9:30 ` Uros Bizjak
  0 siblings, 0 replies; 3+ messages in thread
From: Uros Bizjak @ 2022-04-21  9:30 UTC (permalink / raw)
  To: Roger Sayle; +Cc: gcc-patches

On Wed, Apr 20, 2022 at 8:28 PM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> Doh! ENOPATCH.
>
> > -----Original Message-----
> > From: Roger Sayle <roger@nextmovesoftware.com>
> > Sent: 20 April 2022 18:50
> > To: 'gcc-patches@gcc.gnu.org' <gcc-patches@gcc.gnu.org>
> > Subject: [x86 PATCH] Improved V1TI (and V2DI) mode equality/inequality.
> >
> >
> > This patch (for when the compiler returns to stage 1) improves support for
> > vector equality and inequality of V1TImode vectors, and V2DImode vectors
> > with sse2 but not sse4.  Consider the three functions below:
> >
> > typedef unsigned int uv4si __attribute__ ((__vector_size__ (16)));
> > typedef unsigned long long uv2di __attribute__ ((__vector_size__ (16)));
> > typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16)));
> >
> > uv4si eq_v4si(uv4si x, uv4si y) { return x == y; }
> > uv2di eq_v2di(uv2di x, uv2di y) { return x == y; }
> > uv1ti eq_v1ti(uv1ti x, uv1ti y) { return x == y; }
> >
> > These all perform vector comparisons of 128bit SSE2 registers, generating
> > the result as a vector, where ~0 (all 1 bits) represents true and a zero
> > represents false.  eq_v4si is trivially implemented by x86_64's pcmpeqd
> > instruction. This patch improves the other two cases:
> >
> > For v2di, gcc -O2 currently generates:
> >
> >         movq    %xmm0, %rdx
> >         movq    %xmm1, %rax
> >         movdqa  %xmm0, %xmm2
> >         cmpq    %rax, %rdx
> >         movhlps %xmm2, %xmm3
> >         movhlps %xmm1, %xmm4
> >         sete    %al
> >         movq    %xmm3, %rdx
> >         movzbl  %al, %eax
> >         negq    %rax
> >         movq    %rax, %xmm0
> >         movq    %xmm4, %rax
> >         cmpq    %rax, %rdx
> >         sete    %al
> >         movzbl  %al, %eax
> >         negq    %rax
> >         movq    %rax, %xmm5
> >         punpcklqdq      %xmm5, %xmm0
> >         ret
> >
> > but with this patch we now generate:
> >
> >         pcmpeqd %xmm0, %xmm1
> >         pshufd  $177, %xmm1, %xmm0
> >         pand    %xmm1, %xmm0
> >         ret
> >
> > where the results of a V4SI comparison are shuffled and bit-wise ANDed to
> > produce the desired result.  There's no change in the code generated for
> > "-O2 -msse4" where the compiler generates a single "pcmpeqq" insn.
> >
> > For V1TI mode, the results are equally dramatic, where the current -O2
> > output looks like:
> >
> >         movaps  %xmm0, -40(%rsp)
> >         movq    -40(%rsp), %rax
> >         movq    -32(%rsp), %rdx
> >         movaps  %xmm1, -24(%rsp)
> >         movq    -24(%rsp), %rcx
> >         movq    -16(%rsp), %rsi
> >         xorq    %rcx, %rax
> >         xorq    %rsi, %rdx
> >         orq     %rdx, %rax
> >         sete    %al
> >         xorl    %edx, %edx
> >         movzbl  %al, %eax
> >         negq    %rax
> >         adcq    $0, %rdx
> >         movq    %rax, %xmm2
> >         negq    %rdx
> >         movq    %rdx, -40(%rsp)
> >         movhps  -40(%rsp), %xmm2
> >         movdqa  %xmm2, %xmm0
> >         ret
> >
> > with this patch we now generate:
> >
> >         pcmpeqd %xmm0, %xmm1
> >         pshufd  $177, %xmm1, %xmm0
> >         pand    %xmm1, %xmm0
> >         pshufd  $78, %xmm0, %xmm1
> >         pand    %xmm1, %xmm0
> >         ret
> >
> > performing a V2DI comparison, followed by a shuffle and pand, and with
> > -O2 -msse4 we take advantage of SSE4.1's pcmpeqq:
> >
> >         pcmpeqq %xmm0, %xmm1
> >         pshufd  $78, %xmm1, %xmm0
> >         pand    %xmm1, %xmm0
> >         ret
> >
> >
> > This patch has been tested on x86_64-pc-linux-gnu with make bootstrap and
> > make -k check, both with and without --target_board=unix{-m32}, with no
> > new failures.  Is this OK for when we return to stage 1?
> >
> >
> > 2022-04-20  Roger Sayle  <roger@nextmovesoftware.com>
> >
> > gcc/ChangeLog
> >       * config/i386/sse.md (vec_cmpeqv2div2di): Enable for TARGET_SSE2.
> >       For !TARGET_SSE4_1, expand as a V4SI vector comparison, followed
> >       by a pshufd and pand.
> >       (vec_cmpeqv1tiv1ti): New define_expand implementing V1TImode
> >       vector equality as a V2DImode vector comparison (see above),
> >       followed by a pshufd and pand.
> >
> > gcc/testsuite/ChangeLog
> >       * gcc.target/i386/sse2-v1ti-veq.c: New test case.
> >       * gcc.target/i386/sse2-v1ti-vne.c: New test case.
> >


+  bool ok;
+  if (!TARGET_SSE4_1)
+    {
+      rtx ops[4];
+      ops[0] = gen_reg_rtx (V4SImode);
+      ops[2] = force_reg (V4SImode, gen_lowpart (V4SImode, operands[2]));
+      ops[3] = force_reg (V4SImode, gen_lowpart (V4SImode, operands[3]));

In general, this is better written as e.g.:

gen_lowpart (V4SImode, force_reg (V2DImode, operands[2]))

This ensures that we get a subreg of V2DImode register, and avoids
problems with gen_lowpart. Also, other expander functions should be
prepared to handle subregs, so in

+  rtx tmp2 = force_reg (V4SImode, gen_lowpart (V4SImode, dst));
+  emit_insn (gen_sse2_pshufd (tmp1, tmp2, GEN_INT (0x4e)));

forcing a subreg to a register before the call to gen_sse2_pshufd is
not needed, since dst is already a register.

Uros.

^ permalink raw reply	[flat|nested] 3+ messages in thread

* [x86 PATCH] Improved V1TI (and V2DI) mode equality/inequality.
@ 2022-04-20 17:49 Roger Sayle
  0 siblings, 0 replies; 3+ messages in thread
From: Roger Sayle @ 2022-04-20 17:49 UTC (permalink / raw)
  To: gcc-patches


This patch (for when the compiler returns to stage 1) improves support
for vector equality and inequality of V1TImode vectors, and V2DImode
vectors with sse2 but not sse4.  Consider the three functions below:

typedef unsigned int uv4si __attribute__ ((__vector_size__ (16)));
typedef unsigned long long uv2di __attribute__ ((__vector_size__ (16)));
typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16)));

uv4si eq_v4si(uv4si x, uv4si y) { return x == y; }
uv2di eq_v2di(uv2di x, uv2di y) { return x == y; }
uv1ti eq_v1ti(uv1ti x, uv1ti y) { return x == y; }

These all perform vector comparisons of 128bit SSE2 registers, generating
the result as a vector, where ~0 (all 1 bits) represents true and a zero
represents false.  eq_v4si is trivially implemented by x86_64's pcmpeqd
instruction. This patch improves the other two cases:

For v2di, gcc -O2 currently generates:

        movq    %xmm0, %rdx
        movq    %xmm1, %rax
        movdqa  %xmm0, %xmm2
        cmpq    %rax, %rdx
        movhlps %xmm2, %xmm3
        movhlps %xmm1, %xmm4
        sete    %al
        movq    %xmm3, %rdx
        movzbl  %al, %eax
        negq    %rax
        movq    %rax, %xmm0
        movq    %xmm4, %rax
        cmpq    %rax, %rdx
        sete    %al
        movzbl  %al, %eax
        negq    %rax
        movq    %rax, %xmm5
        punpcklqdq      %xmm5, %xmm0
        ret

but with this patch we now generate:

        pcmpeqd %xmm0, %xmm1
        pshufd  $177, %xmm1, %xmm0
        pand    %xmm1, %xmm0
        ret

where the results of a V4SI comparison are shuffled and bit-wise ANDed
to produce the desired result.  There's no change in the code generated
for "-O2 -msse4" where the compiler generates a single "pcmpeqq" insn.

For V1TI mode, the results are equally dramatic, where the current -O2
output looks like:

        movaps  %xmm0, -40(%rsp)
        movq    -40(%rsp), %rax
        movq    -32(%rsp), %rdx
        movaps  %xmm1, -24(%rsp)
        movq    -24(%rsp), %rcx
        movq    -16(%rsp), %rsi
        xorq    %rcx, %rax
        xorq    %rsi, %rdx
        orq     %rdx, %rax
        sete    %al
        xorl    %edx, %edx
        movzbl  %al, %eax
        negq    %rax
        adcq    $0, %rdx
        movq    %rax, %xmm2
        negq    %rdx
        movq    %rdx, -40(%rsp)
        movhps  -40(%rsp), %xmm2
        movdqa  %xmm2, %xmm0
        ret

with this patch we now generate:

        pcmpeqd %xmm0, %xmm1
        pshufd  $177, %xmm1, %xmm0
        pand    %xmm1, %xmm0
        pshufd  $78, %xmm0, %xmm1
        pand    %xmm1, %xmm0
        ret

performing a V2DI comparison, followed by a shuffle and pand, and with
-O2 -msse4 we take advantage of SSE4.1's pcmpeqq:

        pcmpeqq %xmm0, %xmm1
        pshufd  $78, %xmm1, %xmm0
        pand    %xmm1, %xmm0
        ret


This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32}, with
no new failures.  Is this OK for when we return to stage 1?


2022-04-20  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
	* config/i386/sse.md (vec_cmpeqv2div2di): Enable for TARGET_SSE2.
	For !TARGET_SSE4_1, expand as a V4SI vector comparison, followed
	by a pshufd and pand.
	(vec_cmpeqv1tiv1ti): New define_expand implementing V1TImode
	vector equality as a V2DImode vector comparison (see above),
	followed by a pshufd and pand.

gcc/testsuite/ChangeLog
	* gcc.target/i386/sse2-v1ti-veq.c: New test case.
	* gcc.target/i386/sse2-v1ti-vne.c: New test case.


Roger
--



^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2022-04-21  9:31 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-04-20 18:28 [x86 PATCH] Improved V1TI (and V2DI) mode equality/inequality Roger Sayle
2022-04-21  9:30 ` Uros Bizjak
  -- strict thread matches above, loose matches on Subject: below --
2022-04-20 17:49 Roger Sayle

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).