RE: [x86 PATCH] Improved V1TI (and V2DI) mode equality/inequality.

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

From: "Roger Sayle" <roger@nextmovesoftware.com>
To: <gcc-patches@gcc.gnu.org>
Subject: RE: [x86 PATCH] Improved V1TI (and V2DI) mode equality/inequality.
Date: Wed, 20 Apr 2022 19:28:09 +0100	[thread overview]
Message-ID: <011e01d854e4$63daa5e0$2b8ff1a0$@nextmovesoftware.com> (raw)
In-Reply-To: 

[-- Attachment #1: Type: text/plain, Size: 4026 bytes --]


Doh! ENOPATCH.

> -----Original Message-----
> From: Roger Sayle <roger@nextmovesoftware.com>
> Sent: 20 April 2022 18:50
> To: 'gcc-patches@gcc.gnu.org' <gcc-patches@gcc.gnu.org>
> Subject: [x86 PATCH] Improved V1TI (and V2DI) mode equality/inequality.
> 
> 
> This patch (for when the compiler returns to stage 1) improves support for
> vector equality and inequality of V1TImode vectors, and V2DImode vectors
with
> sse2 but not sse4.  Consider the three functions below:
> 
> typedef unsigned int uv4si __attribute__ ((__vector_size__ (16))); typedef
> unsigned long long uv2di __attribute__ ((__vector_size__ (16))); typedef
> unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16)));
> 
> uv4si eq_v4si(uv4si x, uv4si y) { return x == y; } uv2di eq_v2di(uv2di x,
uv2di y) {
> return x == y; } uv1ti eq_v1ti(uv1ti x, uv1ti y) { return x == y; }
> 
> These all perform vector comparisons of 128bit SSE2 registers, generating
the
> result as a vector, where ~0 (all 1 bits) represents true and a zero
represents
> false.  eq_v4si is trivially implemented by x86_64's pcmpeqd instruction.
This
> patch improves the other two cases:
> 
> For v2di, gcc -O2 currently generates:
> 
>         movq    %xmm0, %rdx
>         movq    %xmm1, %rax
>         movdqa  %xmm0, %xmm2
>         cmpq    %rax, %rdx
>         movhlps %xmm2, %xmm3
>         movhlps %xmm1, %xmm4
>         sete    %al
>         movq    %xmm3, %rdx
>         movzbl  %al, %eax
>         negq    %rax
>         movq    %rax, %xmm0
>         movq    %xmm4, %rax
>         cmpq    %rax, %rdx
>         sete    %al
>         movzbl  %al, %eax
>         negq    %rax
>         movq    %rax, %xmm5
>         punpcklqdq      %xmm5, %xmm0
>         ret
> 
> but with this patch we now generate:
> 
>         pcmpeqd %xmm0, %xmm1
>         pshufd  $177, %xmm1, %xmm0
>         pand    %xmm1, %xmm0
>         ret
> 
> where the results of a V4SI comparison are shuffled and bit-wise ANDed to
> produce the desired result.  There's no change in the code generated for
"-O2 -
> msse4" where the compiler generates a single "pcmpeqq" insn.
> 
> For V1TI mode, the results are equally dramatic, where the current -O2
output
> looks like:
> 
>         movaps  %xmm0, -40(%rsp)
>         movq    -40(%rsp), %rax
>         movq    -32(%rsp), %rdx
>         movaps  %xmm1, -24(%rsp)
>         movq    -24(%rsp), %rcx
>         movq    -16(%rsp), %rsi
>         xorq    %rcx, %rax
>         xorq    %rsi, %rdx
>         orq     %rdx, %rax
>         sete    %al
>         xorl    %edx, %edx
>         movzbl  %al, %eax
>         negq    %rax
>         adcq    $0, %rdx
>         movq    %rax, %xmm2
>         negq    %rdx
>         movq    %rdx, -40(%rsp)
>         movhps  -40(%rsp), %xmm2
>         movdqa  %xmm2, %xmm0
>         ret
> 
> with this patch we now generate:
> 
>         pcmpeqd %xmm0, %xmm1
>         pshufd  $177, %xmm1, %xmm0
>         pand    %xmm1, %xmm0
>         pshufd  $78, %xmm0, %xmm1
>         pand    %xmm1, %xmm0
>         ret
> 
> performing a V2DI comparison, followed by a shuffle and pand, and with
> -O2 -msse4 take advantages of SSE4.1's pcmpeqq:
> 
>         pcmpeqq %xmm0, %xmm1
>         pshufd  $78, %xmm1, %xmm0
>         pand    %xmm1, %xmm0
>         ret
> 
> 
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap and
> make -k check, both with and without --target_board=unix{-m32}, with no
new
> failures.  Is this OK for when we return to stage 1?
> 
> 
> 2022-04-20  Roger Sayle  <roger@nextmovesoftware.com>
> 
> gcc/ChangeLog
> 	* config/i386/sse.md (vec_cmpeqv2div2di): Enable for TARGET_SSE2.
> 	For !TARGET_SSE4_1, expand as a V4SI vector comparison, followed
> 	by a pshufd and pand.
> 	(vec_cmpeqv1tiv1ti): New define_expand implementing V1TImode
> 	vector equality as a V2DImode vector comparison (see above),
> 	followed by a pshufd and pand.
> 
> gcc/testsuite/ChangeLog
> 	* gcc.target/i386/sse2-v1ti-veq.c: New test case.
> 	* gcc.target/i386/sse2-v1ti-vne.c: New test case.
> 
> 
> Roger
> --


[-- Attachment #2: patchvc.txt --]
[-- Type: text/plain, Size: 3894 bytes --]

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index a852c16..9bc8fb0 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -4379,13 +4379,57 @@
 	(match_operator:V2DI 1 ""
 	  [(match_operand:V2DI 2 "register_operand")
 	   (match_operand:V2DI 3 "vector_operand")]))]
-  "TARGET_SSE4_1"
+  "TARGET_SSE2"
 {
-  bool ok = ix86_expand_int_vec_cmp (operands);
+  bool ok;
+  if (!TARGET_SSE4_1)
+    {
+      rtx ops[4];
+      ops[0] = gen_reg_rtx (V4SImode);
+      ops[2] = force_reg (V4SImode, gen_lowpart (V4SImode, operands[2]));
+      ops[3] = force_reg (V4SImode, gen_lowpart (V4SImode, operands[3]));
+      ops[1] = gen_rtx_fmt_ee (GET_CODE (operands[1]), V4SImode,
+			       ops[2], ops[3]);
+      ok = ix86_expand_int_vec_cmp (ops);
+
+      rtx tmp1 = gen_reg_rtx (V4SImode);
+      emit_insn (gen_sse2_pshufd (tmp1, ops[0], GEN_INT (0xb1)));
+
+      rtx tmp2 = gen_reg_rtx (V4SImode);
+      emit_insn (gen_andv4si3 (tmp2, tmp1, ops[0]));
+
+      emit_move_insn (operands[0], gen_lowpart (V2DImode, tmp2));
+    }
+  else
+    ok = ix86_expand_int_vec_cmp (operands);
   gcc_assert (ok);
   DONE;
 })
 
+(define_expand "vec_cmpeqv1tiv1ti"
+  [(set (match_operand:V1TI 0 "register_operand")
+	(match_operator:V1TI 1 ""
+	  [(match_operand:V1TI 2 "register_operand")
+	   (match_operand:V1TI 3 "vector_operand")]))]
+  "TARGET_SSE2"
+{
+  rtx dst = gen_reg_rtx (V2DImode);
+  rtx op1 = force_reg (V2DImode, gen_lowpart (V2DImode, operands[2]));
+  rtx op2 = force_reg (V2DImode, gen_lowpart (V2DImode, operands[3]));
+  rtx cmp = gen_rtx_fmt_ee (GET_CODE (operands[1]), V2DImode, op1, op2);
+  emit_insn (gen_vec_cmpeqv2div2di (dst, cmp, op1, op2));
+
+  rtx tmp1 = gen_reg_rtx (V4SImode);
+  rtx tmp2 = force_reg (V4SImode, gen_lowpart (V4SImode, dst));
+  emit_insn (gen_sse2_pshufd (tmp1, tmp2, GEN_INT (0x4e)));
+
+  rtx tmp3 = gen_reg_rtx (V4SImode);
+  emit_insn (gen_andv4si3 (tmp3, tmp2, tmp1));
+
+  emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
+  DONE;
+})
+
 (define_expand "vcond<V_512:mode><VF_512:mode>"
   [(set (match_operand:V_512 0 "register_operand")
 	(if_then_else:V_512
diff --git a/gcc/testsuite/gcc.target/i386/sse2-v1ti-veq.c b/gcc/testsuite/gcc.target/i386/sse2-v1ti-veq.c
new file mode 100644
index 0000000..8bbda06
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-v1ti-veq.c
@@ -0,0 +1,12 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -msse2" } */
+typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16)));
+typedef unsigned long long uv2di __attribute__ ((__vector_size__ (16)));
+typedef unsigned int uv4si __attribute__ ((__vector_size__ (16)));
+
+uv1ti eq_v1ti(uv1ti x, uv1ti y) { return x == y; }
+uv2di eq_v2di(uv2di x, uv2di y) { return x == y; }
+uv4si eq_v4si(uv4si x, uv4si y) { return x == y; }
+
+/* { dg-final { scan-assembler-times "pcmpeq" 3 } } */
+/* { dg-final { scan-assembler "pshufd" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-v1ti-vne.c b/gcc/testsuite/gcc.target/i386/sse2-v1ti-vne.c
new file mode 100644
index 0000000..cb47147
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-v1ti-vne.c
@@ -0,0 +1,13 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -msse2" } */
+typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16)));
+typedef unsigned long long uv2di __attribute__ ((__vector_size__ (16)));
+typedef unsigned int uv4si __attribute__ ((__vector_size__ (16)));
+
+uv1ti eq_v1ti(uv1ti x, uv1ti y) { return x != y; }
+uv2di eq_v2di(uv2di x, uv2di y) { return x != y; }
+uv4si eq_v4si(uv4si x, uv4si y) { return x != y; }
+
+/* { dg-final { scan-assembler-times "pcmpeq" 6 } } */
+/* { dg-final { scan-assembler-times "pxor" 3 } } */
+/* { dg-final { scan-assembler "pshufd" } } */

next             reply	other threads:[~2022-04-20 18:28 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-04-20 18:28 Roger Sayle [this message]
2022-04-21  9:30 ` Uros Bizjak
  -- strict thread matches above, loose matches on Subject: below --
2022-04-20 17:49 Roger Sayle

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to='011e01d854e4$63daa5e0$2b8ff1a0$@nextmovesoftware.com' \
    --to=roger@nextmovesoftware.com \
    --cc=gcc-patches@gcc.gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).