* RE: [x86 PATCH] Improved V1TI (and V2DI) mode equality/inequality.
@ 2022-04-20 18:28 Roger Sayle
2022-04-21 9:30 ` Uros Bizjak
0 siblings, 1 reply; 3+ messages in thread
From: Roger Sayle @ 2022-04-20 18:28 UTC (permalink / raw)
To: gcc-patches
[-- Attachment #1: Type: text/plain, Size: 4026 bytes --]
Doh! ENOPATCH.
> -----Original Message-----
> From: Roger Sayle <roger@nextmovesoftware.com>
> Sent: 20 April 2022 18:50
> To: 'gcc-patches@gcc.gnu.org' <gcc-patches@gcc.gnu.org>
> Subject: [x86 PATCH] Improved V1TI (and V2DI) mode equality/inequality.
>
>
> This patch (for when the compiler returns to stage 1) improves support for
> vector equality and inequality of V1TImode vectors, and V2DImode vectors
with
> sse2 but not sse4. Consider the three functions below:
>
> typedef unsigned int uv4si __attribute__ ((__vector_size__ (16))); typedef
> unsigned long long uv2di __attribute__ ((__vector_size__ (16))); typedef
> unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16)));
>
> uv4si eq_v4si(uv4si x, uv4si y) { return x == y; } uv2di eq_v2di(uv2di x,
uv2di y) {
> return x == y; } uv1ti eq_v1ti(uv1ti x, uv1ti y) { return x == y; }
>
> These all perform vector comparisons of 128bit SSE2 registers, generating
the
> result as a vector, where ~0 (all 1 bits) represents true and a zero
represents
> false. eq_v4si is trivially implemented by x86_64's pcmpeqd instruction.
This
> patch improves the other two cases:
>
> For v2di, gcc -O2 currently generates:
>
> movq %xmm0, %rdx
> movq %xmm1, %rax
> movdqa %xmm0, %xmm2
> cmpq %rax, %rdx
> movhlps %xmm2, %xmm3
> movhlps %xmm1, %xmm4
> sete %al
> movq %xmm3, %rdx
> movzbl %al, %eax
> negq %rax
> movq %rax, %xmm0
> movq %xmm4, %rax
> cmpq %rax, %rdx
> sete %al
> movzbl %al, %eax
> negq %rax
> movq %rax, %xmm5
> punpcklqdq %xmm5, %xmm0
> ret
>
> but with this patch we now generate:
>
> pcmpeqd %xmm0, %xmm1
> pshufd $177, %xmm1, %xmm0
> pand %xmm1, %xmm0
> ret
>
> where the results of a V4SI comparison are shuffled and bit-wise ANDed to
> produce the desired result. There's no change in the code generated for
"-O2 -
> msse4" where the compiler generates a single "pcmpeqq" insn.
>
> For V1TI mode, the results are equally dramatic, where the current -O2
output
> looks like:
>
> movaps %xmm0, -40(%rsp)
> movq -40(%rsp), %rax
> movq -32(%rsp), %rdx
> movaps %xmm1, -24(%rsp)
> movq -24(%rsp), %rcx
> movq -16(%rsp), %rsi
> xorq %rcx, %rax
> xorq %rsi, %rdx
> orq %rdx, %rax
> sete %al
> xorl %edx, %edx
> movzbl %al, %eax
> negq %rax
> adcq $0, %rdx
> movq %rax, %xmm2
> negq %rdx
> movq %rdx, -40(%rsp)
> movhps -40(%rsp), %xmm2
> movdqa %xmm2, %xmm0
> ret
>
> with this patch we now generate:
>
> pcmpeqd %xmm0, %xmm1
> pshufd $177, %xmm1, %xmm0
> pand %xmm1, %xmm0
> pshufd $78, %xmm0, %xmm1
> pand %xmm1, %xmm0
> ret
>
> performing a V2DI comparison, followed by a shuffle and pand, and with
> -O2 -msse4 take advantages of SSE4.1's pcmpeqq:
>
> pcmpeqq %xmm0, %xmm1
> pshufd $78, %xmm1, %xmm0
> pand %xmm1, %xmm0
> ret
>
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap and
> make -k check, both with and without --target_board=unix{-m32}, with no
new
> failures. Is this OK for when we return to stage 1?
>
>
> 2022-04-20 Roger Sayle <roger@nextmovesoftware.com>
>
> gcc/ChangeLog
> * config/i386/sse.md (vec_cmpeqv2div2di): Enable for TARGET_SSE2.
> For !TARGET_SSE4_1, expand as a V4SI vector comparison, followed
> by a pshufd and pand.
> (vec_cmpeqv1tiv1ti): New define_expand implementing V1TImode
> vector equality as a V2DImode vector comparison (see above),
> followed by a pshufd and pand.
>
> gcc/testsuite/ChangeLog
> * gcc.target/i386/sse2-v1ti-veq.c: New test case.
> * gcc.target/i386/sse2-v1ti-vne.c: New test case.
>
>
> Roger
> --
[-- Attachment #2: patchvc.txt --]
[-- Type: text/plain, Size: 3894 bytes --]
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index a852c16..9bc8fb0 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -4379,13 +4379,57 @@
(match_operator:V2DI 1 ""
[(match_operand:V2DI 2 "register_operand")
(match_operand:V2DI 3 "vector_operand")]))]
- "TARGET_SSE4_1"
+ "TARGET_SSE2"
{
- bool ok = ix86_expand_int_vec_cmp (operands);
+ bool ok;
+ if (!TARGET_SSE4_1)
+ {
+ rtx ops[4];
+ ops[0] = gen_reg_rtx (V4SImode);
+ ops[2] = force_reg (V4SImode, gen_lowpart (V4SImode, operands[2]));
+ ops[3] = force_reg (V4SImode, gen_lowpart (V4SImode, operands[3]));
+ ops[1] = gen_rtx_fmt_ee (GET_CODE (operands[1]), V4SImode,
+ ops[2], ops[3]);
+ ok = ix86_expand_int_vec_cmp (ops);
+
+ rtx tmp1 = gen_reg_rtx (V4SImode);
+ emit_insn (gen_sse2_pshufd (tmp1, ops[0], GEN_INT (0xb1)));
+
+ rtx tmp2 = gen_reg_rtx (V4SImode);
+ emit_insn (gen_andv4si3 (tmp2, tmp1, ops[0]));
+
+ emit_move_insn (operands[0], gen_lowpart (V2DImode, tmp2));
+ }
+ else
+ ok = ix86_expand_int_vec_cmp (operands);
gcc_assert (ok);
DONE;
})
+(define_expand "vec_cmpeqv1tiv1ti"
+ [(set (match_operand:V1TI 0 "register_operand")
+ (match_operator:V1TI 1 ""
+ [(match_operand:V1TI 2 "register_operand")
+ (match_operand:V1TI 3 "vector_operand")]))]
+ "TARGET_SSE2"
+{
+ rtx dst = gen_reg_rtx (V2DImode);
+ rtx op1 = force_reg (V2DImode, gen_lowpart (V2DImode, operands[2]));
+ rtx op2 = force_reg (V2DImode, gen_lowpart (V2DImode, operands[3]));
+ rtx cmp = gen_rtx_fmt_ee (GET_CODE (operands[1]), V2DImode, op1, op2);
+ emit_insn (gen_vec_cmpeqv2div2di (dst, cmp, op1, op2));
+
+ rtx tmp1 = gen_reg_rtx (V4SImode);
+ rtx tmp2 = force_reg (V4SImode, gen_lowpart (V4SImode, dst));
+ emit_insn (gen_sse2_pshufd (tmp1, tmp2, GEN_INT (0x4e)));
+
+ rtx tmp3 = gen_reg_rtx (V4SImode);
+ emit_insn (gen_andv4si3 (tmp3, tmp2, tmp1));
+
+ emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
+ DONE;
+})
+
(define_expand "vcond<V_512:mode><VF_512:mode>"
[(set (match_operand:V_512 0 "register_operand")
(if_then_else:V_512
diff --git a/gcc/testsuite/gcc.target/i386/sse2-v1ti-veq.c b/gcc/testsuite/gcc.target/i386/sse2-v1ti-veq.c
new file mode 100644
index 0000000..8bbda06
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-v1ti-veq.c
@@ -0,0 +1,12 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -msse2" } */
+typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16)));
+typedef unsigned long long uv2di __attribute__ ((__vector_size__ (16)));
+typedef unsigned int uv4si __attribute__ ((__vector_size__ (16)));
+
+uv1ti eq_v1ti(uv1ti x, uv1ti y) { return x == y; }
+uv2di eq_v2di(uv2di x, uv2di y) { return x == y; }
+uv4si eq_v4si(uv4si x, uv4si y) { return x == y; }
+
+/* { dg-final { scan-assembler-times "pcmpeq" 3 } } */
+/* { dg-final { scan-assembler "pshufd" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-v1ti-vne.c b/gcc/testsuite/gcc.target/i386/sse2-v1ti-vne.c
new file mode 100644
index 0000000..cb47147
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-v1ti-vne.c
@@ -0,0 +1,13 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -msse2" } */
+typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16)));
+typedef unsigned long long uv2di __attribute__ ((__vector_size__ (16)));
+typedef unsigned int uv4si __attribute__ ((__vector_size__ (16)));
+
+uv1ti eq_v1ti(uv1ti x, uv1ti y) { return x != y; }
+uv2di eq_v2di(uv2di x, uv2di y) { return x != y; }
+uv4si eq_v4si(uv4si x, uv4si y) { return x != y; }
+
+/* { dg-final { scan-assembler-times "pcmpeq" 6 } } */
+/* { dg-final { scan-assembler-times "pxor" 3 } } */
+/* { dg-final { scan-assembler "pshufd" } } */
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [x86 PATCH] Improved V1TI (and V2DI) mode equality/inequality.
2022-04-20 18:28 [x86 PATCH] Improved V1TI (and V2DI) mode equality/inequality Roger Sayle
@ 2022-04-21 9:30 ` Uros Bizjak
0 siblings, 0 replies; 3+ messages in thread
From: Uros Bizjak @ 2022-04-21 9:30 UTC (permalink / raw)
To: Roger Sayle; +Cc: gcc-patches
On Wed, Apr 20, 2022 at 8:28 PM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> Doh! ENOPATCH.
>
> > -----Original Message-----
> > From: Roger Sayle <roger@nextmovesoftware.com>
> > Sent: 20 April 2022 18:50
> > To: 'gcc-patches@gcc.gnu.org' <gcc-patches@gcc.gnu.org>
> > Subject: [x86 PATCH] Improved V1TI (and V2DI) mode equality/inequality.
> >
> >
> > This patch (for when the compiler returns to stage 1) improves support for
> > vector equality and inequality of V1TImode vectors, and V2DImode vectors
> with
> > sse2 but not sse4. Consider the three functions below:
> >
> > typedef unsigned int uv4si __attribute__ ((__vector_size__ (16))); typedef
> > unsigned long long uv2di __attribute__ ((__vector_size__ (16))); typedef
> > unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16)));
> >
> > uv4si eq_v4si(uv4si x, uv4si y) { return x == y; } uv2di eq_v2di(uv2di x,
> uv2di y) {
> > return x == y; } uv1ti eq_v1ti(uv1ti x, uv1ti y) { return x == y; }
> >
> > These all perform vector comparisons of 128bit SSE2 registers, generating
> the
> > result as a vector, where ~0 (all 1 bits) represents true and a zero
> represents
> > false. eq_v4si is trivially implemented by x86_64's pcmpeqd instruction.
> This
> > patch improves the other two cases:
> >
> > For v2di, gcc -O2 currently generates:
> >
> > movq %xmm0, %rdx
> > movq %xmm1, %rax
> > movdqa %xmm0, %xmm2
> > cmpq %rax, %rdx
> > movhlps %xmm2, %xmm3
> > movhlps %xmm1, %xmm4
> > sete %al
> > movq %xmm3, %rdx
> > movzbl %al, %eax
> > negq %rax
> > movq %rax, %xmm0
> > movq %xmm4, %rax
> > cmpq %rax, %rdx
> > sete %al
> > movzbl %al, %eax
> > negq %rax
> > movq %rax, %xmm5
> > punpcklqdq %xmm5, %xmm0
> > ret
> >
> > but with this patch we now generate:
> >
> > pcmpeqd %xmm0, %xmm1
> > pshufd $177, %xmm1, %xmm0
> > pand %xmm1, %xmm0
> > ret
> >
> > where the results of a V4SI comparison are shuffled and bit-wise ANDed to
> > produce the desired result. There's no change in the code generated for
> "-O2 -
> > msse4" where the compiler generates a single "pcmpeqq" insn.
> >
> > For V1TI mode, the results are equally dramatic, where the current -O2
> output
> > looks like:
> >
> > movaps %xmm0, -40(%rsp)
> > movq -40(%rsp), %rax
> > movq -32(%rsp), %rdx
> > movaps %xmm1, -24(%rsp)
> > movq -24(%rsp), %rcx
> > movq -16(%rsp), %rsi
> > xorq %rcx, %rax
> > xorq %rsi, %rdx
> > orq %rdx, %rax
> > sete %al
> > xorl %edx, %edx
> > movzbl %al, %eax
> > negq %rax
> > adcq $0, %rdx
> > movq %rax, %xmm2
> > negq %rdx
> > movq %rdx, -40(%rsp)
> > movhps -40(%rsp), %xmm2
> > movdqa %xmm2, %xmm0
> > ret
> >
> > with this patch we now generate:
> >
> > pcmpeqd %xmm0, %xmm1
> > pshufd $177, %xmm1, %xmm0
> > pand %xmm1, %xmm0
> > pshufd $78, %xmm0, %xmm1
> > pand %xmm1, %xmm0
> > ret
> >
> > performing a V2DI comparison, followed by a shuffle and pand, and with
> > -O2 -msse4 take advantages of SSE4.1's pcmpeqq:
> >
> > pcmpeqq %xmm0, %xmm1
> > pshufd $78, %xmm1, %xmm0
> > pand %xmm1, %xmm0
> > ret
> >
> >
> > This patch has been tested on x86_64-pc-linux-gnu with make bootstrap and
> > make -k check, both with and without --target_board=unix{-m32}, with no
> new
> > failures. Is this OK for when we return to stage 1?
> >
> >
> > 2022-04-20 Roger Sayle <roger@nextmovesoftware.com>
> >
> > gcc/ChangeLog
> > * config/i386/sse.md (vec_cmpeqv2div2di): Enable for TARGET_SSE2.
> > For !TARGET_SSE4_1, expand as a V4SI vector comparison, followed
> > by a pshufd and pand.
> > (vec_cmpeqv1tiv1ti): New define_expand implementing V1TImode
> > vector equality as a V2DImode vector comparison (see above),
> > followed by a pshufd and pand.
> >
> > gcc/testsuite/ChangeLog
> > * gcc.target/i386/sse2-v1ti-veq.c: New test case.
> > * gcc.target/i386/sse2-v1ti-vne.c: New test case.
> >
+ bool ok;
+ if (!TARGET_SSE4_1)
+ {
+ rtx ops[4];
+ ops[0] = gen_reg_rtx (V4SImode);
+ ops[2] = force_reg (V4SImode, gen_lowpart (V4SImode, operands[2]));
+ ops[3] = force_reg (V4SImode, gen_lowpart (V4SImode, operands[3]));
In general, this is better written as e.g.:
gen_lowpart (V4SImode, force_reg (V2DImode, operands[2]))
This ensures that we get a subreg of V2DImode register, and avoids
problems with gen_lowpart. Also, other expander functions should be
prepared to handle subregs, so in
+ rtx tmp2 = force_reg (V4SImode, gen_lowpart (V4SImode, dst));
+ emit_insn (gen_sse2_pshufd (tmp1, tmp2, GEN_INT (0x4e)));
forcing a subreg to a register before the call to gen_sse2_pshufd is
not needed, since dst is already a register.
Uros.
^ permalink raw reply [flat|nested] 3+ messages in thread
* [x86 PATCH] Improved V1TI (and V2DI) mode equality/inequality.
@ 2022-04-20 17:49 Roger Sayle
0 siblings, 0 replies; 3+ messages in thread
From: Roger Sayle @ 2022-04-20 17:49 UTC (permalink / raw)
To: gcc-patches
This patch (for when the compiler returns to stage 1) improves support
for vector equality and inequality of V1TImode vectors, and V2DImode
vectors with sse2 but not sse4. Consider the three functions below:
typedef unsigned int uv4si __attribute__ ((__vector_size__ (16)));
typedef unsigned long long uv2di __attribute__ ((__vector_size__ (16)));
typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16)));
uv4si eq_v4si(uv4si x, uv4si y) { return x == y; }
uv2di eq_v2di(uv2di x, uv2di y) { return x == y; }
uv1ti eq_v1ti(uv1ti x, uv1ti y) { return x == y; }
These all perform vector comparisons of 128bit SSE2 registers, generating
the result as a vector, where ~0 (all 1 bits) represents true and a zero
represents false. eq_v4si is trivially implemented by x86_64's pcmpeqd
instruction. This patch improves the other two cases:
For v2di, gcc -O2 currently generates:
movq %xmm0, %rdx
movq %xmm1, %rax
movdqa %xmm0, %xmm2
cmpq %rax, %rdx
movhlps %xmm2, %xmm3
movhlps %xmm1, %xmm4
sete %al
movq %xmm3, %rdx
movzbl %al, %eax
negq %rax
movq %rax, %xmm0
movq %xmm4, %rax
cmpq %rax, %rdx
sete %al
movzbl %al, %eax
negq %rax
movq %rax, %xmm5
punpcklqdq %xmm5, %xmm0
ret
but with this patch we now generate:
pcmpeqd %xmm0, %xmm1
pshufd $177, %xmm1, %xmm0
pand %xmm1, %xmm0
ret
where the results of a V4SI comparison are shuffled and bit-wise ANDed
to produce the desired result. There's no change in the code generated
for "-O2 -msse4" where the compiler generates a single "pcmpeqq" insn.
For V1TI mode, the results are equally dramatic, where the current -O2
output looks like:
movaps %xmm0, -40(%rsp)
movq -40(%rsp), %rax
movq -32(%rsp), %rdx
movaps %xmm1, -24(%rsp)
movq -24(%rsp), %rcx
movq -16(%rsp), %rsi
xorq %rcx, %rax
xorq %rsi, %rdx
orq %rdx, %rax
sete %al
xorl %edx, %edx
movzbl %al, %eax
negq %rax
adcq $0, %rdx
movq %rax, %xmm2
negq %rdx
movq %rdx, -40(%rsp)
movhps -40(%rsp), %xmm2
movdqa %xmm2, %xmm0
ret
with this patch we now generate:
pcmpeqd %xmm0, %xmm1
pshufd $177, %xmm1, %xmm0
pand %xmm1, %xmm0
pshufd $78, %xmm0, %xmm1
pand %xmm1, %xmm0
ret
performing a V2DI comparison, followed by a shuffle and pand, and with
-O2 -msse4 take advantages of SSE4.1's pcmpeqq:
pcmpeqq %xmm0, %xmm1
pshufd $78, %xmm1, %xmm0
pand %xmm1, %xmm0
ret
This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32}, with
no new failures. Is this OK for when we return to stage 1?
2022-04-20 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/sse.md (vec_cmpeqv2div2di): Enable for TARGET_SSE2.
For !TARGET_SSE4_1, expand as a V4SI vector comparison, followed
by a pshufd and pand.
(vec_cmpeqv1tiv1ti): New define_expand implementing V1TImode
vector equality as a V2DImode vector comparison (see above),
followed by a pshufd and pand.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse2-v1ti-veq.c: New test case.
* gcc.target/i386/sse2-v1ti-vne.c: New test case.
Roger
--
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2022-04-21 9:31 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-04-20 18:28 [x86 PATCH] Improved V1TI (and V2DI) mode equality/inequality Roger Sayle
2022-04-21 9:30 ` Uros Bizjak
-- strict thread matches above, loose matches on Subject: below --
2022-04-20 17:49 Roger Sayle
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).