* [PATCH] x86_64: Add insn patterns for V1TI mode logic operations.
@ 2021-10-22 7:19 Roger Sayle
2021-10-22 14:53 ` Uros Bizjak
0 siblings, 1 reply; 2+ messages in thread
From: Roger Sayle @ 2021-10-22 7:19 UTC (permalink / raw)
To: 'GCC Patches'
[-- Attachment #1: Type: text/plain, Size: 3123 bytes --]
On x86_64, V1TI mode holds a 128-bit integer value in a (vector) SSE
register (where regular TI mode uses a pair of 64-bit general purpose
scalar registers). This patch improves the implementation of AND, IOR,
XOR and NOT on these values.
The benefit is demonstrated by the following simple test program:
typedef unsigned __int128 v1ti __attribute__ ((__vector_size__ (16)));
v1ti and(v1ti x, v1ti y) { return x & y; }
v1ti ior(v1ti x, v1ti y) { return x | y; }
v1ti xor(v1ti x, v1ti y) { return x ^ y; }
v1ti not(v1ti x) { return ~x; }
For which GCC currently generates the rather large:
and: movdqa %xmm0, %xmm2
movq %xmm1, %rdx
movq %xmm0, %rax
andq %rdx, %rax
movhlps %xmm2, %xmm3
movhlps %xmm1, %xmm4
movq %rax, %xmm0
movq %xmm4, %rdx
movq %xmm3, %rax
andq %rdx, %rax
movq %rax, %xmm5
punpcklqdq %xmm5, %xmm0
ret
ior: movdqa %xmm0, %xmm2
movq %xmm1, %rdx
movq %xmm0, %rax
orq %rdx, %rax
movhlps %xmm2, %xmm3
movhlps %xmm1, %xmm4
movq %rax, %xmm0
movq %xmm4, %rdx
movq %xmm3, %rax
orq %rdx, %rax
movq %rax, %xmm5
punpcklqdq %xmm5, %xmm0
ret
xor: movdqa %xmm0, %xmm2
movq %xmm1, %rdx
movq %xmm0, %rax
xorq %rdx, %rax
movhlps %xmm2, %xmm3
movhlps %xmm1, %xmm4
movq %rax, %xmm0
movq %xmm4, %rdx
movq %xmm3, %rax
xorq %rdx, %rax
movq %rax, %xmm5
punpcklqdq %xmm5, %xmm0
ret
not: movdqa %xmm0, %xmm1
movq %xmm0, %rax
notq %rax
movhlps %xmm1, %xmm2
movq %rax, %xmm0
movq %xmm2, %rax
notq %rax
movq %rax, %xmm3
punpcklqdq %xmm3, %xmm0
ret
with this patch we now generate the much more efficient:
and: pand %xmm1, %xmm0
ret
ior: por %xmm1, %xmm0
ret
xor: pxor %xmm1, %xmm0
ret
not: pcmpeqd %xmm1, %xmm1
pxor %xmm1, %xmm0
ret
For my first few attempts at this patch I tried adding V1TI to the
existing VI and VI12_AVX_512F mode iterators, but these then have
dependencies on other iterators (and attributes), and so on until
everything ties itself into a knot, as V1TI mode isn't really a
first-class vector mode on x86_64. Hence I ultimately opted to use
simple stand-alone patterns (as used by the existing TF mode support).
This patch has been tested on x86_64-pc-linux-gnu with "make bootstrap"
and "make -k check" with no new failures. Ok for mainline?
2021-10-22 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/sse.md (<any_logic>v1ti3): New define_insn to
implement V1TImode AND, IOR and XOR on TARGET_SSE2 (and above).
(one_cmplv1ti2): New define_expand.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse2-v1ti-logic.c: New test case.
* gcc.target/i386/sse2-v1ti-logic-2.c: New test case.
Thanks in advance,
Roger
--
[-- Attachment #2: patchv.txt --]
[-- Type: text/plain, Size: 1138 bytes --]
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index fbf056b..f37c5c0 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -16268,6 +16268,31 @@
]
(const_string "<sseinsnmode>")))])
+(define_insn "<code>v1ti3"
+ [(set (match_operand:V1TI 0 "register_operand" "=x,x,v")
+ (any_logic:V1TI
+ (match_operand:V1TI 1 "register_operand" "%0,x,v")
+ (match_operand:V1TI 2 "vector_operand" "xBm,xm,vm")))]
+ "TARGET_SSE2"
+ "@
+ p<logic>\t{%2, %0|%0, %2}
+ vp<logic>\t{%2, %1, %0|%0, %1, %2}
+ vp<logic>\t{%2, %1, %0|%0, %1, %2}"
+ [(set_attr "isa" "noavx,avx,avx")
+ (set_attr "prefix" "orig,vex,evex")
+ (set_attr "prefix_data16" "1,*,*")
+ (set_attr "type" "sselog")
+ (set_attr "mode" "TI")])
+
+(define_expand "one_cmplv1ti2"
+ [(set (match_operand:V1TI 0 "register_operand")
+ (xor:V1TI (match_operand:V1TI 1 "register_operand")
+ (match_dup 2)))]
+ "TARGET_SSE2"
+{
+ operands[2] = force_reg (V1TImode, CONSTM1_RTX (V1TImode));
+})
+
(define_mode_iterator AVX512ZEXTMASK
[(DI "TARGET_AVX512BW") (SI "TARGET_AVX512BW") HI])
[-- Attachment #3: sse2-v1ti-logic.c --]
[-- Type: text/plain, Size: 536 bytes --]
/* { dg-do compile { target int128 } } */
/* { dg-options "-O2 -msse2" } */
/* { dg-require-effective-target sse2 } */
typedef unsigned __int128 v1ti __attribute__ ((__vector_size__ (16)));
v1ti and(v1ti x, v1ti y)
{
return x & y;
}
v1ti ior(v1ti x, v1ti y)
{
return x | y;
}
v1ti xor(v1ti x, v1ti y)
{
return x ^ y;
}
v1ti not(v1ti x)
{
return ~x;
}
/* { dg-final { scan-assembler "pand" } } */
/* { dg-final { scan-assembler "por" } } */
/* { dg-final { scan-assembler-times "pxor" 2 } } */
[-- Attachment #4: sse2-v1ti-logic-2.c --]
[-- Type: text/plain, Size: 656 bytes --]
/* { dg-do compile { target int128 } } */
/* { dg-options "-O2 -msse2" } */
/* { dg-require-effective-target sse2 } */
typedef unsigned __int128 v1ti __attribute__ ((__vector_size__ (16)));
v1ti x;
v1ti y;
v1ti z;
void and2()
{
x &= y;
}
void and3()
{
x = y & z;
}
void ior2()
{
x |= y;
}
void ior3()
{
x = y | z;
}
void xor2()
{
x ^= y;
}
void xor3()
{
x = y ^ z;
}
void not1()
{
x = ~x;
}
void not2()
{
x = ~y;
}
/* { dg-final { scan-assembler-times "pand" 2 } } */
/* { dg-final { scan-assembler-times "por" 2 } } */
/* { dg-final { scan-assembler-times "pxor" 4 } } */
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: [PATCH] x86_64: Add insn patterns for V1TI mode logic operations.
2021-10-22 7:19 [PATCH] x86_64: Add insn patterns for V1TI mode logic operations Roger Sayle
@ 2021-10-22 14:53 ` Uros Bizjak
0 siblings, 0 replies; 2+ messages in thread
From: Uros Bizjak @ 2021-10-22 14:53 UTC (permalink / raw)
To: Roger Sayle; +Cc: GCC Patches
On Fri, Oct 22, 2021 at 9:19 AM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> On x86_64, V1TI mode holds a 128-bit integer value in a (vector) SSE
> register (where regular TI mode uses a pair of 64-bit general purpose
> scalar registers). This patch improves the implementation of AND, IOR,
> XOR and NOT on these values.
>
> The benefit is demonstrated by the following simple test program:
>
> typedef unsigned __int128 v1ti __attribute__ ((__vector_size__ (16)));
> v1ti and(v1ti x, v1ti y) { return x & y; }
> v1ti ior(v1ti x, v1ti y) { return x | y; }
> v1ti xor(v1ti x, v1ti y) { return x ^ y; }
> v1ti not(v1ti x) { return ~x; }
>
> For which GCC currently generates the rather large:
>
> and: movdqa %xmm0, %xmm2
> movq %xmm1, %rdx
> movq %xmm0, %rax
> andq %rdx, %rax
> movhlps %xmm2, %xmm3
> movhlps %xmm1, %xmm4
> movq %rax, %xmm0
> movq %xmm4, %rdx
> movq %xmm3, %rax
> andq %rdx, %rax
> movq %rax, %xmm5
> punpcklqdq %xmm5, %xmm0
> ret
>
> ior: movdqa %xmm0, %xmm2
> movq %xmm1, %rdx
> movq %xmm0, %rax
> orq %rdx, %rax
> movhlps %xmm2, %xmm3
> movhlps %xmm1, %xmm4
> movq %rax, %xmm0
> movq %xmm4, %rdx
> movq %xmm3, %rax
> orq %rdx, %rax
> movq %rax, %xmm5
> punpcklqdq %xmm5, %xmm0
> ret
>
> xor: movdqa %xmm0, %xmm2
> movq %xmm1, %rdx
> movq %xmm0, %rax
> xorq %rdx, %rax
> movhlps %xmm2, %xmm3
> movhlps %xmm1, %xmm4
> movq %rax, %xmm0
> movq %xmm4, %rdx
> movq %xmm3, %rax
> xorq %rdx, %rax
> movq %rax, %xmm5
> punpcklqdq %xmm5, %xmm0
> ret
>
> not: movdqa %xmm0, %xmm1
> movq %xmm0, %rax
> notq %rax
> movhlps %xmm1, %xmm2
> movq %rax, %xmm0
> movq %xmm2, %rax
> notq %rax
> movq %rax, %xmm3
> punpcklqdq %xmm3, %xmm0
> ret
>
>
> with this patch we now generate the much more efficient:
>
> and: pand %xmm1, %xmm0
> ret
>
> ior: por %xmm1, %xmm0
> ret
>
> xor: pxor %xmm1, %xmm0
> ret
>
> not: pcmpeqd %xmm1, %xmm1
> pxor %xmm1, %xmm0
> ret
>
>
> For my first few attempts at this patch I tried adding V1TI to the
> existing VI and VI12_AVX_512F mode iterators, but these then have
> dependencies on other iterators (and attributes), and so on until
> everything ties itself into a knot, as V1TI mode isn't really a
> first-class vector mode on x86_64. Hence I ultimately opted to use
> simple stand-alone patterns (as used by the existing TF mode support).
>
> This patch has been tested on x86_64-pc-linux-gnu with "make bootstrap"
> and "make -k check" with no new failures. Ok for mainline?
>
>
> 2021-10-22 Roger Sayle <roger@nextmovesoftware.com>
>
> gcc/ChangeLog
> * config/i386/sse.md (<any_logic>v1ti3): New define_insn to
> implement V1TImode AND, IOR and XOR on TARGET_SSE2 (and above).
> (one_cmplv1ti2): New define_expand.
>
> gcc/testsuite/ChangeLog
> * gcc.target/i386/sse2-v1ti-logic.c: New test case.
> * gcc.target/i386/sse2-v1ti-logic-2.c: New test case.
There is no need for
/* { dg-require-effective-target sse2 } */
for compile tests. The compilation does not reach the assembler.
OK with the above change.
BTW: You can add testcases to the main patch with "git add <filename>"
and then create the patch with "git diff HEAD".
Thanks,
Uros.
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2021-10-22 14:53 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-22 7:19 [PATCH] x86_64: Add insn patterns for V1TI mode logic operations Roger Sayle
2021-10-22 14:53 ` Uros Bizjak
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).