From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <sayle@sourceware.org>
Received: by sourceware.org (Postfix, from userid 1251)
 id 6A9BE3858D28; Sat, 23 Oct 2021 09:07:08 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 6A9BE3858D28
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Content-Type: text/plain; charset="utf-8"
From: Roger Sayle <sayle@gcc.gnu.org>
To: gcc-cvs@gcc.gnu.org
Subject: [gcc r12-4639] x86_64: Add insn patterns for V1TI mode logic
 operations.
X-Act-Checkin: gcc
X-Git-Author: Roger Sayle <roger@nextmovesoftware.com>
X-Git-Refname: refs/heads/master
X-Git-Oldrev: 693abdb66aba25f3fb25c3cd8d65dbb64ecd37a0
X-Git-Newrev: 36051875168db600c103277499b092acb4755eab
Message-Id: <20211023090708.6A9BE3858D28@sourceware.org>
Date: Sat, 23 Oct 2021 09:07:08 +0000 (GMT)
X-BeenThere: gcc-cvs@gcc.gnu.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Gcc-cvs mailing list <gcc-cvs.gcc.gnu.org>
List-Unsubscribe: <https://gcc.gnu.org/mailman/options/gcc-cvs>,
 <mailto:gcc-cvs-request@gcc.gnu.org?subject=unsubscribe>
List-Archive: <https://gcc.gnu.org/pipermail/gcc-cvs/>
List-Help: <mailto:gcc-cvs-request@gcc.gnu.org?subject=help>
List-Subscribe: <https://gcc.gnu.org/mailman/listinfo/gcc-cvs>,
 <mailto:gcc-cvs-request@gcc.gnu.org?subject=subscribe>
X-List-Received-Date: Sat, 23 Oct 2021 09:07:08 -0000

https://gcc.gnu.org/g:36051875168db600c103277499b092acb4755eab

commit r12-4639-g36051875168db600c103277499b092acb4755eab
Author: Roger Sayle <roger@nextmovesoftware.com>
Date:   Sat Oct 23 10:06:06 2021 +0100

    x86_64: Add insn patterns for V1TI mode logic operations.
    
    On x86_64, V1TI mode holds a 128-bit integer value in a (vector) SSE
    register (where regular TI mode uses a pair of 64-bit general purpose
    scalar registers).  This patch improves the implementation of AND, IOR,
    XOR and NOT on these values.
    
    The benefit is demonstrated by the following simple test program:
    
    typedef unsigned __int128 v1ti __attribute__ ((__vector_size__ (16)));
    v1ti and(v1ti x, v1ti y) { return x & y; }
    v1ti ior(v1ti x, v1ti y) { return x | y; }
    v1ti xor(v1ti x, v1ti y) { return x ^ y; }
    v1ti not(v1ti x) { return ~x; }
    
    For which GCC currently generates the rather large:
    
    and:    movdqa  %xmm0, %xmm2
            movq    %xmm1, %rdx
            movq    %xmm0, %rax
            andq    %rdx, %rax
            movhlps %xmm2, %xmm3
            movhlps %xmm1, %xmm4
            movq    %rax, %xmm0
            movq    %xmm4, %rdx
            movq    %xmm3, %rax
            andq    %rdx, %rax
            movq    %rax, %xmm5
            punpcklqdq      %xmm5, %xmm0
            ret
    
    ior:    movdqa  %xmm0, %xmm2
            movq    %xmm1, %rdx
            movq    %xmm0, %rax
            orq     %rdx, %rax
            movhlps %xmm2, %xmm3
            movhlps %xmm1, %xmm4
            movq    %rax, %xmm0
            movq    %xmm4, %rdx
            movq    %xmm3, %rax
            orq     %rdx, %rax
            movq    %rax, %xmm5
            punpcklqdq      %xmm5, %xmm0
            ret
    
    xor:    movdqa  %xmm0, %xmm2
            movq    %xmm1, %rdx
            movq    %xmm0, %rax
            xorq    %rdx, %rax
            movhlps %xmm2, %xmm3
            movhlps %xmm1, %xmm4
            movq    %rax, %xmm0
            movq    %xmm4, %rdx
            movq    %xmm3, %rax
            xorq    %rdx, %rax
            movq    %rax, %xmm5
            punpcklqdq      %xmm5, %xmm0
            ret
    
    not:    movdqa  %xmm0, %xmm1
            movq    %xmm0, %rax
            notq    %rax
            movhlps %xmm1, %xmm2
            movq    %rax, %xmm0
            movq    %xmm2, %rax
            notq    %rax
            movq    %rax, %xmm3
            punpcklqdq      %xmm3, %xmm0
            ret
    
    With this patch we now generate the much more efficient:
    
    and:    pand    %xmm1, %xmm0
            ret
    
    ior:    por     %xmm1, %xmm0
            ret
    
    xor:    pxor    %xmm1, %xmm0
            ret
    
    not:    pcmpeqd %xmm1, %xmm1
            pxor    %xmm1, %xmm0
            ret
    
    For my first few attempts at this patch I tried adding V1TI to the
    existing VI and VI12_AVX_512F mode iterators, but these then have
    dependencies on other iterators (and attributes), and so on until
    everything ties itself into a knot, as V1TI mode isn't really a
    first-class vector mode on x86_64.  Hence I ultimately opted to use
    simple stand-alone patterns (as used by the existing TF mode support).
    
    2021-10-23  Roger Sayle  <roger@nextmovesoftware.com>
    
    gcc/ChangeLog
            * config/i386/sse.md (<any_logic>v1ti3): New define_insn to
            implement V1TImode AND, IOR and XOR on TARGET_SSE2 (and above).
            (one_cmplv1ti2): New define_expand.
    
    gcc/testsuite/ChangeLog
            * gcc.target/i386/sse2-v1ti-logic.c: New test case.
            * gcc.target/i386/sse2-v1ti-logic-2.c: New test case.

Diff:
---
 gcc/config/i386/sse.md                            | 25 +++++++++++
 gcc/testsuite/gcc.target/i386/sse2-v1ti-logic-2.c | 53 +++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/sse2-v1ti-logic.c   | 28 ++++++++++++
 3 files changed, 106 insertions(+)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index fbf056bf9e6..f37c5c0e706 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -16268,6 +16268,31 @@
 	      ]
 	      (const_string "<sseinsnmode>")))])
 
+(define_insn "<code>v1ti3"
+  [(set (match_operand:V1TI 0 "register_operand" "=x,x,v")
+	(any_logic:V1TI
+	  (match_operand:V1TI 1 "register_operand" "%0,x,v")
+	  (match_operand:V1TI 2 "vector_operand" "xBm,xm,vm")))]
+  "TARGET_SSE2"
+  "@
+   p<logic>\t{%2, %0|%0, %2}
+   vp<logic>\t{%2, %1, %0|%0, %1, %2}
+   vp<logic>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx,avx")
+   (set_attr "prefix" "orig,vex,evex")
+   (set_attr "prefix_data16" "1,*,*")
+   (set_attr "type" "sselog")
+   (set_attr "mode" "TI")])
+
+(define_expand "one_cmplv1ti2"
+  [(set (match_operand:V1TI 0 "register_operand")
+	(xor:V1TI (match_operand:V1TI 1 "register_operand")
+		  (match_dup 2)))]
+  "TARGET_SSE2"
+{
+  operands[2] = force_reg (V1TImode, CONSTM1_RTX (V1TImode));
+})
+
 (define_mode_iterator AVX512ZEXTMASK
   [(DI "TARGET_AVX512BW") (SI "TARGET_AVX512BW") HI])
 
diff --git a/gcc/testsuite/gcc.target/i386/sse2-v1ti-logic-2.c b/gcc/testsuite/gcc.target/i386/sse2-v1ti-logic-2.c
new file mode 100644
index 00000000000..3ec64555334
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-v1ti-logic-2.c
@@ -0,0 +1,53 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -msse2" } */
+
+typedef unsigned __int128 v1ti __attribute__ ((__vector_size__ (16)));
+
+v1ti x;
+v1ti y;
+v1ti z;
+
+void and2()
+{
+  x &= y;
+}
+
+void and3()
+{
+  x = y & z;
+}
+
+void ior2()
+{
+  x |= y;
+}
+
+void ior3()
+{
+  x = y | z;
+}
+
+
+void xor2()
+{
+  x ^= y;
+}
+
+void xor3()
+{
+  x =  y ^ z;
+}
+
+void not1()
+{
+  x = ~x;
+}
+
+void not2()
+{
+  x = ~y;
+}
+
+/* { dg-final { scan-assembler-times "pand" 2 } } */
+/* { dg-final { scan-assembler-times "por" 2 } } */
+/* { dg-final { scan-assembler-times "pxor" 4 } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-v1ti-logic.c b/gcc/testsuite/gcc.target/i386/sse2-v1ti-logic.c
new file mode 100644
index 00000000000..130a89b91c9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-v1ti-logic.c
@@ -0,0 +1,28 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -msse2" } */
+
+typedef unsigned __int128 v1ti __attribute__ ((__vector_size__ (16)));
+
+v1ti and(v1ti x, v1ti y)
+{
+  return x & y;
+}
+
+v1ti ior(v1ti x, v1ti y)
+{
+  return x | y;
+}
+
+v1ti xor(v1ti x, v1ti y)
+{
+  return x ^ y;
+}
+
+v1ti not(v1ti x)
+{
+  return ~x;
+}
+
+/* { dg-final { scan-assembler "pand" } } */
+/* { dg-final { scan-assembler "por" } } */
+/* { dg-final { scan-assembler-times "pxor" 2 } } */