public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r14-2030] i386: Convert ptestz of pandn into ptestc.
@ 2023-06-22  6:44 Roger Sayle
  0 siblings, 0 replies; only message in thread
From: Roger Sayle @ 2023-06-22  6:44 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:5322f009e8f7d1c7a1c9aab7cb4c90c433398fdd

commit r14-2030-g5322f009e8f7d1c7a1c9aab7cb4c90c433398fdd
Author: Roger Sayle <roger@nextmovesoftware.com>
Date:   Thu Jun 22 07:43:07 2023 +0100

    i386: Convert ptestz of pandn into ptestc.
    
    This patch is the next installment in a set of backend patches around
    improvements to ptest/vptest.  A previous patch optimized the sequence
    t=pand(x,y); ptestz(t,t) into the equivalent ptestz(x,y), using the
    property that ZF is set to (X&Y) == 0.  This patch performs a similar
    transformation, converting t=pandn(x,y); ptestz(t,t) into the (almost)
    equivalent ptestc(y,x), using the property that the CF flag is set to
    (~X&Y) == 0.  The tricky bit is that this sets the CF flag instead of
    the ZF flag, so we can only perform this transformation when we can
    also convert the flags consumer, as well as the producer.
    
    For the test case:
    
    int foo (__m128i x, __m128i y)
    {
      __m128i a = x & ~y;
      return __builtin_ia32_ptestz128 (a, a);
    }
    
    With -O2 -msse4.1 we previously generated:
    
    foo:    pandn   %xmm0, %xmm1
            xorl    %eax, %eax
            ptest   %xmm1, %xmm1
            sete    %al
            ret
    
    with this patch we now generate:
    
    foo:    xorl    %eax, %eax
            ptest   %xmm0, %xmm1
            setc    %al
            ret
    
    At the same time, this patch also provides alternative fixes for
    PR target/109973 and PR target/110118, by recognizing that ptestc(x,x)
    always sets the carry flag (X&~X is always zero).  This is achieved
    both by recognizing the special case in ix86_expand_sse_ptest and with
    a splitter to convert an eligible ptest into an stc.
    
    2023-06-22  Roger Sayle  <roger@nextmovesoftware.com>
                Uros Bizjak  <ubizjak@gmail.com>
    
    gcc/ChangeLog
            * config/i386/i386-expand.cc (ix86_expand_sse_ptest): Recognize
            expansion of ptestc with equal operands as producing const1_rtx.
            * config/i386/i386.cc (ix86_rtx_costs): Provide accurate cost
            estimates of UNSPEC_PTEST, where the ptest performs the PAND
            or PANDN of its operands.
            * config/i386/sse.md (define_split): Transform CCCmode UNSPEC_PTEST
            of reg_equal_p operands into an x86_stc instruction.
            (define_split): Split pandn/ptestz/set{n?}e into ptestc/set{n?}c.
            (define_split): Similar to above for strict_low_part destinations.
            (define_split): Split pandn/ptestz/j{n?}e into ptestc/j{n?}c.
    
    gcc/testsuite/ChangeLog
            * gcc.target/i386/avx-vptest-4.c: New test case.
            * gcc.target/i386/avx-vptest-5.c: Likewise.
            * gcc.target/i386/avx-vptest-6.c: Likewise.
            * gcc.target/i386/pr109973-1.c: Update test case.
            * gcc.target/i386/pr109973-2.c: Likewise.
            * gcc.target/i386/sse4_1-ptest-4.c: New test case.
            * gcc.target/i386/sse4_1-ptest-5.c: Likewise.
            * gcc.target/i386/sse4_1-ptest-6.c: Likewise.

Diff:
---
 gcc/config/i386/i386-expand.cc                 | 12 +++++
 gcc/config/i386/i386.cc                        | 23 +++++----
 gcc/config/i386/sse.md                         | 64 ++++++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/avx-vptest-4.c   | 21 +++++++++
 gcc/testsuite/gcc.target/i386/avx-vptest-5.c   | 21 +++++++++
 gcc/testsuite/gcc.target/i386/avx-vptest-6.c   | 40 ++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr109973-1.c     |  2 +-
 gcc/testsuite/gcc.target/i386/pr109973-2.c     |  2 +-
 gcc/testsuite/gcc.target/i386/sse4_1-ptest-4.c | 22 +++++++++
 gcc/testsuite/gcc.target/i386/sse4_1-ptest-5.c | 22 +++++++++
 gcc/testsuite/gcc.target/i386/sse4_1-ptest-6.c | 40 ++++++++++++++++
 11 files changed, 259 insertions(+), 10 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 7bb4d3912a9..9a8d244f85c 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -10234,6 +10234,18 @@ ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
   machine_mode mode1 = insn_data[d->icode].operand[1].mode;
   enum rtx_code comparison = d->comparison;
 
+  /* ptest reg, reg sets the carry flag.  */
+  if (comparison == LTU
+      && (d->code == IX86_BUILTIN_PTESTC
+	  || d->code == IX86_BUILTIN_PTESTC256)
+      && rtx_equal_p (op0, op1))
+    {
+      if (!target)
+	target = gen_reg_rtx (SImode);
+      emit_move_insn (target, const1_rtx);
+      return target;
+    }
+
   if (VECTOR_MODE_P (mode0))
     op0 = safe_vector_operand (op0, mode0);
   if (VECTOR_MODE_P (mode1))
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 32851a514a9..0761965344b 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -21423,16 +21423,23 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
       else if (XINT (x, 1) == UNSPEC_PTEST)
 	{
 	  *total = cost->sse_op;
-	  if (XVECLEN (x, 0) == 2
-	      && GET_CODE (XVECEXP (x, 0, 0)) == AND)
+	  rtx test_op0 = XVECEXP (x, 0, 0);
+	  if (!rtx_equal_p (test_op0, XVECEXP (x, 0, 1)))
+	    return false;
+	  if (GET_CODE (test_op0) == AND)
 	    {
-	      rtx andop = XVECEXP (x, 0, 0);
-	      *total += rtx_cost (XEXP (andop, 0), GET_MODE (andop),
-				  AND, opno, speed)
-			+ rtx_cost (XEXP (andop, 1), GET_MODE (andop),
-				    AND, opno, speed);
-	      return true;
+	      rtx and_op0 = XEXP (test_op0, 0);
+	      if (GET_CODE (and_op0) == NOT)
+		and_op0 = XEXP (and_op0, 0);
+	      *total += rtx_cost (and_op0, GET_MODE (and_op0),
+				  AND, 0, speed)
+			+ rtx_cost (XEXP (test_op0, 1), GET_MODE (and_op0),
+				    AND, 1, speed);
 	    }
+	  else
+	    *total = rtx_cost (test_op0, GET_MODE (test_op0),
+			       UNSPEC, 0, speed);
+	  return true;
 	}
       return false;
 
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index f793258b6c2..a847a2bc327 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -23490,6 +23490,70 @@
   [(set (reg:CCZ FLAGS_REG)
 	(unspec:CCZ [(match_dup 0) (match_dup 1)] UNSPEC_PTEST))])
 
+;; ptest reg,reg sets the carry flag.
+(define_split
+  [(set (reg:CCC FLAGS_REG)
+	(unspec:CCC [(match_operand:V_AVX 0 "register_operand")
+		     (match_operand:V_AVX 1 "register_operand")]
+		    UNSPEC_PTEST))]
+  "TARGET_SSE4_1
+   && rtx_equal_p (operands[0], operands[1])"
+  [(set (reg:CCC FLAGS_REG)
+	(unspec:CCC [(const_int 0)] UNSPEC_STC))])
+
+;; Changing the CCmode of FLAGS_REG requires updating both def and use.
+;; pandn/ptestz/set{n?}e -> ptestc/set{n?}c
+(define_split
+  [(set (match_operand:SWI 0 "register_operand")
+	(match_operator:SWI 3 "bt_comparison_operator"
+	  [(unspec:CCZ [
+	     (and:V_AVX (not:V_AVX (match_operand:V_AVX 1 "register_operand"))
+			(match_operand:V_AVX 2 "register_operand"))
+	     (and:V_AVX (not:V_AVX (match_dup 1)) (match_dup 2))]
+	     UNSPEC_PTEST)
+	   (const_int 0)]))]
+  "TARGET_SSE4_1"
+  [(set (reg:CCC FLAGS_REG)
+	(unspec:CCC [(match_dup 1) (match_dup 2)] UNSPEC_PTEST))
+   (set (match_dup 0)
+	(match_op_dup 3 [(reg:CCC FLAGS_REG) (const_int 0)]))])
+
+(define_split
+  [(set (strict_low_part (match_operand:QI 0 "register_operand"))
+	(match_operator:QI 3 "bt_comparison_operator"
+	  [(unspec:CCZ [
+	     (and:V_AVX (not:V_AVX (match_operand:V_AVX 1 "register_operand"))
+			(match_operand:V_AVX 2 "register_operand"))
+	     (and:V_AVX (not:V_AVX (match_dup 1)) (match_dup 2))]
+	     UNSPEC_PTEST)
+	   (const_int 0)]))]
+  "TARGET_SSE4_1"
+  [(set (reg:CCC FLAGS_REG)
+	(unspec:CCC [(match_dup 1) (match_dup 2)] UNSPEC_PTEST))
+   (set (strict_low_part (match_dup 0))
+	(match_op_dup 3 [(reg:CCC FLAGS_REG) (const_int 0)]))])
+
+;; pandn/ptestz/j{n?}e -> ptestc/j{n?}c
+(define_split
+  [(set (pc)
+	(if_then_else
+	  (match_operator 3 "bt_comparison_operator"
+	    [(unspec:CCZ [
+	       (and:V_AVX
+		 (not:V_AVX (match_operand:V_AVX 1 "register_operand"))
+		 (match_operand:V_AVX 2 "register_operand"))
+	       (and:V_AVX (not:V_AVX (match_dup 1)) (match_dup 2))]
+	       UNSPEC_PTEST)
+	     (const_int 0)])
+	  (match_operand 0)
+	  (pc)))]
+  "TARGET_SSE4_1"
+  [(set (reg:CCC FLAGS_REG)
+	(unspec:CCC [(match_dup 1) (match_dup 2)] UNSPEC_PTEST))
+   (set (pc) (if_then_else (match_op_dup 3 [(reg:CCC FLAGS_REG) (const_int 0)])
+			   (match_dup 0)
+			   (pc)))])
+
 (define_expand "nearbyint<mode>2"
   [(set (match_operand:VFH 0 "register_operand")
 	(unspec:VFH
diff --git a/gcc/testsuite/gcc.target/i386/avx-vptest-4.c b/gcc/testsuite/gcc.target/i386/avx-vptest-4.c
new file mode 100644
index 00000000000..0a234e154ad
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx-vptest-4.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+
+typedef long long __m256i __attribute__ ((__vector_size__ (32)));
+
+int foo (__m256i x, __m256i y)
+{
+  __m256i a = x & ~y;
+  return __builtin_ia32_ptestz256 (a, a);
+}
+
+int bar (__m256i x, __m256i y)
+{
+  __m256i a = ~x & y;
+  return __builtin_ia32_ptestz256 (a, a);
+}
+
+/* { dg-final { scan-assembler-times "vptest\[ \\t\]+%" 2 } } */
+/* { dg-final { scan-assembler-times "setc" 2 } } */
+/* { dg-final { scan-assembler-not "vpandn" } } */
+/* { dg-final { scan-assembler-not "sete" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx-vptest-5.c b/gcc/testsuite/gcc.target/i386/avx-vptest-5.c
new file mode 100644
index 00000000000..fd0e5e23c90
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx-vptest-5.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+
+typedef long long __m256i __attribute__ ((__vector_size__ (32)));
+
+int foo (__m256i x, __m256i y)
+{
+  __m256i a = x & ~y;
+  return !__builtin_ia32_ptestz256 (a, a);
+}
+
+int bar (__m256i x, __m256i y)
+{
+  __m256i a = ~x & y;
+  return !__builtin_ia32_ptestz256 (a, a);
+}
+
+/* { dg-final { scan-assembler-times "vptest\[ \\t\]+%" 2} } */
+/* { dg-final { scan-assembler-times "setnc" 2 } } */
+/* { dg-final { scan-assembler-not "vpandn" } } */
+/* { dg-final { scan-assembler-not "setne" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx-vptest-6.c b/gcc/testsuite/gcc.target/i386/avx-vptest-6.c
new file mode 100644
index 00000000000..5821a92077c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx-vptest-6.c
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+
+typedef long long __m256i __attribute__ ((__vector_size__ (32)));
+
+extern void ext (void);
+
+void foo (__m256i x, __m256i y)
+{
+  __m256i a = x & ~y;
+  if (__builtin_ia32_ptestz256 (a, a))
+    ext();
+}
+
+void bar (__m256i x, __m256i y)
+{
+  __m256i a = ~x & y;
+  if (__builtin_ia32_ptestz256 (a, a))
+    ext();
+}
+
+void foo2 (__m256i x, __m256i y)
+{
+  __m256i a = x & ~y;
+  if (__builtin_ia32_ptestz256 (a, a))
+    ext();
+}
+
+void bar2 (__m256i x, __m256i y)
+{
+  __m256i a = ~x & y;
+  if (__builtin_ia32_ptestz256 (a, a))
+    ext();
+}
+
+/* { dg-final { scan-assembler-times "ptest\[ \\t\]+%" 4 } } */
+/* { dg-final { scan-assembler-times "jn?c" 4 } } */
+/* { dg-final { scan-assembler-not "pandn" } } */
+/* { dg-final { scan-assembler-not "jne" } } */
+/* { dg-final { scan-assembler-not "je" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr109973-1.c b/gcc/testsuite/gcc.target/i386/pr109973-1.c
index a1b6136bfea..1d812dd795f 100644
--- a/gcc/testsuite/gcc.target/i386/pr109973-1.c
+++ b/gcc/testsuite/gcc.target/i386/pr109973-1.c
@@ -10,4 +10,4 @@ foo (__m256i x, __m256i y)
   return __builtin_ia32_ptestc256 (a, a);
 }
 
-/* { dg-final { scan-assembler "vpand" } } */
+/* { dg-final { scan-assembler "movl\[ \\t]*\\\$1, %eax" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr109973-2.c b/gcc/testsuite/gcc.target/i386/pr109973-2.c
index 167f6ee02e2..1068c3e1dad 100644
--- a/gcc/testsuite/gcc.target/i386/pr109973-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr109973-2.c
@@ -10,4 +10,4 @@ foo (__m128i x, __m128i y)
   return __builtin_ia32_ptestc128 (a, a);
 }
 
-/* { dg-final { scan-assembler "pand" } } */
+/* { dg-final { scan-assembler "movl\[ \\t]*\\\$1, %eax" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-ptest-4.c b/gcc/testsuite/gcc.target/i386/sse4_1-ptest-4.c
new file mode 100644
index 00000000000..e74ddb3d824
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4_1-ptest-4.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4.1" } */
+
+typedef long long __m128i __attribute__ ((__vector_size__ (16)));
+
+int foo (__m128i x, __m128i y)
+{
+  __m128i a = x & ~y;
+  return __builtin_ia32_ptestz128 (a, a);
+}
+
+int bar (__m128i x, __m128i y)
+{
+  __m128i a = ~x & y;
+  return __builtin_ia32_ptestz128 (a, a);
+}
+
+/* { dg-final { scan-assembler-times "ptest\[ \\t\]+%" 2 } } */
+/* { dg-final { scan-assembler-times "setc" 2 } } */
+/* { dg-final { scan-assembler-not "pandn" } } */
+/* { dg-final { scan-assembler-not "sete" } } */
+
diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-ptest-5.c b/gcc/testsuite/gcc.target/i386/sse4_1-ptest-5.c
new file mode 100644
index 00000000000..74b0a8c29b4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4_1-ptest-5.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4.1" } */
+
+typedef long long __m128i __attribute__ ((__vector_size__ (16)));
+
+int foo (__m128i x, __m128i y)
+{
+  __m128i a = x & ~y;
+  return !__builtin_ia32_ptestz128 (a, a);
+}
+
+int bar (__m128i x, __m128i y)
+{
+  __m128i a = ~x & y;
+  return !__builtin_ia32_ptestz128 (a, a);
+}
+
+/* { dg-final { scan-assembler-times "ptest\[ \\t\]+%" 2 } } */
+/* { dg-final { scan-assembler-times "setnc" 2 } } */
+/* { dg-final { scan-assembler-not "pandn" } } */
+/* { dg-final { scan-assembler-not "setne" } } */
+
diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-ptest-6.c b/gcc/testsuite/gcc.target/i386/sse4_1-ptest-6.c
new file mode 100644
index 00000000000..d9114bb24a8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4_1-ptest-6.c
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4.1" } */
+
+typedef long long __m128i __attribute__ ((__vector_size__ (16)));
+
+extern void ext (void);
+
+void foo (__m128i x, __m128i y)
+{
+  __m128i a = x & ~y;
+  if (__builtin_ia32_ptestz128 (a, a))
+    ext();
+}
+
+void bar (__m128i x, __m128i y)
+{
+  __m128i a = ~x & y;
+  if (__builtin_ia32_ptestz128 (a, a))
+    ext();
+}
+
+void foo2 (__m128i x, __m128i y)
+{
+  __m128i a = x & ~y;
+  if (__builtin_ia32_ptestz128 (a, a))
+    ext();
+}
+
+void bar2 (__m128i x, __m128i y)
+{
+  __m128i a = ~x & y;
+  if (__builtin_ia32_ptestz128 (a, a))
+    ext();
+}
+
+/* { dg-final { scan-assembler-times "ptest\[ \\t\]+%" 4 } } */
+/* { dg-final { scan-assembler-times "jn?c" 4 } } */
+/* { dg-final { scan-assembler-not "pandn" } } */
+/* { dg-final { scan-assembler-not "jne" } } */
+/* { dg-final { scan-assembler-not "je" } } */

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2023-06-22  6:44 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-06-22  6:44 [gcc r14-2030] i386: Convert ptestz of pandn into ptestc Roger Sayle

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).