public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [x86 PATCH] Convert ptestz of pandn into ptestc.
@ 2023-06-13 16:03 Roger Sayle
  2023-06-14  8:30 ` Uros Bizjak
  0 siblings, 1 reply; 4+ messages in thread
From: Roger Sayle @ 2023-06-13 16:03 UTC (permalink / raw)
  To: gcc-patches; +Cc: 'Uros Bizjak'

[-- Attachment #1: Type: text/plain, Size: 2861 bytes --]


This patch is the next instalment in a set of backend patches around
improvements to ptest/vptest.  A previous patch optimized the sequence
t=pand(x,y); ptestz(t,t) into the equivalent ptestz(x,y), using the
property that ZF is set to (X&Y) == 0.  This patch performs a similar
transformation, converting t=pandn(x,y); ptestz(t,t) into the (almost)
equivalent ptestc(y,x), using the property that the CF flags is set to
(~X&Y) == 0.  The tricky bit is that this sets the CF flag instead of
the ZF flag, so we can only perform this transformation when we can
also convert the flags' consumer, as well as the producer.

For the test case:

int foo (__m128i x, __m128i y)
{
  __m128i a = x & ~y;
  return __builtin_ia32_ptestz128 (a, a);
}

With -O2 -msse4.1 we previously generated:

foo:    pandn   %xmm0, %xmm1
        xorl    %eax, %eax
        ptest   %xmm1, %xmm1
        sete    %al
        ret

with this patch we now generate:

foo:    xorl    %eax, %eax
        ptest   %xmm0, %xmm1
        setc    %al
        ret

At the same time, this patch also provides alternative fixes for
PR target/109973 and PR target/110118, by recognizing that ptestc(x,x)
always sets the carry flag (X&~X is always zero).  This is achieved
both by recognizing the special case in ix86_expand_sse_ptest and with
a splitter to convert an eligible ptest into an stc.

The next piece is, of course, STV of "if (x & ~y)..."

This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32}
with no new failures.  Ok for mainline?

2023-06-13  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
        * config/i386/i386-expand.cc (ix86_expand_sse_ptest): Recognize
        expansion of ptestc with equal operands as returning const1_rtx.
        * config/i386/i386.cc (ix86_rtx_costs): Provide accurate cost
        estimates of UNSPEC_PTEST, where the ptest performs the PAND
        or PAND of its operands.
        * config/i386/sse.md (define_split): Transform CCCmode UNSPEC_PTEST
        of reg_equal_p operands into an x86_stc instruction.
        (define_split): Split pandn/ptestz/setne into ptestc/setnc.
        (define_split): Split pandn/ptestz/sete into ptestc/setc.
        (define_split): Split pandn/ptestz/je into ptestc/jc.
        (define_split): Split pandn/ptestz/jne into ptestc/jnc.

gcc/testsuite/ChangeLog
        * gcc.target/i386/avx-vptest-4.c: New test case.
        * gcc.target/i386/avx-vptest-5.c: Likewise.
        * gcc.target/i386/avx-vptest-6.c: Likewise.
        * gcc.target/i386/pr109973-1.c: Update test case.
        * gcc.target/i386/pr109973-2.c: Likewise.
        * gcc.target/i386/sse4_1-ptest-4.c: New test case.
        * gcc.target/i386/sse4_1-ptest-5.c: Likewise.
        * gcc.target/i386/sse4_1-ptest-6.c: Likewise.


Thanks in advance,
Roger
--


[-- Attachment #2: patchpt.txt --]
[-- Type: text/plain, Size: 11575 bytes --]

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index def060a..1d11af2 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -10222,6 +10222,13 @@ ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
   machine_mode mode1 = insn_data[d->icode].operand[1].mode;
   enum rtx_code comparison = d->comparison;
 
+  /* ptest reg, reg sets the carry flag.  */
+  if (comparison == LTU
+      && (d->code == IX86_BUILTIN_PTESTC
+	  || d->code == IX86_BUILTIN_PTESTC256)
+      && rtx_equal_p (op0, op1))
+    return const1_rtx;
+
   if (VECTOR_MODE_P (mode0))
     op0 = safe_vector_operand (op0, mode0);
   if (VECTOR_MODE_P (mode1))
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 3a1444d..3e99e23 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -21423,16 +21423,23 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
       else if (XINT (x, 1) == UNSPEC_PTEST)
 	{
 	  *total = cost->sse_op;
-	  if (XVECLEN (x, 0) == 2
-	      && GET_CODE (XVECEXP (x, 0, 0)) == AND)
+	  rtx test_op0 = XVECEXP (x, 0, 0);
+	  if (!rtx_equal_p (test_op0, XVECEXP (x, 0, 1)))
+	    return false;
+	  if (GET_CODE (test_op0) == AND)
 	    {
-	      rtx andop = XVECEXP (x, 0, 0);
-	      *total += rtx_cost (XEXP (andop, 0), GET_MODE (andop),
-				  AND, opno, speed)
-			+ rtx_cost (XEXP (andop, 1), GET_MODE (andop),
-				    AND, opno, speed);
-	      return true;
+	      rtx and_op0 = XEXP (test_op0, 0);
+	      if (GET_CODE (and_op0) == NOT)
+		and_op0 = XEXP (and_op0, 0);
+	      *total += rtx_cost (and_op0, GET_MODE (and_op0),
+				  AND, 0, speed)
+			+ rtx_cost (XEXP (test_op0, 1), GET_MODE (and_op0),
+				    AND, 1, speed);
 	    }
+	  else
+	    *total = rtx_cost (test_op0, GET_MODE (test_op0),
+			       UNSPEC, 0, speed);
+	  return true;
 	}
       return false;
 
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 9bec09d..282bcbe 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -23147,6 +23147,92 @@
   [(set (reg:CCZ FLAGS_REG)
 	(unspec:CCZ [(match_dup 0) (match_dup 1)] UNSPEC_PTEST))])
 
+;; ptest reg,reg sets the carry flag.
+(define_split
+  [(set (reg:CCC FLAGS_REG)
+	(unspec:CCC [(match_operand:V_AVX 0 "register_operand")
+		     (match_operand:V_AVX 1 "register_operand")]
+		    UNSPEC_PTEST))]
+  "TARGET_SSE4_1
+   && rtx_equal_p (operands[0], operands[1])"
+  [(set (reg:CCC FLAGS_REG)
+	(unspec:CCC [(const_int 0)] UNSPEC_STC))])
+
+;; pandn/ptestz/setne -> ptestc/setnc
+(define_split
+  [(set (match_operand:QI 0 "register_operand")
+	(ne:QI
+	  (unspec:CCZ [
+	    (and:V_AVX (not:V_AVX (match_operand:V_AVX 1 "register_operand"))
+		       (match_operand:V_AVX 2 "register_operand"))
+	    (and:V_AVX (not:V_AVX (match_dup 1)) (match_dup 2))]
+	    UNSPEC_PTEST)
+	  (const_int 0)))]
+  "TARGET_SSE4_1"
+  [(set (reg:CCC FLAGS_REG)
+	(unspec:CCC [(match_dup 1) (match_dup 2)] UNSPEC_PTEST))
+   (set (strict_low_part (subreg:QI (match_dup 0) 0))
+	(geu:QI (reg:CCC FLAGS_REG) (const_int 0)))])
+
+;; Changing the CCmode of FLAGS_REG requires updating both def and use.
+;; pandn/ptestz/sete -> ptestc/setc
+(define_split
+  [(set (strict_low_part (subreg:QI (match_operand:SI 0 "register_operand") 0))
+	(eq:QI
+	  (unspec:CCZ [
+	    (and:V_AVX (not:V_AVX (match_operand:V_AVX 1 "register_operand"))
+		       (match_operand:V_AVX 2 "register_operand"))
+	    (and:V_AVX (not:V_AVX (match_dup 1)) (match_dup 2))]
+	    UNSPEC_PTEST)
+	  (const_int 0)))]
+  "TARGET_SSE4_1"
+  [(set (reg:CCC FLAGS_REG)
+	(unspec:CCC [(match_dup 1) (match_dup 2)] UNSPEC_PTEST))
+   (set (strict_low_part (subreg:QI (match_dup 0) 0))
+	(ltu:QI (reg:CCC FLAGS_REG) (const_int 0)))])
+
+;; pandn/ptestz/je -> ptestc/jc
+(define_split
+  [(set (pc)
+	(if_then_else
+	  (ne
+	    (unspec:CCZ [
+	      (and:V_AVX
+		(not:V_AVX (match_operand:V_AVX 1 "register_operand"))
+		(match_operand:V_AVX 2 "register_operand"))
+	      (and:V_AVX (not:V_AVX (match_dup 1)) (match_dup 2))]
+	      UNSPEC_PTEST)
+	    (const_int 0))
+	  (match_operand 0)
+	  (pc)))]
+  "TARGET_SSE4_1"
+  [(set (reg:CCC FLAGS_REG)
+	(unspec:CCC [(match_dup 1) (match_dup 2)] UNSPEC_PTEST))
+   (set (pc) (if_then_else (geu (reg:CCC FLAGS_REG) (const_int 0))
+			   (match_dup 0)
+			   (pc)))])
+
+;; pandn/ptestz/jne -> ptestc/jnc
+(define_split
+  [(set (pc)
+	(if_then_else
+	  (eq
+	    (unspec:CCZ [
+	      (and:V_AVX
+		(not:V_AVX (match_operand:V_AVX 1 "register_operand"))
+		(match_operand:V_AVX 2 "register_operand"))
+	      (and:V_AVX (not:V_AVX (match_dup 1)) (match_dup 2))]
+	      UNSPEC_PTEST)
+	    (const_int 0))
+	  (match_operand 0)
+	  (pc)))]
+  "TARGET_SSE4_1"
+  [(set (reg:CCC FLAGS_REG)
+	(unspec:CCC [(match_dup 1) (match_dup 2)] UNSPEC_PTEST))
+   (set (pc) (if_then_else (ltu (reg:CCC FLAGS_REG) (const_int 0))
+			   (match_dup 0)
+			   (pc)))])
+
 (define_expand "nearbyint<mode>2"
   [(set (match_operand:VFH 0 "register_operand")
 	(unspec:VFH
diff --git a/gcc/testsuite/gcc.target/i386/avx-vptest-4.c b/gcc/testsuite/gcc.target/i386/avx-vptest-4.c
new file mode 100644
index 0000000..4f16cc8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx-vptest-4.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+
+typedef long long __m256i __attribute__ ((__vector_size__ (32)));
+
+int foo (__m256i x, __m256i y)
+{
+  __m256i a = x & ~y;
+  return __builtin_ia32_ptestz256 (a, a);
+}
+
+int bar (__m256i x, __m256i y)
+{
+  __m256i a = ~x & y;
+  return __builtin_ia32_ptestz256 (a, a);
+}
+
+/* { dg-final { scan-assembler "vptest" } } */
+/* { dg-final { scan-assembler "setc" } } */
+/* { dg-final { scan-assembler-not "vpandn" } } */
+/* { dg-final { scan-assembler-not "sete" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx-vptest-5.c b/gcc/testsuite/gcc.target/i386/avx-vptest-5.c
new file mode 100644
index 0000000..21b1872
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx-vptest-5.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+
+typedef long long __m256i __attribute__ ((__vector_size__ (32)));
+
+int foo (__m256i x, __m256i y)
+{
+  __m256i a = x & ~y;
+  return !__builtin_ia32_ptestz256 (a, a);
+}
+
+int bar (__m256i x, __m256i y)
+{
+  __m256i a = ~x & y;
+  return !__builtin_ia32_ptestz256 (a, a);
+}
+
+/* { dg-final { scan-assembler "vptest" } } */
+/* { dg-final { scan-assembler "setnc" } } */
+/* { dg-final { scan-assembler-not "vpandn" } } */
+/* { dg-final { scan-assembler-not "setne" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx-vptest-6.c b/gcc/testsuite/gcc.target/i386/avx-vptest-6.c
new file mode 100644
index 0000000..c99e65f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx-vptest-6.c
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+
+typedef long long __m256i __attribute__ ((__vector_size__ (32)));
+
+extern void ext (void);
+
+void foo (__m256i x, __m256i y)
+{
+  __m256i a = x & ~y;
+  if (__builtin_ia32_ptestz256 (a, a))
+    ext();
+}
+
+void bar (__m256i x, __m256i y)
+{
+  __m256i a = ~x & y;
+  if (__builtin_ia32_ptestz256 (a, a))
+    ext();
+}
+
+void foo2 (__m256i x, __m256i y)
+{
+  __m256i a = x & ~y;
+  if (__builtin_ia32_ptestz256 (a, a))
+    ext();
+}
+
+void bar2 (__m256i x, __m256i y)
+{
+  __m256i a = ~x & y;
+  if (__builtin_ia32_ptestz256 (a, a))
+    ext();
+}
+
+/* { dg-final { scan-assembler "ptest" } } */
+/* { dg-final { scan-assembler "jn?c" } } */
+/* { dg-final { scan-assembler-not "pandn" } } */
+/* { dg-final { scan-assembler-not "jne" } } */
+/* { dg-final { scan-assembler-not "je" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr109973-1.c b/gcc/testsuite/gcc.target/i386/pr109973-1.c
index a1b6136b..1d812dd 100644
--- a/gcc/testsuite/gcc.target/i386/pr109973-1.c
+++ b/gcc/testsuite/gcc.target/i386/pr109973-1.c
@@ -10,4 +10,4 @@ foo (__m256i x, __m256i y)
   return __builtin_ia32_ptestc256 (a, a);
 }
 
-/* { dg-final { scan-assembler "vpand" } } */
+/* { dg-final { scan-assembler "movl\[ \\t]*\\\$1, %eax" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr109973-2.c b/gcc/testsuite/gcc.target/i386/pr109973-2.c
index 167f6ee..1068c3e 100644
--- a/gcc/testsuite/gcc.target/i386/pr109973-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr109973-2.c
@@ -10,4 +10,4 @@ foo (__m128i x, __m128i y)
   return __builtin_ia32_ptestc128 (a, a);
 }
 
-/* { dg-final { scan-assembler "pand" } } */
+/* { dg-final { scan-assembler "movl\[ \\t]*\\\$1, %eax" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-ptest-4.c b/gcc/testsuite/gcc.target/i386/sse4_1-ptest-4.c
new file mode 100644
index 0000000..999cff2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4_1-ptest-4.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4.1" } */
+
+typedef long long __m128i __attribute__ ((__vector_size__ (16)));
+
+int foo (__m128i x, __m128i y)
+{
+  __m128i a = x & ~y;
+  return __builtin_ia32_ptestz128 (a, a);
+}
+
+int bar (__m128i x, __m128i y)
+{
+  __m128i a = ~x & y;
+  return __builtin_ia32_ptestz128 (a, a);
+}
+
+/* { dg-final { scan-assembler "ptest" } } */
+/* { dg-final { scan-assembler "setc" } } */
+/* { dg-final { scan-assembler-not "pandn" } } */
+/* { dg-final { scan-assembler-not "sete" } } */
+
diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-ptest-5.c b/gcc/testsuite/gcc.target/i386/sse4_1-ptest-5.c
new file mode 100644
index 0000000..c3a23da
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4_1-ptest-5.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4.1" } */
+
+typedef long long __m128i __attribute__ ((__vector_size__ (16)));
+
+int foo (__m128i x, __m128i y)
+{
+  __m128i a = x & ~y;
+  return !__builtin_ia32_ptestz128 (a, a);
+}
+
+int bar (__m128i x, __m128i y)
+{
+  __m128i a = ~x & y;
+  return !__builtin_ia32_ptestz128 (a, a);
+}
+
+/* { dg-final { scan-assembler "ptest" } } */
+/* { dg-final { scan-assembler "setnc" } } */
+/* { dg-final { scan-assembler-not "pandn" } } */
+/* { dg-final { scan-assembler-not "setne" } } */
+
diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-ptest-6.c b/gcc/testsuite/gcc.target/i386/sse4_1-ptest-6.c
new file mode 100644
index 0000000..d49c6bc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4_1-ptest-6.c
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4.1" } */
+
+typedef long long __m128i __attribute__ ((__vector_size__ (16)));
+
+extern void ext (void);
+
+void foo (__m128i x, __m128i y)
+{
+  __m128i a = x & ~y;
+  if (__builtin_ia32_ptestz128 (a, a))
+    ext();
+}
+
+void bar (__m128i x, __m128i y)
+{
+  __m128i a = ~x & y;
+  if (__builtin_ia32_ptestz128 (a, a))
+    ext();
+}
+
+void foo2 (__m128i x, __m128i y)
+{
+  __m128i a = x & ~y;
+  if (__builtin_ia32_ptestz128 (a, a))
+    ext();
+}
+
+void bar2 (__m128i x, __m128i y)
+{
+  __m128i a = ~x & y;
+  if (__builtin_ia32_ptestz128 (a, a))
+    ext();
+}
+
+/* { dg-final { scan-assembler "ptest" } } */
+/* { dg-final { scan-assembler "jn?c" } } */
+/* { dg-final { scan-assembler-not "pandn" } } */
+/* { dg-final { scan-assembler-not "jne" } } */
+/* { dg-final { scan-assembler-not "je" } } */

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2023-06-19  7:28 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-06-13 16:03 [x86 PATCH] Convert ptestz of pandn into ptestc Roger Sayle
2023-06-14  8:30 ` Uros Bizjak
2023-06-16 13:27   ` Roger Sayle
2023-06-19  7:28     ` Uros Bizjak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).