public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r13-2006] Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64.
@ 2022-08-09 18:04 Roger Sayle
  0 siblings, 0 replies; only message in thread
From: Roger Sayle @ 2022-08-09 18:04 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:a56c1641e9d25e46059168e811b4a2f185f07b6b

commit r13-2006-ga56c1641e9d25e46059168e811b4a2f185f07b6b
Author: Roger Sayle <roger@nextmovesoftware.com>
Date:   Tue Aug 9 18:59:55 2022 +0100

    Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64.
    
    This x86_64 backend patch allows TImode STV to take advantage of the
    fact that the PTEST instruction performs an AND operation.  Previously
    PTEST was (mostly) used for comparison against zero, by using the same
    operands.  The benefits are demonstrated by the new test case:
    
    __int128 a,b;
    int foo()
    {
      return (a & b) != 0;
    }
    
    Currently with -O2 -msse4 we generate:
    
            movdqa  a(%rip), %xmm0
            pand    b(%rip), %xmm0
            xorl    %eax, %eax
            ptest   %xmm0, %xmm0
            setne   %al
            ret
    
    with this patch we now generate:
    
            movdqa  a(%rip), %xmm0
            xorl    %eax, %eax
            ptest   b(%rip), %xmm0
            setne   %al
            ret
    
    Technically, the magic happens using new define_insn_and_split patterns.
    Using two patterns allows this transformation to be performed independently
    of whether TImode STV is run before or after combine.  The one tricky
    case is that immediate constant operands of the AND behave slightly
    differently between TImode and V1TImode: All V1TImode immediate operands
    become loads, but for TImode only values that are not hilo_operands
    need to be loaded.  Hence the new *testti_doubleword accepts any
    general_operand, but internally during split calls force_reg whenever
    the second operand is not x86_64_hilo_general_operand.  This required
    (benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in
    more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P.
    
    2022-08-09  Roger Sayle  <roger@nextmovesoftware.com>
    
    gcc/ChangeLog
            * config/i386/i386-features.cc (scalar_chain::convert_compare):
            Create new pseudos only when/if needed.  Add support for TEST,
            i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST.
            When broadcasting V2DImode and V4SImode use new pseudo register.
            (timode_scalar_chain::convert_op): Do nothing if operand is
            already V1TImode.  Avoid generating useless SUBREG conversions,
            i.e. (SUBREG:V1TImode (REG:V1TImode) 0).  Handle CONST_WIDE_INT
            in addition to CONST_INT by using CONST_SCALAR_INT_P.
            (convertible_comparison_p): Use CONST_SCALAR_INT_P to match both
            CONST_WIDE_INT and CONST_INT.  Recognize new *testti_doubleword
            pattern as an STV candidate.
            (timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P
            operands in binary logic operations.
    
            * config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs
            for UNSPEC_PTEST; a PTEST that performs an AND has the same cost
            as regular PTEST, i.e. cost->sse_op.
    
            * config/i386/i386.md (*testti_doubleword): New pre-reload
            define_insn_and_split that recognizes comparison of TI mode AND
            against zero.
            * config/i386/sse.md (*ptest<mode>_and): New pre-reload
            define_insn_and_split that recognizes UNSPEC_PTEST of identical
            AND operands.
    
    gcc/testsuite/ChangeLog
            * gcc.target/i386/sse4_1-stv-8.c: New test case.

Diff:
---
 gcc/config/i386/i386-features.cc             | 95 ++++++++++++++++++++--------
 gcc/config/i386/i386.cc                      | 16 ++++-
 gcc/config/i386/i386.md                      | 21 ++++++
 gcc/config/i386/sse.md                       | 13 ++++
 gcc/testsuite/gcc.target/i386/sse4_1-stv-8.c | 11 ++++
 5 files changed, 129 insertions(+), 27 deletions(-)

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 5e3a7ffacb4..effc2f24494 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -919,8 +919,7 @@ general_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
 rtx
 scalar_chain::convert_compare (rtx op1, rtx op2, rtx_insn *insn)
 {
-  rtx tmp = gen_reg_rtx (vmode);
-  rtx src;
+  rtx src, tmp;
   /* Comparison against anything other than zero, requires an XOR.  */
   if (op2 != const0_rtx)
     {
@@ -929,6 +928,7 @@ scalar_chain::convert_compare (rtx op1, rtx op2, rtx_insn *insn)
       /* If both operands are MEMs, explicitly load the OP1 into TMP.  */
       if (MEM_P (op1) && MEM_P (op2))
 	{
+	  tmp = gen_reg_rtx (vmode);
 	  emit_insn_before (gen_rtx_SET (tmp, op1), insn);
 	  src = tmp;
 	}
@@ -943,34 +943,56 @@ scalar_chain::convert_compare (rtx op1, rtx op2, rtx_insn *insn)
       rtx op12 = XEXP (op1, 1);
       convert_op (&op11, insn);
       convert_op (&op12, insn);
-      if (MEM_P (op11))
+      if (!REG_P (op11))
 	{
+	  tmp = gen_reg_rtx (vmode);
 	  emit_insn_before (gen_rtx_SET (tmp, op11), insn);
 	  op11 = tmp;
 	}
       src = gen_rtx_AND (vmode, gen_rtx_NOT (vmode, op11), op12);
     }
+  else if (GET_CODE (op1) == AND)
+    {
+      rtx op11 = XEXP (op1, 0);
+      rtx op12 = XEXP (op1, 1);
+      convert_op (&op11, insn);
+      convert_op (&op12, insn);
+      if (!REG_P (op11))
+	{
+	  tmp = gen_reg_rtx (vmode);
+	  emit_insn_before (gen_rtx_SET (tmp, op11), insn);
+	  op11 = tmp;
+	}
+      return gen_rtx_UNSPEC (CCmode, gen_rtvec (2, op11, op12),
+			     UNSPEC_PTEST);
+    }
   else
     {
       convert_op (&op1, insn);
       src = op1;
     }
-  emit_insn_before (gen_rtx_SET (tmp, src), insn);
+
+  if (!REG_P (src))
+    {
+      tmp = gen_reg_rtx (vmode);
+      emit_insn_before (gen_rtx_SET (tmp, src), insn);
+      src = tmp;
+    }
 
   if (vmode == V2DImode)
-    emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (tmp),
-						  copy_rtx_if_shared (tmp),
-						  copy_rtx_if_shared (tmp)),
-		      insn);
+    {
+      tmp = gen_reg_rtx (vmode);
+      emit_insn_before (gen_vec_interleave_lowv2di (tmp, src, src), insn);
+      src = tmp;
+    }
   else if (vmode == V4SImode)
-    emit_insn_before (gen_sse2_pshufd (copy_rtx_if_shared (tmp),
-				       copy_rtx_if_shared (tmp),
-				       const0_rtx),
-		      insn);
-
-  return gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (tmp),
-					       copy_rtx_if_shared (tmp)),
-			 UNSPEC_PTEST);
+    {
+      tmp = gen_reg_rtx (vmode);
+      emit_insn_before (gen_sse2_pshufd (tmp, src, const0_rtx), insn);
+      src = tmp;
+    }
+
+  return gen_rtx_UNSPEC (CCmode, gen_rtvec (2, src, src), UNSPEC_PTEST);
 }
 
 /* Helper function for converting INSN to vector mode.  */
@@ -1289,6 +1311,9 @@ timode_scalar_chain::fix_debug_reg_uses (rtx reg)
 void
 timode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
 {
+  if (GET_MODE (*op) == V1TImode)
+    return;
+
   *op = copy_rtx_if_shared (*op);
 
   if (REG_P (*op))
@@ -1296,19 +1321,19 @@ timode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
   else if (MEM_P (*op))
     {
       rtx tmp = gen_reg_rtx (V1TImode);
-      emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (V1TImode, tmp, 0),
+      emit_insn_before (gen_rtx_SET (tmp,
 				     gen_gpr_to_xmm_move_src (V1TImode, *op)),
 			insn);
-      *op = gen_rtx_SUBREG (V1TImode, tmp, 0);
+      *op = tmp;
 
       if (dump_file)
 	fprintf (dump_file, "  Preloading operand for insn %d into r%d\n",
 		 INSN_UID (insn), REGNO (tmp));
     }
-  else if (CONST_INT_P (*op))
+  else if (CONST_SCALAR_INT_P (*op))
     {
       rtx vec_cst;
-      rtx tmp = gen_rtx_SUBREG (V1TImode, gen_reg_rtx (TImode), 0);
+      rtx tmp = gen_reg_rtx (V1TImode);
 
       /* Prefer all ones vector in case of -1.  */
       if (constm1_operand (*op, TImode))
@@ -1329,7 +1354,7 @@ timode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
 	  emit_insn_before (seq, insn);
 	}
 
-      emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
+      emit_insn_before (gen_move_insn (tmp, vec_cst), insn);
       *op = tmp;
     }
   else
@@ -1609,14 +1634,26 @@ convertible_comparison_p (rtx_insn *insn, enum machine_mode mode)
   rtx op2 = XEXP (src, 1);
 
   /* *cmp<dwi>_doubleword.  */
-  if ((CONST_INT_P (op1)
+  if ((CONST_SCALAR_INT_P (op1)
        || ((REG_P (op1) || MEM_P (op1))
 	   && GET_MODE (op1) == mode))
-      && (CONST_INT_P (op2)
+      && (CONST_SCALAR_INT_P (op2)
 	  || ((REG_P (op2) || MEM_P (op2))
 	      && GET_MODE (op2) == mode)))
     return true;
 
+  /* *testti_doubleword.  */
+  if (op2 == const0_rtx
+      && GET_CODE (op1) == AND
+      && REG_P (XEXP (op1, 0)))
+    {
+      rtx op12 = XEXP (op1, 1);
+      return GET_MODE (XEXP (op1, 0)) == TImode
+	     && (CONST_SCALAR_INT_P (op12)
+		 || ((REG_P (op12) || MEM_P (op12))
+		     && GET_MODE (op12) == TImode));
+    }
+
   /* *test<dwi>_not_doubleword.  */
   if (op2 == const0_rtx
       && GET_CODE (op1) == AND
@@ -1803,15 +1840,21 @@ timode_scalar_to_vector_candidate_p (rtx_insn *insn)
       if (!MEM_P (dst)
 	  && GET_CODE (XEXP (src, 0)) == NOT
 	  && REG_P (XEXP (XEXP (src, 0), 0))
-	  && (REG_P (XEXP (src, 1)) || timode_mem_p (XEXP (src, 1))))
+	  && (REG_P (XEXP (src, 1))
+	      || CONST_SCALAR_INT_P (XEXP (src, 1))
+	      || timode_mem_p (XEXP (src, 1))))
 	return true;
       return REG_P (XEXP (src, 0))
-	     && (REG_P (XEXP (src, 1)) || timode_mem_p (XEXP (src, 1)));
+	     && (REG_P (XEXP (src, 1))
+		 || CONST_SCALAR_INT_P (XEXP (src, 1))
+		 || timode_mem_p (XEXP (src, 1)));
 
     case IOR:
     case XOR:
       return REG_P (XEXP (src, 0))
-	     && (REG_P (XEXP (src, 1)) || timode_mem_p (XEXP (src, 1)));
+	     && (REG_P (XEXP (src, 1))
+		 || CONST_SCALAR_INT_P (XEXP (src, 1))
+		 || timode_mem_p (XEXP (src, 1)));
 
     case NOT:
       return REG_P (XEXP (src, 0)) || timode_mem_p (XEXP (src, 0));
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 5e30dc884bf..5be76e1dd6f 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -21063,11 +21063,25 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
     case UNSPEC:
       if (XINT (x, 1) == UNSPEC_TP)
 	*total = 0;
-      else if (XINT(x, 1) == UNSPEC_VTERNLOG)
+      else if (XINT (x, 1) == UNSPEC_VTERNLOG)
 	{
 	  *total = cost->sse_op;
 	  return true;
 	}
+      else if (XINT (x, 1) == UNSPEC_PTEST)
+	{
+	  *total = cost->sse_op;
+	  if (XVECLEN (x, 0) == 2
+	      && GET_CODE (XVECEXP (x, 0, 0)) == AND)
+	    {
+	      rtx andop = XVECEXP (x, 0, 0);
+	      *total += rtx_cost (XEXP (andop, 0), GET_MODE (andop),
+				  AND, opno, speed)
+			+ rtx_cost (XEXP (andop, 1), GET_MODE (andop),
+				    AND, opno, speed);
+	      return true;
+	    }
+	}
       return false;
 
     case VEC_SELECT:
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index fd30c573c27..2fde8cdf48b 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -9756,6 +9756,27 @@
   [(set_attr "type" "test")
    (set_attr "mode" "QI")])
 
+;; Provide a *testti instruction that STV can implement using ptest.
+;; This pattern splits into *andti3_doubleword and *cmpti_doubleword.
+(define_insn_and_split "*testti_doubleword"
+  [(set (reg:CCZ FLAGS_REG)
+	(compare:CCZ
+	  (and:TI (match_operand:TI 0 "register_operand")
+		  (match_operand:TI 1 "general_operand"))
+	  (const_int 0)))]
+  "TARGET_64BIT
+   && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(parallel [(set (match_dup 2) (and:TI (match_dup 0) (match_dup 1)))
+              (clobber (reg:CC FLAGS_REG))])
+   (set (reg:CCZ FLAGS_REG) (compare:CCZ (match_dup 2) (const_int 0)))]
+{
+  operands[2] = gen_reg_rtx (TImode);
+  if (!x86_64_hilo_general_operand (operands[1], TImode))
+    operands[1] = force_reg (TImode, operands[1]);
+})
+
 ;; Combine likes to form bit extractions for some tests.  Humor it.
 (define_insn_and_split "*testqi_ext_3"
   [(set (match_operand 0 "flags_reg_operand")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 14d12d16c34..ccd9d002e93 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -23021,6 +23021,19 @@
    (set_attr "prefix" "orig,orig,vex")
    (set_attr "mode" "TI")])
 
+(define_insn_and_split "*ptest<mode>_and"
+  [(set (reg:CC FLAGS_REG)
+	(unspec:CC [(and:V_AVX (match_operand:V_AVX 0 "register_operand")
+			       (match_operand:V_AVX 1 "vector_operand"))
+		    (and:V_AVX (match_dup 0) (match_dup 1))]
+		   UNSPEC_PTEST))]
+  "TARGET_SSE4_1
+   && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (reg:CC FLAGS_REG)
+	(unspec:CC [(match_dup 0) (match_dup 1)] UNSPEC_PTEST))])
+
 (define_expand "nearbyint<mode>2"
   [(set (match_operand:VFH 0 "register_operand")
 	(unspec:VFH
diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-stv-8.c b/gcc/testsuite/gcc.target/i386/sse4_1-stv-8.c
new file mode 100644
index 00000000000..5c5d803797b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4_1-stv-8.c
@@ -0,0 +1,11 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -msse4.1 -mstv -mno-stackrealign" } */
+
+__int128 a,b;
+int foo()
+{
+  return (a & b) != 0;
+}
+
+/* { dg-final { scan-assembler-not "pand" } } */
+/* { dg-final { scan-assembler "ptest" } } */


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2022-08-09 18:04 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-08-09 18:04 [gcc r13-2006] Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64 Roger Sayle

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).