public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
From: liuhongt <hongtao.liu@intel.com>
To: gcc-patches@gcc.gnu.org
Cc: ubizjak@gmail.com
Subject: [PATCH] Improve memcmpeq for 512-bit vector with vpcmpeq + kortest.
Date: Fri, 27 Oct 2023 13:47:36 +0800	[thread overview]
Message-ID: <20231027054736.3529877-1-hongtao.liu@intel.com> (raw)

When 2 vectors are equal, kmask is allones and kortest will set CF,
else CF will be cleared.

So CF bit can be used to check for the result of the comparison.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

Before:
        vmovdqu (%rsi), %ymm0
        vpxorq  (%rdi), %ymm0, %ymm0
        vptest  %ymm0, %ymm0
        jne     .L2
        vmovdqu 32(%rsi), %ymm0
        vpxorq  32(%rdi), %ymm0, %ymm0
        vptest  %ymm0, %ymm0
        je      .L5
.L2:
        movl    $1, %eax
        xorl    $1, %eax
        vzeroupper
        ret

After:
        vmovdqu64       (%rsi), %zmm0
        xorl    %eax, %eax
        vpcmpeqd        (%rdi), %zmm0, %k0
        kortestw        %k0, %k0
        setc    %al
        vzeroupper
        ret

gcc/ChangeLog:

	PR target/104610
	* config/i386/i386-expand.cc (ix86_expand_branch): Handle
	512-bit vector with vpcmpeq + kortest.
	* config/i386/i386.md (cbranchxi4): New expander.
	* config/i386/sse.md: (cbranch<mode>4): Extend to V16SImode
	and V8DImode.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr104610-2.c: New test.
---
 gcc/config/i386/i386-expand.cc             | 55 +++++++++++++++-------
 gcc/config/i386/i386.md                    | 16 +++++++
 gcc/config/i386/sse.md                     | 36 +++++++++++---
 gcc/testsuite/gcc.target/i386/pr104610-2.c | 14 ++++++
 4 files changed, 99 insertions(+), 22 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr104610-2.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 1eae9d7c78c..c664cb61e80 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -2411,30 +2411,53 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
   rtx tmp;
 
   /* Handle special case - vector comparsion with boolean result, transform
-     it using ptest instruction.  */
+     it using ptest instruction or vpcmpeq + kortest.  */
   if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
       || (mode == TImode && !TARGET_64BIT)
-      || mode == OImode)
+      || mode == OImode
+      || GET_MODE_SIZE (mode) == 64)
     {
-      rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
-      machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
+      unsigned msize = GET_MODE_SIZE (mode);
+      machine_mode p_mode
+	= msize == 64 ? V16SImode : msize == 32 ? V4DImode : V2DImode;
+      /* kortest set CF when result is 0xFFFF (op0 == op1).  */
+      rtx flag = gen_rtx_REG (msize == 64 ? CCCmode : CCZmode, FLAGS_REG);
 
       gcc_assert (code == EQ || code == NE);
 
-      if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
+      /* Using vpcmpeq zmm zmm k + kortest for 512-bit vectors.  */
+      if (msize == 64)
 	{
-	  op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
-	  op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
-	  mode = p_mode;
+	  if (mode != V16SImode)
+	    {
+	      op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
+	      op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
+	    }
+
+	  tmp = gen_reg_rtx (HImode);
+	  emit_insn (gen_avx512f_cmpv16si3 (tmp, op0, op1, GEN_INT (0)));
+	  emit_insn (gen_kortesthi_ccc (tmp, tmp));
+	}
+      /* Using ptest for 128/256-bit vectors.  */
+      else
+	{
+	  if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
+	    {
+	      op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
+	      op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
+	      mode = p_mode;
+	    }
+
+	  /* Generate XOR since we can't check that one operand is zero
+	     vector.  */
+	  tmp = gen_reg_rtx (mode);
+	  emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
+	  tmp = gen_lowpart (p_mode, tmp);
+	  emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
+				  gen_rtx_UNSPEC (CCZmode,
+						  gen_rtvec (2, tmp, tmp),
+						  UNSPEC_PTEST)));
 	}
-      /* Generate XOR since we can't check that one operand is zero vector.  */
-      tmp = gen_reg_rtx (mode);
-      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
-      tmp = gen_lowpart (p_mode, tmp);
-      emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
-			      gen_rtx_UNSPEC (CCZmode,
-					      gen_rtvec (2, tmp, tmp),
-					      UNSPEC_PTEST)));
       tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
 				  gen_rtx_LABEL_REF (VOIDmode, label),
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index abaf2f311e8..51d8d0c3b97 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -1442,6 +1442,22 @@ (define_expand "cbranchoi4"
   DONE;
 })
 
+(define_expand "cbranchxi4"
+  [(set (reg:CC FLAGS_REG)
+	(compare:CC (match_operand:XI 1 "nonimmediate_operand")
+		    (match_operand:XI 2 "nonimmediate_operand")))
+   (set (pc) (if_then_else
+	       (match_operator 0 "bt_comparison_operator"
+		[(reg:CC FLAGS_REG) (const_int 0)])
+	       (label_ref (match_operand 3))
+	       (pc)))]
+  "TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256"
+{
+  ix86_expand_branch (GET_CODE (operands[0]),
+		      operands[1], operands[2], operands[3]);
+  DONE;
+})
+
 (define_expand "cstore<mode>4"
   [(set (reg:CC FLAGS_REG)
 	(compare:CC (match_operand:SDWIM 2 "nonimmediate_operand")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index c988935d4df..88fb1154699 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -2175,9 +2175,9 @@ (define_insn "ktest<mode>"
    (set_attr "type" "msklog")
    (set_attr "prefix" "vex")])
 
-(define_insn "kortest<mode>"
-  [(set (reg:CC FLAGS_REG)
-	(unspec:CC
+(define_insn "*kortest<mode>"
+  [(set (reg FLAGS_REG)
+	(unspec
 	  [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand" "k")
 	   (match_operand:SWI1248_AVX512BWDQ 1 "register_operand" "k")]
 	  UNSPEC_KORTEST))]
@@ -2187,6 +2187,30 @@ (define_insn "kortest<mode>"
    (set_attr "type" "msklog")
    (set_attr "prefix" "vex")])
 
+(define_insn "kortest<mode>_ccc"
+  [(set (reg:CCC FLAGS_REG)
+	(unspec:CCC
+	  [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
+	   (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
+	  UNSPEC_KORTEST))]
+  "TARGET_AVX512F")
+
+(define_insn "kortest<mode>_ccz"
+  [(set (reg:CCZ FLAGS_REG)
+	(unspec:CCZ
+	  [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
+	   (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
+	  UNSPEC_KORTEST))]
+  "TARGET_AVX512F")
+
+(define_expand "kortest<mode>"
+  [(set (reg:CC FLAGS_REG)
+	(unspec:CC
+	  [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
+	   (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
+	  UNSPEC_KORTEST))]
+  "TARGET_AVX512F")
+
 (define_insn "kunpckhi"
   [(set (match_operand:HI 0 "register_operand" "=k")
 	(ior:HI
@@ -27840,14 +27864,14 @@ (define_insn "<avx512>_store<mode>_mask"
 
 (define_expand "cbranch<mode>4"
   [(set (reg:CC FLAGS_REG)
-	(compare:CC (match_operand:VI48_AVX 1 "register_operand")
-		    (match_operand:VI48_AVX 2 "nonimmediate_operand")))
+	(compare:CC (match_operand:VI48_AVX_AVX512F 1 "register_operand")
+		    (match_operand:VI48_AVX_AVX512F 2 "nonimmediate_operand")))
    (set (pc) (if_then_else
 	       (match_operator 0 "bt_comparison_operator"
 		[(reg:CC FLAGS_REG) (const_int 0)])
 	       (label_ref (match_operand 3))
 	       (pc)))]
-  "TARGET_SSE4_1"
+  "TARGET_SSE4_1 && (<MODE_SIZE> != 64 || !TARGET_PREFER_AVX256)"
 {
   ix86_expand_branch (GET_CODE (operands[0]),
 		      operands[1], operands[2], operands[3]);
diff --git a/gcc/testsuite/gcc.target/i386/pr104610-2.c b/gcc/testsuite/gcc.target/i386/pr104610-2.c
new file mode 100644
index 00000000000..999ef926a18
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr104610-2.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -O2 -mtune=generic" } */
+/* { dg-final { scan-assembler-times {(?n)vpcmpeq.*zmm} 2 } } */
+/* { dg-final { scan-assembler-times {(?n)kortest.*k[0-7]} 2 } } */
+
+int compare (const char* s1, const char* s2)
+{
+  return __builtin_memcmp (s1, s2, 64) == 0;
+}
+
+int compare1 (const char* s1, const char* s2)
+{
+  return __builtin_memcmp (s1, s2, 64) != 0;
+}
-- 
2.31.1


             reply	other threads:[~2023-10-27  5:49 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-10-27  5:47 liuhongt [this message]
2023-10-27  6:49 ` Richard Biener
2023-10-27  7:21   ` Hongtao Liu
2023-10-27  7:37     ` Hongtao Liu
2023-10-27 10:16     ` Richard Biener

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20231027054736.3529877-1-hongtao.liu@intel.com \
    --to=hongtao.liu@intel.com \
    --cc=gcc-patches@gcc.gnu.org \
    --cc=ubizjak@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).