public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] Improve memcmpeq for 512-bit vector with vpcmpeq + kortest.
@ 2023-10-27  5:47 liuhongt
  2023-10-27  6:49 ` Richard Biener
  0 siblings, 1 reply; 5+ messages in thread
From: liuhongt @ 2023-10-27  5:47 UTC (permalink / raw)
  To: gcc-patches; +Cc: ubizjak

When 2 vectors are equal, kmask is allones and kortest will set CF,
else CF will be cleared.

So CF bit can be used to check for the result of the comparison.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

Before:
        vmovdqu (%rsi), %ymm0
        vpxorq  (%rdi), %ymm0, %ymm0
        vptest  %ymm0, %ymm0
        jne     .L2
        vmovdqu 32(%rsi), %ymm0
        vpxorq  32(%rdi), %ymm0, %ymm0
        vptest  %ymm0, %ymm0
        je      .L5
.L2:
        movl    $1, %eax
        xorl    $1, %eax
        vzeroupper
        ret

After:
        vmovdqu64       (%rsi), %zmm0
        xorl    %eax, %eax
        vpcmpeqd        (%rdi), %zmm0, %k0
        kortestw        %k0, %k0
        setc    %al
        vzeroupper
        ret

gcc/ChangeLog:

	PR target/104610
	* config/i386/i386-expand.cc (ix86_expand_branch): Handle
	512-bit vector with vpcmpeq + kortest.
	* config/i386/i386.md (cbranchxi4): New expander.
	* config/i386/sse.md: (cbranch<mode>4): Extend to V16SImode
	and V8DImode.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr104610-2.c: New test.
---
 gcc/config/i386/i386-expand.cc             | 55 +++++++++++++++-------
 gcc/config/i386/i386.md                    | 16 +++++++
 gcc/config/i386/sse.md                     | 36 +++++++++++---
 gcc/testsuite/gcc.target/i386/pr104610-2.c | 14 ++++++
 4 files changed, 99 insertions(+), 22 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr104610-2.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 1eae9d7c78c..c664cb61e80 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -2411,30 +2411,53 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
   rtx tmp;
 
   /* Handle special case - vector comparsion with boolean result, transform
-     it using ptest instruction.  */
+     it using ptest instruction or vpcmpeq + kortest.  */
   if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
       || (mode == TImode && !TARGET_64BIT)
-      || mode == OImode)
+      || mode == OImode
+      || GET_MODE_SIZE (mode) == 64)
     {
-      rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
-      machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
+      unsigned msize = GET_MODE_SIZE (mode);
+      machine_mode p_mode
+	= msize == 64 ? V16SImode : msize == 32 ? V4DImode : V2DImode;
+      /* kortest set CF when result is 0xFFFF (op0 == op1).  */
+      rtx flag = gen_rtx_REG (msize == 64 ? CCCmode : CCZmode, FLAGS_REG);
 
       gcc_assert (code == EQ || code == NE);
 
-      if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
+      /* Using vpcmpeq zmm zmm k + kortest for 512-bit vectors.  */
+      if (msize == 64)
 	{
-	  op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
-	  op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
-	  mode = p_mode;
+	  if (mode != V16SImode)
+	    {
+	      op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
+	      op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
+	    }
+
+	  tmp = gen_reg_rtx (HImode);
+	  emit_insn (gen_avx512f_cmpv16si3 (tmp, op0, op1, GEN_INT (0)));
+	  emit_insn (gen_kortesthi_ccc (tmp, tmp));
+	}
+      /* Using ptest for 128/256-bit vectors.  */
+      else
+	{
+	  if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
+	    {
+	      op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
+	      op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
+	      mode = p_mode;
+	    }
+
+	  /* Generate XOR since we can't check that one operand is zero
+	     vector.  */
+	  tmp = gen_reg_rtx (mode);
+	  emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
+	  tmp = gen_lowpart (p_mode, tmp);
+	  emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
+				  gen_rtx_UNSPEC (CCZmode,
+						  gen_rtvec (2, tmp, tmp),
+						  UNSPEC_PTEST)));
 	}
-      /* Generate XOR since we can't check that one operand is zero vector.  */
-      tmp = gen_reg_rtx (mode);
-      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
-      tmp = gen_lowpart (p_mode, tmp);
-      emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
-			      gen_rtx_UNSPEC (CCZmode,
-					      gen_rtvec (2, tmp, tmp),
-					      UNSPEC_PTEST)));
       tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
 				  gen_rtx_LABEL_REF (VOIDmode, label),
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index abaf2f311e8..51d8d0c3b97 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -1442,6 +1442,22 @@ (define_expand "cbranchoi4"
   DONE;
 })
 
+(define_expand "cbranchxi4"
+  [(set (reg:CC FLAGS_REG)
+	(compare:CC (match_operand:XI 1 "nonimmediate_operand")
+		    (match_operand:XI 2 "nonimmediate_operand")))
+   (set (pc) (if_then_else
+	       (match_operator 0 "bt_comparison_operator"
+		[(reg:CC FLAGS_REG) (const_int 0)])
+	       (label_ref (match_operand 3))
+	       (pc)))]
+  "TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256"
+{
+  ix86_expand_branch (GET_CODE (operands[0]),
+		      operands[1], operands[2], operands[3]);
+  DONE;
+})
+
 (define_expand "cstore<mode>4"
   [(set (reg:CC FLAGS_REG)
 	(compare:CC (match_operand:SDWIM 2 "nonimmediate_operand")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index c988935d4df..88fb1154699 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -2175,9 +2175,9 @@ (define_insn "ktest<mode>"
    (set_attr "type" "msklog")
    (set_attr "prefix" "vex")])
 
-(define_insn "kortest<mode>"
-  [(set (reg:CC FLAGS_REG)
-	(unspec:CC
+(define_insn "*kortest<mode>"
+  [(set (reg FLAGS_REG)
+	(unspec
 	  [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand" "k")
 	   (match_operand:SWI1248_AVX512BWDQ 1 "register_operand" "k")]
 	  UNSPEC_KORTEST))]
@@ -2187,6 +2187,30 @@ (define_insn "kortest<mode>"
    (set_attr "type" "msklog")
    (set_attr "prefix" "vex")])
 
+(define_insn "kortest<mode>_ccc"
+  [(set (reg:CCC FLAGS_REG)
+	(unspec:CCC
+	  [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
+	   (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
+	  UNSPEC_KORTEST))]
+  "TARGET_AVX512F")
+
+(define_insn "kortest<mode>_ccz"
+  [(set (reg:CCZ FLAGS_REG)
+	(unspec:CCZ
+	  [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
+	   (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
+	  UNSPEC_KORTEST))]
+  "TARGET_AVX512F")
+
+(define_expand "kortest<mode>"
+  [(set (reg:CC FLAGS_REG)
+	(unspec:CC
+	  [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
+	   (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
+	  UNSPEC_KORTEST))]
+  "TARGET_AVX512F")
+
 (define_insn "kunpckhi"
   [(set (match_operand:HI 0 "register_operand" "=k")
 	(ior:HI
@@ -27840,14 +27864,14 @@ (define_insn "<avx512>_store<mode>_mask"
 
 (define_expand "cbranch<mode>4"
   [(set (reg:CC FLAGS_REG)
-	(compare:CC (match_operand:VI48_AVX 1 "register_operand")
-		    (match_operand:VI48_AVX 2 "nonimmediate_operand")))
+	(compare:CC (match_operand:VI48_AVX_AVX512F 1 "register_operand")
+		    (match_operand:VI48_AVX_AVX512F 2 "nonimmediate_operand")))
    (set (pc) (if_then_else
 	       (match_operator 0 "bt_comparison_operator"
 		[(reg:CC FLAGS_REG) (const_int 0)])
 	       (label_ref (match_operand 3))
 	       (pc)))]
-  "TARGET_SSE4_1"
+  "TARGET_SSE4_1 && (<MODE_SIZE> != 64 || !TARGET_PREFER_AVX256)"
 {
   ix86_expand_branch (GET_CODE (operands[0]),
 		      operands[1], operands[2], operands[3]);
diff --git a/gcc/testsuite/gcc.target/i386/pr104610-2.c b/gcc/testsuite/gcc.target/i386/pr104610-2.c
new file mode 100644
index 00000000000..999ef926a18
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr104610-2.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -O2 -mtune=generic" } */
+/* { dg-final { scan-assembler-times {(?n)vpcmpeq.*zmm} 2 } } */
+/* { dg-final { scan-assembler-times {(?n)kortest.*k[0-7]} 2 } } */
+
+int compare (const char* s1, const char* s2)
+{
+  return __builtin_memcmp (s1, s2, 64) == 0;
+}
+
+int compare1 (const char* s1, const char* s2)
+{
+  return __builtin_memcmp (s1, s2, 64) != 0;
+}
-- 
2.31.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] Improve memcmpeq for 512-bit vector with vpcmpeq + kortest.
  2023-10-27  5:47 [PATCH] Improve memcmpeq for 512-bit vector with vpcmpeq + kortest liuhongt
@ 2023-10-27  6:49 ` Richard Biener
  2023-10-27  7:21   ` Hongtao Liu
  0 siblings, 1 reply; 5+ messages in thread
From: Richard Biener @ 2023-10-27  6:49 UTC (permalink / raw)
  To: liuhongt; +Cc: gcc-patches, ubizjak



> Am 27.10.2023 um 07:50 schrieb liuhongt <hongtao.liu@intel.com>:
> 
> When 2 vectors are equal, kmask is allones and kortest will set CF,
> else CF will be cleared.
> 
> So CF bit can be used to check for the result of the comparison.
> 
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?

Is that also profitable for 256bit aka AVX10?
Is there a jump on carry in case the result feeds control flow rather than a value and is using ktest better then (does combine figure this out?)

> Before:
>        vmovdqu (%rsi), %ymm0
>        vpxorq  (%rdi), %ymm0, %ymm0
>        vptest  %ymm0, %ymm0
>        jne     .L2
>        vmovdqu 32(%rsi), %ymm0
>        vpxorq  32(%rdi), %ymm0, %ymm0
>        vptest  %ymm0, %ymm0
>        je      .L5
> .L2:
>        movl    $1, %eax
>        xorl    $1, %eax
>        vzeroupper
>        ret
> 
> After:
>        vmovdqu64       (%rsi), %zmm0
>        xorl    %eax, %eax
>        vpcmpeqd        (%rdi), %zmm0, %k0
>        kortestw        %k0, %k0
>        setc    %al
>        vzeroupper
>        ret
> 
> gcc/ChangeLog:
> 
>    PR target/104610
>    * config/i386/i386-expand.cc (ix86_expand_branch): Handle
>    512-bit vector with vpcmpeq + kortest.
>    * config/i386/i386.md (cbranchxi4): New expander.
>    * config/i386/sse.md: (cbranch<mode>4): Extend to V16SImode
>    and V8DImode.
> 
> gcc/testsuite/ChangeLog:
> 
>    * gcc.target/i386/pr104610-2.c: New test.
> ---
> gcc/config/i386/i386-expand.cc             | 55 +++++++++++++++-------
> gcc/config/i386/i386.md                    | 16 +++++++
> gcc/config/i386/sse.md                     | 36 +++++++++++---
> gcc/testsuite/gcc.target/i386/pr104610-2.c | 14 ++++++
> 4 files changed, 99 insertions(+), 22 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/pr104610-2.c
> 
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index 1eae9d7c78c..c664cb61e80 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -2411,30 +2411,53 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
>   rtx tmp;
> 
>   /* Handle special case - vector comparsion with boolean result, transform
> -     it using ptest instruction.  */
> +     it using ptest instruction or vpcmpeq + kortest.  */
>   if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
>       || (mode == TImode && !TARGET_64BIT)
> -      || mode == OImode)
> +      || mode == OImode
> +      || GET_MODE_SIZE (mode) == 64)
>     {
> -      rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
> -      machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
> +      unsigned msize = GET_MODE_SIZE (mode);
> +      machine_mode p_mode
> +    = msize == 64 ? V16SImode : msize == 32 ? V4DImode : V2DImode;
> +      /* kortest set CF when result is 0xFFFF (op0 == op1).  */
> +      rtx flag = gen_rtx_REG (msize == 64 ? CCCmode : CCZmode, FLAGS_REG);
> 
>       gcc_assert (code == EQ || code == NE);
> 
> -      if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
> +      /* Using vpcmpeq zmm zmm k + kortest for 512-bit vectors.  */
> +      if (msize == 64)
>    {
> -      op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
> -      op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
> -      mode = p_mode;
> +      if (mode != V16SImode)
> +        {
> +          op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
> +          op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
> +        }
> +
> +      tmp = gen_reg_rtx (HImode);
> +      emit_insn (gen_avx512f_cmpv16si3 (tmp, op0, op1, GEN_INT (0)));
> +      emit_insn (gen_kortesthi_ccc (tmp, tmp));
> +    }
> +      /* Using ptest for 128/256-bit vectors.  */
> +      else
> +    {
> +      if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
> +        {
> +          op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
> +          op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
> +          mode = p_mode;
> +        }
> +
> +      /* Generate XOR since we can't check that one operand is zero
> +         vector.  */
> +      tmp = gen_reg_rtx (mode);
> +      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
> +      tmp = gen_lowpart (p_mode, tmp);
> +      emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
> +                  gen_rtx_UNSPEC (CCZmode,
> +                          gen_rtvec (2, tmp, tmp),
> +                          UNSPEC_PTEST)));
>    }
> -      /* Generate XOR since we can't check that one operand is zero vector.  */
> -      tmp = gen_reg_rtx (mode);
> -      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
> -      tmp = gen_lowpart (p_mode, tmp);
> -      emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
> -                  gen_rtx_UNSPEC (CCZmode,
> -                          gen_rtvec (2, tmp, tmp),
> -                          UNSPEC_PTEST)));
>       tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
>       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
>                  gen_rtx_LABEL_REF (VOIDmode, label),
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index abaf2f311e8..51d8d0c3b97 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -1442,6 +1442,22 @@ (define_expand "cbranchoi4"
>   DONE;
> })
> 
> +(define_expand "cbranchxi4"
> +  [(set (reg:CC FLAGS_REG)
> +    (compare:CC (match_operand:XI 1 "nonimmediate_operand")
> +            (match_operand:XI 2 "nonimmediate_operand")))
> +   (set (pc) (if_then_else
> +           (match_operator 0 "bt_comparison_operator"
> +        [(reg:CC FLAGS_REG) (const_int 0)])
> +           (label_ref (match_operand 3))
> +           (pc)))]
> +  "TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256"
> +{
> +  ix86_expand_branch (GET_CODE (operands[0]),
> +              operands[1], operands[2], operands[3]);
> +  DONE;
> +})
> +
> (define_expand "cstore<mode>4"
>   [(set (reg:CC FLAGS_REG)
>    (compare:CC (match_operand:SDWIM 2 "nonimmediate_operand")
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index c988935d4df..88fb1154699 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -2175,9 +2175,9 @@ (define_insn "ktest<mode>"
>    (set_attr "type" "msklog")
>    (set_attr "prefix" "vex")])
> 
> -(define_insn "kortest<mode>"
> -  [(set (reg:CC FLAGS_REG)
> -    (unspec:CC
> +(define_insn "*kortest<mode>"
> +  [(set (reg FLAGS_REG)
> +    (unspec
>      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand" "k")
>       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand" "k")]
>      UNSPEC_KORTEST))]
> @@ -2187,6 +2187,30 @@ (define_insn "kortest<mode>"
>    (set_attr "type" "msklog")
>    (set_attr "prefix" "vex")])
> 
> +(define_insn "kortest<mode>_ccc"
> +  [(set (reg:CCC FLAGS_REG)
> +    (unspec:CCC
> +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
> +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
> +      UNSPEC_KORTEST))]
> +  "TARGET_AVX512F")
> +
> +(define_insn "kortest<mode>_ccz"
> +  [(set (reg:CCZ FLAGS_REG)
> +    (unspec:CCZ
> +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
> +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
> +      UNSPEC_KORTEST))]
> +  "TARGET_AVX512F")
> +
> +(define_expand "kortest<mode>"
> +  [(set (reg:CC FLAGS_REG)
> +    (unspec:CC
> +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
> +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
> +      UNSPEC_KORTEST))]
> +  "TARGET_AVX512F")
> +
> (define_insn "kunpckhi"
>   [(set (match_operand:HI 0 "register_operand" "=k")
>    (ior:HI
> @@ -27840,14 +27864,14 @@ (define_insn "<avx512>_store<mode>_mask"
> 
> (define_expand "cbranch<mode>4"
>   [(set (reg:CC FLAGS_REG)
> -    (compare:CC (match_operand:VI48_AVX 1 "register_operand")
> -            (match_operand:VI48_AVX 2 "nonimmediate_operand")))
> +    (compare:CC (match_operand:VI48_AVX_AVX512F 1 "register_operand")
> +            (match_operand:VI48_AVX_AVX512F 2 "nonimmediate_operand")))
>    (set (pc) (if_then_else
>           (match_operator 0 "bt_comparison_operator"
>        [(reg:CC FLAGS_REG) (const_int 0)])
>           (label_ref (match_operand 3))
>           (pc)))]
> -  "TARGET_SSE4_1"
> +  "TARGET_SSE4_1 && (<MODE_SIZE> != 64 || !TARGET_PREFER_AVX256)"
> {
>   ix86_expand_branch (GET_CODE (operands[0]),
>              operands[1], operands[2], operands[3]);
> diff --git a/gcc/testsuite/gcc.target/i386/pr104610-2.c b/gcc/testsuite/gcc.target/i386/pr104610-2.c
> new file mode 100644
> index 00000000000..999ef926a18
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr104610-2.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -O2 -mtune=generic" } */
> +/* { dg-final { scan-assembler-times {(?n)vpcmpeq.*zmm} 2 } } */
> +/* { dg-final { scan-assembler-times {(?n)kortest.*k[0-7]} 2 } } */
> +
> +int compare (const char* s1, const char* s2)
> +{
> +  return __builtin_memcmp (s1, s2, 64) == 0;
> +}
> +
> +int compare1 (const char* s1, const char* s2)
> +{
> +  return __builtin_memcmp (s1, s2, 64) != 0;
> +}
> -- 
> 2.31.1
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] Improve memcmpeq for 512-bit vector with vpcmpeq + kortest.
  2023-10-27  6:49 ` Richard Biener
@ 2023-10-27  7:21   ` Hongtao Liu
  2023-10-27  7:37     ` Hongtao Liu
  2023-10-27 10:16     ` Richard Biener
  0 siblings, 2 replies; 5+ messages in thread
From: Hongtao Liu @ 2023-10-27  7:21 UTC (permalink / raw)
  To: Richard Biener; +Cc: liuhongt, gcc-patches, ubizjak

On Fri, Oct 27, 2023 at 2:49 PM Richard Biener
<richard.guenther@gmail.com> wrote:
>
>
>
> > Am 27.10.2023 um 07:50 schrieb liuhongt <hongtao.liu@intel.com>:
> >
> > When 2 vectors are equal, kmask is allones and kortest will set CF,
> > else CF will be cleared.
> >
> > So CF bit can be used to check for the result of the comparison.
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > Ok for trunk?
>
> Is that also profitable for 256bit aka AVX10?
Yes, it's also available for both 128-bit and 256-bit with AVX10, from
performance perspective it's better.
AVX10:
  vpcmp + kortest
 vs
AVX2:
 vpxor + vptest

 vptest is more expensive than vpcmp + kortest

> Is there a jump on carry in case the result feeds control flow rather than a value and is using ktest better then (does combine figure this out?)
There are JC and JNC, there're many pattern matches for ptest which
can't be automatically adjusted to kortest by combining, backend needs
to manually transform them.
That's why my patch only handles 64-bit vectors(to avoid regressing
those pattern match stuff).

>
> > Before:
> >        vmovdqu (%rsi), %ymm0
> >        vpxorq  (%rdi), %ymm0, %ymm0
> >        vptest  %ymm0, %ymm0
> >        jne     .L2
> >        vmovdqu 32(%rsi), %ymm0
> >        vpxorq  32(%rdi), %ymm0, %ymm0
> >        vptest  %ymm0, %ymm0
> >        je      .L5
> > .L2:
> >        movl    $1, %eax
> >        xorl    $1, %eax
> >        vzeroupper
> >        ret
> >
> > After:
> >        vmovdqu64       (%rsi), %zmm0
> >        xorl    %eax, %eax
> >        vpcmpeqd        (%rdi), %zmm0, %k0
> >        kortestw        %k0, %k0
> >        setc    %al
> >        vzeroupper
> >        ret
> >
> > gcc/ChangeLog:
> >
> >    PR target/104610
> >    * config/i386/i386-expand.cc (ix86_expand_branch): Handle
> >    512-bit vector with vpcmpeq + kortest.
> >    * config/i386/i386.md (cbranchxi4): New expander.
> >    * config/i386/sse.md: (cbranch<mode>4): Extend to V16SImode
> >    and V8DImode.
> >
> > gcc/testsuite/ChangeLog:
> >
> >    * gcc.target/i386/pr104610-2.c: New test.
> > ---
> > gcc/config/i386/i386-expand.cc             | 55 +++++++++++++++-------
> > gcc/config/i386/i386.md                    | 16 +++++++
> > gcc/config/i386/sse.md                     | 36 +++++++++++---
> > gcc/testsuite/gcc.target/i386/pr104610-2.c | 14 ++++++
> > 4 files changed, 99 insertions(+), 22 deletions(-)
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr104610-2.c
> >
> > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> > index 1eae9d7c78c..c664cb61e80 100644
> > --- a/gcc/config/i386/i386-expand.cc
> > +++ b/gcc/config/i386/i386-expand.cc
> > @@ -2411,30 +2411,53 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
> >   rtx tmp;
> >
> >   /* Handle special case - vector comparsion with boolean result, transform
> > -     it using ptest instruction.  */
> > +     it using ptest instruction or vpcmpeq + kortest.  */
> >   if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
> >       || (mode == TImode && !TARGET_64BIT)
> > -      || mode == OImode)
> > +      || mode == OImode
> > +      || GET_MODE_SIZE (mode) == 64)
> >     {
> > -      rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
> > -      machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
> > +      unsigned msize = GET_MODE_SIZE (mode);
> > +      machine_mode p_mode
> > +    = msize == 64 ? V16SImode : msize == 32 ? V4DImode : V2DImode;
> > +      /* kortest set CF when result is 0xFFFF (op0 == op1).  */
> > +      rtx flag = gen_rtx_REG (msize == 64 ? CCCmode : CCZmode, FLAGS_REG);
> >
> >       gcc_assert (code == EQ || code == NE);
> >
> > -      if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
> > +      /* Using vpcmpeq zmm zmm k + kortest for 512-bit vectors.  */
> > +      if (msize == 64)
> >    {
> > -      op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
> > -      op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
> > -      mode = p_mode;
> > +      if (mode != V16SImode)
> > +        {
> > +          op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
> > +          op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
> > +        }
> > +
> > +      tmp = gen_reg_rtx (HImode);
> > +      emit_insn (gen_avx512f_cmpv16si3 (tmp, op0, op1, GEN_INT (0)));
> > +      emit_insn (gen_kortesthi_ccc (tmp, tmp));
> > +    }
> > +      /* Using ptest for 128/256-bit vectors.  */
> > +      else
> > +    {
> > +      if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
> > +        {
> > +          op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
> > +          op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
> > +          mode = p_mode;
> > +        }
> > +
> > +      /* Generate XOR since we can't check that one operand is zero
> > +         vector.  */
> > +      tmp = gen_reg_rtx (mode);
> > +      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
> > +      tmp = gen_lowpart (p_mode, tmp);
> > +      emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
> > +                  gen_rtx_UNSPEC (CCZmode,
> > +                          gen_rtvec (2, tmp, tmp),
> > +                          UNSPEC_PTEST)));
> >    }
> > -      /* Generate XOR since we can't check that one operand is zero vector.  */
> > -      tmp = gen_reg_rtx (mode);
> > -      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
> > -      tmp = gen_lowpart (p_mode, tmp);
> > -      emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
> > -                  gen_rtx_UNSPEC (CCZmode,
> > -                          gen_rtvec (2, tmp, tmp),
> > -                          UNSPEC_PTEST)));
> >       tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
> >       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
> >                  gen_rtx_LABEL_REF (VOIDmode, label),
> > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> > index abaf2f311e8..51d8d0c3b97 100644
> > --- a/gcc/config/i386/i386.md
> > +++ b/gcc/config/i386/i386.md
> > @@ -1442,6 +1442,22 @@ (define_expand "cbranchoi4"
> >   DONE;
> > })
> >
> > +(define_expand "cbranchxi4"
> > +  [(set (reg:CC FLAGS_REG)
> > +    (compare:CC (match_operand:XI 1 "nonimmediate_operand")
> > +            (match_operand:XI 2 "nonimmediate_operand")))
> > +   (set (pc) (if_then_else
> > +           (match_operator 0 "bt_comparison_operator"
> > +        [(reg:CC FLAGS_REG) (const_int 0)])
> > +           (label_ref (match_operand 3))
> > +           (pc)))]
> > +  "TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256"
> > +{
> > +  ix86_expand_branch (GET_CODE (operands[0]),
> > +              operands[1], operands[2], operands[3]);
> > +  DONE;
> > +})
> > +
> > (define_expand "cstore<mode>4"
> >   [(set (reg:CC FLAGS_REG)
> >    (compare:CC (match_operand:SDWIM 2 "nonimmediate_operand")
> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > index c988935d4df..88fb1154699 100644
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -2175,9 +2175,9 @@ (define_insn "ktest<mode>"
> >    (set_attr "type" "msklog")
> >    (set_attr "prefix" "vex")])
> >
> > -(define_insn "kortest<mode>"
> > -  [(set (reg:CC FLAGS_REG)
> > -    (unspec:CC
> > +(define_insn "*kortest<mode>"
> > +  [(set (reg FLAGS_REG)
> > +    (unspec
> >      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand" "k")
> >       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand" "k")]
> >      UNSPEC_KORTEST))]
> > @@ -2187,6 +2187,30 @@ (define_insn "kortest<mode>"
> >    (set_attr "type" "msklog")
> >    (set_attr "prefix" "vex")])
> >
> > +(define_insn "kortest<mode>_ccc"
> > +  [(set (reg:CCC FLAGS_REG)
> > +    (unspec:CCC
> > +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
> > +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
> > +      UNSPEC_KORTEST))]
> > +  "TARGET_AVX512F")
> > +
> > +(define_insn "kortest<mode>_ccz"
> > +  [(set (reg:CCZ FLAGS_REG)
> > +    (unspec:CCZ
> > +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
> > +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
> > +      UNSPEC_KORTEST))]
> > +  "TARGET_AVX512F")
> > +
> > +(define_expand "kortest<mode>"
> > +  [(set (reg:CC FLAGS_REG)
> > +    (unspec:CC
> > +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
> > +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
> > +      UNSPEC_KORTEST))]
> > +  "TARGET_AVX512F")
> > +
> > (define_insn "kunpckhi"
> >   [(set (match_operand:HI 0 "register_operand" "=k")
> >    (ior:HI
> > @@ -27840,14 +27864,14 @@ (define_insn "<avx512>_store<mode>_mask"
> >
> > (define_expand "cbranch<mode>4"
> >   [(set (reg:CC FLAGS_REG)
> > -    (compare:CC (match_operand:VI48_AVX 1 "register_operand")
> > -            (match_operand:VI48_AVX 2 "nonimmediate_operand")))
> > +    (compare:CC (match_operand:VI48_AVX_AVX512F 1 "register_operand")
> > +            (match_operand:VI48_AVX_AVX512F 2 "nonimmediate_operand")))
> >    (set (pc) (if_then_else
> >           (match_operator 0 "bt_comparison_operator"
> >        [(reg:CC FLAGS_REG) (const_int 0)])
> >           (label_ref (match_operand 3))
> >           (pc)))]
> > -  "TARGET_SSE4_1"
> > +  "TARGET_SSE4_1 && (<MODE_SIZE> != 64 || !TARGET_PREFER_AVX256)"
> > {
> >   ix86_expand_branch (GET_CODE (operands[0]),
> >              operands[1], operands[2], operands[3]);
> > diff --git a/gcc/testsuite/gcc.target/i386/pr104610-2.c b/gcc/testsuite/gcc.target/i386/pr104610-2.c
> > new file mode 100644
> > index 00000000000..999ef926a18
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr104610-2.c
> > @@ -0,0 +1,14 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-mavx512f -O2 -mtune=generic" } */
> > +/* { dg-final { scan-assembler-times {(?n)vpcmpeq.*zmm} 2 } } */
> > +/* { dg-final { scan-assembler-times {(?n)kortest.*k[0-7]} 2 } } */
> > +
> > +int compare (const char* s1, const char* s2)
> > +{
> > +  return __builtin_memcmp (s1, s2, 64) == 0;
> > +}
> > +
> > +int compare1 (const char* s1, const char* s2)
> > +{
> > +  return __builtin_memcmp (s1, s2, 64) != 0;
> > +}
> > --
> > 2.31.1
> >



-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] Improve memcmpeq for 512-bit vector with vpcmpeq + kortest.
  2023-10-27  7:21   ` Hongtao Liu
@ 2023-10-27  7:37     ` Hongtao Liu
  2023-10-27 10:16     ` Richard Biener
  1 sibling, 0 replies; 5+ messages in thread
From: Hongtao Liu @ 2023-10-27  7:37 UTC (permalink / raw)
  To: Richard Biener; +Cc: liuhongt, gcc-patches, ubizjak

On Fri, Oct 27, 2023 at 3:21 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Fri, Oct 27, 2023 at 2:49 PM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> >
> >
> > > Am 27.10.2023 um 07:50 schrieb liuhongt <hongtao.liu@intel.com>:
> > >
> > > When 2 vectors are equal, kmask is allones and kortest will set CF,
> > > else CF will be cleared.
> > >
> > > So CF bit can be used to check for the result of the comparison.
> > >
> > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > Ok for trunk?
> >
> > Is that also profitable for 256bit aka AVX10?
> Yes, it's also available for both 128-bit and 256-bit with AVX10, from
> performance perspective it's better.
> AVX10:
>   vpcmp + kortest
>  vs
> AVX2:
>  vpxor + vptest
>
>  vptest is more expensive than vpcmp + kortest
>
> > Is there a jump on carry in case the result feeds control flow rather than a value and is using ktest better then (does combine figure this out?)
> There are JC and JNC, there're many pattern matches for ptest which
> can't be automatically adjusted to kortest by combining, backend needs
> to manually transform them.
> That's why my patch only handles 64-bit vectors(to avoid regressing
I mean 64 bytes.
> those pattern match stuff).
>
> >
> > > Before:
> > >        vmovdqu (%rsi), %ymm0
> > >        vpxorq  (%rdi), %ymm0, %ymm0
> > >        vptest  %ymm0, %ymm0
> > >        jne     .L2
> > >        vmovdqu 32(%rsi), %ymm0
> > >        vpxorq  32(%rdi), %ymm0, %ymm0
> > >        vptest  %ymm0, %ymm0
> > >        je      .L5
> > > .L2:
> > >        movl    $1, %eax
> > >        xorl    $1, %eax
> > >        vzeroupper
> > >        ret
> > >
> > > After:
> > >        vmovdqu64       (%rsi), %zmm0
> > >        xorl    %eax, %eax
> > >        vpcmpeqd        (%rdi), %zmm0, %k0
> > >        kortestw        %k0, %k0
> > >        setc    %al
> > >        vzeroupper
> > >        ret
> > >
> > > gcc/ChangeLog:
> > >
> > >    PR target/104610
> > >    * config/i386/i386-expand.cc (ix86_expand_branch): Handle
> > >    512-bit vector with vpcmpeq + kortest.
> > >    * config/i386/i386.md (cbranchxi4): New expander.
> > >    * config/i386/sse.md: (cbranch<mode>4): Extend to V16SImode
> > >    and V8DImode.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >    * gcc.target/i386/pr104610-2.c: New test.
> > > ---
> > > gcc/config/i386/i386-expand.cc             | 55 +++++++++++++++-------
> > > gcc/config/i386/i386.md                    | 16 +++++++
> > > gcc/config/i386/sse.md                     | 36 +++++++++++---
> > > gcc/testsuite/gcc.target/i386/pr104610-2.c | 14 ++++++
> > > 4 files changed, 99 insertions(+), 22 deletions(-)
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr104610-2.c
> > >
> > > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> > > index 1eae9d7c78c..c664cb61e80 100644
> > > --- a/gcc/config/i386/i386-expand.cc
> > > +++ b/gcc/config/i386/i386-expand.cc
> > > @@ -2411,30 +2411,53 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
> > >   rtx tmp;
> > >
> > >   /* Handle special case - vector comparsion with boolean result, transform
> > > -     it using ptest instruction.  */
> > > +     it using ptest instruction or vpcmpeq + kortest.  */
> > >   if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
> > >       || (mode == TImode && !TARGET_64BIT)
> > > -      || mode == OImode)
> > > +      || mode == OImode
> > > +      || GET_MODE_SIZE (mode) == 64)
> > >     {
> > > -      rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
> > > -      machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
> > > +      unsigned msize = GET_MODE_SIZE (mode);
> > > +      machine_mode p_mode
> > > +    = msize == 64 ? V16SImode : msize == 32 ? V4DImode : V2DImode;
> > > +      /* kortest set CF when result is 0xFFFF (op0 == op1).  */
> > > +      rtx flag = gen_rtx_REG (msize == 64 ? CCCmode : CCZmode, FLAGS_REG);
> > >
> > >       gcc_assert (code == EQ || code == NE);
> > >
> > > -      if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
> > > +      /* Using vpcmpeq zmm zmm k + kortest for 512-bit vectors.  */
> > > +      if (msize == 64)
> > >    {
> > > -      op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
> > > -      op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
> > > -      mode = p_mode;
> > > +      if (mode != V16SImode)
> > > +        {
> > > +          op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
> > > +          op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
> > > +        }
> > > +
> > > +      tmp = gen_reg_rtx (HImode);
> > > +      emit_insn (gen_avx512f_cmpv16si3 (tmp, op0, op1, GEN_INT (0)));
> > > +      emit_insn (gen_kortesthi_ccc (tmp, tmp));
> > > +    }
> > > +      /* Using ptest for 128/256-bit vectors.  */
> > > +      else
> > > +    {
> > > +      if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
> > > +        {
> > > +          op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
> > > +          op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
> > > +          mode = p_mode;
> > > +        }
> > > +
> > > +      /* Generate XOR since we can't check that one operand is zero
> > > +         vector.  */
> > > +      tmp = gen_reg_rtx (mode);
> > > +      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
> > > +      tmp = gen_lowpart (p_mode, tmp);
> > > +      emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
> > > +                  gen_rtx_UNSPEC (CCZmode,
> > > +                          gen_rtvec (2, tmp, tmp),
> > > +                          UNSPEC_PTEST)));
> > >    }
> > > -      /* Generate XOR since we can't check that one operand is zero vector.  */
> > > -      tmp = gen_reg_rtx (mode);
> > > -      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
> > > -      tmp = gen_lowpart (p_mode, tmp);
> > > -      emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
> > > -                  gen_rtx_UNSPEC (CCZmode,
> > > -                          gen_rtvec (2, tmp, tmp),
> > > -                          UNSPEC_PTEST)));
> > >       tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
> > >       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
> > >                  gen_rtx_LABEL_REF (VOIDmode, label),
> > > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> > > index abaf2f311e8..51d8d0c3b97 100644
> > > --- a/gcc/config/i386/i386.md
> > > +++ b/gcc/config/i386/i386.md
> > > @@ -1442,6 +1442,22 @@ (define_expand "cbranchoi4"
> > >   DONE;
> > > })
> > >
> > > +(define_expand "cbranchxi4"
> > > +  [(set (reg:CC FLAGS_REG)
> > > +    (compare:CC (match_operand:XI 1 "nonimmediate_operand")
> > > +            (match_operand:XI 2 "nonimmediate_operand")))
> > > +   (set (pc) (if_then_else
> > > +           (match_operator 0 "bt_comparison_operator"
> > > +        [(reg:CC FLAGS_REG) (const_int 0)])
> > > +           (label_ref (match_operand 3))
> > > +           (pc)))]
> > > +  "TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256"
> > > +{
> > > +  ix86_expand_branch (GET_CODE (operands[0]),
> > > +              operands[1], operands[2], operands[3]);
> > > +  DONE;
> > > +})
> > > +
> > > (define_expand "cstore<mode>4"
> > >   [(set (reg:CC FLAGS_REG)
> > >    (compare:CC (match_operand:SDWIM 2 "nonimmediate_operand")
> > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > > index c988935d4df..88fb1154699 100644
> > > --- a/gcc/config/i386/sse.md
> > > +++ b/gcc/config/i386/sse.md
> > > @@ -2175,9 +2175,9 @@ (define_insn "ktest<mode>"
> > >    (set_attr "type" "msklog")
> > >    (set_attr "prefix" "vex")])
> > >
> > > -(define_insn "kortest<mode>"
> > > -  [(set (reg:CC FLAGS_REG)
> > > -    (unspec:CC
> > > +(define_insn "*kortest<mode>"
> > > +  [(set (reg FLAGS_REG)
> > > +    (unspec
> > >      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand" "k")
> > >       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand" "k")]
> > >      UNSPEC_KORTEST))]
> > > @@ -2187,6 +2187,30 @@ (define_insn "kortest<mode>"
> > >    (set_attr "type" "msklog")
> > >    (set_attr "prefix" "vex")])
> > >
> > > +(define_insn "kortest<mode>_ccc"
> > > +  [(set (reg:CCC FLAGS_REG)
> > > +    (unspec:CCC
> > > +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
> > > +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
> > > +      UNSPEC_KORTEST))]
> > > +  "TARGET_AVX512F")
> > > +
> > > +(define_insn "kortest<mode>_ccz"
> > > +  [(set (reg:CCZ FLAGS_REG)
> > > +    (unspec:CCZ
> > > +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
> > > +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
> > > +      UNSPEC_KORTEST))]
> > > +  "TARGET_AVX512F")
> > > +
> > > +(define_expand "kortest<mode>"
> > > +  [(set (reg:CC FLAGS_REG)
> > > +    (unspec:CC
> > > +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
> > > +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
> > > +      UNSPEC_KORTEST))]
> > > +  "TARGET_AVX512F")
> > > +
> > > (define_insn "kunpckhi"
> > >   [(set (match_operand:HI 0 "register_operand" "=k")
> > >    (ior:HI
> > > @@ -27840,14 +27864,14 @@ (define_insn "<avx512>_store<mode>_mask"
> > >
> > > (define_expand "cbranch<mode>4"
> > >   [(set (reg:CC FLAGS_REG)
> > > -    (compare:CC (match_operand:VI48_AVX 1 "register_operand")
> > > -            (match_operand:VI48_AVX 2 "nonimmediate_operand")))
> > > +    (compare:CC (match_operand:VI48_AVX_AVX512F 1 "register_operand")
> > > +            (match_operand:VI48_AVX_AVX512F 2 "nonimmediate_operand")))
> > >    (set (pc) (if_then_else
> > >           (match_operator 0 "bt_comparison_operator"
> > >        [(reg:CC FLAGS_REG) (const_int 0)])
> > >           (label_ref (match_operand 3))
> > >           (pc)))]
> > > -  "TARGET_SSE4_1"
> > > +  "TARGET_SSE4_1 && (<MODE_SIZE> != 64 || !TARGET_PREFER_AVX256)"
> > > {
> > >   ix86_expand_branch (GET_CODE (operands[0]),
> > >              operands[1], operands[2], operands[3]);
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr104610-2.c b/gcc/testsuite/gcc.target/i386/pr104610-2.c
> > > new file mode 100644
> > > index 00000000000..999ef926a18
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr104610-2.c
> > > @@ -0,0 +1,14 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-mavx512f -O2 -mtune=generic" } */
> > > +/* { dg-final { scan-assembler-times {(?n)vpcmpeq.*zmm} 2 } } */
> > > +/* { dg-final { scan-assembler-times {(?n)kortest.*k[0-7]} 2 } } */
> > > +
> > > +int compare (const char* s1, const char* s2)
> > > +{
> > > +  return __builtin_memcmp (s1, s2, 64) == 0;
> > > +}
> > > +
> > > +int compare1 (const char* s1, const char* s2)
> > > +{
> > > +  return __builtin_memcmp (s1, s2, 64) != 0;
> > > +}
> > > --
> > > 2.31.1
> > >
>
>
>
> --
> BR,
> Hongtao



-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] Improve memcmpeq for 512-bit vector with vpcmpeq + kortest.
  2023-10-27  7:21   ` Hongtao Liu
  2023-10-27  7:37     ` Hongtao Liu
@ 2023-10-27 10:16     ` Richard Biener
  1 sibling, 0 replies; 5+ messages in thread
From: Richard Biener @ 2023-10-27 10:16 UTC (permalink / raw)
  To: Hongtao Liu; +Cc: liuhongt, gcc-patches, ubizjak



> Am 27.10.2023 um 09:13 schrieb Hongtao Liu <crazylht@gmail.com>:
> 
> On Fri, Oct 27, 2023 at 2:49 PM Richard Biener
> <richard.guenther@gmail.com> wrote:
>> 
>> 
>> 
>>>> Am 27.10.2023 um 07:50 schrieb liuhongt <hongtao.liu@intel.com>:
>>> 
>>> When 2 vectors are equal, kmask is allones and kortest will set CF,
>>> else CF will be cleared.
>>> 
>>> So CF bit can be used to check for the result of the comparison.
>>> 
>>> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
>>> Ok for trunk?
>> 
>> Is that also profitable for 256bit aka AVX10?
> Yes, it's also available for both 128-bit and 256-bit with AVX10, from
> performance perspective it's better.
> AVX10:
>  vpcmp + kortest
> vs
> AVX2:
> vpxor + vptest
> 
> vptest is more expensive than vpcmp + kortest
> 
>> Is there a jump on carry in case the result feeds control flow rather than a value and is using ktest better then (does combine figure this out?)
> There are JC and JNC, there're many pattern matches for ptest which
> can't be automatically adjusted to kortest by combining, backend needs
> to manually transform them.
> That's why my patch only handles 64-bit vectors(to avoid regressing
> those pattern match stuff).

Ah, I see.  That’s exactly what I was wondering.

Richard 

> 
>> 
>>> Before:
>>>       vmovdqu (%rsi), %ymm0
>>>       vpxorq  (%rdi), %ymm0, %ymm0
>>>       vptest  %ymm0, %ymm0
>>>       jne     .L2
>>>       vmovdqu 32(%rsi), %ymm0
>>>       vpxorq  32(%rdi), %ymm0, %ymm0
>>>       vptest  %ymm0, %ymm0
>>>       je      .L5
>>> .L2:
>>>       movl    $1, %eax
>>>       xorl    $1, %eax
>>>       vzeroupper
>>>       ret
>>> 
>>> After:
>>>       vmovdqu64       (%rsi), %zmm0
>>>       xorl    %eax, %eax
>>>       vpcmpeqd        (%rdi), %zmm0, %k0
>>>       kortestw        %k0, %k0
>>>       setc    %al
>>>       vzeroupper
>>>       ret
>>> 
>>> gcc/ChangeLog:
>>> 
>>>   PR target/104610
>>>   * config/i386/i386-expand.cc (ix86_expand_branch): Handle
>>>   512-bit vector with vpcmpeq + kortest.
>>>   * config/i386/i386.md (cbranchxi4): New expander.
>>>   * config/i386/sse.md: (cbranch<mode>4): Extend to V16SImode
>>>   and V8DImode.
>>> 
>>> gcc/testsuite/ChangeLog:
>>> 
>>>   * gcc.target/i386/pr104610-2.c: New test.
>>> ---
>>> gcc/config/i386/i386-expand.cc             | 55 +++++++++++++++-------
>>> gcc/config/i386/i386.md                    | 16 +++++++
>>> gcc/config/i386/sse.md                     | 36 +++++++++++---
>>> gcc/testsuite/gcc.target/i386/pr104610-2.c | 14 ++++++
>>> 4 files changed, 99 insertions(+), 22 deletions(-)
>>> create mode 100644 gcc/testsuite/gcc.target/i386/pr104610-2.c
>>> 
>>> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
>>> index 1eae9d7c78c..c664cb61e80 100644
>>> --- a/gcc/config/i386/i386-expand.cc
>>> +++ b/gcc/config/i386/i386-expand.cc
>>> @@ -2411,30 +2411,53 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
>>>  rtx tmp;
>>> 
>>>  /* Handle special case - vector comparsion with boolean result, transform
>>> -     it using ptest instruction.  */
>>> +     it using ptest instruction or vpcmpeq + kortest.  */
>>>  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
>>>      || (mode == TImode && !TARGET_64BIT)
>>> -      || mode == OImode)
>>> +      || mode == OImode
>>> +      || GET_MODE_SIZE (mode) == 64)
>>>    {
>>> -      rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
>>> -      machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
>>> +      unsigned msize = GET_MODE_SIZE (mode);
>>> +      machine_mode p_mode
>>> +    = msize == 64 ? V16SImode : msize == 32 ? V4DImode : V2DImode;
>>> +      /* kortest set CF when result is 0xFFFF (op0 == op1).  */
>>> +      rtx flag = gen_rtx_REG (msize == 64 ? CCCmode : CCZmode, FLAGS_REG);
>>> 
>>>      gcc_assert (code == EQ || code == NE);
>>> 
>>> -      if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
>>> +      /* Using vpcmpeq zmm zmm k + kortest for 512-bit vectors.  */
>>> +      if (msize == 64)
>>>   {
>>> -      op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
>>> -      op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
>>> -      mode = p_mode;
>>> +      if (mode != V16SImode)
>>> +        {
>>> +          op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
>>> +          op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
>>> +        }
>>> +
>>> +      tmp = gen_reg_rtx (HImode);
>>> +      emit_insn (gen_avx512f_cmpv16si3 (tmp, op0, op1, GEN_INT (0)));
>>> +      emit_insn (gen_kortesthi_ccc (tmp, tmp));
>>> +    }
>>> +      /* Using ptest for 128/256-bit vectors.  */
>>> +      else
>>> +    {
>>> +      if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
>>> +        {
>>> +          op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
>>> +          op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
>>> +          mode = p_mode;
>>> +        }
>>> +
>>> +      /* Generate XOR since we can't check that one operand is zero
>>> +         vector.  */
>>> +      tmp = gen_reg_rtx (mode);
>>> +      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
>>> +      tmp = gen_lowpart (p_mode, tmp);
>>> +      emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
>>> +                  gen_rtx_UNSPEC (CCZmode,
>>> +                          gen_rtvec (2, tmp, tmp),
>>> +                          UNSPEC_PTEST)));
>>>   }
>>> -      /* Generate XOR since we can't check that one operand is zero vector.  */
>>> -      tmp = gen_reg_rtx (mode);
>>> -      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
>>> -      tmp = gen_lowpart (p_mode, tmp);
>>> -      emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
>>> -                  gen_rtx_UNSPEC (CCZmode,
>>> -                          gen_rtvec (2, tmp, tmp),
>>> -                          UNSPEC_PTEST)));
>>>      tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
>>>      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
>>>                 gen_rtx_LABEL_REF (VOIDmode, label),
>>> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
>>> index abaf2f311e8..51d8d0c3b97 100644
>>> --- a/gcc/config/i386/i386.md
>>> +++ b/gcc/config/i386/i386.md
>>> @@ -1442,6 +1442,22 @@ (define_expand "cbranchoi4"
>>>  DONE;
>>> })
>>> 
>>> +(define_expand "cbranchxi4"
>>> +  [(set (reg:CC FLAGS_REG)
>>> +    (compare:CC (match_operand:XI 1 "nonimmediate_operand")
>>> +            (match_operand:XI 2 "nonimmediate_operand")))
>>> +   (set (pc) (if_then_else
>>> +           (match_operator 0 "bt_comparison_operator"
>>> +        [(reg:CC FLAGS_REG) (const_int 0)])
>>> +           (label_ref (match_operand 3))
>>> +           (pc)))]
>>> +  "TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256"
>>> +{
>>> +  ix86_expand_branch (GET_CODE (operands[0]),
>>> +              operands[1], operands[2], operands[3]);
>>> +  DONE;
>>> +})
>>> +
>>> (define_expand "cstore<mode>4"
>>>  [(set (reg:CC FLAGS_REG)
>>>   (compare:CC (match_operand:SDWIM 2 "nonimmediate_operand")
>>> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
>>> index c988935d4df..88fb1154699 100644
>>> --- a/gcc/config/i386/sse.md
>>> +++ b/gcc/config/i386/sse.md
>>> @@ -2175,9 +2175,9 @@ (define_insn "ktest<mode>"
>>>   (set_attr "type" "msklog")
>>>   (set_attr "prefix" "vex")])
>>> 
>>> -(define_insn "kortest<mode>"
>>> -  [(set (reg:CC FLAGS_REG)
>>> -    (unspec:CC
>>> +(define_insn "*kortest<mode>"
>>> +  [(set (reg FLAGS_REG)
>>> +    (unspec
>>>     [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand" "k")
>>>      (match_operand:SWI1248_AVX512BWDQ 1 "register_operand" "k")]
>>>     UNSPEC_KORTEST))]
>>> @@ -2187,6 +2187,30 @@ (define_insn "kortest<mode>"
>>>   (set_attr "type" "msklog")
>>>   (set_attr "prefix" "vex")])
>>> 
>>> +(define_insn "kortest<mode>_ccc"
>>> +  [(set (reg:CCC FLAGS_REG)
>>> +    (unspec:CCC
>>> +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
>>> +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
>>> +      UNSPEC_KORTEST))]
>>> +  "TARGET_AVX512F")
>>> +
>>> +(define_insn "kortest<mode>_ccz"
>>> +  [(set (reg:CCZ FLAGS_REG)
>>> +    (unspec:CCZ
>>> +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
>>> +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
>>> +      UNSPEC_KORTEST))]
>>> +  "TARGET_AVX512F")
>>> +
>>> +(define_expand "kortest<mode>"
>>> +  [(set (reg:CC FLAGS_REG)
>>> +    (unspec:CC
>>> +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
>>> +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
>>> +      UNSPEC_KORTEST))]
>>> +  "TARGET_AVX512F")
>>> +
>>> (define_insn "kunpckhi"
>>>  [(set (match_operand:HI 0 "register_operand" "=k")
>>>   (ior:HI
>>> @@ -27840,14 +27864,14 @@ (define_insn "<avx512>_store<mode>_mask"
>>> 
>>> (define_expand "cbranch<mode>4"
>>>  [(set (reg:CC FLAGS_REG)
>>> -    (compare:CC (match_operand:VI48_AVX 1 "register_operand")
>>> -            (match_operand:VI48_AVX 2 "nonimmediate_operand")))
>>> +    (compare:CC (match_operand:VI48_AVX_AVX512F 1 "register_operand")
>>> +            (match_operand:VI48_AVX_AVX512F 2 "nonimmediate_operand")))
>>>   (set (pc) (if_then_else
>>>          (match_operator 0 "bt_comparison_operator"
>>>       [(reg:CC FLAGS_REG) (const_int 0)])
>>>          (label_ref (match_operand 3))
>>>          (pc)))]
>>> -  "TARGET_SSE4_1"
>>> +  "TARGET_SSE4_1 && (<MODE_SIZE> != 64 || !TARGET_PREFER_AVX256)"
>>> {
>>>  ix86_expand_branch (GET_CODE (operands[0]),
>>>             operands[1], operands[2], operands[3]);
>>> diff --git a/gcc/testsuite/gcc.target/i386/pr104610-2.c b/gcc/testsuite/gcc.target/i386/pr104610-2.c
>>> new file mode 100644
>>> index 00000000000..999ef926a18
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/i386/pr104610-2.c
>>> @@ -0,0 +1,14 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-mavx512f -O2 -mtune=generic" } */
>>> +/* { dg-final { scan-assembler-times {(?n)vpcmpeq.*zmm} 2 } } */
>>> +/* { dg-final { scan-assembler-times {(?n)kortest.*k[0-7]} 2 } } */
>>> +
>>> +int compare (const char* s1, const char* s2)
>>> +{
>>> +  return __builtin_memcmp (s1, s2, 64) == 0;
>>> +}
>>> +
>>> +int compare1 (const char* s1, const char* s2)
>>> +{
>>> +  return __builtin_memcmp (s1, s2, 64) != 0;
>>> +}
>>> --
>>> 2.31.1
>>> 
> 
> 
> 
> -- 
> BR,
> Hongtao

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2023-10-27 10:17 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-10-27  5:47 [PATCH] Improve memcmpeq for 512-bit vector with vpcmpeq + kortest liuhongt
2023-10-27  6:49 ` Richard Biener
2023-10-27  7:21   ` Hongtao Liu
2023-10-27  7:37     ` Hongtao Liu
2023-10-27 10:16     ` Richard Biener

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).