public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r14-2447] Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0
@ 2023-07-12  7:51 hongtao Liu
  0 siblings, 0 replies; only message in thread
From: hongtao Liu @ 2023-07-12  7:51 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:13c556d6ae84be3ee2bc245a56eafa58221de86a

commit r14-2447-g13c556d6ae84be3ee2bc245a56eafa58221de86a
Author: liuhongt <hongtao.liu@intel.com>
Date:   Thu Jun 29 14:25:28 2023 +0800

    Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0'
    
    False dependency happens when destination is only updated by
    pternlog. There is no false dependency when destination is also used
    in source. So either a pxor should be inserted, or input operand
    should be set with constraint '0'.
    
    gcc/ChangeLog:
    
            PR target/110438
            PR target/110202
            * config/i386/predicates.md
            (int_float_vector_all_ones_operand): New predicate.
            * config/i386/sse.md (*vmov<mode>_constm1_pternlog_false_dep): New
            define_insn.
            (*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep):
            Ditto.
            (*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep):
            Ditto.
            (*<avx512>_cvtmask2<ssemodesuffix><mode>): Adjust to
            define_insn_and_split to avoid false dependence.
            (*<avx512>_cvtmask2<ssemodesuffix><mode>): Ditto.
            (<mask_codefor>one_cmpl<mode>2<mask_name>): Adjust constraint
            of operands 1 to '0' to avoid false dependence.
            (*andnot<mode>3): Ditto.
            (iornot<mode>3): Ditto.
            (*<nlogic><mode>3): Ditto.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/i386/pr110438.c: New test.
            * gcc.target/i386/pr100711-6.c: Adjust testcase.

Diff:
---
 gcc/config/i386/predicates.md              |   8 +-
 gcc/config/i386/sse.md                     | 145 ++++++++++++++++++++++++++---
 gcc/testsuite/gcc.target/i386/pr100711-6.c |   2 +-
 gcc/testsuite/gcc.target/i386/pr110438.c   |  30 ++++++
 4 files changed, 168 insertions(+), 17 deletions(-)

diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 7ddbe01a6f9..37d20c6303a 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1192,12 +1192,18 @@
     return false;
 })
 
-/* Return true if operand is a vector constant that is all ones. */
+/* Return true if operand is an integral vector constant that is all ones. */
 (define_predicate "vector_all_ones_operand"
   (and (match_code "const_vector")
        (match_test "INTEGRAL_MODE_P (GET_MODE (op))")
        (match_test "op == CONSTM1_RTX (GET_MODE (op))")))
 
+/* Return true if operand is a vector constant that is all ones. */
+(define_predicate "int_float_vector_all_ones_operand"
+  (ior (match_operand 0 "vector_all_ones_operand")
+       (match_operand 0 "float_vector_all_ones_operand")
+       (match_test "op == constm1_rtx")))
+
 /* Return true if operand is an 128/256bit all ones vector
    that zero-extends to 256/512bit.  */
 (define_predicate "vector_all_ones_zero_extend_half_operand"
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index a2099373123..24359cd189c 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1382,6 +1382,29 @@
 	      ]
 	      (symbol_ref "true")))])
 
+; False dependency happens on destination register which is not really
+; used when moving all ones to vector register
+(define_split
+  [(set (match_operand:VMOVE 0 "register_operand")
+	(match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))]
+  "TARGET_AVX512F && reload_completed
+  && (<MODE_SIZE> == 64 || EXT_REX_SSE_REG_P (operands[0]))
+  && optimize_insn_for_speed_p ()"
+  [(set (match_dup 0) (match_dup 2))
+   (parallel
+     [(set (match_dup 0) (match_dup 1))
+      (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "operands[2] = CONST0_RTX (<MODE>mode);")
+
+(define_insn "*vmov<mode>_constm1_pternlog_false_dep"
+  [(set (match_operand:VMOVE 0 "register_operand" "=v")
+	(match_operand:VMOVE 1 "int_float_vector_all_ones_operand" "<sseconstm1>"))
+   (unspec [(match_operand:VMOVE 2 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)]
+   "TARGET_AVX512VL || <MODE_SIZE> == 64"
+   "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}"
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix" "evex")])
+
 ;; If mem_addr points to a memory region with less than whole vector size bytes
 ;; of accessible memory and k is a mask that would prevent reading the inaccessible
 ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to vpblendd
@@ -9336,7 +9359,7 @@
     operands[3] = CONST0_RTX (<MODE>mode);
   }")
 
-(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>"
+(define_insn_and_split "*<avx512>_cvtmask2<ssemodesuffix><mode>"
   [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v")
 	(vec_merge:VI48_AVX512VL
 	  (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
@@ -9346,11 +9369,35 @@
   "@
    vpmovm2<ssemodesuffix>\t{%1, %0|%0, %1}
    vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}"
+  "&& !TARGET_AVX512DQ && reload_completed
+   && optimize_function_for_speed_p (cfun)"
+  [(set (match_dup 0) (match_dup 4))
+   (parallel
+    [(set (match_dup 0)
+	  (vec_merge:VI48_AVX512VL
+	    (match_dup 2)
+	    (match_dup 3)
+	    (match_dup 1)))
+     (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "operands[4] = CONST0_RTX (<MODE>mode);"
   [(set_attr "isa" "avx512dq,*")
    (set_attr "length_immediate" "0,1")
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep"
+  [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
+	(vec_merge:VI48_AVX512VL
+	  (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
+	  (match_operand:VI48_AVX512VL 3 "const0_operand")
+	  (match_operand:<avx512fmaskmode> 1 "register_operand" "Yk")))
+   (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)]
+  "TARGET_AVX512F && !TARGET_AVX512DQ"
+  "vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}"
+  [(set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
 (define_expand "extendv2sfv2df2"
   [(set (match_operand:V2DF 0 "register_operand")
 	(float_extend:V2DF
@@ -17164,14 +17211,82 @@
     operands[2] = force_reg (<MODE>mode, operands[2]);
 })
 
-(define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>"
-  [(set (match_operand:VI 0 "register_operand" "=v,v")
-	(xor:VI (match_operand:VI 1 "bcst_vector_operand" "vBr,m")
-		(match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))]
+(define_insn_and_split "<mask_codefor>one_cmpl<mode>2<mask_name>"
+  [(set (match_operand:VI 0 "register_operand" "=v,v,v")
+	(xor:VI (match_operand:VI 1 "bcst_vector_operand"     " 0, m,Br")
+		(match_operand:VI 2 "vector_all_ones_operand" "BC,BC,BC")))]
   "TARGET_AVX512F
    && (!<mask_applied>
        || <ssescalarmode>mode == SImode
        || <ssescalarmode>mode == DImode)"
+{
+  if (!<mask_applied> && which_alternative
+      && optimize_insn_for_speed_p ())
+    return "#";
+
+  if (TARGET_AVX512VL)
+    return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}";
+  else
+    return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}";
+}
+  "&& reload_completed && !REG_P (operands[1]) && !<mask_applied>
+   && optimize_insn_for_speed_p ()"
+  [(set (match_dup 0) (match_dup 3))
+   (parallel
+     [(set (match_dup 0)
+	   (xor:VI (match_dup 1) (match_dup 2)))
+      (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+{
+  if (MEM_P (operands[1]))
+   {
+     operands[3] = operands[1];
+     operands[1] = operands[0];
+   }
+  else
+    {
+      if (GET_MODE_SIZE (<ssescalarmode>mode) < 4)
+	{
+	  if (<MODE_SIZE> == 64 ? TARGET_AVX512BW
+	     : (TARGET_AVX512BW && TARGET_AVX512VL)
+	       || !EXT_REX_SSE_REG_P (operands[0]))
+	    {
+	      operands[3] = operands[1];
+	      operands[1] = operands[0];
+	    }
+	  else
+	    operands[3] = CONST0_RTX (<MODE>mode);
+	}
+       else
+	 {
+	    if (<MODE_SIZE> == 64 || TARGET_AVX512VL
+		|| !EXT_REX_SSE_REG_P (operands[0]))
+	      {
+		operands[3] = operands[1];
+		operands[1] = operands[0];
+	      }
+	    else
+	      operands[3] = CONST0_RTX (<MODE>mode);
+	 }
+    }
+}
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "evex")
+   (set (attr "mode")
+        (if_then_else (match_test "TARGET_AVX512VL")
+		      (const_string "<sseinsnmode>")
+		      (const_string "XI")))
+   (set (attr "enabled")
+	(if_then_else (eq_attr "alternative" "1")
+		      (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
+		      (const_int 1)))])
+
+(define_insn "*one_cmpl<mode>2_pternlog_false_dep"
+  [(set (match_operand:VI 0 "register_operand" "=v,v,v")
+	(xor:VI (match_operand:VI 1 "bcst_vector_operand"     " 0, m,Br")
+		(match_operand:VI 2 "vector_all_ones_operand" "BC,BC,BC")))
+   (unspec [(match_operand:VI 3 "register_operand" "0,0,0")]
+     UNSPEC_INSN_FALSE_DEP)]
+  "TARGET_AVX512F"
 {
   if (TARGET_AVX512VL)
     return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}";
@@ -17224,7 +17339,7 @@
   [(set (match_operand:VI 0 "register_operand" "=x,x,v,v,v")
 	(and:VI
 	  (not:VI (match_operand:VI 1 "bcst_vector_operand" "0,x,v,m,Br"))
-	  (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,v,v")))]
+	  (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,0,0")))]
   "TARGET_SSE
    && (register_operand (operands[1], <MODE>mode)
        || register_operand (operands[2], <MODE>mode))"
@@ -17683,8 +17798,8 @@
   [(set (match_operand:VI 0 "register_operand" "=v,v,v,v")
 	(ior:VI
 	  (not:VI
-	    (match_operand:VI 1 "bcst_vector_operand" "v,Br,v,m"))
-	  (match_operand:VI 2 "bcst_vector_operand" "vBr,v,m,v")))]
+	    (match_operand:VI 1 "bcst_vector_operand" "0,m,  0,vBr"))
+	  (match_operand:VI 2 "bcst_vector_operand"   "m,0,vBr,  0")))]
   "(<MODE_SIZE> == 64 || TARGET_AVX512VL
     || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
    && (register_operand (operands[1], <MODE>mode)
@@ -17708,7 +17823,7 @@
 		      (const_string "<sseinsnmode>")
 		      (const_string "XI")))
    (set (attr "enabled")
-	(if_then_else (eq_attr "alternative" "2,3")
+	(if_then_else (eq_attr "alternative" "0,1")
 		      (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
 		      (const_string "*")))])
 
@@ -17716,8 +17831,8 @@
   [(set (match_operand:VI 0 "register_operand" "=v,v")
 	(not:VI
 	  (xor:VI
-	    (match_operand:VI 1 "bcst_vector_operand" "%v,v")
-	    (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
+	    (match_operand:VI 1 "bcst_vector_operand" "%0, 0")
+	    (match_operand:VI 2 "bcst_vector_operand" " m,vBr"))))]
   "(<MODE_SIZE> == 64 || TARGET_AVX512VL
     || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
    && (register_operand (operands[1], <MODE>mode)
@@ -17736,7 +17851,7 @@
 		      (const_string "<sseinsnmode>")
 		      (const_string "XI")))
    (set (attr "enabled")
-	(if_then_else (eq_attr "alternative" "1")
+	(if_then_else (eq_attr "alternative" "0")
 		      (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
 		      (const_string "*")))])
 
@@ -17747,8 +17862,8 @@
 (define_insn "*<nlogic><mode>3"
   [(set (match_operand:VI 0 "register_operand" "=v,v")
 	(andor:VI
-	  (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v"))
-	  (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
+	  (not:VI (match_operand:VI 1 "bcst_vector_operand" "%0, 0"))
+	  (not:VI (match_operand:VI 2 "bcst_vector_operand" "m,vBr"))))]
   "(<MODE_SIZE> == 64 || TARGET_AVX512VL
     || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
    && (register_operand (operands[1], <MODE>mode)
@@ -17767,7 +17882,7 @@
 		      (const_string "<sseinsnmode>")
 		      (const_string "XI")))
    (set (attr "enabled")
-	(if_then_else (eq_attr "alternative" "1")
+	(if_then_else (eq_attr "alternative" "0")
 		      (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
 		      (const_string "*")))])
 
diff --git a/gcc/testsuite/gcc.target/i386/pr100711-6.c b/gcc/testsuite/gcc.target/i386/pr100711-6.c
index 7142a98f537..808507471c9 100644
--- a/gcc/testsuite/gcc.target/i386/pr100711-6.c
+++ b/gcc/testsuite/gcc.target/i386/pr100711-6.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -O2" } */
+/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -Os" } */
 
 typedef int v16si __attribute__ ((vector_size (64)));
 typedef long long v8di __attribute__((vector_size (64)));
diff --git a/gcc/testsuite/gcc.target/i386/pr110438.c b/gcc/testsuite/gcc.target/i386/pr110438.c
new file mode 100644
index 00000000000..11b8cc59fd2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110438.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -O2 -ftree-vectorize -mno-avx512dq -dp -mprefer-vector-width=512" } */
+/* { dg-final { scan-assembler-times {cvtmask2.*_pternlog} "1" } } */
+/* { dg-final { scan-assembler-times {constm1_pternlog} "1" } } */
+/* { dg-final { scan-assembler-not {(?n)vpternlogd.*\(} } } */
+
+
+#include <immintrin.h>
+
+__m512i g(void)
+{
+  return (__m512i){ 0 } - 1;
+}
+
+__m512i g1(__m512i* a)
+{
+  return ~(*a);
+}
+
+void
+foo (int* a, int* __restrict b)
+{
+  for (int i = 0; i != 16; i++)
+    {
+      if (b[i])
+	a[i] = -1;
+      else
+	a[i] = 0;
+    }
+}

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2023-07-12  7:51 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-07-12  7:51 [gcc r14-2447] Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0 hongtao Liu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).