public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] [i386] GLC tuning: Break false dependency for dest register.
@ 2022-01-13  7:28 Hongyu Wang
  2022-01-13  7:41 ` Uros Bizjak
  0 siblings, 1 reply; 16+ messages in thread
From: Hongyu Wang @ 2022-01-13  7:28 UTC (permalink / raw)
  To: hongtao.liu; +Cc: gcc-patches, ubizjak, wwwhhhyyy

From: wwwhhhyyy <hongyu.wang@intel.com>

Hi,

For the GoldenCove micro-architecture, force the insertion of a zero-idiom
in the asm template to break the false dependency on the dest register for
several insns.

The related insns are:

VPERM/D/Q/PS/PD
VRANGEPD/PS/SD/SS
VGETMANTSS/SD/SH
VGETMANTPS/PD - mem version only
VPMULLQ
VFMULCSH/PH
VFCMULCSH/PH

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,}

Ok for master?

gcc/ChangeLog:

	* config/i386/i386.h (TARGET_DEST_FALSE_DEPENDENCY): New macro.
	* config/i386/i386.md (dest_false_dep): New define_attr.
	* config/i386/sse.md (<avx512>_<complexopname>_<mode><maskc_name><round_name>):
	Insert zero-idiom in output template when attr enabled, set new attribute to
	true for non-mask/maskz insn.
	(avx512fp16_<complexopname>sh_v8hf<mask_scalarc_name><round_scalarcz_name>):
	Likewise.
	(avx512dq_mul<mode>3<mask_name>): Likewise.
	(<avx2_avx512>_permvar<mode><mask_name>): Likewise.
	(avx2_perm<mode>_1<mask_name>): Likewise.
	(avx512f_perm<mode>_1<mask_name>): Likewise.
	(avx512dq_rangep<mode><mask_name><round_saeonly_name>): Likewise.
	(avx512dq_ranges<mode><mask_scalar_name><round_saeonly_scalar_name>):
	Likewise.
	(<avx512>_getmant<mode><mask_name><round_saeonly_name>): Likewise.
	(avx512f_vgetmant<mode><mask_scalar_name><round_saeonly_scalar_name>):
	Likewise.
	* config/i386/subst.md (mask3_dest_false_dep_attr): New subst_attr.
	(mask4_dest_false_dep_attr): Likewise.
	(mask6_dest_false_dep_attr): Likewise.
	(mask10_dest_false_dep_attr): Likewise.
	(maskc_dest_false_dep_attr): Likewise.
	(mask_scalar4_dest_false_dep_attr): Likewise.
	(mask_scalarc_dest_false_dep_attr): Likewise.
	* config/i386/x86-tune.def (X86_TUNE_DEST_FALSE_DEPENDENCY): New
	DEF_TUNE enabled for m_SAPPHIRERAPIDS and m_ALDERLAKE.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx2-dest-false-dependency.c: New test.
	* gcc.target/i386/avx512dq-dest-false-dependency.c: Ditto.
	* gcc.target/i386/avx512f-dest-false-dependency.c: Ditto.
	* gcc.target/i386/avx512fp16-dest-false-dependency.c: Ditto.
	* gcc.target/i386/avx512fp16vl-dest-false-dependency.c: Ditto.
	* gcc.target/i386/avx512vl-dest-false-dependency.c: Ditto.
---
 gcc/config/i386/i386.h                        |   2 +
 gcc/config/i386/i386.md                       |   4 +
 gcc/config/i386/sse.md                        | 142 +++++++++++++++---
 gcc/config/i386/subst.md                      |   7 +
 gcc/config/i386/x86-tune.def                  |   5 +
 .../i386/avx2-dest-false-dependency.c         |  24 +++
 .../i386/avx512dq-dest-false-dependency.c     |  73 +++++++++
 .../i386/avx512f-dest-false-dependency.c      | 102 +++++++++++++
 .../i386/avx512fp16-dest-false-dependency.c   |  45 ++++++
 .../i386/avx512fp16vl-dest-false-dependency.c |  24 +++
 .../i386/avx512vl-dest-false-dependency.c     |  76 ++++++++++
 11 files changed, 486 insertions(+), 18 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx2-dest-false-dependency.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512dq-dest-false-dependency.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-dest-false-dependency.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-dest-false-dependency.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16vl-dest-false-dependency.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-dest-false-dependency.c

diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 3ac0f698ae2..ddbf6b9825a 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -429,6 +429,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
 	ix86_tune_features[X86_TUNE_EXPAND_ABS]
 #define TARGET_V2DF_REDUCTION_PREFER_HADDPD \
 	ix86_tune_features[X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD]
+#define TARGET_DEST_FALSE_DEPENDENCY \
+	ix86_tune_features[X86_TUNE_DEST_FALSE_DEPENDENCY]
 
 /* Feature tests against the various architecture variations.  */
 enum ix86_arch_indices {
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 9937643a273..40a2b580740 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -823,6 +823,10 @@ (define_attr "i387_cw" "roundeven,floor,ceil,trunc,uninitialized,any"
 (define_attr "avx_partial_xmm_update" "false,true"
   (const_string "false"))
 
+;; Define attribute to indicate insns with a false dependency on the dest register
+(define_attr "dest_false_dep" "false,true"
+ (const_string "false"))
+
 ;; Define attribute to classify add/sub insns that consumes carry flag (CF)
 (define_attr "use_carry" "0,1" (const_string "0"))
 
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 0864748875e..c8dace5b2f8 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -6536,9 +6536,20 @@ (define_insn "<avx512>_<complexopname>_<mode><maskc_name><round_name>"
 	     (match_operand:VF_AVX512FP16VL 2 "nonimmediate_operand" "<round_constraint>")]
 	     UNSPEC_COMPLEX_F_C_MUL))]
   "TARGET_AVX512FP16 && <round_mode512bit_condition>"
-  "v<complexopname><ssemodesuffix>\t{<round_maskc_op3>%2, %1, %0<maskc_operand3>|%0<maskc_operand3>, %1, %2<round_maskc_op3>}"
+{
+  if (TARGET_DEST_FALSE_DEPENDENCY
+      && get_attr_dest_false_dep (insn) ==
+	 DEST_FALSE_DEP_TRUE)
+    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+  return "v<complexopname><ssemodesuffix>\t{<round_maskc_op3>%2, %1, %0<maskc_operand3>|%0<maskc_operand3>, %1, %2<round_maskc_op3>}";
+}
   [(set_attr "type" "ssemul")
-   (set_attr "mode" "<MODE>")])
+   (set_attr "mode" "<MODE>")
+   (set (attr "dest_false_dep")
+	(if_then_else
+	  (match_test "<maskc_dest_false_dep_attr>")
+	  (const_string "false")
+	  (const_string "true")))])
 
 (define_expand "avx512fp16_fmaddcsh_v8hf_maskz<round_expand_name>"
   [(match_operand:V8HF 0 "register_operand")
@@ -6742,9 +6753,20 @@ (define_insn "avx512fp16_<complexopname>sh_v8hf<mask_scalarc_name><round_scalarc
 	    (match_dup 1)
 	    (const_int 3)))]
   "TARGET_AVX512FP16"
-  "v<complexopname>sh\t{<round_scalarc_mask_op3>%2, %1, %0<mask_scalarc_operand3>|%0<mask_scalarc_operand3>, %1, %2<round_scalarc_mask_op3>}"
+{
+  if (TARGET_DEST_FALSE_DEPENDENCY
+      && get_attr_dest_false_dep (insn) ==
+	 DEST_FALSE_DEP_TRUE)
+    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+  return "v<complexopname>sh\t{<round_scalarc_mask_op3>%2, %1, %0<mask_scalarc_operand3>|%0<mask_scalarc_operand3>, %1, %2<round_scalarc_mask_op3>}";
+}
   [(set_attr "type" "ssemul")
-   (set_attr "mode" "V8HF")])
+   (set_attr "mode" "V8HF")
+   (set (attr "dest_false_dep")
+	(if_then_else
+	  (match_test "<mask_scalarc_dest_false_dep_attr>")
+	  (const_string "false")
+	  (const_string "true")))])
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
@@ -15207,10 +15229,21 @@ (define_insn "avx512dq_mul<mode>3<mask_name>"
 	  (match_operand:VI8_AVX512VL 2 "bcst_vector_operand" "vmBr")))]
   "TARGET_AVX512DQ && <mask_mode512bit_condition>
   && ix86_binary_operator_ok (MULT, <MODE>mode, operands)"
-  "vpmullq\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
+{
+  if (TARGET_DEST_FALSE_DEPENDENCY
+      && get_attr_dest_false_dep (insn) ==
+	 DEST_FALSE_DEP_TRUE)
+    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+  return "vpmullq\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}";
+}
   [(set_attr "type" "sseimul")
    (set_attr "prefix" "evex")
-   (set_attr "mode" "<sseinsnmode>")])
+   (set_attr "mode" "<sseinsnmode>")
+   (set (attr "dest_false_dep")
+	(if_then_else
+	  (match_test "<mask3_dest_false_dep_attr>")
+	  (const_string "false")
+	  (const_string "true")))])
 
 (define_expand "cond_mul<mode>"
   [(set (match_operand:VI4_AVX512VL 0 "register_operand")
@@ -24636,10 +24669,21 @@ (define_insn "<avx2_avx512>_permvar<mode><mask_name>"
 	   (match_operand:<sseintvecmode> 2 "register_operand" "v")]
 	  UNSPEC_VPERMVAR))]
   "TARGET_AVX2 && <mask_mode512bit_condition>"
-  "vperm<ssemodesuffix>\t{%1, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1}"
+{
+  if (TARGET_DEST_FALSE_DEPENDENCY
+      && get_attr_dest_false_dep (insn) ==
+	 DEST_FALSE_DEP_TRUE)
+    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+  return "vperm<ssemodesuffix>\t{%1, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1}";
+}
   [(set_attr "type" "sselog")
    (set_attr "prefix" "<mask_prefix2>")
-   (set_attr "mode" "<sseinsnmode>")])
+   (set_attr "mode" "<sseinsnmode>")
+   (set (attr "dest_false_dep")
+	(if_then_else
+	  (match_test "<mask3_dest_false_dep_attr>")
+	  (const_string "false")
+	  (const_string "true")))])
 
 (define_insn "<avx512>_permvar<mode><mask_name>"
   [(set (match_operand:VI1_AVX512VL 0 "register_operand" "=v")
@@ -24873,11 +24917,20 @@ (define_insn "avx2_perm<mode>_1<mask_name>"
   mask |= INTVAL (operands[4]) << 4;
   mask |= INTVAL (operands[5]) << 6;
   operands[2] = GEN_INT (mask);
+  if (TARGET_DEST_FALSE_DEPENDENCY
+      && get_attr_dest_false_dep (insn) ==
+	 DEST_FALSE_DEP_TRUE)
+    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
   return "vperm<ssemodesuffix>\t{%2, %1, %0<mask_operand6>|%0<mask_operand6>, %1, %2}";
 }
   [(set_attr "type" "sselog")
    (set_attr "prefix" "<mask_prefix2>")
-   (set_attr "mode" "<sseinsnmode>")])
+   (set_attr "mode" "<sseinsnmode>")
+   (set (attr "dest_false_dep")
+	(if_then_else
+	  (match_test "<mask6_dest_false_dep_attr>")
+	  (const_string "false")
+	  (const_string "true")))])
 
 (define_expand "avx512f_perm<mode>"
   [(match_operand:V8FI 0 "register_operand")
@@ -24944,11 +24997,20 @@ (define_insn "avx512f_perm<mode>_1<mask_name>"
   mask |= INTVAL (operands[4]) << 4;
   mask |= INTVAL (operands[5]) << 6;
   operands[2] = GEN_INT (mask);
+  if (TARGET_DEST_FALSE_DEPENDENCY
+      && get_attr_dest_false_dep (insn) ==
+	 DEST_FALSE_DEP_TRUE)
+    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
   return "vperm<ssemodesuffix>\t{%2, %1, %0<mask_operand10>|%0<mask_operand10>, %1, %2}";
 }
   [(set_attr "type" "sselog")
    (set_attr "prefix" "<mask_prefix2>")
-   (set_attr "mode" "<sseinsnmode>")])
+   (set_attr "mode" "<sseinsnmode>")
+   (set (attr "dest_false_dep")
+	(if_then_else
+	  (match_test "<mask10_dest_false_dep_attr>")
+	  (const_string "false")
+	  (const_string "true")))])
 
 (define_insn "avx2_permv2ti"
   [(set (match_operand:V4DI 0 "register_operand" "=x")
@@ -26843,10 +26905,21 @@ (define_insn "avx512dq_rangep<mode><mask_name><round_saeonly_name>"
 	   (match_operand:SI 3 "const_0_to_15_operand")]
 	  UNSPEC_RANGE))]
   "TARGET_AVX512DQ && <round_saeonly_mode512bit_condition>"
-  "vrange<ssemodesuffix>\t{%3, <round_saeonly_mask_op4>%2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2<round_saeonly_mask_op4>, %3}"
+{
+  if (TARGET_DEST_FALSE_DEPENDENCY
+      && get_attr_dest_false_dep (insn) ==
+	 DEST_FALSE_DEP_TRUE)
+    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+  return "vrange<ssemodesuffix>\t{%3, <round_saeonly_mask_op4>%2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2<round_saeonly_mask_op4>, %3}";
+}
   [(set_attr "type" "sse")
    (set_attr "prefix" "evex")
-   (set_attr "mode" "<MODE>")])
+   (set_attr "mode" "<MODE>")
+   (set (attr "dest_false_dep")
+	(if_then_else
+	  (match_test "<mask4_dest_false_dep_attr>")
+	  (const_string "false")
+	  (const_string "true")))])
 
 (define_insn "avx512dq_ranges<mode><mask_scalar_name><round_saeonly_scalar_name>"
   [(set (match_operand:VF_128 0 "register_operand" "=v")
@@ -26859,10 +26932,21 @@ (define_insn "avx512dq_ranges<mode><mask_scalar_name><round_saeonly_scalar_name>
 	  (match_dup 1)
 	  (const_int 1)))]
   "TARGET_AVX512DQ"
-  "vrange<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}"
+{
+  if (TARGET_DEST_FALSE_DEPENDENCY
+      && get_attr_dest_false_dep (insn) ==
+	 DEST_FALSE_DEP_TRUE)
+    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+  return "vrange<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}";
+}
   [(set_attr "type" "sse")
    (set_attr "prefix" "evex")
-   (set_attr "mode" "<MODE>")])
+   (set_attr "mode" "<MODE>")
+   (set (attr "dest_false_dep")
+	(if_then_else
+	  (match_test "<mask_scalar4_dest_false_dep_attr>")
+	  (const_string "false")
+	  (const_string "true")))])
 
 (define_insn "avx512dq_fpclass<mode><mask_scalar_merge_name>"
   [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
@@ -26899,9 +26983,20 @@ (define_insn "<avx512>_getmant<mode><mask_name><round_saeonly_name>"
 	   (match_operand:SI 2 "const_0_to_15_operand")]
 	  UNSPEC_GETMANT))]
   "TARGET_AVX512F"
-  "vgetmant<ssemodesuffix>\t{%2, <round_saeonly_mask_op3>%1, %0<mask_operand3>|%0<mask_operand3>, %1<round_saeonly_mask_op3>, %2}";
+{
+  if (TARGET_DEST_FALSE_DEPENDENCY
+      && get_attr_dest_false_dep (insn) ==
+	 DEST_FALSE_DEP_TRUE)
+    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+  return "vgetmant<ssemodesuffix>\t{%2, <round_saeonly_mask_op3>%1, %0<mask_operand3>|%0<mask_operand3>, %1<round_saeonly_mask_op3>, %2}";
+}
   [(set_attr "prefix" "evex")
-   (set_attr "mode" "<MODE>")])
+   (set_attr "mode" "<MODE>")
+   (set (attr "dest_false_dep")
+	(if_then_else
+	  (match_test "!MEM_P (operands[1]) || <mask3_dest_false_dep_attr>")
+	  (const_string "false")
+	  (const_string "true")))])
 
 (define_insn "avx512f_vgetmant<mode><mask_scalar_name><round_saeonly_scalar_name>"
   [(set (match_operand:VFH_128 0 "register_operand" "=v")
@@ -26914,9 +27009,20 @@ (define_insn "avx512f_vgetmant<mode><mask_scalar_name><round_saeonly_scalar_name
 	  (match_dup 1)
 	  (const_int 1)))]
    "TARGET_AVX512F"
-   "vgetmant<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}";
+{
+  if (TARGET_DEST_FALSE_DEPENDENCY
+      && get_attr_dest_false_dep (insn) ==
+	 DEST_FALSE_DEP_TRUE)
+    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+  return "vgetmant<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}";
+}
    [(set_attr "prefix" "evex")
-   (set_attr "mode" "<ssescalarmode>")])
+   (set_attr "mode" "<ssescalarmode>")
+   (set (attr "dest_false_dep")
+	(if_then_else
+	  (match_test "<mask_scalar4_dest_false_dep_attr>")
+	  (const_string "false")
+	  (const_string "true")))])
 
 ;; The correct representation for this is absolutely enormous, and
 ;; surely not generally useful.
diff --git a/gcc/config/i386/subst.md b/gcc/config/i386/subst.md
index 21d445cc46c..802a8715b01 100644
--- a/gcc/config/i386/subst.md
+++ b/gcc/config/i386/subst.md
@@ -71,6 +71,11 @@ (define_subst_attr "bcst_mask_prefix3" "mask" "orig,maybe_evex" "evex,evex")
 (define_subst_attr "mask_prefix4" "mask" "orig,orig,vex" "evex,evex,evex")
 (define_subst_attr "bcst_mask_prefix4" "mask" "orig,orig,maybe_evex" "evex,evex,evex")
 (define_subst_attr "mask_expand_op3" "mask" "3" "5")
+(define_subst_attr "mask3_dest_false_dep_attr" "mask" "0" "operands[3] != CONST0_RTX(<MODE>mode)")
+(define_subst_attr "mask4_dest_false_dep_attr" "mask" "0" "operands[4] != CONST0_RTX(<MODE>mode)")
+(define_subst_attr "mask6_dest_false_dep_attr" "mask" "0" "operands[6] != CONST0_RTX(<MODE>mode)")
+(define_subst_attr "mask10_dest_false_dep_attr" "mask" "0" "operands[10] != CONST0_RTX(<MODE>mode)")
+(define_subst_attr "maskc_dest_false_dep_attr" "maskc" "0" "operands[3] != CONST0_RTX(<MODE>mode)")
 
 (define_subst "mask"
   [(set (match_operand:SUBST_V 0)
@@ -337,6 +342,8 @@ (define_subst_attr "mask_scalarc_operand3" "mask_scalarc" "" "%{%4%}%N3")
 (define_subst_attr "mask_scalar_operand3" "mask_scalar" "" "%{%4%}%N3")
 (define_subst_attr "mask_scalar_operand4" "mask_scalar" "" "%{%5%}%N4")
 (define_subst_attr "mask_scalarcz_operand4" "mask_scalarcz" "" "%{%5%}%N4")
+(define_subst_attr "mask_scalar4_dest_false_dep_attr" "mask_scalar" "0" "operands[4] != CONST0_RTX(<MODE>mode)")
+(define_subst_attr "mask_scalarc_dest_false_dep_attr" "mask_scalarc" "0" "operands[3] != CONST0_RTX(V8HFmode)")
 
 (define_subst "mask_scalar"
   [(set (match_operand:SUBST_V 0)
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 0d3fd078068..1b42c96fc38 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -79,6 +79,11 @@ DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY,
 	  m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
 	  | m_BDVER | m_ZNVER | m_ALDERLAKE | m_GENERIC)
 
+/* X86_TUNE_DEST_FALSE_DEPENDENCY: This knob inserts zero-idiom before
+   several insns to break false dependency on the dest register.  */
+DEF_TUNE (X86_TUNE_DEST_FALSE_DEPENDENCY,
+	  "dest_false_dependency", m_SAPPHIRERAPIDS | m_ALDERLAKE)
+
 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
    are resolved on SSE register parts instead of whole registers, so we may
    maintain just lower part of scalar values in proper format leaving the
diff --git a/gcc/testsuite/gcc.target/i386/avx2-dest-false-dependency.c b/gcc/testsuite/gcc.target/i386/avx2-dest-false-dependency.c
new file mode 100644
index 00000000000..e138920ce18
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-dest-false-dependency.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -mtune-ctrl=dest_false_dependency -O2" } */
+
+
+#include <immintrin.h>
+
+extern __m256i i1, i2, i3, i4;
+extern __m256d d1, d2;
+extern __m256 f1, f2;
+
+void vperm_test (void)
+{
+  i3 = _mm256_permutevar8x32_epi32 (i1, i2);
+  i4 = _mm256_permute4x64_epi64 (i1, 12);
+  d2 = _mm256_permute4x64_pd (d1, 12);
+  f2 = _mm256_permutevar8x32_ps (f1, i2);
+}
+
+/* { dg-final { scan-assembler-times "vxorps" 4 } } */
+/* { dg-final { scan-assembler-times "vpermd" 1 } } */
+/* { dg-final { scan-assembler-times "vpermq" 1 } } */
+/* { dg-final { scan-assembler-times "vpermpd" 1 } } */
+/* { dg-final { scan-assembler-times "vpermps" 1 } } */
+
diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-dest-false-dependency.c b/gcc/testsuite/gcc.target/i386/avx512dq-dest-false-dependency.c
new file mode 100644
index 00000000000..2feb58f2cd8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512dq-dest-false-dependency.c
@@ -0,0 +1,73 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512dq -mavx512vl -mtune-ctrl=dest_false_dependency -O2" } */
+
+#include <immintrin.h>
+
+extern __m512i i1;
+extern __m256i i2;
+extern __m128i i3;
+extern __m512d d1;
+extern __m256d d2;
+extern __m128d d3;
+extern __m512 f1;
+extern __m256 f2;
+extern __m128 f3;
+
+__mmask32 m32;
+__mmask16 m16;
+__mmask8 m8;
+
+void mullo_test (void)
+{
+  i1 = _mm512_mullo_epi64 (i1, i1);
+  i1 = _mm512_mask_mullo_epi64 (i1, m8, i1, i1);
+  i1 = _mm512_maskz_mullo_epi64 (m8, i1, i1);
+  i2 = _mm256_mullo_epi64 (i2, i2);
+  i2 = _mm256_mask_mullo_epi64 (i2, m8, i2, i2);
+  i2 = _mm256_maskz_mullo_epi64 (m8, i2, i2);
+  i3 = _mm_mullo_epi64 (i3, i3);
+  i3 = _mm_mask_mullo_epi64 (i3, m8, i3, i3);
+  i3 = _mm_maskz_mullo_epi64 (m8, i3, i3);
+}
+
+void range_test (void)
+{
+  d1 = _mm512_range_pd (d1, d1, 15);
+  d1 = _mm512_range_round_pd (d1, d1, 15, 8);
+  d1 = _mm512_mask_range_pd (d1, m8, d1, d1, 15);
+  d1 = _mm512_mask_range_round_pd (d1, m8, d1, d1, 15, 8);
+  d1 = _mm512_maskz_range_pd (m8, d1, d1, 15);
+  d1 = _mm512_maskz_range_round_pd (m8, d1, d1, 15, 8);
+  d2 = _mm256_range_pd (d2, d2, 15);
+  d2 = _mm256_mask_range_pd (d2, m8, d2, d2, 15);
+  d2 = _mm256_maskz_range_pd (m8, d2, d2, 15);
+  d3 = _mm_range_pd (d3, d3, 15);
+  d3 = _mm_mask_range_pd (d3, m8, d3, d3, 15);
+  d3 = _mm_maskz_range_pd (m8, d3, d3, 15);
+  d3 = _mm_range_sd (d3, d3, 15);
+  d3 = _mm_mask_range_sd (d3, m8, d3, d3, 15);
+  d3 = _mm_maskz_range_sd (m8, d3, d3, 15);
+
+  f1 = _mm512_range_ps (f1, f1, 15);
+  f1 = _mm512_range_round_ps (f1, f1, 15, 8);
+  f1 = _mm512_mask_range_ps (f1, m16, f1, f1, 15);
+  f1 = _mm512_mask_range_round_ps (f1, m16, f1, f1, 15, 8);
+  f1 = _mm512_maskz_range_ps (m16, f1, f1, 15);
+  f1 = _mm512_maskz_range_round_ps (m16, f1, f1, 15, 8);
+  f2 = _mm256_range_ps (f2, f2, 15);
+  f2 = _mm256_mask_range_ps (f2, m8, f2, f2, 15);
+  f2 = _mm256_maskz_range_ps (m8, f2, f2, 15);
+  f3 = _mm_range_ps (f3, f3, 15);
+  f3 = _mm_mask_range_ps (f3, m8, f3, f3, 15);
+  f3 = _mm_maskz_range_ps (m8, f3, f3, 15);
+  f3 = _mm_range_ss (f3, f3, 15);
+  f3 = _mm_mask_range_ss (f3, m8, f3, f3, 15);
+  f3 = _mm_maskz_range_ss (m8, f3, f3, 15);
+}
+
+/* { dg-final { scan-assembler-times "vxorps" 26 } } */
+/* { dg-final { scan-assembler-times "vpmullq" 9 } } */
+/* { dg-final { scan-assembler-times "vrangepd" 12 } } */
+/* { dg-final { scan-assembler-times "vrangesd" 3 } } */
+/* { dg-final { scan-assembler-times "vrangeps" 12 } } */
+/* { dg-final { scan-assembler-times "vrangess" 3 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-dest-false-dependency.c b/gcc/testsuite/gcc.target/i386/avx512f-dest-false-dependency.c
new file mode 100644
index 00000000000..9650839970e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-dest-false-dependency.c
@@ -0,0 +1,102 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -mtune-ctrl=dest_false_dependency -O2" } */
+
+#include <immintrin.h>
+
+extern __m512i i1;
+extern __m512d d1, *pd1;
+extern __m128d d2;
+extern __m512 f1, *pf1;
+extern __m128 f2;
+
+__mmask16 m16;
+__mmask8 m8;
+
+void vperm_test (void)
+{
+  d1 = _mm512_permutex_pd (d1, 12);
+  d1 = _mm512_mask_permutex_pd (d1, m8, d1, 12);
+  d1 = _mm512_maskz_permutex_pd (m8, d1, 12);
+  d1 = _mm512_permutexvar_pd (i1, d1);
+  d1 = _mm512_mask_permutexvar_pd (d1, m8, i1, d1);
+  d1 = _mm512_maskz_permutexvar_pd (m8, i1, d1);
+
+  f1 = _mm512_permutexvar_ps (i1, f1);
+  f1 = _mm512_mask_permutexvar_ps (f1, m16, i1, f1);
+  f1 = _mm512_maskz_permutexvar_ps (m16, i1, f1);
+
+  i1 = _mm512_permutexvar_epi64 (i1, i1);
+  i1 = _mm512_mask_permutexvar_epi64 (i1, m8, i1, i1);
+  i1 = _mm512_maskz_permutexvar_epi64 (m8, i1, i1);
+  i1 = _mm512_permutex_epi64 (i1, 12);
+  i1 = _mm512_mask_permutex_epi64 (i1, m8, i1, 12);
+  i1 = _mm512_maskz_permutex_epi64 (m8, i1, 12);
+
+  i1 = _mm512_permutexvar_epi32 (i1, i1);
+  i1 = _mm512_mask_permutexvar_epi32 (i1, m16, i1, i1);
+  i1 = _mm512_maskz_permutexvar_epi32 (m16, i1, i1);
+} 
+
+void getmant_test (void)
+{
+  d1 = _mm512_getmant_pd (*pd1, _MM_MANT_NORM_p75_1p5,
+			  _MM_MANT_SIGN_src);
+  d1 = _mm512_getmant_round_pd (*pd1, _MM_MANT_NORM_p75_1p5,
+				_MM_MANT_SIGN_src, 8);
+  d1 = _mm512_mask_getmant_pd (d1, m8, *pd1, _MM_MANT_NORM_p75_1p5,
+			       _MM_MANT_SIGN_src);
+  d1 = _mm512_mask_getmant_round_pd (d1, m8, *pd1, _MM_MANT_NORM_p75_1p5,
+				     _MM_MANT_SIGN_src, 8);
+  d1 = _mm512_maskz_getmant_pd (m8, *pd1, _MM_MANT_NORM_p75_1p5,
+				_MM_MANT_SIGN_src);
+  d1 = _mm512_maskz_getmant_round_pd (m8, *pd1, _MM_MANT_NORM_p75_1p5,
+				      _MM_MANT_SIGN_src, 8);
+  f1 = _mm512_getmant_ps (*pf1, _MM_MANT_NORM_p75_1p5,
+			  _MM_MANT_SIGN_src);
+  f1 = _mm512_getmant_round_ps (*pf1, _MM_MANT_NORM_p75_1p5,
+				_MM_MANT_SIGN_src, 8);
+  f1 = _mm512_mask_getmant_ps (f1, m16, *pf1, _MM_MANT_NORM_p75_1p5,
+			       _MM_MANT_SIGN_src);
+  f1 = _mm512_mask_getmant_round_ps (f1, m16, *pf1, _MM_MANT_NORM_p75_1p5,
+				     _MM_MANT_SIGN_src, 8);
+  f1 = _mm512_maskz_getmant_ps (m16, *pf1, _MM_MANT_NORM_p75_1p5,
+				_MM_MANT_SIGN_src);
+  f1 = _mm512_maskz_getmant_round_ps (m16, *pf1, _MM_MANT_NORM_p75_1p5,
+				      _MM_MANT_SIGN_src, 8);
+
+  d2 = _mm_getmant_sd (d2, d2, _MM_MANT_NORM_p75_1p5,
+		       _MM_MANT_SIGN_src);
+  d2 = _mm_getmant_round_sd (d2, d2, _MM_MANT_NORM_p75_1p5,
+			     _MM_MANT_SIGN_src, 8);
+  d2 = _mm_mask_getmant_sd (d2, m8, d2, d2, _MM_MANT_NORM_p75_1p5,
+			    _MM_MANT_SIGN_src);
+  d2 = _mm_mask_getmant_round_sd (d2, m8, d2, d2, _MM_MANT_NORM_p75_1p5,
+				  _MM_MANT_SIGN_src, 8);
+  d2 = _mm_maskz_getmant_sd (m8, d2, d2, _MM_MANT_NORM_p75_1p5,
+			     _MM_MANT_SIGN_src);
+  d2 = _mm_maskz_getmant_round_sd (m8, d2, d2, _MM_MANT_NORM_p75_1p5,
+				   _MM_MANT_SIGN_src, 8);
+  f2 = _mm_getmant_ss (f2, f2, _MM_MANT_NORM_p75_1p5,
+		       _MM_MANT_SIGN_src);
+  f2 = _mm_getmant_round_ss (f2, f2, _MM_MANT_NORM_p75_1p5,
+			     _MM_MANT_SIGN_src, 8);
+  f2 = _mm_mask_getmant_ss (f2, m8, f2, f2, _MM_MANT_NORM_p75_1p5,
+			    _MM_MANT_SIGN_src);
+  f2 = _mm_mask_getmant_round_ss (f2, m8, f2, f2, _MM_MANT_NORM_p75_1p5,
+				  _MM_MANT_SIGN_src, 8);
+  f2 = _mm_maskz_getmant_ss (m8, f2, f2, _MM_MANT_NORM_p75_1p5,
+			     _MM_MANT_SIGN_src);
+  f2 = _mm_maskz_getmant_round_ss (m8, f2, f2, _MM_MANT_NORM_p75_1p5,
+				   _MM_MANT_SIGN_src, 8);
+
+}
+
+/* { dg-final { scan-assembler-times "vxorps" 24 } } */
+/* { dg-final { scan-assembler-times "vpermd" 3 } } */
+/* { dg-final { scan-assembler-times "vpermq" 6 } } */
+/* { dg-final { scan-assembler-times "vpermps" 3 } } */
+/* { dg-final { scan-assembler-times "vpermpd" 6 } } */
+/* { dg-final { scan-assembler-times "vgetmantpd" 6 } } */
+/* { dg-final { scan-assembler-times "vgetmantps" 6 } } */
+/* { dg-final { scan-assembler-times "vgetmantsd" 6 } } */
+/* { dg-final { scan-assembler-times "vgetmantss" 6 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-dest-false-dependency.c b/gcc/testsuite/gcc.target/i386/avx512fp16-dest-false-dependency.c
new file mode 100644
index 00000000000..793bb66201b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-dest-false-dependency.c
@@ -0,0 +1,45 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -mtune-ctrl=dest_false_dependency -O2" } */
+
+#include <immintrin.h>
+
+extern __m512h h1;
+extern __m256h h2;
+extern __m128h h3;
+
+__mmask32 m32;
+__mmask16 m16;
+__mmask8 m8;
+
+void complex_mul_test (void)
+{
+  h1 = _mm512_fmul_pch (h1, h1);
+  h1 = _mm512_fmul_round_pch (h1, h1, 8);
+  h1 = _mm512_mask_fmul_pch (h1, m32, h1, h1);
+  h1 = _mm512_mask_fmul_round_pch (h1, m32, h1, h1, 8);
+  h1 = _mm512_maskz_fmul_pch (m32, h1, h1);
+  h1 = _mm512_maskz_fmul_round_pch (m32, h1, h1, 11);
+
+  h3 = _mm_fmul_sch (h3, h3);
+  h3 = _mm_fmul_round_sch (h3, h3, 8);
+  h3 = _mm_mask_fmul_sch (h3, m8, h3, h3);
+  h3 = _mm_mask_fmul_round_sch (h3, m8, h3, h3, 8);
+  h3 = _mm_maskz_fmul_sch (m8, h3, h3);
+  h3 = _mm_maskz_fmul_round_sch (m8, h3, h3, 11);
+}
+
+void vgetmant_test (void)
+{
+  h3 = _mm_getmant_sh (h3, h3, _MM_MANT_NORM_p75_1p5,
+		       _MM_MANT_SIGN_src);
+  h3 = _mm_mask_getmant_sh (h3, m8, h3, h3, _MM_MANT_NORM_p75_1p5,
+			    _MM_MANT_SIGN_src);
+  h3 = _mm_maskz_getmant_sh (m8, h3, h3, _MM_MANT_NORM_p75_1p5,
+			     _MM_MANT_SIGN_src);
+}    
+
+/* { dg-final { scan-assembler-times "vxorps" 10 } } */
+/* { dg-final { scan-assembler-times "vfmulcph" 6 } } */
+/* { dg-final { scan-assembler-times "vfmulcsh" 6 } } */
+/* { dg-final { scan-assembler-times "vgetmantsh" 3 } } */
+
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16vl-dest-false-dependency.c b/gcc/testsuite/gcc.target/i386/avx512fp16vl-dest-false-dependency.c
new file mode 100644
index 00000000000..09658905d2d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16vl-dest-false-dependency.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -mtune-ctrl=dest_false_dependency -O2" } */
+
+#include <immintrin.h>
+
+extern __m256h h1;
+extern __m128h h2;
+
+__mmask16 m16;
+__mmask8 m8;
+
+void complex_mul_test (void)
+{
+  h1 = _mm256_fmul_pch (h1, h1);
+  h1 = _mm256_mask_fmul_pch (h1, m16, h1, h1);
+  h1 = _mm256_maskz_fmul_pch (m16, h1, h1);
+  h2 = _mm_fmul_pch (h2, h2);
+  h2 = _mm_mask_fmul_pch (h2, m16, h2, h2);
+  h2 = _mm_maskz_fmul_pch (m16, h2, h2);
+}
+
+/* { dg-final { scan-assembler-times "vxorps" 4 } } */
+/* { dg-final { scan-assembler-times "vfmulcph" 6 } } */
+
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-dest-false-dependency.c b/gcc/testsuite/gcc.target/i386/avx512vl-dest-false-dependency.c
new file mode 100644
index 00000000000..92717a99837
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-dest-false-dependency.c
@@ -0,0 +1,76 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -mavx512vl -mtune-ctrl=dest_false_dependency -O2" } */
+
+
+#include <immintrin.h>
+
+extern __m256i i1;
+extern __m256d d1, *pd1;
+extern __m128d d2, *pd2;
+extern __m256 f1, *pf1;
+extern __m128 f2, *pf2;
+
+__mmask16 m16;
+__mmask8 m8;
+
+void vperm_test (void)
+{
+  d1 = _mm256_permutex_pd (d1, 12);
+  d1 = _mm256_mask_permutex_pd (d1, m8, d1, 12);
+  d1 = _mm256_maskz_permutex_pd (m8, d1, 12);
+  d1 = _mm256_permutexvar_pd (i1, d1);
+  d1 = _mm256_mask_permutexvar_pd (d1, m8, i1, d1);
+  d1 = _mm256_maskz_permutexvar_pd (m8, i1, d1);
+
+  f1 = _mm256_permutexvar_ps (i1, f1);
+  f1 = _mm256_mask_permutexvar_ps (f1, m8, i1, f1);
+  f1 = _mm256_maskz_permutexvar_ps (m8, i1, f1);
+
+  i1 = _mm256_permutexvar_epi64 (i1, i1);
+  i1 = _mm256_mask_permutexvar_epi64 (i1, m8, i1, i1);
+  i1 = _mm256_maskz_permutexvar_epi64 (m8, i1, i1);
+  i1 = _mm256_permutex_epi64 (i1, 12);
+  i1 = _mm256_mask_permutex_epi64 (i1, m8, i1, 12);
+  i1 = _mm256_maskz_permutex_epi64 (m8, i1, 12);
+
+  i1 = _mm256_permutexvar_epi32 (i1, i1);
+  i1 = _mm256_mask_permutexvar_epi32 (i1, m8, i1, i1);
+  i1 = _mm256_maskz_permutexvar_epi32 (m8, i1, i1);
+} 
+
+void getmant_test (void)
+{
+  d1 = _mm256_getmant_pd (*pd1, _MM_MANT_NORM_p75_1p5,
+			  _MM_MANT_SIGN_src);
+  d1 = _mm256_mask_getmant_pd (d1, m8, *pd1, _MM_MANT_NORM_p75_1p5,
+			       _MM_MANT_SIGN_src);
+  d1 = _mm256_maskz_getmant_pd (m8, *pd1, _MM_MANT_NORM_p75_1p5,
+				_MM_MANT_SIGN_src);
+  d2 = _mm_getmant_pd (*pd2, _MM_MANT_NORM_p75_1p5,
+		       _MM_MANT_SIGN_src);
+  d2 = _mm_mask_getmant_pd (d2, m8, *pd2, _MM_MANT_NORM_p75_1p5,
+			    _MM_MANT_SIGN_src);
+  d2 = _mm_maskz_getmant_pd (m8, *pd2, _MM_MANT_NORM_p75_1p5,
+			     _MM_MANT_SIGN_src);
+  f1 = _mm256_getmant_ps (*pf1, _MM_MANT_NORM_p75_1p5,
+			  _MM_MANT_SIGN_src);
+  f1 = _mm256_mask_getmant_ps (f1, m8, *pf1, _MM_MANT_NORM_p75_1p5,
+			       _MM_MANT_SIGN_src);
+  f1 = _mm256_maskz_getmant_ps (m8, *pf1, _MM_MANT_NORM_p75_1p5,
+				_MM_MANT_SIGN_src);
+  f2 = _mm_getmant_ps (*pf2, _MM_MANT_NORM_p75_1p5,
+		       _MM_MANT_SIGN_src);
+  f2 = _mm_mask_getmant_ps (f2, m8, *pf2, _MM_MANT_NORM_p75_1p5,
+			    _MM_MANT_SIGN_src);
+  f2 = _mm_maskz_getmant_ps (m8, *pf2, _MM_MANT_NORM_p75_1p5,
+			     _MM_MANT_SIGN_src);
+}
+
+/* { dg-final { scan-assembler-times "vxorps" 20 } } */
+/* { dg-final { scan-assembler-times "vpermpd" 6 } } */
+/* { dg-final { scan-assembler-times "vpermps" 3 } } */
+/* { dg-final { scan-assembler-times "vpermq" 6 } } */
+/* { dg-final { scan-assembler-times "vpermd" 3 } } */
+/* { dg-final { scan-assembler-times "vgetmantpd" 6 } } */
+/* { dg-final { scan-assembler-times "vgetmantps" 6 } } */
+
-- 
2.18.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] [i386] GLC tuning: Break false dependency for dest register.
  2022-01-13  7:28 [PATCH] [i386] GLC tuning: Break false dependency for dest register Hongyu Wang
@ 2022-01-13  7:41 ` Uros Bizjak
  2022-01-14  5:38   ` Hongyu Wang
  0 siblings, 1 reply; 16+ messages in thread
From: Uros Bizjak @ 2022-01-13  7:41 UTC (permalink / raw)
  To: Hongyu Wang; +Cc: Hongtao Liu, gcc-patches

On Thu, Jan 13, 2022 at 8:28 AM Hongyu Wang <hongyu.wang@intel.com> wrote:
>
> From: wwwhhhyyy <hongyu.wang@intel.com>
>
> Hi,
>
> For GoldenCove micro-architecture, force insert zero-idiom in asm
> template to break false dependency of dest register for several insns.
>
> The related insns are:
>
> VPERM/D/Q/PS/PD
> VRANGEPD/PS/SD/SS
> VGETMANTSS/SD/SH
> VGETMANTPS/PD - mem version only
> VPMULLQ
> VFMULCSH/PH
> VFCMULCSH/PH
>
> Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,}
>
> Ok for master?

No, the approach is wrong. You have to solve output clearing on RTL
level, please look at how e.g. tzcnt false dep is solved:

  [(set (reg:CCC FLAGS_REG)
    (compare:CCC (match_operand:SWI48 1 "nonimmediate_operand" "rm")
             (const_int 0)))
   (set (match_operand:SWI48 0 "register_operand" "=r")
    (ctz:SWI48 (match_dup 1)))]
  "TARGET_BMI"
  "tzcnt{<imodesuffix>}\t{%1, %0|%0, %1}";
  "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
   && optimize_function_for_speed_p (cfun)
   && !reg_mentioned_p (operands[0], operands[1])"
  [(parallel
    [(set (reg:CCC FLAGS_REG)
      (compare:CCC (match_dup 1) (const_int 0)))
     (set (match_dup 0)
      (ctz:SWI48 (match_dup 1)))
     (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
  "ix86_expand_clear (operands[0]);"
  [(set_attr "type" "alu1")
   (set_attr "prefix_0f" "1")
   (set_attr "prefix_rep" "1")
   (set_attr "btver2_decode" "double")
   (set_attr "mode" "<MODE>")])

For TARGET_AVOID_FALSE_DEP_FOR_BMI, we split once epilogue_completed is set,
when insn registers are stable, and use ix86_expand_clear to clear the output
operand. Please also note how the final insn is tagged with
UNSPEC_INSN_FALSE_DEP to prevent combine from recognizing it too early.

Uros.

>
> gcc/ChangeLog:
>
>         * config/i386/i386.h (TARGET_DEST_FALSE_DEPENDENCY): New macro.
>         * config/i386/i386.md (dest_false_dep): New define_attr.
>         * config/i386/sse.md (<avx512>_<complexopname>_<mode><maskc_name><round_name>):
>         Insert zero-idiom in output template when attr enabled, set new attribute to
>         true for non-mask/maskz insn.
>         (avx512fp16_<complexopname>sh_v8hf<mask_scalarc_name><round_scalarcz_name>):
>         Likewise.
>         (avx512dq_mul<mode>3<mask_name>): Likewise.
>         (<avx2_avx512>_permvar<mode><mask_name>): Likewise.
>         (avx2_perm<mode>_1<mask_name>): Likewise.
>         (avx512f_perm<mode>_1<mask_name>): Likewise.
>         (avx512dq_rangep<mode><mask_name><round_saeonly_name>): Likewise.
>         (avx512dq_ranges<mode><mask_scalar_name><round_saeonly_scalar_name>):
>         Likewise.
>         (<avx512>_getmant<mode><mask_name><round_saeonly_name>): Likewise.
>         (avx512f_vgetmant<mode><mask_scalar_name><round_saeonly_scalar_name>):
>         Likewise.
>         * config/i386/subst.md (mask3_dest_false_dep_attr): New subst_attr.
>         (mask4_dest_false_dep_attr): Likewise.
>         (mask6_dest_false_dep_attr): Likewise.
>         (mask10_dest_false_dep_attr): Likewise.
>         (maskc_dest_false_dep_attr): Likewise.
>         (mask_scalar4_dest_false_dep_attr): Likewise.
>         (mask_scalarc_dest_false_dep_attr): Likewise.
>         * config/i386/x86-tune.def (X86_TUNE_DEST_FALSE_DEPENDENCY): New
>         DEF_TUNE enabled for m_SAPPHIRERAPIDS and m_ALDERLAKE
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/avx2-dest-false-dependency.c: New test.
>         * gcc.target/i386/avx512dq-dest-false-dependency.c: Ditto.
>         * gcc.target/i386/avx512f-dest-false-dependency.c: Ditto.
>         * gcc.target/i386/avx512fp16-dest-false-dependency.c: Ditto.
>         * gcc.target/i386/avx512fp16vl-dest-false-dependency.c: Ditto.
>         * gcc.target/i386/avx512vl-dest-false-dependency.c: Ditto.
> ---
>  gcc/config/i386/i386.h                        |   2 +
>  gcc/config/i386/i386.md                       |   4 +
>  gcc/config/i386/sse.md                        | 142 +++++++++++++++---
>  gcc/config/i386/subst.md                      |   7 +
>  gcc/config/i386/x86-tune.def                  |   5 +
>  .../i386/avx2-dest-false-dependency.c         |  24 +++
>  .../i386/avx512dq-dest-false-dependency.c     |  73 +++++++++
>  .../i386/avx512f-dest-false-dependency.c      | 102 +++++++++++++
>  .../i386/avx512fp16-dest-false-dependency.c   |  45 ++++++
>  .../i386/avx512fp16vl-dest-false-dependency.c |  24 +++
>  .../i386/avx512vl-dest-false-dependency.c     |  76 ++++++++++
>  11 files changed, 486 insertions(+), 18 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx2-dest-false-dependency.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512dq-dest-false-dependency.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-dest-false-dependency.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-dest-false-dependency.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16vl-dest-false-dependency.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-dest-false-dependency.c
>
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index 3ac0f698ae2..ddbf6b9825a 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -429,6 +429,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
>         ix86_tune_features[X86_TUNE_EXPAND_ABS]
>  #define TARGET_V2DF_REDUCTION_PREFER_HADDPD \
>         ix86_tune_features[X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD]
> +#define TARGET_DEST_FALSE_DEPENDENCY \
> +       ix86_tune_features[X86_TUNE_DEST_FALSE_DEPENDENCY]
>
>  /* Feature tests against the various architecture variations.  */
>  enum ix86_arch_indices {
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index 9937643a273..40a2b580740 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -823,6 +823,10 @@ (define_attr "i387_cw" "roundeven,floor,ceil,trunc,uninitialized,any"
>  (define_attr "avx_partial_xmm_update" "false,true"
>    (const_string "false"))
>
> +;; Define attribute to indicate complex mult insn with false dependency
> +(define_attr "dest_false_dep" "false,true"
> + (const_string "false"))
> +
>  ;; Define attribute to classify add/sub insns that consumes carry flag (CF)
>  (define_attr "use_carry" "0,1" (const_string "0"))
>
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 0864748875e..c8dace5b2f8 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -6536,9 +6536,20 @@ (define_insn "<avx512>_<complexopname>_<mode><maskc_name><round_name>"
>              (match_operand:VF_AVX512FP16VL 2 "nonimmediate_operand" "<round_constraint>")]
>              UNSPEC_COMPLEX_F_C_MUL))]
>    "TARGET_AVX512FP16 && <round_mode512bit_condition>"
> -  "v<complexopname><ssemodesuffix>\t{<round_maskc_op3>%2, %1, %0<maskc_operand3>|%0<maskc_operand3>, %1, %2<round_maskc_op3>}"
> +{
> +  if (TARGET_DEST_FALSE_DEPENDENCY
> +      && get_attr_dest_false_dep (insn) ==
> +        DEST_FALSE_DEP_TRUE)
> +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> +  return "v<complexopname><ssemodesuffix>\t{<round_maskc_op3>%2, %1, %0<maskc_operand3>|%0<maskc_operand3>, %1, %2<round_maskc_op3>}";
> +}
>    [(set_attr "type" "ssemul")
> -   (set_attr "mode" "<MODE>")])
> +   (set_attr "mode" "<MODE>")
> +   (set (attr "dest_false_dep")
> +       (if_then_else
> +         (match_test "<maskc_dest_false_dep_attr>")
> +         (const_string "false")
> +         (const_string "true")))])
>
>  (define_expand "avx512fp16_fmaddcsh_v8hf_maskz<round_expand_name>"
>    [(match_operand:V8HF 0 "register_operand")
> @@ -6742,9 +6753,20 @@ (define_insn "avx512fp16_<complexopname>sh_v8hf<mask_scalarc_name><round_scalarc
>             (match_dup 1)
>             (const_int 3)))]
>    "TARGET_AVX512FP16"
> -  "v<complexopname>sh\t{<round_scalarc_mask_op3>%2, %1, %0<mask_scalarc_operand3>|%0<mask_scalarc_operand3>, %1, %2<round_scalarc_mask_op3>}"
> +{
> +  if (TARGET_DEST_FALSE_DEPENDENCY
> +      && get_attr_dest_false_dep (insn) ==
> +        DEST_FALSE_DEP_TRUE)
> +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> +  return "v<complexopname>sh\t{<round_scalarc_mask_op3>%2, %1, %0<mask_scalarc_operand3>|%0<mask_scalarc_operand3>, %1, %2<round_scalarc_mask_op3>}";
> +}
>    [(set_attr "type" "ssemul")
> -   (set_attr "mode" "V8HF")])
> +   (set_attr "mode" "V8HF")
> +   (set (attr "dest_false_dep")
> +       (if_then_else
> +         (match_test "<mask_scalarc_dest_false_dep_attr>")
> +         (const_string "false")
> +         (const_string "true")))])
>
>  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>  ;;
> @@ -15207,10 +15229,21 @@ (define_insn "avx512dq_mul<mode>3<mask_name>"
>           (match_operand:VI8_AVX512VL 2 "bcst_vector_operand" "vmBr")))]
>    "TARGET_AVX512DQ && <mask_mode512bit_condition>
>    && ix86_binary_operator_ok (MULT, <MODE>mode, operands)"
> -  "vpmullq\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
> +{
> +  if (TARGET_DEST_FALSE_DEPENDENCY
> +      && get_attr_dest_false_dep (insn) ==
> +        DEST_FALSE_DEP_TRUE)
> +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> +  return "vpmullq\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}";
> +}
>    [(set_attr "type" "sseimul")
>     (set_attr "prefix" "evex")
> -   (set_attr "mode" "<sseinsnmode>")])
> +   (set_attr "mode" "<sseinsnmode>")
> +   (set (attr "dest_false_dep")
> +       (if_then_else
> +         (match_test "<mask3_dest_false_dep_attr>")
> +         (const_string "false")
> +         (const_string "true")))])
>
>  (define_expand "cond_mul<mode>"
>    [(set (match_operand:VI4_AVX512VL 0 "register_operand")
> @@ -24636,10 +24669,21 @@ (define_insn "<avx2_avx512>_permvar<mode><mask_name>"
>            (match_operand:<sseintvecmode> 2 "register_operand" "v")]
>           UNSPEC_VPERMVAR))]
>    "TARGET_AVX2 && <mask_mode512bit_condition>"
> -  "vperm<ssemodesuffix>\t{%1, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1}"
> +{
> +  if (TARGET_DEST_FALSE_DEPENDENCY
> +      && get_attr_dest_false_dep (insn) ==
> +        DEST_FALSE_DEP_TRUE)
> +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> +  return "vperm<ssemodesuffix>\t{%1, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1}";
> +}
>    [(set_attr "type" "sselog")
>     (set_attr "prefix" "<mask_prefix2>")
> -   (set_attr "mode" "<sseinsnmode>")])
> +   (set_attr "mode" "<sseinsnmode>")
> +   (set (attr "dest_false_dep")
> +       (if_then_else
> +         (match_test "<mask3_dest_false_dep_attr>")
> +         (const_string "false")
> +         (const_string "true")))])
>
>  (define_insn "<avx512>_permvar<mode><mask_name>"
>    [(set (match_operand:VI1_AVX512VL 0 "register_operand" "=v")
> @@ -24873,11 +24917,20 @@ (define_insn "avx2_perm<mode>_1<mask_name>"
>    mask |= INTVAL (operands[4]) << 4;
>    mask |= INTVAL (operands[5]) << 6;
>    operands[2] = GEN_INT (mask);
> +  if (TARGET_DEST_FALSE_DEPENDENCY
> +      && get_attr_dest_false_dep (insn) ==
> +        DEST_FALSE_DEP_TRUE)
> +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
>    return "vperm<ssemodesuffix>\t{%2, %1, %0<mask_operand6>|%0<mask_operand6>, %1, %2}";
>  }
>    [(set_attr "type" "sselog")
>     (set_attr "prefix" "<mask_prefix2>")
> -   (set_attr "mode" "<sseinsnmode>")])
> +   (set_attr "mode" "<sseinsnmode>")
> +   (set (attr "dest_false_dep")
> +       (if_then_else
> +         (match_test "<mask6_dest_false_dep_attr>")
> +         (const_string "false")
> +         (const_string "true")))])
>
>  (define_expand "avx512f_perm<mode>"
>    [(match_operand:V8FI 0 "register_operand")
> @@ -24944,11 +24997,20 @@ (define_insn "avx512f_perm<mode>_1<mask_name>"
>    mask |= INTVAL (operands[4]) << 4;
>    mask |= INTVAL (operands[5]) << 6;
>    operands[2] = GEN_INT (mask);
> +  if (TARGET_DEST_FALSE_DEPENDENCY
> +      && get_attr_dest_false_dep (insn) ==
> +        DEST_FALSE_DEP_TRUE)
> +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
>    return "vperm<ssemodesuffix>\t{%2, %1, %0<mask_operand10>|%0<mask_operand10>, %1, %2}";
>  }
>    [(set_attr "type" "sselog")
>     (set_attr "prefix" "<mask_prefix2>")
> -   (set_attr "mode" "<sseinsnmode>")])
> +   (set_attr "mode" "<sseinsnmode>")
> +   (set (attr "dest_false_dep")
> +       (if_then_else
> +         (match_test "<mask10_dest_false_dep_attr>")
> +         (const_string "false")
> +         (const_string "true")))])
>
>  (define_insn "avx2_permv2ti"
>    [(set (match_operand:V4DI 0 "register_operand" "=x")
> @@ -26843,10 +26905,21 @@ (define_insn "avx512dq_rangep<mode><mask_name><round_saeonly_name>"
>            (match_operand:SI 3 "const_0_to_15_operand")]
>           UNSPEC_RANGE))]
>    "TARGET_AVX512DQ && <round_saeonly_mode512bit_condition>"
> -  "vrange<ssemodesuffix>\t{%3, <round_saeonly_mask_op4>%2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2<round_saeonly_mask_op4>, %3}"
> +{
> +  if (TARGET_DEST_FALSE_DEPENDENCY
> +      && get_attr_dest_false_dep (insn) ==
> +        DEST_FALSE_DEP_TRUE)
> +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> +  return "vrange<ssemodesuffix>\t{%3, <round_saeonly_mask_op4>%2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2<round_saeonly_mask_op4>, %3}";
> +}
>    [(set_attr "type" "sse")
>     (set_attr "prefix" "evex")
> -   (set_attr "mode" "<MODE>")])
> +   (set_attr "mode" "<MODE>")
> +   (set (attr "dest_false_dep")
> +       (if_then_else
> +         (match_test "<mask4_dest_false_dep_attr>")
> +         (const_string "false")
> +         (const_string "true")))])
>
>  (define_insn "avx512dq_ranges<mode><mask_scalar_name><round_saeonly_scalar_name>"
>    [(set (match_operand:VF_128 0 "register_operand" "=v")
> @@ -26859,10 +26932,21 @@ (define_insn "avx512dq_ranges<mode><mask_scalar_name><round_saeonly_scalar_name>
>           (match_dup 1)
>           (const_int 1)))]
>    "TARGET_AVX512DQ"
> -  "vrange<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}"
> +{
> +  if (TARGET_DEST_FALSE_DEPENDENCY
> +      && get_attr_dest_false_dep (insn) ==
> +        DEST_FALSE_DEP_TRUE)
> +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> +  return "vrange<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}";
> +}
>    [(set_attr "type" "sse")
>     (set_attr "prefix" "evex")
> -   (set_attr "mode" "<MODE>")])
> +   (set_attr "mode" "<MODE>")
> +   (set (attr "dest_false_dep")
> +       (if_then_else
> +         (match_test "<mask_scalar4_dest_false_dep_attr>")
> +         (const_string "false")
> +         (const_string "true")))])
>
>  (define_insn "avx512dq_fpclass<mode><mask_scalar_merge_name>"
>    [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
> @@ -26899,9 +26983,20 @@ (define_insn "<avx512>_getmant<mode><mask_name><round_saeonly_name>"
>            (match_operand:SI 2 "const_0_to_15_operand")]
>           UNSPEC_GETMANT))]
>    "TARGET_AVX512F"
> -  "vgetmant<ssemodesuffix>\t{%2, <round_saeonly_mask_op3>%1, %0<mask_operand3>|%0<mask_operand3>, %1<round_saeonly_mask_op3>, %2}";
> +{
> +  if (TARGET_DEST_FALSE_DEPENDENCY
> +      && get_attr_dest_false_dep (insn) ==
> +        DEST_FALSE_DEP_TRUE)
> +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> +  return "vgetmant<ssemodesuffix>\t{%2, <round_saeonly_mask_op3>%1, %0<mask_operand3>|%0<mask_operand3>, %1<round_saeonly_mask_op3>, %2}";
> +}
>    [(set_attr "prefix" "evex")
> -   (set_attr "mode" "<MODE>")])
> +   (set_attr "mode" "<MODE>")
> +   (set (attr "dest_false_dep")
> +       (if_then_else
> +         (match_test "!MEM_P (operands[1]) || <mask3_dest_false_dep_attr>")
> +         (const_string "false")
> +         (const_string "true")))])
>
>  (define_insn "avx512f_vgetmant<mode><mask_scalar_name><round_saeonly_scalar_name>"
>    [(set (match_operand:VFH_128 0 "register_operand" "=v")
> @@ -26914,9 +27009,20 @@ (define_insn "avx512f_vgetmant<mode><mask_scalar_name><round_saeonly_scalar_name
>           (match_dup 1)
>           (const_int 1)))]
>     "TARGET_AVX512F"
> -   "vgetmant<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}";
> +{
> +  if (TARGET_DEST_FALSE_DEPENDENCY
> +      && get_attr_dest_false_dep (insn) ==
> +        DEST_FALSE_DEP_TRUE)
> +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> +  return "vgetmant<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}";
> +}
>     [(set_attr "prefix" "evex")
> -   (set_attr "mode" "<ssescalarmode>")])
> +   (set_attr "mode" "<ssescalarmode>")
> +   (set (attr "dest_false_dep")
> +       (if_then_else
> +         (match_test "<mask_scalar4_dest_false_dep_attr>")
> +         (const_string "false")
> +         (const_string "true")))])
>
>  ;; The correct representation for this is absolutely enormous, and
>  ;; surely not generally useful.
> diff --git a/gcc/config/i386/subst.md b/gcc/config/i386/subst.md
> index 21d445cc46c..802a8715b01 100644
> --- a/gcc/config/i386/subst.md
> +++ b/gcc/config/i386/subst.md
> @@ -71,6 +71,11 @@ (define_subst_attr "bcst_mask_prefix3" "mask" "orig,maybe_evex" "evex,evex")
>  (define_subst_attr "mask_prefix4" "mask" "orig,orig,vex" "evex,evex,evex")
>  (define_subst_attr "bcst_mask_prefix4" "mask" "orig,orig,maybe_evex" "evex,evex,evex")
>  (define_subst_attr "mask_expand_op3" "mask" "3" "5")
> +(define_subst_attr "mask3_dest_false_dep_attr" "mask" "0" "operands[3] != CONST0_RTX(<MODE>mode)")
> +(define_subst_attr "mask4_dest_false_dep_attr" "mask" "0" "operands[4] != CONST0_RTX(<MODE>mode)")
> +(define_subst_attr "mask6_dest_false_dep_attr" "mask" "0" "operands[6] != CONST0_RTX(<MODE>mode)")
> +(define_subst_attr "mask10_dest_false_dep_attr" "mask" "0" "operands[10] != CONST0_RTX(<MODE>mode)")
> +(define_subst_attr "maskc_dest_false_dep_attr" "maskc" "0" "operands[3] != CONST0_RTX(<MODE>mode)")
>
>  (define_subst "mask"
>    [(set (match_operand:SUBST_V 0)
> @@ -337,6 +342,8 @@ (define_subst_attr "mask_scalarc_operand3" "mask_scalarc" "" "%{%4%}%N3")
>  (define_subst_attr "mask_scalar_operand3" "mask_scalar" "" "%{%4%}%N3")
>  (define_subst_attr "mask_scalar_operand4" "mask_scalar" "" "%{%5%}%N4")
>  (define_subst_attr "mask_scalarcz_operand4" "mask_scalarcz" "" "%{%5%}%N4")
> +(define_subst_attr "mask_scalar4_dest_false_dep_attr" "mask_scalar" "0" "operands[4] != CONST0_RTX(<MODE>mode)")
> +(define_subst_attr "mask_scalarc_dest_false_dep_attr" "mask_scalarc" "0" "operands[3] != CONST0_RTX(V8HFmode)")
>
>  (define_subst "mask_scalar"
>    [(set (match_operand:SUBST_V 0)
> diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
> index 0d3fd078068..1b42c96fc38 100644
> --- a/gcc/config/i386/x86-tune.def
> +++ b/gcc/config/i386/x86-tune.def
> @@ -79,6 +79,11 @@ DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY,
>           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
>           | m_BDVER | m_ZNVER | m_ALDERLAKE | m_GENERIC)
>
> +/* X86_TUNE_DEST_FALSE_DEPENDENCY: This knob inserts zero-idiom before
> +   several insns to break false dependency on the dest register.  */
> +DEF_TUNE (X86_TUNE_DEST_FALSE_DEPENDENCY,
> +         "dest_false_dependency", m_SAPPHIRERAPIDS | m_ALDERLAKE)
> +
>  /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
>     are resolved on SSE register parts instead of whole registers, so we may
>     maintain just lower part of scalar values in proper format leaving the
> diff --git a/gcc/testsuite/gcc.target/i386/avx2-dest-false-dependency.c b/gcc/testsuite/gcc.target/i386/avx2-dest-false-dependency.c
> new file mode 100644
> index 00000000000..e138920ce18
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx2-dest-false-dependency.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx2 -mtune-ctrl=dest_false_dependency -O2" } */
> +
> +
> +#include <immintrin.h>
> +
> +extern __m256i i1, i2, i3, i4;
> +extern __m256d d1, d2;
> +extern __m256 f1, f2;
> +
> +void vperm_test (void)
> +{
> +  i3 = _mm256_permutevar8x32_epi32 (i1, i2);
> +  i4 = _mm256_permute4x64_epi64 (i1, 12);
> +  d2 = _mm256_permute4x64_pd (d1, 12);
> +  f2 = _mm256_permutevar8x32_ps (f1, i2);
> +}
> +
> +/* { dg-final { scan-assembler-times "vxorps" 4 } } */
> +/* { dg-final { scan-assembler-times "vpermd" 1 } } */
> +/* { dg-final { scan-assembler-times "vpermq" 1 } } */
> +/* { dg-final { scan-assembler-times "vpermpd" 1 } } */
> +/* { dg-final { scan-assembler-times "vpermps" 1 } } */
> +
> diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-dest-false-dependency.c b/gcc/testsuite/gcc.target/i386/avx512dq-dest-false-dependency.c
> new file mode 100644
> index 00000000000..2feb58f2cd8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512dq-dest-false-dependency.c
> @@ -0,0 +1,73 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512dq -mavx512vl -mtune-ctrl=dest_false_dependency -O2" } */
> +
> +#include <immintrin.h>
> +
> +extern __m512i i1;
> +extern __m256i i2;
> +extern __m128i i3;
> +extern __m512d d1;
> +extern __m256d d2;
> +extern __m128d d3;
> +extern __m512 f1;
> +extern __m256 f2;
> +extern __m128 f3;
> +
> +__mmask32 m32;
> +__mmask16 m16;
> +__mmask8 m8;
> +
> +void mullo_test (void)
> +{
> +  i1 = _mm512_mullo_epi64 (i1, i1);
> +  i1 = _mm512_mask_mullo_epi64 (i1, m8, i1, i1);
> +  i1 = _mm512_maskz_mullo_epi64 (m8, i1, i1);
> +  i2 = _mm256_mullo_epi64 (i2, i2);
> +  i2 = _mm256_mask_mullo_epi64 (i2, m8, i2, i2);
> +  i2 = _mm256_maskz_mullo_epi64 (m8, i2, i2);
> +  i3 = _mm_mullo_epi64 (i3, i3);
> +  i3 = _mm_mask_mullo_epi64 (i3, m8, i3, i3);
> +  i3 = _mm_maskz_mullo_epi64 (m8, i3, i3);
> +}
> +
> +void range_test (void)
> +{
> +  d1 = _mm512_range_pd (d1, d1, 15);
> +  d1 = _mm512_range_round_pd (d1, d1, 15, 8);
> +  d1 = _mm512_mask_range_pd (d1, m8, d1, d1, 15);
> +  d1 = _mm512_mask_range_round_pd (d1, m8, d1, d1, 15, 8);
> +  d1 = _mm512_maskz_range_pd (m8, d1, d1, 15);
> +  d1 = _mm512_maskz_range_round_pd (m8, d1, d1, 15, 8);
> +  d2 = _mm256_range_pd (d2, d2, 15);
> +  d2 = _mm256_mask_range_pd (d2, m8, d2, d2, 15);
> +  d2 = _mm256_maskz_range_pd (m8, d2, d2, 15);
> +  d3 = _mm_range_pd (d3, d3, 15);
> +  d3 = _mm_mask_range_pd (d3, m8, d3, d3, 15);
> +  d3 = _mm_maskz_range_pd (m8, d3, d3, 15);
> +  d3 = _mm_range_sd (d3, d3, 15);
> +  d3 = _mm_mask_range_sd (d3, m8, d3, d3, 15);
> +  d3 = _mm_maskz_range_sd (m8, d3, d3, 15);
> +
> +  f1 = _mm512_range_ps (f1, f1, 15);
> +  f1 = _mm512_range_round_ps (f1, f1, 15, 8);
> +  f1 = _mm512_mask_range_ps (f1, m16, f1, f1, 15);
> +  f1 = _mm512_mask_range_round_ps (f1, m16, f1, f1, 15, 8);
> +  f1 = _mm512_maskz_range_ps (m16, f1, f1, 15);
> +  f1 = _mm512_maskz_range_round_ps (m16, f1, f1, 15, 8);
> +  f2 = _mm256_range_ps (f2, f2, 15);
> +  f2 = _mm256_mask_range_ps (f2, m8, f2, f2, 15);
> +  f2 = _mm256_maskz_range_ps (m8, f2, f2, 15);
> +  f3 = _mm_range_ps (f3, f3, 15);
> +  f3 = _mm_mask_range_ps (f3, m8, f3, f3, 15);
> +  f3 = _mm_maskz_range_ps (m8, f3, f3, 15);
> +  f3 = _mm_range_ss (f3, f3, 15);
> +  f3 = _mm_mask_range_ss (f3, m8, f3, f3, 15);
> +  f3 = _mm_maskz_range_ss (m8, f3, f3, 15);
> +}
> +
> +/* { dg-final { scan-assembler-times "vxorps" 26 } } */
> +/* { dg-final { scan-assembler-times "vpmullq" 9 } } */
> +/* { dg-final { scan-assembler-times "vrangepd" 12 } } */
> +/* { dg-final { scan-assembler-times "vrangesd" 3 } } */
> +/* { dg-final { scan-assembler-times "vrangeps" 12 } } */
> +/* { dg-final { scan-assembler-times "vrangess" 3 } } */
> diff --git a/gcc/testsuite/gcc.target/i386/avx512f-dest-false-dependency.c b/gcc/testsuite/gcc.target/i386/avx512f-dest-false-dependency.c
> new file mode 100644
> index 00000000000..9650839970e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-dest-false-dependency.c
> @@ -0,0 +1,102 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -mtune-ctrl=dest_false_dependency -O2" } */
> +
> +#include <immintrin.h>
> +
> +extern __m512i i1;
> +extern __m512d d1, *pd1;
> +extern __m128d d2;
> +extern __m512 f1, *pf1;
> +extern __m128 f2;
> +
> +__mmask16 m16;
> +__mmask8 m8;
> +
> +void vperm_test (void)
> +{
> +  d1 = _mm512_permutex_pd (d1, 12);
> +  d1 = _mm512_mask_permutex_pd (d1, m8, d1, 12);
> +  d1 = _mm512_maskz_permutex_pd (m8, d1, 12);
> +  d1 = _mm512_permutexvar_pd (i1, d1);
> +  d1 = _mm512_mask_permutexvar_pd (d1, m8, i1, d1);
> +  d1 = _mm512_maskz_permutexvar_pd (m8, i1, d1);
> +
> +  f1 = _mm512_permutexvar_ps (i1, f1);
> +  f1 = _mm512_mask_permutexvar_ps (f1, m16, i1, f1);
> +  f1 = _mm512_maskz_permutexvar_ps (m16, i1, f1);
> +
> +  i1 = _mm512_permutexvar_epi64 (i1, i1);
> +  i1 = _mm512_mask_permutexvar_epi64 (i1, m8, i1, i1);
> +  i1 = _mm512_maskz_permutexvar_epi64 (m8, i1, i1);
> +  i1 = _mm512_permutex_epi64 (i1, 12);
> +  i1 = _mm512_mask_permutex_epi64 (i1, m8, i1, 12);
> +  i1 = _mm512_maskz_permutex_epi64 (m8, i1, 12);
> +
> +  i1 = _mm512_permutexvar_epi32 (i1, i1);
> +  i1 = _mm512_mask_permutexvar_epi32 (i1, m16, i1, i1);
> +  i1 = _mm512_maskz_permutexvar_epi32 (m16, i1, i1);
> +}
> +
> +void getmant_test (void)
> +{
> +  d1 = _mm512_getmant_pd (*pd1, _MM_MANT_NORM_p75_1p5,
> +                         _MM_MANT_SIGN_src);
> +  d1 = _mm512_getmant_round_pd (*pd1, _MM_MANT_NORM_p75_1p5,
> +                               _MM_MANT_SIGN_src, 8);
> +  d1 = _mm512_mask_getmant_pd (d1, m8, *pd1, _MM_MANT_NORM_p75_1p5,
> +                              _MM_MANT_SIGN_src);
> +  d1 = _mm512_mask_getmant_round_pd (d1, m8, *pd1, _MM_MANT_NORM_p75_1p5,
> +                                    _MM_MANT_SIGN_src, 8);
> +  d1 = _mm512_maskz_getmant_pd (m8, *pd1, _MM_MANT_NORM_p75_1p5,
> +                               _MM_MANT_SIGN_src);
> +  d1 = _mm512_maskz_getmant_round_pd (m8, *pd1, _MM_MANT_NORM_p75_1p5,
> +                                     _MM_MANT_SIGN_src, 8);
> +  f1 = _mm512_getmant_ps (*pf1, _MM_MANT_NORM_p75_1p5,
> +                         _MM_MANT_SIGN_src);
> +  f1 = _mm512_getmant_round_ps (*pf1, _MM_MANT_NORM_p75_1p5,
> +                               _MM_MANT_SIGN_src, 8);
> +  f1 = _mm512_mask_getmant_ps (f1, m16, *pf1, _MM_MANT_NORM_p75_1p5,
> +                              _MM_MANT_SIGN_src);
> +  f1 = _mm512_mask_getmant_round_ps (f1, m16, *pf1, _MM_MANT_NORM_p75_1p5,
> +                                    _MM_MANT_SIGN_src, 8);
> +  f1 = _mm512_maskz_getmant_ps (m16, *pf1, _MM_MANT_NORM_p75_1p5,
> +                               _MM_MANT_SIGN_src);
> +  f1 = _mm512_maskz_getmant_round_ps (m16, *pf1, _MM_MANT_NORM_p75_1p5,
> +                                     _MM_MANT_SIGN_src, 8);
> +
> +  d2 = _mm_getmant_sd (d2, d2, _MM_MANT_NORM_p75_1p5,
> +                      _MM_MANT_SIGN_src);
> +  d2 = _mm_getmant_round_sd (d2, d2, _MM_MANT_NORM_p75_1p5,
> +                            _MM_MANT_SIGN_src, 8);
> +  d2 = _mm_mask_getmant_sd (d2, m8, d2, d2, _MM_MANT_NORM_p75_1p5,
> +                           _MM_MANT_SIGN_src);
> +  d2 = _mm_mask_getmant_round_sd (d2, m8, d2, d2, _MM_MANT_NORM_p75_1p5,
> +                                 _MM_MANT_SIGN_src, 8);
> +  d2 = _mm_maskz_getmant_sd (m8, d2, d2, _MM_MANT_NORM_p75_1p5,
> +                            _MM_MANT_SIGN_src);
> +  d2 = _mm_maskz_getmant_round_sd (m8, d2, d2, _MM_MANT_NORM_p75_1p5,
> +                                  _MM_MANT_SIGN_src, 8);
> +  f2 = _mm_getmant_ss (f2, f2, _MM_MANT_NORM_p75_1p5,
> +                      _MM_MANT_SIGN_src);
> +  f2 = _mm_getmant_round_ss (f2, f2, _MM_MANT_NORM_p75_1p5,
> +                            _MM_MANT_SIGN_src, 8);
> +  f2 = _mm_mask_getmant_ss (f2, m8, f2, f2, _MM_MANT_NORM_p75_1p5,
> +                           _MM_MANT_SIGN_src);
> +  f2 = _mm_mask_getmant_round_ss (f2, m8, f2, f2, _MM_MANT_NORM_p75_1p5,
> +                                 _MM_MANT_SIGN_src, 8);
> +  f2 = _mm_maskz_getmant_ss (m8, f2, f2, _MM_MANT_NORM_p75_1p5,
> +                            _MM_MANT_SIGN_src);
> +  f2 = _mm_maskz_getmant_round_ss (m8, f2, f2, _MM_MANT_NORM_p75_1p5,
> +                                  _MM_MANT_SIGN_src, 8);
> +
> +}
> +
> +/* { dg-final { scan-assembler-times "vxorps" 24 } } */
> +/* { dg-final { scan-assembler-times "vpermd" 3 } } */
> +/* { dg-final { scan-assembler-times "vpermq" 6 } } */
> +/* { dg-final { scan-assembler-times "vpermps" 3 } } */
> +/* { dg-final { scan-assembler-times "vpermpd" 6 } } */
> +/* { dg-final { scan-assembler-times "vgetmantpd" 6 } } */
> +/* { dg-final { scan-assembler-times "vgetmantps" 6 } } */
> +/* { dg-final { scan-assembler-times "vgetmantsd" 6 } } */
> +/* { dg-final { scan-assembler-times "vgetmantss" 6 } } */
> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-dest-false-dependency.c b/gcc/testsuite/gcc.target/i386/avx512fp16-dest-false-dependency.c
> new file mode 100644
> index 00000000000..793bb66201b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-dest-false-dependency.c
> @@ -0,0 +1,45 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512fp16 -mavx512vl -mtune-ctrl=dest_false_dependency -O2" } */
> +
> +#include <immintrin.h>
> +
> +extern __m512h h1;
> +extern __m256h h2;
> +extern __m128h h3;
> +
> +__mmask32 m32;
> +__mmask16 m16;
> +__mmask8 m8;
> +
> +void complex_mul_test (void)
> +{
> +  h1 = _mm512_fmul_pch (h1, h1);
> +  h1 = _mm512_fmul_round_pch (h1, h1, 8);
> +  h1 = _mm512_mask_fmul_pch (h1, m32, h1, h1);
> +  h1 = _mm512_mask_fmul_round_pch (h1, m32, h1, h1, 8);
> +  h1 = _mm512_maskz_fmul_pch (m32, h1, h1);
> +  h1 = _mm512_maskz_fmul_round_pch (m32, h1, h1, 11);
> +
> +  h3 = _mm_fmul_sch (h3, h3);
> +  h3 = _mm_fmul_round_sch (h3, h3, 8);
> +  h3 = _mm_mask_fmul_sch (h3, m8, h3, h3);
> +  h3 = _mm_mask_fmul_round_sch (h3, m8, h3, h3, 8);
> +  h3 = _mm_maskz_fmul_sch (m8, h3, h3);
> +  h3 = _mm_maskz_fmul_round_sch (m8, h3, h3, 11);
> +}
> +
> +void vgetmant_test (void)
> +{
> +  h3 = _mm_getmant_sh (h3, h3, _MM_MANT_NORM_p75_1p5,
> +                      _MM_MANT_SIGN_src);
> +  h3 = _mm_mask_getmant_sh (h3, m8, h3, h3, _MM_MANT_NORM_p75_1p5,
> +                           _MM_MANT_SIGN_src);
> +  h3 = _mm_maskz_getmant_sh (m8, h3, h3, _MM_MANT_NORM_p75_1p5,
> +                            _MM_MANT_SIGN_src);
> +}
> +
> +/* { dg-final { scan-assembler-times "vxorps" 10 } } */
> +/* { dg-final { scan-assembler-times "vfmulcph" 6 } } */
> +/* { dg-final { scan-assembler-times "vfmulcsh" 6 } } */
> +/* { dg-final { scan-assembler-times "vgetmantsh" 3 } } */
> +
> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16vl-dest-false-dependency.c b/gcc/testsuite/gcc.target/i386/avx512fp16vl-dest-false-dependency.c
> new file mode 100644
> index 00000000000..09658905d2d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512fp16vl-dest-false-dependency.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512fp16 -mavx512vl -mtune-ctrl=dest_false_dependency -O2" } */
> +
> +#include <immintrin.h>
> +
> +extern __m256h h1;
> +extern __m128h h2;
> +
> +__mmask16 m16;
> +__mmask8 m8;
> +
> +void complex_mul_test (void)
> +{
> +  h1 = _mm256_fmul_pch (h1, h1);
> +  h1 = _mm256_mask_fmul_pch (h1, m16, h1, h1);
> +  h1 = _mm256_maskz_fmul_pch (m16, h1, h1);
> +  h2 = _mm_fmul_pch (h2, h2);
> +  h2 = _mm_mask_fmul_pch (h2, m16, h2, h2);
> +  h2 = _mm_maskz_fmul_pch (m16, h2, h2);
> +}
> +
> +/* { dg-final { scan-assembler-times "vxorps" 4 } } */
> +/* { dg-final { scan-assembler-times "vfmulcph" 6 } } */
> +
> diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-dest-false-dependency.c b/gcc/testsuite/gcc.target/i386/avx512vl-dest-false-dependency.c
> new file mode 100644
> index 00000000000..92717a99837
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512vl-dest-false-dependency.c
> @@ -0,0 +1,76 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -mavx512vl -mtune-ctrl=dest_false_dependency -O2" } */
> +
> +
> +#include <immintrin.h>
> +
> +extern __m256i i1;
> +extern __m256d d1, *pd1;
> +extern __m128d d2, *pd2;
> +extern __m256 f1, *pf1;
> +extern __m128 f2, *pf2;
> +
> +__mmask16 m16;
> +__mmask8 m8;
> +
> +void vperm_test (void)
> +{
> +  d1 = _mm256_permutex_pd (d1, 12);
> +  d1 = _mm256_mask_permutex_pd (d1, m8, d1, 12);
> +  d1 = _mm256_maskz_permutex_pd (m8, d1, 12);
> +  d1 = _mm256_permutexvar_pd (i1, d1);
> +  d1 = _mm256_mask_permutexvar_pd (d1, m8, i1, d1);
> +  d1 = _mm256_maskz_permutexvar_pd (m8, i1, d1);
> +
> +  f1 = _mm256_permutexvar_ps (i1, f1);
> +  f1 = _mm256_mask_permutexvar_ps (f1, m8, i1, f1);
> +  f1 = _mm256_maskz_permutexvar_ps (m8, i1, f1);
> +
> +  i1 = _mm256_permutexvar_epi64 (i1, i1);
> +  i1 = _mm256_mask_permutexvar_epi64 (i1, m8, i1, i1);
> +  i1 = _mm256_maskz_permutexvar_epi64 (m8, i1, i1);
> +  i1 = _mm256_permutex_epi64 (i1, 12);
> +  i1 = _mm256_mask_permutex_epi64 (i1, m8, i1, 12);
> +  i1 = _mm256_maskz_permutex_epi64 (m8, i1, 12);
> +
> +  i1 = _mm256_permutexvar_epi32 (i1, i1);
> +  i1 = _mm256_mask_permutexvar_epi32 (i1, m8, i1, i1);
> +  i1 = _mm256_maskz_permutexvar_epi32 (m8, i1, i1);
> +}
> +
> +void getmant_test (void)
> +{
> +  d1 = _mm256_getmant_pd (*pd1, _MM_MANT_NORM_p75_1p5,
> +                         _MM_MANT_SIGN_src);
> +  d1 = _mm256_mask_getmant_pd (d1, m8, *pd1, _MM_MANT_NORM_p75_1p5,
> +                              _MM_MANT_SIGN_src);
> +  d1 = _mm256_maskz_getmant_pd (m8, *pd1, _MM_MANT_NORM_p75_1p5,
> +                               _MM_MANT_SIGN_src);
> +  d2 = _mm_getmant_pd (*pd2, _MM_MANT_NORM_p75_1p5,
> +                      _MM_MANT_SIGN_src);
> +  d2 = _mm_mask_getmant_pd (d2, m8, *pd2, _MM_MANT_NORM_p75_1p5,
> +                           _MM_MANT_SIGN_src);
> +  d2 = _mm_maskz_getmant_pd (m8, *pd2, _MM_MANT_NORM_p75_1p5,
> +                            _MM_MANT_SIGN_src);
> +  f1 = _mm256_getmant_ps (*pf1, _MM_MANT_NORM_p75_1p5,
> +                         _MM_MANT_SIGN_src);
> +  f1 = _mm256_mask_getmant_ps (f1, m8, *pf1, _MM_MANT_NORM_p75_1p5,
> +                              _MM_MANT_SIGN_src);
> +  f1 = _mm256_maskz_getmant_ps (m8, *pf1, _MM_MANT_NORM_p75_1p5,
> +                               _MM_MANT_SIGN_src);
> +  f2 = _mm_getmant_ps (*pf2, _MM_MANT_NORM_p75_1p5,
> +                      _MM_MANT_SIGN_src);
> +  f2 = _mm_mask_getmant_ps (f2, m8, *pf2, _MM_MANT_NORM_p75_1p5,
> +                           _MM_MANT_SIGN_src);
> +  f2 = _mm_maskz_getmant_ps (m8, *pf2, _MM_MANT_NORM_p75_1p5,
> +                            _MM_MANT_SIGN_src);
> +}
> +
> +/* { dg-final { scan-assembler-times "vxorps" 20 } } */
> +/* { dg-final { scan-assembler-times "vpermpd" 6 } } */
> +/* { dg-final { scan-assembler-times "vpermps" 3 } } */
> +/* { dg-final { scan-assembler-times "vpermq" 6 } } */
> +/* { dg-final { scan-assembler-times "vpermd" 3 } } */
> +/* { dg-final { scan-assembler-times "vgetmantpd" 6 } } */
> +/* { dg-final { scan-assembler-times "vgetmantps" 6 } } */
> +
> --
> 2.18.1
>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] [i386] GLC tuning: Break false dependency for dest register.
  2022-01-13  7:41 ` Uros Bizjak
@ 2022-01-14  5:38   ` Hongyu Wang
  2022-01-14  6:03     ` Hongyu Wang
  2022-01-14  8:17     ` [PATCH] [i386] GLC tuning: Break false dependency for dest register Uros Bizjak
  0 siblings, 2 replies; 16+ messages in thread
From: Hongyu Wang @ 2022-01-14  5:38 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: Hongyu Wang, Hongtao Liu, gcc-patches

> No, the approach is wrong. You have to solve output clearing on RTL
> level, please look at how e.g. tzcnt false dep is solved:

Actually, we considered such an approach before, but we found that we
would need to break up the original define_insn to remove the
mask/rounding subst, since define_split cannot use subst. That would add
6 more define_insn_and_split and 4 define_insn patterns for each
instruction. We think such an approach would introduce too much redundant code.

Do you think the increase in code size is acceptable?

Uros Bizjak via Gcc-patches <gcc-patches@gcc.gnu.org> 于2022年1月13日周四 15:42写道:
>
> On Thu, Jan 13, 2022 at 8:28 AM Hongyu Wang <hongyu.wang@intel.com> wrote:
> >
> > From: wwwhhhyyy <hongyu.wang@intel.com>
> >
> > Hi,
> >
> > For the GoldenCove micro-architecture, force insertion of a zero-idiom in the
> > asm template to break the false dependency on the dest register for several insns.
> >
> > The related insns are:
> >
> > VPERM/D/Q/PS/PD
> > VRANGEPD/PS/SD/SS
> > VGETMANTSS/SD/SH
> > VGETMANTPS/PD - mem version only
> > VPMULLQ
> > VFMULCSH/PH
> > VFCMULCSH/PH
> >
> > Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,}
> >
> > Ok for master?
>
> No, the approach is wrong. You have to solve output clearing on RTL
> level, please look at how e.g. tzcnt false dep is solved:
>
>   [(set (reg:CCC FLAGS_REG)
>     (compare:CCC (match_operand:SWI48 1 "nonimmediate_operand" "rm")
>              (const_int 0)))
>    (set (match_operand:SWI48 0 "register_operand" "=r")
>     (ctz:SWI48 (match_dup 1)))]
>   "TARGET_BMI"
>   "tzcnt{<imodesuffix>}\t{%1, %0|%0, %1}";
>   "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
>    && optimize_function_for_speed_p (cfun)
>    && !reg_mentioned_p (operands[0], operands[1])"
>   [(parallel
>     [(set (reg:CCC FLAGS_REG)
>       (compare:CCC (match_dup 1) (const_int 0)))
>      (set (match_dup 0)
>       (ctz:SWI48 (match_dup 1)))
>      (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
>   "ix86_expand_clear (operands[0]);"
>   [(set_attr "type" "alu1")
>    (set_attr "prefix_0f" "1")
>    (set_attr "prefix_rep" "1")
>    (set_attr "btver2_decode" "double")
>    (set_attr "mode" "<MODE>")])
>
> For TARGET_AVOID_FALSE_DEP_FOR_BMI, we split once epilogue_completed is set, when
> insn registers are stable, and use ix86_expand_clear to clear the output
> operand. Please also note how the final insn is tagged with
> UNSPEC_INSN_FALSE_DEP to prevent combine from recognizing it too early.
>
> Uros.
>
> >
> > gcc/ChangeLog:
> >
> >         * config/i386/i386.h (TARGET_DEST_FALSE_DEPENDENCY): New macro.
> >         * config/i386/i386.md (dest_false_dep): New define_attr.
> >         * config/i386/sse.md (<avx512>_<complexopname>_<mode><maskc_name><round_name>):
> >         Insert zero-idiom in output template when attr enabled, set new attribute to
> >         true for non-mask/maskz insn.
> >         (avx512fp16_<complexopname>sh_v8hf<mask_scalarc_name><round_scalarcz_name>):
> >         Likewise.
> >         (avx512dq_mul<mode>3<mask_name>): Likewise.
> >         (<avx2_avx512>_permvar<mode><mask_name>): Likewise.
> >         (avx2_perm<mode>_1<mask_name>): Likewise.
> >         (avx512f_perm<mode>_1<mask_name>): Likewise.
> >         (avx512dq_rangep<mode><mask_name><round_saeonly_name>): Likewise.
> >         (avx512dq_ranges<mode><mask_scalar_name><round_saeonly_scalar_name>):
> >         Likewise.
> >         (<avx512>_getmant<mode><mask_name><round_saeonly_name>): Likewise.
> >         (avx512f_vgetmant<mode><mask_scalar_name><round_saeonly_scalar_name>):
> >         Likewise.
> >         * config/i386/subst.md (mask3_dest_false_dep_attr): New subst_attr.
> >         (mask4_dest_false_dep_attr): Likewise.
> >         (mask6_dest_false_dep_attr): Likewise.
> >         (mask10_dest_false_dep_attr): Likewise.
> >         (maskc_dest_false_dep_attr): Likewise.
> >         (mask_scalar4_dest_false_dep_attr): Likewise.
> >         (mask_scalarc_dest_false_dep_attr): Likewise.
> >         * config/i386/x86-tune.def (X86_TUNE_DEST_FALSE_DEPENDENCY): New
> >         DEF_TUNE enabled for m_SAPPHIRERAPIDS and m_ALDERLAKE
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/i386/avx2-dest-false-dependency.c: New test.
> >         * gcc.target/i386/avx512dq-dest-false-dependency.c: Ditto.
> >         * gcc.target/i386/avx512f-dest-false-dependency.c: Ditto.
> >         * gcc.target/i386/avx512fp16-dest-false-dependency.c: Ditto.
> >         * gcc.target/i386/avx512fp16vl-dest-false-dependency.c: Ditto.
> >         * gcc.target/i386/avx512vl-dest-false-dependency.c: Ditto.
> > ---
> >  gcc/config/i386/i386.h                        |   2 +
> >  gcc/config/i386/i386.md                       |   4 +
> >  gcc/config/i386/sse.md                        | 142 +++++++++++++++---
> >  gcc/config/i386/subst.md                      |   7 +
> >  gcc/config/i386/x86-tune.def                  |   5 +
> >  .../i386/avx2-dest-false-dependency.c         |  24 +++
> >  .../i386/avx512dq-dest-false-dependency.c     |  73 +++++++++
> >  .../i386/avx512f-dest-false-dependency.c      | 102 +++++++++++++
> >  .../i386/avx512fp16-dest-false-dependency.c   |  45 ++++++
> >  .../i386/avx512fp16vl-dest-false-dependency.c |  24 +++
> >  .../i386/avx512vl-dest-false-dependency.c     |  76 ++++++++++
> >  11 files changed, 486 insertions(+), 18 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/avx2-dest-false-dependency.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512dq-dest-false-dependency.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-dest-false-dependency.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-dest-false-dependency.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16vl-dest-false-dependency.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-dest-false-dependency.c
> >
> > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> > index 3ac0f698ae2..ddbf6b9825a 100644
> > --- a/gcc/config/i386/i386.h
> > +++ b/gcc/config/i386/i386.h
> > @@ -429,6 +429,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
> >         ix86_tune_features[X86_TUNE_EXPAND_ABS]
> >  #define TARGET_V2DF_REDUCTION_PREFER_HADDPD \
> >         ix86_tune_features[X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD]
> > +#define TARGET_DEST_FALSE_DEPENDENCY \
> > +       ix86_tune_features[X86_TUNE_DEST_FALSE_DEPENDENCY]
> >
> >  /* Feature tests against the various architecture variations.  */
> >  enum ix86_arch_indices {
> > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> > index 9937643a273..40a2b580740 100644
> > --- a/gcc/config/i386/i386.md
> > +++ b/gcc/config/i386/i386.md
> > @@ -823,6 +823,10 @@ (define_attr "i387_cw" "roundeven,floor,ceil,trunc,uninitialized,any"
> >  (define_attr "avx_partial_xmm_update" "false,true"
> >    (const_string "false"))
> >
> > +;; Define attribute to indicate insns with a false dependency on the dest register
> > +(define_attr "dest_false_dep" "false,true"
> > + (const_string "false"))
> > +
> >  ;; Define attribute to classify add/sub insns that consumes carry flag (CF)
> >  (define_attr "use_carry" "0,1" (const_string "0"))
> >
> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > index 0864748875e..c8dace5b2f8 100644
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -6536,9 +6536,20 @@ (define_insn "<avx512>_<complexopname>_<mode><maskc_name><round_name>"
> >              (match_operand:VF_AVX512FP16VL 2 "nonimmediate_operand" "<round_constraint>")]
> >              UNSPEC_COMPLEX_F_C_MUL))]
> >    "TARGET_AVX512FP16 && <round_mode512bit_condition>"
> > -  "v<complexopname><ssemodesuffix>\t{<round_maskc_op3>%2, %1, %0<maskc_operand3>|%0<maskc_operand3>, %1, %2<round_maskc_op3>}"
> > +{
> > +  if (TARGET_DEST_FALSE_DEPENDENCY
> > +      && get_attr_dest_false_dep (insn) ==
> > +        DEST_FALSE_DEP_TRUE)
> > +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> > +  return "v<complexopname><ssemodesuffix>\t{<round_maskc_op3>%2, %1, %0<maskc_operand3>|%0<maskc_operand3>, %1, %2<round_maskc_op3>}";
> > +}
> >    [(set_attr "type" "ssemul")
> > -   (set_attr "mode" "<MODE>")])
> > +   (set_attr "mode" "<MODE>")
> > +   (set (attr "dest_false_dep")
> > +       (if_then_else
> > +         (match_test "<maskc_dest_false_dep_attr>")
> > +         (const_string "false")
> > +         (const_string "true")))])
> >
> >  (define_expand "avx512fp16_fmaddcsh_v8hf_maskz<round_expand_name>"
> >    [(match_operand:V8HF 0 "register_operand")
> > @@ -6742,9 +6753,20 @@ (define_insn "avx512fp16_<complexopname>sh_v8hf<mask_scalarc_name><round_scalarc
> >             (match_dup 1)
> >             (const_int 3)))]
> >    "TARGET_AVX512FP16"
> > -  "v<complexopname>sh\t{<round_scalarc_mask_op3>%2, %1, %0<mask_scalarc_operand3>|%0<mask_scalarc_operand3>, %1, %2<round_scalarc_mask_op3>}"
> > +{
> > +  if (TARGET_DEST_FALSE_DEPENDENCY
> > +      && get_attr_dest_false_dep (insn) ==
> > +        DEST_FALSE_DEP_TRUE)
> > +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> > +  return "v<complexopname>sh\t{<round_scalarc_mask_op3>%2, %1, %0<mask_scalarc_operand3>|%0<mask_scalarc_operand3>, %1, %2<round_scalarc_mask_op3>}";
> > +}
> >    [(set_attr "type" "ssemul")
> > -   (set_attr "mode" "V8HF")])
> > +   (set_attr "mode" "V8HF")
> > +   (set (attr "dest_false_dep")
> > +       (if_then_else
> > +         (match_test "<mask_scalarc_dest_false_dep_attr>")
> > +         (const_string "false")
> > +         (const_string "true")))])
> >
> >  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> >  ;;
> > @@ -15207,10 +15229,21 @@ (define_insn "avx512dq_mul<mode>3<mask_name>"
> >           (match_operand:VI8_AVX512VL 2 "bcst_vector_operand" "vmBr")))]
> >    "TARGET_AVX512DQ && <mask_mode512bit_condition>
> >    && ix86_binary_operator_ok (MULT, <MODE>mode, operands)"
> > -  "vpmullq\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
> > +{
> > +  if (TARGET_DEST_FALSE_DEPENDENCY
> > +      && get_attr_dest_false_dep (insn) ==
> > +        DEST_FALSE_DEP_TRUE)
> > +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> > +  return "vpmullq\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}";
> > +}
> >    [(set_attr "type" "sseimul")
> >     (set_attr "prefix" "evex")
> > -   (set_attr "mode" "<sseinsnmode>")])
> > +   (set_attr "mode" "<sseinsnmode>")
> > +   (set (attr "dest_false_dep")
> > +       (if_then_else
> > +         (match_test "<mask3_dest_false_dep_attr>")
> > +         (const_string "false")
> > +         (const_string "true")))])
> >
> >  (define_expand "cond_mul<mode>"
> >    [(set (match_operand:VI4_AVX512VL 0 "register_operand")
> > @@ -24636,10 +24669,21 @@ (define_insn "<avx2_avx512>_permvar<mode><mask_name>"
> >            (match_operand:<sseintvecmode> 2 "register_operand" "v")]
> >           UNSPEC_VPERMVAR))]
> >    "TARGET_AVX2 && <mask_mode512bit_condition>"
> > -  "vperm<ssemodesuffix>\t{%1, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1}"
> > +{
> > +  if (TARGET_DEST_FALSE_DEPENDENCY
> > +      && get_attr_dest_false_dep (insn) ==
> > +        DEST_FALSE_DEP_TRUE)
> > +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> > +  return "vperm<ssemodesuffix>\t{%1, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1}";
> > +}
> >    [(set_attr "type" "sselog")
> >     (set_attr "prefix" "<mask_prefix2>")
> > -   (set_attr "mode" "<sseinsnmode>")])
> > +   (set_attr "mode" "<sseinsnmode>")
> > +   (set (attr "dest_false_dep")
> > +       (if_then_else
> > +         (match_test "<mask3_dest_false_dep_attr>")
> > +         (const_string "false")
> > +         (const_string "true")))])
> >
> >  (define_insn "<avx512>_permvar<mode><mask_name>"
> >    [(set (match_operand:VI1_AVX512VL 0 "register_operand" "=v")
> > @@ -24873,11 +24917,20 @@ (define_insn "avx2_perm<mode>_1<mask_name>"
> >    mask |= INTVAL (operands[4]) << 4;
> >    mask |= INTVAL (operands[5]) << 6;
> >    operands[2] = GEN_INT (mask);
> > +  if (TARGET_DEST_FALSE_DEPENDENCY
> > +      && get_attr_dest_false_dep (insn) ==
> > +        DEST_FALSE_DEP_TRUE)
> > +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> >    return "vperm<ssemodesuffix>\t{%2, %1, %0<mask_operand6>|%0<mask_operand6>, %1, %2}";
> >  }
> >    [(set_attr "type" "sselog")
> >     (set_attr "prefix" "<mask_prefix2>")
> > -   (set_attr "mode" "<sseinsnmode>")])
> > +   (set_attr "mode" "<sseinsnmode>")
> > +   (set (attr "dest_false_dep")
> > +       (if_then_else
> > +         (match_test "<mask6_dest_false_dep_attr>")
> > +         (const_string "false")
> > +         (const_string "true")))])
> >
> >  (define_expand "avx512f_perm<mode>"
> >    [(match_operand:V8FI 0 "register_operand")
> > @@ -24944,11 +24997,20 @@ (define_insn "avx512f_perm<mode>_1<mask_name>"
> >    mask |= INTVAL (operands[4]) << 4;
> >    mask |= INTVAL (operands[5]) << 6;
> >    operands[2] = GEN_INT (mask);
> > +  if (TARGET_DEST_FALSE_DEPENDENCY
> > +      && get_attr_dest_false_dep (insn) ==
> > +        DEST_FALSE_DEP_TRUE)
> > +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> >    return "vperm<ssemodesuffix>\t{%2, %1, %0<mask_operand10>|%0<mask_operand10>, %1, %2}";
> >  }
> >    [(set_attr "type" "sselog")
> >     (set_attr "prefix" "<mask_prefix2>")
> > -   (set_attr "mode" "<sseinsnmode>")])
> > +   (set_attr "mode" "<sseinsnmode>")
> > +   (set (attr "dest_false_dep")
> > +       (if_then_else
> > +         (match_test "<mask10_dest_false_dep_attr>")
> > +         (const_string "false")
> > +         (const_string "true")))])
> >
> >  (define_insn "avx2_permv2ti"
> >    [(set (match_operand:V4DI 0 "register_operand" "=x")
> > @@ -26843,10 +26905,21 @@ (define_insn "avx512dq_rangep<mode><mask_name><round_saeonly_name>"
> >            (match_operand:SI 3 "const_0_to_15_operand")]
> >           UNSPEC_RANGE))]
> >    "TARGET_AVX512DQ && <round_saeonly_mode512bit_condition>"
> > -  "vrange<ssemodesuffix>\t{%3, <round_saeonly_mask_op4>%2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2<round_saeonly_mask_op4>, %3}"
> > +{
> > +  if (TARGET_DEST_FALSE_DEPENDENCY
> > +      && get_attr_dest_false_dep (insn) ==
> > +        DEST_FALSE_DEP_TRUE)
> > +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> > +  return "vrange<ssemodesuffix>\t{%3, <round_saeonly_mask_op4>%2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2<round_saeonly_mask_op4>, %3}";
> > +}
> >    [(set_attr "type" "sse")
> >     (set_attr "prefix" "evex")
> > -   (set_attr "mode" "<MODE>")])
> > +   (set_attr "mode" "<MODE>")
> > +   (set (attr "dest_false_dep")
> > +       (if_then_else
> > +         (match_test "<mask4_dest_false_dep_attr>")
> > +         (const_string "false")
> > +         (const_string "true")))])
> >
> >  (define_insn "avx512dq_ranges<mode><mask_scalar_name><round_saeonly_scalar_name>"
> >    [(set (match_operand:VF_128 0 "register_operand" "=v")
> > @@ -26859,10 +26932,21 @@ (define_insn "avx512dq_ranges<mode><mask_scalar_name><round_saeonly_scalar_name>
> >           (match_dup 1)
> >           (const_int 1)))]
> >    "TARGET_AVX512DQ"
> > -  "vrange<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}"
> > +{
> > +  if (TARGET_DEST_FALSE_DEPENDENCY
> > +      && get_attr_dest_false_dep (insn) ==
> > +        DEST_FALSE_DEP_TRUE)
> > +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> > +  return "vrange<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}";
> > +}
> >    [(set_attr "type" "sse")
> >     (set_attr "prefix" "evex")
> > -   (set_attr "mode" "<MODE>")])
> > +   (set_attr "mode" "<MODE>")
> > +   (set (attr "dest_false_dep")
> > +       (if_then_else
> > +         (match_test "<mask_scalar4_dest_false_dep_attr>")
> > +         (const_string "false")
> > +         (const_string "true")))])
> >
> >  (define_insn "avx512dq_fpclass<mode><mask_scalar_merge_name>"
> >    [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
> > @@ -26899,9 +26983,20 @@ (define_insn "<avx512>_getmant<mode><mask_name><round_saeonly_name>"
> >            (match_operand:SI 2 "const_0_to_15_operand")]
> >           UNSPEC_GETMANT))]
> >    "TARGET_AVX512F"
> > -  "vgetmant<ssemodesuffix>\t{%2, <round_saeonly_mask_op3>%1, %0<mask_operand3>|%0<mask_operand3>, %1<round_saeonly_mask_op3>, %2}";
> > +{
> > +  if (TARGET_DEST_FALSE_DEPENDENCY
> > +      && get_attr_dest_false_dep (insn) ==
> > +        DEST_FALSE_DEP_TRUE)
> > +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> > +  return "vgetmant<ssemodesuffix>\t{%2, <round_saeonly_mask_op3>%1, %0<mask_operand3>|%0<mask_operand3>, %1<round_saeonly_mask_op3>, %2}";
> > +}
> >    [(set_attr "prefix" "evex")
> > -   (set_attr "mode" "<MODE>")])
> > +   (set_attr "mode" "<MODE>")
> > +   (set (attr "dest_false_dep")
> > +       (if_then_else
> > +         (match_test "!MEM_P (operands[1]) || <mask3_dest_false_dep_attr>")
> > +         (const_string "false")
> > +         (const_string "true")))])
> >
> >  (define_insn "avx512f_vgetmant<mode><mask_scalar_name><round_saeonly_scalar_name>"
> >    [(set (match_operand:VFH_128 0 "register_operand" "=v")
> > @@ -26914,9 +27009,20 @@ (define_insn "avx512f_vgetmant<mode><mask_scalar_name><round_saeonly_scalar_name
> >           (match_dup 1)
> >           (const_int 1)))]
> >     "TARGET_AVX512F"
> > -   "vgetmant<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}";
> > +{
> > +  if (TARGET_DEST_FALSE_DEPENDENCY
> > +      && get_attr_dest_false_dep (insn) ==
> > +        DEST_FALSE_DEP_TRUE)
> > +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> > +  return "vgetmant<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}";
> > +}
> >     [(set_attr "prefix" "evex")
> > -   (set_attr "mode" "<ssescalarmode>")])
> > +   (set_attr "mode" "<ssescalarmode>")
> > +   (set (attr "dest_false_dep")
> > +       (if_then_else
> > +         (match_test "<mask_scalar4_dest_false_dep_attr>")
> > +         (const_string "false")
> > +         (const_string "true")))])
> >
> >  ;; The correct representation for this is absolutely enormous, and
> >  ;; surely not generally useful.
> > diff --git a/gcc/config/i386/subst.md b/gcc/config/i386/subst.md
> > index 21d445cc46c..802a8715b01 100644
> > --- a/gcc/config/i386/subst.md
> > +++ b/gcc/config/i386/subst.md
> > @@ -71,6 +71,11 @@ (define_subst_attr "bcst_mask_prefix3" "mask" "orig,maybe_evex" "evex,evex")
> >  (define_subst_attr "mask_prefix4" "mask" "orig,orig,vex" "evex,evex,evex")
> >  (define_subst_attr "bcst_mask_prefix4" "mask" "orig,orig,maybe_evex" "evex,evex,evex")
> >  (define_subst_attr "mask_expand_op3" "mask" "3" "5")
> > +(define_subst_attr "mask3_dest_false_dep_attr" "mask" "0" "operands[3] != CONST0_RTX(<MODE>mode)")
> > +(define_subst_attr "mask4_dest_false_dep_attr" "mask" "0" "operands[4] != CONST0_RTX(<MODE>mode)")
> > +(define_subst_attr "mask6_dest_false_dep_attr" "mask" "0" "operands[6] != CONST0_RTX(<MODE>mode)")
> > +(define_subst_attr "mask10_dest_false_dep_attr" "mask" "0" "operands[10] != CONST0_RTX(<MODE>mode)")
> > +(define_subst_attr "maskc_dest_false_dep_attr" "maskc" "0" "operands[3] != CONST0_RTX(<MODE>mode)")
> >
> >  (define_subst "mask"
> >    [(set (match_operand:SUBST_V 0)
> > @@ -337,6 +342,8 @@ (define_subst_attr "mask_scalarc_operand3" "mask_scalarc" "" "%{%4%}%N3")
> >  (define_subst_attr "mask_scalar_operand3" "mask_scalar" "" "%{%4%}%N3")
> >  (define_subst_attr "mask_scalar_operand4" "mask_scalar" "" "%{%5%}%N4")
> >  (define_subst_attr "mask_scalarcz_operand4" "mask_scalarcz" "" "%{%5%}%N4")
> > +(define_subst_attr "mask_scalar4_dest_false_dep_attr" "mask_scalar" "0" "operands[4] != CONST0_RTX(<MODE>mode)")
> > +(define_subst_attr "mask_scalarc_dest_false_dep_attr" "mask_scalarc" "0" "operands[3] != CONST0_RTX(V8HFmode)")
> >
> >  (define_subst "mask_scalar"
> >    [(set (match_operand:SUBST_V 0)
> > diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
> > index 0d3fd078068..1b42c96fc38 100644
> > --- a/gcc/config/i386/x86-tune.def
> > +++ b/gcc/config/i386/x86-tune.def
> > @@ -79,6 +79,11 @@ DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY,
> >           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
> >           | m_BDVER | m_ZNVER | m_ALDERLAKE | m_GENERIC)
> >
> > +/* X86_TUNE_DEST_FALSE_DEPENDENCY: This knob inserts zero-idiom before
> > +   several insns to break false dependency on the dest register.  */
> > +DEF_TUNE (X86_TUNE_DEST_FALSE_DEPENDENCY,
> > +         "dest_false_dependency", m_SAPPHIRERAPIDS | m_ALDERLAKE)
> > +
> >  /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
> >     are resolved on SSE register parts instead of whole registers, so we may
> >     maintain just lower part of scalar values in proper format leaving the
> > diff --git a/gcc/testsuite/gcc.target/i386/avx2-dest-false-dependency.c b/gcc/testsuite/gcc.target/i386/avx2-dest-false-dependency.c
> > new file mode 100644
> > index 00000000000..e138920ce18
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/avx2-dest-false-dependency.c
> > @@ -0,0 +1,24 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-mavx2 -mtune-ctrl=dest_false_dependency -O2" } */
> > +
> > +
> > +#include <immintrin.h>
> > +
> > +extern __m256i i1, i2, i3, i4;
> > +extern __m256d d1, d2;
> > +extern __m256 f1, f2;
> > +
> > +void vperm_test (void)
> > +{
> > +  i3 = _mm256_permutevar8x32_epi32 (i1, i2);
> > +  i4 = _mm256_permute4x64_epi64 (i1, 12);
> > +  d2 = _mm256_permute4x64_pd (d1, 12);
> > +  f2 = _mm256_permutevar8x32_ps (f1, i2);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "vxorps" 4 } } */
> > +/* { dg-final { scan-assembler-times "vpermd" 1 } } */
> > +/* { dg-final { scan-assembler-times "vpermq" 1 } } */
> > +/* { dg-final { scan-assembler-times "vpermpd" 1 } } */
> > +/* { dg-final { scan-assembler-times "vpermps" 1 } } */
> > +
> > diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-dest-false-dependency.c b/gcc/testsuite/gcc.target/i386/avx512dq-dest-false-dependency.c
> > new file mode 100644
> > index 00000000000..2feb58f2cd8
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/avx512dq-dest-false-dependency.c
> > @@ -0,0 +1,73 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-mavx512dq -mavx512vl -mtune-ctrl=dest_false_dependency -O2" } */
> > +
> > +#include <immintrin.h>
> > +
> > +extern __m512i i1;
> > +extern __m256i i2;
> > +extern __m128i i3;
> > +extern __m512d d1;
> > +extern __m256d d2;
> > +extern __m128d d3;
> > +extern __m512 f1;
> > +extern __m256 f2;
> > +extern __m128 f3;
> > +
> > +__mmask32 m32;
> > +__mmask16 m16;
> > +__mmask8 m8;
> > +
> > +void mullo_test (void)
> > +{
> > +  i1 = _mm512_mullo_epi64 (i1, i1);
> > +  i1 = _mm512_mask_mullo_epi64 (i1, m8, i1, i1);
> > +  i1 = _mm512_maskz_mullo_epi64 (m8, i1, i1);
> > +  i2 = _mm256_mullo_epi64 (i2, i2);
> > +  i2 = _mm256_mask_mullo_epi64 (i2, m8, i2, i2);
> > +  i2 = _mm256_maskz_mullo_epi64 (m8, i2, i2);
> > +  i3 = _mm_mullo_epi64 (i3, i3);
> > +  i3 = _mm_mask_mullo_epi64 (i3, m8, i3, i3);
> > +  i3 = _mm_maskz_mullo_epi64 (m8, i3, i3);
> > +}
> > +
> > +void range_test (void)
> > +{
> > +  d1 = _mm512_range_pd (d1, d1, 15);
> > +  d1 = _mm512_range_round_pd (d1, d1, 15, 8);
> > +  d1 = _mm512_mask_range_pd (d1, m8, d1, d1, 15);
> > +  d1 = _mm512_mask_range_round_pd (d1, m8, d1, d1, 15, 8);
> > +  d1 = _mm512_maskz_range_pd (m8, d1, d1, 15);
> > +  d1 = _mm512_maskz_range_round_pd (m8, d1, d1, 15, 8);
> > +  d2 = _mm256_range_pd (d2, d2, 15);
> > +  d2 = _mm256_mask_range_pd (d2, m8, d2, d2, 15);
> > +  d2 = _mm256_maskz_range_pd (m8, d2, d2, 15);
> > +  d3 = _mm_range_pd (d3, d3, 15);
> > +  d3 = _mm_mask_range_pd (d3, m8, d3, d3, 15);
> > +  d3 = _mm_maskz_range_pd (m8, d3, d3, 15);
> > +  d3 = _mm_range_sd (d3, d3, 15);
> > +  d3 = _mm_mask_range_sd (d3, m8, d3, d3, 15);
> > +  d3 = _mm_maskz_range_sd (m8, d3, d3, 15);
> > +
> > +  f1 = _mm512_range_ps (f1, f1, 15);
> > +  f1 = _mm512_range_round_ps (f1, f1, 15, 8);
> > +  f1 = _mm512_mask_range_ps (f1, m16, f1, f1, 15);
> > +  f1 = _mm512_mask_range_round_ps (f1, m16, f1, f1, 15, 8);
> > +  f1 = _mm512_maskz_range_ps (m16, f1, f1, 15);
> > +  f1 = _mm512_maskz_range_round_ps (m16, f1, f1, 15, 8);
> > +  f2 = _mm256_range_ps (f2, f2, 15);
> > +  f2 = _mm256_mask_range_ps (f2, m8, f2, f2, 15);
> > +  f2 = _mm256_maskz_range_ps (m8, f2, f2, 15);
> > +  f3 = _mm_range_ps (f3, f3, 15);
> > +  f3 = _mm_mask_range_ps (f3, m8, f3, f3, 15);
> > +  f3 = _mm_maskz_range_ps (m8, f3, f3, 15);
> > +  f3 = _mm_range_ss (f3, f3, 15);
> > +  f3 = _mm_mask_range_ss (f3, m8, f3, f3, 15);
> > +  f3 = _mm_maskz_range_ss (m8, f3, f3, 15);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "vxorps" 26 } } */
> > +/* { dg-final { scan-assembler-times "vpmullq" 9 } } */
> > +/* { dg-final { scan-assembler-times "vrangepd" 12 } } */
> > +/* { dg-final { scan-assembler-times "vrangesd" 3 } } */
> > +/* { dg-final { scan-assembler-times "vrangeps" 12 } } */
> > +/* { dg-final { scan-assembler-times "vrangess" 3 } } */
> > diff --git a/gcc/testsuite/gcc.target/i386/avx512f-dest-false-dependency.c b/gcc/testsuite/gcc.target/i386/avx512f-dest-false-dependency.c
> > new file mode 100644
> > index 00000000000..9650839970e
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/avx512f-dest-false-dependency.c
> > @@ -0,0 +1,102 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-mavx512f -mtune-ctrl=dest_false_dependency -O2" } */
> > +
> > +#include <immintrin.h>
> > +
> > +extern __m512i i1;
> > +extern __m512d d1, *pd1;
> > +extern __m128d d2;
> > +extern __m512 f1, *pf1;
> > +extern __m128 f2;
> > +
> > +__mmask16 m16;
> > +__mmask8 m8;
> > +
> > +void vperm_test (void)
> > +{
> > +  d1 = _mm512_permutex_pd (d1, 12);
> > +  d1 = _mm512_mask_permutex_pd (d1, m8, d1, 12);
> > +  d1 = _mm512_maskz_permutex_pd (m8, d1, 12);
> > +  d1 = _mm512_permutexvar_pd (i1, d1);
> > +  d1 = _mm512_mask_permutexvar_pd (d1, m8, i1, d1);
> > +  d1 = _mm512_maskz_permutexvar_pd (m8, i1, d1);
> > +
> > +  f1 = _mm512_permutexvar_ps (i1, f1);
> > +  f1 = _mm512_mask_permutexvar_ps (f1, m16, i1, f1);
> > +  f1 = _mm512_maskz_permutexvar_ps (m16, i1, f1);
> > +
> > +  i1 = _mm512_permutexvar_epi64 (i1, i1);
> > +  i1 = _mm512_mask_permutexvar_epi64 (i1, m8, i1, i1);
> > +  i1 = _mm512_maskz_permutexvar_epi64 (m8, i1, i1);
> > +  i1 = _mm512_permutex_epi64 (i1, 12);
> > +  i1 = _mm512_mask_permutex_epi64 (i1, m8, i1, 12);
> > +  i1 = _mm512_maskz_permutex_epi64 (m8, i1, 12);
> > +
> > +  i1 = _mm512_permutexvar_epi32 (i1, i1);
> > +  i1 = _mm512_mask_permutexvar_epi32 (i1, m16, i1, i1);
> > +  i1 = _mm512_maskz_permutexvar_epi32 (m16, i1, i1);
> > +}
> > +
> > +void getmant_test (void)
> > +{
> > +  d1 = _mm512_getmant_pd (*pd1, _MM_MANT_NORM_p75_1p5,
> > +                         _MM_MANT_SIGN_src);
> > +  d1 = _mm512_getmant_round_pd (*pd1, _MM_MANT_NORM_p75_1p5,
> > +                               _MM_MANT_SIGN_src, 8);
> > +  d1 = _mm512_mask_getmant_pd (d1, m8, *pd1, _MM_MANT_NORM_p75_1p5,
> > +                              _MM_MANT_SIGN_src);
> > +  d1 = _mm512_mask_getmant_round_pd (d1, m8, *pd1, _MM_MANT_NORM_p75_1p5,
> > +                                    _MM_MANT_SIGN_src, 8);
> > +  d1 = _mm512_maskz_getmant_pd (m8, *pd1, _MM_MANT_NORM_p75_1p5,
> > +                               _MM_MANT_SIGN_src);
> > +  d1 = _mm512_maskz_getmant_round_pd (m8, *pd1, _MM_MANT_NORM_p75_1p5,
> > +                                     _MM_MANT_SIGN_src, 8);
> > +  f1 = _mm512_getmant_ps (*pf1, _MM_MANT_NORM_p75_1p5,
> > +                         _MM_MANT_SIGN_src);
> > +  f1 = _mm512_getmant_round_ps (*pf1, _MM_MANT_NORM_p75_1p5,
> > +                               _MM_MANT_SIGN_src, 8);
> > +  f1 = _mm512_mask_getmant_ps (f1, m16, *pf1, _MM_MANT_NORM_p75_1p5,
> > +                              _MM_MANT_SIGN_src);
> > +  f1 = _mm512_mask_getmant_round_ps (f1, m16, *pf1, _MM_MANT_NORM_p75_1p5,
> > +                                    _MM_MANT_SIGN_src, 8);
> > +  f1 = _mm512_maskz_getmant_ps (m16, *pf1, _MM_MANT_NORM_p75_1p5,
> > +                               _MM_MANT_SIGN_src);
> > +  f1 = _mm512_maskz_getmant_round_ps (m16, *pf1, _MM_MANT_NORM_p75_1p5,
> > +                                     _MM_MANT_SIGN_src, 8);
> > +
> > +  d2 = _mm_getmant_sd (d2, d2, _MM_MANT_NORM_p75_1p5,
> > +                      _MM_MANT_SIGN_src);
> > +  d2 = _mm_getmant_round_sd (d2, d2, _MM_MANT_NORM_p75_1p5,
> > +                            _MM_MANT_SIGN_src, 8);
> > +  d2 = _mm_mask_getmant_sd (d2, m8, d2, d2, _MM_MANT_NORM_p75_1p5,
> > +                           _MM_MANT_SIGN_src);
> > +  d2 = _mm_mask_getmant_round_sd (d2, m8, d2, d2, _MM_MANT_NORM_p75_1p5,
> > +                                 _MM_MANT_SIGN_src, 8);
> > +  d2 = _mm_maskz_getmant_sd (m8, d2, d2, _MM_MANT_NORM_p75_1p5,
> > +                            _MM_MANT_SIGN_src);
> > +  d2 = _mm_maskz_getmant_round_sd (m8, d2, d2, _MM_MANT_NORM_p75_1p5,
> > +                                  _MM_MANT_SIGN_src, 8);
> > +  f2 = _mm_getmant_ss (f2, f2, _MM_MANT_NORM_p75_1p5,
> > +                      _MM_MANT_SIGN_src);
> > +  f2 = _mm_getmant_round_ss (f2, f2, _MM_MANT_NORM_p75_1p5,
> > +                            _MM_MANT_SIGN_src, 8);
> > +  f2 = _mm_mask_getmant_ss (f2, m8, f2, f2, _MM_MANT_NORM_p75_1p5,
> > +                           _MM_MANT_SIGN_src);
> > +  f2 = _mm_mask_getmant_round_ss (f2, m8, f2, f2, _MM_MANT_NORM_p75_1p5,
> > +                                 _MM_MANT_SIGN_src, 8);
> > +  f2 = _mm_maskz_getmant_ss (m8, f2, f2, _MM_MANT_NORM_p75_1p5,
> > +                            _MM_MANT_SIGN_src);
> > +  f2 = _mm_maskz_getmant_round_ss (m8, f2, f2, _MM_MANT_NORM_p75_1p5,
> > +                                  _MM_MANT_SIGN_src, 8);
> > +
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "vxorps" 24 } } */
> > +/* { dg-final { scan-assembler-times "vpermd" 3 } } */
> > +/* { dg-final { scan-assembler-times "vpermq" 6 } } */
> > +/* { dg-final { scan-assembler-times "vpermps" 3 } } */
> > +/* { dg-final { scan-assembler-times "vpermpd" 6 } } */
> > +/* { dg-final { scan-assembler-times "vgetmantpd" 6 } } */
> > +/* { dg-final { scan-assembler-times "vgetmantps" 6 } } */
> > +/* { dg-final { scan-assembler-times "vgetmantsd" 6 } } */
> > +/* { dg-final { scan-assembler-times "vgetmantss" 6 } } */
> > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-dest-false-dependency.c b/gcc/testsuite/gcc.target/i386/avx512fp16-dest-false-dependency.c
> > new file mode 100644
> > index 00000000000..793bb66201b
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-dest-false-dependency.c
> > @@ -0,0 +1,45 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-mavx512fp16 -mavx512vl -mtune-ctrl=dest_false_dependency -O2" } */
> > +
> > +#include <immintrin.h>
> > +
> > +extern __m512h h1;
> > +extern __m256h h2;
> > +extern __m128h h3;
> > +
> > +__mmask32 m32;
> > +__mmask16 m16;
> > +__mmask8 m8;
> > +
> > +void complex_mul_test (void)
> > +{
> > +  h1 = _mm512_fmul_pch (h1, h1);
> > +  h1 = _mm512_fmul_round_pch (h1, h1, 8);
> > +  h1 = _mm512_mask_fmul_pch (h1, m32, h1, h1);
> > +  h1 = _mm512_mask_fmul_round_pch (h1, m32, h1, h1, 8);
> > +  h1 = _mm512_maskz_fmul_pch (m32, h1, h1);
> > +  h1 = _mm512_maskz_fmul_round_pch (m32, h1, h1, 11);
> > +
> > +  h3 = _mm_fmul_sch (h3, h3);
> > +  h3 = _mm_fmul_round_sch (h3, h3, 8);
> > +  h3 = _mm_mask_fmul_sch (h3, m8, h3, h3);
> > +  h3 = _mm_mask_fmul_round_sch (h3, m8, h3, h3, 8);
> > +  h3 = _mm_maskz_fmul_sch (m8, h3, h3);
> > +  h3 = _mm_maskz_fmul_round_sch (m8, h3, h3, 11);
> > +}
> > +
> > +void vgetmant_test (void)
> > +{
> > +  h3 = _mm_getmant_sh (h3, h3, _MM_MANT_NORM_p75_1p5,
> > +                      _MM_MANT_SIGN_src);
> > +  h3 = _mm_mask_getmant_sh (h3, m8, h3, h3, _MM_MANT_NORM_p75_1p5,
> > +                           _MM_MANT_SIGN_src);
> > +  h3 = _mm_maskz_getmant_sh (m8, h3, h3, _MM_MANT_NORM_p75_1p5,
> > +                            _MM_MANT_SIGN_src);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "vxorps" 10 } } */
> > +/* { dg-final { scan-assembler-times "vfmulcph" 6 } } */
> > +/* { dg-final { scan-assembler-times "vfmulcsh" 6 } } */
> > +/* { dg-final { scan-assembler-times "vgetmantsh" 3 } } */
> > +
> > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16vl-dest-false-dependency.c b/gcc/testsuite/gcc.target/i386/avx512fp16vl-dest-false-dependency.c
> > new file mode 100644
> > index 00000000000..09658905d2d
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16vl-dest-false-dependency.c
> > @@ -0,0 +1,24 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-mavx512fp16 -mavx512vl -mtune-ctrl=dest_false_dependency -O2" } */
> > +
> > +#include <immintrin.h>
> > +
> > +extern __m256h h1;
> > +extern __m128h h2;
> > +
> > +__mmask16 m16;
> > +__mmask8 m8;
> > +
> > +void complex_mul_test (void)
> > +{
> > +  h1 = _mm256_fmul_pch (h1, h1);
> > +  h1 = _mm256_mask_fmul_pch (h1, m16, h1, h1);
> > +  h1 = _mm256_maskz_fmul_pch (m16, h1, h1);
> > +  h2 = _mm_fmul_pch (h2, h2);
> > +  h2 = _mm_mask_fmul_pch (h2, m16, h2, h2);
> > +  h2 = _mm_maskz_fmul_pch (m16, h2, h2);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "vxorps" 4 } } */
> > +/* { dg-final { scan-assembler-times "vfmulcph" 6 } } */
> > +
> > diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-dest-false-dependency.c b/gcc/testsuite/gcc.target/i386/avx512vl-dest-false-dependency.c
> > new file mode 100644
> > index 00000000000..92717a99837
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/avx512vl-dest-false-dependency.c
> > @@ -0,0 +1,76 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-mavx512f -mavx512vl -mtune-ctrl=dest_false_dependency -O2" } */
> > +
> > +
> > +#include <immintrin.h>
> > +
> > +extern __m256i i1;
> > +extern __m256d d1, *pd1;
> > +extern __m128d d2, *pd2;
> > +extern __m256 f1, *pf1;
> > +extern __m128 f2, *pf2;
> > +
> > +__mmask16 m16;
> > +__mmask8 m8;
> > +
> > +void vperm_test (void)
> > +{
> > +  d1 = _mm256_permutex_pd (d1, 12);
> > +  d1 = _mm256_mask_permutex_pd (d1, m8, d1, 12);
> > +  d1 = _mm256_maskz_permutex_pd (m8, d1, 12);
> > +  d1 = _mm256_permutexvar_pd (i1, d1);
> > +  d1 = _mm256_mask_permutexvar_pd (d1, m8, i1, d1);
> > +  d1 = _mm256_maskz_permutexvar_pd (m8, i1, d1);
> > +
> > +  f1 = _mm256_permutexvar_ps (i1, f1);
> > +  f1 = _mm256_mask_permutexvar_ps (f1, m8, i1, f1);
> > +  f1 = _mm256_maskz_permutexvar_ps (m8, i1, f1);
> > +
> > +  i1 = _mm256_permutexvar_epi64 (i1, i1);
> > +  i1 = _mm256_mask_permutexvar_epi64 (i1, m8, i1, i1);
> > +  i1 = _mm256_maskz_permutexvar_epi64 (m8, i1, i1);
> > +  i1 = _mm256_permutex_epi64 (i1, 12);
> > +  i1 = _mm256_mask_permutex_epi64 (i1, m8, i1, 12);
> > +  i1 = _mm256_maskz_permutex_epi64 (m8, i1, 12);
> > +
> > +  i1 = _mm256_permutexvar_epi32 (i1, i1);
> > +  i1 = _mm256_mask_permutexvar_epi32 (i1, m8, i1, i1);
> > +  i1 = _mm256_maskz_permutexvar_epi32 (m8, i1, i1);
> > +}
> > +
> > +void getmant_test (void)
> > +{
> > +  d1 = _mm256_getmant_pd (*pd1, _MM_MANT_NORM_p75_1p5,
> > +                         _MM_MANT_SIGN_src);
> > +  d1 = _mm256_mask_getmant_pd (d1, m8, *pd1, _MM_MANT_NORM_p75_1p5,
> > +                              _MM_MANT_SIGN_src);
> > +  d1 = _mm256_maskz_getmant_pd (m8, *pd1, _MM_MANT_NORM_p75_1p5,
> > +                               _MM_MANT_SIGN_src);
> > +  d2 = _mm_getmant_pd (*pd2, _MM_MANT_NORM_p75_1p5,
> > +                      _MM_MANT_SIGN_src);
> > +  d2 = _mm_mask_getmant_pd (d2, m8, *pd2, _MM_MANT_NORM_p75_1p5,
> > +                           _MM_MANT_SIGN_src);
> > +  d2 = _mm_maskz_getmant_pd (m8, *pd2, _MM_MANT_NORM_p75_1p5,
> > +                            _MM_MANT_SIGN_src);
> > +  f1 = _mm256_getmant_ps (*pf1, _MM_MANT_NORM_p75_1p5,
> > +                         _MM_MANT_SIGN_src);
> > +  f1 = _mm256_mask_getmant_ps (f1, m8, *pf1, _MM_MANT_NORM_p75_1p5,
> > +                              _MM_MANT_SIGN_src);
> > +  f1 = _mm256_maskz_getmant_ps (m8, *pf1, _MM_MANT_NORM_p75_1p5,
> > +                               _MM_MANT_SIGN_src);
> > +  f2 = _mm_getmant_ps (*pf2, _MM_MANT_NORM_p75_1p5,
> > +                      _MM_MANT_SIGN_src);
> > +  f2 = _mm_mask_getmant_ps (f2, m8, *pf2, _MM_MANT_NORM_p75_1p5,
> > +                           _MM_MANT_SIGN_src);
> > +  f2 = _mm_maskz_getmant_ps (m8, *pf2, _MM_MANT_NORM_p75_1p5,
> > +                            _MM_MANT_SIGN_src);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "vxorps" 20 } } */
> > +/* { dg-final { scan-assembler-times "vpermpd" 6 } } */
> > +/* { dg-final { scan-assembler-times "vpermps" 3 } } */
> > +/* { dg-final { scan-assembler-times "vpermq" 6 } } */
> > +/* { dg-final { scan-assembler-times "vpermd" 3 } } */
> > +/* { dg-final { scan-assembler-times "vgetmantpd" 6 } } */
> > +/* { dg-final { scan-assembler-times "vgetmantps" 6 } } */
> > +
> > --
> > 2.18.1
> >

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] [i386] GLC tuning: Break false dependency for dest register.
  2022-01-14  5:38   ` Hongyu Wang
@ 2022-01-14  6:03     ` Hongyu Wang
  2022-01-14  8:37       ` Uros Bizjak
  2022-01-14  8:17     ` [PATCH] [i386] GLC tuning: Break false dependency for dest register Uros Bizjak
  1 sibling, 1 reply; 16+ messages in thread
From: Hongyu Wang @ 2022-01-14  6:03 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: Hongyu Wang, Hongtao Liu, gcc-patches

> > No, the approach is wrong. You have to solve output clearing on RTL
> > level, please look at how e.g. tzcnt false dep is solved:
>
> Actually, we considered such an approach before, but we found that we
> would need to break up the original define_insn to remove the
> mask/rounding subst, since define_split cannot use subst, and that would
> add 6 more define_insn_and_split and 4 define_insn for each instruction.
> We think such an approach would introduce too much redundant code.
>
> Do you think the code size increase is acceptable?

Also, adding 100+ more patterns increases the maintenance effort. If we
split them at the epilogue_completed stage, it does not seem much
different from putting the clearing in the output template...

Hongyu Wang <wwwhhhyyy333@gmail.com> 于2022年1月14日周五 13:38写道:
>
> > No, the approach is wrong. You have to solve output clearing on RTL
> > level, please look at how e.g. tzcnt false dep is solved:
>
> Actually, we considered such an approach before, but we found that we
> would need to break up the original define_insn to remove the
> mask/rounding subst, since define_split cannot use subst, and that would
> add 6 more define_insn_and_split and 4 define_insn for each instruction.
> We think such an approach would introduce too much redundant code.
>
> Do you think the code size increase is acceptable?
>
> Uros Bizjak via Gcc-patches <gcc-patches@gcc.gnu.org> 于2022年1月13日周四 15:42写道:
> >
> > On Thu, Jan 13, 2022 at 8:28 AM Hongyu Wang <hongyu.wang@intel.com> wrote:
> > >
> > > From: wwwhhhyyy <hongyu.wang@intel.com>
> > >
> > > Hi,
> > >
> > > For GoldenCove micro-architecture, force insert zero-idiom in asm
> > > template to break false dependency of dest register for several insns.
> > >
> > > The related insns are:
> > >
> > > VPERM/D/Q/PS/PD
> > > VRANGEPD/PS/SD/SS
> > > VGETMANTSS/SD/SH
> > > > VGETMANTPS/PD - mem version only
> > > VPMULLQ
> > > VFMULCSH/PH
> > > VFCMULCSH/PH
> > >
> > > > Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,}
> > >
> > > Ok for master?
> >
> > No, the approach is wrong. You have to solve output clearing on RTL
> > level, please look at how e.g. tzcnt false dep is solved:
> >
> >   [(set (reg:CCC FLAGS_REG)
> >     (compare:CCC (match_operand:SWI48 1 "nonimmediate_operand" "rm")
> >              (const_int 0)))
> >    (set (match_operand:SWI48 0 "register_operand" "=r")
> >     (ctz:SWI48 (match_dup 1)))]
> >   "TARGET_BMI"
> >   "tzcnt{<imodesuffix>}\t{%1, %0|%0, %1}";
> >   "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
> >    && optimize_function_for_speed_p (cfun)
> >    && !reg_mentioned_p (operands[0], operands[1])"
> >   [(parallel
> >     [(set (reg:CCC FLAGS_REG)
> >       (compare:CCC (match_dup 1) (const_int 0)))
> >      (set (match_dup 0)
> >       (ctz:SWI48 (match_dup 1)))
> >      (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
> >   "ix86_expand_clear (operands[0]);"
> >   [(set_attr "type" "alu1")
> >    (set_attr "prefix_0f" "1")
> >    (set_attr "prefix_rep" "1")
> >    (set_attr "btver2_decode" "double")
> >    (set_attr "mode" "<MODE>")])
> >
> > > For TARGET_AVOID_FALSE_DEP_FOR_BMI, we split after epilogue_completed, when
> > > insn registers are stable, and use ix86_expand_clear to clear the output
> > > operand. Please also note how the final insn is tagged with
> > > UNSPEC_INSN_FALSE_DEP to prevent combine from recognizing it too early.
> >
> > Uros.
> >
> > >
> > > gcc/ChangeLog:
> > >
> > >         * config/i386/i386.h (TARGET_DEST_FALSE_DEPENDENCY): New macro.
> > >         * config/i386/i386.md (dest_false_dep): New define_attr.
> > >         * config/i386/sse.md (<avx512>_<complexopname>_<mode><maskc_name><round_name>):
> > >         Insert zero-idiom in output template when attr enabled, set new attribute to
> > >         true for non-mask/maskz insn.
> > >         (avx512fp16_<complexopname>sh_v8hf<mask_scalarc_name><round_scalarcz_name>):
> > >         Likewise.
> > >         (avx512dq_mul<mode>3<mask_name>): Likewise.
> > >         (<avx2_avx512>_permvar<mode><mask_name>): Likewise.
> > >         (avx2_perm<mode>_1<mask_name>): Likewise.
> > >         (avx512f_perm<mode>_1<mask_name>): Likewise.
> > >         (avx512dq_rangep<mode><mask_name><round_saeonly_name>): Likewise.
> > >         (avx512dq_ranges<mode><mask_scalar_name><round_saeonly_scalar_name>):
> > >         Likewise.
> > >         (<avx512>_getmant<mode><mask_name><round_saeonly_name>): Likewise.
> > >         (avx512f_vgetmant<mode><mask_scalar_name><round_saeonly_scalar_name>):
> > >         Likewise.
> > >         * config/i386/subst.md (mask3_dest_false_dep_attr): New subst_attr.
> > >         (mask4_dest_false_dep_attr): Likewise.
> > >         (mask6_dest_false_dep_attr): Likewise.
> > >         (mask10_dest_false_dep_attr): Likewise.
> > >         (maskc_dest_false_dep_attr): Likewise.
> > >         (mask_scalar4_dest_false_dep_attr): Likewise.
> > >         (mask_scalarc_dest_false_dep_attr): Likewise.
> > >         * config/i386/x86-tune.def (X86_TUNE_DEST_FALSE_DEPENDENCY): New
> > >         DEF_TUNE enabled for m_SAPPHIRERAPIDS and m_ALDERLAKE
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >         * gcc.target/i386/avx2-dest-false-dependency.c: New test.
> > >         * gcc.target/i386/avx512dq-dest-false-dependency.c: Ditto.
> > >         * gcc.target/i386/avx512f-dest-false-dependency.c: Ditto.
> > >         * gcc.target/i386/avx512fp16-dest-false-dependency.c: Ditto.
> > >         * gcc.target/i386/avx512fp16vl-dest-false-dependency.c: Ditto.
> > >         * gcc.target/i386/avx512vl-dest-false-dependency.c: Ditto.
> > > ---
> > >  gcc/config/i386/i386.h                        |   2 +
> > >  gcc/config/i386/i386.md                       |   4 +
> > >  gcc/config/i386/sse.md                        | 142 +++++++++++++++---
> > >  gcc/config/i386/subst.md                      |   7 +
> > >  gcc/config/i386/x86-tune.def                  |   5 +
> > >  .../i386/avx2-dest-false-dependency.c         |  24 +++
> > >  .../i386/avx512dq-dest-false-dependency.c     |  73 +++++++++
> > >  .../i386/avx512f-dest-false-dependency.c      | 102 +++++++++++++
> > >  .../i386/avx512fp16-dest-false-dependency.c   |  45 ++++++
> > >  .../i386/avx512fp16vl-dest-false-dependency.c |  24 +++
> > >  .../i386/avx512vl-dest-false-dependency.c     |  76 ++++++++++
> > >  11 files changed, 486 insertions(+), 18 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/avx2-dest-false-dependency.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512dq-dest-false-dependency.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-dest-false-dependency.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-dest-false-dependency.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16vl-dest-false-dependency.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-dest-false-dependency.c
> > >
> > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> > > index 3ac0f698ae2..ddbf6b9825a 100644
> > > --- a/gcc/config/i386/i386.h
> > > +++ b/gcc/config/i386/i386.h
> > > @@ -429,6 +429,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
> > >         ix86_tune_features[X86_TUNE_EXPAND_ABS]
> > >  #define TARGET_V2DF_REDUCTION_PREFER_HADDPD \
> > >         ix86_tune_features[X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD]
> > > +#define TARGET_DEST_FALSE_DEPENDENCY \
> > > +       ix86_tune_features[X86_TUNE_DEST_FALSE_DEPENDENCY]
> > >
> > >  /* Feature tests against the various architecture variations.  */
> > >  enum ix86_arch_indices {
> > > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> > > index 9937643a273..40a2b580740 100644
> > > --- a/gcc/config/i386/i386.md
> > > +++ b/gcc/config/i386/i386.md
> > > @@ -823,6 +823,10 @@ (define_attr "i387_cw" "roundeven,floor,ceil,trunc,uninitialized,any"
> > >  (define_attr "avx_partial_xmm_update" "false,true"
> > >    (const_string "false"))
> > >
> > > > +;; Define attribute to indicate insns with false dependency on dest register
> > > +(define_attr "dest_false_dep" "false,true"
> > > + (const_string "false"))
> > > +
> > >  ;; Define attribute to classify add/sub insns that consumes carry flag (CF)
> > >  (define_attr "use_carry" "0,1" (const_string "0"))
> > >
> > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > > index 0864748875e..c8dace5b2f8 100644
> > > --- a/gcc/config/i386/sse.md
> > > +++ b/gcc/config/i386/sse.md
> > > @@ -6536,9 +6536,20 @@ (define_insn "<avx512>_<complexopname>_<mode><maskc_name><round_name>"
> > >              (match_operand:VF_AVX512FP16VL 2 "nonimmediate_operand" "<round_constraint>")]
> > >              UNSPEC_COMPLEX_F_C_MUL))]
> > >    "TARGET_AVX512FP16 && <round_mode512bit_condition>"
> > > -  "v<complexopname><ssemodesuffix>\t{<round_maskc_op3>%2, %1, %0<maskc_operand3>|%0<maskc_operand3>, %1, %2<round_maskc_op3>}"
> > > +{
> > > +  if (TARGET_DEST_FALSE_DEPENDENCY
> > > +      && get_attr_dest_false_dep (insn) ==
> > > +        DEST_FALSE_DEP_TRUE)
> > > +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> > > +  return "v<complexopname><ssemodesuffix>\t{<round_maskc_op3>%2, %1, %0<maskc_operand3>|%0<maskc_operand3>, %1, %2<round_maskc_op3>}";
> > > +}
> > >    [(set_attr "type" "ssemul")
> > > -   (set_attr "mode" "<MODE>")])
> > > +   (set_attr "mode" "<MODE>")
> > > +   (set (attr "dest_false_dep")
> > > +       (if_then_else
> > > +         (match_test "<maskc_dest_false_dep_attr>")
> > > +         (const_string "false")
> > > +         (const_string "true")))])
> > >
> > >  (define_expand "avx512fp16_fmaddcsh_v8hf_maskz<round_expand_name>"
> > >    [(match_operand:V8HF 0 "register_operand")
> > > @@ -6742,9 +6753,20 @@ (define_insn "avx512fp16_<complexopname>sh_v8hf<mask_scalarc_name><round_scalarc
> > >             (match_dup 1)
> > >             (const_int 3)))]
> > >    "TARGET_AVX512FP16"
> > > -  "v<complexopname>sh\t{<round_scalarc_mask_op3>%2, %1, %0<mask_scalarc_operand3>|%0<mask_scalarc_operand3>, %1, %2<round_scalarc_mask_op3>}"
> > > +{
> > > +  if (TARGET_DEST_FALSE_DEPENDENCY
> > > +      && get_attr_dest_false_dep (insn) ==
> > > +        DEST_FALSE_DEP_TRUE)
> > > +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> > > +  return "v<complexopname>sh\t{<round_scalarc_mask_op3>%2, %1, %0<mask_scalarc_operand3>|%0<mask_scalarc_operand3>, %1, %2<round_scalarc_mask_op3>}";
> > > +}
> > >    [(set_attr "type" "ssemul")
> > > -   (set_attr "mode" "V8HF")])
> > > +   (set_attr "mode" "V8HF")
> > > +   (set (attr "dest_false_dep")
> > > +       (if_then_else
> > > +         (match_test "<mask_scalarc_dest_false_dep_attr>")
> > > +         (const_string "false")
> > > +         (const_string "true")))])
> > >
> > >  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> > >  ;;
> > > @@ -15207,10 +15229,21 @@ (define_insn "avx512dq_mul<mode>3<mask_name>"
> > >           (match_operand:VI8_AVX512VL 2 "bcst_vector_operand" "vmBr")))]
> > >    "TARGET_AVX512DQ && <mask_mode512bit_condition>
> > >    && ix86_binary_operator_ok (MULT, <MODE>mode, operands)"
> > > -  "vpmullq\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
> > > +{
> > > +  if (TARGET_DEST_FALSE_DEPENDENCY
> > > +      && get_attr_dest_false_dep (insn) ==
> > > +        DEST_FALSE_DEP_TRUE)
> > > +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> > > +  return "vpmullq\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}";
> > > +}
> > >    [(set_attr "type" "sseimul")
> > >     (set_attr "prefix" "evex")
> > > -   (set_attr "mode" "<sseinsnmode>")])
> > > +   (set_attr "mode" "<sseinsnmode>")
> > > +   (set (attr "dest_false_dep")
> > > +       (if_then_else
> > > +         (match_test "<mask3_dest_false_dep_attr>")
> > > +         (const_string "false")
> > > +         (const_string "true")))])
> > >
> > >  (define_expand "cond_mul<mode>"
> > >    [(set (match_operand:VI4_AVX512VL 0 "register_operand")
> > > @@ -24636,10 +24669,21 @@ (define_insn "<avx2_avx512>_permvar<mode><mask_name>"
> > >            (match_operand:<sseintvecmode> 2 "register_operand" "v")]
> > >           UNSPEC_VPERMVAR))]
> > >    "TARGET_AVX2 && <mask_mode512bit_condition>"
> > > -  "vperm<ssemodesuffix>\t{%1, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1}"
> > > +{
> > > +  if (TARGET_DEST_FALSE_DEPENDENCY
> > > +      && get_attr_dest_false_dep (insn) ==
> > > +        DEST_FALSE_DEP_TRUE)
> > > +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> > > +  return "vperm<ssemodesuffix>\t{%1, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1}";
> > > +}
> > >    [(set_attr "type" "sselog")
> > >     (set_attr "prefix" "<mask_prefix2>")
> > > -   (set_attr "mode" "<sseinsnmode>")])
> > > +   (set_attr "mode" "<sseinsnmode>")
> > > +   (set (attr "dest_false_dep")
> > > +       (if_then_else
> > > +         (match_test "<mask3_dest_false_dep_attr>")
> > > +         (const_string "false")
> > > +         (const_string "true")))])
> > >
> > >  (define_insn "<avx512>_permvar<mode><mask_name>"
> > >    [(set (match_operand:VI1_AVX512VL 0 "register_operand" "=v")
> > > @@ -24873,11 +24917,20 @@ (define_insn "avx2_perm<mode>_1<mask_name>"
> > >    mask |= INTVAL (operands[4]) << 4;
> > >    mask |= INTVAL (operands[5]) << 6;
> > >    operands[2] = GEN_INT (mask);
> > > +  if (TARGET_DEST_FALSE_DEPENDENCY
> > > +      && get_attr_dest_false_dep (insn) ==
> > > +        DEST_FALSE_DEP_TRUE)
> > > +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> > >    return "vperm<ssemodesuffix>\t{%2, %1, %0<mask_operand6>|%0<mask_operand6>, %1, %2}";
> > >  }
> > >    [(set_attr "type" "sselog")
> > >     (set_attr "prefix" "<mask_prefix2>")
> > > -   (set_attr "mode" "<sseinsnmode>")])
> > > +   (set_attr "mode" "<sseinsnmode>")
> > > +   (set (attr "dest_false_dep")
> > > +       (if_then_else
> > > +         (match_test "<mask6_dest_false_dep_attr>")
> > > +         (const_string "false")
> > > +         (const_string "true")))])
> > >
> > >  (define_expand "avx512f_perm<mode>"
> > >    [(match_operand:V8FI 0 "register_operand")
> > > @@ -24944,11 +24997,20 @@ (define_insn "avx512f_perm<mode>_1<mask_name>"
> > >    mask |= INTVAL (operands[4]) << 4;
> > >    mask |= INTVAL (operands[5]) << 6;
> > >    operands[2] = GEN_INT (mask);
> > > +  if (TARGET_DEST_FALSE_DEPENDENCY
> > > +      && get_attr_dest_false_dep (insn) ==
> > > +        DEST_FALSE_DEP_TRUE)
> > > +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> > >    return "vperm<ssemodesuffix>\t{%2, %1, %0<mask_operand10>|%0<mask_operand10>, %1, %2}";
> > >  }
> > >    [(set_attr "type" "sselog")
> > >     (set_attr "prefix" "<mask_prefix2>")
> > > -   (set_attr "mode" "<sseinsnmode>")])
> > > +   (set_attr "mode" "<sseinsnmode>")
> > > +   (set (attr "dest_false_dep")
> > > +       (if_then_else
> > > +         (match_test "<mask10_dest_false_dep_attr>")
> > > +         (const_string "false")
> > > +         (const_string "true")))])
> > >
> > >  (define_insn "avx2_permv2ti"
> > >    [(set (match_operand:V4DI 0 "register_operand" "=x")
> > > @@ -26843,10 +26905,21 @@ (define_insn "avx512dq_rangep<mode><mask_name><round_saeonly_name>"
> > >            (match_operand:SI 3 "const_0_to_15_operand")]
> > >           UNSPEC_RANGE))]
> > >    "TARGET_AVX512DQ && <round_saeonly_mode512bit_condition>"
> > > -  "vrange<ssemodesuffix>\t{%3, <round_saeonly_mask_op4>%2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2<round_saeonly_mask_op4>, %3}"
> > > +{
> > > +  if (TARGET_DEST_FALSE_DEPENDENCY
> > > +      && get_attr_dest_false_dep (insn) ==
> > > +        DEST_FALSE_DEP_TRUE)
> > > +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> > > +  return "vrange<ssemodesuffix>\t{%3, <round_saeonly_mask_op4>%2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2<round_saeonly_mask_op4>, %3}";
> > > +}
> > >    [(set_attr "type" "sse")
> > >     (set_attr "prefix" "evex")
> > > -   (set_attr "mode" "<MODE>")])
> > > +   (set_attr "mode" "<MODE>")
> > > +   (set (attr "dest_false_dep")
> > > +       (if_then_else
> > > +         (match_test "<mask4_dest_false_dep_attr>")
> > > +         (const_string "false")
> > > +         (const_string "true")))])
> > >
> > >  (define_insn "avx512dq_ranges<mode><mask_scalar_name><round_saeonly_scalar_name>"
> > >    [(set (match_operand:VF_128 0 "register_operand" "=v")
> > > @@ -26859,10 +26932,21 @@ (define_insn "avx512dq_ranges<mode><mask_scalar_name><round_saeonly_scalar_name>
> > >           (match_dup 1)
> > >           (const_int 1)))]
> > >    "TARGET_AVX512DQ"
> > > -  "vrange<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}"
> > > +{
> > > +  if (TARGET_DEST_FALSE_DEPENDENCY
> > > +      && get_attr_dest_false_dep (insn) ==
> > > +        DEST_FALSE_DEP_TRUE)
> > > +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> > > +  return "vrange<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}";
> > > +}
> > >    [(set_attr "type" "sse")
> > >     (set_attr "prefix" "evex")
> > > -   (set_attr "mode" "<MODE>")])
> > > +   (set_attr "mode" "<MODE>")
> > > +   (set (attr "dest_false_dep")
> > > +       (if_then_else
> > > +         (match_test "<mask_scalar4_dest_false_dep_attr>")
> > > +         (const_string "false")
> > > +         (const_string "true")))])
> > >
> > >  (define_insn "avx512dq_fpclass<mode><mask_scalar_merge_name>"
> > >    [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
> > > @@ -26899,9 +26983,20 @@ (define_insn "<avx512>_getmant<mode><mask_name><round_saeonly_name>"
> > >            (match_operand:SI 2 "const_0_to_15_operand")]
> > >           UNSPEC_GETMANT))]
> > >    "TARGET_AVX512F"
> > > -  "vgetmant<ssemodesuffix>\t{%2, <round_saeonly_mask_op3>%1, %0<mask_operand3>|%0<mask_operand3>, %1<round_saeonly_mask_op3>, %2}";
> > > +{
> > > +  if (TARGET_DEST_FALSE_DEPENDENCY
> > > +      && get_attr_dest_false_dep (insn) ==
> > > +        DEST_FALSE_DEP_TRUE)
> > > +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> > > +  return "vgetmant<ssemodesuffix>\t{%2, <round_saeonly_mask_op3>%1, %0<mask_operand3>|%0<mask_operand3>, %1<round_saeonly_mask_op3>, %2}";
> > > +}
> > >    [(set_attr "prefix" "evex")
> > > -   (set_attr "mode" "<MODE>")])
> > > +   (set_attr "mode" "<MODE>")
> > > +   (set (attr "dest_false_dep")
> > > +       (if_then_else
> > > +         (match_test "!MEM_P (operands[1]) || <mask3_dest_false_dep_attr>")
> > > +         (const_string "false")
> > > +         (const_string "true")))])
> > >
> > >  (define_insn "avx512f_vgetmant<mode><mask_scalar_name><round_saeonly_scalar_name>"
> > >    [(set (match_operand:VFH_128 0 "register_operand" "=v")
> > > @@ -26914,9 +27009,20 @@ (define_insn "avx512f_vgetmant<mode><mask_scalar_name><round_saeonly_scalar_name
> > >           (match_dup 1)
> > >           (const_int 1)))]
> > >     "TARGET_AVX512F"
> > > -   "vgetmant<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}";
> > > +{
> > > +  if (TARGET_DEST_FALSE_DEPENDENCY
> > > +      && get_attr_dest_false_dep (insn) ==
> > > +        DEST_FALSE_DEP_TRUE)
> > > +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> > > +  return "vgetmant<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}";
> > > +}
> > >     [(set_attr "prefix" "evex")
> > > -   (set_attr "mode" "<ssescalarmode>")])
> > > +   (set_attr "mode" "<ssescalarmode>")
> > > +   (set (attr "dest_false_dep")
> > > +       (if_then_else
> > > +         (match_test "<mask_scalar4_dest_false_dep_attr>")
> > > +         (const_string "false")
> > > +         (const_string "true")))])
> > >
> > >  ;; The correct representation for this is absolutely enormous, and
> > >  ;; surely not generally useful.
> > > diff --git a/gcc/config/i386/subst.md b/gcc/config/i386/subst.md
> > > index 21d445cc46c..802a8715b01 100644
> > > --- a/gcc/config/i386/subst.md
> > > +++ b/gcc/config/i386/subst.md
> > > @@ -71,6 +71,11 @@ (define_subst_attr "bcst_mask_prefix3" "mask" "orig,maybe_evex" "evex,evex")
> > >  (define_subst_attr "mask_prefix4" "mask" "orig,orig,vex" "evex,evex,evex")
> > >  (define_subst_attr "bcst_mask_prefix4" "mask" "orig,orig,maybe_evex" "evex,evex,evex")
> > >  (define_subst_attr "mask_expand_op3" "mask" "3" "5")
> > > +(define_subst_attr "mask3_dest_false_dep_attr" "mask" "0" "operands[3] != CONST0_RTX(<MODE>mode)")
> > > +(define_subst_attr "mask4_dest_false_dep_attr" "mask" "0" "operands[4] != CONST0_RTX(<MODE>mode)")
> > > +(define_subst_attr "mask6_dest_false_dep_attr" "mask" "0" "operands[6] != CONST0_RTX(<MODE>mode)")
> > > +(define_subst_attr "mask10_dest_false_dep_attr" "mask" "0" "operands[10] != CONST0_RTX(<MODE>mode)")
> > > +(define_subst_attr "maskc_dest_false_dep_attr" "maskc" "0" "operands[3] != CONST0_RTX(<MODE>mode)")
> > >
> > >  (define_subst "mask"
> > >    [(set (match_operand:SUBST_V 0)
> > > @@ -337,6 +342,8 @@ (define_subst_attr "mask_scalarc_operand3" "mask_scalarc" "" "%{%4%}%N3")
> > >  (define_subst_attr "mask_scalar_operand3" "mask_scalar" "" "%{%4%}%N3")
> > >  (define_subst_attr "mask_scalar_operand4" "mask_scalar" "" "%{%5%}%N4")
> > >  (define_subst_attr "mask_scalarcz_operand4" "mask_scalarcz" "" "%{%5%}%N4")
> > > +(define_subst_attr "mask_scalar4_dest_false_dep_attr" "mask_scalar" "0" "operands[4] != CONST0_RTX(<MODE>mode)")
> > > +(define_subst_attr "mask_scalarc_dest_false_dep_attr" "mask_scalarc" "0" "operands[3] != CONST0_RTX(V8HFmode)")
> > >
> > >  (define_subst "mask_scalar"
> > >    [(set (match_operand:SUBST_V 0)
> > > diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
> > > index 0d3fd078068..1b42c96fc38 100644
> > > --- a/gcc/config/i386/x86-tune.def
> > > +++ b/gcc/config/i386/x86-tune.def
> > > @@ -79,6 +79,11 @@ DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY,
> > >           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
> > >           | m_BDVER | m_ZNVER | m_ALDERLAKE | m_GENERIC)
> > >
> > > +/* X86_TUNE_DEST_FALSE_DEPENDENCY: This knob inserts zero-idiom before
> > > +   several insns to break false dependency on the dest register.  */
> > > +DEF_TUNE (X86_TUNE_DEST_FALSE_DEPENDENCY,
> > > +         "dest_false_dependency", m_SAPPHIRERAPIDS | m_ALDERLAKE)
> > > +
> > >  /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
> > >     are resolved on SSE register parts instead of whole registers, so we may
> > >     maintain just lower part of scalar values in proper format leaving the
> > > diff --git a/gcc/testsuite/gcc.target/i386/avx2-dest-false-dependency.c b/gcc/testsuite/gcc.target/i386/avx2-dest-false-dependency.c
> > > new file mode 100644
> > > index 00000000000..e138920ce18
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/avx2-dest-false-dependency.c
> > > @@ -0,0 +1,24 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-mavx2 -mtune-ctrl=dest_false_dependency -O2" } */
> > > +
> > > +
> > > +#include <immintrin.h>
> > > +
> > > +extern __m256i i1, i2, i3, i4;
> > > +extern __m256d d1, d2;
> > > +extern __m256 f1, f2;
> > > +
> > > +void vperm_test (void)
> > > +{
> > > +  i3 = _mm256_permutevar8x32_epi32 (i1, i2);
> > > +  i4 = _mm256_permute4x64_epi64 (i1, 12);
> > > +  d2 = _mm256_permute4x64_pd (d1, 12);
> > > +  f2 = _mm256_permutevar8x32_ps (f1, i2);
> > > +}
> > > +
> > > +/* { dg-final { scan-assembler-times "vxorps" 4 } } */
> > > +/* { dg-final { scan-assembler-times "vpermd" 1 } } */
> > > +/* { dg-final { scan-assembler-times "vpermq" 1 } } */
> > > +/* { dg-final { scan-assembler-times "vpermpd" 1 } } */
> > > +/* { dg-final { scan-assembler-times "vpermps" 1 } } */
> > > +
> > > diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-dest-false-dependency.c b/gcc/testsuite/gcc.target/i386/avx512dq-dest-false-dependency.c
> > > new file mode 100644
> > > index 00000000000..2feb58f2cd8
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/avx512dq-dest-false-dependency.c
> > > @@ -0,0 +1,73 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-mavx512dq -mavx512vl -mtune-ctrl=dest_false_dependency -O2" } */
> > > +
> > > +#include <immintrin.h>
> > > +
> > > +extern __m512i i1;
> > > +extern __m256i i2;
> > > +extern __m128i i3;
> > > +extern __m512d d1;
> > > +extern __m256d d2;
> > > +extern __m128d d3;
> > > +extern __m512 f1;
> > > +extern __m256 f2;
> > > +extern __m128 f3;
> > > +
> > > +__mmask32 m32;
> > > +__mmask16 m16;
> > > +__mmask8 m8;
> > > +
> > > +void mullo_test (void)
> > > +{
> > > +  i1 = _mm512_mullo_epi64 (i1, i1);
> > > +  i1 = _mm512_mask_mullo_epi64 (i1, m8, i1, i1);
> > > +  i1 = _mm512_maskz_mullo_epi64 (m8, i1, i1);
> > > +  i2 = _mm256_mullo_epi64 (i2, i2);
> > > +  i2 = _mm256_mask_mullo_epi64 (i2, m8, i2, i2);
> > > +  i2 = _mm256_maskz_mullo_epi64 (m8, i2, i2);
> > > +  i3 = _mm_mullo_epi64 (i3, i3);
> > > +  i3 = _mm_mask_mullo_epi64 (i3, m8, i3, i3);
> > > +  i3 = _mm_maskz_mullo_epi64 (m8, i3, i3);
> > > +}
> > > +
> > > +void range_test (void)
> > > +{
> > > +  d1 = _mm512_range_pd (d1, d1, 15);
> > > +  d1 = _mm512_range_round_pd (d1, d1, 15, 8);
> > > +  d1 = _mm512_mask_range_pd (d1, m8, d1, d1, 15);
> > > +  d1 = _mm512_mask_range_round_pd (d1, m8, d1, d1, 15, 8);
> > > +  d1 = _mm512_maskz_range_pd (m8, d1, d1, 15);
> > > +  d1 = _mm512_maskz_range_round_pd (m8, d1, d1, 15, 8);
> > > +  d2 = _mm256_range_pd (d2, d2, 15);
> > > +  d2 = _mm256_mask_range_pd (d2, m8, d2, d2, 15);
> > > +  d2 = _mm256_maskz_range_pd (m8, d2, d2, 15);
> > > +  d3 = _mm_range_pd (d3, d3, 15);
> > > +  d3 = _mm_mask_range_pd (d3, m8, d3, d3, 15);
> > > +  d3 = _mm_maskz_range_pd (m8, d3, d3, 15);
> > > +  d3 = _mm_range_sd (d3, d3, 15);
> > > +  d3 = _mm_mask_range_sd (d3, m8, d3, d3, 15);
> > > +  d3 = _mm_maskz_range_sd (m8, d3, d3, 15);
> > > +
> > > +  f1 = _mm512_range_ps (f1, f1, 15);
> > > +  f1 = _mm512_range_round_ps (f1, f1, 15, 8);
> > > +  f1 = _mm512_mask_range_ps (f1, m16, f1, f1, 15);
> > > +  f1 = _mm512_mask_range_round_ps (f1, m16, f1, f1, 15, 8);
> > > +  f1 = _mm512_maskz_range_ps (m16, f1, f1, 15);
> > > +  f1 = _mm512_maskz_range_round_ps (m16, f1, f1, 15, 8);
> > > +  f2 = _mm256_range_ps (f2, f2, 15);
> > > +  f2 = _mm256_mask_range_ps (f2, m8, f2, f2, 15);
> > > +  f2 = _mm256_maskz_range_ps (m8, f2, f2, 15);
> > > +  f3 = _mm_range_ps (f3, f3, 15);
> > > +  f3 = _mm_mask_range_ps (f3, m8, f3, f3, 15);
> > > +  f3 = _mm_maskz_range_ps (m8, f3, f3, 15);
> > > +  f3 = _mm_range_ss (f3, f3, 15);
> > > +  f3 = _mm_mask_range_ss (f3, m8, f3, f3, 15);
> > > +  f3 = _mm_maskz_range_ss (m8, f3, f3, 15);
> > > +}
> > > +
> > > +/* { dg-final { scan-assembler-times "vxorps" 26 } } */
> > > +/* { dg-final { scan-assembler-times "vpmullq" 9 } } */
> > > +/* { dg-final { scan-assembler-times "vrangepd" 12 } } */
> > > +/* { dg-final { scan-assembler-times "vrangesd" 3 } } */
> > > +/* { dg-final { scan-assembler-times "vrangeps" 12 } } */
> > > +/* { dg-final { scan-assembler-times "vrangess" 3 } } */
> > > diff --git a/gcc/testsuite/gcc.target/i386/avx512f-dest-false-dependency.c b/gcc/testsuite/gcc.target/i386/avx512f-dest-false-dependency.c
> > > new file mode 100644
> > > index 00000000000..9650839970e
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/avx512f-dest-false-dependency.c
> > > @@ -0,0 +1,102 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-mavx512f -mtune-ctrl=dest_false_dependency -O2" } */
> > > +
> > > +#include <immintrin.h>
> > > +
> > > +extern __m512i i1;
> > > +extern __m512d d1, *pd1;
> > > +extern __m128d d2;
> > > +extern __m512 f1, *pf1;
> > > +extern __m128 f2;
> > > +
> > > +__mmask16 m16;
> > > +__mmask8 m8;
> > > +
> > > +void vperm_test (void)
> > > +{
> > > +  d1 = _mm512_permutex_pd (d1, 12);
> > > +  d1 = _mm512_mask_permutex_pd (d1, m8, d1, 12);
> > > +  d1 = _mm512_maskz_permutex_pd (m8, d1, 12);
> > > +  d1 = _mm512_permutexvar_pd (i1, d1);
> > > +  d1 = _mm512_mask_permutexvar_pd (d1, m8, i1, d1);
> > > +  d1 = _mm512_maskz_permutexvar_pd (m8, i1, d1);
> > > +
> > > +  f1 = _mm512_permutexvar_ps (i1, f1);
> > > +  f1 = _mm512_mask_permutexvar_ps (f1, m16, i1, f1);
> > > +  f1 = _mm512_maskz_permutexvar_ps (m16, i1, f1);
> > > +
> > > +  i1 = _mm512_permutexvar_epi64 (i1, i1);
> > > +  i1 = _mm512_mask_permutexvar_epi64 (i1, m8, i1, i1);
> > > +  i1 = _mm512_maskz_permutexvar_epi64 (m8, i1, i1);
> > > +  i1 = _mm512_permutex_epi64 (i1, 12);
> > > +  i1 = _mm512_mask_permutex_epi64 (i1, m8, i1, 12);
> > > +  i1 = _mm512_maskz_permutex_epi64 (m8, i1, 12);
> > > +
> > > +  i1 = _mm512_permutexvar_epi32 (i1, i1);
> > > +  i1 = _mm512_mask_permutexvar_epi32 (i1, m16, i1, i1);
> > > +  i1 = _mm512_maskz_permutexvar_epi32 (m16, i1, i1);
> > > +}
> > > +
> > > +void getmant_test (void)
> > > +{
> > > +  d1 = _mm512_getmant_pd (*pd1, _MM_MANT_NORM_p75_1p5,
> > > +                         _MM_MANT_SIGN_src);
> > > +  d1 = _mm512_getmant_round_pd (*pd1, _MM_MANT_NORM_p75_1p5,
> > > +                               _MM_MANT_SIGN_src, 8);
> > > +  d1 = _mm512_mask_getmant_pd (d1, m8, *pd1, _MM_MANT_NORM_p75_1p5,
> > > +                              _MM_MANT_SIGN_src);
> > > +  d1 = _mm512_mask_getmant_round_pd (d1, m8, *pd1, _MM_MANT_NORM_p75_1p5,
> > > +                                    _MM_MANT_SIGN_src, 8);
> > > +  d1 = _mm512_maskz_getmant_pd (m8, *pd1, _MM_MANT_NORM_p75_1p5,
> > > +                               _MM_MANT_SIGN_src);
> > > +  d1 = _mm512_maskz_getmant_round_pd (m8, *pd1, _MM_MANT_NORM_p75_1p5,
> > > +                                     _MM_MANT_SIGN_src, 8);
> > > +  f1 = _mm512_getmant_ps (*pf1, _MM_MANT_NORM_p75_1p5,
> > > +                         _MM_MANT_SIGN_src);
> > > +  f1 = _mm512_getmant_round_ps (*pf1, _MM_MANT_NORM_p75_1p5,
> > > +                               _MM_MANT_SIGN_src, 8);
> > > +  f1 = _mm512_mask_getmant_ps (f1, m16, *pf1, _MM_MANT_NORM_p75_1p5,
> > > +                              _MM_MANT_SIGN_src);
> > > +  f1 = _mm512_mask_getmant_round_ps (f1, m16, *pf1, _MM_MANT_NORM_p75_1p5,
> > > +                                    _MM_MANT_SIGN_src, 8);
> > > +  f1 = _mm512_maskz_getmant_ps (m16, *pf1, _MM_MANT_NORM_p75_1p5,
> > > +                               _MM_MANT_SIGN_src);
> > > +  f1 = _mm512_maskz_getmant_round_ps (m16, *pf1, _MM_MANT_NORM_p75_1p5,
> > > +                                     _MM_MANT_SIGN_src, 8);
> > > +
> > > +  d2 = _mm_getmant_sd (d2, d2, _MM_MANT_NORM_p75_1p5,
> > > +                      _MM_MANT_SIGN_src);
> > > +  d2 = _mm_getmant_round_sd (d2, d2, _MM_MANT_NORM_p75_1p5,
> > > +                            _MM_MANT_SIGN_src, 8);
> > > +  d2 = _mm_mask_getmant_sd (d2, m8, d2, d2, _MM_MANT_NORM_p75_1p5,
> > > +                           _MM_MANT_SIGN_src);
> > > +  d2 = _mm_mask_getmant_round_sd (d2, m8, d2, d2, _MM_MANT_NORM_p75_1p5,
> > > +                                 _MM_MANT_SIGN_src, 8);
> > > +  d2 = _mm_maskz_getmant_sd (m8, d2, d2, _MM_MANT_NORM_p75_1p5,
> > > +                            _MM_MANT_SIGN_src);
> > > +  d2 = _mm_maskz_getmant_round_sd (m8, d2, d2, _MM_MANT_NORM_p75_1p5,
> > > +                                  _MM_MANT_SIGN_src, 8);
> > > +  f2 = _mm_getmant_ss (f2, f2, _MM_MANT_NORM_p75_1p5,
> > > +                      _MM_MANT_SIGN_src);
> > > +  f2 = _mm_getmant_round_ss (f2, f2, _MM_MANT_NORM_p75_1p5,
> > > +                            _MM_MANT_SIGN_src, 8);
> > > +  f2 = _mm_mask_getmant_ss (f2, m8, f2, f2, _MM_MANT_NORM_p75_1p5,
> > > +                           _MM_MANT_SIGN_src);
> > > +  f2 = _mm_mask_getmant_round_ss (f2, m8, f2, f2, _MM_MANT_NORM_p75_1p5,
> > > +                                 _MM_MANT_SIGN_src, 8);
> > > +  f2 = _mm_maskz_getmant_ss (m8, f2, f2, _MM_MANT_NORM_p75_1p5,
> > > +                            _MM_MANT_SIGN_src);
> > > +  f2 = _mm_maskz_getmant_round_ss (m8, f2, f2, _MM_MANT_NORM_p75_1p5,
> > > +                                  _MM_MANT_SIGN_src, 8);
> > > +
> > > +}
> > > +
> > > +/* { dg-final { scan-assembler-times "vxorps" 24 } } */
> > > +/* { dg-final { scan-assembler-times "vpermd" 3 } } */
> > > +/* { dg-final { scan-assembler-times "vpermq" 6 } } */
> > > +/* { dg-final { scan-assembler-times "vpermps" 3 } } */
> > > +/* { dg-final { scan-assembler-times "vpermpd" 6 } } */
> > > +/* { dg-final { scan-assembler-times "vgetmantpd" 6 } } */
> > > +/* { dg-final { scan-assembler-times "vgetmantps" 6 } } */
> > > +/* { dg-final { scan-assembler-times "vgetmantsd" 6 } } */
> > > +/* { dg-final { scan-assembler-times "vgetmantss" 6 } } */
> > > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-dest-false-dependency.c b/gcc/testsuite/gcc.target/i386/avx512fp16-dest-false-dependency.c
> > > new file mode 100644
> > > index 00000000000..793bb66201b
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-dest-false-dependency.c
> > > @@ -0,0 +1,45 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-mavx512fp16 -mavx512vl -mtune-ctrl=dest_false_dependency -O2" } */
> > > +
> > > +#include <immintrin.h>
> > > +
> > > +extern __m512h h1;
> > > +extern __m256h h2;
> > > +extern __m128h h3;
> > > +
> > > +__mmask32 m32;
> > > +__mmask16 m16;
> > > +__mmask8 m8;
> > > +
> > > +void complex_mul_test (void)
> > > +{
> > > +  h1 = _mm512_fmul_pch (h1, h1);
> > > +  h1 = _mm512_fmul_round_pch (h1, h1, 8);
> > > +  h1 = _mm512_mask_fmul_pch (h1, m32, h1, h1);
> > > +  h1 = _mm512_mask_fmul_round_pch (h1, m32, h1, h1, 8);
> > > +  h1 = _mm512_maskz_fmul_pch (m32, h1, h1);
> > > +  h1 = _mm512_maskz_fmul_round_pch (m32, h1, h1, 11);
> > > +
> > > +  h3 = _mm_fmul_sch (h3, h3);
> > > +  h3 = _mm_fmul_round_sch (h3, h3, 8);
> > > +  h3 = _mm_mask_fmul_sch (h3, m8, h3, h3);
> > > +  h3 = _mm_mask_fmul_round_sch (h3, m8, h3, h3, 8);
> > > +  h3 = _mm_maskz_fmul_sch (m8, h3, h3);
> > > +  h3 = _mm_maskz_fmul_round_sch (m8, h3, h3, 11);
> > > +}
> > > +
> > > +void vgetmant_test (void)
> > > +{
> > > +  h3 = _mm_getmant_sh (h3, h3, _MM_MANT_NORM_p75_1p5,
> > > +                      _MM_MANT_SIGN_src);
> > > +  h3 = _mm_mask_getmant_sh (h3, m8, h3, h3, _MM_MANT_NORM_p75_1p5,
> > > +                           _MM_MANT_SIGN_src);
> > > +  h3 = _mm_maskz_getmant_sh (m8, h3, h3, _MM_MANT_NORM_p75_1p5,
> > > +                            _MM_MANT_SIGN_src);
> > > +}
> > > +
> > > +/* { dg-final { scan-assembler-times "vxorps" 10 } } */
> > > +/* { dg-final { scan-assembler-times "vfmulcph" 6 } } */
> > > +/* { dg-final { scan-assembler-times "vfmulcsh" 6 } } */
> > > +/* { dg-final { scan-assembler-times "vgetmantsh" 3 } } */
> > > +
> > > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16vl-dest-false-dependency.c b/gcc/testsuite/gcc.target/i386/avx512fp16vl-dest-false-dependency.c
> > > new file mode 100644
> > > index 00000000000..09658905d2d
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16vl-dest-false-dependency.c
> > > @@ -0,0 +1,24 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-mavx512fp16 -mavx512vl -mtune-ctrl=dest_false_dependency -O2" } */
> > > +
> > > +#include <immintrin.h>
> > > +
> > > +extern __m256h h1;
> > > +extern __m128h h2;
> > > +
> > > +__mmask16 m16;
> > > +__mmask8 m8;
> > > +
> > > +void complex_mul_test (void)
> > > +{
> > > +  h1 = _mm256_fmul_pch (h1, h1);
> > > +  h1 = _mm256_mask_fmul_pch (h1, m16, h1, h1);
> > > +  h1 = _mm256_maskz_fmul_pch (m16, h1, h1);
> > > +  h2 = _mm_fmul_pch (h2, h2);
> > > +  h2 = _mm_mask_fmul_pch (h2, m16, h2, h2);
> > > +  h2 = _mm_maskz_fmul_pch (m16, h2, h2);
> > > +}
> > > +
> > > +/* { dg-final { scan-assembler-times "vxorps" 4 } } */
> > > +/* { dg-final { scan-assembler-times "vfmulcph" 6 } } */
> > > +
> > > diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-dest-false-dependency.c b/gcc/testsuite/gcc.target/i386/avx512vl-dest-false-dependency.c
> > > new file mode 100644
> > > index 00000000000..92717a99837
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/avx512vl-dest-false-dependency.c
> > > @@ -0,0 +1,76 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-mavx512f -mavx512vl -mtune-ctrl=dest_false_dependency -O2" } */
> > > +
> > > +
> > > +#include <immintrin.h>
> > > +
> > > +extern __m256i i1;
> > > +extern __m256d d1, *pd1;
> > > +extern __m128d d2, *pd2;
> > > +extern __m256 f1, *pf1;
> > > +extern __m128 f2, *pf2;
> > > +
> > > +__mmask16 m16;
> > > +__mmask8 m8;
> > > +
> > > +void vperm_test (void)
> > > +{
> > > +  d1 = _mm256_permutex_pd (d1, 12);
> > > +  d1 = _mm256_mask_permutex_pd (d1, m8, d1, 12);
> > > +  d1 = _mm256_maskz_permutex_pd (m8, d1, 12);
> > > +  d1 = _mm256_permutexvar_pd (i1, d1);
> > > +  d1 = _mm256_mask_permutexvar_pd (d1, m8, i1, d1);
> > > +  d1 = _mm256_maskz_permutexvar_pd (m8, i1, d1);
> > > +
> > > +  f1 = _mm256_permutexvar_ps (i1, f1);
> > > +  f1 = _mm256_mask_permutexvar_ps (f1, m8, i1, f1);
> > > +  f1 = _mm256_maskz_permutexvar_ps (m8, i1, f1);
> > > +
> > > +  i1 = _mm256_permutexvar_epi64 (i1, i1);
> > > +  i1 = _mm256_mask_permutexvar_epi64 (i1, m8, i1, i1);
> > > +  i1 = _mm256_maskz_permutexvar_epi64 (m8, i1, i1);
> > > +  i1 = _mm256_permutex_epi64 (i1, 12);
> > > +  i1 = _mm256_mask_permutex_epi64 (i1, m8, i1, 12);
> > > +  i1 = _mm256_maskz_permutex_epi64 (m8, i1, 12);
> > > +
> > > +  i1 = _mm256_permutexvar_epi32 (i1, i1);
> > > +  i1 = _mm256_mask_permutexvar_epi32 (i1, m8, i1, i1);
> > > +  i1 = _mm256_maskz_permutexvar_epi32 (m8, i1, i1);
> > > +}
> > > +
> > > +void getmant_test (void)
> > > +{
> > > +  d1 = _mm256_getmant_pd (*pd1, _MM_MANT_NORM_p75_1p5,
> > > +                         _MM_MANT_SIGN_src);
> > > +  d1 = _mm256_mask_getmant_pd (d1, m8, *pd1, _MM_MANT_NORM_p75_1p5,
> > > +                              _MM_MANT_SIGN_src);
> > > +  d1 = _mm256_maskz_getmant_pd (m8, *pd1, _MM_MANT_NORM_p75_1p5,
> > > +                               _MM_MANT_SIGN_src);
> > > +  d2 = _mm_getmant_pd (*pd2, _MM_MANT_NORM_p75_1p5,
> > > +                      _MM_MANT_SIGN_src);
> > > +  d2 = _mm_mask_getmant_pd (d2, m8, *pd2, _MM_MANT_NORM_p75_1p5,
> > > +                           _MM_MANT_SIGN_src);
> > > +  d2 = _mm_maskz_getmant_pd (m8, *pd2, _MM_MANT_NORM_p75_1p5,
> > > +                            _MM_MANT_SIGN_src);
> > > +  f1 = _mm256_getmant_ps (*pf1, _MM_MANT_NORM_p75_1p5,
> > > +                         _MM_MANT_SIGN_src);
> > > +  f1 = _mm256_mask_getmant_ps (f1, m8, *pf1, _MM_MANT_NORM_p75_1p5,
> > > +                              _MM_MANT_SIGN_src);
> > > +  f1 = _mm256_maskz_getmant_ps (m8, *pf1, _MM_MANT_NORM_p75_1p5,
> > > +                               _MM_MANT_SIGN_src);
> > > +  f2 = _mm_getmant_ps (*pf2, _MM_MANT_NORM_p75_1p5,
> > > +                      _MM_MANT_SIGN_src);
> > > +  f2 = _mm_mask_getmant_ps (f2, m8, *pf2, _MM_MANT_NORM_p75_1p5,
> > > +                           _MM_MANT_SIGN_src);
> > > +  f2 = _mm_maskz_getmant_ps (m8, *pf2, _MM_MANT_NORM_p75_1p5,
> > > +                            _MM_MANT_SIGN_src);
> > > +}
> > > +
> > > +/* { dg-final { scan-assembler-times "vxorps" 20 } } */
> > > +/* { dg-final { scan-assembler-times "vpermpd" 6 } } */
> > > +/* { dg-final { scan-assembler-times "vpermps" 3 } } */
> > > +/* { dg-final { scan-assembler-times "vpermq" 6 } } */
> > > +/* { dg-final { scan-assembler-times "vpermd" 3 } } */
> > > +/* { dg-final { scan-assembler-times "vgetmantpd" 6 } } */
> > > +/* { dg-final { scan-assembler-times "vgetmantps" 6 } } */
> > > +
> > > --
> > > 2.18.1
> > >

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] [i386] GLC tuning: Break false dependency for dest register.
  2022-01-14  5:38   ` Hongyu Wang
  2022-01-14  6:03     ` Hongyu Wang
@ 2022-01-14  8:17     ` Uros Bizjak
  1 sibling, 0 replies; 16+ messages in thread
From: Uros Bizjak @ 2022-01-14  8:17 UTC (permalink / raw)
  To: Hongyu Wang; +Cc: Hongyu Wang, Hongtao Liu, gcc-patches

On Fri, Jan 14, 2022 at 6:46 AM Hongyu Wang <wwwhhhyyy333@gmail.com> wrote:
>
> > No, the approach is wrong. You have to solve output clearing on RTL
> > level, please look at how e.g. tzcnt false dep is solved:
>
> Actually we have considered such approach before, but we found we need
> to break original define_insn to remove the mask/rounding subst,
> since define_split could not adopt subst, and that would add 6 more
> define_insn_and_split and 4 define_insn for each instruction. We think
> such approach would introduce too much redundant code.

Are there any technical obstacles to introducing subst support for
define_{,insn_and_}split?

Uros.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] [i386] GLC tuning: Break false dependency for dest register.
  2022-01-14  6:03     ` Hongyu Wang
@ 2022-01-14  8:37       ` Uros Bizjak
  2022-01-14 13:44         ` Hongyu Wang
  0 siblings, 1 reply; 16+ messages in thread
From: Uros Bizjak @ 2022-01-14  8:37 UTC (permalink / raw)
  To: Hongyu Wang; +Cc: Hongyu Wang, Hongtao Liu, gcc-patches

On Fri, Jan 14, 2022 at 7:11 AM Hongyu Wang <wwwhhhyyy333@gmail.com> wrote:
>
> > > No, the approach is wrong. You have to solve output clearing on RTL
> > > level, please look at how e.g. tzcnt false dep is solved:
> >
> > Actually we have considered such approach before, but we found we need
> > to break original define_insn to remove the mask/rounding subst,
> > since define_split could not adopt subst, and that would add 6 more
> > define_insn_and_split and 4 define_insn for each instruction. We think
> > such approach would introduce too much redundant code.
> >
> > Do you think the code size increment is acceptable?
>
> Also that 100+ more patterns increases maintenance effort. If we split
> them at epilogue_complete stage,
> it seems not much difference to put it under output template...

In the proposed patch, if the output register is also mentioned in the
input, then clearing before insn will clear the value in the input
register. The solution in the i386.md also takes care of this issue.

Uros.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] [i386] GLC tuning: Break false dependency for dest register.
  2022-01-14  8:37       ` Uros Bizjak
@ 2022-01-14 13:44         ` Hongyu Wang
  2022-01-14 15:49           ` Uros Bizjak
  0 siblings, 1 reply; 16+ messages in thread
From: Hongyu Wang @ 2022-01-14 13:44 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: Hongyu Wang, Hongtao Liu, gcc-patches

> Are there any technical obstacles to introduce subst to
> define_{,insn_and_}split?

gccint says: define_subst can be used only in define_insn and
define_expand, it cannot be used in other expressions (e.g. in
define_insn_and_split).
I have no idea how to implement it in the current infrastructure.

> In the proposed patch, if the output register is also mentioned in the
> input, then clearing before insn will clear the value in the input
> register. The solution in the i386.md also takes care of this issue.
>

For this, I think we can add REGNO checks for the operands to the condition
(a match means there is a true dependency).

Uros Bizjak <ubizjak@gmail.com> 于2022年1月14日周五 16:37写道:
>
> On Fri, Jan 14, 2022 at 7:11 AM Hongyu Wang <wwwhhhyyy333@gmail.com> wrote:
> >
> > > > No, the approach is wrong. You have to solve output clearing on RTL
> > > > level, please look at how e.g. tzcnt false dep is solved:
> > >
> > > Actually we have considered such approach before, but we found we need
> > > to break original define_insn to remove the mask/rounding subst,
> > > since define_split could not adopt subst, and that would add 6 more
> > > define_insn_and_split and 4 define_insn for each instruction. We think
> > > such approach would introduce too much redundant code.
> > >
> > > Do you think the code size increment is acceptable?
> >
> > Also that 100+ more patterns increases maintenance effort. If we split
> > them at epilogue_complete stage,
> > it seems not much difference to put it under output template...
>
> In the proposed patch, if the output register is also mentioned in the
> input, then clearing before insn will clear the value in the input
> register. The solution in the i386.md also takes care of this issue.
>
> Uros.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] [i386] GLC tuning: Break false dependency for dest register.
  2022-01-14 13:44         ` Hongyu Wang
@ 2022-01-14 15:49           ` Uros Bizjak
  2022-01-15 16:39             ` Hongyu Wang
  0 siblings, 1 reply; 16+ messages in thread
From: Uros Bizjak @ 2022-01-14 15:49 UTC (permalink / raw)
  To: Hongyu Wang; +Cc: Hongyu Wang, Hongtao Liu, gcc-patches

On Fri, Jan 14, 2022 at 2:44 PM Hongyu Wang <wwwhhhyyy333@gmail.com> wrote:
>
> > Are there any technical obstacles to introduce subst to
> > define_{,insn_and_}split?
>
> gccint says: define_subst can be used only in define_insn and
> define_expand, it cannot be used in other expressions (e.g. in
> define_insn_and_split).

Hm, hm ... annoying ...

> I have no idea how to implement it in current infrastructure.
>
> > In the proposed patch, if the output register is also mentioned in the
> > input, then clearing before insn will clear the value in the input
> > register. The solution in the i386.md also takes care of this issue.
> >
>
> For this, I think we can add REGNO checks for operands in condition
> (which means there is true dependency).

Let's go in your direction, considering the limitations of current
infrastructure.
+{
+  if (TARGET_DEST_FALSE_DEPENDENCY
+      && get_attr_dest_false_dep (insn) ==
+        DEST_FALSE_DEP_TRUE)
+    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+  return "vgetmant<ssemodesuffix>\t{%2, <round_saeonly_mask_op3>%1,
%0<mask_operand3>|%0<mask_operand3>, %1<round_saeonly_mask_op3>, %2}";
+}

There is no need to pass the information via attributes. IMO, you
should use the subst attribute directly in the condition:

{
  if (TARGET_DEST_FALSE_DEPENDENCY
      && <mask3_dest_false_dep_attr>
      && !reg_mentioned_p (operands[0], operands[1]))
   output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
  return ".....";
}

Assuming the above works, please:

- rename TARGET_DEST_FALSE_DEPENDENCY to something less generic, maybe
following existing BMI example with TARGET_AVOID_FALSE_DEP_FOR_AVX512F
- rename "mask3_dest_false_dep_attr" to "mask3_false_dep_for_avx512f_cond"

Please note reg_mentioned_p in the above condition. This function
returns nonzero if register op0 appears somewhere within op1 and is
critical for the correct operation of your patch.

Uros.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] [i386] GLC tuning: Break false dependency for dest register.
  2022-01-14 15:49           ` Uros Bizjak
@ 2022-01-15 16:39             ` Hongyu Wang
  2022-01-15 16:43               ` Uros Bizjak
  0 siblings, 1 reply; 16+ messages in thread
From: Hongyu Wang @ 2022-01-15 16:39 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: Hongyu Wang, Hongtao Liu, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2607 bytes --]

Thanks for the suggestion, here is the updated patch that survived
bootstrap/regtest.

> Please note reg_mentioned_p in the above condition. This function
> returns nonzero if register op0 appears somewhere within op1 and is
> critical for the correct operation of your patch.
I added reg_mentioned_p for all insns except the fp16 complex multiplies, since
they have the earlyclobber constraint & on the dest, so it must be allocated a
different register from the sources.

Uros Bizjak <ubizjak@gmail.com> 于2022年1月14日周五 23:49写道:


>
> On Fri, Jan 14, 2022 at 2:44 PM Hongyu Wang <wwwhhhyyy333@gmail.com> wrote:
> >
> > > Are there any technical obstacles to introduce subst to
> > > define_{,insn_and_}split?
> >
> > gccint says: define_subst can be used only in define_insn and
> > define_expand, it cannot be used in other expressions (e.g. in
> > define_insn_and_split).
>
> Hm, hm ... annoying ...
>
> > I have no idea how to implement it in current infrastructure.
> >
> > > In the proposed patch, if the output register is also mentioned in the
> > > input, then clearing before insn will clear the value in the input
> > > register. The solution in the i386.md also takes care of this issue.
> > >
> >
> > For this, I think we can add REGNO checks for operands in condition
> > (which means there is true dependency).
>
> Let's go in your direction, considering the limitations of current
> infrastructure.
> +{
> +  if (TARGET_DEST_FALSE_DEPENDENCY
> +      && get_attr_dest_false_dep (insn) ==
> +        DEST_FALSE_DEP_TRUE)
> +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> +  return "vgetmant<ssemodesuffix>\t{%2, <round_saeonly_mask_op3>%1,
> %0<mask_operand3>|%0<mask_operand3>, %1<round_saeonly_mask_op3>, %2}";
> +}
>
> There is no need to pass the information via attributes. IMO, you
> should use subst attribute directly in the condition:
>
> {
>   if (TARGET_DEST_FALSE_DEPENDENCY
>       && <mask3_dest_false_dep_attr>
>       && !reg_mentioned_p (operands[0], operands[1]))
>    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
>   return ".....";
> }
>
> Assume the above works, so please:
>
> - rename TARGET_DEST_FALSE_DEPENDENCY to something less generic, maybe
> following existing BMI example with TARGET_AVOID_FALSE_DEP_FOR_AVX512F
> - rename "mask3_dest_false_dep_attr" to "mask3_false_dep_for_avx512f_cond"
>
> Please note reg_mentioned_p in the above condition. This function
> returns nonzero if register op0 appears somewhere within op1 and is
> critical for the correct operation of your patch.
>
> Uros.

[-- Attachment #2: 0001-i386-GLC-tuning-Break-false-dependency-for-dest-regi.patch --]
[-- Type: application/x-patch, Size: 27970 bytes --]

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] [i386] GLC tuning: Break false dependency for dest register.
  2022-01-15 16:39             ` Hongyu Wang
@ 2022-01-15 16:43               ` Uros Bizjak
  2022-01-16  4:22                 ` Hongtao Liu
  0 siblings, 1 reply; 16+ messages in thread
From: Uros Bizjak @ 2022-01-15 16:43 UTC (permalink / raw)
  To: Hongyu Wang; +Cc: Hongyu Wang, Hongtao Liu, gcc-patches

On Sat, Jan 15, 2022 at 5:39 PM Hongyu Wang <wwwhhhyyy333@gmail.com> wrote:
>
> Thanks for the suggestion, here is the updated patch that survived
> bootstrap/regtest.

LGTM, but please get the final approval from Hongtao.

Thanks,
Uros.

> > Please note reg_mentioned_p in the above condition. This function
> > returns nonzero if register op0 appears somewhere within op1 and is
> > critical for the correct operation of your patch.
> I added reg_mentioned_p for all insns except fp16 complex mult, since
> they have constraint & to the dest so it must be allocated different
> register from src.
>
> Uros Bizjak <ubizjak@gmail.com> 于2022年1月14日周五 23:49写道:
>
>
> >
> > On Fri, Jan 14, 2022 at 2:44 PM Hongyu Wang <wwwhhhyyy333@gmail.com> wrote:
> > >
> > > > Are there any technical obstacles to introduce subst to
> > > > define_{,insn_and_}split?
> > >
> > > gccint says: define_subst can be used only in define_insn and
> > > define_expand, it cannot be used in other expressions (e.g. in
> > > define_insn_and_split).
> >
> > Hm, hm ... annoying ...
> >
> > > I have no idea how to implement it in current infrastructure.
> > >
> > > > In the proposed patch, if the output register is also mentioned in the
> > > > input, then clearing before insn will clear the value in the input
> > > > register. The solution in the i386.md also takes care of this issue.
> > > >
> > >
> > > For this, I think we can add REGNO checks for operands in condition
> > > (which means there is true dependency).
> >
> > Let's go in your direction, considering the limitations of current
> > infrastructure.
> > +{
> > +  if (TARGET_DEST_FALSE_DEPENDENCY
> > +      && get_attr_dest_false_dep (insn) ==
> > +        DEST_FALSE_DEP_TRUE)
> > +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> > +  return "vgetmant<ssemodesuffix>\t{%2, <round_saeonly_mask_op3>%1,
> > %0<mask_operand3>|%0<mask_operand3>, %1<round_saeonly_mask_op3>, %2}";
> > +}
> >
> > There is no need to pass the information via attributes. IMO, you
> > should use subst attribute directly in the condition:
> >
> > {
> >   if (TARGET_DEST_FALSE_DEPENDENCY
> >       && <mask3_dest_false_dep_attr>
> >       && !reg_mentioned_p (operands[0], operands[1]))
> >    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> >   return ".....";
> > }
> >
> > Assume the above works, so please:
> >
> > - rename TARGET_DEST_FALSE_DEPENDENCY to something less generic, maybe
> > following existing BMI example with TARGET_AVOID_FALSE_DEP_FOR_AVX512F
> > - rename "mask3_dest_false_dep_attr" to "mask3_false_dep_for_avx512f_cond"
> >
> > Please note reg_mentioned_p in the above condition. This function
> > returns nonzero if register op0 appears somewhere within op1 and is
> > critical for the correct operation of your patch.
> >
> > Uros.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] [i386] GLC tuning: Break false dependency for dest register.
  2022-01-15 16:43               ` Uros Bizjak
@ 2022-01-16  4:22                 ` Hongtao Liu
  2022-01-19  0:00                   ` [PATCH] i386: Fix GLC tuning with -masm=intel [PR104104] Jakub Jelinek
  0 siblings, 1 reply; 16+ messages in thread
From: Hongtao Liu @ 2022-01-16  4:22 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: Hongyu Wang, Hongtao Liu, gcc-patches, Hongyu Wang

On Sun, Jan 16, 2022 at 12:44 AM Uros Bizjak via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Sat, Jan 15, 2022 at 5:39 PM Hongyu Wang <wwwhhhyyy333@gmail.com> wrote:
> >
> > Thanks for the suggestion, here is the updated patch that survived
> > bootstrap/regtest.
>
> LGTM for me, but please get the final approval from Hongtao.
>
Ok, thanks.
> Thanks,
> Uros.
>
> > > Please note reg_mentioned_p in the above condition. This function
> > > returns nonzero if register op0 appears somewhere within op1 and is
> > > critical for the correct operation of your patch.
> > I added reg_mentioned_p for all insns except fp16 complex mult, since
> > they have constraint & to the dest so it must be allocated different
> > register from src.
> >
> > Uros Bizjak <ubizjak@gmail.com> 于2022年1月14日周五 23:49写道:
> >
> >
> > >
> > > On Fri, Jan 14, 2022 at 2:44 PM Hongyu Wang <wwwhhhyyy333@gmail.com> wrote:
> > > >
> > > > > Are there any technical obstacles to introduce subst to
> > > > > define_{,insn_and_}split?
> > > >
> > > > gccint says: define_subst can be used only in define_insn and
> > > > define_expand, it cannot be used in other expressions (e.g. in
> > > > define_insn_and_split).
> > >
> > > Hm, hm ... annoying ...
> > >
> > > > I have no idea how to implement it in current infrastructure.
> > > >
> > > > > In the proposed patch, if the output register is also mentioned in the
> > > > > input, then clearing before insn will clear the value in the input
> > > > > register. The solution in the i386.md also takes care of this issue.
> > > > >
> > > >
> > > > For this, I think we can add REGNO checks for operands in condition
> > > > (which means there is true dependency).
> > >
> > > Let's go in your direction, considering the limitations of current
> > > infrastructure.
> > > +{
> > > +  if (TARGET_DEST_FALSE_DEPENDENCY
> > > +      && get_attr_dest_false_dep (insn) ==
> > > +        DEST_FALSE_DEP_TRUE)
> > > +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> > > +  return "vgetmant<ssemodesuffix>\t{%2, <round_saeonly_mask_op3>%1,
> > > %0<mask_operand3>|%0<mask_operand3>, %1<round_saeonly_mask_op3>, %2}";
> > > +}
> > >
> > > There is no need to pass the information via attributes. IMO, you
> > > should use subst attribute directly in the condition:
> > >
> > > {
> > >   if (TARGET_DEST_FALSE_DEPENDENCY
> > >       && <mask3_dest_false_dep_attr>
> > >       && !reg_mentioned_p (operands[0], operands[1]))
> > >    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> > >   return ".....";
> > > }
> > >
> > > Assume the above works, so please:
> > >
> > > - rename TARGET_DEST_FALSE_DEPENDENCY to something less generic, maybe
> > > following existing BMI example with TARGET_AVOID_FALSE_DEP_FOR_AVX512F
> > > - rename "mask3_dest_false_dep_attr" to "mask3_false_dep_for_avx512f_cond"
> > >
> > > Please note reg_mentioned_p in the above condition. This function
> > > returns nonzero if register op0 appears somewhere within op1 and is
> > > critical for the correct operation of your patch.
> > >
> > > Uros.



-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH] i386: Fix GLC tuning with -masm=intel [PR104104]
  2022-01-16  4:22                 ` Hongtao Liu
@ 2022-01-19  0:00                   ` Jakub Jelinek
  2022-01-19  1:01                     ` Wang, Hongyu
  2022-01-19  1:09                     ` Hongtao Liu
  0 siblings, 2 replies; 16+ messages in thread
From: Jakub Jelinek @ 2022-01-19  0:00 UTC (permalink / raw)
  To: Hongtao Liu, Uros Bizjak; +Cc: gcc-patches, Hongyu Wang

On Sun, Jan 16, 2022 at 12:22:18PM +0800, Hongtao Liu via Gcc-patches wrote:
> On Sun, Jan 16, 2022 at 12:44 AM Uros Bizjak via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > On Sat, Jan 15, 2022 at 5:39 PM Hongyu Wang <wwwhhhyyy333@gmail.com> wrote:
> > >
> > > Thanks for the suggestion, here is the updated patch that survived
> > > bootstrap/regtest.
> >
> > LGTM for me, but please get the final approval from Hongtao.
> >
> Ok, thanks.

Unfortunately the patch results in assembler failures with -masm=intel.

> > > > +  if (TARGET_DEST_FALSE_DEPENDENCY
> > > > +      && get_attr_dest_false_dep (insn) ==
> > > > +        DEST_FALSE_DEP_TRUE)
> > > > +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);

All the vxorps insns were emitted like the above, which means for -masm=sysv
it looks like
	vxorps	%xmm3, %xmm3, %xmm3
but for -masm=intel like:
	vxorps	
We want obviously
	vxorps	xmm3, xmm3, xmm3
so the following patch just drops the erroneous {}s.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2022-01-19  Jakub Jelinek  <jakub@redhat.com>

	PR target/104104
	* config/i386/sse.md
	(<avx512>_<complexopname>_<mode><maskc_name><round_name>,
	avx512fp16_<complexopname>sh_v8hf<mask_scalarc_name><round_scalarcz_name>,
	avx512dq_mul<mode>3<mask_name>, <avx2_avx512>_permvar<mode><mask_name>,
	avx2_perm<mode>_1<mask_name>, avx512f_perm<mode>_1<mask_name>,
	avx512dq_rangep<mode><mask_name><round_saeonly_name>,
	avx512dq_ranges<mode><mask_scalar_name><round_saeonly_scalar_name>,
	<avx512>_getmant<mode><mask_name><round_saeonly_name>,
	avx512f_vgetmant<mode><mask_scalar_name><round_saeonly_scalar_name>):
	Use vxorps\t%x0, %x0, %x0 instead of vxorps\t{%x0, %x0, %x0}.

	* gcc.target/i386/pr104104.c: New test.

--- gcc/config/i386/sse.md.jj	2022-01-18 11:58:59.156988142 +0100
+++ gcc/config/i386/sse.md	2022-01-18 21:20:40.022477778 +0100
@@ -6539,7 +6539,7 @@ (define_insn "<avx512>_<complexopname>_<
 {
   if (TARGET_DEST_FALSE_DEP_FOR_GLC
       && <maskc_dest_false_dep_for_glc_cond>)
-    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
   return "v<complexopname><ssemodesuffix>\t{<round_maskc_op3>%2, %1, %0<maskc_operand3>|%0<maskc_operand3>, %1, %2<round_maskc_op3>}";
 }
   [(set_attr "type" "ssemul")
@@ -6750,7 +6750,7 @@ (define_insn "avx512fp16_<complexopname>
 {
   if (TARGET_DEST_FALSE_DEP_FOR_GLC
       && <mask_scalarc_dest_false_dep_for_glc_cond>)
-    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
   return "v<complexopname>sh\t{<round_scalarc_mask_op3>%2, %1, %0<mask_scalarc_operand3>|%0<mask_scalarc_operand3>, %1, %2<round_scalarc_mask_op3>}";
 }
   [(set_attr "type" "ssemul")
@@ -15222,7 +15222,7 @@ (define_insn "avx512dq_mul<mode>3<mask_n
       && <mask3_dest_false_dep_for_glc_cond>
       && !reg_mentioned_p (operands[0], operands[1])
       && !reg_mentioned_p (operands[0], operands[2]))
-    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
   return "vpmullq\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}";
 }
   [(set_attr "type" "sseimul")
@@ -24658,7 +24658,7 @@ (define_insn "<avx2_avx512>_permvar<mode
       && <mask3_dest_false_dep_for_glc_cond>
       && !reg_mentioned_p (operands[0], operands[1])
       && !reg_mentioned_p (operands[0], operands[2]))
-    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
   return "vperm<ssemodesuffix>\t{%1, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1}";
 }
   [(set_attr "type" "sselog")
@@ -24900,7 +24900,7 @@ (define_insn "avx2_perm<mode>_1<mask_nam
   if (TARGET_DEST_FALSE_DEP_FOR_GLC
       && <mask6_dest_false_dep_for_glc_cond>
       && !reg_mentioned_p (operands[0], operands[1]))
-    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
   return "vperm<ssemodesuffix>\t{%2, %1, %0<mask_operand6>|%0<mask_operand6>, %1, %2}";
 }
   [(set_attr "type" "sselog")
@@ -24975,7 +24975,7 @@ (define_insn "avx512f_perm<mode>_1<mask_
   if (TARGET_DEST_FALSE_DEP_FOR_GLC
       && <mask10_dest_false_dep_for_glc_cond>
       && !reg_mentioned_p (operands[0], operands[1]))
-    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
   return "vperm<ssemodesuffix>\t{%2, %1, %0<mask_operand10>|%0<mask_operand10>, %1, %2}";
 }
   [(set_attr "type" "sselog")
@@ -26880,7 +26880,7 @@ (define_insn "avx512dq_rangep<mode><mask
       && <mask4_dest_false_dep_for_glc_cond>
       && !reg_mentioned_p (operands[0], operands[1])
       && !reg_mentioned_p (operands[0], operands[2]))
-    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
   return "vrange<ssemodesuffix>\t{%3, <round_saeonly_mask_op4>%2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2<round_saeonly_mask_op4>, %3}";
 }
   [(set_attr "type" "sse")
@@ -26903,7 +26903,7 @@ (define_insn "avx512dq_ranges<mode><mask
       && <mask_scalar4_dest_false_dep_for_glc_cond>
       && !reg_mentioned_p (operands[0], operands[1])
       && !reg_mentioned_p (operands[0], operands[2]))
-    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
   return "vrange<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}";
 }
   [(set_attr "type" "sse")
@@ -26949,7 +26949,7 @@ (define_insn "<avx512>_getmant<mode><mas
   if (TARGET_DEST_FALSE_DEP_FOR_GLC
       && <mask3_dest_false_dep_for_glc_cond>
       && MEM_P (operands[1]))
-    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
   return "vgetmant<ssemodesuffix>\t{%2, <round_saeonly_mask_op3>%1, %0<mask_operand3>|%0<mask_operand3>, %1<round_saeonly_mask_op3>, %2}";
 }
   [(set_attr "prefix" "evex")
@@ -26971,7 +26971,7 @@ (define_insn "avx512f_vgetmant<mode><mas
       && <mask_scalar4_dest_false_dep_for_glc_cond>
       && !reg_mentioned_p (operands[0], operands[1])
       && !reg_mentioned_p (operands[0], operands[2]))
-    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
   return "vgetmant<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}";
 }
    [(set_attr "prefix" "evex")
--- gcc/testsuite/gcc.target/i386/pr104104.c.jj	2022-01-18 21:38:17.007906673 +0100
+++ gcc/testsuite/gcc.target/i386/pr104104.c	2022-01-18 21:36:10.475623148 +0100
@@ -0,0 +1,10 @@
+/* PR target/104104 */
+/* { dg-do assemble { target vect_simd_clones } } */
+/* { dg-require-effective-target masm_intel } */
+/* { dg-options "-march=alderlake -masm=intel -O1 -fallow-store-data-races -funroll-all-loops" } */
+
+__attribute__ ((simd)) short int
+foo (void)
+{
+  return 0;
+}


	Jakub


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] i386: Fix GLC tuning with -masm=intel [PR104104]
  2022-01-19  0:00                   ` [PATCH] i386: Fix GLC tuning with -masm=intel [PR104104] Jakub Jelinek
@ 2022-01-19  1:01                     ` Wang, Hongyu
  2022-01-19  1:09                     ` Hongtao Liu
  1 sibling, 0 replies; 16+ messages in thread
From: Wang, Hongyu @ 2022-01-19  1:01 UTC (permalink / raw)
  To: Jakub Jelinek, Hongtao Liu, Uros Bizjak; +Cc: gcc-patches

Sorry for introducing such a failure, and thanks for the patch. I suppose it could be treated as an obvious fix?
________________________________
发件人: Jakub Jelinek <jakub@redhat.com>
发送时间: 星期三, 一月 19, 2022 8:01 上午
收件人: Hongtao Liu; Uros Bizjak
抄送: gcc-patches@gcc.gnu.org; Wang, Hongyu
主题: [PATCH] i386: Fix GLC tuning with -masm=intel [PR104104]

On Sun, Jan 16, 2022 at 12:22:18PM +0800, Hongtao Liu via Gcc-patches wrote:
> On Sun, Jan 16, 2022 at 12:44 AM Uros Bizjak via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > On Sat, Jan 15, 2022 at 5:39 PM Hongyu Wang <wwwhhhyyy333@gmail.com> wrote:
> > >
> > > Thanks for the suggestion, here is the updated patch that survived
> > > bootstrap/regtest.
> >
> > LGTM for me, but please get the final approval from Hongtao.
> >
> Ok, thanks.

Unfortunately the patch results in assembler failures with -masm=intel.

> > > > +  if (TARGET_DEST_FALSE_DEPENDENCY
> > > > +      && get_attr_dest_false_dep (insn) ==
> > > > +        DEST_FALSE_DEP_TRUE)
> > > > +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);

All the vxorps insns were emitted like the above, which means for -masm=sysv
it looks like
        vxorps  %xmm3, %xmm3, %xmm3
but for -masm=intel like:
        vxorps
We want obviously
        vxorps  xmm3, xmm3, xmm3
so the following patch just drops the erroneous {}s.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2022-01-19  Jakub Jelinek  <jakub@redhat.com>

        PR target/104104
        * config/i386/sse.md
        (<avx512>_<complexopname>_<mode><maskc_name><round_name>,
        avx512fp16_<complexopname>sh_v8hf<mask_scalarc_name><round_scalarcz_name>,
        avx512dq_mul<mode>3<mask_name>, <avx2_avx512>_permvar<mode><mask_name>,
        avx2_perm<mode>_1<mask_name>, avx512f_perm<mode>_1<mask_name>,
        avx512dq_rangep<mode><mask_name><round_saeonly_name>,
        avx512dq_ranges<mode><mask_scalar_name><round_saeonly_scalar_name>,
        <avx512>_getmant<mode><mask_name><round_saeonly_name>,
        avx512f_vgetmant<mode><mask_scalar_name><round_saeonly_scalar_name>):
        Use vxorps\t%x0, %x0, %x0 instead of vxorps\t{%x0, %x0, %x0}.

        * gcc.target/i386/pr104104.c: New test.

--- gcc/config/i386/sse.md.jj   2022-01-18 11:58:59.156988142 +0100
+++ gcc/config/i386/sse.md      2022-01-18 21:20:40.022477778 +0100
@@ -6539,7 +6539,7 @@ (define_insn "<avx512>_<complexopname>_<
 {
   if (TARGET_DEST_FALSE_DEP_FOR_GLC
       && <maskc_dest_false_dep_for_glc_cond>)
-    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
   return "v<complexopname><ssemodesuffix>\t{<round_maskc_op3>%2, %1, %0<maskc_operand3>|%0<maskc_operand3>, %1, %2<round_maskc_op3>}";
 }
   [(set_attr "type" "ssemul")
@@ -6750,7 +6750,7 @@ (define_insn "avx512fp16_<complexopname>
 {
   if (TARGET_DEST_FALSE_DEP_FOR_GLC
       && <mask_scalarc_dest_false_dep_for_glc_cond>)
-    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
   return "v<complexopname>sh\t{<round_scalarc_mask_op3>%2, %1, %0<mask_scalarc_operand3>|%0<mask_scalarc_operand3>, %1, %2<round_scalarc_mask_op3>}";
 }
   [(set_attr "type" "ssemul")
@@ -15222,7 +15222,7 @@ (define_insn "avx512dq_mul<mode>3<mask_n
       && <mask3_dest_false_dep_for_glc_cond>
       && !reg_mentioned_p (operands[0], operands[1])
       && !reg_mentioned_p (operands[0], operands[2]))
-    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
   return "vpmullq\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}";
 }
   [(set_attr "type" "sseimul")
@@ -24658,7 +24658,7 @@ (define_insn "<avx2_avx512>_permvar<mode
       && <mask3_dest_false_dep_for_glc_cond>
       && !reg_mentioned_p (operands[0], operands[1])
       && !reg_mentioned_p (operands[0], operands[2]))
-    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
   return "vperm<ssemodesuffix>\t{%1, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1}";
 }
   [(set_attr "type" "sselog")
@@ -24900,7 +24900,7 @@ (define_insn "avx2_perm<mode>_1<mask_nam
   if (TARGET_DEST_FALSE_DEP_FOR_GLC
       && <mask6_dest_false_dep_for_glc_cond>
       && !reg_mentioned_p (operands[0], operands[1]))
-    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
   return "vperm<ssemodesuffix>\t{%2, %1, %0<mask_operand6>|%0<mask_operand6>, %1, %2}";
 }
   [(set_attr "type" "sselog")
@@ -24975,7 +24975,7 @@ (define_insn "avx512f_perm<mode>_1<mask_
   if (TARGET_DEST_FALSE_DEP_FOR_GLC
       && <mask10_dest_false_dep_for_glc_cond>
       && !reg_mentioned_p (operands[0], operands[1]))
-    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
   return "vperm<ssemodesuffix>\t{%2, %1, %0<mask_operand10>|%0<mask_operand10>, %1, %2}";
 }
   [(set_attr "type" "sselog")
@@ -26880,7 +26880,7 @@ (define_insn "avx512dq_rangep<mode><mask
       && <mask4_dest_false_dep_for_glc_cond>
       && !reg_mentioned_p (operands[0], operands[1])
       && !reg_mentioned_p (operands[0], operands[2]))
-    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
   return "vrange<ssemodesuffix>\t{%3, <round_saeonly_mask_op4>%2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2<round_saeonly_mask_op4>, %3}";
 }
   [(set_attr "type" "sse")
@@ -26903,7 +26903,7 @@ (define_insn "avx512dq_ranges<mode><mask
       && <mask_scalar4_dest_false_dep_for_glc_cond>
       && !reg_mentioned_p (operands[0], operands[1])
       && !reg_mentioned_p (operands[0], operands[2]))
-    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
   return "vrange<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}";
 }
   [(set_attr "type" "sse")
@@ -26949,7 +26949,7 @@ (define_insn "<avx512>_getmant<mode><mas
   if (TARGET_DEST_FALSE_DEP_FOR_GLC
       && <mask3_dest_false_dep_for_glc_cond>
       && MEM_P (operands[1]))
-    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
   return "vgetmant<ssemodesuffix>\t{%2, <round_saeonly_mask_op3>%1, %0<mask_operand3>|%0<mask_operand3>, %1<round_saeonly_mask_op3>, %2}";
 }
   [(set_attr "prefix" "evex")
@@ -26971,7 +26971,7 @@ (define_insn "avx512f_vgetmant<mode><mas
       && <mask_scalar4_dest_false_dep_for_glc_cond>
       && !reg_mentioned_p (operands[0], operands[1])
       && !reg_mentioned_p (operands[0], operands[2]))
-    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
   return "vgetmant<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}";
 }
    [(set_attr "prefix" "evex")
--- gcc/testsuite/gcc.target/i386/pr104104.c.jj 2022-01-18 21:38:17.007906673 +0100
+++ gcc/testsuite/gcc.target/i386/pr104104.c    2022-01-18 21:36:10.475623148 +0100
@@ -0,0 +1,10 @@
+/* PR target/104104 */
+/* { dg-do assemble { target vect_simd_clones } } */
+/* { dg-require-effective-target masm_intel } */
+/* { dg-options "-march=alderlake -masm=intel -O1 -fallow-store-data-races -funroll-all-loops" } */
+
+__attribute__ ((simd)) short int
+foo (void)
+{
+  return 0;
+}


        Jakub


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] i386: Fix GLC tuning with -masm=intel [PR104104]
  2022-01-19  0:00                   ` [PATCH] i386: Fix GLC tuning with -masm=intel [PR104104] Jakub Jelinek
  2022-01-19  1:01                     ` Wang, Hongyu
@ 2022-01-19  1:09                     ` Hongtao Liu
  2022-01-19  1:40                       ` [PATCH] i386: Fix *aes<aeswideklvariant>u8 Jakub Jelinek
  1 sibling, 1 reply; 16+ messages in thread
From: Hongtao Liu @ 2022-01-19  1:09 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Uros Bizjak, GCC Patches, Hongyu Wang

On Wed, Jan 19, 2022 at 8:00 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> On Sun, Jan 16, 2022 at 12:22:18PM +0800, Hongtao Liu via Gcc-patches wrote:
> > On Sun, Jan 16, 2022 at 12:44 AM Uros Bizjak via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> > >
> > > On Sat, Jan 15, 2022 at 5:39 PM Hongyu Wang <wwwhhhyyy333@gmail.com> wrote:
> > > >
> > > > Thanks for the suggestion, here is the updated patch that survived
> > > > bootstrap/regtest.
> > >
> > > LGTM for me, but please get the final approval from Hongtao.
> > >
> > Ok, thanks.
>
> Unfortunately the patch results in assembler failures with -masm=intel.
>
> > > > > +  if (TARGET_DEST_FALSE_DEPENDENCY
> > > > > +      && get_attr_dest_false_dep (insn) ==
> > > > > +        DEST_FALSE_DEP_TRUE)
> > > > > +    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
>
> All the vxorps insns were emitted like the above, which means for -masm=sysv
> it looks like
>         vxorps  %xmm3, %xmm3, %xmm3
> but for -masm=intel like:
>         vxorps
> We want obviously
>         vxorps  xmm3, xmm3, xmm3
> so the following patch just drops the erroneous {}s.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
Yes, thanks.
>
> 2022-01-19  Jakub Jelinek  <jakub@redhat.com>
>
>         PR target/104104
>         * config/i386/sse.md
>         (<avx512>_<complexopname>_<mode><maskc_name><round_name>,
>         avx512fp16_<complexopname>sh_v8hf<mask_scalarc_name><round_scalarcz_name>,
>         avx512dq_mul<mode>3<mask_name>, <avx2_avx512>_permvar<mode><mask_name>,
>         avx2_perm<mode>_1<mask_name>, avx512f_perm<mode>_1<mask_name>,
>         avx512dq_rangep<mode><mask_name><round_saeonly_name>,
>         avx512dq_ranges<mode><mask_scalar_name><round_saeonly_scalar_name>,
>         <avx512>_getmant<mode><mask_name><round_saeonly_name>,
>         avx512f_vgetmant<mode><mask_scalar_name><round_saeonly_scalar_name>):
>         Use vxorps\t%x0, %x0, %x0 instead of vxorps\t{%x0, %x0, %x0}.
>
>         * gcc.target/i386/pr104104.c: New test.
>
> --- gcc/config/i386/sse.md.jj   2022-01-18 11:58:59.156988142 +0100
> +++ gcc/config/i386/sse.md      2022-01-18 21:20:40.022477778 +0100
> @@ -6539,7 +6539,7 @@ (define_insn "<avx512>_<complexopname>_<
>  {
>    if (TARGET_DEST_FALSE_DEP_FOR_GLC
>        && <maskc_dest_false_dep_for_glc_cond>)
> -    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> +    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
>    return "v<complexopname><ssemodesuffix>\t{<round_maskc_op3>%2, %1, %0<maskc_operand3>|%0<maskc_operand3>, %1, %2<round_maskc_op3>}";
>  }
>    [(set_attr "type" "ssemul")
> @@ -6750,7 +6750,7 @@ (define_insn "avx512fp16_<complexopname>
>  {
>    if (TARGET_DEST_FALSE_DEP_FOR_GLC
>        && <mask_scalarc_dest_false_dep_for_glc_cond>)
> -    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> +    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
>    return "v<complexopname>sh\t{<round_scalarc_mask_op3>%2, %1, %0<mask_scalarc_operand3>|%0<mask_scalarc_operand3>, %1, %2<round_scalarc_mask_op3>}";
>  }
>    [(set_attr "type" "ssemul")
> @@ -15222,7 +15222,7 @@ (define_insn "avx512dq_mul<mode>3<mask_n
>        && <mask3_dest_false_dep_for_glc_cond>
>        && !reg_mentioned_p (operands[0], operands[1])
>        && !reg_mentioned_p (operands[0], operands[2]))
> -    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> +    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
>    return "vpmullq\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}";
>  }
>    [(set_attr "type" "sseimul")
> @@ -24658,7 +24658,7 @@ (define_insn "<avx2_avx512>_permvar<mode
>        && <mask3_dest_false_dep_for_glc_cond>
>        && !reg_mentioned_p (operands[0], operands[1])
>        && !reg_mentioned_p (operands[0], operands[2]))
> -    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> +    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
>    return "vperm<ssemodesuffix>\t{%1, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1}";
>  }
>    [(set_attr "type" "sselog")
> @@ -24900,7 +24900,7 @@ (define_insn "avx2_perm<mode>_1<mask_nam
>    if (TARGET_DEST_FALSE_DEP_FOR_GLC
>        && <mask6_dest_false_dep_for_glc_cond>
>        && !reg_mentioned_p (operands[0], operands[1]))
> -    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> +    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
>    return "vperm<ssemodesuffix>\t{%2, %1, %0<mask_operand6>|%0<mask_operand6>, %1, %2}";
>  }
>    [(set_attr "type" "sselog")
> @@ -24975,7 +24975,7 @@ (define_insn "avx512f_perm<mode>_1<mask_
>    if (TARGET_DEST_FALSE_DEP_FOR_GLC
>        && <mask10_dest_false_dep_for_glc_cond>
>        && !reg_mentioned_p (operands[0], operands[1]))
> -    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> +    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
>    return "vperm<ssemodesuffix>\t{%2, %1, %0<mask_operand10>|%0<mask_operand10>, %1, %2}";
>  }
>    [(set_attr "type" "sselog")
> @@ -26880,7 +26880,7 @@ (define_insn "avx512dq_rangep<mode><mask
>        && <mask4_dest_false_dep_for_glc_cond>
>        && !reg_mentioned_p (operands[0], operands[1])
>        && !reg_mentioned_p (operands[0], operands[2]))
> -    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> +    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
>    return "vrange<ssemodesuffix>\t{%3, <round_saeonly_mask_op4>%2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2<round_saeonly_mask_op4>, %3}";
>  }
>    [(set_attr "type" "sse")
> @@ -26903,7 +26903,7 @@ (define_insn "avx512dq_ranges<mode><mask
>        && <mask_scalar4_dest_false_dep_for_glc_cond>
>        && !reg_mentioned_p (operands[0], operands[1])
>        && !reg_mentioned_p (operands[0], operands[2]))
> -    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> +    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
>    return "vrange<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}";
>  }
>    [(set_attr "type" "sse")
> @@ -26949,7 +26949,7 @@ (define_insn "<avx512>_getmant<mode><mas
>    if (TARGET_DEST_FALSE_DEP_FOR_GLC
>        && <mask3_dest_false_dep_for_glc_cond>
>        && MEM_P (operands[1]))
> -    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> +    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
>    return "vgetmant<ssemodesuffix>\t{%2, <round_saeonly_mask_op3>%1, %0<mask_operand3>|%0<mask_operand3>, %1<round_saeonly_mask_op3>, %2}";
>  }
>    [(set_attr "prefix" "evex")
> @@ -26971,7 +26971,7 @@ (define_insn "avx512f_vgetmant<mode><mas
>        && <mask_scalar4_dest_false_dep_for_glc_cond>
>        && !reg_mentioned_p (operands[0], operands[1])
>        && !reg_mentioned_p (operands[0], operands[2]))
> -    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
> +    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
>    return "vgetmant<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}";
>  }
>     [(set_attr "prefix" "evex")
> --- gcc/testsuite/gcc.target/i386/pr104104.c.jj 2022-01-18 21:38:17.007906673 +0100
> +++ gcc/testsuite/gcc.target/i386/pr104104.c    2022-01-18 21:36:10.475623148 +0100
> @@ -0,0 +1,10 @@
> +/* PR target/104104 */
> +/* { dg-do assemble { target vect_simd_clones } } */
> +/* { dg-require-effective-target masm_intel } */
> +/* { dg-options "-march=alderlake -masm=intel -O1 -fallow-store-data-races -funroll-all-loops" } */
> +
> +__attribute__ ((simd)) short int
> +foo (void)
> +{
> +  return 0;
> +}
>
>
>         Jakub
>


-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH] i386: Fix *aes<aeswideklvariant>u8
  2022-01-19  1:09                     ` Hongtao Liu
@ 2022-01-19  1:40                       ` Jakub Jelinek
  2022-01-19  1:47                         ` Hongtao Liu
  0 siblings, 1 reply; 16+ messages in thread
From: Jakub Jelinek @ 2022-01-19  1:40 UTC (permalink / raw)
  To: Hongtao Liu; +Cc: Uros Bizjak, GCC Patches, Hongyu Wang

Hi!

On Wed, Jan 19, 2022 at 09:09:41AM +0800, Hongtao Liu wrote:
> > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
> Yes, thanks.

Thanks.  Committed.
grep '{[^|}]*}"' *.md

found another spot, though dunno if we have sufficient effective targets
etc. to add an -masm=intel test for it (and my installed binutils doesn't
support it anyway).
Binutils trunk testsuite shows the argument isn't omitted even in the Intel
syntax:
grep aesencwide *.s
keylocker.s:	aesencwide128kl	126(%edx)
keylocker.s:	aesencwide256kl	126(%edx)
keylocker.s:	aesencwide128kl	[edx+126]
keylocker.s:	aesencwide256kl	[edx+126]
property-10.s:	 aesencwide128kl (%eax)
x86-64-keylocker.s:	aesencwide128kl	126(%rdx)
x86-64-keylocker.s:	aesencwide256kl	126(%rdx)
x86-64-keylocker.s:	aesencwide128kl	[rdx+126]
x86-64-keylocker.s:	aesencwide256kl	[rdx+126]
and doesn't use any WHATEVER PTR.

Ok for trunk?

2022-01-19  Jakub Jelinek  <jakub@redhat.com>

	* config/i386/sse.md (*aes<aeswideklvariant>u8): Use %0 instead of
	{%0}.

--- gcc/config/i386/sse.md.jj	2022-01-19 02:23:59.762022574 +0100
+++ gcc/config/i386/sse.md	2022-01-19 02:32:07.953324150 +0100
@@ -28437,7 +28437,7 @@ (define_insn "*aes<aeswideklvariant>u8"
 	    [(match_operand:BLK 0 "memory_operand" "m")]
 	    AESDECENCWIDEKL))])]
   "TARGET_WIDEKL"
-  "aes<aeswideklvariant>\t{%0}"
+  "aes<aeswideklvariant>\t%0"
   [(set_attr "type" "other")])
 
 ;; Modes handled by broadcast patterns.  NB: Allow V64QI and V32HI with


	Jakub


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] i386: Fix *aes<aeswideklvariant>u8
  2022-01-19  1:40                       ` [PATCH] i386: Fix *aes<aeswideklvariant>u8 Jakub Jelinek
@ 2022-01-19  1:47                         ` Hongtao Liu
  0 siblings, 0 replies; 16+ messages in thread
From: Hongtao Liu @ 2022-01-19  1:47 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Uros Bizjak, GCC Patches, Hongyu Wang

On Wed, Jan 19, 2022 at 9:40 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> Hi!
>
> On Wed, Jan 19, 2022 at 09:09:41AM +0800, Hongtao Liu wrote:
> > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
> > Yes, thanks.
>
> Thanks.  Committed.
> grep '{[^|}]*}"' *.md
>
> found another spot, though dunno if we have sufficient effective targets
> etc. to add an -masm=intel test for it (and my installed binutils doesn't
> support it anyway).
> Binutils trunk testsuite shows the argument isn't omitted even in the Intel
> syntax:
> grep aesencwide *.s
> keylocker.s:    aesencwide128kl 126(%edx)
> keylocker.s:    aesencwide256kl 126(%edx)
> keylocker.s:    aesencwide128kl [edx+126]
> keylocker.s:    aesencwide256kl [edx+126]
> property-10.s:   aesencwide128kl (%eax)
> x86-64-keylocker.s:     aesencwide128kl 126(%rdx)
> x86-64-keylocker.s:     aesencwide256kl 126(%rdx)
> x86-64-keylocker.s:     aesencwide128kl [rdx+126]
> x86-64-keylocker.s:     aesencwide256kl [rdx+126]
> and doesn't use any WHATEVER PTR.
>
> Ok for trunk?
Yes, thanks again.
>
> 2022-01-19  Jakub Jelinek  <jakub@redhat.com>
>
>         * config/i386/sse.md (*aes<aeswideklvariant>u8): Use %0 instead of
>         {%0}.
>
> --- gcc/config/i386/sse.md.jj   2022-01-19 02:23:59.762022574 +0100
> +++ gcc/config/i386/sse.md      2022-01-19 02:32:07.953324150 +0100
> @@ -28437,7 +28437,7 @@ (define_insn "*aes<aeswideklvariant>u8"
>             [(match_operand:BLK 0 "memory_operand" "m")]
>             AESDECENCWIDEKL))])]
>    "TARGET_WIDEKL"
> -  "aes<aeswideklvariant>\t{%0}"
> +  "aes<aeswideklvariant>\t%0"
>    [(set_attr "type" "other")])
>
>  ;; Modes handled by broadcast patterns.  NB: Allow V64QI and V32HI with
>
>
>         Jakub
>


-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2022-01-19  1:47 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-01-13  7:28 [PATCH] [i386] GLC tuning: Break false dependency for dest register Hongyu Wang
2022-01-13  7:41 ` Uros Bizjak
2022-01-14  5:38   ` Hongyu Wang
2022-01-14  6:03     ` Hongyu Wang
2022-01-14  8:37       ` Uros Bizjak
2022-01-14 13:44         ` Hongyu Wang
2022-01-14 15:49           ` Uros Bizjak
2022-01-15 16:39             ` Hongyu Wang
2022-01-15 16:43               ` Uros Bizjak
2022-01-16  4:22                 ` Hongtao Liu
2022-01-19  0:00                   ` [PATCH] i386: Fix GLC tuning with -masm=intel [PR104104] Jakub Jelinek
2022-01-19  1:01                     ` Wang, Hongyu
2022-01-19  1:09                     ` Hongtao Liu
2022-01-19  1:40                       ` [PATCH] i386: Fix *aes<aeswideklvariant>u8 Jakub Jelinek
2022-01-19  1:47                         ` Hongtao Liu
2022-01-14  8:17     ` [PATCH] [i386] GLC tuning: Break false dependency for dest register Uros Bizjak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).