public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] [i386] GLC tuning: Break false dependency for dest register.
@ 2022-01-13  7:28 Hongyu Wang
  2022-01-13  7:41 ` Uros Bizjak
  0 siblings, 1 reply; 16+ messages in thread
From: Hongyu Wang @ 2022-01-13  7:28 UTC (permalink / raw)
  To: hongtao.liu; +Cc: gcc-patches, ubizjak, wwwhhhyyy

From: wwwhhhyyy <hongyu.wang@intel.com>

Hi,

For the Golden Cove micro-architecture, force insertion of a zero idiom in
the asm template to break the false dependency on the dest register for
several insns.

The related insns are:

VPERMD/Q/PS/PD
VRANGEPD/PS/SD/SS
VGETMANTSS/SD/SH
VGETMANTPS/PD - mem version only
VPMULLQ
VFMULCSH/PH
VFCMULCSH/PH

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,}

Ok for master?

gcc/ChangeLog:

	* config/i386/i386.h (TARGET_DEST_FALSE_DEPENDENCY): New macro.
	* config/i386/i386.md (dest_false_dep): New define_attr.
	* config/i386/sse.md (<avx512>_<complexopname>_<mode><maskc_name><round_name>):
	Insert zero-idiom in output template when attr enabled, set new attribute to
	true for non-mask/maskz insn.
	(avx512fp16_<complexopname>sh_v8hf<mask_scalarc_name><round_scalarcz_name>):
	Likewise.
	(avx512dq_mul<mode>3<mask_name>): Likewise.
	(<avx2_avx512>_permvar<mode><mask_name>): Likewise.
	(avx2_perm<mode>_1<mask_name>): Likewise.
	(avx512f_perm<mode>_1<mask_name>): Likewise.
	(avx512dq_rangep<mode><mask_name><round_saeonly_name>): Likewise.
	(avx512dq_ranges<mode><mask_scalar_name><round_saeonly_scalar_name>):
	Likewise.
	(<avx512>_getmant<mode><mask_name><round_saeonly_name>): Likewise.
	(avx512f_vgetmant<mode><mask_scalar_name><round_saeonly_scalar_name>):
	Likewise.
	* config/i386/subst.md (mask3_dest_false_dep_attr): New subst_attr.
	(mask4_dest_false_dep_attr): Likewise.
	(mask6_dest_false_dep_attr): Likewise.
	(mask10_dest_false_dep_attr): Likewise.
	(maskc_dest_false_dep_attr): Likewise.
	(mask_scalar4_dest_false_dep_attr): Likewise.
	(mask_scalarc_dest_false_dep_attr): Likewise.
	* config/i386/x86-tune.def (X86_TUNE_DEST_FALSE_DEPENDENCY): New
	DEF_TUNE enabled for m_SAPPHIRERAPIDS and m_ALDERLAKE.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx2-dest-false-dependency.c: New test.
	* gcc.target/i386/avx512dq-dest-false-dependency.c: Ditto.
	* gcc.target/i386/avx512f-dest-false-dependency.c: Ditto.
	* gcc.target/i386/avx512fp16-dest-false-dependency.c: Ditto.
	* gcc.target/i386/avx512fp16vl-dest-false-dependency.c: Ditto.
	* gcc.target/i386/avx512vl-dest-false-dependency.c: Ditto.
---
 gcc/config/i386/i386.h                        |   2 +
 gcc/config/i386/i386.md                       |   4 +
 gcc/config/i386/sse.md                        | 142 +++++++++++++++---
 gcc/config/i386/subst.md                      |   7 +
 gcc/config/i386/x86-tune.def                  |   5 +
 .../i386/avx2-dest-false-dependency.c         |  24 +++
 .../i386/avx512dq-dest-false-dependency.c     |  73 +++++++++
 .../i386/avx512f-dest-false-dependency.c      | 102 +++++++++++++
 .../i386/avx512fp16-dest-false-dependency.c   |  45 ++++++
 .../i386/avx512fp16vl-dest-false-dependency.c |  24 +++
 .../i386/avx512vl-dest-false-dependency.c     |  76 ++++++++++
 11 files changed, 486 insertions(+), 18 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx2-dest-false-dependency.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512dq-dest-false-dependency.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-dest-false-dependency.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-dest-false-dependency.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16vl-dest-false-dependency.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-dest-false-dependency.c

diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 3ac0f698ae2..ddbf6b9825a 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -429,6 +429,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
 	ix86_tune_features[X86_TUNE_EXPAND_ABS]
 #define TARGET_V2DF_REDUCTION_PREFER_HADDPD \
 	ix86_tune_features[X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD]
+#define TARGET_DEST_FALSE_DEPENDENCY \
+	ix86_tune_features[X86_TUNE_DEST_FALSE_DEPENDENCY]
 
 /* Feature tests against the various architecture variations.  */
 enum ix86_arch_indices {
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 9937643a273..40a2b580740 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -823,6 +823,10 @@ (define_attr "i387_cw" "roundeven,floor,ceil,trunc,uninitialized,any"
 (define_attr "avx_partial_xmm_update" "false,true"
   (const_string "false"))
 
+;; Define attribute to indicate insns with a false dependency on the dest register
+(define_attr "dest_false_dep" "false,true"
+ (const_string "false"))
+
 ;; Define attribute to classify add/sub insns that consumes carry flag (CF)
 (define_attr "use_carry" "0,1" (const_string "0"))
 
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 0864748875e..c8dace5b2f8 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -6536,9 +6536,20 @@ (define_insn "<avx512>_<complexopname>_<mode><maskc_name><round_name>"
 	     (match_operand:VF_AVX512FP16VL 2 "nonimmediate_operand" "<round_constraint>")]
 	     UNSPEC_COMPLEX_F_C_MUL))]
   "TARGET_AVX512FP16 && <round_mode512bit_condition>"
-  "v<complexopname><ssemodesuffix>\t{<round_maskc_op3>%2, %1, %0<maskc_operand3>|%0<maskc_operand3>, %1, %2<round_maskc_op3>}"
+{
+  if (TARGET_DEST_FALSE_DEPENDENCY
+      && get_attr_dest_false_dep (insn) ==
+	 DEST_FALSE_DEP_TRUE)
+    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+  return "v<complexopname><ssemodesuffix>\t{<round_maskc_op3>%2, %1, %0<maskc_operand3>|%0<maskc_operand3>, %1, %2<round_maskc_op3>}";
+}
   [(set_attr "type" "ssemul")
-   (set_attr "mode" "<MODE>")])
+   (set_attr "mode" "<MODE>")
+   (set (attr "dest_false_dep")
+	(if_then_else
+	  (match_test "<maskc_dest_false_dep_attr>")
+	  (const_string "false")
+	  (const_string "true")))])
 
 (define_expand "avx512fp16_fmaddcsh_v8hf_maskz<round_expand_name>"
   [(match_operand:V8HF 0 "register_operand")
@@ -6742,9 +6753,20 @@ (define_insn "avx512fp16_<complexopname>sh_v8hf<mask_scalarc_name><round_scalarc
 	    (match_dup 1)
 	    (const_int 3)))]
   "TARGET_AVX512FP16"
-  "v<complexopname>sh\t{<round_scalarc_mask_op3>%2, %1, %0<mask_scalarc_operand3>|%0<mask_scalarc_operand3>, %1, %2<round_scalarc_mask_op3>}"
+{
+  if (TARGET_DEST_FALSE_DEPENDENCY
+      && get_attr_dest_false_dep (insn) ==
+	 DEST_FALSE_DEP_TRUE)
+    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+  return "v<complexopname>sh\t{<round_scalarc_mask_op3>%2, %1, %0<mask_scalarc_operand3>|%0<mask_scalarc_operand3>, %1, %2<round_scalarc_mask_op3>}";
+}
   [(set_attr "type" "ssemul")
-   (set_attr "mode" "V8HF")])
+   (set_attr "mode" "V8HF")
+   (set (attr "dest_false_dep")
+	(if_then_else
+	  (match_test "<mask_scalarc_dest_false_dep_attr>")
+	  (const_string "false")
+	  (const_string "true")))])
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
@@ -15207,10 +15229,21 @@ (define_insn "avx512dq_mul<mode>3<mask_name>"
 	  (match_operand:VI8_AVX512VL 2 "bcst_vector_operand" "vmBr")))]
   "TARGET_AVX512DQ && <mask_mode512bit_condition>
   && ix86_binary_operator_ok (MULT, <MODE>mode, operands)"
-  "vpmullq\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
+{
+  if (TARGET_DEST_FALSE_DEPENDENCY
+      && get_attr_dest_false_dep (insn) ==
+	 DEST_FALSE_DEP_TRUE)
+    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+  return "vpmullq\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}";
+}
   [(set_attr "type" "sseimul")
    (set_attr "prefix" "evex")
-   (set_attr "mode" "<sseinsnmode>")])
+   (set_attr "mode" "<sseinsnmode>")
+   (set (attr "dest_false_dep")
+	(if_then_else
+	  (match_test "<mask3_dest_false_dep_attr>")
+	  (const_string "false")
+	  (const_string "true")))])
 
 (define_expand "cond_mul<mode>"
   [(set (match_operand:VI4_AVX512VL 0 "register_operand")
@@ -24636,10 +24669,21 @@ (define_insn "<avx2_avx512>_permvar<mode><mask_name>"
 	   (match_operand:<sseintvecmode> 2 "register_operand" "v")]
 	  UNSPEC_VPERMVAR))]
   "TARGET_AVX2 && <mask_mode512bit_condition>"
-  "vperm<ssemodesuffix>\t{%1, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1}"
+{
+  if (TARGET_DEST_FALSE_DEPENDENCY
+      && get_attr_dest_false_dep (insn) ==
+	 DEST_FALSE_DEP_TRUE)
+    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+  return "vperm<ssemodesuffix>\t{%1, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1}";
+}
   [(set_attr "type" "sselog")
    (set_attr "prefix" "<mask_prefix2>")
-   (set_attr "mode" "<sseinsnmode>")])
+   (set_attr "mode" "<sseinsnmode>")
+   (set (attr "dest_false_dep")
+	(if_then_else
+	  (match_test "<mask3_dest_false_dep_attr>")
+	  (const_string "false")
+	  (const_string "true")))])
 
 (define_insn "<avx512>_permvar<mode><mask_name>"
   [(set (match_operand:VI1_AVX512VL 0 "register_operand" "=v")
@@ -24873,11 +24917,20 @@ (define_insn "avx2_perm<mode>_1<mask_name>"
   mask |= INTVAL (operands[4]) << 4;
   mask |= INTVAL (operands[5]) << 6;
   operands[2] = GEN_INT (mask);
+  if (TARGET_DEST_FALSE_DEPENDENCY
+      && get_attr_dest_false_dep (insn) ==
+	 DEST_FALSE_DEP_TRUE)
+    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
   return "vperm<ssemodesuffix>\t{%2, %1, %0<mask_operand6>|%0<mask_operand6>, %1, %2}";
 }
   [(set_attr "type" "sselog")
    (set_attr "prefix" "<mask_prefix2>")
-   (set_attr "mode" "<sseinsnmode>")])
+   (set_attr "mode" "<sseinsnmode>")
+   (set (attr "dest_false_dep")
+	(if_then_else
+	  (match_test "<mask6_dest_false_dep_attr>")
+	  (const_string "false")
+	  (const_string "true")))])
 
 (define_expand "avx512f_perm<mode>"
   [(match_operand:V8FI 0 "register_operand")
@@ -24944,11 +24997,20 @@ (define_insn "avx512f_perm<mode>_1<mask_name>"
   mask |= INTVAL (operands[4]) << 4;
   mask |= INTVAL (operands[5]) << 6;
   operands[2] = GEN_INT (mask);
+  if (TARGET_DEST_FALSE_DEPENDENCY
+      && get_attr_dest_false_dep (insn) ==
+	 DEST_FALSE_DEP_TRUE)
+    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
   return "vperm<ssemodesuffix>\t{%2, %1, %0<mask_operand10>|%0<mask_operand10>, %1, %2}";
 }
   [(set_attr "type" "sselog")
    (set_attr "prefix" "<mask_prefix2>")
-   (set_attr "mode" "<sseinsnmode>")])
+   (set_attr "mode" "<sseinsnmode>")
+   (set (attr "dest_false_dep")
+	(if_then_else
+	  (match_test "<mask10_dest_false_dep_attr>")
+	  (const_string "false")
+	  (const_string "true")))])
 
 (define_insn "avx2_permv2ti"
   [(set (match_operand:V4DI 0 "register_operand" "=x")
@@ -26843,10 +26905,21 @@ (define_insn "avx512dq_rangep<mode><mask_name><round_saeonly_name>"
 	   (match_operand:SI 3 "const_0_to_15_operand")]
 	  UNSPEC_RANGE))]
   "TARGET_AVX512DQ && <round_saeonly_mode512bit_condition>"
-  "vrange<ssemodesuffix>\t{%3, <round_saeonly_mask_op4>%2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2<round_saeonly_mask_op4>, %3}"
+{
+  if (TARGET_DEST_FALSE_DEPENDENCY
+      && get_attr_dest_false_dep (insn) ==
+	 DEST_FALSE_DEP_TRUE)
+    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+  return "vrange<ssemodesuffix>\t{%3, <round_saeonly_mask_op4>%2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2<round_saeonly_mask_op4>, %3}";
+}
   [(set_attr "type" "sse")
    (set_attr "prefix" "evex")
-   (set_attr "mode" "<MODE>")])
+   (set_attr "mode" "<MODE>")
+   (set (attr "dest_false_dep")
+	(if_then_else
+	  (match_test "<mask4_dest_false_dep_attr>")
+	  (const_string "false")
+	  (const_string "true")))])
 
 (define_insn "avx512dq_ranges<mode><mask_scalar_name><round_saeonly_scalar_name>"
   [(set (match_operand:VF_128 0 "register_operand" "=v")
@@ -26859,10 +26932,21 @@ (define_insn "avx512dq_ranges<mode><mask_scalar_name><round_saeonly_scalar_name>
 	  (match_dup 1)
 	  (const_int 1)))]
   "TARGET_AVX512DQ"
-  "vrange<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}"
+{
+  if (TARGET_DEST_FALSE_DEPENDENCY
+      && get_attr_dest_false_dep (insn) ==
+	 DEST_FALSE_DEP_TRUE)
+    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+  return "vrange<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}";
+}
   [(set_attr "type" "sse")
    (set_attr "prefix" "evex")
-   (set_attr "mode" "<MODE>")])
+   (set_attr "mode" "<MODE>")
+   (set (attr "dest_false_dep")
+	(if_then_else
+	  (match_test "<mask_scalar4_dest_false_dep_attr>")
+	  (const_string "false")
+	  (const_string "true")))])
 
 (define_insn "avx512dq_fpclass<mode><mask_scalar_merge_name>"
   [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
@@ -26899,9 +26983,20 @@ (define_insn "<avx512>_getmant<mode><mask_name><round_saeonly_name>"
 	   (match_operand:SI 2 "const_0_to_15_operand")]
 	  UNSPEC_GETMANT))]
   "TARGET_AVX512F"
-  "vgetmant<ssemodesuffix>\t{%2, <round_saeonly_mask_op3>%1, %0<mask_operand3>|%0<mask_operand3>, %1<round_saeonly_mask_op3>, %2}";
+{
+  if (TARGET_DEST_FALSE_DEPENDENCY
+      && get_attr_dest_false_dep (insn) ==
+	 DEST_FALSE_DEP_TRUE)
+    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+  return "vgetmant<ssemodesuffix>\t{%2, <round_saeonly_mask_op3>%1, %0<mask_operand3>|%0<mask_operand3>, %1<round_saeonly_mask_op3>, %2}";
+}
   [(set_attr "prefix" "evex")
-   (set_attr "mode" "<MODE>")])
+   (set_attr "mode" "<MODE>")
+   (set (attr "dest_false_dep")
+	(if_then_else
+	  (match_test "!MEM_P (operands[1]) || <mask3_dest_false_dep_attr>")
+	  (const_string "false")
+	  (const_string "true")))])
 
 (define_insn "avx512f_vgetmant<mode><mask_scalar_name><round_saeonly_scalar_name>"
   [(set (match_operand:VFH_128 0 "register_operand" "=v")
@@ -26914,9 +27009,20 @@ (define_insn "avx512f_vgetmant<mode><mask_scalar_name><round_saeonly_scalar_name
 	  (match_dup 1)
 	  (const_int 1)))]
    "TARGET_AVX512F"
-   "vgetmant<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}";
+{
+  if (TARGET_DEST_FALSE_DEPENDENCY
+      && get_attr_dest_false_dep (insn) ==
+	 DEST_FALSE_DEP_TRUE)
+    output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands);
+  return "vgetmant<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}";
+}
    [(set_attr "prefix" "evex")
-   (set_attr "mode" "<ssescalarmode>")])
+   (set_attr "mode" "<ssescalarmode>")
+   (set (attr "dest_false_dep")
+	(if_then_else
+	  (match_test "<mask_scalar4_dest_false_dep_attr>")
+	  (const_string "false")
+	  (const_string "true")))])
 
 ;; The correct representation for this is absolutely enormous, and
 ;; surely not generally useful.
diff --git a/gcc/config/i386/subst.md b/gcc/config/i386/subst.md
index 21d445cc46c..802a8715b01 100644
--- a/gcc/config/i386/subst.md
+++ b/gcc/config/i386/subst.md
@@ -71,6 +71,11 @@ (define_subst_attr "bcst_mask_prefix3" "mask" "orig,maybe_evex" "evex,evex")
 (define_subst_attr "mask_prefix4" "mask" "orig,orig,vex" "evex,evex,evex")
 (define_subst_attr "bcst_mask_prefix4" "mask" "orig,orig,maybe_evex" "evex,evex,evex")
 (define_subst_attr "mask_expand_op3" "mask" "3" "5")
+(define_subst_attr "mask3_dest_false_dep_attr" "mask" "0" "operands[3] != CONST0_RTX(<MODE>mode)")
+(define_subst_attr "mask4_dest_false_dep_attr" "mask" "0" "operands[4] != CONST0_RTX(<MODE>mode)")
+(define_subst_attr "mask6_dest_false_dep_attr" "mask" "0" "operands[6] != CONST0_RTX(<MODE>mode)")
+(define_subst_attr "mask10_dest_false_dep_attr" "mask" "0" "operands[10] != CONST0_RTX(<MODE>mode)")
+(define_subst_attr "maskc_dest_false_dep_attr" "maskc" "0" "operands[3] != CONST0_RTX(<MODE>mode)")
 
 (define_subst "mask"
   [(set (match_operand:SUBST_V 0)
@@ -337,6 +342,8 @@ (define_subst_attr "mask_scalarc_operand3" "mask_scalarc" "" "%{%4%}%N3")
 (define_subst_attr "mask_scalar_operand3" "mask_scalar" "" "%{%4%}%N3")
 (define_subst_attr "mask_scalar_operand4" "mask_scalar" "" "%{%5%}%N4")
 (define_subst_attr "mask_scalarcz_operand4" "mask_scalarcz" "" "%{%5%}%N4")
+(define_subst_attr "mask_scalar4_dest_false_dep_attr" "mask_scalar" "0" "operands[4] != CONST0_RTX(<MODE>mode)")
+(define_subst_attr "mask_scalarc_dest_false_dep_attr" "mask_scalarc" "0" "operands[3] != CONST0_RTX(V8HFmode)")
 
 (define_subst "mask_scalar"
   [(set (match_operand:SUBST_V 0)
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 0d3fd078068..1b42c96fc38 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -79,6 +79,11 @@ DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY,
 	  m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
 	  | m_BDVER | m_ZNVER | m_ALDERLAKE | m_GENERIC)
 
+/* X86_TUNE_DEST_FALSE_DEPENDENCY: This knob inserts a zero-idiom before
+   several insns to break the false dependency on the dest register.  */
+DEF_TUNE (X86_TUNE_DEST_FALSE_DEPENDENCY,
+	  "dest_false_dependency", m_SAPPHIRERAPIDS | m_ALDERLAKE)
+
 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
    are resolved on SSE register parts instead of whole registers, so we may
    maintain just lower part of scalar values in proper format leaving the
diff --git a/gcc/testsuite/gcc.target/i386/avx2-dest-false-dependency.c b/gcc/testsuite/gcc.target/i386/avx2-dest-false-dependency.c
new file mode 100644
index 00000000000..e138920ce18
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-dest-false-dependency.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -mtune-ctrl=dest_false_dependency -O2" } */
+
+
+#include <immintrin.h>
+
+extern __m256i i1, i2, i3, i4;
+extern __m256d d1, d2;
+extern __m256 f1, f2;
+
+void vperm_test (void)
+{
+  i3 = _mm256_permutevar8x32_epi32 (i1, i2);
+  i4 = _mm256_permute4x64_epi64 (i1, 12);
+  d2 = _mm256_permute4x64_pd (d1, 12);
+  f2 = _mm256_permutevar8x32_ps (f1, i2);
+}
+
+/* { dg-final { scan-assembler-times "vxorps" 4 } } */
+/* { dg-final { scan-assembler-times "vpermd" 1 } } */
+/* { dg-final { scan-assembler-times "vpermq" 1 } } */
+/* { dg-final { scan-assembler-times "vpermpd" 1 } } */
+/* { dg-final { scan-assembler-times "vpermps" 1 } } */
+
diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-dest-false-dependency.c b/gcc/testsuite/gcc.target/i386/avx512dq-dest-false-dependency.c
new file mode 100644
index 00000000000..2feb58f2cd8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512dq-dest-false-dependency.c
@@ -0,0 +1,73 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512dq -mavx512vl -mtune-ctrl=dest_false_dependency -O2" } */
+
+#include <immintrin.h>
+
+extern __m512i i1;
+extern __m256i i2;
+extern __m128i i3;
+extern __m512d d1;
+extern __m256d d2;
+extern __m128d d3;
+extern __m512 f1;
+extern __m256 f2;
+extern __m128 f3;
+
+__mmask32 m32;
+__mmask16 m16;
+__mmask8 m8;
+
+void mullo_test (void)
+{
+  i1 = _mm512_mullo_epi64 (i1, i1);
+  i1 = _mm512_mask_mullo_epi64 (i1, m8, i1, i1);
+  i1 = _mm512_maskz_mullo_epi64 (m8, i1, i1);
+  i2 = _mm256_mullo_epi64 (i2, i2);
+  i2 = _mm256_mask_mullo_epi64 (i2, m8, i2, i2);
+  i2 = _mm256_maskz_mullo_epi64 (m8, i2, i2);
+  i3 = _mm_mullo_epi64 (i3, i3);
+  i3 = _mm_mask_mullo_epi64 (i3, m8, i3, i3);
+  i3 = _mm_maskz_mullo_epi64 (m8, i3, i3);
+}
+
+void range_test (void)
+{
+  d1 = _mm512_range_pd (d1, d1, 15);
+  d1 = _mm512_range_round_pd (d1, d1, 15, 8);
+  d1 = _mm512_mask_range_pd (d1, m8, d1, d1, 15);
+  d1 = _mm512_mask_range_round_pd (d1, m8, d1, d1, 15, 8);
+  d1 = _mm512_maskz_range_pd (m8, d1, d1, 15);
+  d1 = _mm512_maskz_range_round_pd (m8, d1, d1, 15, 8);
+  d2 = _mm256_range_pd (d2, d2, 15);
+  d2 = _mm256_mask_range_pd (d2, m8, d2, d2, 15);
+  d2 = _mm256_maskz_range_pd (m8, d2, d2, 15);
+  d3 = _mm_range_pd (d3, d3, 15);
+  d3 = _mm_mask_range_pd (d3, m8, d3, d3, 15);
+  d3 = _mm_maskz_range_pd (m8, d3, d3, 15);
+  d3 = _mm_range_sd (d3, d3, 15);
+  d3 = _mm_mask_range_sd (d3, m8, d3, d3, 15);
+  d3 = _mm_maskz_range_sd (m8, d3, d3, 15);
+
+  f1 = _mm512_range_ps (f1, f1, 15);
+  f1 = _mm512_range_round_ps (f1, f1, 15, 8);
+  f1 = _mm512_mask_range_ps (f1, m16, f1, f1, 15);
+  f1 = _mm512_mask_range_round_ps (f1, m16, f1, f1, 15, 8);
+  f1 = _mm512_maskz_range_ps (m16, f1, f1, 15);
+  f1 = _mm512_maskz_range_round_ps (m16, f1, f1, 15, 8);
+  f2 = _mm256_range_ps (f2, f2, 15);
+  f2 = _mm256_mask_range_ps (f2, m8, f2, f2, 15);
+  f2 = _mm256_maskz_range_ps (m8, f2, f2, 15);
+  f3 = _mm_range_ps (f3, f3, 15);
+  f3 = _mm_mask_range_ps (f3, m8, f3, f3, 15);
+  f3 = _mm_maskz_range_ps (m8, f3, f3, 15);
+  f3 = _mm_range_ss (f3, f3, 15);
+  f3 = _mm_mask_range_ss (f3, m8, f3, f3, 15);
+  f3 = _mm_maskz_range_ss (m8, f3, f3, 15);
+}
+
+/* { dg-final { scan-assembler-times "vxorps" 26 } } */
+/* { dg-final { scan-assembler-times "vpmullq" 9 } } */
+/* { dg-final { scan-assembler-times "vrangepd" 12 } } */
+/* { dg-final { scan-assembler-times "vrangesd" 3 } } */
+/* { dg-final { scan-assembler-times "vrangeps" 12 } } */
+/* { dg-final { scan-assembler-times "vrangess" 3 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-dest-false-dependency.c b/gcc/testsuite/gcc.target/i386/avx512f-dest-false-dependency.c
new file mode 100644
index 00000000000..9650839970e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-dest-false-dependency.c
@@ -0,0 +1,102 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -mtune-ctrl=dest_false_dependency -O2" } */
+
+#include <immintrin.h>
+
+extern __m512i i1;
+extern __m512d d1, *pd1;
+extern __m128d d2;
+extern __m512 f1, *pf1;
+extern __m128 f2;
+
+__mmask16 m16;
+__mmask8 m8;
+
+void vperm_test (void)
+{
+  d1 = _mm512_permutex_pd (d1, 12);
+  d1 = _mm512_mask_permutex_pd (d1, m8, d1, 12);
+  d1 = _mm512_maskz_permutex_pd (m8, d1, 12);
+  d1 = _mm512_permutexvar_pd (i1, d1);
+  d1 = _mm512_mask_permutexvar_pd (d1, m8, i1, d1);
+  d1 = _mm512_maskz_permutexvar_pd (m8, i1, d1);
+
+  f1 = _mm512_permutexvar_ps (i1, f1);
+  f1 = _mm512_mask_permutexvar_ps (f1, m16, i1, f1);
+  f1 = _mm512_maskz_permutexvar_ps (m16, i1, f1);
+
+  i1 = _mm512_permutexvar_epi64 (i1, i1);
+  i1 = _mm512_mask_permutexvar_epi64 (i1, m8, i1, i1);
+  i1 = _mm512_maskz_permutexvar_epi64 (m8, i1, i1);
+  i1 = _mm512_permutex_epi64 (i1, 12);
+  i1 = _mm512_mask_permutex_epi64 (i1, m8, i1, 12);
+  i1 = _mm512_maskz_permutex_epi64 (m8, i1, 12);
+
+  i1 = _mm512_permutexvar_epi32 (i1, i1);
+  i1 = _mm512_mask_permutexvar_epi32 (i1, m16, i1, i1);
+  i1 = _mm512_maskz_permutexvar_epi32 (m16, i1, i1);
+} 
+
+void getmant_test (void)
+{
+  d1 = _mm512_getmant_pd (*pd1, _MM_MANT_NORM_p75_1p5,
+			  _MM_MANT_SIGN_src);
+  d1 = _mm512_getmant_round_pd (*pd1, _MM_MANT_NORM_p75_1p5,
+				_MM_MANT_SIGN_src, 8);
+  d1 = _mm512_mask_getmant_pd (d1, m8, *pd1, _MM_MANT_NORM_p75_1p5,
+			       _MM_MANT_SIGN_src);
+  d1 = _mm512_mask_getmant_round_pd (d1, m8, *pd1, _MM_MANT_NORM_p75_1p5,
+				     _MM_MANT_SIGN_src, 8);
+  d1 = _mm512_maskz_getmant_pd (m8, *pd1, _MM_MANT_NORM_p75_1p5,
+				_MM_MANT_SIGN_src);
+  d1 = _mm512_maskz_getmant_round_pd (m8, *pd1, _MM_MANT_NORM_p75_1p5,
+				      _MM_MANT_SIGN_src, 8);
+  f1 = _mm512_getmant_ps (*pf1, _MM_MANT_NORM_p75_1p5,
+			  _MM_MANT_SIGN_src);
+  f1 = _mm512_getmant_round_ps (*pf1, _MM_MANT_NORM_p75_1p5,
+				_MM_MANT_SIGN_src, 8);
+  f1 = _mm512_mask_getmant_ps (f1, m16, *pf1, _MM_MANT_NORM_p75_1p5,
+			       _MM_MANT_SIGN_src);
+  f1 = _mm512_mask_getmant_round_ps (f1, m16, *pf1, _MM_MANT_NORM_p75_1p5,
+				     _MM_MANT_SIGN_src, 8);
+  f1 = _mm512_maskz_getmant_ps (m16, *pf1, _MM_MANT_NORM_p75_1p5,
+				_MM_MANT_SIGN_src);
+  f1 = _mm512_maskz_getmant_round_ps (m16, *pf1, _MM_MANT_NORM_p75_1p5,
+				      _MM_MANT_SIGN_src, 8);
+
+  d2 = _mm_getmant_sd (d2, d2, _MM_MANT_NORM_p75_1p5,
+		       _MM_MANT_SIGN_src);
+  d2 = _mm_getmant_round_sd (d2, d2, _MM_MANT_NORM_p75_1p5,
+			     _MM_MANT_SIGN_src, 8);
+  d2 = _mm_mask_getmant_sd (d2, m8, d2, d2, _MM_MANT_NORM_p75_1p5,
+			    _MM_MANT_SIGN_src);
+  d2 = _mm_mask_getmant_round_sd (d2, m8, d2, d2, _MM_MANT_NORM_p75_1p5,
+				  _MM_MANT_SIGN_src, 8);
+  d2 = _mm_maskz_getmant_sd (m8, d2, d2, _MM_MANT_NORM_p75_1p5,
+			     _MM_MANT_SIGN_src);
+  d2 = _mm_maskz_getmant_round_sd (m8, d2, d2, _MM_MANT_NORM_p75_1p5,
+				   _MM_MANT_SIGN_src, 8);
+  f2 = _mm_getmant_ss (f2, f2, _MM_MANT_NORM_p75_1p5,
+		       _MM_MANT_SIGN_src);
+  f2 = _mm_getmant_round_ss (f2, f2, _MM_MANT_NORM_p75_1p5,
+			     _MM_MANT_SIGN_src, 8);
+  f2 = _mm_mask_getmant_ss (f2, m8, f2, f2, _MM_MANT_NORM_p75_1p5,
+			    _MM_MANT_SIGN_src);
+  f2 = _mm_mask_getmant_round_ss (f2, m8, f2, f2, _MM_MANT_NORM_p75_1p5,
+				  _MM_MANT_SIGN_src, 8);
+  f2 = _mm_maskz_getmant_ss (m8, f2, f2, _MM_MANT_NORM_p75_1p5,
+			     _MM_MANT_SIGN_src);
+  f2 = _mm_maskz_getmant_round_ss (m8, f2, f2, _MM_MANT_NORM_p75_1p5,
+				   _MM_MANT_SIGN_src, 8);
+
+}
+
+/* { dg-final { scan-assembler-times "vxorps" 24 } } */
+/* { dg-final { scan-assembler-times "vpermd" 3 } } */
+/* { dg-final { scan-assembler-times "vpermq" 6 } } */
+/* { dg-final { scan-assembler-times "vpermps" 3 } } */
+/* { dg-final { scan-assembler-times "vpermpd" 6 } } */
+/* { dg-final { scan-assembler-times "vgetmantpd" 6 } } */
+/* { dg-final { scan-assembler-times "vgetmantps" 6 } } */
+/* { dg-final { scan-assembler-times "vgetmantsd" 6 } } */
+/* { dg-final { scan-assembler-times "vgetmantss" 6 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-dest-false-dependency.c b/gcc/testsuite/gcc.target/i386/avx512fp16-dest-false-dependency.c
new file mode 100644
index 00000000000..793bb66201b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-dest-false-dependency.c
@@ -0,0 +1,45 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -mtune-ctrl=dest_false_dependency -O2" } */
+
+#include <immintrin.h>
+
+extern __m512h h1;
+extern __m256h h2;
+extern __m128h h3;
+
+__mmask32 m32;
+__mmask16 m16;
+__mmask8 m8;
+
+void complex_mul_test (void)
+{
+  h1 = _mm512_fmul_pch (h1, h1);
+  h1 = _mm512_fmul_round_pch (h1, h1, 8);
+  h1 = _mm512_mask_fmul_pch (h1, m32, h1, h1);
+  h1 = _mm512_mask_fmul_round_pch (h1, m32, h1, h1, 8);
+  h1 = _mm512_maskz_fmul_pch (m32, h1, h1);
+  h1 = _mm512_maskz_fmul_round_pch (m32, h1, h1, 11);
+
+  h3 = _mm_fmul_sch (h3, h3);
+  h3 = _mm_fmul_round_sch (h3, h3, 8);
+  h3 = _mm_mask_fmul_sch (h3, m8, h3, h3);
+  h3 = _mm_mask_fmul_round_sch (h3, m8, h3, h3, 8);
+  h3 = _mm_maskz_fmul_sch (m8, h3, h3);
+  h3 = _mm_maskz_fmul_round_sch (m8, h3, h3, 11);
+}
+
+void vgetmant_test (void)
+{
+  h3 = _mm_getmant_sh (h3, h3, _MM_MANT_NORM_p75_1p5,
+		       _MM_MANT_SIGN_src);
+  h3 = _mm_mask_getmant_sh (h3, m8, h3, h3, _MM_MANT_NORM_p75_1p5,
+			    _MM_MANT_SIGN_src);
+  h3 = _mm_maskz_getmant_sh (m8, h3, h3, _MM_MANT_NORM_p75_1p5,
+			     _MM_MANT_SIGN_src);
+}    
+
+/* { dg-final { scan-assembler-times "vxorps" 10 } } */
+/* { dg-final { scan-assembler-times "vfmulcph" 6 } } */
+/* { dg-final { scan-assembler-times "vfmulcsh" 6 } } */
+/* { dg-final { scan-assembler-times "vgetmantsh" 3 } } */
+
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16vl-dest-false-dependency.c b/gcc/testsuite/gcc.target/i386/avx512fp16vl-dest-false-dependency.c
new file mode 100644
index 00000000000..09658905d2d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16vl-dest-false-dependency.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -mtune-ctrl=dest_false_dependency -O2" } */
+
+#include <immintrin.h>
+
+extern __m256h h1;
+extern __m128h h2;
+
+__mmask16 m16;
+__mmask8 m8;
+
+void complex_mul_test (void)
+{
+  h1 = _mm256_fmul_pch (h1, h1);
+  h1 = _mm256_mask_fmul_pch (h1, m16, h1, h1);
+  h1 = _mm256_maskz_fmul_pch (m16, h1, h1);
+  h2 = _mm_fmul_pch (h2, h2);
+  h2 = _mm_mask_fmul_pch (h2, m16, h2, h2);
+  h2 = _mm_maskz_fmul_pch (m16, h2, h2);
+}
+
+/* { dg-final { scan-assembler-times "vxorps" 4 } } */
+/* { dg-final { scan-assembler-times "vfmulcph" 6 } } */
+
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-dest-false-dependency.c b/gcc/testsuite/gcc.target/i386/avx512vl-dest-false-dependency.c
new file mode 100644
index 00000000000..92717a99837
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-dest-false-dependency.c
@@ -0,0 +1,76 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -mavx512vl -mtune-ctrl=dest_false_dependency -O2" } */
+
+
+#include <immintrin.h>
+
+extern __m256i i1;
+extern __m256d d1, *pd1;
+extern __m128d d2, *pd2;
+extern __m256 f1, *pf1;
+extern __m128 f2, *pf2;
+
+__mmask16 m16;
+__mmask8 m8;
+
+void vperm_test (void)
+{
+  d1 = _mm256_permutex_pd (d1, 12);
+  d1 = _mm256_mask_permutex_pd (d1, m8, d1, 12);
+  d1 = _mm256_maskz_permutex_pd (m8, d1, 12);
+  d1 = _mm256_permutexvar_pd (i1, d1);
+  d1 = _mm256_mask_permutexvar_pd (d1, m8, i1, d1);
+  d1 = _mm256_maskz_permutexvar_pd (m8, i1, d1);
+
+  f1 = _mm256_permutexvar_ps (i1, f1);
+  f1 = _mm256_mask_permutexvar_ps (f1, m8, i1, f1);
+  f1 = _mm256_maskz_permutexvar_ps (m8, i1, f1);
+
+  i1 = _mm256_permutexvar_epi64 (i1, i1);
+  i1 = _mm256_mask_permutexvar_epi64 (i1, m8, i1, i1);
+  i1 = _mm256_maskz_permutexvar_epi64 (m8, i1, i1);
+  i1 = _mm256_permutex_epi64 (i1, 12);
+  i1 = _mm256_mask_permutex_epi64 (i1, m8, i1, 12);
+  i1 = _mm256_maskz_permutex_epi64 (m8, i1, 12);
+
+  i1 = _mm256_permutexvar_epi32 (i1, i1);
+  i1 = _mm256_mask_permutexvar_epi32 (i1, m8, i1, i1);
+  i1 = _mm256_maskz_permutexvar_epi32 (m8, i1, i1);
+} 
+
+void getmant_test (void)
+{
+  d1 = _mm256_getmant_pd (*pd1, _MM_MANT_NORM_p75_1p5,
+			  _MM_MANT_SIGN_src);
+  d1 = _mm256_mask_getmant_pd (d1, m8, *pd1, _MM_MANT_NORM_p75_1p5,
+			       _MM_MANT_SIGN_src);
+  d1 = _mm256_maskz_getmant_pd (m8, *pd1, _MM_MANT_NORM_p75_1p5,
+				_MM_MANT_SIGN_src);
+  d2 = _mm_getmant_pd (*pd2, _MM_MANT_NORM_p75_1p5,
+		       _MM_MANT_SIGN_src);
+  d2 = _mm_mask_getmant_pd (d2, m8, *pd2, _MM_MANT_NORM_p75_1p5,
+			    _MM_MANT_SIGN_src);
+  d2 = _mm_maskz_getmant_pd (m8, *pd2, _MM_MANT_NORM_p75_1p5,
+			     _MM_MANT_SIGN_src);
+  f1 = _mm256_getmant_ps (*pf1, _MM_MANT_NORM_p75_1p5,
+			  _MM_MANT_SIGN_src);
+  f1 = _mm256_mask_getmant_ps (f1, m8, *pf1, _MM_MANT_NORM_p75_1p5,
+			       _MM_MANT_SIGN_src);
+  f1 = _mm256_maskz_getmant_ps (m8, *pf1, _MM_MANT_NORM_p75_1p5,
+				_MM_MANT_SIGN_src);
+  f2 = _mm_getmant_ps (*pf2, _MM_MANT_NORM_p75_1p5,
+		       _MM_MANT_SIGN_src);
+  f2 = _mm_mask_getmant_ps (f2, m8, *pf2, _MM_MANT_NORM_p75_1p5,
+			    _MM_MANT_SIGN_src);
+  f2 = _mm_maskz_getmant_ps (m8, *pf2, _MM_MANT_NORM_p75_1p5,
+			     _MM_MANT_SIGN_src);
+}
+
+/* { dg-final { scan-assembler-times "vxorps" 20 } } */
+/* { dg-final { scan-assembler-times "vpermpd" 6 } } */
+/* { dg-final { scan-assembler-times "vpermps" 3 } } */
+/* { dg-final { scan-assembler-times "vpermq" 6 } } */
+/* { dg-final { scan-assembler-times "vpermd" 3 } } */
+/* { dg-final { scan-assembler-times "vgetmantpd" 6 } } */
+/* { dg-final { scan-assembler-times "vgetmantps" 6 } } */
+
-- 
2.18.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2022-01-19  1:47 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-01-13  7:28 [PATCH] [i386] GLC tuning: Break false dependency for dest register Hongyu Wang
2022-01-13  7:41 ` Uros Bizjak
2022-01-14  5:38   ` Hongyu Wang
2022-01-14  6:03     ` Hongyu Wang
2022-01-14  8:37       ` Uros Bizjak
2022-01-14 13:44         ` Hongyu Wang
2022-01-14 15:49           ` Uros Bizjak
2022-01-15 16:39             ` Hongyu Wang
2022-01-15 16:43               ` Uros Bizjak
2022-01-16  4:22                 ` Hongtao Liu
2022-01-19  0:00                   ` [PATCH] i386: Fix GLC tuning with -masm=intel [PR104104] Jakub Jelinek
2022-01-19  1:01                     ` Wang, Hongyu
2022-01-19  1:09                     ` Hongtao Liu
2022-01-19  1:40                       ` [PATCH] i386: Fix *aes<aeswideklvariant>u8 Jakub Jelinek
2022-01-19  1:47                         ` Hongtao Liu
2022-01-14  8:17     ` [PATCH] [i386] GLC tuning: Break false dependency for dest register Uros Bizjak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).