* [gcc r12-1253] Fix _mm256_zeroupper by representing the instructions as call_insns in which the call has a special
@ 2021-06-07  2:25 hongtao Liu
From: hongtao Liu @ 2021-06-07  2:25 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:9a90b311f22956addaf4f5f9bdb3592afd45083f

commit r12-1253-g9a90b311f22956addaf4f5f9bdb3592afd45083f
Author: liuhongt <hongtao.liu@intel.com>
Date:   Tue Jun 1 09:09:44 2021 +0800

    Fix _mm256_zeroupper by representing the instructions as call_insns in which the call has a special vzeroupper ABI.
    
    When __builtin_ia32_vzeroupper is called explicitly, the corresponding
    vzeroupper pattern does not carry any CLOBBERs or SETs before LRA,
    which leads to incorrect optimization in pass_reload.  To solve this
    problem, this patch represents the instructions as call_insns in which
    the call has a special vzeroupper ABI.
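
    As a condensed sketch of the failure mode (the full runtime check is
    added below as gcc.target/i386/pr82735-1.c; the sketch assumes
    -O2 -mavx), a 256-bit value that is live across the explicit
    vzeroupper loses its upper 128 bits, so the compiler must not reuse
    the stale register:

        #include <immintrin.h>

        void
        mtest (char *dest)
        {
          __m256i ymm1 = _mm256_set1_epi8 (1);
          _mm256_storeu_si256 ((__m256i *) (dest + 32), ymm1);
          _mm256_zeroupper ();  /* Upper halves of all ymm regs become zero.  */
          /* ymm2 must be recomputed here, not CSEd from ymm1's register,
             whose upper half is now zero.  */
          __m256i ymm2 = _mm256_set1_epi8 (1);
          _mm256_storeu_si256 ((__m256i *) dest, ymm2);
        }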
    
    gcc/ChangeLog:
    
            PR target/82735
            * config/i386/i386-expand.c (ix86_expand_builtin): Remove
            assignment of cfun->machine->has_explicit_vzeroupper.
            * config/i386/i386-features.c
            (ix86_add_reg_usage_to_vzerouppers): Delete.
            (ix86_add_reg_usage_to_vzeroupper): Ditto.
            (rest_of_handle_insert_vzeroupper): Remove
            ix86_add_reg_usage_to_vzerouppers, add df_analyze at the end
            of the function.
            (gate): Remove cfun->machine->has_explicit_vzeroupper.
            * config/i386/i386-protos.h (ix86_expand_avx_vzeroupper):
            Declare.
            * config/i386/i386.c (ix86_insn_callee_abi): New function.
            (ix86_initialize_callee_abi): Ditto.
            (ix86_expand_avx_vzeroupper): Ditto.
            (ix86_hard_regno_call_part_clobbered): Adjust for vzeroupper
            ABI.
            (TARGET_INSN_CALLEE_ABI): Define as ix86_insn_callee_abi.
            (ix86_emit_mode_set): Call ix86_expand_avx_vzeroupper
            directly.
            * config/i386/i386.h (struct GTY(()) machine_function): Delete
            has_explicit_vzeroupper.
            * config/i386/i386.md (enum unspec): New member
            UNSPEC_CALLEE_ABI.
            (ABI_DEFAULT, ABI_VZEROUPPER, ABI_UNKNOWN): New
            define_constants for insn callee ABI index.
            * config/i386/predicates.md (vzeroupper_pattern): Adjust.
            * config/i386/sse.md (UNSPECV_VZEROUPPER): Delete.
            (avx_vzeroupper): Call ix86_expand_avx_vzeroupper.
            (*avx_vzeroupper): Rename to ..
            (avx_vzeroupper_callee_abi): .. this, and adjust the pattern
            to a call_insn which has a special vzeroupper ABI.
            (*avx_vzeroupper_1): Delete.
    
    gcc/testsuite/ChangeLog:
    
            PR target/82735
            * gcc.target/i386/pr82735-1.c: New test.
            * gcc.target/i386/pr82735-2.c: New test.
            * gcc.target/i386/pr82735-3.c: New test.
            * gcc.target/i386/pr82735-4.c: New test.
            * gcc.target/i386/pr82735-5.c: New test.

Diff:
---
 gcc/config/i386/i386-expand.c             |  4 --
 gcc/config/i386/i386-features.c           | 99 ++++---------------------------
 gcc/config/i386/i386-protos.h             |  1 +
 gcc/config/i386/i386.c                    | 55 ++++++++++++++++-
 gcc/config/i386/i386.h                    |  4 --
 gcc/config/i386/i386.md                   | 10 ++++
 gcc/config/i386/predicates.md             |  5 +-
 gcc/config/i386/sse.md                    | 59 +++++-------------
 gcc/testsuite/gcc.target/i386/pr82735-1.c | 29 +++++++++
 gcc/testsuite/gcc.target/i386/pr82735-2.c | 22 +++++++
 gcc/testsuite/gcc.target/i386/pr82735-3.c |  5 ++
 gcc/testsuite/gcc.target/i386/pr82735-4.c | 48 +++++++++++++++
 gcc/testsuite/gcc.target/i386/pr82735-5.c | 54 +++++++++++++++++
 13 files changed, 252 insertions(+), 143 deletions(-)

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 804cb596867..fb0676f1158 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -13310,10 +13310,6 @@ rdseed_step:
 
       return 0;
 
-    case IX86_BUILTIN_VZEROUPPER:
-      cfun->machine->has_explicit_vzeroupper = true;
-      break;
-
     default:
       break;
     }
diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c
index 77783a154b6..a25769ae478 100644
--- a/gcc/config/i386/i386-features.c
+++ b/gcc/config/i386/i386-features.c
@@ -1768,92 +1768,22 @@ convert_scalars_to_vector (bool timode_p)
   return 0;
 }
 
-/* Modify the vzeroupper pattern in INSN so that it describes the effect
-   that the instruction has on the SSE registers.  LIVE_REGS are the set
-   of registers that are live across the instruction.
-
-   For a live register R we use:
-
-     (set (reg:V2DF R) (reg:V2DF R))
-
-   which preserves the low 128 bits but clobbers the upper bits.  */
-
-static void
-ix86_add_reg_usage_to_vzeroupper (rtx_insn *insn, bitmap live_regs)
-{
-  rtx pattern = PATTERN (insn);
-  unsigned int nregs = TARGET_64BIT ? 16 : 8;
-  unsigned int npats = nregs;
-  for (unsigned int i = 0; i < nregs; ++i)
-    {
-      unsigned int regno = GET_SSE_REGNO (i);
-      if (!bitmap_bit_p (live_regs, regno))
-	npats--;
-    }
-  if (npats == 0)
-    return;
-  rtvec vec = rtvec_alloc (npats + 1);
-  RTVEC_ELT (vec, 0) = XVECEXP (pattern, 0, 0);
-  for (unsigned int i = 0, j = 0; i < nregs; ++i)
-    {
-      unsigned int regno = GET_SSE_REGNO (i);
-      if (!bitmap_bit_p (live_regs, regno))
-	continue;
-      rtx reg = gen_rtx_REG (V2DImode, regno);
-      ++j;
-      RTVEC_ELT (vec, j) = gen_rtx_SET (reg, reg);
-    }
-  XVEC (pattern, 0) = vec;
-  INSN_CODE (insn) = -1;
-  df_insn_rescan (insn);
-}
-
-/* Walk the vzeroupper instructions in the function and annotate them
-   with the effect that they have on the SSE registers.  */
-
-static void
-ix86_add_reg_usage_to_vzerouppers (void)
-{
-  basic_block bb;
-  rtx_insn *insn;
-  auto_bitmap live_regs;
-
-  df_analyze ();
-  FOR_EACH_BB_FN (bb, cfun)
-    {
-      bitmap_copy (live_regs, df_get_live_out (bb));
-      df_simulate_initialize_backwards (bb, live_regs);
-      FOR_BB_INSNS_REVERSE (bb, insn)
-	{
-	  if (!NONDEBUG_INSN_P (insn))
-	    continue;
-	  if (vzeroupper_pattern (PATTERN (insn), VOIDmode))
-	    ix86_add_reg_usage_to_vzeroupper (insn, live_regs);
-	  df_simulate_one_insn_backwards (bb, insn, live_regs);
-	}
-    }
-}
-
 static unsigned int
 rest_of_handle_insert_vzeroupper (void)
 {
-  if (TARGET_VZEROUPPER
-      && flag_expensive_optimizations
-      && !optimize_size)
-    {
-      /* vzeroupper instructions are inserted immediately after reload to
-	 account for possible spills from 256bit or 512bit registers.  The pass
-	 reuses mode switching infrastructure by re-running mode insertion
-	 pass, so disable entities that have already been processed.  */
-      for (int i = 0; i < MAX_386_ENTITIES; i++)
-	ix86_optimize_mode_switching[i] = 0;
+  /* vzeroupper instructions are inserted immediately after reload to
+     account for possible spills from 256bit or 512bit registers.  The pass
+     reuses mode switching infrastructure by re-running mode insertion
+     pass, so disable entities that have already been processed.  */
+  for (int i = 0; i < MAX_386_ENTITIES; i++)
+    ix86_optimize_mode_switching[i] = 0;
 
-      ix86_optimize_mode_switching[AVX_U128] = 1;
+  ix86_optimize_mode_switching[AVX_U128] = 1;
 
-      /* Call optimize_mode_switching.  */
-      g->get_passes ()->execute_pass_mode_switching ();
-    }
-  ix86_add_reg_usage_to_vzerouppers ();
+  /* Call optimize_mode_switching.  */
+  g->get_passes ()->execute_pass_mode_switching ();
+
+  df_analyze ();
   return 0;
 }
 
@@ -1882,11 +1812,8 @@ public:
   /* opt_pass methods: */
   virtual bool gate (function *)
     {
-      return TARGET_AVX
-	     && ((TARGET_VZEROUPPER
-		  && flag_expensive_optimizations
-		  && !optimize_size)
-		 || cfun->machine->has_explicit_vzeroupper);
+      return TARGET_AVX && TARGET_VZEROUPPER
+	&& flag_expensive_optimizations && !optimize_size;
     }
 
   virtual unsigned int execute (function *)
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 7782cf1163f..e6ac9390777 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -216,6 +216,7 @@ extern rtx ix86_split_stack_guard (void);
 extern void ix86_move_vector_high_sse_to_mmx (rtx);
 extern void ix86_split_mmx_pack (rtx[], enum rtx_code);
 extern void ix86_split_mmx_punpck (rtx[], bool);
+extern void ix86_expand_avx_vzeroupper (void);
 
 #ifdef TREE_CODE
 extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 04649b42122..b0d19a61a76 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -14426,7 +14426,7 @@ ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
       break;
     case AVX_U128:
       if (mode == AVX_U128_CLEAN)
-	emit_insn (gen_avx_vzeroupper ());
+	ix86_expand_avx_vzeroupper ();
       break;
     case I387_ROUNDEVEN:
     case I387_TRUNC:
@@ -19497,15 +19497,63 @@ ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
   return false;
 }
 
+/* Implement TARGET_INSN_CALLEE_ABI.  */
+
+const predefined_function_abi &
+ix86_insn_callee_abi (const rtx_insn *insn)
+{
+  unsigned int abi_id = 0;
+  rtx pat = PATTERN (insn);
+  if (vzeroupper_pattern (pat, VOIDmode))
+    abi_id = ABI_VZEROUPPER;
+
+  return function_abis[abi_id];
+}
+
+/* Initialize function_abis with the corresponding abi_id;
+   currently only vzeroupper is handled.  */
+void
+ix86_initialize_callee_abi (unsigned int abi_id)
+{
+  gcc_assert (abi_id == ABI_VZEROUPPER);
+  predefined_function_abi &vzeroupper_abi = function_abis[abi_id];
+  if (!vzeroupper_abi.initialized_p ())
+    {
+      HARD_REG_SET full_reg_clobbers;
+      CLEAR_HARD_REG_SET (full_reg_clobbers);
+      vzeroupper_abi.initialize (ABI_VZEROUPPER, full_reg_clobbers);
+    }
+}
+
+void
+ix86_expand_avx_vzeroupper (void)
+{
+  /* Initialize vzeroupper_abi here.  */
+  ix86_initialize_callee_abi (ABI_VZEROUPPER);
+  rtx_insn *insn = emit_call_insn (gen_avx_vzeroupper_callee_abi ());
+  /* Return false for non-local goto in can_nonlocal_goto.  */
+  make_reg_eh_region_note (insn, 0, INT_MIN);
+  /* Flag used for call_insn indicates it's a fake call.  */
+  RTX_FLAG (insn, used) = 1;
+}
+
+
 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The only ABI that
    saves SSE registers across calls is Win64 (thus no need to check the
    current ABI here), and with AVX enabled Win64 only guarantees that
    the low 16 bytes are saved.  */
 
 static bool
-ix86_hard_regno_call_part_clobbered (unsigned int, unsigned int regno,
+ix86_hard_regno_call_part_clobbered (unsigned int abi_id, unsigned int regno,
 				     machine_mode mode)
 {
+  /* Special ABI for vzeroupper which only clobbers upper part of SSE regs.  */
+  if (abi_id == ABI_VZEROUPPER)
+      return (GET_MODE_SIZE (mode) > 16
+	      && ((TARGET_64BIT
+		   && (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG)))
+		  || (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))));
+
   return SSE_REGNO_P (regno) && GET_MODE_SIZE (mode) > 16;
 }
 
@@ -23926,6 +23974,9 @@ ix86_run_selftests (void)
 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
   ix86_hard_regno_call_part_clobbered
 
+#undef TARGET_INSN_CALLEE_ABI
+#define TARGET_INSN_CALLEE_ABI ix86_insn_callee_abi
+
 #undef TARGET_CAN_CHANGE_MODE_CLASS
 #define TARGET_CAN_CHANGE_MODE_CLASS ix86_can_change_mode_class
 
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 53d503fc6e0..919d0b2418a 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -2659,10 +2659,6 @@ struct GTY(()) machine_function {
   /* True if the function needs a stack frame.  */
   BOOL_BITFIELD stack_frame_required : 1;
 
-  /* True if __builtin_ia32_vzeroupper () has been expanded in current
-     function.  */
-  BOOL_BITFIELD has_explicit_vzeroupper : 1;
-
   /* True if we should act silently, rather than raise an error for
      invalid calls.  */
   BOOL_BITFIELD silent_p : 1;
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index f0bb7986d6c..5ff49ec2f1c 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -191,6 +191,10 @@
   ;; For MOVDIRI and MOVDIR64B support
   UNSPEC_MOVDIRI
   UNSPEC_MOVDIR64B
+
+  ;; For insn_callee_abi:
+  UNSPEC_CALLEE_ABI
+
 ])
 
 (define_c_enum "unspecv" [
@@ -447,6 +451,12 @@
    (FIRST_PSEUDO_REG		76)
   ])
 
+;; Insn callee abi index.
+(define_constants
+  [(ABI_DEFAULT		0)
+   (ABI_VZEROUPPER	1)
+   (ABI_UNKNOWN		2)])
+
 ;; Insns whose names begin with "x86_" are emitted by gen_FOO calls
 ;; from i386.c.
 
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index d2f5f15d971..3dd134e7f22 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1596,8 +1596,9 @@
 ;; return true if OP is a vzeroupper pattern.
 (define_predicate "vzeroupper_pattern"
   (and (match_code "parallel")
-       (match_code "unspec_volatile" "a")
-       (match_test "XINT (XVECEXP (op, 0, 0), 1) == UNSPECV_VZEROUPPER")))
+       (match_code "unspec" "b")
+       (match_test "XINT (XVECEXP (op, 0, 1), 1) == UNSPEC_CALLEE_ABI")
+       (match_test "INTVAL (XVECEXP (XVECEXP (op, 0, 1), 0, 0)) == ABI_VZEROUPPER")))
 
 ;; Return true if OP is an addsub vec_merge operation
 (define_predicate "addsub_vm_operator"
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index e4248e554eb..9fc1176107b 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -205,7 +205,6 @@
   UNSPECV_MONITOR
   UNSPECV_MWAIT
   UNSPECV_VZEROALL
-  UNSPECV_VZEROUPPER
 
   ;; For KEYLOCKER
   UNSPECV_LOADIWKEY
@@ -20872,14 +20871,22 @@
 ;; if the upper 128bits are unused.  Initially we expand the instructions
 ;; as though they had no effect on the SSE registers, but later add SETs and
 ;; CLOBBERs to the PARALLEL to model the real effect.
+
 (define_expand "avx_vzeroupper"
-  [(parallel [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])]
-  "TARGET_AVX")
+  [(parallel [(call (mem:QI (const_int 0))
+		    (const_int 0))
+	     (unspec [(const_int ABI_VZEROUPPER)] UNSPEC_CALLEE_ABI)])]
+  "TARGET_AVX"
+{
+  ix86_expand_avx_vzeroupper ();
+  DONE;
+})
 
-(define_insn "*avx_vzeroupper"
-  [(match_parallel 0 "vzeroupper_pattern"
-     [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])]
-  "TARGET_AVX && XVECLEN (operands[0], 0) == (TARGET_64BIT ? 16 : 8) + 1"
+(define_insn "avx_vzeroupper_callee_abi"
+  [(call (mem:QI (const_int 0))
+	 (const_int 0))
+    (unspec [(const_int ABI_VZEROUPPER)] UNSPEC_CALLEE_ABI)]
+  "TARGET_AVX"
   "vzeroupper"
   [(set_attr "type" "sse")
    (set_attr "modrm" "0")
@@ -20888,44 +20895,6 @@
    (set_attr "btver2_decode" "vector")
    (set_attr "mode" "OI")])
 
-(define_insn_and_split "*avx_vzeroupper_1"
-  [(match_parallel 0 "vzeroupper_pattern"
-     [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])]
-  "TARGET_AVX && XVECLEN (operands[0], 0) != (TARGET_64BIT ? 16 : 8) + 1"
-  "#"
-  "&& epilogue_completed"
-  [(match_dup 0)]
-{
-  /* For IPA-RA purposes, make it clear the instruction clobbers
-     even XMM registers not mentioned explicitly in the pattern.  */
-  unsigned int nregs = TARGET_64BIT ? 16 : 8;
-  unsigned int npats = XVECLEN (operands[0], 0);
-  rtvec vec = rtvec_alloc (nregs + 1);
-  RTVEC_ELT (vec, 0) = XVECEXP (operands[0], 0, 0);
-  for (unsigned int i = 0, j = 1; i < nregs; ++i)
-    {
-      unsigned int regno = GET_SSE_REGNO (i);
-      if (j < npats
-	  && REGNO (SET_DEST (XVECEXP (operands[0], 0, j))) == regno)
-	{
-	  RTVEC_ELT (vec, i + 1) = XVECEXP (operands[0], 0, j);
-	  j++;
-	}
-      else
-	{
-	  rtx reg = gen_rtx_REG (V2DImode, regno);
-	  RTVEC_ELT (vec, i + 1) = gen_rtx_CLOBBER (VOIDmode, reg);
-	}
-    }
-  operands[0] = gen_rtx_PARALLEL (VOIDmode, vec);
-}
-  [(set_attr "type" "sse")
-   (set_attr "modrm" "0")
-   (set_attr "memory" "none")
-   (set_attr "prefix" "vex")
-   (set_attr "btver2_decode" "vector")
-   (set_attr "mode" "OI")])
-
 (define_mode_attr pbroadcast_evex_isa
   [(V64QI "avx512bw") (V32QI "avx512bw") (V16QI "avx512bw")
    (V32HI "avx512bw") (V16HI "avx512bw") (V8HI "avx512bw")
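
Architecturally, vzeroupper zeroes the upper bits (above bit 127) of
ymm0-ymm15 (ymm0-ymm7 in 32-bit mode) and preserves the low 128 bits,
which is what the call pattern above now models through the vzeroupper
ABI.  A small runnable illustration (not part of the patch; assumes
-mavx and AVX hardware):

    #include <immintrin.h>
    #include <stdio.h>

    int
    main (void)
    {
      __m256i v = _mm256_set1_epi32 (7);
      __m128i lo = _mm256_castsi256_si128 (v);  /* low 128 bits of v */
      _mm256_zeroupper ();  /* upper halves zeroed, low halves preserved */
      printf ("%d\n", _mm_cvtsi128_si32 (lo));  /* prints 7 */
      return 0;
    }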
diff --git a/gcc/testsuite/gcc.target/i386/pr82735-1.c b/gcc/testsuite/gcc.target/i386/pr82735-1.c
new file mode 100644
index 00000000000..1a63b9ae9c9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr82735-1.c
@@ -0,0 +1,29 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+
+void
+__attribute__ ((noipa))
+mtest(char *dest)
+{
+  __m256i ymm1 = _mm256_set1_epi8((char)0x1);
+  _mm256_storeu_si256((__m256i *)(dest + 32), ymm1);
+  _mm256_zeroupper();
+  __m256i ymm2 = _mm256_set1_epi8((char)0x1);
+  _mm256_storeu_si256((__m256i *)dest, ymm2);
+}
+
+void
+avx_test ()
+{
+  char buf[64];
+  for (int i = 0; i != 64; i++)
+    buf[i] = 2;
+  mtest (buf);
+
+  for (int i = 0; i < 32; ++i)
+    if (buf[i] != 1)
+      __builtin_abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr82735-2.c b/gcc/testsuite/gcc.target/i386/pr82735-2.c
new file mode 100644
index 00000000000..ac9d006f794
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr82735-2.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx -O2" } */
+
+#include <immintrin.h>
+
+void test(char *dest)
+{
+  /* xmm1 can be propagated to xmm2 by CSE.  */
+  __m128i xmm1 = _mm_set_epi8(0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
+			      0x9, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16);
+  _mm_storeu_si128((__m128i *)(dest + 32), xmm1);
+  _mm256_zeroupper();
+  _mm256_zeroupper();
+  _mm256_zeroupper();
+  _mm256_zeroupper();
+  _mm256_zeroupper();
+  __m128i xmm2 = xmm1;
+  _mm_storeu_si128((__m128i *)dest, xmm2);
+}
+
+/* Darwin local constant symbol is "lC0", ELF targets ".LC0" */
+/* { dg-final { scan-assembler-times {(?n)vmovdqa\t\.?[Ll]C0[^,]*, %xmm[0-9]} 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr82735-3.c b/gcc/testsuite/gcc.target/i386/pr82735-3.c
new file mode 100644
index 00000000000..e3f801e6924
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr82735-3.c
@@ -0,0 +1,5 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx -O2 -mabi=ms" } */
+/* { dg-final { scan-assembler-not {(?n)xmm([6-9]|1[0-5])} } } */
+
+#include "pr82735-2.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr82735-4.c b/gcc/testsuite/gcc.target/i386/pr82735-4.c
new file mode 100644
index 00000000000..78c0a6cb2c8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr82735-4.c
@@ -0,0 +1,48 @@
+/* { dg-do compile { target { ! ia32 } } }  */
+/* { dg-options "-mavx -O2 -mabi=ms -mno-avx512f -masm=att" } */
+/* { dg-final { scan-assembler-times {(?n)(?:vmovdqa[1-9]*|vmovap[sd])[\t ]*%xmm[0-9]+, [0-9]*\(%rsp\)} 10 } } */
+/* { dg-final { scan-assembler-times {(?n)(?:vmovdqa[1-9]*|vmovap[sd])[\t ]*[0-9]*\(%rsp\), %xmm[0-9]+} 10 } } */
+
+#include <immintrin.h>
+
+void test(char *dest)
+{
+  __m256i ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15;
+  asm volatile ("vmovdqa\t%%ymm0, %0\n\t"
+		"vmovdqa\t%%ymm0, %1\n\t"
+		"vmovdqa\t%%ymm0, %2\n\t"
+		"vmovdqa\t%%ymm0, %3\n\t"
+		"vmovdqa\t%%ymm0, %4\n\t"
+		"vmovdqa\t%%ymm0, %5\n\t"
+		"vmovdqa\t%%ymm0, %6\n\t"
+		"vmovdqa\t%%ymm0, %7\n\t"
+		"vmovdqa\t%%ymm0, %8\n\t"
+		"vmovdqa\t%%ymm0, %9\n\t"
+		"vmovdqa\t%%ymm0, %10\n\t"
+		"vmovdqa\t%%ymm0, %11\n\t"
+		"vmovdqa\t%%ymm0, %12\n\t"
+		"vmovdqa\t%%ymm0, %13\n\t"
+		"vmovdqa\t%%ymm0, %14\n\t"
+		"vmovdqa\t%%ymm0, %15\n\t"
+		: "=v" (ymm1), "=v" (ymm2), "=v"(ymm3), "=v" (ymm4), "=v" (ymm5),
+		  "=v" (ymm6), "=v" (ymm7), "=v"(ymm8), "=v" (ymm9), "=v" (ymm10),
+		  "=v" (ymm11), "=v" (ymm12), "=v"(ymm13), "=v" (ymm14), "=v" (ymm15),
+		  "=v"(ymm0)
+		::);
+  _mm256_zeroupper();
+  _mm256_storeu_si256((__m256i *)dest, ymm1);
+  _mm256_storeu_si256((__m256i *)(dest + 32), ymm2);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 2), ymm3);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 3), ymm4);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 4), ymm5);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 5), ymm6);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 6), ymm7);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 7), ymm8);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 8), ymm9);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 9), ymm10);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 10), ymm11);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 11), ymm12);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 12), ymm13);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 13), ymm14);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 14), ymm15);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr82735-5.c b/gcc/testsuite/gcc.target/i386/pr82735-5.c
new file mode 100644
index 00000000000..2a58cbe52d0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr82735-5.c
@@ -0,0 +1,54 @@
+/* { dg-do compile { target { ! ia32 } } }  */
+/* { dg-options "-mavx -O2 -mabi=ms -mno-avx512f -masm=att" } */
+/* { dg-final { scan-assembler-times {(?n)(?:vmovdqa[1-9]*|vmovap[sd])[\t ]*%xmm[0-9]+, [0-9]*\(%rsp\)} 10 } } */
+/* { dg-final { scan-assembler-times {(?n)(?:vmovdqa[1-9]*|vmovap[sd])[\t ]*[0-9]*\(%rsp\), %xmm[0-9]+} 10 } } */
+
+#include <immintrin.h>
+
+void test(char *dest)
+{
+  __m256i ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15;
+  asm volatile ("vmovdqa\t%%ymm0, %0\n\t"
+		"vmovdqa\t%%ymm0, %1\n\t"
+		"vmovdqa\t%%ymm0, %2\n\t"
+		"vmovdqa\t%%ymm0, %3\n\t"
+		"vmovdqa\t%%ymm0, %4\n\t"
+		"vmovdqa\t%%ymm0, %5\n\t"
+		"vmovdqa\t%%ymm0, %6\n\t"
+		"vmovdqa\t%%ymm0, %7\n\t"
+		"vmovdqa\t%%ymm0, %8\n\t"
+		"vmovdqa\t%%ymm0, %9\n\t"
+		"vmovdqa\t%%ymm0, %10\n\t"
+		"vmovdqa\t%%ymm0, %11\n\t"
+		"vmovdqa\t%%ymm0, %12\n\t"
+		"vmovdqa\t%%ymm0, %13\n\t"
+		"vmovdqa\t%%ymm0, %14\n\t"
+		"vmovdqa\t%%ymm0, %15\n\t"
+		: "=v" (ymm1), "=v" (ymm2), "=v"(ymm3), "=v" (ymm4), "=v" (ymm5),
+		  "=v" (ymm6), "=v" (ymm7), "=v"(ymm8), "=v" (ymm9), "=v" (ymm10),
+		  "=v" (ymm11), "=v" (ymm12), "=v"(ymm13), "=v" (ymm14), "=v" (ymm15),
+		  "=v"(ymm0)
+		::);
+  _mm256_zeroupper();
+  _mm256_zeroupper();
+  _mm256_zeroupper();
+  _mm256_zeroupper();
+  _mm256_zeroupper();
+  _mm256_zeroupper();
+  _mm256_zeroupper();
+  _mm256_storeu_si256((__m256i *)dest, ymm1);
+  _mm256_storeu_si256((__m256i *)(dest + 32), ymm2);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 2), ymm3);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 3), ymm4);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 4), ymm5);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 5), ymm6);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 6), ymm7);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 7), ymm8);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 8), ymm9);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 9), ymm10);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 10), ymm11);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 11), ymm12);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 12), ymm13);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 13), ymm14);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 14), ymm15);
+}

