From c898d16891ed4e9cf104ee36914399da0bff4a23 Mon Sep 17 00:00:00 2001
From: liuhongt
Date: Wed, 12 May 2021 14:20:54 +0800
Subject: [PATCH] [i386] Fix _mm256_zeroupper by representing the instructions
 as call_insns in which the call has a special vzeroupper ABI.

When __builtin_ia32_vzeroupper is called explicitly, the corresponding
vzeroupper pattern does not carry any CLOBBERs or SETs before LRA, which
leads to incorrect optimization in pass_reload.  To solve this problem,
this patch represents the instructions as call_insns in which the call
has a special vzeroupper ABI.

gcc/ChangeLog:

	PR target/82735
	* config/i386/i386-expand.c (ix86_expand_builtin): Remove
	assignment of cfun->machine->has_explicit_vzeroupper.
	* config/i386/i386-features.c
	(ix86_add_reg_usage_to_vzerouppers): Delete.
	(ix86_add_reg_usage_to_vzeroupper): Ditto.
	(rest_of_handle_insert_vzeroupper): Remove
	ix86_add_reg_usage_to_vzerouppers, add df_analyze at the end
	of the function.
	(gate): Remove cfun->machine->has_explicit_vzeroupper.
	* config/i386/i386-protos.h (ix86_expand_avx_vzeroupper):
	Declare.
	* config/i386/i386.c (ix86_insn_callee_abi): New function.
	(ix86_initialize_callee_abi): Ditto.
	(ix86_expand_avx_vzeroupper): Ditto.
	(ix86_hard_regno_call_part_clobbered): Adjust for vzeroupper
	ABI.
	(TARGET_INSN_CALLEE_ABI): Define as ix86_insn_callee_abi.
	* config/i386/i386.h (enum i386_insn_callee_abi_index): New.
	(struct GTY(()) machine_function): Delete
	has_explicit_vzeroupper.
	* config/i386/i386.md (enum unspec): New member
	UNSPEC_CALLEE_ABI.
	* config/i386/predicates.md (vzeroupper_pattern): Adjust.
	* config/i386/sse.md (avx_vzeroupper): Call
	ix86_expand_avx_vzeroupper.
	(*avx_vzeroupper): Rename to ..
	(avx_vzeroupper_callee_abi): .. this, and adjust the pattern to
	a call_insn which has a special vzeroupper ABI.
	(*avx_vzeroupper_1): Delete.
	* df-scan.c (df_get_call_refs): When call_insn is a fake call,
	it won't use the stack pointer reg.
	* final.c (leaf_function_p): A fake call doesn't prevent the
	caller from being a leaf function.
	* reg-stack.c (callee_clobbers_any_stack_reg): New.
	(subst_stack_regs): When call_insn doesn't clobber any stack
	reg, don't clear the arguments.
	* rtl.c (shallow_copy_rtx): Don't clear the used flag when orig
	is an insn.
	* shrink-wrap.c (requires_stack_frame_p): No need for a stack
	frame for a fake call.

gcc/testsuite/ChangeLog:

	PR target/82735
	* gcc.target/i386/pr82735-1.c: New test.
	* gcc.target/i386/pr82735-2.c: New test.
	* gcc.target/i386/pr82735-3.c: New test.
	* gcc.target/i386/pr82735-4.c: New test.
	* gcc.target/i386/pr82735-5.c: New test.
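To make the semantics of the new ABI concrete, here is a standalone
sketch (illustrative only, not one of the new tests below; it compiles
with -mavx and uses only standard AVX intrinsics):

    #include <immintrin.h>

    void
    sketch (char *dest)
    {
      /* WIDE occupies a full 256-bit ymm register; LOW only the low
	 128 bits of an xmm register.  */
      __m256i wide = _mm256_set1_epi8 (1);
      __m128i low = _mm256_castsi256_si128 (wide);

      _mm256_zeroupper ();

      /* WIDE must be spilled before, or recomputed after, the
	 vzeroupper: under the new ABI, modes wider than 16 bytes are
	 part-clobbered in every SSE register.  */
      _mm256_storeu_si256 ((__m256i *) dest, wide);

      /* LOW may legitimately stay in its xmm register: 16-byte and
	 narrower modes survive the "call", so CSE can still propagate
	 such values (see pr82735-2.c below).  */
      _mm_storeu_si128 ((__m128i *) (dest + 32), low);
    }

Before this patch the expanded PARALLEL carried no SETs or CLOBBERs, so
passes running between expansion and LRA could wrongly assume that even
WIDE survived the vzeroupper.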
---
 gcc/config/i386/i386-expand.c             |  4 -
 gcc/config/i386/i386-features.c           | 99 +++--------------------
 gcc/config/i386/i386-protos.h             |  2 +
 gcc/config/i386/i386.c                    | 53 +++++++++++-
 gcc/config/i386/i386.h                    | 12 ++-
 gcc/config/i386/i386.md                   |  4 +
 gcc/config/i386/predicates.md             |  5 +-
 gcc/config/i386/sse.md                    | 58 ++++---------
 gcc/df-scan.c                             |  4 +-
 gcc/final.c                               |  3 +-
 gcc/reg-stack.c                           | 18 ++++-
 gcc/rtl.c                                 |  6 +-
 gcc/shrink-wrap.c                         |  3 +-
 gcc/testsuite/gcc.target/i386/pr82735-1.c | 29 +++++++
 gcc/testsuite/gcc.target/i386/pr82735-2.c | 22 +++++
 gcc/testsuite/gcc.target/i386/pr82735-3.c |  5 ++
 gcc/testsuite/gcc.target/i386/pr82735-4.c | 48 +++++++++++
 gcc/testsuite/gcc.target/i386/pr82735-5.c | 54 +++++++++++++
 18 files changed, 282 insertions(+), 147 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr82735-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr82735-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr82735-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr82735-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr82735-5.c

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 9f3d41955a2..d25d59aa4e7 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -13282,10 +13282,6 @@ rdseed_step:

       return 0;

-    case IX86_BUILTIN_VZEROUPPER:
-      cfun->machine->has_explicit_vzeroupper = true;
-      break;
-
     default:
       break;
     }

diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c
index 77783a154b6..a25769ae478 100644
--- a/gcc/config/i386/i386-features.c
+++ b/gcc/config/i386/i386-features.c
@@ -1768,92 +1768,22 @@ convert_scalars_to_vector (bool timode_p)
   return 0;
 }

-/* Modify the vzeroupper pattern in INSN so that it describes the effect
-   that the instruction has on the SSE registers.  LIVE_REGS are the set
-   of registers that are live across the instruction.
-
-   For a live register R we use:
-
-     (set (reg:V2DF R) (reg:V2DF R))
-
-   which preserves the low 128 bits but clobbers the upper bits.  */
-
-static void
-ix86_add_reg_usage_to_vzeroupper (rtx_insn *insn, bitmap live_regs)
-{
-  rtx pattern = PATTERN (insn);
-  unsigned int nregs = TARGET_64BIT ? 16 : 8;
-  unsigned int npats = nregs;
-  for (unsigned int i = 0; i < nregs; ++i)
-    {
-      unsigned int regno = GET_SSE_REGNO (i);
-      if (!bitmap_bit_p (live_regs, regno))
-	npats--;
-    }
-  if (npats == 0)
-    return;
-  rtvec vec = rtvec_alloc (npats + 1);
-  RTVEC_ELT (vec, 0) = XVECEXP (pattern, 0, 0);
-  for (unsigned int i = 0, j = 0; i < nregs; ++i)
-    {
-      unsigned int regno = GET_SSE_REGNO (i);
-      if (!bitmap_bit_p (live_regs, regno))
-	continue;
-      rtx reg = gen_rtx_REG (V2DImode, regno);
-      ++j;
-      RTVEC_ELT (vec, j) = gen_rtx_SET (reg, reg);
-    }
-  XVEC (pattern, 0) = vec;
-  INSN_CODE (insn) = -1;
-  df_insn_rescan (insn);
-}
-
-/* Walk the vzeroupper instructions in the function and annotate them
-   with the effect that they have on the SSE registers.  */
-
-static void
-ix86_add_reg_usage_to_vzerouppers (void)
-{
-  basic_block bb;
-  rtx_insn *insn;
-  auto_bitmap live_regs;
-
-  df_analyze ();
-  FOR_EACH_BB_FN (bb, cfun)
-    {
-      bitmap_copy (live_regs, df_get_live_out (bb));
-      df_simulate_initialize_backwards (bb, live_regs);
-      FOR_BB_INSNS_REVERSE (bb, insn)
-	{
-	  if (!NONDEBUG_INSN_P (insn))
-	    continue;
-	  if (vzeroupper_pattern (PATTERN (insn), VOIDmode))
-	    ix86_add_reg_usage_to_vzeroupper (insn, live_regs);
-	  df_simulate_one_insn_backwards (bb, insn, live_regs);
-	}
-    }
-}
-
 static unsigned int
 rest_of_handle_insert_vzeroupper (void)
 {
-  if (TARGET_VZEROUPPER
-      && flag_expensive_optimizations
-      && !optimize_size)
-    {
-      /* vzeroupper instructions are inserted immediately after reload to
-	 account for possible spills from 256bit or 512bit registers.  The pass
-	 reuses mode switching infrastructure by re-running mode insertion
-	 pass, so disable entities that have already been processed.  */
-      for (int i = 0; i < MAX_386_ENTITIES; i++)
-	ix86_optimize_mode_switching[i] = 0;
+  /* vzeroupper instructions are inserted immediately after reload to
+     account for possible spills from 256bit or 512bit registers.  The pass
+     reuses mode switching infrastructure by re-running mode insertion
+     pass, so disable entities that have already been processed.  */
+  for (int i = 0; i < MAX_386_ENTITIES; i++)
+    ix86_optimize_mode_switching[i] = 0;

-      ix86_optimize_mode_switching[AVX_U128] = 1;
+  ix86_optimize_mode_switching[AVX_U128] = 1;

-      /* Call optimize_mode_switching.  */
-      g->get_passes ()->execute_pass_mode_switching ();
-    }
-  ix86_add_reg_usage_to_vzerouppers ();
+  /* Call optimize_mode_switching.  */
+  g->get_passes ()->execute_pass_mode_switching ();
+
+  df_analyze ();
   return 0;
 }

@@ -1882,11 +1812,8 @@ public:
   /* opt_pass methods: */
   virtual bool gate (function *)
     {
-      return TARGET_AVX
-	     && ((TARGET_VZEROUPPER
-		  && flag_expensive_optimizations
-		  && !optimize_size)
-		 || cfun->machine->has_explicit_vzeroupper);
+      return TARGET_AVX && TARGET_VZEROUPPER
+	     && flag_expensive_optimizations && !optimize_size;
     }

   virtual unsigned int execute (function *)
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 7782cf1163f..e81b9872c6c 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -51,6 +51,7 @@ extern void ix86_reset_previous_fndecl (void);
 extern bool ix86_using_red_zone (void);

 extern unsigned int ix86_regmode_natural_size (machine_mode);
+
 #ifdef RTX_CODE
 extern int standard_80387_constant_p (rtx);
 extern const char *standard_80387_constant_opcode (rtx);
@@ -216,6 +217,7 @@ extern rtx ix86_split_stack_guard (void);
 extern void ix86_move_vector_high_sse_to_mmx (rtx);
 extern void ix86_split_mmx_pack (rtx[], enum rtx_code);
 extern void ix86_split_mmx_punpck (rtx[], bool);
+extern void ix86_expand_avx_vzeroupper (void);

 #ifdef TREE_CODE
 extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 743d8a25fe3..cecc07e7371 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -19494,15 +19494,63 @@ ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
   return false;
 }

+/* Implement TARGET_INSN_CALLEE_ABI.  */
+
+const predefined_function_abi &
+ix86_insn_callee_abi (const rtx_insn *insn)
+{
+  unsigned int abi_id = 0;
+  rtx pat = PATTERN (insn);
+  if (vzeroupper_pattern (pat, VOIDmode))
+    abi_id = I386_VZEROUPPER;
+
+  return function_abis[abi_id];
+}
+
+/* Initialize function_abis with the corresponding abi_id;
+   currently only vzeroupper is handled.  */
+void
+ix86_initialize_callee_abi (unsigned int abi_id)
+{
+  gcc_assert (abi_id == I386_VZEROUPPER);
+  predefined_function_abi &vzeroupper_abi = function_abis[abi_id];
+  if (!vzeroupper_abi.initialized_p ())
+    {
+      HARD_REG_SET full_reg_clobbers;
+      CLEAR_HARD_REG_SET (full_reg_clobbers);
+      vzeroupper_abi.initialize (I386_VZEROUPPER, full_reg_clobbers);
+    }
+}
+
+void
+ix86_expand_avx_vzeroupper (void)
+{
+  /* Initialize vzeroupper_abi here.  */
+  ix86_initialize_callee_abi (I386_VZEROUPPER);
+  rtx_insn *insn = emit_call_insn (gen_avx_vzeroupper_callee_abi ());
+  /* Return false for non-local goto in can_nonlocal_goto.  */
+  make_reg_eh_region_note (insn, 0, INT_MIN);
+  /* The used flag on a call_insn indicates it's a fake call.  */
+  RTX_FLAG (insn, used) = 1;
+}
+
+
 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The only ABI that
    saves SSE registers across calls is Win64 (thus no need to check the
    current ABI here), and with AVX enabled Win64 only guarantees that
    the low 16 bytes are saved.  */

 static bool
-ix86_hard_regno_call_part_clobbered (unsigned int, unsigned int regno,
+ix86_hard_regno_call_part_clobbered (unsigned int abi_id, unsigned int regno,
				     machine_mode mode)
 {
+  /* Special ABI for vzeroupper, which only clobbers the upper part of
+     the SSE regs.  */
+  if (abi_id == I386_VZEROUPPER)
+    return (GET_MODE_SIZE (mode) > 16
+	    && ((TARGET_64BIT
+		 && (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG)))
+		|| (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))));
+
   return SSE_REGNO_P (regno) && GET_MODE_SIZE (mode) > 16;
 }

@@ -23916,6 +23964,9 @@ ix86_run_selftests (void)
 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
   ix86_hard_regno_call_part_clobbered

+#undef TARGET_INSN_CALLEE_ABI
+#define TARGET_INSN_CALLEE_ABI ix86_insn_callee_abi
+
 #undef TARGET_CAN_CHANGE_MODE_CLASS
 #define TARGET_CAN_CHANGE_MODE_CLASS ix86_can_change_mode_class

diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 53d503fc6e0..9d07769169d 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1177,6 +1177,14 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);

 #define KEEP_AGGREGATE_RETURN_POINTER 0

+
+enum i386_insn_callee_abi_index
+{
+  I386_DEFAULT,		/* Default function abi.  */
+  I386_VZEROUPPER,	/* For vzeroupper.  */
+  I386_UNKNOWN
+};
+
 /* Define the classes of registers for register constraints in the
    machine description.  Also define ranges of constants.

@@ -2659,10 +2667,6 @@ struct GTY(()) machine_function {
   /* True if the function needs a stack frame.  */
   BOOL_BITFIELD stack_frame_required : 1;

-  /* True if __builtin_ia32_vzeroupper () has been expanded in current
-     function.  */
-  BOOL_BITFIELD has_explicit_vzeroupper : 1;
-
   /* True if we should act silently, rather than raise an error for
     invalid calls.  */
   BOOL_BITFIELD silent_p : 1;

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 2fc8fae30f3..bd012ab1d21 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -191,6 +191,10 @@ (define_c_enum "unspec" [
   ;; For MOVDIRI and MOVDIR64B support
   UNSPEC_MOVDIRI
   UNSPEC_MOVDIR64B
+
+  ;; For insn_callee_abi:
+  UNSPEC_CALLEE_ABI
+
 ])

 (define_c_enum "unspecv" [
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index abd307ebdb8..7c5b7482833 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1599,8 +1599,9 @@ (define_predicate "vzeroall_pattern"
 ;; return true if OP is a vzeroupper pattern.
 (define_predicate "vzeroupper_pattern"
   (and (match_code "parallel")
-       (match_code "unspec_volatile" "a")
-       (match_test "XINT (XVECEXP (op, 0, 0), 1) == UNSPECV_VZEROUPPER")))
+       (match_code "unspec" "b")
+       (match_test "XINT (XVECEXP (op, 0, 1), 1) == UNSPEC_CALLEE_ABI")
+       (match_test "XVECEXP (XVECEXP (op, 0, 1), 0, 0) == const1_rtx")))

 ;; Return true if OP is an addsub vec_merge operation
 (define_predicate "addsub_vm_operator"
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index a4503ddcb73..86cf44bae14 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -20857,14 +20857,22 @@ (define_insn "*avx_vzeroall"
 ;; if the upper 128bits are unused.  Initially we expand the instructions
 ;; as though they had no effect on the SSE registers, but later add SETs and
 ;; CLOBBERs to the PARALLEL to model the real effect.
+
 (define_expand "avx_vzeroupper"
-  [(parallel [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])]
-  "TARGET_AVX")
+  [(parallel [(call (mem:QI (unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER))
+		    (const_int 0))
+	      (unspec [(const_int 1)] UNSPEC_CALLEE_ABI)])]
+  "TARGET_AVX"
+{
+  ix86_expand_avx_vzeroupper ();
+  DONE;
+})

-(define_insn "*avx_vzeroupper"
-  [(match_parallel 0 "vzeroupper_pattern"
-     [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])]
-  "TARGET_AVX && XVECLEN (operands[0], 0) == (TARGET_64BIT ? 16 : 8) + 1"
+(define_insn "avx_vzeroupper_callee_abi"
+  [(call (mem:QI (unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER))
+	 (const_int 0))
+   (unspec [(const_int 1)] UNSPEC_CALLEE_ABI)]
+  "TARGET_AVX"
   "vzeroupper"
   [(set_attr "type" "sse")
    (set_attr "modrm" "0")
@@ -20873,44 +20881,6 @@ (define_insn "*avx_vzeroupper"
    (set_attr "btver2_decode" "vector")
    (set_attr "mode" "OI")])

-(define_insn_and_split "*avx_vzeroupper_1"
-  [(match_parallel 0 "vzeroupper_pattern"
-     [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])]
-  "TARGET_AVX && XVECLEN (operands[0], 0) != (TARGET_64BIT ? 16 : 8) + 1"
-  "#"
-  "&& epilogue_completed"
-  [(match_dup 0)]
-{
-  /* For IPA-RA purposes, make it clear the instruction clobbers
-     even XMM registers not mentioned explicitly in the pattern.  */
-  unsigned int nregs = TARGET_64BIT ? 16 : 8;
-  unsigned int npats = XVECLEN (operands[0], 0);
-  rtvec vec = rtvec_alloc (nregs + 1);
-  RTVEC_ELT (vec, 0) = XVECEXP (operands[0], 0, 0);
-  for (unsigned int i = 0, j = 1; i < nregs; ++i)
-    {
-      unsigned int regno = GET_SSE_REGNO (i);
-      if (j < npats
-	  && REGNO (SET_DEST (XVECEXP (operands[0], 0, j))) == regno)
-	{
-	  RTVEC_ELT (vec, i + 1) = XVECEXP (operands[0], 0, j);
-	  j++;
-	}
-      else
-	{
-	  rtx reg = gen_rtx_REG (V2DImode, regno);
-	  RTVEC_ELT (vec, i + 1) = gen_rtx_CLOBBER (VOIDmode, reg);
-	}
-    }
-  operands[0] = gen_rtx_PARALLEL (VOIDmode, vec);
-}
-  [(set_attr "type" "sse")
-   (set_attr "modrm" "0")
-   (set_attr "memory" "none")
-   (set_attr "prefix" "vex")
-   (set_attr "btver2_decode" "vector")
-   (set_attr "mode" "OI")])
-
 (define_mode_attr pbroadcast_evex_isa
   [(V64QI "avx512bw") (V32QI "avx512bw") (V16QI "avx512bw")
    (V32HI "avx512bw") (V16HI "avx512bw") (V8HI "avx512bw")
diff --git a/gcc/df-scan.c b/gcc/df-scan.c
index 6691c3e8357..845702d9630 100644
--- a/gcc/df-scan.c
+++ b/gcc/df-scan.c
@@ -3090,7 +3090,9 @@ df_get_call_refs (class df_collection_rec *collection_rec,

   for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
     {
-      if (i == STACK_POINTER_REGNUM)
+      /* A CALL_INSN uses the "used" flag to indicate it's a fake call.  */
+      if (i == STACK_POINTER_REGNUM
+	  && !RTX_FLAG (insn_info->insn, used))
	/* The stack ptr is used (honorarily) by a CALL insn.  */
	df_ref_record (DF_REF_BASE, collection_rec, regno_reg_rtx[i],
		       NULL, bb, insn_info, DF_REF_REG_USE,
diff --git a/gcc/final.c b/gcc/final.c
index e0a70fcd830..5f8b7b006c2 100644
--- a/gcc/final.c
+++ b/gcc/final.c
@@ -4109,7 +4109,8 @@ leaf_function_p (void)
   for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
     {
       if (CALL_P (insn)
-	  && ! SIBLING_CALL_P (insn))
+	  && ! SIBLING_CALL_P (insn)
+	  && !RTX_FLAG (insn, used))
	return 0;
       if (NONJUMP_INSN_P (insn)
	  && GET_CODE (PATTERN (insn)) == SEQUENCE
diff --git a/gcc/reg-stack.c b/gcc/reg-stack.c
index 25210f0c17f..1d9ea035cf4 100644
--- a/gcc/reg-stack.c
+++ b/gcc/reg-stack.c
@@ -174,6 +174,7 @@
 #include "reload.h"
 #include "tree-pass.h"
 #include "rtl-iter.h"
+#include "function-abi.h"

 #ifdef STACK_REGS

@@ -2368,6 +2369,18 @@ subst_asm_stack_regs (rtx_insn *insn, stack_ptr regstack)
	}
     }
 }
+
+/* Return true if a function call is allowed to alter some or all bits
+   of any stack reg.  */
+static bool
+callee_clobbers_any_stack_reg (const function_abi &callee_abi)
+{
+  for (unsigned regno = FIRST_STACK_REG; regno <= LAST_STACK_REG; regno++)
+    if (callee_abi.clobbers_at_least_part_of_reg_p (regno))
+      return true;
+  return false;
+}
+

 /* Substitute stack hard reg numbers for stack virtual registers in
    INSN.  Non-stack register numbers are not changed.  REGSTACK is the
@@ -2382,7 +2395,10 @@ subst_stack_regs (rtx_insn *insn, stack_ptr regstack)
   bool control_flow_insn_deleted = false;
   int i;

-  if (CALL_P (insn))
+  /* If the target of the call doesn't clobber any stack registers,
+     don't clear the arguments.  */
+  if (CALL_P (insn)
+      && callee_clobbers_any_stack_reg (insn_callee_abi (insn)))
     {
       int top = regstack->top;

diff --git a/gcc/rtl.c b/gcc/rtl.c
index b0ba1ff684c..aaee882f5ca 100644
--- a/gcc/rtl.c
+++ b/gcc/rtl.c
@@ -395,8 +395,10 @@ shallow_copy_rtx (const_rtx orig MEM_STAT_DECL)
     case SCRATCH:
       break;
     default:
-      /* For all other RTXes clear the used flag on the copy.  */
-      RTX_FLAG (copy, used) = 0;
+      /* For all other RTXes clear the used flag on the copy.
+	 A CALL_INSN uses the "used" flag to indicate it's a fake call.  */
+      if (!INSN_P (orig))
+	RTX_FLAG (copy, used) = 0;
       break;
     }
   return copy;
diff --git a/gcc/shrink-wrap.c b/gcc/shrink-wrap.c
index ba7b5cd56fd..07f9a081dd3 100644
--- a/gcc/shrink-wrap.c
+++ b/gcc/shrink-wrap.c
@@ -57,7 +57,8 @@ requires_stack_frame_p (rtx_insn *insn, HARD_REG_SET prologue_used,
   HARD_REG_SET hardregs;
   unsigned regno;

-  if (CALL_P (insn))
+  /* A CALL_INSN uses the "used" flag to indicate it's a fake call.  */
+  if (CALL_P (insn) && !RTX_FLAG (insn, used))
     return !SIBLING_CALL_P (insn);

   /* We need a frame to get the unique CFA expected by the unwinder.  */
diff --git a/gcc/testsuite/gcc.target/i386/pr82735-1.c b/gcc/testsuite/gcc.target/i386/pr82735-1.c
new file mode 100644
index 00000000000..1a63b9ae9c9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr82735-1.c
@@ -0,0 +1,29 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+
+void
+__attribute__ ((noipa))
+mtest(char *dest)
+{
+  __m256i ymm1 = _mm256_set1_epi8((char)0x1);
+  _mm256_storeu_si256((__m256i *)(dest + 32), ymm1);
+  _mm256_zeroupper();
+  __m256i ymm2 = _mm256_set1_epi8((char)0x1);
+  _mm256_storeu_si256((__m256i *)dest, ymm2);
+}
+
+void
+avx_test ()
+{
+  char buf[64];
+  for (int i = 0; i != 64; i++)
+    buf[i] = 2;
+  mtest (buf);
+
+  for (int i = 0; i < 32; ++i)
+    if (buf[i] != 1)
+      __builtin_abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr82735-2.c b/gcc/testsuite/gcc.target/i386/pr82735-2.c
new file mode 100644
index 00000000000..ac9d006f794
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr82735-2.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx -O2" } */
+
+#include <immintrin.h>
+
+void test(char *dest)
+{
+  /* xmm1 can be propagated to xmm2 by CSE.  */
+  __m128i xmm1 = _mm_set_epi8(0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
+			      0x9, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16);
+  _mm_storeu_si128((__m128i *)(dest + 32), xmm1);
+  _mm256_zeroupper();
+  _mm256_zeroupper();
+  _mm256_zeroupper();
+  _mm256_zeroupper();
+  _mm256_zeroupper();
+  __m128i xmm2 = xmm1;
+  _mm_storeu_si128((__m128i *)dest, xmm2);
+}
+
+/* Darwin's local constant symbol is "lC0"; ELF targets use ".LC0".  */
+/* { dg-final { scan-assembler-times {(?n)vmovdqa\t\.?[Ll]C0[^,]*, %xmm[0-9]} 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr82735-3.c b/gcc/testsuite/gcc.target/i386/pr82735-3.c
new file mode 100644
index 00000000000..e3f801e6924
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr82735-3.c
@@ -0,0 +1,5 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx -O2 -mabi=ms" } */
+/* { dg-final { scan-assembler-not {(?n)xmm([6-9]|1[0-5])} } } */
+
+#include "pr82735-2.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr82735-4.c b/gcc/testsuite/gcc.target/i386/pr82735-4.c
new file mode 100644
index 00000000000..78c0a6cb2c8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr82735-4.c
@@ -0,0 +1,48 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mavx -O2 -mabi=ms -mno-avx512f -masm=att" } */
+/* { dg-final { scan-assembler-times {(?n)(?:vmovdqa[1-9]*|vmovap[sd])[\t ]*%xmm[0-9]+, [0-9]*\(%rsp\)} 10 } } */
+/* { dg-final { scan-assembler-times {(?n)(?:vmovdqa[1-9]*|vmovap[sd])[\t ]*[0-9]*\(%rsp\), %xmm[0-9]+} 10 } } */
+
+#include <immintrin.h>
+
+void test(char *dest)
+{
+  __m256i ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15;
+  asm volatile ("vmovdqa\t%%ymm0, %0\n\t"
+		"vmovdqa\t%%ymm0, %1\n\t"
+		"vmovdqa\t%%ymm0, %2\n\t"
+		"vmovdqa\t%%ymm0, %3\n\t"
+		"vmovdqa\t%%ymm0, %4\n\t"
+		"vmovdqa\t%%ymm0, %5\n\t"
+		"vmovdqa\t%%ymm0, %6\n\t"
+		"vmovdqa\t%%ymm0, %7\n\t"
+		"vmovdqa\t%%ymm0, %8\n\t"
+		"vmovdqa\t%%ymm0, %9\n\t"
+		"vmovdqa\t%%ymm0, %10\n\t"
+		"vmovdqa\t%%ymm0, %11\n\t"
+		"vmovdqa\t%%ymm0, %12\n\t"
+		"vmovdqa\t%%ymm0, %13\n\t"
+		"vmovdqa\t%%ymm0, %14\n\t"
+		"vmovdqa\t%%ymm0, %15\n\t"
+		: "=v" (ymm1), "=v" (ymm2), "=v"(ymm3), "=v" (ymm4), "=v" (ymm5),
+		  "=v" (ymm6), "=v" (ymm7), "=v"(ymm8), "=v" (ymm9), "=v" (ymm10),
+		  "=v" (ymm11), "=v" (ymm12), "=v"(ymm13), "=v" (ymm14), "=v" (ymm15),
+		  "=v"(ymm0)
+		::);
+  _mm256_zeroupper();
+  _mm256_storeu_si256((__m256i *)dest, ymm1);
+  _mm256_storeu_si256((__m256i *)(dest + 32), ymm2);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 2), ymm3);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 3), ymm4);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 4), ymm5);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 5), ymm6);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 6), ymm7);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 7), ymm8);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 8), ymm9);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 9), ymm10);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 10), ymm11);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 11), ymm12);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 12), ymm13);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 13), ymm14);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 14), ymm15);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr82735-5.c b/gcc/testsuite/gcc.target/i386/pr82735-5.c
new file mode 100644
index 00000000000..2a58cbe52d0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr82735-5.c
@@ -0,0 +1,54 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mavx -O2 -mabi=ms -mno-avx512f -masm=att" } */
+/* { dg-final { scan-assembler-times {(?n)(?:vmovdqa[1-9]*|vmovap[sd])[\t ]*%xmm[0-9]+, [0-9]*\(%rsp\)} 10 } } */
+/* { dg-final { scan-assembler-times {(?n)(?:vmovdqa[1-9]*|vmovap[sd])[\t ]*[0-9]*\(%rsp\), %xmm[0-9]+} 10 } } */
+
+#include <immintrin.h>
+
+void test(char *dest)
+{
+  __m256i ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15;
+  asm volatile ("vmovdqa\t%%ymm0, %0\n\t"
+		"vmovdqa\t%%ymm0, %1\n\t"
+		"vmovdqa\t%%ymm0, %2\n\t"
+		"vmovdqa\t%%ymm0, %3\n\t"
+		"vmovdqa\t%%ymm0, %4\n\t"
+		"vmovdqa\t%%ymm0, %5\n\t"
+		"vmovdqa\t%%ymm0, %6\n\t"
+		"vmovdqa\t%%ymm0, %7\n\t"
+		"vmovdqa\t%%ymm0, %8\n\t"
+		"vmovdqa\t%%ymm0, %9\n\t"
+		"vmovdqa\t%%ymm0, %10\n\t"
+		"vmovdqa\t%%ymm0, %11\n\t"
+		"vmovdqa\t%%ymm0, %12\n\t"
+		"vmovdqa\t%%ymm0, %13\n\t"
+		"vmovdqa\t%%ymm0, %14\n\t"
+		"vmovdqa\t%%ymm0, %15\n\t"
+		: "=v" (ymm1), "=v" (ymm2), "=v"(ymm3), "=v" (ymm4), "=v" (ymm5),
+		  "=v" (ymm6), "=v" (ymm7), "=v"(ymm8), "=v" (ymm9), "=v" (ymm10),
+		  "=v" (ymm11), "=v" (ymm12), "=v"(ymm13), "=v" (ymm14), "=v" (ymm15),
+		  "=v"(ymm0)
+		::);
+  _mm256_zeroupper();
+  _mm256_zeroupper();
+  _mm256_zeroupper();
+  _mm256_zeroupper();
+  _mm256_zeroupper();
+  _mm256_zeroupper();
+  _mm256_zeroupper();
+  _mm256_storeu_si256((__m256i *)dest, ymm1);
+  _mm256_storeu_si256((__m256i *)(dest + 32), ymm2);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 2), ymm3);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 3), ymm4);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 4), ymm5);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 5), ymm6);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 6), ymm7);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 7), ymm8);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 8), ymm9);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 9), ymm10);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 10), ymm11);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 11), ymm12);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 12), ymm13);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 13), ymm14);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 14), ymm15);
+}
-- 
2.18.1