From d53b0c6934ea499c9f87df963661b627e7e977bf Mon Sep 17 00:00:00 2001
From: liuhongt
Date: Wed, 12 May 2021 14:20:54 +0800
Subject: [PATCH] [i386] Fix _mm256_zeroupper to notify LRA that vzeroupper
 will kill sse registers.

When __builtin_ia32_vzeroupper is called explicitly, the corresponding
vzeroupper pattern does not carry any CLOBBERs or SETs before LRA, which
leads to incorrect optimization in pass_reload.  To solve this, the patch
introduces a pre-reload splitter which adds CLOBBERs to the explicit
vzeroupper pattern; this fixes the wrong code in PR target/82735.  To
still let post-reload CSE optimize the lower 128 bits, pass_vzeroupper
then transforms those CLOBBERs into self-SETs.

This works fine except under TARGET_64BIT_MS_ABI, where xmm6-xmm15 are
callee-saved: even when a function has no other use of xmm6-xmm15, the
SETs in the vzeroupper pattern would make pro_epilog save and restore
those registers, which is redundant.  To eliminate this redundancy, a
post-reload splitter drops those SETs, and the epilogue_completed
splitter adds them back.  This should be safe since no CSE runs between
the post-reload split2 pass and the epilogue_completed split3 pass.
The frame info in pro_epilog is updated accordingly, so that xmm6-xmm15
are saved and restored only when they have uses other than the explicit
vzeroupper pattern.
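
To illustrate, here is a rough sketch of the pattern shapes involved
(register numbers and layout sketched from the splitters below, not
pasted from an RTL dump).  After the pre-reload splitter, an explicit
vzeroupper, marked by (const_int 1) in its unspec_volatile, carries
CLOBBERs of all sse registers so that LRA knows they are killed:

    (parallel
     [(unspec_volatile [(const_int 1)] UNSPECV_VZEROUPPER)
      (clobber (reg:V2DI xmm0))
      ...
      (clobber (reg:V2DI xmm15))])   ;; only xmm0-xmm7 for !TARGET_64BIT

pass_vzeroupper later rewrites the CLOBBERs as self-SETs, so that
post-reload CSE can still use the lower 128 bits across the vzeroupper:

    (parallel
     [(unspec_volatile [(const_int 1)] UNSPECV_VZEROUPPER)
      (set (reg:V2DI xmm0) (reg:V2DI xmm0))
      ...
      (set (reg:V2DI xmm15) (reg:V2DI xmm15))])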

gcc/ChangeLog:

	PR target/82735
	* config/i386/i386-expand.c (ix86_expand_builtin): Count the
	number of __builtin_ia32_vzeroupper calls.
	* config/i386/i386-features.c (ix86_add_reg_usage_to_vzerouppers):
	Transform CLOBBERs to SETs for the explicit vzeroupper pattern so
	that CSE can optimize the lower 128 bits.
	* config/i386/i386.c (ix86_handle_explicit_vzeroupper_in_pro_epilog):
	New.
	(ix86_save_reg): If there is no use of xmm6~xmm15 other than the
	explicit vzeroupper under TARGET_64BIT_MS_ABI, do not save REGNO.
	(ix86_finalize_stack_frame_flags): Recompute the frame layout if
	there is an explicit vzeroupper under TARGET_64BIT_MS_ABI.
	* config/i386/i386.h (struct machine_function): Change the type of
	has_explicit_vzeroupper from BOOL_BITFIELD to unsigned int.
	* config/i386/sse.md (*avx_vzeroupper_2): New post-reload splitter
	which drops all SETs for explicit vzeroupper patterns.
	(*avx_vzeroupper_1): Generate SET reg to reg instead of CLOBBER,
	and add a pre-reload splitter after it.

gcc/testsuite/ChangeLog:

	PR target/82735
	* gcc.target/i386/pr82735-1.c: New test.
	* gcc.target/i386/pr82735-2.c: New test.
	* gcc.target/i386/pr82735-3.c: New test.
	* gcc.target/i386/pr82735-4.c: New test.
	* gcc.target/i386/pr82735-5.c: New test.
---
 gcc/config/i386/i386-expand.c             |  2 +-
 gcc/config/i386/i386-features.c           | 25 ++++++++++-
 gcc/config/i386/i386.c                    | 23 ++++++++++
 gcc/config/i386/i386.h                    |  8 ++--
 gcc/config/i386/sse.md                    | 48 +++++++++++++++++++-
 gcc/testsuite/gcc.target/i386/pr82735-1.c | 29 ++++++++++++
 gcc/testsuite/gcc.target/i386/pr82735-2.c | 21 +++++++++
 gcc/testsuite/gcc.target/i386/pr82735-3.c |  5 +++
 gcc/testsuite/gcc.target/i386/pr82735-4.c | 48 ++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr82735-5.c | 54 +++++++++++++++++++++++
 10 files changed, 256 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr82735-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr82735-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr82735-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr82735-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr82735-5.c

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index fee4d07b7fd..7f3326a12b2 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -13233,7 +13233,7 @@ rdseed_step:
       return 0;
 
     case IX86_BUILTIN_VZEROUPPER:
-      cfun->machine->has_explicit_vzeroupper = true;
+      cfun->machine->has_explicit_vzeroupper++;
       break;
 
     default:
diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c
index 77783a154b6..6b2179f16cb 100644
--- a/gcc/config/i386/i386-features.c
+++ b/gcc/config/i386/i386-features.c
@@ -1827,8 +1827,31 @@ ix86_add_reg_usage_to_vzerouppers (void)
 	{
 	  if (!NONDEBUG_INSN_P (insn))
 	    continue;
+	  /* Transform CLOBBERs to SETs so that the lower 128 bits of the
+	     sse registers can cross vzeroupper in post-reload CSE.  */
 	  if (vzeroupper_pattern (PATTERN (insn), VOIDmode))
-	    ix86_add_reg_usage_to_vzeroupper (insn, live_regs);
+	    {
+	      if (XVECEXP (XVECEXP (PATTERN (insn), 0, 0), 0, 0) == const1_rtx)
+		{
+		  unsigned int nregs = TARGET_64BIT ? 16 : 8;
+		  rtvec vec = rtvec_alloc (nregs + 1);
+		  RTVEC_ELT (vec, 0) = XVECEXP (PATTERN (insn), 0, 0);
+		  for (unsigned int i = 0; i < nregs; ++i)
+		    {
+		      unsigned int regno = GET_SSE_REGNO (i);
+		      rtx reg = gen_rtx_REG (V2DImode, regno);
+		      RTVEC_ELT (vec, i + 1) = gen_rtx_SET (reg, reg);
+		    }
+		  XVEC (PATTERN (insn), 0) = vec;
+		  INSN_CODE (insn) = -1;
+		  df_insn_rescan (insn);
+		}
+	      else
+		{
+		  gcc_assert (XVECLEN (PATTERN (insn), 0) == 1);
+		  ix86_add_reg_usage_to_vzeroupper (insn, live_regs);
+		}
+	    }
 	  df_simulate_one_insn_backwards (bb, insn, live_regs);
 	}
     }
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 780da108a7c..4d4d7dbbc82 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -6170,6 +6170,17 @@ ix86_hard_regno_scratch_ok (unsigned int regno)
 	      && df_regs_ever_live_p (regno)));
 }
 
+/* Return true if explicit usage of __builtin_ia32_vzeroupper
+   should be specially handled in pro_epilog.  */
+static bool
+ix86_handle_explicit_vzeroupper_in_pro_epilog ()
+{
+  return (cfun->machine->has_explicit_vzeroupper
+	  && TARGET_64BIT_MS_ABI
+	  && !epilogue_completed
+	  && reload_completed);
+}
+
 /* Return TRUE if we need to save REGNO.  */
 
 bool
@@ -6244,6 +6255,16 @@ ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined)
       && !cfun->machine->no_drap_save_restore)
     return true;
 
+  /* If there is no use of xmm6~xmm15 other than the explicit
+     vzeroupper under TARGET_64BIT_MS_ABI, there is no need
+     to save REGNO.  */
+  if (ix86_handle_explicit_vzeroupper_in_pro_epilog ()
+      && (IN_RANGE (regno, FIRST_SSE_REG + 6, LAST_SSE_REG)
+	  || IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG)))
+    return df_regs_ever_live_p (regno)
+	   ? df_hard_reg_used_count (regno) > cfun->machine->has_explicit_vzeroupper
+	   : false;
+
   return (df_regs_ever_live_p (regno)
 	  && !call_used_or_fixed_reg_p (regno)
 	  && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
@@ -8046,6 +8067,8 @@ ix86_finalize_stack_frame_flags (void)
     recompute_frame_layout_p = true;
   crtl->stack_realign_needed = stack_realign;
   crtl->stack_realign_finalized = true;
+  if (ix86_handle_explicit_vzeroupper_in_pro_epilog ())
+    recompute_frame_layout_p = true;
   if (recompute_frame_layout_p)
     ix86_compute_frame_layout ();
 }
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 97d6f3863cb..c0855a936ac 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -2654,10 +2654,6 @@ struct GTY(()) machine_function {
   /* True if the function needs a stack frame.  */
   BOOL_BITFIELD stack_frame_required : 1;
 
-  /* True if __builtin_ia32_vzeroupper () has been expanded in current
-     function.  */
-  BOOL_BITFIELD has_explicit_vzeroupper : 1;
-
   /* True if we should act silently, rather than raise an error for
      invalid calls.  */
   BOOL_BITFIELD silent_p : 1;
@@ -2665,6 +2661,10 @@ struct GTY(()) machine_function {
   /* The largest alignment, in bytes, of stack slot actually used.  */
   unsigned int max_used_stack_alignment;
 
+  /* Number of __builtin_ia32_vzeroupper () calls that have been
+     expanded in the current function.  */
+  unsigned int has_explicit_vzeroupper;
+
   /* During prologue/epilogue generation, the current frame state.
      Otherwise, the frame state at the end of the prologue.  */
   struct machine_frame_state fs;
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 897cf3eaea9..489fa02fa20 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -20626,7 +20626,7 @@ (define_insn_and_split "*avx_vzeroupper_1"
       else
 	{
 	  rtx reg = gen_rtx_REG (V2DImode, regno);
-	  RTVEC_ELT (vec, i + 1) = gen_rtx_CLOBBER (VOIDmode, reg);
+	  RTVEC_ELT (vec, i + 1) = gen_rtx_SET (reg, reg);
 	}
     }
   operands[0] = gen_rtx_PARALLEL (VOIDmode, vec);
@@ -20638,6 +20638,52 @@ (define_insn_and_split "*avx_vzeroupper_1"
   (set_attr "btver2_decode" "vector")
   (set_attr "mode" "OI")])
 
+(define_split
+  [(match_parallel 0 "vzeroupper_pattern"
+     [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])]
+  "TARGET_AVX && ix86_pre_reload_split ()"
+  [(match_dup 0)]
+{
+  /* When vzeroupper is used explicitly, make it clear to LRA that
+     the instruction kills the sse registers.  */
+  gcc_assert (cfun->machine->has_explicit_vzeroupper);
+  unsigned int nregs = TARGET_64BIT ? 16 : 8;
+  rtvec vec = rtvec_alloc (nregs + 1);
+  RTVEC_ELT (vec, 0) = gen_rtx_UNSPEC_VOLATILE (VOIDmode,
+						gen_rtvec (1, const1_rtx),
+						UNSPECV_VZEROUPPER);
+  for (unsigned int i = 0; i < nregs; ++i)
+    {
+      unsigned int regno = GET_SSE_REGNO (i);
+      rtx reg = gen_rtx_REG (V2DImode, regno);
+      RTVEC_ELT (vec, i + 1) = gen_rtx_CLOBBER (VOIDmode, reg);
+    }
+  operands[0] = gen_rtx_PARALLEL (VOIDmode, vec);
+})
+
+(define_insn_and_split "*avx_vzeroupper_2"
+  [(match_parallel 0 "vzeroupper_pattern"
+     [(unspec_volatile [(const_int 1)] UNSPECV_VZEROUPPER)])]
+  "TARGET_AVX && XVECLEN (operands[0], 0) == (TARGET_64BIT ? 16 : 8) + 1"
+  "vzeroupper"
+  "&& reload_completed && TARGET_64BIT_MS_ABI"
+  [(const_int 0)]
+{
+  /* To avoid redundant save and restore in pro_and_epilog, drop
+     the SETs/CLOBBERs added by the pre-reload splitter or by
+     pass_vzeroupper.  This should be safe since no CSE runs
+     between the post-reload split2 and epilogue_completed split3.  */
+  gcc_assert (cfun->machine->has_explicit_vzeroupper);
+  emit_insn (gen_avx_vzeroupper ());
+  DONE;
+}
+  [(set_attr "type" "sse")
+   (set_attr "modrm" "0")
+   (set_attr "memory" "none")
+   (set_attr "prefix" "vex")
+   (set_attr "btver2_decode" "vector")
+   (set_attr "mode" "OI")])
+
 (define_mode_attr pbroadcast_evex_isa
   [(V64QI "avx512bw") (V32QI "avx512bw") (V16QI "avx512bw")
    (V32HI "avx512bw") (V16HI "avx512bw") (V8HI "avx512bw")
diff --git a/gcc/testsuite/gcc.target/i386/pr82735-1.c b/gcc/testsuite/gcc.target/i386/pr82735-1.c
new file mode 100644
index 00000000000..1a63b9ae9c9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr82735-1.c
@@ -0,0 +1,29 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+
+void
+__attribute__ ((noipa))
+mtest(char *dest)
+{
+  __m256i ymm1 = _mm256_set1_epi8((char)0x1);
+  _mm256_storeu_si256((__m256i *)(dest + 32), ymm1);
+  _mm256_zeroupper();
+  __m256i ymm2 = _mm256_set1_epi8((char)0x1);
+  _mm256_storeu_si256((__m256i *)dest, ymm2);
+}
+
+void
+avx_test ()
+{
+  char buf[64];
+  for (int i = 0; i != 64; i++)
+    buf[i] = 2;
+  mtest (buf);
+
+  for (int i = 0; i < 32; ++i)
+    if (buf[i] != 1)
+      __builtin_abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr82735-2.c b/gcc/testsuite/gcc.target/i386/pr82735-2.c
new file mode 100644
index 00000000000..48d0d6e983d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr82735-2.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx -O2" } */
+
+#include <immintrin.h>
+
+void test(char *dest)
+{
+  /* xmm1 can be propagated to xmm2 by CSE.  */
+  __m128i xmm1 = _mm_set1_epi8((char)0x1);
+  _mm_storeu_si128((__m128i *)(dest + 32), xmm1);
+  _mm256_zeroupper();
+  _mm256_zeroupper();
+  _mm256_zeroupper();
+  _mm256_zeroupper();
+  _mm256_zeroupper();
+  __m128i xmm2 = _mm_set1_epi8((char)0x1);
+  _mm_storeu_si128((__m128i *)dest, xmm2);
+}
+
+/* The Darwin local constant symbol is "lC0"; ELF targets use ".LC0".  */
+/* { dg-final { scan-assembler-times {(?n)vmovdqa\t\.?[Ll]C0[^,]*, %xmm[0-9]} 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr82735-3.c b/gcc/testsuite/gcc.target/i386/pr82735-3.c
new file mode 100644
index 00000000000..e3f801e6924
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr82735-3.c
@@ -0,0 +1,5 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx -O2 -mabi=ms" } */
+/* { dg-final { scan-assembler-not {(?n)xmm([6-9]|1[0-5])} } } */
+
+#include "pr82735-2.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr82735-4.c b/gcc/testsuite/gcc.target/i386/pr82735-4.c
new file mode 100644
index 00000000000..78c0a6cb2c8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr82735-4.c
@@ -0,0 +1,48 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mavx -O2 -mabi=ms -mno-avx512f -masm=att" } */
+/* { dg-final { scan-assembler-times {(?n)(?:vmovdqa[1-9]*|vmovap[sd])[\t ]*%xmm[0-9]+, [0-9]*\(%rsp\)} 10 } } */
+/* { dg-final { scan-assembler-times {(?n)(?:vmovdqa[1-9]*|vmovap[sd])[\t ]*[0-9]*\(%rsp\), %xmm[0-9]+} 10 } } */
+
+#include <immintrin.h>
+
+void test(char *dest)
+{
+  __m256i ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15;
+  asm volatile ("vmovdqa\t%%ymm0, %0\n\t"
+		"vmovdqa\t%%ymm0, %1\n\t"
+		"vmovdqa\t%%ymm0, %2\n\t"
+		"vmovdqa\t%%ymm0, %3\n\t"
+		"vmovdqa\t%%ymm0, %4\n\t"
+		"vmovdqa\t%%ymm0, %5\n\t"
+		"vmovdqa\t%%ymm0, %6\n\t"
+		"vmovdqa\t%%ymm0, %7\n\t"
+		"vmovdqa\t%%ymm0, %8\n\t"
+		"vmovdqa\t%%ymm0, %9\n\t"
+		"vmovdqa\t%%ymm0, %10\n\t"
+		"vmovdqa\t%%ymm0, %11\n\t"
+		"vmovdqa\t%%ymm0, %12\n\t"
+		"vmovdqa\t%%ymm0, %13\n\t"
+		"vmovdqa\t%%ymm0, %14\n\t"
+		"vmovdqa\t%%ymm0, %15\n\t"
+		: "=v" (ymm1), "=v" (ymm2), "=v" (ymm3), "=v" (ymm4), "=v" (ymm5),
+		  "=v" (ymm6), "=v" (ymm7), "=v" (ymm8), "=v" (ymm9), "=v" (ymm10),
+		  "=v" (ymm11), "=v" (ymm12), "=v" (ymm13), "=v" (ymm14), "=v" (ymm15),
+		  "=v" (ymm0)
+		::);
+  _mm256_zeroupper();
+  _mm256_storeu_si256((__m256i *)dest, ymm1);
+  _mm256_storeu_si256((__m256i *)(dest + 32), ymm2);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 2), ymm3);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 3), ymm4);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 4), ymm5);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 5), ymm6);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 6), ymm7);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 7), ymm8);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 8), ymm9);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 9), ymm10);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 10), ymm11);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 11), ymm12);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 12), ymm13);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 13), ymm14);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 14), ymm15);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr82735-5.c b/gcc/testsuite/gcc.target/i386/pr82735-5.c
new file mode 100644
index 00000000000..2a58cbe52d0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr82735-5.c
@@ -0,0 +1,54 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mavx -O2 -mabi=ms -mno-avx512f -masm=att" } */
+/* { dg-final { scan-assembler-times {(?n)(?:vmovdqa[1-9]*|vmovap[sd])[\t ]*%xmm[0-9]+, [0-9]*\(%rsp\)} 10 } } */
+/* { dg-final { scan-assembler-times {(?n)(?:vmovdqa[1-9]*|vmovap[sd])[\t ]*[0-9]*\(%rsp\), %xmm[0-9]+} 10 } } */
+
+#include <immintrin.h>
+
+void test(char *dest)
+{
+  __m256i ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15;
+  asm volatile ("vmovdqa\t%%ymm0, %0\n\t"
+		"vmovdqa\t%%ymm0, %1\n\t"
+		"vmovdqa\t%%ymm0, %2\n\t"
+		"vmovdqa\t%%ymm0, %3\n\t"
+		"vmovdqa\t%%ymm0, %4\n\t"
+		"vmovdqa\t%%ymm0, %5\n\t"
+		"vmovdqa\t%%ymm0, %6\n\t"
+		"vmovdqa\t%%ymm0, %7\n\t"
+		"vmovdqa\t%%ymm0, %8\n\t"
+		"vmovdqa\t%%ymm0, %9\n\t"
+		"vmovdqa\t%%ymm0, %10\n\t"
+		"vmovdqa\t%%ymm0, %11\n\t"
+		"vmovdqa\t%%ymm0, %12\n\t"
+		"vmovdqa\t%%ymm0, %13\n\t"
+		"vmovdqa\t%%ymm0, %14\n\t"
+		"vmovdqa\t%%ymm0, %15\n\t"
+		: "=v" (ymm1), "=v" (ymm2), "=v" (ymm3), "=v" (ymm4), "=v" (ymm5),
+		  "=v" (ymm6), "=v" (ymm7), "=v" (ymm8), "=v" (ymm9), "=v" (ymm10),
+		  "=v" (ymm11), "=v" (ymm12), "=v" (ymm13), "=v" (ymm14), "=v" (ymm15),
+		  "=v" (ymm0)
+		::);
+  _mm256_zeroupper();
+  _mm256_zeroupper();
+  _mm256_zeroupper();
+  _mm256_zeroupper();
+  _mm256_zeroupper();
+  _mm256_zeroupper();
+  _mm256_zeroupper();
+  _mm256_storeu_si256((__m256i *)dest, ymm1);
+  _mm256_storeu_si256((__m256i *)(dest + 32), ymm2);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 2), ymm3);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 3), ymm4);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 4), ymm5);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 5), ymm6);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 6), ymm7);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 7), ymm8);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 8), ymm9);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 9), ymm10);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 10), ymm11);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 11), ymm12);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 12), ymm13);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 13), ymm14);
+  _mm256_storeu_si256((__m256i *)(dest + 32 * 14), ymm15);
+}
-- 
2.18.1