On Wed, Jul 14, 2021 at 8:38 PM H.J. Lu wrote: > > On Tue, Jul 13, 2021 at 9:35 PM Hongtao Liu wrote: > > > > On Wed, Jul 14, 2021 at 10:34 AM liuhongt wrote: > > > > > > By optimizing vector movement to broadcast in ix86_expand_vector_move > > > during pass_expand, pass_reload/LRA can automatically generate an avx512 > > > embedded broadcast, pass_cpb is not needed. > > > > > > Considering that in the absence of avx512f, broadcast from memory is > > > still slightly faster than loading the entire memory, so always enable > > > broadcast. > > > > > > benchmark: > > > https://gitlab.com/x86-benchmarks/microbenchmark/-/tree/vaddps/broadcast > > > > > > The performance diff > > > > > > strategy : cycles > > > memory : 1046611188 > > > memory : 1255420817 > > > memory : 1044720793 > > > memory : 1253414145 > > > average : 1097868397 > > > > > > broadcast : 1044430688 > > > broadcast : 1044477630 > > > broadcast : 1253554603 > > > broadcast : 1044561934 > > > average : 1096756213 > > > > > > But however broadcast has larger size. > > > > > > the size diff > > > > > > size broadcast.o > > > text data bss dec hex filename > > > 137 0 0 137 89 broadcast.o > > > > > > size memory.o > > > text data bss dec hex filename > > > 115 0 0 115 73 memory.o > > > > > > Bootstrapped and regtested on x86_64-linux-gnu{-m32,} > > > > > > gcc/ChangeLog: > > > > > > * config/i386/i386-expand.c > > > (ix86_broadcast_from_integer_constant): Rename to .. > > > (ix86_broadcast_from_constant): .. this, and extend it to > > > handle float mode. > > > (ix86_expand_vector_move): Extend to float mode. > > > * config/i386/i386-features.c > > > (replace_constant_pool_with_broadcast): Remove. > > > (remove_partial_avx_dependency_gate): Ditto. > > > (constant_pool_broadcast): Ditto. > > > (class pass_constant_pool_broadcast): Ditto. > > > (make_pass_constant_pool_broadcast): Ditto. > > > (remove_partial_avx_dependency): Adjust gate. > > > * config/i386/i386-passes.def: Remove pass_constant_pool_broadcast. > > > * config/i386/i386-protos.h > > > (make_pass_constant_pool_broadcast): Remove. > > > > > > gcc/testsuite/ChangeLog: > > > > > > * gcc.target/i386/fuse-caller-save-xmm.c: Adjust testcase. > > > --- > > > gcc/config/i386/i386-expand.c | 29 +++- > > > gcc/config/i386/i386-features.c | 157 +----------------- > > > gcc/config/i386/i386-passes.def | 1 - > > > gcc/config/i386/i386-protos.h | 1 - > > > .../gcc.target/i386/fuse-caller-save-xmm.c | 2 +- > > > 5 files changed, 26 insertions(+), 164 deletions(-) > > > > > > diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c > > > index 69ea79e6123..ba870145acd 100644 > > > --- a/gcc/config/i386/i386-expand.c > > > +++ b/gcc/config/i386/i386-expand.c > > > @@ -453,8 +453,10 @@ ix86_expand_move (machine_mode mode, rtx operands[]) > > > emit_insn (gen_rtx_SET (op0, op1)); > > > } > > > > > > +/* OP is a memref of CONST_VECTOR, return scalar constant mem > > > + if CONST_VECTOR is a vec_duplicate, else return NULL. */ > > > static rtx > > > -ix86_broadcast_from_integer_constant (machine_mode mode, rtx op) > > > +ix86_broadcast_from_constant (machine_mode mode, rtx op) > > > { > > > int nunits = GET_MODE_NUNITS (mode); > > > if (nunits < 2) > > > @@ -462,7 +464,8 @@ ix86_broadcast_from_integer_constant (machine_mode mode, rtx op) > > > > > > /* Don't use integer vector broadcast if we can't move from GPR to SSE > > > register directly. */ > > > - if (!TARGET_INTER_UNIT_MOVES_TO_VEC) > > > + if (!TARGET_INTER_UNIT_MOVES_TO_VEC > > > + && INTEGRAL_MODE_P (mode)) > > > return nullptr; > > > > > > /* Convert CONST_VECTOR to a non-standard SSE constant integer > > > @@ -470,12 +473,17 @@ ix86_broadcast_from_integer_constant (machine_mode mode, rtx op) > > > if (!(TARGET_AVX2 > > > || (TARGET_AVX > > > && (GET_MODE_INNER (mode) == SImode > > > - || GET_MODE_INNER (mode) == DImode))) > > > + || GET_MODE_INNER (mode) == DImode)) > > > + || FLOAT_MODE_P (mode)) > > > || standard_sse_constant_p (op, mode)) > > > return nullptr; > > > > > > - /* Don't broadcast from a 64-bit integer constant in 32-bit mode. */ > > > - if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT) > > > + /* Don't broadcast from a 64-bit integer constant in 32-bit mode. > > > + We can still put 64-bit integer constant in memory when > > > + avx512 embed broadcast is available. */ > > > + if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT > > > + && (!TARGET_AVX512F > > > + || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL))) > > > return nullptr; > > > > > > if (GET_MODE_INNER (mode) == TImode) > > > @@ -561,17 +569,20 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[]) > > > > > > if (can_create_pseudo_p () > > > && GET_MODE_SIZE (mode) >= 16 > > > - && GET_MODE_CLASS (mode) == MODE_VECTOR_INT > > > + && VECTOR_MODE_P (mode) > > > && (MEM_P (op1) > > > && SYMBOL_REF_P (XEXP (op1, 0)) > > > && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0)))) > > > { > > > - rtx first = ix86_broadcast_from_integer_constant (mode, op1); > > > + rtx first = ix86_broadcast_from_constant (mode, op1); > > > if (first != nullptr) > > > { > > > /* Broadcast to XMM/YMM/ZMM register from an integer > > > - constant. */ > > > - op1 = ix86_gen_scratch_sse_rtx (mode); > > > + constant or scalar mem. */ > > > + op1 = gen_reg_rtx (mode); > > I've changed ix86_gen_scratch_sse_rtx to gen_reg_rtx to let LRA/reload > > make the final decision for register usage, would that make sense? > > Hard registers are used for 2 purposes: > > 1. Prevent stack realignment when the original code doesn't use vector > registers, which is the same for memcpy and memset. > 2. Prevent combine to convert constant broadcast to load from constant > pool. > > Please add an argument to ix86_gen_scratch_sse_rtx to use hard registers > for the above 2 cases. You can get pseudo registers for your use cases. > Ok, Here's the patch i'm going to check in. > > w/o that, hard register used here will prevent LRA/reload combine (set > > op1 (vec_duplicate: mem)) and (add (op1, op2)) to (add (op1, > > vec_duplicate: mem)) > > > > > + if (FLOAT_MODE_P (mode) > > > + || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode)) > > > + first = force_const_mem (GET_MODE_INNER (mode), first); > > > bool ok = ix86_expand_vector_init_duplicate (false, mode, > > > op1, first); > > > gcc_assert (ok); > > > diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c > > > index cbd430a2ecf..d9c6652d1c9 100644 > > > --- a/gcc/config/i386/i386-features.c > > > +++ b/gcc/config/i386/i386-features.c > > > @@ -2136,81 +2136,6 @@ make_pass_insert_endbr_and_patchable_area (gcc::context *ctxt) > > > return new pass_insert_endbr_and_patchable_area (ctxt); > > > } > > > > > > -/* Replace all one-value const vector that are referenced by SYMBOL_REFs in x > > > - with embedded broadcast. i.e.transform > > > - > > > - vpaddq .LC0(%rip), %zmm0, %zmm0 > > > - ret > > > - .LC0: > > > - .quad 3 > > > - .quad 3 > > > - .quad 3 > > > - .quad 3 > > > - .quad 3 > > > - .quad 3 > > > - .quad 3 > > > - .quad 3 > > > - > > > - to > > > - > > > - vpaddq .LC0(%rip){1to8}, %zmm0, %zmm0 > > > - ret > > > - .LC0: > > > - .quad 3 */ > > > -static void > > > -replace_constant_pool_with_broadcast (rtx_insn *insn) > > > -{ > > > - subrtx_ptr_iterator::array_type array; > > > - FOR_EACH_SUBRTX_PTR (iter, array, &PATTERN (insn), ALL) > > > - { > > > - rtx *loc = *iter; > > > - rtx x = *loc; > > > - rtx broadcast_mem, vec_dup, constant, first; > > > - machine_mode mode; > > > - > > > - /* Constant pool. */ > > > - if (!MEM_P (x) > > > - || !SYMBOL_REF_P (XEXP (x, 0)) > > > - || !CONSTANT_POOL_ADDRESS_P (XEXP (x, 0))) > > > - continue; > > > - > > > - /* Const vector. */ > > > - mode = GET_MODE (x); > > > - if (!VECTOR_MODE_P (mode)) > > > - return; > > > - constant = get_pool_constant (XEXP (x, 0)); > > > - if (GET_CODE (constant) != CONST_VECTOR) > > > - return; > > > - > > > - /* There could be some rtx like > > > - (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1"))) > > > - but with "*.LC1" refer to V2DI constant vector. */ > > > - if (GET_MODE (constant) != mode) > > > - { > > > - constant = simplify_subreg (mode, constant, GET_MODE (constant), 0); > > > - if (constant == NULL_RTX || GET_CODE (constant) != CONST_VECTOR) > > > - return; > > > - } > > > - first = XVECEXP (constant, 0, 0); > > > - > > > - for (int i = 1; i < GET_MODE_NUNITS (mode); ++i) > > > - { > > > - rtx tmp = XVECEXP (constant, 0, i); > > > - /* Vector duplicate value. */ > > > - if (!rtx_equal_p (tmp, first)) > > > - return; > > > - } > > > - > > > - /* Replace with embedded broadcast. */ > > > - broadcast_mem = force_const_mem (GET_MODE_INNER (mode), first); > > > - vec_dup = gen_rtx_VEC_DUPLICATE (mode, broadcast_mem); > > > - validate_change (insn, loc, vec_dup, 0); > > > - > > > - /* At most 1 memory_operand in an insn. */ > > > - return; > > > - } > > > -} > > > - > > > /* At entry of the nearest common dominator for basic blocks with > > > conversions, generate a single > > > vxorps %xmmN, %xmmN, %xmmN > > > @@ -2249,10 +2174,6 @@ remove_partial_avx_dependency (void) > > > if (!NONDEBUG_INSN_P (insn)) > > > continue; > > > > > > - /* Handle AVX512 embedded broadcast here to save compile time. */ > > > - if (TARGET_AVX512F) > > > - replace_constant_pool_with_broadcast (insn); > > > - > > > set = single_set (insn); > > > if (!set) > > > continue; > > > @@ -2384,16 +2305,6 @@ remove_partial_avx_dependency (void) > > > return 0; > > > } > > > > > > -static bool > > > -remove_partial_avx_dependency_gate () > > > -{ > > > - return (TARGET_AVX > > > - && TARGET_SSE_PARTIAL_REG_DEPENDENCY > > > - && TARGET_SSE_MATH > > > - && optimize > > > - && optimize_function_for_speed_p (cfun)); > > > -} > > > - > > > namespace { > > > > > > const pass_data pass_data_remove_partial_avx_dependency = > > > @@ -2419,7 +2330,11 @@ public: > > > /* opt_pass methods: */ > > > virtual bool gate (function *) > > > { > > > - return remove_partial_avx_dependency_gate (); > > > + return (TARGET_AVX > > > + && TARGET_SSE_PARTIAL_REG_DEPENDENCY > > > + && TARGET_SSE_MATH > > > + && optimize > > > + && optimize_function_for_speed_p (cfun)); > > > } > > > > > > virtual unsigned int execute (function *) > > > @@ -2436,68 +2351,6 @@ make_pass_remove_partial_avx_dependency (gcc::context *ctxt) > > > return new pass_remove_partial_avx_dependency (ctxt); > > > } > > > > > > -/* For const vector having one duplicated value, there's no need to put > > > - whole vector in the constant pool when target supports embedded broadcast. */ > > > -static unsigned int > > > -constant_pool_broadcast (void) > > > -{ > > > - timevar_push (TV_MACH_DEP); > > > - rtx_insn *insn; > > > - > > > - for (insn = get_insns (); insn; insn = NEXT_INSN (insn)) > > > - { > > > - if (INSN_P (insn)) > > > - replace_constant_pool_with_broadcast (insn); > > > - } > > > - timevar_pop (TV_MACH_DEP); > > > - return 0; > > > -} > > > - > > > -namespace { > > > - > > > -const pass_data pass_data_constant_pool_broadcast = > > > -{ > > > - RTL_PASS, /* type */ > > > - "cpb", /* name */ > > > - OPTGROUP_NONE, /* optinfo_flags */ > > > - TV_MACH_DEP, /* tv_id */ > > > - 0, /* properties_required */ > > > - 0, /* properties_provided */ > > > - 0, /* properties_destroyed */ > > > - 0, /* todo_flags_start */ > > > - TODO_df_finish, /* todo_flags_finish */ > > > -}; > > > - > > > -class pass_constant_pool_broadcast : public rtl_opt_pass > > > -{ > > > -public: > > > - pass_constant_pool_broadcast (gcc::context *ctxt) > > > - : rtl_opt_pass (pass_data_constant_pool_broadcast, ctxt) > > > - {} > > > - > > > - /* opt_pass methods: */ > > > - virtual bool gate (function *) > > > - { > > > - /* Return false if rpad pass gate is true. > > > - replace_constant_pool_with_broadcast is called > > > - from both this pass and rpad pass. */ > > > - return (TARGET_AVX512F && !remove_partial_avx_dependency_gate ()); > > > - } > > > - > > > - virtual unsigned int execute (function *) > > > - { > > > - return constant_pool_broadcast (); > > > - } > > > -}; // class pass_cpb > > > - > > > -} // anon namespace > > > - > > > -rtl_opt_pass * > > > -make_pass_constant_pool_broadcast (gcc::context *ctxt) > > > -{ > > > - return new pass_constant_pool_broadcast (ctxt); > > > -} > > > - > > > /* This compares the priority of target features in function DECL1 > > > and DECL2. It returns positive value if DECL1 is higher priority, > > > negative value if DECL2 is higher priority and 0 if they are the > > > diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def > > > index 44df00e94ac..29baf8acd0b 100644 > > > --- a/gcc/config/i386/i386-passes.def > > > +++ b/gcc/config/i386/i386-passes.def > > > @@ -33,4 +33,3 @@ along with GCC; see the file COPYING3. If not see > > > INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_endbr_and_patchable_area); > > > > > > INSERT_PASS_AFTER (pass_combine, 1, pass_remove_partial_avx_dependency); > > > - INSERT_PASS_AFTER (pass_combine, 1, pass_constant_pool_broadcast); > > > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h > > > index 51376fcc454..07ac02aff69 100644 > > > --- a/gcc/config/i386/i386-protos.h > > > +++ b/gcc/config/i386/i386-protos.h > > > @@ -395,4 +395,3 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area > > > (gcc::context *); > > > extern rtl_opt_pass *make_pass_remove_partial_avx_dependency > > > (gcc::context *); > > > -extern rtl_opt_pass *make_pass_constant_pool_broadcast (gcc::context *); > > > diff --git a/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c b/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c > > > index 4deff93c1e8..b0d3dc38a0c 100644 > > > --- a/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c > > > +++ b/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c > > > @@ -6,7 +6,7 @@ typedef double v2df __attribute__((vector_size (16))); > > > static v2df __attribute__((noinline)) > > > bar (v2df a) > > > { > > > - return a + (v2df){ 3.0, 3.0 }; > > > + return a + (v2df){ 3.0, 4.0 }; > > > } > > > > > > v2df __attribute__((noinline)) > > > -- > > > 2.18.1 > > > > > > > > > -- > > BR, > > Hongtao > > > > -- > H.J. -- BR, Hongtao