Hi Richard,

On Fri, 28 Apr 2023 at 14:41, Richard Biener via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:

> This adds a scatter vectorization capability to the vectorizer
> without target support by decomposing the offset and data vectors
> and then performing scalar stores in the order of vector lanes.
> This is aimed at cases where vectorizing the rest of the loop
> offsets the cost of vectorizing the scatter.
>
> The offset load is still vectorized and costed as such, but like
> with emulated gather those will be turned back to scalar loads
> by forwprop.
>
> Slightly fixed compared to the version posted in autumn,
> re-bootstrapped & tested on x86_64-unknown-linux-gnu and pushed.
>
> Richard.
>
>         * tree-vect-data-refs.cc (vect_analyze_data_refs): Always
>         consider scatters.
>         * tree-vect-stmts.cc (vect_model_store_cost): Pass in the
>         gather-scatter info and cost emulated scatters accordingly.
>         (get_load_store_type): Support emulated scatters.
>         (vectorizable_store): Likewise.  Emulate them by extracting
>         scalar offsets and data, doing scalar stores.
>
>         * gcc.dg/vect/pr25413a.c: Un-XFAIL everywhere.
>
We are now seeing these failures after this patch was committed:

FAIL: gcc.dg/vect/pr25413a.c -flto -ffat-lto-objects scan-tree-dump-times vect "vectorized 2 loops" 1
FAIL: gcc.dg/vect/pr25413a.c scan-tree-dump-times vect "vectorized 2 loops" 1

on aarch64.

Christophe

>         * gcc.dg/vect/vect-71.c: Likewise.
>         * gcc.dg/vect/tsvc/vect-tsvc-s4113.c: Likewise.
>         * gcc.dg/vect/tsvc/vect-tsvc-s491.c: Likewise.
>         * gcc.dg/vect/tsvc/vect-tsvc-vas.c: Likewise.
> ---
>  gcc/testsuite/gcc.dg/vect/pr25413a.c          |   3 +-
>  .../gcc.dg/vect/tsvc/vect-tsvc-s4113.c        |   2 +-
>  .../gcc.dg/vect/tsvc/vect-tsvc-s491.c         |   2 +-
>  .../gcc.dg/vect/tsvc/vect-tsvc-vas.c          |   2 +-
>  gcc/testsuite/gcc.dg/vect/vect-71.c           |   2 +-
>  gcc/tree-vect-data-refs.cc                    |   4 +-
>  gcc/tree-vect-stmts.cc                        | 117 ++++++++++++++----
>  7 files changed, 97 insertions(+), 35 deletions(-)
>
> diff --git a/gcc/testsuite/gcc.dg/vect/pr25413a.c b/gcc/testsuite/gcc.dg/vect/pr25413a.c
> index e444b2c3e8e..ffb517c9ce0 100644
> --- a/gcc/testsuite/gcc.dg/vect/pr25413a.c
> +++ b/gcc/testsuite/gcc.dg/vect/pr25413a.c
> @@ -123,7 +123,6 @@ int main (void)
>    return 0;
>  }
>
> -/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { ! vect_scatter_store } } } } */
> -/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target vect_scatter_store } } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */
>  /* { dg-final { scan-tree-dump-times "vector alignment may not be reachable" 1 "vect" { target { ! vector_alignment_reachable } } } } */
>  /* { dg-final { scan-tree-dump-times "Alignment of access forced using versioning" 1 "vect" { target { ! vector_alignment_reachable } } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s4113.c b/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s4113.c
> index b64682a65df..ddb7e9dc0e8 100644
> --- a/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s4113.c
> +++ b/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s4113.c
> @@ -39,4 +39,4 @@ int main (int argc, char **argv)
>    return 0;
>  }
>
> -/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { xfail { ! aarch64_sve } } } } */
> +/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s491.c b/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s491.c
> index 8465e137070..29e90ff0aff 100644
> --- a/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s491.c
> +++ b/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s491.c
> @@ -39,4 +39,4 @@ int main (int argc, char **argv)
>    return 0;
>  }
>
> -/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { xfail { ! aarch64_sve } } } } */
> +/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-vas.c b/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-vas.c
> index 5ff38851f43..b72ee21a9a3 100644
> --- a/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-vas.c
> +++ b/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-vas.c
> @@ -39,4 +39,4 @@ int main (int argc, char **argv)
>    return 0;
>  }
>
> -/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { xfail { ! aarch64_sve } } } } */
> +/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-71.c b/gcc/testsuite/gcc.dg/vect/vect-71.c
> index f15521176df..581473fa4a1 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-71.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-71.c
> @@ -36,4 +36,4 @@ int main (void)
>    return main1 ();
>  }
>
> -/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail { ! vect_scatter_store } } } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
> diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> index c03ffb3aaf1..6721ab6efc4 100644
> --- a/gcc/tree-vect-data-refs.cc
> +++ b/gcc/tree-vect-data-refs.cc
> @@ -4464,9 +4464,7 @@ vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
>             && !TREE_THIS_VOLATILE (DR_REF (dr));
>         bool maybe_scatter
>           = DR_IS_WRITE (dr)
> -           && !TREE_THIS_VOLATILE (DR_REF (dr))
> -           && (targetm.vectorize.builtin_scatter != NULL
> -               || supports_vec_scatter_store_p ());
> +           && !TREE_THIS_VOLATILE (DR_REF (dr));
>
>        /* If target supports vector gather loads or scatter stores,
>           see if they can't be used.  */
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index dc2dc2cfa7e..c71e28737ee 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -942,6 +942,7 @@ cfun_returns (tree decl)
>  static void
>  vect_model_store_cost (vec_info *vinfo, stmt_vec_info stmt_info, int ncopies,
>                        vect_memory_access_type memory_access_type,
> +                      gather_scatter_info *gs_info,
>                        dr_alignment_support alignment_support_scheme,
>                        int misalignment,
>                        vec_load_store_type vls_type, slp_tree slp_node,
> @@ -997,8 +998,16 @@ vect_model_store_cost (vec_info *vinfo, stmt_vec_info stmt_info, int ncopies,
>    if (memory_access_type == VMAT_ELEMENTWISE
>        || memory_access_type == VMAT_GATHER_SCATTER)
>      {
> -      /* N scalar stores plus extracting the elements.  */
>        unsigned int assumed_nunits = vect_nunits_for_cost (vectype);
> +      if (memory_access_type == VMAT_GATHER_SCATTER
> +         && gs_info->ifn == IFN_LAST && !gs_info->decl)
> +       /* For emulated scatter N offset vector element extracts
> +          (we assume the scalar scaling and ptr + offset add is consumed by
> +          the load).  */
> +       inside_cost += record_stmt_cost (cost_vec, ncopies * assumed_nunits,
> +                                        vec_to_scalar, stmt_info, 0,
> +                                        vect_body);
> +      /* N scalar stores plus extracting the elements.  */
>        inside_cost += record_stmt_cost (cost_vec,
>                                         ncopies * assumed_nunits,
>                                         scalar_store, stmt_info, 0,
>                                         vect_body);
> @@ -1008,7 +1017,9 @@ vect_model_store_cost (vec_info *vinfo, stmt_vec_info stmt_info, int ncopies,
>                          misalignment, &inside_cost, cost_vec);
>
>    if (memory_access_type == VMAT_ELEMENTWISE
> -      || memory_access_type == VMAT_STRIDED_SLP)
> +      || memory_access_type == VMAT_STRIDED_SLP
> +      || (memory_access_type == VMAT_GATHER_SCATTER
> +         && gs_info->ifn == IFN_LAST && !gs_info->decl))
>      {
>        /* N scalar stores plus extracting the elements.  */
>        unsigned int assumed_nunits = vect_nunits_for_cost (vectype);
> @@ -2503,19 +2514,11 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
>         }
>        else if (gs_info->ifn == IFN_LAST && !gs_info->decl)
>         {
> -         if (vls_type != VLS_LOAD)
> -           {
> -             if (dump_enabled_p ())
> -               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                                "unsupported emulated scatter.\n");
> -             return false;
> -           }
> -         else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ()
> -                  || !TYPE_VECTOR_SUBPARTS
> -                        (gs_info->offset_vectype).is_constant ()
> -                  || !constant_multiple_p (TYPE_VECTOR_SUBPARTS
> -                                             (gs_info->offset_vectype),
> -                                           TYPE_VECTOR_SUBPARTS (vectype)))
> +         if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ()
> +             || !TYPE_VECTOR_SUBPARTS (gs_info->offset_vectype).is_constant ()
> +             || !constant_multiple_p (TYPE_VECTOR_SUBPARTS
> +                                        (gs_info->offset_vectype),
> +                                      TYPE_VECTOR_SUBPARTS (vectype)))
>           {
>             if (dump_enabled_p ())
>               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -7824,6 +7827,15 @@ vectorizable_store (vec_info *vinfo,
>                              "unsupported access type for masked store.\n");
>           return false;
>         }
> +      else if (memory_access_type == VMAT_GATHER_SCATTER
> +              && gs_info.ifn == IFN_LAST
> +              && !gs_info.decl)
> +       {
> +         if (dump_enabled_p ())
> +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                            "unsupported masked emulated scatter.\n");
> +         return false;
> +       }
>      }
>    else
>      {
> @@ -7887,7 +7899,8 @@ vectorizable_store (vec_info *vinfo,
>
>        STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
>        vect_model_store_cost (vinfo, stmt_info, ncopies,
> -                            memory_access_type, alignment_support_scheme,
> +                            memory_access_type, &gs_info,
> +                            alignment_support_scheme,
>                              misalignment, vls_type, slp_node, cost_vec);
>        return true;
>      }
> @@ -8527,12 +8540,9 @@ vectorizable_store (vec_info *vinfo,
>             dataref_offset = build_int_cst (ref_type, 0);
>         }
>        else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> -       {
> -         vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
> -                                      slp_node, &gs_info, &dataref_ptr,
> -                                      &vec_offsets);
> -         vec_offset = vec_offsets[0];
> -       }
> +       vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
> +                                    slp_node, &gs_info, &dataref_ptr,
> +                                    &vec_offsets);
>        else
>         dataref_ptr
>           = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
> @@ -8558,9 +8568,7 @@ vectorizable_store (vec_info *vinfo,
>        if (dataref_offset)
>         dataref_offset
>           = int_const_binop (PLUS_EXPR, dataref_offset, bump);
> -      else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> -       vec_offset = vec_offsets[j];
> -      else
> +      else if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>         dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
>                                        stmt_info, bump);
>      }
> @@ -8648,8 +8656,11 @@ vectorizable_store (vec_info *vinfo,
>             final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
>                                            final_mask, vec_mask, gsi);
>
> -         if (memory_access_type == VMAT_GATHER_SCATTER)
> +         if (memory_access_type == VMAT_GATHER_SCATTER
> +             && gs_info.ifn != IFN_LAST)
>             {
> +             if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> +               vec_offset = vec_offsets[vec_num * j + i];
>               tree scale = size_int (gs_info.scale);
>               gcall *call;
>               if (final_mask)
> @@ -8665,6 +8676,60 @@ vectorizable_store (vec_info *vinfo,
>               new_stmt = call;
>               break;
>             }
> +         else if (memory_access_type == VMAT_GATHER_SCATTER)
> +           {
> +             /* Emulated scatter.  */
> +             gcc_assert (!final_mask);
> +             unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
> +             unsigned HOST_WIDE_INT const_offset_nunits
> +               = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
> +                   .to_constant ();
> +             vec<constructor_elt, va_gc> *ctor_elts;
> +             vec_alloc (ctor_elts, const_nunits);
> +             gimple_seq stmts = NULL;
> +             tree elt_type = TREE_TYPE (vectype);
> +             unsigned HOST_WIDE_INT elt_size
> +               = tree_to_uhwi (TYPE_SIZE (elt_type));
> +             /* We support offset vectors with more elements
> +                than the data vector for now.  */
> +             unsigned HOST_WIDE_INT factor
> +               = const_offset_nunits / const_nunits;
> +             vec_offset = vec_offsets[j / factor];
> +             unsigned elt_offset = (j % factor) * const_nunits;
> +             tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
> +             tree scale = size_int (gs_info.scale);
> +             align = get_object_alignment (DR_REF (first_dr_info->dr));
> +             tree ltype = build_aligned_type (TREE_TYPE (vectype), align);
> +             for (unsigned k = 0; k < const_nunits; ++k)
> +               {
> +                 /* Compute the offsetted pointer.  */
> +                 tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
> +                                         bitsize_int (k + elt_offset));
> +                 tree idx = gimple_build (&stmts, BIT_FIELD_REF,
> +                                          idx_type, vec_offset,
> +                                          TYPE_SIZE (idx_type), boff);
> +                 idx = gimple_convert (&stmts, sizetype, idx);
> +                 idx = gimple_build (&stmts, MULT_EXPR,
> +                                     sizetype, idx, scale);
> +                 tree ptr = gimple_build (&stmts, PLUS_EXPR,
> +                                          TREE_TYPE (dataref_ptr),
> +                                          dataref_ptr, idx);
> +                 ptr = gimple_convert (&stmts, ptr_type_node, ptr);
> +                 /* Extract the element to be stored.  */
> +                 tree elt = gimple_build (&stmts, BIT_FIELD_REF,
> +                                          TREE_TYPE (vectype), vec_oprnd,
> +                                          TYPE_SIZE (elt_type),
> +                                          bitsize_int (k * elt_size));
> +                 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> +                 stmts = NULL;
> +                 tree ref = build2 (MEM_REF, ltype, ptr,
> +                                    build_int_cst (ref_type, 0));
> +                 new_stmt = gimple_build_assign (ref, elt);
> +                 vect_finish_stmt_generation (vinfo, stmt_info,
> +                                              new_stmt, gsi);
> +               }
> +             break;
> +           }
>
>           if (i > 0)
>             /* Bump the vector pointer.  */
> --
> 2.35.3
>
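
For readers following along: the kind of loop this patch affects is an
indexed store such as the TSVC "vas" kernel.  Below is a hand-written
sketch in C (not the actual testsuite source; the names and the 4-lane
width are illustrative) of such a loop, and of the per-lane scalar
decomposition the emulated scatter performs:

#define N 1024

float a[N], b[N];
int ip[N];

/* Scatter-store kernel: each iteration stores through an index
   loaded from memory, so the store addresses are not consecutive.  */
void
scatter_kernel (void)
{
  for (int i = 0; i < N; i++)
    a[ip[i]] = b[i];
}

/* Roughly what the emulated scatter does per vector iteration,
   written out as scalar C for a 4-lane vector: the offset and data
   vectors are decomposed lane by lane and the stores are issued as
   scalar stores in lane order.  */
void
scatter_kernel_emulated (void)
{
  for (int i = 0; i < N; i += 4)
    {
      a[ip[i + 0]] = b[i + 0];
      a[ip[i + 1]] = b[i + 1];
      a[ip[i + 2]] = b[i + 2];
      a[ip[i + 3]] = b[i + 3];
    }
}

Per the cost-model change above, each vector iteration is charged one
vec_to_scalar extract per lane for the offsets plus one scalar_store
per lane, so the emulation only pays off when vectorizing the rest of
the loop covers that cost.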