* [PATCH, RFC] First cut at using vec_construct for strided loads @ 2012-06-13 4:32 William J. Schmidt 2012-06-13 9:32 ` Richard Guenther 2016-06-08 12:30 ` Richard Biener 0 siblings, 2 replies; 7+ messages in thread From: William J. Schmidt @ 2012-06-13 4:32 UTC (permalink / raw) To: gcc-patches; +Cc: rguenther, bergner This patch is a follow-up to the discussion generated by http://gcc.gnu.org/ml/gcc-patches/2012-06/msg00546.html. I've added vec_construct to the cost model for use in vect_model_load_cost, and implemented a cost calculation that makes sense to me for PowerPC. I'm less certain about the default, i386, and spu implementations. I took a guess at i386 from the discussions we had, and used the same calculation for the default and for spu. I'm hoping you or others can fill in the blanks if I guessed badly. The i386 cost for vec_construct is different from all the others, which are parameterized for each processor description. This should probably be parameterized in some way as well, but thought you'd know better than I how that should be. Perhaps instead of elements / 2 + 1 it should be (elements / 2) * X + Y where X and Y are taken from the processor description, and represent the cost of a merge and a permute, respectively. Let me know what you think. Thanks, Bill 2012-06-12 Bill Schmidt <wschmidt@linux.ibm.com> * targhooks.c (default_builtin_vectorization_cost): Handle vec_construct, using vectype to base cost on subparts. * target.h (enum vect_cost_for_stmt): Add vec_construct. * tree-vect-stmts.c (vect_model_load_cost): Use vec_construct instead of scalar_to_vec. * config/spu/spu.c (spu_builtin_vectorization_cost): Handle vec_construct in same way as default for now. * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise. * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost): Handle vec_construct, including special case for 32-bit loads. 
Index: gcc/targhooks.c =================================================================== --- gcc/targhooks.c (revision 188482) +++ gcc/targhooks.c (working copy) @@ -499,9 +499,11 @@ default_builtin_vectorized_conversion (unsigned in int default_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, - tree vectype ATTRIBUTE_UNUSED, + tree vectype, int misalign ATTRIBUTE_UNUSED) { + unsigned elements; + switch (type_of_cost) { case scalar_stmt: @@ -524,6 +526,11 @@ default_builtin_vectorization_cost (enum vect_cost case cond_branch_taken: return 3; + case vec_construct: + elements = TYPE_VECTOR_SUBPARTS (vectype); + gcc_assert (elements > 1); + return elements / 2 + 1; + default: gcc_unreachable (); } Index: gcc/target.h =================================================================== --- gcc/target.h (revision 188482) +++ gcc/target.h (working copy) @@ -146,7 +146,8 @@ enum vect_cost_for_stmt cond_branch_not_taken, cond_branch_taken, vec_perm, - vec_promote_demote + vec_promote_demote, + vec_construct }; /* The target structure. This holds all the backend hooks. */ Index: gcc/tree-vect-stmts.c =================================================================== --- gcc/tree-vect-stmts.c (revision 188482) +++ gcc/tree-vect-stmts.c (working copy) @@ -1031,11 +1031,13 @@ vect_model_load_cost (stmt_vec_info stmt_info, int /* The loads themselves. */ if (STMT_VINFO_STRIDE_LOAD_P (stmt_info)) { - /* N scalar loads plus gathering them into a vector. - ??? scalar_to_vec isn't the cost for that. */ + /* N scalar loads plus gathering them into a vector. 
*/ + tree vectype = STMT_VINFO_VECTYPE (stmt_info); inside_cost += (vect_get_stmt_cost (scalar_load) * ncopies - * TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))); - inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec); + * TYPE_VECTOR_SUBPARTS (vectype)); + inside_cost += ncopies + * targetm.vectorize.builtin_vectorization_cost (vec_construct, + vectype, 0); } else vect_get_load_cost (first_dr, ncopies, Index: gcc/config/spu/spu.c =================================================================== --- gcc/config/spu/spu.c (revision 188482) +++ gcc/config/spu/spu.c (working copy) @@ -6908,9 +6908,11 @@ spu_builtin_mask_for_load (void) /* Implement targetm.vectorize.builtin_vectorization_cost. */ static int spu_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, - tree vectype ATTRIBUTE_UNUSED, + tree vectype, int misalign ATTRIBUTE_UNUSED) { + unsigned elements; + switch (type_of_cost) { case scalar_stmt: @@ -6937,6 +6939,11 @@ spu_builtin_vectorization_cost (enum vect_cost_for case cond_branch_taken: return 6; + case vec_construct: + elements = TYPE_VECTOR_SUBPARTS (vectype); + gcc_assert (elements > 1); + return elements / 2 + 1; + default: gcc_unreachable (); } Index: gcc/config/i386/i386.c =================================================================== --- gcc/config/i386/i386.c (revision 188482) +++ gcc/config/i386/i386.c (working copy) @@ -36072,9 +36072,11 @@ static const struct attribute_spec ix86_attribute_ /* Implement targetm.vectorize.builtin_vectorization_cost. 
*/ static int ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, - tree vectype ATTRIBUTE_UNUSED, + tree vectype, int misalign ATTRIBUTE_UNUSED) { + unsigned elements; + switch (type_of_cost) { case scalar_stmt: @@ -36115,6 +36117,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo case vec_promote_demote: return ix86_cost->vec_stmt_cost; + case vec_construct: + elements = TYPE_VECTOR_SUBPARTS (vectype); + gcc_assert (elements > 1); + return elements / 2 + 1; + default: gcc_unreachable (); } Index: gcc/config/rs6000/rs6000.c =================================================================== --- gcc/config/rs6000/rs6000.c (revision 188482) +++ gcc/config/rs6000/rs6000.c (working copy) @@ -3405,6 +3405,7 @@ rs6000_builtin_vectorization_cost (enum vect_cost_ tree vectype, int misalign) { unsigned elements; + tree elem_type; switch (type_of_cost) { @@ -3504,6 +3505,19 @@ rs6000_builtin_vectorization_cost (enum vect_cost_ return 2; + case vec_construct: + elements = TYPE_VECTOR_SUBPARTS (vectype); + elem_type = TREE_TYPE (vectype); + gcc_assert (elements > 1); + /* 32-bit vectors loaded into registers are stored as double + precision, so we need n/2 converts in addition to the usual + n/2 merges to construct a vector of short floats from them. */ + if (SCALAR_FLOAT_TYPE_P (elem_type) + && TYPE_PRECISION (elem_type) == 32) + return elements + 1; + else + return elements / 2 + 1; + default: gcc_unreachable (); } ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH, RFC] First cut at using vec_construct for strided loads 2012-06-13 4:32 [PATCH, RFC] First cut at using vec_construct for strided loads William J. Schmidt @ 2012-06-13 9:32 ` Richard Guenther 2012-06-13 11:46 ` William J. Schmidt 2016-06-08 12:30 ` Richard Biener 1 sibling, 1 reply; 7+ messages in thread From: Richard Guenther @ 2012-06-13 9:32 UTC (permalink / raw) To: William J. Schmidt; +Cc: gcc-patches, bergner [-- Attachment #1: Type: TEXT/PLAIN, Size: 8086 bytes --] On Tue, 12 Jun 2012, William J. Schmidt wrote: > This patch is a follow-up to the discussion generated by > http://gcc.gnu.org/ml/gcc-patches/2012-06/msg00546.html. I've added > vec_construct to the cost model for use in vect_model_load_cost, and > implemented a cost calculation that makes sense to me for PowerPC. I'm > less certain about the default, i386, and spu implementations. I took a > guess at i386 from the discussions we had, and used the same calculation > for the default and for spu. I'm hoping you or others can fill in the > blanks if I guessed badly. > > The i386 cost for vec_construct is different from all the others, which > are parameterized for each processor description. This should probably > be parameterized in some way as well, but thought you'd know better than > I how that should be. Perhaps instead of > > elements / 2 + 1 > > it should be > > (elements / 2) * X + Y > > where X and Y are taken from the processor description, and represent > the cost of a merge and a permute, respectively. Let me know what you > think. Looks good to me with the gcc_asserts removed - TYPE_VECTOR_SUBPARTS might be 1 for V1TImode for example (heh, not that the vectorizer would vectorize to that). But I don't see any possible breakage with elements == 1, do you? Target maintainers can improve on the cost calculation if they wish, the default looks sensible to me. Thanks, Richard. 
> Thanks, > Bill > > > 2012-06-12 Bill Schmidt <wschmidt@linux.ibm.com> > > * targhooks.c (default_builtin_vectorized_conversion): Handle > vec_construct, using vectype to base cost on subparts. > * target.h (enum vect_cost_for_stmt): Add vec_construct. > * tree-vect-stmts.c (vect_model_load_cost): Use vec_construct > instead of scalar_to-vec. > * config/spu/spu.c (spu_builtin_vectorization_cost): Handle > vec_construct in same way as default for now. > * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise. > * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost): > Handle vec_construct, including special case for 32-bit loads. > > > Index: gcc/targhooks.c > =================================================================== > --- gcc/targhooks.c (revision 188482) > +++ gcc/targhooks.c (working copy) > @@ -499,9 +499,11 @@ default_builtin_vectorized_conversion (unsigned in > > int > default_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, > - tree vectype ATTRIBUTE_UNUSED, > + tree vectype, > int misalign ATTRIBUTE_UNUSED) > { > + unsigned elements; > + > switch (type_of_cost) > { > case scalar_stmt: > @@ -524,6 +526,11 @@ default_builtin_vectorization_cost (enum vect_cost > case cond_branch_taken: > return 3; > > + case vec_construct: > + elements = TYPE_VECTOR_SUBPARTS (vectype); > + gcc_assert (elements > 1); > + return elements / 2 + 1; > + > default: > gcc_unreachable (); > } > Index: gcc/target.h > =================================================================== > --- gcc/target.h (revision 188482) > +++ gcc/target.h (working copy) > @@ -146,7 +146,8 @@ enum vect_cost_for_stmt > cond_branch_not_taken, > cond_branch_taken, > vec_perm, > - vec_promote_demote > + vec_promote_demote, > + vec_construct > }; > > /* The target structure. This holds all the backend hooks. 
*/ > Index: gcc/tree-vect-stmts.c > =================================================================== > --- gcc/tree-vect-stmts.c (revision 188482) > +++ gcc/tree-vect-stmts.c (working copy) > @@ -1031,11 +1031,13 @@ vect_model_load_cost (stmt_vec_info stmt_info, int > /* The loads themselves. */ > if (STMT_VINFO_STRIDE_LOAD_P (stmt_info)) > { > - /* N scalar loads plus gathering them into a vector. > - ??? scalar_to_vec isn't the cost for that. */ > + /* N scalar loads plus gathering them into a vector. */ > + tree vectype = STMT_VINFO_VECTYPE (stmt_info); > inside_cost += (vect_get_stmt_cost (scalar_load) * ncopies > - * TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))); > - inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec); > + * TYPE_VECTOR_SUBPARTS (vectype)); > + inside_cost += ncopies > + * targetm.vectorize.builtin_vectorization_cost (vec_construct, > + vectype, 0); > } > else > vect_get_load_cost (first_dr, ncopies, > Index: gcc/config/spu/spu.c > =================================================================== > --- gcc/config/spu/spu.c (revision 188482) > +++ gcc/config/spu/spu.c (working copy) > @@ -6908,9 +6908,11 @@ spu_builtin_mask_for_load (void) > /* Implement targetm.vectorize.builtin_vectorization_cost. 
*/ > static int > spu_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, > - tree vectype ATTRIBUTE_UNUSED, > + tree vectype, > int misalign ATTRIBUTE_UNUSED) > { > + unsigned elements; > + > switch (type_of_cost) > { > case scalar_stmt: > @@ -6937,6 +6939,11 @@ spu_builtin_vectorization_cost (enum vect_cost_for > case cond_branch_taken: > return 6; > > + case vec_construct: > + elements = TYPE_VECTOR_SUBPARTS (vectype); > + gcc_assert (elements > 1); > + return elements / 2 + 1; > + > default: > gcc_unreachable (); > } > Index: gcc/config/i386/i386.c > =================================================================== > --- gcc/config/i386/i386.c (revision 188482) > +++ gcc/config/i386/i386.c (working copy) > @@ -36072,9 +36072,11 @@ static const struct attribute_spec ix86_attribute_ > /* Implement targetm.vectorize.builtin_vectorization_cost. */ > static int > ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, > - tree vectype ATTRIBUTE_UNUSED, > + tree vectype, > int misalign ATTRIBUTE_UNUSED) > { > + unsigned elements; > + > switch (type_of_cost) > { > case scalar_stmt: > @@ -36115,6 +36117,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo > case vec_promote_demote: > return ix86_cost->vec_stmt_cost; > > + case vec_construct: > + elements = TYPE_VECTOR_SUBPARTS (vectype); > + gcc_assert (elements > 1); > + return elements / 2 + 1; > + > default: > gcc_unreachable (); > } > Index: gcc/config/rs6000/rs6000.c > =================================================================== > --- gcc/config/rs6000/rs6000.c (revision 188482) > +++ gcc/config/rs6000/rs6000.c (working copy) > @@ -3405,6 +3405,7 @@ rs6000_builtin_vectorization_cost (enum vect_cost_ > tree vectype, int misalign) > { > unsigned elements; > + tree elem_type; > > switch (type_of_cost) > { > @@ -3504,6 +3505,19 @@ rs6000_builtin_vectorization_cost (enum vect_cost_ > > return 2; > > + case vec_construct: > + elements = TYPE_VECTOR_SUBPARTS (vectype); > + 
elem_type = TREE_TYPE (vectype); > + gcc_assert (elements > 1); > + /* 32-bit vectors loaded into registers are stored as double > + precision, so we need n/2 converts in addition to the usual > + n/2 merges to construct a vector of short floats from them. */ > + if (SCALAR_FLOAT_TYPE_P (elem_type) > + && TYPE_PRECISION (elem_type) == 32) > + return elements + 1; > + else > + return elements / 2 + 1; > + > default: > gcc_unreachable (); > } > > > -- Richard Guenther <rguenther@suse.de> SUSE / SUSE Labs SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746 GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH, RFC] First cut at using vec_construct for strided loads 2012-06-13 9:32 ` Richard Guenther @ 2012-06-13 11:46 ` William J. Schmidt 0 siblings, 0 replies; 7+ messages in thread From: William J. Schmidt @ 2012-06-13 11:46 UTC (permalink / raw) To: Richard Guenther; +Cc: gcc-patches, bergner On Wed, 2012-06-13 at 11:26 +0200, Richard Guenther wrote: > On Tue, 12 Jun 2012, William J. Schmidt wrote: > > > This patch is a follow-up to the discussion generated by > > http://gcc.gnu.org/ml/gcc-patches/2012-06/msg00546.html. I've added > > vec_construct to the cost model for use in vect_model_load_cost, and > > implemented a cost calculation that makes sense to me for PowerPC. I'm > > less certain about the default, i386, and spu implementations. I took a > > guess at i386 from the discussions we had, and used the same calculation > > for the default and for spu. I'm hoping you or others can fill in the > > blanks if I guessed badly. > > > > The i386 cost for vec_construct is different from all the others, which > > are parameterized for each processor description. This should probably > > be parameterized in some way as well, but thought you'd know better than > > I how that should be. Perhaps instead of > > > > elements / 2 + 1 > > > > it should be > > > > (elements / 2) * X + Y > > > > where X and Y are taken from the processor description, and represent > > the cost of a merge and a permute, respectively. Let me know what you > > think. > > Looks good to me with the gcc_asserts removed - TYPE_VECTOR_SUBPARTS > might be 1 for V1TImode for example (heh, not that the vectorizer would > vectorize to that). But I don't see any possible breakage with > elements == 1, do you? No, that was some unnecessary sanity testing I was doing for my own curiosity. I'll pull them out and pop this in today. Thanks for the review! Bill > > Target maintainers can improve on the cost calculation if they wish, > the default looks sensible to me. > > Thanks, > Richard. 
> > > Thanks, > > Bill > > > > > > 2012-06-12 Bill Schmidt <wschmidt@linux.ibm.com> > > > > * targhooks.c (default_builtin_vectorized_conversion): Handle > > vec_construct, using vectype to base cost on subparts. > > * target.h (enum vect_cost_for_stmt): Add vec_construct. > > * tree-vect-stmts.c (vect_model_load_cost): Use vec_construct > > instead of scalar_to-vec. > > * config/spu/spu.c (spu_builtin_vectorization_cost): Handle > > vec_construct in same way as default for now. > > * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise. > > * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost): > > Handle vec_construct, including special case for 32-bit loads. > > > > > > Index: gcc/targhooks.c > > =================================================================== > > --- gcc/targhooks.c (revision 188482) > > +++ gcc/targhooks.c (working copy) > > @@ -499,9 +499,11 @@ default_builtin_vectorized_conversion (unsigned in > > > > int > > default_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, > > - tree vectype ATTRIBUTE_UNUSED, > > + tree vectype, > > int misalign ATTRIBUTE_UNUSED) > > { > > + unsigned elements; > > + > > switch (type_of_cost) > > { > > case scalar_stmt: > > @@ -524,6 +526,11 @@ default_builtin_vectorization_cost (enum vect_cost > > case cond_branch_taken: > > return 3; > > > > + case vec_construct: > > + elements = TYPE_VECTOR_SUBPARTS (vectype); > > + gcc_assert (elements > 1); > > + return elements / 2 + 1; > > + > > default: > > gcc_unreachable (); > > } > > Index: gcc/target.h > > =================================================================== > > --- gcc/target.h (revision 188482) > > +++ gcc/target.h (working copy) > > @@ -146,7 +146,8 @@ enum vect_cost_for_stmt > > cond_branch_not_taken, > > cond_branch_taken, > > vec_perm, > > - vec_promote_demote > > + vec_promote_demote, > > + vec_construct > > }; > > > > /* The target structure. This holds all the backend hooks. 
*/ > > Index: gcc/tree-vect-stmts.c > > =================================================================== > > --- gcc/tree-vect-stmts.c (revision 188482) > > +++ gcc/tree-vect-stmts.c (working copy) > > @@ -1031,11 +1031,13 @@ vect_model_load_cost (stmt_vec_info stmt_info, int > > /* The loads themselves. */ > > if (STMT_VINFO_STRIDE_LOAD_P (stmt_info)) > > { > > - /* N scalar loads plus gathering them into a vector. > > - ??? scalar_to_vec isn't the cost for that. */ > > + /* N scalar loads plus gathering them into a vector. */ > > + tree vectype = STMT_VINFO_VECTYPE (stmt_info); > > inside_cost += (vect_get_stmt_cost (scalar_load) * ncopies > > - * TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))); > > - inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec); > > + * TYPE_VECTOR_SUBPARTS (vectype)); > > + inside_cost += ncopies > > + * targetm.vectorize.builtin_vectorization_cost (vec_construct, > > + vectype, 0); > > } > > else > > vect_get_load_cost (first_dr, ncopies, > > Index: gcc/config/spu/spu.c > > =================================================================== > > --- gcc/config/spu/spu.c (revision 188482) > > +++ gcc/config/spu/spu.c (working copy) > > @@ -6908,9 +6908,11 @@ spu_builtin_mask_for_load (void) > > /* Implement targetm.vectorize.builtin_vectorization_cost. 
*/ > > static int > > spu_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, > > - tree vectype ATTRIBUTE_UNUSED, > > + tree vectype, > > int misalign ATTRIBUTE_UNUSED) > > { > > + unsigned elements; > > + > > switch (type_of_cost) > > { > > case scalar_stmt: > > @@ -6937,6 +6939,11 @@ spu_builtin_vectorization_cost (enum vect_cost_for > > case cond_branch_taken: > > return 6; > > > > + case vec_construct: > > + elements = TYPE_VECTOR_SUBPARTS (vectype); > > + gcc_assert (elements > 1); > > + return elements / 2 + 1; > > + > > default: > > gcc_unreachable (); > > } > > Index: gcc/config/i386/i386.c > > =================================================================== > > --- gcc/config/i386/i386.c (revision 188482) > > +++ gcc/config/i386/i386.c (working copy) > > @@ -36072,9 +36072,11 @@ static const struct attribute_spec ix86_attribute_ > > /* Implement targetm.vectorize.builtin_vectorization_cost. */ > > static int > > ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, > > - tree vectype ATTRIBUTE_UNUSED, > > + tree vectype, > > int misalign ATTRIBUTE_UNUSED) > > { > > + unsigned elements; > > + > > switch (type_of_cost) > > { > > case scalar_stmt: > > @@ -36115,6 +36117,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo > > case vec_promote_demote: > > return ix86_cost->vec_stmt_cost; > > > > + case vec_construct: > > + elements = TYPE_VECTOR_SUBPARTS (vectype); > > + gcc_assert (elements > 1); > > + return elements / 2 + 1; > > + > > default: > > gcc_unreachable (); > > } > > Index: gcc/config/rs6000/rs6000.c > > =================================================================== > > --- gcc/config/rs6000/rs6000.c (revision 188482) > > +++ gcc/config/rs6000/rs6000.c (working copy) > > @@ -3405,6 +3405,7 @@ rs6000_builtin_vectorization_cost (enum vect_cost_ > > tree vectype, int misalign) > > { > > unsigned elements; > > + tree elem_type; > > > > switch (type_of_cost) > > { > > @@ -3504,6 +3505,19 @@ 
rs6000_builtin_vectorization_cost (enum vect_cost_ > > > > return 2; > > > > + case vec_construct: > > + elements = TYPE_VECTOR_SUBPARTS (vectype); > > + elem_type = TREE_TYPE (vectype); > > + gcc_assert (elements > 1); > > + /* 32-bit vectors loaded into registers are stored as double > > + precision, so we need n/2 converts in addition to the usual > > + n/2 merges to construct a vector of short floats from them. */ > > + if (SCALAR_FLOAT_TYPE_P (elem_type) > > + && TYPE_PRECISION (elem_type) == 32) > > + return elements + 1; > > + else > > + return elements / 2 + 1; > > + > > default: > > gcc_unreachable (); > > } > > > > > > > ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH, RFC] First cut at using vec_construct for strided loads 2012-06-13 4:32 [PATCH, RFC] First cut at using vec_construct for strided loads William J. Schmidt 2012-06-13 9:32 ` Richard Guenther @ 2016-06-08 12:30 ` Richard Biener 2016-06-08 13:48 ` Bill Schmidt 1 sibling, 1 reply; 7+ messages in thread From: Richard Biener @ 2016-06-08 12:30 UTC (permalink / raw) To: William J. Schmidt, Uros Bizjak Cc: GCC Patches, Richard Biener, Peter Bergner On Wed, Jun 13, 2012 at 4:18 AM, William J. Schmidt <wschmidt@linux.vnet.ibm.com> wrote: > This patch is a follow-up to the discussion generated by > http://gcc.gnu.org/ml/gcc-patches/2012-06/msg00546.html. I've added > vec_construct to the cost model for use in vect_model_load_cost, and > implemented a cost calculation that makes sense to me for PowerPC. I'm > less certain about the default, i386, and spu implementations. I took a > guess at i386 from the discussions we had, and used the same calculation > for the default and for spu. I'm hoping you or others can fill in the > blanks if I guessed badly. > > The i386 cost for vec_construct is different from all the others, which > are parameterized for each processor description. This should probably > be parameterized in some way as well, but thought you'd know better than > I how that should be. Perhaps instead of > > elements / 2 + 1 > > it should be > > (elements / 2) * X + Y > > where X and Y are taken from the processor description, and represent > the cost of a merge and a permute, respectively. Let me know what you > think. Just trying to understand how you arrived at the above formulas in investigating strangely low cost for v16qi construction of 9. If we pairwise reduce elements with a cost of 1 then we arrive at a cost of elements - 1, that's what you'd get with not accounting an initial move of element zero into a vector and then inserting each other element into that with elements - 1 inserts. 
This also matches up with code-generation on x86_64 for vT foo (T a, T b, ...) { return (vT) {a, b, ... }; } for any vector / element type combination I tried. Thus the patch below. I'll bootstrap / test that on x86_64-linux and I'm leaving other targets to target maintainers. Ok for the i386 parts? Thanks, Richard. 2016-06-08 Richard Biener <rguenther@suse.de> * targhooks.c (default_builtin_vectorization_cost): Adjust vec_construct cost. * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise. Index: gcc/targhooks.c =================================================================== --- gcc/targhooks.c (revision 237196) +++ gcc/targhooks.c (working copy) @@ -589,8 +589,7 @@ default_builtin_vectorization_cost (enum return 3; case vec_construct: - elements = TYPE_VECTOR_SUBPARTS (vectype); - return elements / 2 + 1; + return TYPE_VECTOR_SUBPARTS (vectype) - 1; default: gcc_unreachable (); Index: gcc/config/i386/i386.c =================================================================== --- gcc/config/i386/i386.c (revision 237196) +++ gcc/config/i386/i386.c (working copy) @@ -49503,8 +49520,6 @@ static int ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, tree vectype, int) { - unsigned elements; - switch (type_of_cost) { case scalar_stmt: @@ -49546,8 +49561,7 @@ ix86_builtin_vectorization_cost (enum ve return ix86_cost->vec_stmt_cost; case vec_construct: - elements = TYPE_VECTOR_SUBPARTS (vectype); - return ix86_cost->vec_stmt_cost * (elements / 2 + 1); + return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1); default: gcc_unreachable (); > Thanks, > Bill > > > 2012-06-12 Bill Schmidt <wschmidt@linux.ibm.com> > > * targhooks.c (default_builtin_vectorized_conversion): Handle > vec_construct, using vectype to base cost on subparts. > * target.h (enum vect_cost_for_stmt): Add vec_construct. > * tree-vect-stmts.c (vect_model_load_cost): Use vec_construct > instead of scalar_to-vec. 
> * config/spu/spu.c (spu_builtin_vectorization_cost): Handle > vec_construct in same way as default for now. > * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise. > * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost): > Handle vec_construct, including special case for 32-bit loads. > > > Index: gcc/targhooks.c > =================================================================== > --- gcc/targhooks.c (revision 188482) > +++ gcc/targhooks.c (working copy) > @@ -499,9 +499,11 @@ default_builtin_vectorized_conversion (unsigned in > > int > default_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, > - tree vectype ATTRIBUTE_UNUSED, > + tree vectype, > int misalign ATTRIBUTE_UNUSED) > { > + unsigned elements; > + > switch (type_of_cost) > { > case scalar_stmt: > @@ -524,6 +526,11 @@ default_builtin_vectorization_cost (enum vect_cost > case cond_branch_taken: > return 3; > > + case vec_construct: > + elements = TYPE_VECTOR_SUBPARTS (vectype); > + gcc_assert (elements > 1); > + return elements / 2 + 1; > + > default: > gcc_unreachable (); > } > Index: gcc/target.h > =================================================================== > --- gcc/target.h (revision 188482) > +++ gcc/target.h (working copy) > @@ -146,7 +146,8 @@ enum vect_cost_for_stmt > cond_branch_not_taken, > cond_branch_taken, > vec_perm, > - vec_promote_demote > + vec_promote_demote, > + vec_construct > }; > > /* The target structure. This holds all the backend hooks. */ > Index: gcc/tree-vect-stmts.c > =================================================================== > --- gcc/tree-vect-stmts.c (revision 188482) > +++ gcc/tree-vect-stmts.c (working copy) > @@ -1031,11 +1031,13 @@ vect_model_load_cost (stmt_vec_info stmt_info, int > /* The loads themselves. */ > if (STMT_VINFO_STRIDE_LOAD_P (stmt_info)) > { > - /* N scalar loads plus gathering them into a vector. > - ??? scalar_to_vec isn't the cost for that. 
*/ > + /* N scalar loads plus gathering them into a vector. */ > + tree vectype = STMT_VINFO_VECTYPE (stmt_info); > inside_cost += (vect_get_stmt_cost (scalar_load) * ncopies > - * TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))); > - inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec); > + * TYPE_VECTOR_SUBPARTS (vectype)); > + inside_cost += ncopies > + * targetm.vectorize.builtin_vectorization_cost (vec_construct, > + vectype, 0); > } > else > vect_get_load_cost (first_dr, ncopies, > Index: gcc/config/spu/spu.c > =================================================================== > --- gcc/config/spu/spu.c (revision 188482) > +++ gcc/config/spu/spu.c (working copy) > @@ -6908,9 +6908,11 @@ spu_builtin_mask_for_load (void) > /* Implement targetm.vectorize.builtin_vectorization_cost. */ > static int > spu_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, > - tree vectype ATTRIBUTE_UNUSED, > + tree vectype, > int misalign ATTRIBUTE_UNUSED) > { > + unsigned elements; > + > switch (type_of_cost) > { > case scalar_stmt: > @@ -6937,6 +6939,11 @@ spu_builtin_vectorization_cost (enum vect_cost_for > case cond_branch_taken: > return 6; > > + case vec_construct: > + elements = TYPE_VECTOR_SUBPARTS (vectype); > + gcc_assert (elements > 1); > + return elements / 2 + 1; > + > default: > gcc_unreachable (); > } > Index: gcc/config/i386/i386.c > =================================================================== > --- gcc/config/i386/i386.c (revision 188482) > +++ gcc/config/i386/i386.c (working copy) > @@ -36072,9 +36072,11 @@ static const struct attribute_spec ix86_attribute_ > /* Implement targetm.vectorize.builtin_vectorization_cost. 
*/ > static int > ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, > - tree vectype ATTRIBUTE_UNUSED, > + tree vectype, > int misalign ATTRIBUTE_UNUSED) > { > + unsigned elements; > + > switch (type_of_cost) > { > case scalar_stmt: > @@ -36115,6 +36117,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo > case vec_promote_demote: > return ix86_cost->vec_stmt_cost; > > + case vec_construct: > + elements = TYPE_VECTOR_SUBPARTS (vectype); > + gcc_assert (elements > 1); > + return elements / 2 + 1; > + > default: > gcc_unreachable (); > } > Index: gcc/config/rs6000/rs6000.c > =================================================================== > --- gcc/config/rs6000/rs6000.c (revision 188482) > +++ gcc/config/rs6000/rs6000.c (working copy) > @@ -3405,6 +3405,7 @@ rs6000_builtin_vectorization_cost (enum vect_cost_ > tree vectype, int misalign) > { > unsigned elements; > + tree elem_type; > > switch (type_of_cost) > { > @@ -3504,6 +3505,19 @@ rs6000_builtin_vectorization_cost (enum vect_cost_ > > return 2; > > + case vec_construct: > + elements = TYPE_VECTOR_SUBPARTS (vectype); > + elem_type = TREE_TYPE (vectype); > + gcc_assert (elements > 1); > + /* 32-bit vectors loaded into registers are stored as double > + precision, so we need n/2 converts in addition to the usual > + n/2 merges to construct a vector of short floats from them. */ > + if (SCALAR_FLOAT_TYPE_P (elem_type) > + && TYPE_PRECISION (elem_type) == 32) > + return elements + 1; > + else > + return elements / 2 + 1; > + > default: > gcc_unreachable (); > } > > ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH, RFC] First cut at using vec_construct for strided loads 2016-06-08 12:30 ` Richard Biener @ 2016-06-08 13:48 ` Bill Schmidt 2016-06-08 14:05 ` Richard Biener 0 siblings, 1 reply; 7+ messages in thread From: Bill Schmidt @ 2016-06-08 13:48 UTC (permalink / raw) To: Richard Biener; +Cc: Uros Bizjak, GCC Patches, Richard Biener, Peter Bergner Hi Richard, > On Jun 8, 2016, at 7:29 AM, Richard Biener <richard.guenther@gmail.com> wrote: > > On Wed, Jun 13, 2012 at 4:18 AM, William J. Schmidt > <wschmidt@linux.vnet.ibm.com> wrote: >> This patch is a follow-up to the discussion generated by >> http://gcc.gnu.org/ml/gcc-patches/2012-06/msg00546.html. I've added >> vec_construct to the cost model for use in vect_model_load_cost, and >> implemented a cost calculation that makes sense to me for PowerPC. I'm >> less certain about the default, i386, and spu implementations. I took a >> guess at i386 from the discussions we had, and used the same calculation >> for the default and for spu. I'm hoping you or others can fill in the >> blanks if I guessed badly. >> >> The i386 cost for vec_construct is different from all the others, which >> are parameterized for each processor description. This should probably >> be parameterized in some way as well, but thought you'd know better than >> I how that should be. Perhaps instead of >> >> elements / 2 + 1 >> >> it should be >> >> (elements / 2) * X + Y >> >> where X and Y are taken from the processor description, and represent >> the cost of a merge and a permute, respectively. Let me know what you >> think. > > Just trying to understand how you arrived at the above formulas in investigating > strangely low cost for v16qi construction of 9. If we pairwise reduce elements > with a cost of 1 then we arrive at a cost of elements - 1, that's what you'd > get with not accounting an initial move of element zero into a vector and then > inserting each other element into that with elements - 1 inserts. 
What I wrote there only makes partial sense for certain types on Power, so far as I can tell, and even then it doesn’t generalize properly. When the scalar registers are contained in the vector registers (as happens for floating-point on Power), then you can do some merges and other forms of permutes to combine them faster than doing specific inserts. But that isn’t a general solution even on Power; for the integer modes we still do inserts. So what you have makes sense to me, and what’s currently in place for Power needs work also, so far as I can tell. I’ll take a note to revisit this. — Bill > > This also matches up with code-generation on x86_64 for > > vT foo (T a, T b, ...) > { > return (vT) {a, b, ... }; > } > > for any vector / element type combination I tried. Thus the patch below. > > I'll bootstrap / test that on x86_64-linux and I'm leaving other > targets to target > maintainers. > > Ok for the i386 parts? > > Thanks, > Richard. > > 2016-06-08 Richard Biener <rguenther@suse.de> > > * targhooks.c (default_builtin_vectorization_cost): Adjust > vec_construct cost. > * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise. 
> > Index: gcc/targhooks.c > =================================================================== > --- gcc/targhooks.c (revision 237196) > +++ gcc/targhooks.c (working copy) > @@ -589,8 +589,7 @@ default_builtin_vectorization_cost (enum > return 3; > > case vec_construct: > - elements = TYPE_VECTOR_SUBPARTS (vectype); > - return elements / 2 + 1; > + return TYPE_VECTOR_SUBPARTS (vectype) - 1; > > default: > gcc_unreachable (); > Index: gcc/config/i386/i386.c > =================================================================== > --- gcc/config/i386/i386.c (revision 237196) > +++ gcc/config/i386/i386.c (working copy) > @@ -49503,8 +49520,6 @@ static int > ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, > tree vectype, int) > { > - unsigned elements; > - > switch (type_of_cost) > { > case scalar_stmt: > @@ -49546,8 +49561,7 @@ ix86_builtin_vectorization_cost (enum ve > return ix86_cost->vec_stmt_cost; > > case vec_construct: > - elements = TYPE_VECTOR_SUBPARTS (vectype); > - return ix86_cost->vec_stmt_cost * (elements / 2 + 1); > + return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1); > > default: > gcc_unreachable (); > > >> Thanks, >> Bill >> >> >> 2012-06-12 Bill Schmidt <wschmidt@linux.ibm.com> >> >> * targhooks.c (default_builtin_vectorized_conversion): Handle >> vec_construct, using vectype to base cost on subparts. >> * target.h (enum vect_cost_for_stmt): Add vec_construct. >> * tree-vect-stmts.c (vect_model_load_cost): Use vec_construct >> instead of scalar_to-vec. >> * config/spu/spu.c (spu_builtin_vectorization_cost): Handle >> vec_construct in same way as default for now. >> * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise. >> * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost): >> Handle vec_construct, including special case for 32-bit loads. 
>> >> >> Index: gcc/targhooks.c >> =================================================================== >> --- gcc/targhooks.c (revision 188482) >> +++ gcc/targhooks.c (working copy) >> @@ -499,9 +499,11 @@ default_builtin_vectorized_conversion (unsigned in >> >> int >> default_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, >> - tree vectype ATTRIBUTE_UNUSED, >> + tree vectype, >> int misalign ATTRIBUTE_UNUSED) >> { >> + unsigned elements; >> + >> switch (type_of_cost) >> { >> case scalar_stmt: >> @@ -524,6 +526,11 @@ default_builtin_vectorization_cost (enum vect_cost >> case cond_branch_taken: >> return 3; >> >> + case vec_construct: >> + elements = TYPE_VECTOR_SUBPARTS (vectype); >> + gcc_assert (elements > 1); >> + return elements / 2 + 1; >> + >> default: >> gcc_unreachable (); >> } >> Index: gcc/target.h >> =================================================================== >> --- gcc/target.h (revision 188482) >> +++ gcc/target.h (working copy) >> @@ -146,7 +146,8 @@ enum vect_cost_for_stmt >> cond_branch_not_taken, >> cond_branch_taken, >> vec_perm, >> - vec_promote_demote >> + vec_promote_demote, >> + vec_construct >> }; >> >> /* The target structure. This holds all the backend hooks. */ >> Index: gcc/tree-vect-stmts.c >> =================================================================== >> --- gcc/tree-vect-stmts.c (revision 188482) >> +++ gcc/tree-vect-stmts.c (working copy) >> @@ -1031,11 +1031,13 @@ vect_model_load_cost (stmt_vec_info stmt_info, int >> /* The loads themselves. */ >> if (STMT_VINFO_STRIDE_LOAD_P (stmt_info)) >> { >> - /* N scalar loads plus gathering them into a vector. >> - ??? scalar_to_vec isn't the cost for that. */ >> + /* N scalar loads plus gathering them into a vector. 
*/ >> + tree vectype = STMT_VINFO_VECTYPE (stmt_info); >> inside_cost += (vect_get_stmt_cost (scalar_load) * ncopies >> - * TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))); >> - inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec); >> + * TYPE_VECTOR_SUBPARTS (vectype)); >> + inside_cost += ncopies >> + * targetm.vectorize.builtin_vectorization_cost (vec_construct, >> + vectype, 0); >> } >> else >> vect_get_load_cost (first_dr, ncopies, >> Index: gcc/config/spu/spu.c >> =================================================================== >> --- gcc/config/spu/spu.c (revision 188482) >> +++ gcc/config/spu/spu.c (working copy) >> @@ -6908,9 +6908,11 @@ spu_builtin_mask_for_load (void) >> /* Implement targetm.vectorize.builtin_vectorization_cost. */ >> static int >> spu_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, >> - tree vectype ATTRIBUTE_UNUSED, >> + tree vectype, >> int misalign ATTRIBUTE_UNUSED) >> { >> + unsigned elements; >> + >> switch (type_of_cost) >> { >> case scalar_stmt: >> @@ -6937,6 +6939,11 @@ spu_builtin_vectorization_cost (enum vect_cost_for >> case cond_branch_taken: >> return 6; >> >> + case vec_construct: >> + elements = TYPE_VECTOR_SUBPARTS (vectype); >> + gcc_assert (elements > 1); >> + return elements / 2 + 1; >> + >> default: >> gcc_unreachable (); >> } >> Index: gcc/config/i386/i386.c >> =================================================================== >> --- gcc/config/i386/i386.c (revision 188482) >> +++ gcc/config/i386/i386.c (working copy) >> @@ -36072,9 +36072,11 @@ static const struct attribute_spec ix86_attribute_ >> /* Implement targetm.vectorize.builtin_vectorization_cost. 
*/ >> static int >> ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, >> - tree vectype ATTRIBUTE_UNUSED, >> + tree vectype, >> int misalign ATTRIBUTE_UNUSED) >> { >> + unsigned elements; >> + >> switch (type_of_cost) >> { >> case scalar_stmt: >> @@ -36115,6 +36117,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo >> case vec_promote_demote: >> return ix86_cost->vec_stmt_cost; >> >> + case vec_construct: >> + elements = TYPE_VECTOR_SUBPARTS (vectype); >> + gcc_assert (elements > 1); >> + return elements / 2 + 1; >> + >> default: >> gcc_unreachable (); >> } >> Index: gcc/config/rs6000/rs6000.c >> =================================================================== >> --- gcc/config/rs6000/rs6000.c (revision 188482) >> +++ gcc/config/rs6000/rs6000.c (working copy) >> @@ -3405,6 +3405,7 @@ rs6000_builtin_vectorization_cost (enum vect_cost_ >> tree vectype, int misalign) >> { >> unsigned elements; >> + tree elem_type; >> >> switch (type_of_cost) >> { >> @@ -3504,6 +3505,19 @@ rs6000_builtin_vectorization_cost (enum vect_cost_ >> >> return 2; >> >> + case vec_construct: >> + elements = TYPE_VECTOR_SUBPARTS (vectype); >> + elem_type = TREE_TYPE (vectype); >> + gcc_assert (elements > 1); >> + /* 32-bit vectors loaded into registers are stored as double >> + precision, so we need n/2 converts in addition to the usual >> + n/2 merges to construct a vector of short floats from them. */ >> + if (SCALAR_FLOAT_TYPE_P (elem_type) >> + && TYPE_PRECISION (elem_type) == 32) >> + return elements + 1; >> + else >> + return elements / 2 + 1; >> + >> default: >> gcc_unreachable (); >> } >> >> > ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH, RFC] First cut at using vec_construct for strided loads 2016-06-08 13:48 ` Bill Schmidt @ 2016-06-08 14:05 ` Richard Biener 2016-06-09 15:27 ` Bill Schmidt 0 siblings, 1 reply; 7+ messages in thread From: Richard Biener @ 2016-06-08 14:05 UTC (permalink / raw) To: Bill Schmidt; +Cc: Uros Bizjak, GCC Patches, Peter Bergner [-- Attachment #1: Type: TEXT/PLAIN, Size: 11864 bytes --] On Wed, 8 Jun 2016, Bill Schmidt wrote: > Hi Richard, > > > On Jun 8, 2016, at 7:29 AM, Richard Biener <richard.guenther@gmail.com> wrote: > > > > On Wed, Jun 13, 2012 at 4:18 AM, William J. Schmidt > > <wschmidt@linux.vnet.ibm.com> wrote: > >> This patch is a follow-up to the discussion generated by > >> http://gcc.gnu.org/ml/gcc-patches/2012-06/msg00546.html. I've added > >> vec_construct to the cost model for use in vect_model_load_cost, and > >> implemented a cost calculation that makes sense to me for PowerPC. I'm > >> less certain about the default, i386, and spu implementations. I took a > >> guess at i386 from the discussions we had, and used the same calculation > >> for the default and for spu. I'm hoping you or others can fill in the > >> blanks if I guessed badly. > >> > >> The i386 cost for vec_construct is different from all the others, which > >> are parameterized for each processor description. This should probably > >> be parameterized in some way as well, but thought you'd know better than > >> I how that should be. Perhaps instead of > >> > >> elements / 2 + 1 > >> > >> it should be > >> > >> (elements / 2) * X + Y > >> > >> where X and Y are taken from the processor description, and represent > >> the cost of a merge and a permute, respectively. Let me know what you > >> think. > > > > Just trying to understand how you arrived at the above formulas in investigating > > strangely low cost for v16qi construction of 9. 
If we pairwise reduce elements > > with a cost of 1 then we arrive at a cost of elements - 1, that's what you'd > > get with not accounting an initial move of element zero into a vector and then > > inserting each other element into that with elements - 1 inserts. > > What I wrote there only makes partial sense for certain types on Power, so far as > I can tell, and even then it doesn’t generalize properly. When the scalar registers > are contained in the vector registers (as happens for floating-point on Power), then > you can do some merges and other forms of permutes to combine them faster > than doing specific inserts. But that isn’t a general solution even on Power; for the > integer modes we still do inserts. You mean Power has instructions to combine more than two vector registers into one? Otherwise you still need n / 2 plus n / 4 plus n / 8 ... "permutes" which boils down to n - 1. > So what you have makes sense to me, and what’s currently in place for Power needs > work also, so far as I can tell. I’ll take a note to revisit this. Thanks. Richard. > — Bill > > > > > This also matches up with code-generation on x86_64 for > > > > vT foo (T a, T b, ...) > > { > > return (vT) {a, b, ... }; > > } > > > > for any vector / element type combination I tried. Thus the patch below. > > > > I'll bootstrap / test that on x86_64-linux and I'm leaving other > > targets to target > > maintainers. > > > > Ok for the i386 parts? > > > > Thanks, > > Richard. > > > > 2016-06-08 Richard Biener <rguenther@suse.de> > > > > * targhooks.c (default_builtin_vectorization_cost): Adjust > > vec_construct cost. > > * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise. 
> > > > Index: gcc/targhooks.c > > =================================================================== > > --- gcc/targhooks.c (revision 237196) > > +++ gcc/targhooks.c (working copy) > > @@ -589,8 +589,7 @@ default_builtin_vectorization_cost (enum > > return 3; > > > > case vec_construct: > > - elements = TYPE_VECTOR_SUBPARTS (vectype); > > - return elements / 2 + 1; > > + return TYPE_VECTOR_SUBPARTS (vectype) - 1; > > > > default: > > gcc_unreachable (); > > Index: gcc/config/i386/i386.c > > =================================================================== > > --- gcc/config/i386/i386.c (revision 237196) > > +++ gcc/config/i386/i386.c (working copy) > > @@ -49503,8 +49520,6 @@ static int > > ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, > > tree vectype, int) > > { > > - unsigned elements; > > - > > switch (type_of_cost) > > { > > case scalar_stmt: > > @@ -49546,8 +49561,7 @@ ix86_builtin_vectorization_cost (enum ve > > return ix86_cost->vec_stmt_cost; > > > > case vec_construct: > > - elements = TYPE_VECTOR_SUBPARTS (vectype); > > - return ix86_cost->vec_stmt_cost * (elements / 2 + 1); > > + return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1); > > > > default: > > gcc_unreachable (); > > > > > >> Thanks, > >> Bill > >> > >> > >> 2012-06-12 Bill Schmidt <wschmidt@linux.ibm.com> > >> > >> * targhooks.c (default_builtin_vectorized_conversion): Handle > >> vec_construct, using vectype to base cost on subparts. > >> * target.h (enum vect_cost_for_stmt): Add vec_construct. > >> * tree-vect-stmts.c (vect_model_load_cost): Use vec_construct > >> instead of scalar_to-vec. > >> * config/spu/spu.c (spu_builtin_vectorization_cost): Handle > >> vec_construct in same way as default for now. > >> * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise. > >> * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost): > >> Handle vec_construct, including special case for 32-bit loads. 
> >> > >> > >> Index: gcc/targhooks.c > >> =================================================================== > >> --- gcc/targhooks.c (revision 188482) > >> +++ gcc/targhooks.c (working copy) > >> @@ -499,9 +499,11 @@ default_builtin_vectorized_conversion (unsigned in > >> > >> int > >> default_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, > >> - tree vectype ATTRIBUTE_UNUSED, > >> + tree vectype, > >> int misalign ATTRIBUTE_UNUSED) > >> { > >> + unsigned elements; > >> + > >> switch (type_of_cost) > >> { > >> case scalar_stmt: > >> @@ -524,6 +526,11 @@ default_builtin_vectorization_cost (enum vect_cost > >> case cond_branch_taken: > >> return 3; > >> > >> + case vec_construct: > >> + elements = TYPE_VECTOR_SUBPARTS (vectype); > >> + gcc_assert (elements > 1); > >> + return elements / 2 + 1; > >> + > >> default: > >> gcc_unreachable (); > >> } > >> Index: gcc/target.h > >> =================================================================== > >> --- gcc/target.h (revision 188482) > >> +++ gcc/target.h (working copy) > >> @@ -146,7 +146,8 @@ enum vect_cost_for_stmt > >> cond_branch_not_taken, > >> cond_branch_taken, > >> vec_perm, > >> - vec_promote_demote > >> + vec_promote_demote, > >> + vec_construct > >> }; > >> > >> /* The target structure. This holds all the backend hooks. */ > >> Index: gcc/tree-vect-stmts.c > >> =================================================================== > >> --- gcc/tree-vect-stmts.c (revision 188482) > >> +++ gcc/tree-vect-stmts.c (working copy) > >> @@ -1031,11 +1031,13 @@ vect_model_load_cost (stmt_vec_info stmt_info, int > >> /* The loads themselves. */ > >> if (STMT_VINFO_STRIDE_LOAD_P (stmt_info)) > >> { > >> - /* N scalar loads plus gathering them into a vector. > >> - ??? scalar_to_vec isn't the cost for that. */ > >> + /* N scalar loads plus gathering them into a vector. 
*/ > >> + tree vectype = STMT_VINFO_VECTYPE (stmt_info); > >> inside_cost += (vect_get_stmt_cost (scalar_load) * ncopies > >> - * TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))); > >> - inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec); > >> + * TYPE_VECTOR_SUBPARTS (vectype)); > >> + inside_cost += ncopies > >> + * targetm.vectorize.builtin_vectorization_cost (vec_construct, > >> + vectype, 0); > >> } > >> else > >> vect_get_load_cost (first_dr, ncopies, > >> Index: gcc/config/spu/spu.c > >> =================================================================== > >> --- gcc/config/spu/spu.c (revision 188482) > >> +++ gcc/config/spu/spu.c (working copy) > >> @@ -6908,9 +6908,11 @@ spu_builtin_mask_for_load (void) > >> /* Implement targetm.vectorize.builtin_vectorization_cost. */ > >> static int > >> spu_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, > >> - tree vectype ATTRIBUTE_UNUSED, > >> + tree vectype, > >> int misalign ATTRIBUTE_UNUSED) > >> { > >> + unsigned elements; > >> + > >> switch (type_of_cost) > >> { > >> case scalar_stmt: > >> @@ -6937,6 +6939,11 @@ spu_builtin_vectorization_cost (enum vect_cost_for > >> case cond_branch_taken: > >> return 6; > >> > >> + case vec_construct: > >> + elements = TYPE_VECTOR_SUBPARTS (vectype); > >> + gcc_assert (elements > 1); > >> + return elements / 2 + 1; > >> + > >> default: > >> gcc_unreachable (); > >> } > >> Index: gcc/config/i386/i386.c > >> =================================================================== > >> --- gcc/config/i386/i386.c (revision 188482) > >> +++ gcc/config/i386/i386.c (working copy) > >> @@ -36072,9 +36072,11 @@ static const struct attribute_spec ix86_attribute_ > >> /* Implement targetm.vectorize.builtin_vectorization_cost. 
*/ > >> static int > >> ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, > >> - tree vectype ATTRIBUTE_UNUSED, > >> + tree vectype, > >> int misalign ATTRIBUTE_UNUSED) > >> { > >> + unsigned elements; > >> + > >> switch (type_of_cost) > >> { > >> case scalar_stmt: > >> @@ -36115,6 +36117,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo > >> case vec_promote_demote: > >> return ix86_cost->vec_stmt_cost; > >> > >> + case vec_construct: > >> + elements = TYPE_VECTOR_SUBPARTS (vectype); > >> + gcc_assert (elements > 1); > >> + return elements / 2 + 1; > >> + > >> default: > >> gcc_unreachable (); > >> } > >> Index: gcc/config/rs6000/rs6000.c > >> =================================================================== > >> --- gcc/config/rs6000/rs6000.c (revision 188482) > >> +++ gcc/config/rs6000/rs6000.c (working copy) > >> @@ -3405,6 +3405,7 @@ rs6000_builtin_vectorization_cost (enum vect_cost_ > >> tree vectype, int misalign) > >> { > >> unsigned elements; > >> + tree elem_type; > >> > >> switch (type_of_cost) > >> { > >> @@ -3504,6 +3505,19 @@ rs6000_builtin_vectorization_cost (enum vect_cost_ > >> > >> return 2; > >> > >> + case vec_construct: > >> + elements = TYPE_VECTOR_SUBPARTS (vectype); > >> + elem_type = TREE_TYPE (vectype); > >> + gcc_assert (elements > 1); > >> + /* 32-bit vectors loaded into registers are stored as double > >> + precision, so we need n/2 converts in addition to the usual > >> + n/2 merges to construct a vector of short floats from them. */ > >> + if (SCALAR_FLOAT_TYPE_P (elem_type) > >> + && TYPE_PRECISION (elem_type) == 32) > >> + return elements + 1; > >> + else > >> + return elements / 2 + 1; > >> + > >> default: > >> gcc_unreachable (); > >> } > >> > >> > > > > -- Richard Biener <rguenther@suse.de> SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg) ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH, RFC] First cut at using vec_construct for strided loads 2016-06-08 14:05 ` Richard Biener @ 2016-06-09 15:27 ` Bill Schmidt 0 siblings, 0 replies; 7+ messages in thread From: Bill Schmidt @ 2016-06-09 15:27 UTC (permalink / raw) To: Richard Biener; +Cc: Uros Bizjak, GCC Patches, Peter Bergner > On Jun 8, 2016, at 9:05 AM, Richard Biener <rguenther@suse.de> wrote: > > On Wed, 8 Jun 2016, Bill Schmidt wrote: > >> Hi Richard, >> >>> On Jun 8, 2016, at 7:29 AM, Richard Biener <richard.guenther@gmail.com> wrote: >>> >>> On Wed, Jun 13, 2012 at 4:18 AM, William J. Schmidt >>> <wschmidt@linux.vnet.ibm.com> wrote: >>>> This patch is a follow-up to the discussion generated by >>>> http://gcc.gnu.org/ml/gcc-patches/2012-06/msg00546.html. I've added >>>> vec_construct to the cost model for use in vect_model_load_cost, and >>>> implemented a cost calculation that makes sense to me for PowerPC. I'm >>>> less certain about the default, i386, and spu implementations. I took a >>>> guess at i386 from the discussions we had, and used the same calculation >>>> for the default and for spu. I'm hoping you or others can fill in the >>>> blanks if I guessed badly. >>>> >>>> The i386 cost for vec_construct is different from all the others, which >>>> are parameterized for each processor description. This should probably >>>> be parameterized in some way as well, but thought you'd know better than >>>> I how that should be. Perhaps instead of >>>> >>>> elements / 2 + 1 >>>> >>>> it should be >>>> >>>> (elements / 2) * X + Y >>>> >>>> where X and Y are taken from the processor description, and represent >>>> the cost of a merge and a permute, respectively. Let me know what you >>>> think. >>> >>> Just trying to understand how you arrived at the above formulas in investigating >>> strangely low cost for v16qi construction of 9. 
If we pairwise reduce elements >>> with a cost of 1 then we arrive at a cost of elements - 1, that's what you'd >>> get with not accounting an initial move of element zero into a vector and then >>> inserting each other element into that with elements - 1 inserts. >> >> What I wrote there only makes partial sense for certain types on Power, so far as >> I can tell, and even then it doesn’t generalize properly. When the scalar registers >> are contained in the vector registers (as happens for floating-point on Power), then >> you can do some merges and other forms of permutes to combine them faster >> than doing specific inserts. But that isn’t a general solution even on Power; for the >> integer modes we still do inserts. > > You mean Power has instructions to combine more than two vector registers > into one? Otherwise you still need n / 2 plus n / 4 plus n / 8 ... > "permutes" which boils down to n - 1. Right, we do not. This is what I meant about it not generalizing properly -- it was an ill-thought-out, off-the-cuff remark, so far as I can tell. I agree with the n - 1, and I need to go in and make a similar change for Power. There is a special case with 32-bit floating-point that probably also needs adjustment. Bill > >> So what you have makes sense to me, and what’s currently in place for Power needs >> work also, so far as I can tell. I’ll take a note to revisit this. > > Thanks. > Richard. > >> — Bill >> >>> >>> This also matches up with code-generation on x86_64 for >>> >>> vT foo (T a, T b, ...) >>> { >>> return (vT) {a, b, ... }; >>> } >>> >>> for any vector / element type combination I tried. Thus the patch below. >>> >>> I'll bootstrap / test that on x86_64-linux and I'm leaving other >>> targets to target >>> maintainers. >>> >>> Ok for the i386 parts? >>> >>> Thanks, >>> Richard. >>> >>> 2016-06-08 Richard Biener <rguenther@suse.de> >>> >>> * targhooks.c (default_builtin_vectorization_cost): Adjust >>> vec_construct cost. 
>>> * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise. >>> >>> Index: gcc/targhooks.c >>> =================================================================== >>> --- gcc/targhooks.c (revision 237196) >>> +++ gcc/targhooks.c (working copy) >>> @@ -589,8 +589,7 @@ default_builtin_vectorization_cost (enum >>> return 3; >>> >>> case vec_construct: >>> - elements = TYPE_VECTOR_SUBPARTS (vectype); >>> - return elements / 2 + 1; >>> + return TYPE_VECTOR_SUBPARTS (vectype) - 1; >>> >>> default: >>> gcc_unreachable (); >>> Index: gcc/config/i386/i386.c >>> =================================================================== >>> --- gcc/config/i386/i386.c (revision 237196) >>> +++ gcc/config/i386/i386.c (working copy) >>> @@ -49503,8 +49520,6 @@ static int >>> ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, >>> tree vectype, int) >>> { >>> - unsigned elements; >>> - >>> switch (type_of_cost) >>> { >>> case scalar_stmt: >>> @@ -49546,8 +49561,7 @@ ix86_builtin_vectorization_cost (enum ve >>> return ix86_cost->vec_stmt_cost; >>> >>> case vec_construct: >>> - elements = TYPE_VECTOR_SUBPARTS (vectype); >>> - return ix86_cost->vec_stmt_cost * (elements / 2 + 1); >>> + return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1); >>> >>> default: >>> gcc_unreachable (); >>> >>> >>>> Thanks, >>>> Bill >>>> >>>> >>>> 2012-06-12 Bill Schmidt <wschmidt@linux.ibm.com> >>>> >>>> * targhooks.c (default_builtin_vectorized_conversion): Handle >>>> vec_construct, using vectype to base cost on subparts. >>>> * target.h (enum vect_cost_for_stmt): Add vec_construct. >>>> * tree-vect-stmts.c (vect_model_load_cost): Use vec_construct >>>> instead of scalar_to-vec. >>>> * config/spu/spu.c (spu_builtin_vectorization_cost): Handle >>>> vec_construct in same way as default for now. >>>> * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise. 
>>>> * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost): >>>> Handle vec_construct, including special case for 32-bit loads. >>>> >>>> >>>> Index: gcc/targhooks.c >>>> =================================================================== >>>> --- gcc/targhooks.c (revision 188482) >>>> +++ gcc/targhooks.c (working copy) >>>> @@ -499,9 +499,11 @@ default_builtin_vectorized_conversion (unsigned in >>>> >>>> int >>>> default_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, >>>> - tree vectype ATTRIBUTE_UNUSED, >>>> + tree vectype, >>>> int misalign ATTRIBUTE_UNUSED) >>>> { >>>> + unsigned elements; >>>> + >>>> switch (type_of_cost) >>>> { >>>> case scalar_stmt: >>>> @@ -524,6 +526,11 @@ default_builtin_vectorization_cost (enum vect_cost >>>> case cond_branch_taken: >>>> return 3; >>>> >>>> + case vec_construct: >>>> + elements = TYPE_VECTOR_SUBPARTS (vectype); >>>> + gcc_assert (elements > 1); >>>> + return elements / 2 + 1; >>>> + >>>> default: >>>> gcc_unreachable (); >>>> } >>>> Index: gcc/target.h >>>> =================================================================== >>>> --- gcc/target.h (revision 188482) >>>> +++ gcc/target.h (working copy) >>>> @@ -146,7 +146,8 @@ enum vect_cost_for_stmt >>>> cond_branch_not_taken, >>>> cond_branch_taken, >>>> vec_perm, >>>> - vec_promote_demote >>>> + vec_promote_demote, >>>> + vec_construct >>>> }; >>>> >>>> /* The target structure. This holds all the backend hooks. */ >>>> Index: gcc/tree-vect-stmts.c >>>> =================================================================== >>>> --- gcc/tree-vect-stmts.c (revision 188482) >>>> +++ gcc/tree-vect-stmts.c (working copy) >>>> @@ -1031,11 +1031,13 @@ vect_model_load_cost (stmt_vec_info stmt_info, int >>>> /* The loads themselves. */ >>>> if (STMT_VINFO_STRIDE_LOAD_P (stmt_info)) >>>> { >>>> - /* N scalar loads plus gathering them into a vector. >>>> - ??? scalar_to_vec isn't the cost for that. */ >>>> + /* N scalar loads plus gathering them into a vector. 
*/ >>>> + tree vectype = STMT_VINFO_VECTYPE (stmt_info); >>>> inside_cost += (vect_get_stmt_cost (scalar_load) * ncopies >>>> - * TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))); >>>> - inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec); >>>> + * TYPE_VECTOR_SUBPARTS (vectype)); >>>> + inside_cost += ncopies >>>> + * targetm.vectorize.builtin_vectorization_cost (vec_construct, >>>> + vectype, 0); >>>> } >>>> else >>>> vect_get_load_cost (first_dr, ncopies, >>>> Index: gcc/config/spu/spu.c >>>> =================================================================== >>>> --- gcc/config/spu/spu.c (revision 188482) >>>> +++ gcc/config/spu/spu.c (working copy) >>>> @@ -6908,9 +6908,11 @@ spu_builtin_mask_for_load (void) >>>> /* Implement targetm.vectorize.builtin_vectorization_cost. */ >>>> static int >>>> spu_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, >>>> - tree vectype ATTRIBUTE_UNUSED, >>>> + tree vectype, >>>> int misalign ATTRIBUTE_UNUSED) >>>> { >>>> + unsigned elements; >>>> + >>>> switch (type_of_cost) >>>> { >>>> case scalar_stmt: >>>> @@ -6937,6 +6939,11 @@ spu_builtin_vectorization_cost (enum vect_cost_for >>>> case cond_branch_taken: >>>> return 6; >>>> >>>> + case vec_construct: >>>> + elements = TYPE_VECTOR_SUBPARTS (vectype); >>>> + gcc_assert (elements > 1); >>>> + return elements / 2 + 1; >>>> + >>>> default: >>>> gcc_unreachable (); >>>> } >>>> Index: gcc/config/i386/i386.c >>>> =================================================================== >>>> --- gcc/config/i386/i386.c (revision 188482) >>>> +++ gcc/config/i386/i386.c (working copy) >>>> @@ -36072,9 +36072,11 @@ static const struct attribute_spec ix86_attribute_ >>>> /* Implement targetm.vectorize.builtin_vectorization_cost. 
*/ >>>> static int >>>> ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, >>>> - tree vectype ATTRIBUTE_UNUSED, >>>> + tree vectype, >>>> int misalign ATTRIBUTE_UNUSED) >>>> { >>>> + unsigned elements; >>>> + >>>> switch (type_of_cost) >>>> { >>>> case scalar_stmt: >>>> @@ -36115,6 +36117,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo >>>> case vec_promote_demote: >>>> return ix86_cost->vec_stmt_cost; >>>> >>>> + case vec_construct: >>>> + elements = TYPE_VECTOR_SUBPARTS (vectype); >>>> + gcc_assert (elements > 1); >>>> + return elements / 2 + 1; >>>> + >>>> default: >>>> gcc_unreachable (); >>>> } >>>> Index: gcc/config/rs6000/rs6000.c >>>> =================================================================== >>>> --- gcc/config/rs6000/rs6000.c (revision 188482) >>>> +++ gcc/config/rs6000/rs6000.c (working copy) >>>> @@ -3405,6 +3405,7 @@ rs6000_builtin_vectorization_cost (enum vect_cost_ >>>> tree vectype, int misalign) >>>> { >>>> unsigned elements; >>>> + tree elem_type; >>>> >>>> switch (type_of_cost) >>>> { >>>> @@ -3504,6 +3505,19 @@ rs6000_builtin_vectorization_cost (enum vect_cost_ >>>> >>>> return 2; >>>> >>>> + case vec_construct: >>>> + elements = TYPE_VECTOR_SUBPARTS (vectype); >>>> + elem_type = TREE_TYPE (vectype); >>>> + gcc_assert (elements > 1); >>>> + /* 32-bit vectors loaded into registers are stored as double >>>> + precision, so we need n/2 converts in addition to the usual >>>> + n/2 merges to construct a vector of short floats from them. */ >>>> + if (SCALAR_FLOAT_TYPE_P (elem_type) >>>> + && TYPE_PRECISION (elem_type) == 32) >>>> + return elements + 1; >>>> + else >>>> + return elements / 2 + 1; >>>> + >>>> default: >>>> gcc_unreachable (); >>>> } >>>> >>>> >>> >> >> > > -- > Richard Biener <rguenther@suse.de> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg) ^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2016-06-09 15:27 UTC | newest] Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2012-06-13 4:32 [PATCH, RFC] First cut at using vec_construct for strided loads William J. Schmidt 2012-06-13 9:32 ` Richard Guenther 2012-06-13 11:46 ` William J. Schmidt 2016-06-08 12:30 ` Richard Biener 2016-06-08 13:48 ` Bill Schmidt 2016-06-08 14:05 ` Richard Biener 2016-06-09 15:27 ` Bill Schmidt
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).