* [PATCH, RFC] First cut at using vec_construct for strided loads
@ 2012-06-13 4:32 William J. Schmidt
2012-06-13 9:32 ` Richard Guenther
2016-06-08 12:30 ` Richard Biener
0 siblings, 2 replies; 7+ messages in thread
From: William J. Schmidt @ 2012-06-13 4:32 UTC (permalink / raw)
To: gcc-patches; +Cc: rguenther, bergner
This patch is a follow-up to the discussion generated by
http://gcc.gnu.org/ml/gcc-patches/2012-06/msg00546.html. I've added
vec_construct to the cost model for use in vect_model_load_cost, and
implemented a cost calculation that makes sense to me for PowerPC. I'm
less certain about the default, i386, and spu implementations. I took a
guess at i386 from the discussions we had, and used the same calculation
for the default and for spu. I'm hoping you or others can fill in the
blanks if I guessed badly.
The i386 cost for vec_construct is different from all the others, which
are parameterized for each processor description. This should probably
be parameterized in some way as well, but I thought you'd know better than
I how that should be. Perhaps instead of
elements / 2 + 1
it should be
(elements / 2) * X + Y
where X and Y are taken from the processor description, and represent
the cost of a merge and a permute, respectively. Let me know what you
think.
Thanks,
Bill
2012-06-12 Bill Schmidt <wschmidt@linux.ibm.com>
* targhooks.c (default_builtin_vectorization_cost): Handle
vec_construct, using vectype to base cost on subparts.
* target.h (enum vect_cost_for_stmt): Add vec_construct.
* tree-vect-stmts.c (vect_model_load_cost): Use vec_construct
instead of scalar_to_vec.
* config/spu/spu.c (spu_builtin_vectorization_cost): Handle
vec_construct in same way as default for now.
* config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise.
* config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost):
Handle vec_construct, including special case for 32-bit loads.
Index: gcc/targhooks.c
===================================================================
--- gcc/targhooks.c (revision 188482)
+++ gcc/targhooks.c (working copy)
@@ -499,9 +499,11 @@ default_builtin_vectorized_conversion (unsigned in
int
default_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
- tree vectype ATTRIBUTE_UNUSED,
+ tree vectype,
int misalign ATTRIBUTE_UNUSED)
{
+ unsigned elements;
+
switch (type_of_cost)
{
case scalar_stmt:
@@ -524,6 +526,11 @@ default_builtin_vectorization_cost (enum vect_cost
case cond_branch_taken:
return 3;
+ case vec_construct:
+ elements = TYPE_VECTOR_SUBPARTS (vectype);
+ gcc_assert (elements > 1);
+ return elements / 2 + 1;
+
default:
gcc_unreachable ();
}
Index: gcc/target.h
===================================================================
--- gcc/target.h (revision 188482)
+++ gcc/target.h (working copy)
@@ -146,7 +146,8 @@ enum vect_cost_for_stmt
cond_branch_not_taken,
cond_branch_taken,
vec_perm,
- vec_promote_demote
+ vec_promote_demote,
+ vec_construct
};
/* The target structure. This holds all the backend hooks. */
Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c (revision 188482)
+++ gcc/tree-vect-stmts.c (working copy)
@@ -1031,11 +1031,13 @@ vect_model_load_cost (stmt_vec_info stmt_info, int
/* The loads themselves. */
if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
{
- /* N scalar loads plus gathering them into a vector.
- ??? scalar_to_vec isn't the cost for that. */
+ /* N scalar loads plus gathering them into a vector. */
+ tree vectype = STMT_VINFO_VECTYPE (stmt_info);
inside_cost += (vect_get_stmt_cost (scalar_load) * ncopies
- * TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)));
- inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec);
+ * TYPE_VECTOR_SUBPARTS (vectype));
+ inside_cost += ncopies
+ * targetm.vectorize.builtin_vectorization_cost (vec_construct,
+ vectype, 0);
}
else
vect_get_load_cost (first_dr, ncopies,
Index: gcc/config/spu/spu.c
===================================================================
--- gcc/config/spu/spu.c (revision 188482)
+++ gcc/config/spu/spu.c (working copy)
@@ -6908,9 +6908,11 @@ spu_builtin_mask_for_load (void)
/* Implement targetm.vectorize.builtin_vectorization_cost. */
static int
spu_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
- tree vectype ATTRIBUTE_UNUSED,
+ tree vectype,
int misalign ATTRIBUTE_UNUSED)
{
+ unsigned elements;
+
switch (type_of_cost)
{
case scalar_stmt:
@@ -6937,6 +6939,11 @@ spu_builtin_vectorization_cost (enum vect_cost_for
case cond_branch_taken:
return 6;
+ case vec_construct:
+ elements = TYPE_VECTOR_SUBPARTS (vectype);
+ gcc_assert (elements > 1);
+ return elements / 2 + 1;
+
default:
gcc_unreachable ();
}
Index: gcc/config/i386/i386.c
===================================================================
--- gcc/config/i386/i386.c (revision 188482)
+++ gcc/config/i386/i386.c (working copy)
@@ -36072,9 +36072,11 @@ static const struct attribute_spec ix86_attribute_
/* Implement targetm.vectorize.builtin_vectorization_cost. */
static int
ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
- tree vectype ATTRIBUTE_UNUSED,
+ tree vectype,
int misalign ATTRIBUTE_UNUSED)
{
+ unsigned elements;
+
switch (type_of_cost)
{
case scalar_stmt:
@@ -36115,6 +36117,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo
case vec_promote_demote:
return ix86_cost->vec_stmt_cost;
+ case vec_construct:
+ elements = TYPE_VECTOR_SUBPARTS (vectype);
+ gcc_assert (elements > 1);
+ return elements / 2 + 1;
+
default:
gcc_unreachable ();
}
Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c (revision 188482)
+++ gcc/config/rs6000/rs6000.c (working copy)
@@ -3405,6 +3405,7 @@ rs6000_builtin_vectorization_cost (enum vect_cost_
tree vectype, int misalign)
{
unsigned elements;
+ tree elem_type;
switch (type_of_cost)
{
@@ -3504,6 +3505,19 @@ rs6000_builtin_vectorization_cost (enum vect_cost_
return 2;
+ case vec_construct:
+ elements = TYPE_VECTOR_SUBPARTS (vectype);
+ elem_type = TREE_TYPE (vectype);
+ gcc_assert (elements > 1);
+ /* 32-bit vectors loaded into registers are stored as double
+ precision, so we need n/2 converts in addition to the usual
+ n/2 merges to construct a vector of short floats from them. */
+ if (SCALAR_FLOAT_TYPE_P (elem_type)
+ && TYPE_PRECISION (elem_type) == 32)
+ return elements + 1;
+ else
+ return elements / 2 + 1;
+
default:
gcc_unreachable ();
}
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH, RFC] First cut at using vec_construct for strided loads
2012-06-13 4:32 [PATCH, RFC] First cut at using vec_construct for strided loads William J. Schmidt
@ 2012-06-13 9:32 ` Richard Guenther
2012-06-13 11:46 ` William J. Schmidt
2016-06-08 12:30 ` Richard Biener
1 sibling, 1 reply; 7+ messages in thread
From: Richard Guenther @ 2012-06-13 9:32 UTC (permalink / raw)
To: William J. Schmidt; +Cc: gcc-patches, bergner
[-- Attachment #1: Type: TEXT/PLAIN, Size: 8086 bytes --]
On Tue, 12 Jun 2012, William J. Schmidt wrote:
> This patch is a follow-up to the discussion generated by
> http://gcc.gnu.org/ml/gcc-patches/2012-06/msg00546.html. I've added
> vec_construct to the cost model for use in vect_model_load_cost, and
> implemented a cost calculation that makes sense to me for PowerPC. I'm
> less certain about the default, i386, and spu implementations. I took a
> guess at i386 from the discussions we had, and used the same calculation
> for the default and for spu. I'm hoping you or others can fill in the
> blanks if I guessed badly.
>
> The i386 cost for vec_construct is different from all the others, which
> are parameterized for each processor description. This should probably
> be parameterized in some way as well, but thought you'd know better than
> I how that should be. Perhaps instead of
>
> elements / 2 + 1
>
> it should be
>
> (elements / 2) * X + Y
>
> where X and Y are taken from the processor description, and represent
> the cost of a merge and a permute, respectively. Let me know what you
> think.
Looks good to me with the gcc_asserts removed - TYPE_VECTOR_SUBPARTS
might be 1 for V1TImode for example (heh, not that the vectorizer would
vectorize to that). But I don't see any possible breakage with
elements == 1, do you?
Target maintainers can improve on the cost calculation if they wish,
the default looks sensible to me.
Thanks,
Richard.
> Thanks,
> Bill
>
>
> 2012-06-12 Bill Schmidt <wschmidt@linux.ibm.com>
>
> * targhooks.c (default_builtin_vectorized_conversion): Handle
> vec_construct, using vectype to base cost on subparts.
> * target.h (enum vect_cost_for_stmt): Add vec_construct.
> * tree-vect-stmts.c (vect_model_load_cost): Use vec_construct
> instead of scalar_to-vec.
> * config/spu/spu.c (spu_builtin_vectorization_cost): Handle
> vec_construct in same way as default for now.
> * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise.
> * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost):
> Handle vec_construct, including special case for 32-bit loads.
>
>
> Index: gcc/targhooks.c
> ===================================================================
> --- gcc/targhooks.c (revision 188482)
> +++ gcc/targhooks.c (working copy)
> @@ -499,9 +499,11 @@ default_builtin_vectorized_conversion (unsigned in
>
> int
> default_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> - tree vectype ATTRIBUTE_UNUSED,
> + tree vectype,
> int misalign ATTRIBUTE_UNUSED)
> {
> + unsigned elements;
> +
> switch (type_of_cost)
> {
> case scalar_stmt:
> @@ -524,6 +526,11 @@ default_builtin_vectorization_cost (enum vect_cost
> case cond_branch_taken:
> return 3;
>
> + case vec_construct:
> + elements = TYPE_VECTOR_SUBPARTS (vectype);
> + gcc_assert (elements > 1);
> + return elements / 2 + 1;
> +
> default:
> gcc_unreachable ();
> }
> Index: gcc/target.h
> ===================================================================
> --- gcc/target.h (revision 188482)
> +++ gcc/target.h (working copy)
> @@ -146,7 +146,8 @@ enum vect_cost_for_stmt
> cond_branch_not_taken,
> cond_branch_taken,
> vec_perm,
> - vec_promote_demote
> + vec_promote_demote,
> + vec_construct
> };
>
> /* The target structure. This holds all the backend hooks. */
> Index: gcc/tree-vect-stmts.c
> ===================================================================
> --- gcc/tree-vect-stmts.c (revision 188482)
> +++ gcc/tree-vect-stmts.c (working copy)
> @@ -1031,11 +1031,13 @@ vect_model_load_cost (stmt_vec_info stmt_info, int
> /* The loads themselves. */
> if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
> {
> - /* N scalar loads plus gathering them into a vector.
> - ??? scalar_to_vec isn't the cost for that. */
> + /* N scalar loads plus gathering them into a vector. */
> + tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> inside_cost += (vect_get_stmt_cost (scalar_load) * ncopies
> - * TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)));
> - inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec);
> + * TYPE_VECTOR_SUBPARTS (vectype));
> + inside_cost += ncopies
> + * targetm.vectorize.builtin_vectorization_cost (vec_construct,
> + vectype, 0);
> }
> else
> vect_get_load_cost (first_dr, ncopies,
> Index: gcc/config/spu/spu.c
> ===================================================================
> --- gcc/config/spu/spu.c (revision 188482)
> +++ gcc/config/spu/spu.c (working copy)
> @@ -6908,9 +6908,11 @@ spu_builtin_mask_for_load (void)
> /* Implement targetm.vectorize.builtin_vectorization_cost. */
> static int
> spu_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> - tree vectype ATTRIBUTE_UNUSED,
> + tree vectype,
> int misalign ATTRIBUTE_UNUSED)
> {
> + unsigned elements;
> +
> switch (type_of_cost)
> {
> case scalar_stmt:
> @@ -6937,6 +6939,11 @@ spu_builtin_vectorization_cost (enum vect_cost_for
> case cond_branch_taken:
> return 6;
>
> + case vec_construct:
> + elements = TYPE_VECTOR_SUBPARTS (vectype);
> + gcc_assert (elements > 1);
> + return elements / 2 + 1;
> +
> default:
> gcc_unreachable ();
> }
> Index: gcc/config/i386/i386.c
> ===================================================================
> --- gcc/config/i386/i386.c (revision 188482)
> +++ gcc/config/i386/i386.c (working copy)
> @@ -36072,9 +36072,11 @@ static const struct attribute_spec ix86_attribute_
> /* Implement targetm.vectorize.builtin_vectorization_cost. */
> static int
> ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> - tree vectype ATTRIBUTE_UNUSED,
> + tree vectype,
> int misalign ATTRIBUTE_UNUSED)
> {
> + unsigned elements;
> +
> switch (type_of_cost)
> {
> case scalar_stmt:
> @@ -36115,6 +36117,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo
> case vec_promote_demote:
> return ix86_cost->vec_stmt_cost;
>
> + case vec_construct:
> + elements = TYPE_VECTOR_SUBPARTS (vectype);
> + gcc_assert (elements > 1);
> + return elements / 2 + 1;
> +
> default:
> gcc_unreachable ();
> }
> Index: gcc/config/rs6000/rs6000.c
> ===================================================================
> --- gcc/config/rs6000/rs6000.c (revision 188482)
> +++ gcc/config/rs6000/rs6000.c (working copy)
> @@ -3405,6 +3405,7 @@ rs6000_builtin_vectorization_cost (enum vect_cost_
> tree vectype, int misalign)
> {
> unsigned elements;
> + tree elem_type;
>
> switch (type_of_cost)
> {
> @@ -3504,6 +3505,19 @@ rs6000_builtin_vectorization_cost (enum vect_cost_
>
> return 2;
>
> + case vec_construct:
> + elements = TYPE_VECTOR_SUBPARTS (vectype);
> + elem_type = TREE_TYPE (vectype);
> + gcc_assert (elements > 1);
> + /* 32-bit vectors loaded into registers are stored as double
> + precision, so we need n/2 converts in addition to the usual
> + n/2 merges to construct a vector of short floats from them. */
> + if (SCALAR_FLOAT_TYPE_P (elem_type)
> + && TYPE_PRECISION (elem_type) == 32)
> + return elements + 1;
> + else
> + return elements / 2 + 1;
> +
> default:
> gcc_unreachable ();
> }
>
>
>
--
Richard Guenther <rguenther@suse.de>
SUSE / SUSE Labs
SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH, RFC] First cut at using vec_construct for strided loads
2012-06-13 9:32 ` Richard Guenther
@ 2012-06-13 11:46 ` William J. Schmidt
0 siblings, 0 replies; 7+ messages in thread
From: William J. Schmidt @ 2012-06-13 11:46 UTC (permalink / raw)
To: Richard Guenther; +Cc: gcc-patches, bergner
On Wed, 2012-06-13 at 11:26 +0200, Richard Guenther wrote:
> On Tue, 12 Jun 2012, William J. Schmidt wrote:
>
> > This patch is a follow-up to the discussion generated by
> > http://gcc.gnu.org/ml/gcc-patches/2012-06/msg00546.html. I've added
> > vec_construct to the cost model for use in vect_model_load_cost, and
> > implemented a cost calculation that makes sense to me for PowerPC. I'm
> > less certain about the default, i386, and spu implementations. I took a
> > guess at i386 from the discussions we had, and used the same calculation
> > for the default and for spu. I'm hoping you or others can fill in the
> > blanks if I guessed badly.
> >
> > The i386 cost for vec_construct is different from all the others, which
> > are parameterized for each processor description. This should probably
> > be parameterized in some way as well, but thought you'd know better than
> > I how that should be. Perhaps instead of
> >
> > elements / 2 + 1
> >
> > it should be
> >
> > (elements / 2) * X + Y
> >
> > where X and Y are taken from the processor description, and represent
> > the cost of a merge and a permute, respectively. Let me know what you
> > think.
>
> Looks good to me with the gcc_asserts removed - TYPE_VECTOR_SUBPARTS
> might be 1 for V1TImode for example (heh, not that the vectorizer would
> vectorize to that). But I don't see any possible breakage with
> elements == 1, do you?
No, that was some unnecessary sanity testing I was doing for my own
curiosity. I'll pull them out and pop this in today. Thanks for the
review!
Bill
>
> Target maintainers can improve on the cost calculation if they wish,
> the default looks sensible to me.
>
> Thanks,
> Richard.
>
> > Thanks,
> > Bill
> >
> >
> > 2012-06-12 Bill Schmidt <wschmidt@linux.ibm.com>
> >
> > * targhooks.c (default_builtin_vectorized_conversion): Handle
> > vec_construct, using vectype to base cost on subparts.
> > * target.h (enum vect_cost_for_stmt): Add vec_construct.
> > * tree-vect-stmts.c (vect_model_load_cost): Use vec_construct
> > instead of scalar_to-vec.
> > * config/spu/spu.c (spu_builtin_vectorization_cost): Handle
> > vec_construct in same way as default for now.
> > * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise.
> > * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost):
> > Handle vec_construct, including special case for 32-bit loads.
> >
> >
> > Index: gcc/targhooks.c
> > ===================================================================
> > --- gcc/targhooks.c (revision 188482)
> > +++ gcc/targhooks.c (working copy)
> > @@ -499,9 +499,11 @@ default_builtin_vectorized_conversion (unsigned in
> >
> > int
> > default_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> > - tree vectype ATTRIBUTE_UNUSED,
> > + tree vectype,
> > int misalign ATTRIBUTE_UNUSED)
> > {
> > + unsigned elements;
> > +
> > switch (type_of_cost)
> > {
> > case scalar_stmt:
> > @@ -524,6 +526,11 @@ default_builtin_vectorization_cost (enum vect_cost
> > case cond_branch_taken:
> > return 3;
> >
> > + case vec_construct:
> > + elements = TYPE_VECTOR_SUBPARTS (vectype);
> > + gcc_assert (elements > 1);
> > + return elements / 2 + 1;
> > +
> > default:
> > gcc_unreachable ();
> > }
> > Index: gcc/target.h
> > ===================================================================
> > --- gcc/target.h (revision 188482)
> > +++ gcc/target.h (working copy)
> > @@ -146,7 +146,8 @@ enum vect_cost_for_stmt
> > cond_branch_not_taken,
> > cond_branch_taken,
> > vec_perm,
> > - vec_promote_demote
> > + vec_promote_demote,
> > + vec_construct
> > };
> >
> > /* The target structure. This holds all the backend hooks. */
> > Index: gcc/tree-vect-stmts.c
> > ===================================================================
> > --- gcc/tree-vect-stmts.c (revision 188482)
> > +++ gcc/tree-vect-stmts.c (working copy)
> > @@ -1031,11 +1031,13 @@ vect_model_load_cost (stmt_vec_info stmt_info, int
> > /* The loads themselves. */
> > if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
> > {
> > - /* N scalar loads plus gathering them into a vector.
> > - ??? scalar_to_vec isn't the cost for that. */
> > + /* N scalar loads plus gathering them into a vector. */
> > + tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> > inside_cost += (vect_get_stmt_cost (scalar_load) * ncopies
> > - * TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)));
> > - inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec);
> > + * TYPE_VECTOR_SUBPARTS (vectype));
> > + inside_cost += ncopies
> > + * targetm.vectorize.builtin_vectorization_cost (vec_construct,
> > + vectype, 0);
> > }
> > else
> > vect_get_load_cost (first_dr, ncopies,
> > Index: gcc/config/spu/spu.c
> > ===================================================================
> > --- gcc/config/spu/spu.c (revision 188482)
> > +++ gcc/config/spu/spu.c (working copy)
> > @@ -6908,9 +6908,11 @@ spu_builtin_mask_for_load (void)
> > /* Implement targetm.vectorize.builtin_vectorization_cost. */
> > static int
> > spu_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> > - tree vectype ATTRIBUTE_UNUSED,
> > + tree vectype,
> > int misalign ATTRIBUTE_UNUSED)
> > {
> > + unsigned elements;
> > +
> > switch (type_of_cost)
> > {
> > case scalar_stmt:
> > @@ -6937,6 +6939,11 @@ spu_builtin_vectorization_cost (enum vect_cost_for
> > case cond_branch_taken:
> > return 6;
> >
> > + case vec_construct:
> > + elements = TYPE_VECTOR_SUBPARTS (vectype);
> > + gcc_assert (elements > 1);
> > + return elements / 2 + 1;
> > +
> > default:
> > gcc_unreachable ();
> > }
> > Index: gcc/config/i386/i386.c
> > ===================================================================
> > --- gcc/config/i386/i386.c (revision 188482)
> > +++ gcc/config/i386/i386.c (working copy)
> > @@ -36072,9 +36072,11 @@ static const struct attribute_spec ix86_attribute_
> > /* Implement targetm.vectorize.builtin_vectorization_cost. */
> > static int
> > ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> > - tree vectype ATTRIBUTE_UNUSED,
> > + tree vectype,
> > int misalign ATTRIBUTE_UNUSED)
> > {
> > + unsigned elements;
> > +
> > switch (type_of_cost)
> > {
> > case scalar_stmt:
> > @@ -36115,6 +36117,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo
> > case vec_promote_demote:
> > return ix86_cost->vec_stmt_cost;
> >
> > + case vec_construct:
> > + elements = TYPE_VECTOR_SUBPARTS (vectype);
> > + gcc_assert (elements > 1);
> > + return elements / 2 + 1;
> > +
> > default:
> > gcc_unreachable ();
> > }
> > Index: gcc/config/rs6000/rs6000.c
> > ===================================================================
> > --- gcc/config/rs6000/rs6000.c (revision 188482)
> > +++ gcc/config/rs6000/rs6000.c (working copy)
> > @@ -3405,6 +3405,7 @@ rs6000_builtin_vectorization_cost (enum vect_cost_
> > tree vectype, int misalign)
> > {
> > unsigned elements;
> > + tree elem_type;
> >
> > switch (type_of_cost)
> > {
> > @@ -3504,6 +3505,19 @@ rs6000_builtin_vectorization_cost (enum vect_cost_
> >
> > return 2;
> >
> > + case vec_construct:
> > + elements = TYPE_VECTOR_SUBPARTS (vectype);
> > + elem_type = TREE_TYPE (vectype);
> > + gcc_assert (elements > 1);
> > + /* 32-bit vectors loaded into registers are stored as double
> > + precision, so we need n/2 converts in addition to the usual
> > + n/2 merges to construct a vector of short floats from them. */
> > + if (SCALAR_FLOAT_TYPE_P (elem_type)
> > + && TYPE_PRECISION (elem_type) == 32)
> > + return elements + 1;
> > + else
> > + return elements / 2 + 1;
> > +
> > default:
> > gcc_unreachable ();
> > }
> >
> >
> >
>
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH, RFC] First cut at using vec_construct for strided loads
2012-06-13 4:32 [PATCH, RFC] First cut at using vec_construct for strided loads William J. Schmidt
2012-06-13 9:32 ` Richard Guenther
@ 2016-06-08 12:30 ` Richard Biener
2016-06-08 13:48 ` Bill Schmidt
1 sibling, 1 reply; 7+ messages in thread
From: Richard Biener @ 2016-06-08 12:30 UTC (permalink / raw)
To: William J. Schmidt, Uros Bizjak
Cc: GCC Patches, Richard Biener, Peter Bergner
On Wed, Jun 13, 2012 at 4:18 AM, William J. Schmidt
<wschmidt@linux.vnet.ibm.com> wrote:
> This patch is a follow-up to the discussion generated by
> http://gcc.gnu.org/ml/gcc-patches/2012-06/msg00546.html. I've added
> vec_construct to the cost model for use in vect_model_load_cost, and
> implemented a cost calculation that makes sense to me for PowerPC. I'm
> less certain about the default, i386, and spu implementations. I took a
> guess at i386 from the discussions we had, and used the same calculation
> for the default and for spu. I'm hoping you or others can fill in the
> blanks if I guessed badly.
>
> The i386 cost for vec_construct is different from all the others, which
> are parameterized for each processor description. This should probably
> be parameterized in some way as well, but thought you'd know better than
> I how that should be. Perhaps instead of
>
> elements / 2 + 1
>
> it should be
>
> (elements / 2) * X + Y
>
> where X and Y are taken from the processor description, and represent
> the cost of a merge and a permute, respectively. Let me know what you
> think.
I'm trying to understand how you arrived at the above formulas while investigating
a strangely low cost of 9 for v16qi construction. If we pairwise reduce elements
with a cost of 1 then we arrive at a cost of elements - 1, which is also what you'd
get by not accounting for an initial move of element zero into a vector and then
inserting each other element into that with elements - 1 inserts.
This also matches up with code-generation on x86_64 for
vT foo (T a, T b, ...)
{
return (vT) {a, b, ... };
}
for any vector / element type combination I tried. Thus the patch below.
I'll bootstrap / test that on x86_64-linux and I'm leaving other
targets to target
maintainers.
Ok for the i386 parts?
Thanks,
Richard.
2016-06-08 Richard Biener <rguenther@suse.de>
* targhooks.c (default_builtin_vectorization_cost): Adjust
vec_construct cost.
* config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise.
Index: gcc/targhooks.c
===================================================================
--- gcc/targhooks.c (revision 237196)
+++ gcc/targhooks.c (working copy)
@@ -589,8 +589,7 @@ default_builtin_vectorization_cost (enum
return 3;
case vec_construct:
- elements = TYPE_VECTOR_SUBPARTS (vectype);
- return elements / 2 + 1;
+ return TYPE_VECTOR_SUBPARTS (vectype) - 1;
default:
gcc_unreachable ();
Index: gcc/config/i386/i386.c
===================================================================
--- gcc/config/i386/i386.c (revision 237196)
+++ gcc/config/i386/i386.c (working copy)
@@ -49503,8 +49520,6 @@ static int
ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
tree vectype, int)
{
- unsigned elements;
-
switch (type_of_cost)
{
case scalar_stmt:
@@ -49546,8 +49561,7 @@ ix86_builtin_vectorization_cost (enum ve
return ix86_cost->vec_stmt_cost;
case vec_construct:
- elements = TYPE_VECTOR_SUBPARTS (vectype);
- return ix86_cost->vec_stmt_cost * (elements / 2 + 1);
+ return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1);
default:
gcc_unreachable ();
> Thanks,
> Bill
>
>
> 2012-06-12 Bill Schmidt <wschmidt@linux.ibm.com>
>
> * targhooks.c (default_builtin_vectorized_conversion): Handle
> vec_construct, using vectype to base cost on subparts.
> * target.h (enum vect_cost_for_stmt): Add vec_construct.
> * tree-vect-stmts.c (vect_model_load_cost): Use vec_construct
> instead of scalar_to-vec.
> * config/spu/spu.c (spu_builtin_vectorization_cost): Handle
> vec_construct in same way as default for now.
> * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise.
> * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost):
> Handle vec_construct, including special case for 32-bit loads.
>
>
> Index: gcc/targhooks.c
> ===================================================================
> --- gcc/targhooks.c (revision 188482)
> +++ gcc/targhooks.c (working copy)
> @@ -499,9 +499,11 @@ default_builtin_vectorized_conversion (unsigned in
>
> int
> default_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> - tree vectype ATTRIBUTE_UNUSED,
> + tree vectype,
> int misalign ATTRIBUTE_UNUSED)
> {
> + unsigned elements;
> +
> switch (type_of_cost)
> {
> case scalar_stmt:
> @@ -524,6 +526,11 @@ default_builtin_vectorization_cost (enum vect_cost
> case cond_branch_taken:
> return 3;
>
> + case vec_construct:
> + elements = TYPE_VECTOR_SUBPARTS (vectype);
> + gcc_assert (elements > 1);
> + return elements / 2 + 1;
> +
> default:
> gcc_unreachable ();
> }
> Index: gcc/target.h
> ===================================================================
> --- gcc/target.h (revision 188482)
> +++ gcc/target.h (working copy)
> @@ -146,7 +146,8 @@ enum vect_cost_for_stmt
> cond_branch_not_taken,
> cond_branch_taken,
> vec_perm,
> - vec_promote_demote
> + vec_promote_demote,
> + vec_construct
> };
>
> /* The target structure. This holds all the backend hooks. */
> Index: gcc/tree-vect-stmts.c
> ===================================================================
> --- gcc/tree-vect-stmts.c (revision 188482)
> +++ gcc/tree-vect-stmts.c (working copy)
> @@ -1031,11 +1031,13 @@ vect_model_load_cost (stmt_vec_info stmt_info, int
> /* The loads themselves. */
> if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
> {
> - /* N scalar loads plus gathering them into a vector.
> - ??? scalar_to_vec isn't the cost for that. */
> + /* N scalar loads plus gathering them into a vector. */
> + tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> inside_cost += (vect_get_stmt_cost (scalar_load) * ncopies
> - * TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)));
> - inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec);
> + * TYPE_VECTOR_SUBPARTS (vectype));
> + inside_cost += ncopies
> + * targetm.vectorize.builtin_vectorization_cost (vec_construct,
> + vectype, 0);
> }
> else
> vect_get_load_cost (first_dr, ncopies,
> Index: gcc/config/spu/spu.c
> ===================================================================
> --- gcc/config/spu/spu.c (revision 188482)
> +++ gcc/config/spu/spu.c (working copy)
> @@ -6908,9 +6908,11 @@ spu_builtin_mask_for_load (void)
> /* Implement targetm.vectorize.builtin_vectorization_cost. */
> static int
> spu_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> - tree vectype ATTRIBUTE_UNUSED,
> + tree vectype,
> int misalign ATTRIBUTE_UNUSED)
> {
> + unsigned elements;
> +
> switch (type_of_cost)
> {
> case scalar_stmt:
> @@ -6937,6 +6939,11 @@ spu_builtin_vectorization_cost (enum vect_cost_for
> case cond_branch_taken:
> return 6;
>
> + case vec_construct:
> + elements = TYPE_VECTOR_SUBPARTS (vectype);
> + gcc_assert (elements > 1);
> + return elements / 2 + 1;
> +
> default:
> gcc_unreachable ();
> }
> Index: gcc/config/i386/i386.c
> ===================================================================
> --- gcc/config/i386/i386.c (revision 188482)
> +++ gcc/config/i386/i386.c (working copy)
> @@ -36072,9 +36072,11 @@ static const struct attribute_spec ix86_attribute_
> /* Implement targetm.vectorize.builtin_vectorization_cost. */
> static int
> ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> - tree vectype ATTRIBUTE_UNUSED,
> + tree vectype,
> int misalign ATTRIBUTE_UNUSED)
> {
> + unsigned elements;
> +
> switch (type_of_cost)
> {
> case scalar_stmt:
> @@ -36115,6 +36117,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo
> case vec_promote_demote:
> return ix86_cost->vec_stmt_cost;
>
> + case vec_construct:
> + elements = TYPE_VECTOR_SUBPARTS (vectype);
> + gcc_assert (elements > 1);
> + return elements / 2 + 1;
> +
> default:
> gcc_unreachable ();
> }
> Index: gcc/config/rs6000/rs6000.c
> ===================================================================
> --- gcc/config/rs6000/rs6000.c (revision 188482)
> +++ gcc/config/rs6000/rs6000.c (working copy)
> @@ -3405,6 +3405,7 @@ rs6000_builtin_vectorization_cost (enum vect_cost_
> tree vectype, int misalign)
> {
> unsigned elements;
> + tree elem_type;
>
> switch (type_of_cost)
> {
> @@ -3504,6 +3505,19 @@ rs6000_builtin_vectorization_cost (enum vect_cost_
>
> return 2;
>
> + case vec_construct:
> + elements = TYPE_VECTOR_SUBPARTS (vectype);
> + elem_type = TREE_TYPE (vectype);
> + gcc_assert (elements > 1);
> + /* 32-bit vectors loaded into registers are stored as double
> + precision, so we need n/2 converts in addition to the usual
> + n/2 merges to construct a vector of short floats from them. */
> + if (SCALAR_FLOAT_TYPE_P (elem_type)
> + && TYPE_PRECISION (elem_type) == 32)
> + return elements + 1;
> + else
> + return elements / 2 + 1;
> +
> default:
> gcc_unreachable ();
> }
>
>
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH, RFC] First cut at using vec_construct for strided loads
2016-06-08 12:30 ` Richard Biener
@ 2016-06-08 13:48 ` Bill Schmidt
2016-06-08 14:05 ` Richard Biener
0 siblings, 1 reply; 7+ messages in thread
From: Bill Schmidt @ 2016-06-08 13:48 UTC (permalink / raw)
To: Richard Biener; +Cc: Uros Bizjak, GCC Patches, Richard Biener, Peter Bergner
Hi Richard,
> On Jun 8, 2016, at 7:29 AM, Richard Biener <richard.guenther@gmail.com> wrote:
>
> On Wed, Jun 13, 2012 at 4:18 AM, William J. Schmidt
> <wschmidt@linux.vnet.ibm.com> wrote:
>> This patch is a follow-up to the discussion generated by
>> http://gcc.gnu.org/ml/gcc-patches/2012-06/msg00546.html. I've added
>> vec_construct to the cost model for use in vect_model_load_cost, and
>> implemented a cost calculation that makes sense to me for PowerPC. I'm
>> less certain about the default, i386, and spu implementations. I took a
>> guess at i386 from the discussions we had, and used the same calculation
>> for the default and for spu. I'm hoping you or others can fill in the
>> blanks if I guessed badly.
>>
>> The i386 cost for vec_construct is different from all the others, which
>> are parameterized for each processor description. This should probably
>> be parameterized in some way as well, but thought you'd know better than
>> I how that should be. Perhaps instead of
>>
>> elements / 2 + 1
>>
>> it should be
>>
>> (elements / 2) * X + Y
>>
>> where X and Y are taken from the processor description, and represent
>> the cost of a merge and a permute, respectively. Let me know what you
>> think.
>
> Just trying to understand how you arrived at the above formulas in investigating
> strangely low cost for v16qi construction of 9. If we pairwise reduce elements
> with a cost of 1 then we arrive at a cost of elements - 1, that's what you'd
> get with not accounting an initial move of element zero into a vector and then
> inserting each other element into that with elements - 1 inserts.
What I wrote there only makes partial sense for certain types on Power, so far as
I can tell, and even then it doesn’t generalize properly. When the scalar registers
are contained in the vector registers (as happens for floating-point on Power), then
you can do some merges and other forms of permutes to combine them faster
than doing specific inserts. But that isn’t a general solution even on Power; for the
integer modes we still do inserts.
So what you have makes sense to me, and what’s currently in place for Power needs
work also, so far as I can tell. I’ll take a note to revisit this.
— Bill
>
> This also matches up with code-generation on x86_64 for
>
> vT foo (T a, T b, ...)
> {
> return (vT) {a, b, ... };
> }
>
> for any vector / element type combination I tried. Thus the patch below.
>
> I'll bootstrap / test that on x86_64-linux and I'm leaving other
> targets to target
> maintainers.
>
> Ok for the i386 parts?
>
> Thanks,
> Richard.
>
> 2016-06-08 Richard Biener <rguenther@suse.de>
>
> * targhooks.c (default_builtin_vectorization_cost): Adjust
> vec_construct cost.
> * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise.
>
> Index: gcc/targhooks.c
> ===================================================================
> --- gcc/targhooks.c (revision 237196)
> +++ gcc/targhooks.c (working copy)
> @@ -589,8 +589,7 @@ default_builtin_vectorization_cost (enum
> return 3;
>
> case vec_construct:
> - elements = TYPE_VECTOR_SUBPARTS (vectype);
> - return elements / 2 + 1;
> + return TYPE_VECTOR_SUBPARTS (vectype) - 1;
>
> default:
> gcc_unreachable ();
> Index: gcc/config/i386/i386.c
> ===================================================================
> --- gcc/config/i386/i386.c (revision 237196)
> +++ gcc/config/i386/i386.c (working copy)
> @@ -49503,8 +49520,6 @@ static int
> ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> tree vectype, int)
> {
> - unsigned elements;
> -
> switch (type_of_cost)
> {
> case scalar_stmt:
> @@ -49546,8 +49561,7 @@ ix86_builtin_vectorization_cost (enum ve
> return ix86_cost->vec_stmt_cost;
>
> case vec_construct:
> - elements = TYPE_VECTOR_SUBPARTS (vectype);
> - return ix86_cost->vec_stmt_cost * (elements / 2 + 1);
> + return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1);
>
> default:
> gcc_unreachable ();
>
>
>> Thanks,
>> Bill
>>
>>
>> 2012-06-12 Bill Schmidt <wschmidt@linux.ibm.com>
>>
>> * targhooks.c (default_builtin_vectorized_conversion): Handle
>> vec_construct, using vectype to base cost on subparts.
>> * target.h (enum vect_cost_for_stmt): Add vec_construct.
>> * tree-vect-stmts.c (vect_model_load_cost): Use vec_construct
>> instead of scalar_to-vec.
>> * config/spu/spu.c (spu_builtin_vectorization_cost): Handle
>> vec_construct in same way as default for now.
>> * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise.
>> * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost):
>> Handle vec_construct, including special case for 32-bit loads.
>>
>>
>> Index: gcc/targhooks.c
>> ===================================================================
>> --- gcc/targhooks.c (revision 188482)
>> +++ gcc/targhooks.c (working copy)
>> @@ -499,9 +499,11 @@ default_builtin_vectorized_conversion (unsigned in
>>
>> int
>> default_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
>> - tree vectype ATTRIBUTE_UNUSED,
>> + tree vectype,
>> int misalign ATTRIBUTE_UNUSED)
>> {
>> + unsigned elements;
>> +
>> switch (type_of_cost)
>> {
>> case scalar_stmt:
>> @@ -524,6 +526,11 @@ default_builtin_vectorization_cost (enum vect_cost
>> case cond_branch_taken:
>> return 3;
>>
>> + case vec_construct:
>> + elements = TYPE_VECTOR_SUBPARTS (vectype);
>> + gcc_assert (elements > 1);
>> + return elements / 2 + 1;
>> +
>> default:
>> gcc_unreachable ();
>> }
>> Index: gcc/target.h
>> ===================================================================
>> --- gcc/target.h (revision 188482)
>> +++ gcc/target.h (working copy)
>> @@ -146,7 +146,8 @@ enum vect_cost_for_stmt
>> cond_branch_not_taken,
>> cond_branch_taken,
>> vec_perm,
>> - vec_promote_demote
>> + vec_promote_demote,
>> + vec_construct
>> };
>>
>> /* The target structure. This holds all the backend hooks. */
>> Index: gcc/tree-vect-stmts.c
>> ===================================================================
>> --- gcc/tree-vect-stmts.c (revision 188482)
>> +++ gcc/tree-vect-stmts.c (working copy)
>> @@ -1031,11 +1031,13 @@ vect_model_load_cost (stmt_vec_info stmt_info, int
>> /* The loads themselves. */
>> if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
>> {
>> - /* N scalar loads plus gathering them into a vector.
>> - ??? scalar_to_vec isn't the cost for that. */
>> + /* N scalar loads plus gathering them into a vector. */
>> + tree vectype = STMT_VINFO_VECTYPE (stmt_info);
>> inside_cost += (vect_get_stmt_cost (scalar_load) * ncopies
>> - * TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)));
>> - inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec);
>> + * TYPE_VECTOR_SUBPARTS (vectype));
>> + inside_cost += ncopies
>> + * targetm.vectorize.builtin_vectorization_cost (vec_construct,
>> + vectype, 0);
>> }
>> else
>> vect_get_load_cost (first_dr, ncopies,
>> Index: gcc/config/spu/spu.c
>> ===================================================================
>> --- gcc/config/spu/spu.c (revision 188482)
>> +++ gcc/config/spu/spu.c (working copy)
>> @@ -6908,9 +6908,11 @@ spu_builtin_mask_for_load (void)
>> /* Implement targetm.vectorize.builtin_vectorization_cost. */
>> static int
>> spu_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
>> - tree vectype ATTRIBUTE_UNUSED,
>> + tree vectype,
>> int misalign ATTRIBUTE_UNUSED)
>> {
>> + unsigned elements;
>> +
>> switch (type_of_cost)
>> {
>> case scalar_stmt:
>> @@ -6937,6 +6939,11 @@ spu_builtin_vectorization_cost (enum vect_cost_for
>> case cond_branch_taken:
>> return 6;
>>
>> + case vec_construct:
>> + elements = TYPE_VECTOR_SUBPARTS (vectype);
>> + gcc_assert (elements > 1);
>> + return elements / 2 + 1;
>> +
>> default:
>> gcc_unreachable ();
>> }
>> Index: gcc/config/i386/i386.c
>> ===================================================================
>> --- gcc/config/i386/i386.c (revision 188482)
>> +++ gcc/config/i386/i386.c (working copy)
>> @@ -36072,9 +36072,11 @@ static const struct attribute_spec ix86_attribute_
>> /* Implement targetm.vectorize.builtin_vectorization_cost. */
>> static int
>> ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
>> - tree vectype ATTRIBUTE_UNUSED,
>> + tree vectype,
>> int misalign ATTRIBUTE_UNUSED)
>> {
>> + unsigned elements;
>> +
>> switch (type_of_cost)
>> {
>> case scalar_stmt:
>> @@ -36115,6 +36117,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo
>> case vec_promote_demote:
>> return ix86_cost->vec_stmt_cost;
>>
>> + case vec_construct:
>> + elements = TYPE_VECTOR_SUBPARTS (vectype);
>> + gcc_assert (elements > 1);
>> + return elements / 2 + 1;
>> +
>> default:
>> gcc_unreachable ();
>> }
>> Index: gcc/config/rs6000/rs6000.c
>> ===================================================================
>> --- gcc/config/rs6000/rs6000.c (revision 188482)
>> +++ gcc/config/rs6000/rs6000.c (working copy)
>> @@ -3405,6 +3405,7 @@ rs6000_builtin_vectorization_cost (enum vect_cost_
>> tree vectype, int misalign)
>> {
>> unsigned elements;
>> + tree elem_type;
>>
>> switch (type_of_cost)
>> {
>> @@ -3504,6 +3505,19 @@ rs6000_builtin_vectorization_cost (enum vect_cost_
>>
>> return 2;
>>
>> + case vec_construct:
>> + elements = TYPE_VECTOR_SUBPARTS (vectype);
>> + elem_type = TREE_TYPE (vectype);
>> + gcc_assert (elements > 1);
>> + /* 32-bit vectors loaded into registers are stored as double
>> + precision, so we need n/2 converts in addition to the usual
>> + n/2 merges to construct a vector of short floats from them. */
>> + if (SCALAR_FLOAT_TYPE_P (elem_type)
>> + && TYPE_PRECISION (elem_type) == 32)
>> + return elements + 1;
>> + else
>> + return elements / 2 + 1;
>> +
>> default:
>> gcc_unreachable ();
>> }
>>
>>
>
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH, RFC] First cut at using vec_construct for strided loads
2016-06-08 13:48 ` Bill Schmidt
@ 2016-06-08 14:05 ` Richard Biener
2016-06-09 15:27 ` Bill Schmidt
0 siblings, 1 reply; 7+ messages in thread
From: Richard Biener @ 2016-06-08 14:05 UTC (permalink / raw)
To: Bill Schmidt; +Cc: Uros Bizjak, GCC Patches, Peter Bergner
[-- Attachment #1: Type: TEXT/PLAIN, Size: 11864 bytes --]
On Wed, 8 Jun 2016, Bill Schmidt wrote:
> Hi Richard,
>
> > On Jun 8, 2016, at 7:29 AM, Richard Biener <richard.guenther@gmail.com> wrote:
> >
> > On Wed, Jun 13, 2012 at 4:18 AM, William J. Schmidt
> > <wschmidt@linux.vnet.ibm.com> wrote:
> >> This patch is a follow-up to the discussion generated by
> >> http://gcc.gnu.org/ml/gcc-patches/2012-06/msg00546.html. I've added
> >> vec_construct to the cost model for use in vect_model_load_cost, and
> >> implemented a cost calculation that makes sense to me for PowerPC. I'm
> >> less certain about the default, i386, and spu implementations. I took a
> >> guess at i386 from the discussions we had, and used the same calculation
> >> for the default and for spu. I'm hoping you or others can fill in the
> >> blanks if I guessed badly.
> >>
> >> The i386 cost for vec_construct is different from all the others, which
> >> are parameterized for each processor description. This should probably
> >> be parameterized in some way as well, but thought you'd know better than
> >> I how that should be. Perhaps instead of
> >>
> >> elements / 2 + 1
> >>
> >> it should be
> >>
> >> (elements / 2) * X + Y
> >>
> >> where X and Y are taken from the processor description, and represent
> >> the cost of a merge and a permute, respectively. Let me know what you
> >> think.
> >
> > Just trying to understand how you arrived at the above formulas in investigating
> > strangely low cost for v16qi construction of 9. If we pairwise reduce elements
> > with a cost of 1 then we arrive at a cost of elements - 1, that's what you'd
> > get with not accounting an initial move of element zero into a vector and then
> > inserting each other element into that with elements - 1 inserts.
>
> What I wrote there only makes partial sense for certain types on Power, so far as
> I can tell, and even then it doesn’t generalize properly. When the scalar registers
> are contained in the vector registers (as happens for floating-point on Power), then
> you can do some merges and other forms of permutes to combine them faster
> than doing specific inserts. But that isn’t a general solution even on Power; for the
> integer modes we still do inserts.
You mean Power has instructions to combine more than two vector registers
into one? Otherwise you still need n / 2 plus n / 4 plus n / 8 ...
"permutes" which boils down to n - 1.
> So what you have makes sense to me, and what’s currently in place for Power needs
> work also, so far as I can tell. I’ll take a note to revisit this.
Thanks.
Richard.
> — Bill
>
> >
> > This also matches up with code-generation on x86_64 for
> >
> > vT foo (T a, T b, ...)
> > {
> > return (vT) {a, b, ... };
> > }
> >
> > for any vector / element type combination I tried. Thus the patch below.
> >
> > I'll bootstrap / test that on x86_64-linux and I'm leaving other
> > targets to target
> > maintainers.
> >
> > Ok for the i386 parts?
> >
> > Thanks,
> > Richard.
> >
> > 2016-06-08 Richard Biener <rguenther@suse.de>
> >
> > * targhooks.c (default_builtin_vectorization_cost): Adjust
> > vec_construct cost.
> > * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise.
> >
> > Index: gcc/targhooks.c
> > ===================================================================
> > --- gcc/targhooks.c (revision 237196)
> > +++ gcc/targhooks.c (working copy)
> > @@ -589,8 +589,7 @@ default_builtin_vectorization_cost (enum
> > return 3;
> >
> > case vec_construct:
> > - elements = TYPE_VECTOR_SUBPARTS (vectype);
> > - return elements / 2 + 1;
> > + return TYPE_VECTOR_SUBPARTS (vectype) - 1;
> >
> > default:
> > gcc_unreachable ();
> > Index: gcc/config/i386/i386.c
> > ===================================================================
> > --- gcc/config/i386/i386.c (revision 237196)
> > +++ gcc/config/i386/i386.c (working copy)
> > @@ -49503,8 +49520,6 @@ static int
> > ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> > tree vectype, int)
> > {
> > - unsigned elements;
> > -
> > switch (type_of_cost)
> > {
> > case scalar_stmt:
> > @@ -49546,8 +49561,7 @@ ix86_builtin_vectorization_cost (enum ve
> > return ix86_cost->vec_stmt_cost;
> >
> > case vec_construct:
> > - elements = TYPE_VECTOR_SUBPARTS (vectype);
> > - return ix86_cost->vec_stmt_cost * (elements / 2 + 1);
> > + return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1);
> >
> > default:
> > gcc_unreachable ();
> >
> >
> >> Thanks,
> >> Bill
> >>
> >>
> >> 2012-06-12 Bill Schmidt <wschmidt@linux.ibm.com>
> >>
> >> * targhooks.c (default_builtin_vectorized_conversion): Handle
> >> vec_construct, using vectype to base cost on subparts.
> >> * target.h (enum vect_cost_for_stmt): Add vec_construct.
> >> * tree-vect-stmts.c (vect_model_load_cost): Use vec_construct
> >> instead of scalar_to-vec.
> >> * config/spu/spu.c (spu_builtin_vectorization_cost): Handle
> >> vec_construct in same way as default for now.
> >> * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise.
> >> * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost):
> >> Handle vec_construct, including special case for 32-bit loads.
> >>
> >>
> >> Index: gcc/targhooks.c
> >> ===================================================================
> >> --- gcc/targhooks.c (revision 188482)
> >> +++ gcc/targhooks.c (working copy)
> >> @@ -499,9 +499,11 @@ default_builtin_vectorized_conversion (unsigned in
> >>
> >> int
> >> default_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> >> - tree vectype ATTRIBUTE_UNUSED,
> >> + tree vectype,
> >> int misalign ATTRIBUTE_UNUSED)
> >> {
> >> + unsigned elements;
> >> +
> >> switch (type_of_cost)
> >> {
> >> case scalar_stmt:
> >> @@ -524,6 +526,11 @@ default_builtin_vectorization_cost (enum vect_cost
> >> case cond_branch_taken:
> >> return 3;
> >>
> >> + case vec_construct:
> >> + elements = TYPE_VECTOR_SUBPARTS (vectype);
> >> + gcc_assert (elements > 1);
> >> + return elements / 2 + 1;
> >> +
> >> default:
> >> gcc_unreachable ();
> >> }
> >> Index: gcc/target.h
> >> ===================================================================
> >> --- gcc/target.h (revision 188482)
> >> +++ gcc/target.h (working copy)
> >> @@ -146,7 +146,8 @@ enum vect_cost_for_stmt
> >> cond_branch_not_taken,
> >> cond_branch_taken,
> >> vec_perm,
> >> - vec_promote_demote
> >> + vec_promote_demote,
> >> + vec_construct
> >> };
> >>
> >> /* The target structure. This holds all the backend hooks. */
> >> Index: gcc/tree-vect-stmts.c
> >> ===================================================================
> >> --- gcc/tree-vect-stmts.c (revision 188482)
> >> +++ gcc/tree-vect-stmts.c (working copy)
> >> @@ -1031,11 +1031,13 @@ vect_model_load_cost (stmt_vec_info stmt_info, int
> >> /* The loads themselves. */
> >> if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
> >> {
> >> - /* N scalar loads plus gathering them into a vector.
> >> - ??? scalar_to_vec isn't the cost for that. */
> >> + /* N scalar loads plus gathering them into a vector. */
> >> + tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> >> inside_cost += (vect_get_stmt_cost (scalar_load) * ncopies
> >> - * TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)));
> >> - inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec);
> >> + * TYPE_VECTOR_SUBPARTS (vectype));
> >> + inside_cost += ncopies
> >> + * targetm.vectorize.builtin_vectorization_cost (vec_construct,
> >> + vectype, 0);
> >> }
> >> else
> >> vect_get_load_cost (first_dr, ncopies,
> >> Index: gcc/config/spu/spu.c
> >> ===================================================================
> >> --- gcc/config/spu/spu.c (revision 188482)
> >> +++ gcc/config/spu/spu.c (working copy)
> >> @@ -6908,9 +6908,11 @@ spu_builtin_mask_for_load (void)
> >> /* Implement targetm.vectorize.builtin_vectorization_cost. */
> >> static int
> >> spu_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> >> - tree vectype ATTRIBUTE_UNUSED,
> >> + tree vectype,
> >> int misalign ATTRIBUTE_UNUSED)
> >> {
> >> + unsigned elements;
> >> +
> >> switch (type_of_cost)
> >> {
> >> case scalar_stmt:
> >> @@ -6937,6 +6939,11 @@ spu_builtin_vectorization_cost (enum vect_cost_for
> >> case cond_branch_taken:
> >> return 6;
> >>
> >> + case vec_construct:
> >> + elements = TYPE_VECTOR_SUBPARTS (vectype);
> >> + gcc_assert (elements > 1);
> >> + return elements / 2 + 1;
> >> +
> >> default:
> >> gcc_unreachable ();
> >> }
> >> Index: gcc/config/i386/i386.c
> >> ===================================================================
> >> --- gcc/config/i386/i386.c (revision 188482)
> >> +++ gcc/config/i386/i386.c (working copy)
> >> @@ -36072,9 +36072,11 @@ static const struct attribute_spec ix86_attribute_
> >> /* Implement targetm.vectorize.builtin_vectorization_cost. */
> >> static int
> >> ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> >> - tree vectype ATTRIBUTE_UNUSED,
> >> + tree vectype,
> >> int misalign ATTRIBUTE_UNUSED)
> >> {
> >> + unsigned elements;
> >> +
> >> switch (type_of_cost)
> >> {
> >> case scalar_stmt:
> >> @@ -36115,6 +36117,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo
> >> case vec_promote_demote:
> >> return ix86_cost->vec_stmt_cost;
> >>
> >> + case vec_construct:
> >> + elements = TYPE_VECTOR_SUBPARTS (vectype);
> >> + gcc_assert (elements > 1);
> >> + return elements / 2 + 1;
> >> +
> >> default:
> >> gcc_unreachable ();
> >> }
> >> Index: gcc/config/rs6000/rs6000.c
> >> ===================================================================
> >> --- gcc/config/rs6000/rs6000.c (revision 188482)
> >> +++ gcc/config/rs6000/rs6000.c (working copy)
> >> @@ -3405,6 +3405,7 @@ rs6000_builtin_vectorization_cost (enum vect_cost_
> >> tree vectype, int misalign)
> >> {
> >> unsigned elements;
> >> + tree elem_type;
> >>
> >> switch (type_of_cost)
> >> {
> >> @@ -3504,6 +3505,19 @@ rs6000_builtin_vectorization_cost (enum vect_cost_
> >>
> >> return 2;
> >>
> >> + case vec_construct:
> >> + elements = TYPE_VECTOR_SUBPARTS (vectype);
> >> + elem_type = TREE_TYPE (vectype);
> >> + gcc_assert (elements > 1);
> >> + /* 32-bit vectors loaded into registers are stored as double
> >> + precision, so we need n/2 converts in addition to the usual
> >> + n/2 merges to construct a vector of short floats from them. */
> >> + if (SCALAR_FLOAT_TYPE_P (elem_type)
> >> + && TYPE_PRECISION (elem_type) == 32)
> >> + return elements + 1;
> >> + else
> >> + return elements / 2 + 1;
> >> +
> >> default:
> >> gcc_unreachable ();
> >> }
> >>
> >>
> >
>
>
--
Richard Biener <rguenther@suse.de>
SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH, RFC] First cut at using vec_construct for strided loads
2016-06-08 14:05 ` Richard Biener
@ 2016-06-09 15:27 ` Bill Schmidt
0 siblings, 0 replies; 7+ messages in thread
From: Bill Schmidt @ 2016-06-09 15:27 UTC (permalink / raw)
To: Richard Biener; +Cc: Uros Bizjak, GCC Patches, Peter Bergner
> On Jun 8, 2016, at 9:05 AM, Richard Biener <rguenther@suse.de> wrote:
>
> On Wed, 8 Jun 2016, Bill Schmidt wrote:
>
>> Hi Richard,
>>
>>> On Jun 8, 2016, at 7:29 AM, Richard Biener <richard.guenther@gmail.com> wrote:
>>>
>>> On Wed, Jun 13, 2012 at 4:18 AM, William J. Schmidt
>>> <wschmidt@linux.vnet.ibm.com> wrote:
>>>> This patch is a follow-up to the discussion generated by
>>>> http://gcc.gnu.org/ml/gcc-patches/2012-06/msg00546.html. I've added
>>>> vec_construct to the cost model for use in vect_model_load_cost, and
>>>> implemented a cost calculation that makes sense to me for PowerPC. I'm
>>>> less certain about the default, i386, and spu implementations. I took a
>>>> guess at i386 from the discussions we had, and used the same calculation
>>>> for the default and for spu. I'm hoping you or others can fill in the
>>>> blanks if I guessed badly.
>>>>
>>>> The i386 cost for vec_construct is different from all the others, which
>>>> are parameterized for each processor description. This should probably
>>>> be parameterized in some way as well, but thought you'd know better than
>>>> I how that should be. Perhaps instead of
>>>>
>>>> elements / 2 + 1
>>>>
>>>> it should be
>>>>
>>>> (elements / 2) * X + Y
>>>>
>>>> where X and Y are taken from the processor description, and represent
>>>> the cost of a merge and a permute, respectively. Let me know what you
>>>> think.
>>>
>>> Just trying to understand how you arrived at the above formulas in investigating
>>> strangely low cost for v16qi construction of 9. If we pairwise reduce elements
>>> with a cost of 1 then we arrive at a cost of elements - 1, that's what you'd
>>> get with not accounting an initial move of element zero into a vector and then
>>> inserting each other element into that with elements - 1 inserts.
>>
>> What I wrote there only makes partial sense for certain types on Power, so far as
>> I can tell, and even then it doesn’t generalize properly. When the scalar registers
>> are contained in the vector registers (as happens for floating-point on Power), then
>> you can do some merges and other forms of permutes to combine them faster
>> than doing specific inserts. But that isn’t a general solution even on Power; for the
>> integer modes we still do inserts.
>
> You mean Power has instructions to combine more than two vector registers
> into one? Otherwise you still need n / 2 plus n / 4 plus n / 8 ...
> "permutes" which boils down to n - 1.
Right, we do not. This is what I meant about it not generalizing properly -- it was an
ill-thought-out, off-the-cuff remark, so far as I can tell. I agree with the n - 1, and I need
to go in and make a similar change for Power. There is a special case with 32-bit
floating-point that probably also needs adjustment.
Bill
>
>> So what you have makes sense to me, and what’s currently in place for Power needs
>> work also, so far as I can tell. I’ll take a note to revisit this.
>
> Thanks.
> Richard.
>
>> — Bill
>>
>>>
>>> This also matches up with code-generation on x86_64 for
>>>
>>> vT foo (T a, T b, ...)
>>> {
>>> return (vT) {a, b, ... };
>>> }
>>>
>>> for any vector / element type combination I tried. Thus the patch below.
>>>
>>> I'll bootstrap / test that on x86_64-linux and I'm leaving other
>>> targets to target
>>> maintainers.
>>>
>>> Ok for the i386 parts?
>>>
>>> Thanks,
>>> Richard.
>>>
>>> 2016-06-08 Richard Biener <rguenther@suse.de>
>>>
>>> * targhooks.c (default_builtin_vectorization_cost): Adjust
>>> vec_construct cost.
>>> * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise.
>>>
>>> Index: gcc/targhooks.c
>>> ===================================================================
>>> --- gcc/targhooks.c (revision 237196)
>>> +++ gcc/targhooks.c (working copy)
>>> @@ -589,8 +589,7 @@ default_builtin_vectorization_cost (enum
>>> return 3;
>>>
>>> case vec_construct:
>>> - elements = TYPE_VECTOR_SUBPARTS (vectype);
>>> - return elements / 2 + 1;
>>> + return TYPE_VECTOR_SUBPARTS (vectype) - 1;
>>>
>>> default:
>>> gcc_unreachable ();
>>> Index: gcc/config/i386/i386.c
>>> ===================================================================
>>> --- gcc/config/i386/i386.c (revision 237196)
>>> +++ gcc/config/i386/i386.c (working copy)
>>> @@ -49503,8 +49520,6 @@ static int
>>> ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
>>> tree vectype, int)
>>> {
>>> - unsigned elements;
>>> -
>>> switch (type_of_cost)
>>> {
>>> case scalar_stmt:
>>> @@ -49546,8 +49561,7 @@ ix86_builtin_vectorization_cost (enum ve
>>> return ix86_cost->vec_stmt_cost;
>>>
>>> case vec_construct:
>>> - elements = TYPE_VECTOR_SUBPARTS (vectype);
>>> - return ix86_cost->vec_stmt_cost * (elements / 2 + 1);
>>> + return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1);
>>>
>>> default:
>>> gcc_unreachable ();
>>>
>>>
>>>> Thanks,
>>>> Bill
>>>>
>>>>
>>>> 2012-06-12 Bill Schmidt <wschmidt@linux.ibm.com>
>>>>
>>>> * targhooks.c (default_builtin_vectorized_conversion): Handle
>>>> vec_construct, using vectype to base cost on subparts.
>>>> * target.h (enum vect_cost_for_stmt): Add vec_construct.
>>>> * tree-vect-stmts.c (vect_model_load_cost): Use vec_construct
>>>> instead of scalar_to-vec.
>>>> * config/spu/spu.c (spu_builtin_vectorization_cost): Handle
>>>> vec_construct in same way as default for now.
>>>> * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise.
>>>> * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost):
>>>> Handle vec_construct, including special case for 32-bit loads.
>>>>
>>>>
>>>> Index: gcc/targhooks.c
>>>> ===================================================================
>>>> --- gcc/targhooks.c (revision 188482)
>>>> +++ gcc/targhooks.c (working copy)
>>>> @@ -499,9 +499,11 @@ default_builtin_vectorized_conversion (unsigned in
>>>>
>>>> int
>>>> default_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
>>>> - tree vectype ATTRIBUTE_UNUSED,
>>>> + tree vectype,
>>>> int misalign ATTRIBUTE_UNUSED)
>>>> {
>>>> + unsigned elements;
>>>> +
>>>> switch (type_of_cost)
>>>> {
>>>> case scalar_stmt:
>>>> @@ -524,6 +526,11 @@ default_builtin_vectorization_cost (enum vect_cost
>>>> case cond_branch_taken:
>>>> return 3;
>>>>
>>>> + case vec_construct:
>>>> + elements = TYPE_VECTOR_SUBPARTS (vectype);
>>>> + gcc_assert (elements > 1);
>>>> + return elements / 2 + 1;
>>>> +
>>>> default:
>>>> gcc_unreachable ();
>>>> }
>>>> Index: gcc/target.h
>>>> ===================================================================
>>>> --- gcc/target.h (revision 188482)
>>>> +++ gcc/target.h (working copy)
>>>> @@ -146,7 +146,8 @@ enum vect_cost_for_stmt
>>>> cond_branch_not_taken,
>>>> cond_branch_taken,
>>>> vec_perm,
>>>> - vec_promote_demote
>>>> + vec_promote_demote,
>>>> + vec_construct
>>>> };
>>>>
>>>> /* The target structure. This holds all the backend hooks. */
>>>> Index: gcc/tree-vect-stmts.c
>>>> ===================================================================
>>>> --- gcc/tree-vect-stmts.c (revision 188482)
>>>> +++ gcc/tree-vect-stmts.c (working copy)
>>>> @@ -1031,11 +1031,13 @@ vect_model_load_cost (stmt_vec_info stmt_info, int
>>>> /* The loads themselves. */
>>>> if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
>>>> {
>>>> - /* N scalar loads plus gathering them into a vector.
>>>> - ??? scalar_to_vec isn't the cost for that. */
>>>> + /* N scalar loads plus gathering them into a vector. */
>>>> + tree vectype = STMT_VINFO_VECTYPE (stmt_info);
>>>> inside_cost += (vect_get_stmt_cost (scalar_load) * ncopies
>>>> - * TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)));
>>>> - inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec);
>>>> + * TYPE_VECTOR_SUBPARTS (vectype));
>>>> + inside_cost += ncopies
>>>> + * targetm.vectorize.builtin_vectorization_cost (vec_construct,
>>>> + vectype, 0);
>>>> }
>>>> else
>>>> vect_get_load_cost (first_dr, ncopies,
>>>> Index: gcc/config/spu/spu.c
>>>> ===================================================================
>>>> --- gcc/config/spu/spu.c (revision 188482)
>>>> +++ gcc/config/spu/spu.c (working copy)
>>>> @@ -6908,9 +6908,11 @@ spu_builtin_mask_for_load (void)
>>>> /* Implement targetm.vectorize.builtin_vectorization_cost. */
>>>> static int
>>>> spu_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
>>>> - tree vectype ATTRIBUTE_UNUSED,
>>>> + tree vectype,
>>>> int misalign ATTRIBUTE_UNUSED)
>>>> {
>>>> + unsigned elements;
>>>> +
>>>> switch (type_of_cost)
>>>> {
>>>> case scalar_stmt:
>>>> @@ -6937,6 +6939,11 @@ spu_builtin_vectorization_cost (enum vect_cost_for
>>>> case cond_branch_taken:
>>>> return 6;
>>>>
>>>> + case vec_construct:
>>>> + elements = TYPE_VECTOR_SUBPARTS (vectype);
>>>> + gcc_assert (elements > 1);
>>>> + return elements / 2 + 1;
>>>> +
>>>> default:
>>>> gcc_unreachable ();
>>>> }
>>>> Index: gcc/config/i386/i386.c
>>>> ===================================================================
>>>> --- gcc/config/i386/i386.c (revision 188482)
>>>> +++ gcc/config/i386/i386.c (working copy)
>>>> @@ -36072,9 +36072,11 @@ static const struct attribute_spec ix86_attribute_
>>>> /* Implement targetm.vectorize.builtin_vectorization_cost. */
>>>> static int
>>>> ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
>>>> - tree vectype ATTRIBUTE_UNUSED,
>>>> + tree vectype,
>>>> int misalign ATTRIBUTE_UNUSED)
>>>> {
>>>> + unsigned elements;
>>>> +
>>>> switch (type_of_cost)
>>>> {
>>>> case scalar_stmt:
>>>> @@ -36115,6 +36117,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo
>>>> case vec_promote_demote:
>>>> return ix86_cost->vec_stmt_cost;
>>>>
>>>> + case vec_construct:
>>>> + elements = TYPE_VECTOR_SUBPARTS (vectype);
>>>> + gcc_assert (elements > 1);
>>>> + return elements / 2 + 1;
>>>> +
>>>> default:
>>>> gcc_unreachable ();
>>>> }
>>>> Index: gcc/config/rs6000/rs6000.c
>>>> ===================================================================
>>>> --- gcc/config/rs6000/rs6000.c (revision 188482)
>>>> +++ gcc/config/rs6000/rs6000.c (working copy)
>>>> @@ -3405,6 +3405,7 @@ rs6000_builtin_vectorization_cost (enum vect_cost_
>>>> tree vectype, int misalign)
>>>> {
>>>> unsigned elements;
>>>> + tree elem_type;
>>>>
>>>> switch (type_of_cost)
>>>> {
>>>> @@ -3504,6 +3505,19 @@ rs6000_builtin_vectorization_cost (enum vect_cost_
>>>>
>>>> return 2;
>>>>
>>>> + case vec_construct:
>>>> + elements = TYPE_VECTOR_SUBPARTS (vectype);
>>>> + elem_type = TREE_TYPE (vectype);
>>>> + gcc_assert (elements > 1);
>>>> + /* 32-bit vectors loaded into registers are stored as double
>>>> + precision, so we need n/2 converts in addition to the usual
>>>> + n/2 merges to construct a vector of short floats from them. */
>>>> + if (SCALAR_FLOAT_TYPE_P (elem_type)
>>>> + && TYPE_PRECISION (elem_type) == 32)
>>>> + return elements + 1;
>>>> + else
>>>> + return elements / 2 + 1;
>>>> +
>>>> default:
>>>> gcc_unreachable ();
>>>> }
>>>>
>>>>
>>>
>>
>>
>
> --
> Richard Biener <rguenther@suse.de>
> SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2016-06-09 15:27 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-06-13 4:32 [PATCH, RFC] First cut at using vec_construct for strided loads William J. Schmidt
2012-06-13 9:32 ` Richard Guenther
2012-06-13 11:46 ` William J. Schmidt
2016-06-08 12:30 ` Richard Biener
2016-06-08 13:48 ` Bill Schmidt
2016-06-08 14:05 ` Richard Biener
2016-06-09 15:27 ` Bill Schmidt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).