public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc(refs/users/rguenth/heads/vect-force-slp)] Avoid splitting store dataref groups during SLP discovery
@ 2023-10-19 13:29 Richard Biener
0 siblings, 0 replies; 6+ messages in thread
From: Richard Biener @ 2023-10-19 13:29 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:8b771dfa1bbfff0c98df520cbaf2a3cbcabc7d6c
commit 8b771dfa1bbfff0c98df520cbaf2a3cbcabc7d6c
Author: Richard Biener <rguenther@suse.de>
Date: Fri Sep 29 13:13:16 2023 +0200
Avoid splitting store dataref groups during SLP discovery
The following avoids splitting store dataref groups during SLP
discovery but instead forces (eventually single-lane) consecutive
lane SLP discovery for all lanes of the group, creating a VEC_PERM
SLP node merging them so the store will always cover the whole group.
I figured the patched function needs some refactoring so this is
in draft state indenting-wise. With this for example
int x[1024], y[1024], z[1024], w[1024];
void foo (void)
{
for (int i = 0; i < 256; i++)
{
x[4*i+0] = y[2*i+0];
x[4*i+1] = y[2*i+1];
x[4*i+2] = z[i];
x[4*i+3] = w[i];
}
}
which was previously using hybrid SLP can now be fully SLPed and
SSE code generated looks better (but of course you never know,
I didn't actually benchmark). We of course need a VF of four here.
.L2:
movdqa z(%rax), %xmm0
movdqa w(%rax), %xmm4
movdqa y(%rax,%rax), %xmm2
movdqa y+16(%rax,%rax), %xmm1
movdqa %xmm0, %xmm3
punpckhdq %xmm4, %xmm0
punpckldq %xmm4, %xmm3
movdqa %xmm2, %xmm4
shufps $238, %xmm3, %xmm2
movaps %xmm2, x+16(,%rax,4)
movdqa %xmm1, %xmm2
shufps $68, %xmm3, %xmm4
shufps $68, %xmm0, %xmm2
movaps %xmm4, x(,%rax,4)
shufps $238, %xmm0, %xmm1
movaps %xmm2, x+32(,%rax,4)
movaps %xmm1, x+48(,%rax,4)
addq $16, %rax
cmpq $1024, %rax
jne .L2
The extra permute nodes unfortunately sometimes do not behave
nicely wrt vect_is_simple_use since when the source is an
invariant or external there's no def stmt we can fake as
representative, but vect_is_simple_use eventually hands the
caller the scalar operand and its definition. One might
argue using SLP_TREE_OPS and getting an external def would
maybe be more to the point, also since permute optimization
could change whether or not that appears.
* tree-vect-slp.cc (vect_build_slp_instance): Do not split
dataref groups on discovery failure.
Diff:
---
gcc/tree-vect-slp.cc | 171 ++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 168 insertions(+), 3 deletions(-)
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 9e693f578c0c..b2eb2a1ff214 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -3388,8 +3388,6 @@ vect_build_slp_instance (vec_info *vinfo,
else
{
/* Failed to SLP. */
- /* Free the allocated memory. */
- scalar_stmts.release ();
}
stmt_vec_info stmt_info = stmt_info_;
@@ -3408,6 +3406,8 @@ vect_build_slp_instance (vec_info *vinfo,
if (is_a <bb_vec_info> (vinfo)
&& (i > 1 && i < group_size))
{
+ /* Free the allocated memory. */
+ scalar_stmts.release ();
tree scalar_type
= TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
@@ -3454,7 +3454,10 @@ vect_build_slp_instance (vec_info *vinfo,
/* For loop vectorization split into arbitrary pieces of size > 1. */
if (is_a <loop_vec_info> (vinfo)
- && (i > 1 && i < group_size)
+ && ((i > 1 && i < group_size)
+ /* For single-lane SLP when only the first lane didn't fail
+ also split to single-lanes. */
+ || (i > 0 && i < group_size && param_vect_single_lane_slp != 0))
&& !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
{
unsigned group1_size = i;
@@ -3463,6 +3466,164 @@ vect_build_slp_instance (vec_info *vinfo,
dump_printf_loc (MSG_NOTE, vect_location,
"Splitting SLP group at stmt %u\n", i);
+ if (param_vect_single_lane_slp != 0)
+ {
+ /* Analyze the stored values and pinch them together with
+ a permute node so we can preserve the whole store group. */
+ auto_vec<slp_tree> rhs_nodes;
+
+ /* Calculate the unrolling factor based on the smallest type. */
+ poly_uint64 unrolling_factor = 1;
+
+ unsigned int start = 0, end = i;
+ while (start < group_size)
+ {
+ gcc_assert (end - start >= 1);
+ vec<stmt_vec_info> substmts;
+ substmts.create (end - start);
+ for (unsigned j = start; j < end; ++j)
+ substmts.quick_push (scalar_stmts[j]);
+ max_nunits = 1;
+ node = vect_build_slp_tree (vinfo, substmts, end - start,
+ &max_nunits,
+ matches, limit, &tree_size, bst_map);
+ if (node)
+ {
+ /* ??? Possibly not safe, but not sure how to check
+ and fail SLP build? */
+ unrolling_factor = force_common_multiple (unrolling_factor,
+ calculate_unrolling_factor
+ (max_nunits, end - start));
+ rhs_nodes.safe_push (node);
+ start = end;
+ end = group_size;
+ }
+ else
+ {
+ substmts.release ();
+ gcc_assert (end - start != 1);
+ /* ??? It really happens that we soft-fail SLP
+ build at a mismatch but the matching part hard-fails
+ later. As we know we arrived here with a group
+ larger than one try a group of size one! */
+ if (!matches[0])
+ end = start + 1;
+ else
+ for (unsigned j = start; j < end; j++)
+ if (!matches[j - start])
+ {
+ end = j;
+ break;
+ }
+ }
+ }
+ /* Now we assume we can build the root SLP node from all
+ stores. */
+ node = vect_create_new_slp_node (scalar_stmts, 1);
+ SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
+ slp_tree perm = vect_create_new_slp_node (rhs_nodes.length (),
+ VEC_PERM_EXPR);
+ SLP_TREE_CHILDREN (node).quick_push (perm);
+ SLP_TREE_LANE_PERMUTATION (perm).create (group_size);
+ SLP_TREE_VECTYPE (perm) = SLP_TREE_VECTYPE (node);
+ SLP_TREE_LANES (perm) = group_size;
+ /* ??? We should set this NULL but that's not expected. */
+ SLP_TREE_REPRESENTATIVE (perm)
+ = SLP_TREE_REPRESENTATIVE (SLP_TREE_CHILDREN (rhs_nodes[0])[0]);
+ for (unsigned j = 0; j < rhs_nodes.length (); ++j)
+ {
+ SLP_TREE_CHILDREN (perm)
+ .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
+ SLP_TREE_REF_COUNT (SLP_TREE_CHILDREN (rhs_nodes[j])[0])++;
+ for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (rhs_nodes[j]).length (); ++k)
+ {
+ /* ??? We should populate SLP_TREE_SCALAR_STMTS
+ or SLP_TREE_SCALAR_OPS but then we might have
+ a mix of both in our children. */
+ SLP_TREE_LANE_PERMUTATION (perm)
+ .quick_push (std::make_pair (j, k));
+ }
+ vect_free_slp_tree (rhs_nodes[j]);
+ }
+
+ /* ??? Now we have a single permute node but when that's
+ fed more than two inputs it's prone to hit the limitation
+ on at most two sources for a VEC_PERM_EXPR. Ideally
+ we'd defer the following to the optimize-slp pass but
+ for now split it here.
+ ??? Optimally we'd produce permute nodes feeding in
+ the same number of lanes from each input and also have
+ the same vector type (only the width will eventually
+ differ here), for now just do "something". */
+ while (SLP_TREE_CHILDREN (perm).length () > 2)
+ {
+ slp_tree b = SLP_TREE_CHILDREN (perm).pop ();
+ slp_tree a = SLP_TREE_CHILDREN (perm).pop ();
+ unsigned n = SLP_TREE_LANES (a) + SLP_TREE_LANES (b);
+ slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
+ SLP_TREE_LANES (permab) = n;
+ SLP_TREE_LANE_PERMUTATION (permab).create (n);
+ SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm); /* ??? */
+ /* ??? We should set this NULL but that's not expected. */
+ SLP_TREE_REPRESENTATIVE (permab)
+ = SLP_TREE_REPRESENTATIVE (perm);
+ SLP_TREE_CHILDREN (permab).quick_push (a);
+ for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
+ SLP_TREE_LANE_PERMUTATION (permab)
+ .quick_push (std::make_pair (0, k));
+ SLP_TREE_CHILDREN (permab).quick_push (b);
+ for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
+ SLP_TREE_LANE_PERMUTATION (permab)
+ .quick_push (std::make_pair (1, k));
+ /* ??? Popluate SLP_TREE_SCALAR_STMTS/OPS of permab. */
+ SLP_TREE_CHILDREN (perm).quick_push (permab);
+ for (unsigned k = group_size - n; k < group_size; ++k)
+ SLP_TREE_LANE_PERMUTATION (perm)[k]
+ = std::make_pair (SLP_TREE_CHILDREN (perm).length () - 1,
+ k - (group_size - n));
+ }
+
+ /* Create a new SLP instance. */
+ slp_instance new_instance = XNEW (class _slp_instance);
+ SLP_INSTANCE_TREE (new_instance) = node;
+ SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
+ SLP_INSTANCE_LOADS (new_instance) = vNULL;
+ SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
+ SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
+ SLP_INSTANCE_KIND (new_instance) = kind;
+ new_instance->reduc_phis = NULL;
+ new_instance->cost_vec = vNULL;
+ new_instance->subgraph_entries = vNULL;
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "SLP size %u vs. limit %u.\n",
+ tree_size, max_tree_size);
+
+ vinfo->slp_instances.safe_push (new_instance);
+
+ /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
+ the number of scalar stmts in the root in a few places.
+ Verify that assumption holds. */
+ gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
+ .length () == group_size);
+
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Final SLP tree for instance %p:\n",
+ (void *) new_instance);
+ vect_print_slp_graph (MSG_NOTE, vect_location,
+ SLP_INSTANCE_TREE (new_instance));
+ }
+ return true;
+ }
+ else
+ {
+
+ /* Free the allocated memory. */
+ scalar_stmts.release ();
+
stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
group1_size);
/* Loop vectorization cannot handle gaps in stores, make sure
@@ -3479,11 +3640,15 @@ vect_build_slp_instance (vec_info *vinfo,
rest, kind, max_tree_size, limit);
return res;
+ }
}
/* Even though the first vector did not all match, we might be able to SLP
(some) of the remainder. FORNOW ignore this possibility. */
}
+ else
+ /* Free the allocated memory. */
+ scalar_stmts.release ();
/* Failed to SLP. */
if (dump_enabled_p ())
^ permalink raw reply [flat|nested] 6+ messages in thread
* [gcc(refs/users/rguenth/heads/vect-force-slp)] Avoid splitting store dataref groups during SLP discovery
@ 2024-02-23 7:31 Richard Biener
0 siblings, 0 replies; 6+ messages in thread
From: Richard Biener @ 2024-02-23 7:31 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:04db3df8cfcdb26291ca65ca4ec6f47e4356c260
commit 04db3df8cfcdb26291ca65ca4ec6f47e4356c260
Author: Richard Biener <rguenther@suse.de>
Date: Fri Sep 29 13:13:16 2023 +0200
Avoid splitting store dataref groups during SLP discovery
The following avoids splitting store dataref groups during SLP
discovery but instead forces (eventually single-lane) consecutive
lane SLP discovery for all lanes of the group, creating a VEC_PERM
SLP node merging them so the store will always cover the whole group.
I figured the patched function needs some refactoring so this is
in draft state indenting-wise. With this for example
int x[1024], y[1024], z[1024], w[1024];
void foo (void)
{
for (int i = 0; i < 256; i++)
{
x[4*i+0] = y[2*i+0];
x[4*i+1] = y[2*i+1];
x[4*i+2] = z[i];
x[4*i+3] = w[i];
}
}
which was previously using hybrid SLP can now be fully SLPed and
SSE code generated looks better (but of course you never know,
I didn't actually benchmark). We of course need a VF of four here.
.L2:
movdqa z(%rax), %xmm0
movdqa w(%rax), %xmm4
movdqa y(%rax,%rax), %xmm2
movdqa y+16(%rax,%rax), %xmm1
movdqa %xmm0, %xmm3
punpckhdq %xmm4, %xmm0
punpckldq %xmm4, %xmm3
movdqa %xmm2, %xmm4
shufps $238, %xmm3, %xmm2
movaps %xmm2, x+16(,%rax,4)
movdqa %xmm1, %xmm2
shufps $68, %xmm3, %xmm4
shufps $68, %xmm0, %xmm2
movaps %xmm4, x(,%rax,4)
shufps $238, %xmm0, %xmm1
movaps %xmm2, x+32(,%rax,4)
movaps %xmm1, x+48(,%rax,4)
addq $16, %rax
cmpq $1024, %rax
jne .L2
The extra permute nodes unfortunately sometimes do not behave
nicely wrt vect_is_simple_use since when the source is an
invariant or external there's no def stmt we can fake as
representative, but vect_is_simple_use eventually hands the
caller the scalar operand and its definition. One might
argue using SLP_TREE_OPS and getting an external def would
maybe be more to the point, also since permute optimization
could change whether or not that appears.
* tree-vect-slp.cc (vect_build_slp_instance): Do not split
dataref groups on discovery failure.
Diff:
---
gcc/tree-vect-slp.cc | 171 ++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 168 insertions(+), 3 deletions(-)
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 47149df6cd71..a9236179e261 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -3441,8 +3441,6 @@ vect_build_slp_instance (vec_info *vinfo,
else
{
/* Failed to SLP. */
- /* Free the allocated memory. */
- scalar_stmts.release ();
}
stmt_vec_info stmt_info = stmt_info_;
@@ -3461,6 +3459,8 @@ vect_build_slp_instance (vec_info *vinfo,
if (is_a <bb_vec_info> (vinfo)
&& (i > 1 && i < group_size))
{
+ /* Free the allocated memory. */
+ scalar_stmts.release ();
tree scalar_type
= TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
@@ -3507,7 +3507,10 @@ vect_build_slp_instance (vec_info *vinfo,
/* For loop vectorization split into arbitrary pieces of size > 1. */
if (is_a <loop_vec_info> (vinfo)
- && (i > 1 && i < group_size)
+ && ((i > 1 && i < group_size)
+ /* For single-lane SLP when only the first lane didn't fail
+ also split to single-lanes. */
+ || (i > 0 && i < group_size && param_vect_single_lane_slp != 0))
&& !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
{
unsigned group1_size = i;
@@ -3516,6 +3519,164 @@ vect_build_slp_instance (vec_info *vinfo,
dump_printf_loc (MSG_NOTE, vect_location,
"Splitting SLP group at stmt %u\n", i);
+ if (param_vect_single_lane_slp != 0)
+ {
+ /* Analyze the stored values and pinch them together with
+ a permute node so we can preserve the whole store group. */
+ auto_vec<slp_tree> rhs_nodes;
+
+ /* Calculate the unrolling factor based on the smallest type. */
+ poly_uint64 unrolling_factor = 1;
+
+ unsigned int start = 0, end = i;
+ while (start < group_size)
+ {
+ gcc_assert (end - start >= 1);
+ vec<stmt_vec_info> substmts;
+ substmts.create (end - start);
+ for (unsigned j = start; j < end; ++j)
+ substmts.quick_push (scalar_stmts[j]);
+ max_nunits = 1;
+ node = vect_build_slp_tree (vinfo, substmts, end - start,
+ &max_nunits,
+ matches, limit, &tree_size, bst_map);
+ if (node)
+ {
+ /* ??? Possibly not safe, but not sure how to check
+ and fail SLP build? */
+ unrolling_factor = force_common_multiple (unrolling_factor,
+ calculate_unrolling_factor
+ (max_nunits, end - start));
+ rhs_nodes.safe_push (node);
+ start = end;
+ end = group_size;
+ }
+ else
+ {
+ substmts.release ();
+ gcc_assert (end - start != 1);
+ /* ??? It really happens that we soft-fail SLP
+ build at a mismatch but the matching part hard-fails
+ later. As we know we arrived here with a group
+ larger than one try a group of size one! */
+ if (!matches[0])
+ end = start + 1;
+ else
+ for (unsigned j = start; j < end; j++)
+ if (!matches[j - start])
+ {
+ end = j;
+ break;
+ }
+ }
+ }
+ /* Now we assume we can build the root SLP node from all
+ stores. */
+ node = vect_create_new_slp_node (scalar_stmts, 1);
+ SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
+ slp_tree perm = vect_create_new_slp_node (rhs_nodes.length (),
+ VEC_PERM_EXPR);
+ SLP_TREE_CHILDREN (node).quick_push (perm);
+ SLP_TREE_LANE_PERMUTATION (perm).create (group_size);
+ SLP_TREE_VECTYPE (perm) = SLP_TREE_VECTYPE (node);
+ SLP_TREE_LANES (perm) = group_size;
+ /* ??? We should set this NULL but that's not expected. */
+ SLP_TREE_REPRESENTATIVE (perm)
+ = SLP_TREE_REPRESENTATIVE (SLP_TREE_CHILDREN (rhs_nodes[0])[0]);
+ for (unsigned j = 0; j < rhs_nodes.length (); ++j)
+ {
+ SLP_TREE_CHILDREN (perm)
+ .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
+ SLP_TREE_REF_COUNT (SLP_TREE_CHILDREN (rhs_nodes[j])[0])++;
+ for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (rhs_nodes[j]).length (); ++k)
+ {
+ /* ??? We should populate SLP_TREE_SCALAR_STMTS
+ or SLP_TREE_SCALAR_OPS but then we might have
+ a mix of both in our children. */
+ SLP_TREE_LANE_PERMUTATION (perm)
+ .quick_push (std::make_pair (j, k));
+ }
+ vect_free_slp_tree (rhs_nodes[j]);
+ }
+
+ /* ??? Now we have a single permute node but when that's
+ fed more than two inputs it's prone to hit the limitation
+ on at most two sources for a VEC_PERM_EXPR. Ideally
+ we'd defer the following to the optimize-slp pass but
+ for now split it here.
+ ??? Optimally we'd produce permute nodes feeding in
+ the same number of lanes from each input and also have
+ the same vector type (only the width will eventually
+ differ here), for now just do "something". */
+ while (SLP_TREE_CHILDREN (perm).length () > 2)
+ {
+ slp_tree b = SLP_TREE_CHILDREN (perm).pop ();
+ slp_tree a = SLP_TREE_CHILDREN (perm).pop ();
+ unsigned n = SLP_TREE_LANES (a) + SLP_TREE_LANES (b);
+ slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
+ SLP_TREE_LANES (permab) = n;
+ SLP_TREE_LANE_PERMUTATION (permab).create (n);
+ SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm); /* ??? */
+ /* ??? We should set this NULL but that's not expected. */
+ SLP_TREE_REPRESENTATIVE (permab)
+ = SLP_TREE_REPRESENTATIVE (perm);
+ SLP_TREE_CHILDREN (permab).quick_push (a);
+ for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
+ SLP_TREE_LANE_PERMUTATION (permab)
+ .quick_push (std::make_pair (0, k));
+ SLP_TREE_CHILDREN (permab).quick_push (b);
+ for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
+ SLP_TREE_LANE_PERMUTATION (permab)
+ .quick_push (std::make_pair (1, k));
+ /* ??? Popluate SLP_TREE_SCALAR_STMTS/OPS of permab. */
+ SLP_TREE_CHILDREN (perm).quick_push (permab);
+ for (unsigned k = group_size - n; k < group_size; ++k)
+ SLP_TREE_LANE_PERMUTATION (perm)[k]
+ = std::make_pair (SLP_TREE_CHILDREN (perm).length () - 1,
+ k - (group_size - n));
+ }
+
+ /* Create a new SLP instance. */
+ slp_instance new_instance = XNEW (class _slp_instance);
+ SLP_INSTANCE_TREE (new_instance) = node;
+ SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
+ SLP_INSTANCE_LOADS (new_instance) = vNULL;
+ SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
+ SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
+ SLP_INSTANCE_KIND (new_instance) = kind;
+ new_instance->reduc_phis = NULL;
+ new_instance->cost_vec = vNULL;
+ new_instance->subgraph_entries = vNULL;
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "SLP size %u vs. limit %u.\n",
+ tree_size, max_tree_size);
+
+ vinfo->slp_instances.safe_push (new_instance);
+
+ /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
+ the number of scalar stmts in the root in a few places.
+ Verify that assumption holds. */
+ gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
+ .length () == group_size);
+
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Final SLP tree for instance %p:\n",
+ (void *) new_instance);
+ vect_print_slp_graph (MSG_NOTE, vect_location,
+ SLP_INSTANCE_TREE (new_instance));
+ }
+ return true;
+ }
+ else
+ {
+
+ /* Free the allocated memory. */
+ scalar_stmts.release ();
+
stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
group1_size);
/* Loop vectorization cannot handle gaps in stores, make sure
@@ -3532,11 +3693,15 @@ vect_build_slp_instance (vec_info *vinfo,
rest, kind, max_tree_size, limit);
return res;
+ }
}
/* Even though the first vector did not all match, we might be able to SLP
(some) of the remainder. FORNOW ignore this possibility. */
}
+ else
+ /* Free the allocated memory. */
+ scalar_stmts.release ();
/* Failed to SLP. */
if (dump_enabled_p ())
^ permalink raw reply [flat|nested] 6+ messages in thread
* [gcc(refs/users/rguenth/heads/vect-force-slp)] Avoid splitting store dataref groups during SLP discovery
@ 2023-11-09 13:03 Richard Biener
0 siblings, 0 replies; 6+ messages in thread
From: Richard Biener @ 2023-11-09 13:03 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:fdbab83136579a47e7703e36850714e19c291939
commit fdbab83136579a47e7703e36850714e19c291939
Author: Richard Biener <rguenther@suse.de>
Date: Fri Sep 29 13:13:16 2023 +0200
Avoid splitting store dataref groups during SLP discovery
The following avoids splitting store dataref groups during SLP
discovery but instead forces (eventually single-lane) consecutive
lane SLP discovery for all lanes of the group, creating a VEC_PERM
SLP node merging them so the store will always cover the whole group.
I figured the patched function needs some refactoring so this is
in draft state indenting-wise. With this for example
int x[1024], y[1024], z[1024], w[1024];
void foo (void)
{
for (int i = 0; i < 256; i++)
{
x[4*i+0] = y[2*i+0];
x[4*i+1] = y[2*i+1];
x[4*i+2] = z[i];
x[4*i+3] = w[i];
}
}
which was previously using hybrid SLP can now be fully SLPed and
SSE code generated looks better (but of course you never know,
I didn't actually benchmark). We of course need a VF of four here.
.L2:
movdqa z(%rax), %xmm0
movdqa w(%rax), %xmm4
movdqa y(%rax,%rax), %xmm2
movdqa y+16(%rax,%rax), %xmm1
movdqa %xmm0, %xmm3
punpckhdq %xmm4, %xmm0
punpckldq %xmm4, %xmm3
movdqa %xmm2, %xmm4
shufps $238, %xmm3, %xmm2
movaps %xmm2, x+16(,%rax,4)
movdqa %xmm1, %xmm2
shufps $68, %xmm3, %xmm4
shufps $68, %xmm0, %xmm2
movaps %xmm4, x(,%rax,4)
shufps $238, %xmm0, %xmm1
movaps %xmm2, x+32(,%rax,4)
movaps %xmm1, x+48(,%rax,4)
addq $16, %rax
cmpq $1024, %rax
jne .L2
The extra permute nodes unfortunately sometimes do not behave
nicely wrt vect_is_simple_use since when the source is an
invariant or external there's no def stmt we can fake as
representative, but vect_is_simple_use eventually hands the
caller the scalar operand and its definition. One might
argue using SLP_TREE_OPS and getting an external def would
maybe be more to the point, also since permute optimization
could change whether or not that appears.
* tree-vect-slp.cc (vect_build_slp_instance): Do not split
dataref groups on discovery failure.
Diff:
---
gcc/tree-vect-slp.cc | 171 ++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 168 insertions(+), 3 deletions(-)
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index b3a2ec0ab87..47e216258d6 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -3425,8 +3425,6 @@ vect_build_slp_instance (vec_info *vinfo,
else
{
/* Failed to SLP. */
- /* Free the allocated memory. */
- scalar_stmts.release ();
}
stmt_vec_info stmt_info = stmt_info_;
@@ -3445,6 +3443,8 @@ vect_build_slp_instance (vec_info *vinfo,
if (is_a <bb_vec_info> (vinfo)
&& (i > 1 && i < group_size))
{
+ /* Free the allocated memory. */
+ scalar_stmts.release ();
tree scalar_type
= TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
@@ -3491,7 +3491,10 @@ vect_build_slp_instance (vec_info *vinfo,
/* For loop vectorization split into arbitrary pieces of size > 1. */
if (is_a <loop_vec_info> (vinfo)
- && (i > 1 && i < group_size)
+ && ((i > 1 && i < group_size)
+ /* For single-lane SLP when only the first lane didn't fail
+ also split to single-lanes. */
+ || (i > 0 && i < group_size && param_vect_single_lane_slp != 0))
&& !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
{
unsigned group1_size = i;
@@ -3500,6 +3503,164 @@ vect_build_slp_instance (vec_info *vinfo,
dump_printf_loc (MSG_NOTE, vect_location,
"Splitting SLP group at stmt %u\n", i);
+ if (param_vect_single_lane_slp != 0)
+ {
+ /* Analyze the stored values and pinch them together with
+ a permute node so we can preserve the whole store group. */
+ auto_vec<slp_tree> rhs_nodes;
+
+ /* Calculate the unrolling factor based on the smallest type. */
+ poly_uint64 unrolling_factor = 1;
+
+ unsigned int start = 0, end = i;
+ while (start < group_size)
+ {
+ gcc_assert (end - start >= 1);
+ vec<stmt_vec_info> substmts;
+ substmts.create (end - start);
+ for (unsigned j = start; j < end; ++j)
+ substmts.quick_push (scalar_stmts[j]);
+ max_nunits = 1;
+ node = vect_build_slp_tree (vinfo, substmts, end - start,
+ &max_nunits,
+ matches, limit, &tree_size, bst_map);
+ if (node)
+ {
+ /* ??? Possibly not safe, but not sure how to check
+ and fail SLP build? */
+ unrolling_factor = force_common_multiple (unrolling_factor,
+ calculate_unrolling_factor
+ (max_nunits, end - start));
+ rhs_nodes.safe_push (node);
+ start = end;
+ end = group_size;
+ }
+ else
+ {
+ substmts.release ();
+ gcc_assert (end - start != 1);
+ /* ??? It really happens that we soft-fail SLP
+ build at a mismatch but the matching part hard-fails
+ later. As we know we arrived here with a group
+ larger than one try a group of size one! */
+ if (!matches[0])
+ end = start + 1;
+ else
+ for (unsigned j = start; j < end; j++)
+ if (!matches[j - start])
+ {
+ end = j;
+ break;
+ }
+ }
+ }
+ /* Now we assume we can build the root SLP node from all
+ stores. */
+ node = vect_create_new_slp_node (scalar_stmts, 1);
+ SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
+ slp_tree perm = vect_create_new_slp_node (rhs_nodes.length (),
+ VEC_PERM_EXPR);
+ SLP_TREE_CHILDREN (node).quick_push (perm);
+ SLP_TREE_LANE_PERMUTATION (perm).create (group_size);
+ SLP_TREE_VECTYPE (perm) = SLP_TREE_VECTYPE (node);
+ SLP_TREE_LANES (perm) = group_size;
+ /* ??? We should set this NULL but that's not expected. */
+ SLP_TREE_REPRESENTATIVE (perm)
+ = SLP_TREE_REPRESENTATIVE (SLP_TREE_CHILDREN (rhs_nodes[0])[0]);
+ for (unsigned j = 0; j < rhs_nodes.length (); ++j)
+ {
+ SLP_TREE_CHILDREN (perm)
+ .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
+ SLP_TREE_REF_COUNT (SLP_TREE_CHILDREN (rhs_nodes[j])[0])++;
+ for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (rhs_nodes[j]).length (); ++k)
+ {
+ /* ??? We should populate SLP_TREE_SCALAR_STMTS
+ or SLP_TREE_SCALAR_OPS but then we might have
+ a mix of both in our children. */
+ SLP_TREE_LANE_PERMUTATION (perm)
+ .quick_push (std::make_pair (j, k));
+ }
+ vect_free_slp_tree (rhs_nodes[j]);
+ }
+
+ /* ??? Now we have a single permute node but when that's
+ fed more than two inputs it's prone to hit the limitation
+ on at most two sources for a VEC_PERM_EXPR. Ideally
+ we'd defer the following to the optimize-slp pass but
+ for now split it here.
+ ??? Optimally we'd produce permute nodes feeding in
+ the same number of lanes from each input and also have
+ the same vector type (only the width will eventually
+ differ here), for now just do "something". */
+ while (SLP_TREE_CHILDREN (perm).length () > 2)
+ {
+ slp_tree b = SLP_TREE_CHILDREN (perm).pop ();
+ slp_tree a = SLP_TREE_CHILDREN (perm).pop ();
+ unsigned n = SLP_TREE_LANES (a) + SLP_TREE_LANES (b);
+ slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
+ SLP_TREE_LANES (permab) = n;
+ SLP_TREE_LANE_PERMUTATION (permab).create (n);
+ SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm); /* ??? */
+ /* ??? We should set this NULL but that's not expected. */
+ SLP_TREE_REPRESENTATIVE (permab)
+ = SLP_TREE_REPRESENTATIVE (perm);
+ SLP_TREE_CHILDREN (permab).quick_push (a);
+ for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
+ SLP_TREE_LANE_PERMUTATION (permab)
+ .quick_push (std::make_pair (0, k));
+ SLP_TREE_CHILDREN (permab).quick_push (b);
+ for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
+ SLP_TREE_LANE_PERMUTATION (permab)
+ .quick_push (std::make_pair (1, k));
+ /* ??? Popluate SLP_TREE_SCALAR_STMTS/OPS of permab. */
+ SLP_TREE_CHILDREN (perm).quick_push (permab);
+ for (unsigned k = group_size - n; k < group_size; ++k)
+ SLP_TREE_LANE_PERMUTATION (perm)[k]
+ = std::make_pair (SLP_TREE_CHILDREN (perm).length () - 1,
+ k - (group_size - n));
+ }
+
+ /* Create a new SLP instance. */
+ slp_instance new_instance = XNEW (class _slp_instance);
+ SLP_INSTANCE_TREE (new_instance) = node;
+ SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
+ SLP_INSTANCE_LOADS (new_instance) = vNULL;
+ SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
+ SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
+ SLP_INSTANCE_KIND (new_instance) = kind;
+ new_instance->reduc_phis = NULL;
+ new_instance->cost_vec = vNULL;
+ new_instance->subgraph_entries = vNULL;
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "SLP size %u vs. limit %u.\n",
+ tree_size, max_tree_size);
+
+ vinfo->slp_instances.safe_push (new_instance);
+
+ /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
+ the number of scalar stmts in the root in a few places.
+ Verify that assumption holds. */
+ gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
+ .length () == group_size);
+
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Final SLP tree for instance %p:\n",
+ (void *) new_instance);
+ vect_print_slp_graph (MSG_NOTE, vect_location,
+ SLP_INSTANCE_TREE (new_instance));
+ }
+ return true;
+ }
+ else
+ {
+
+ /* Free the allocated memory. */
+ scalar_stmts.release ();
+
stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
group1_size);
/* Loop vectorization cannot handle gaps in stores, make sure
@@ -3516,11 +3677,15 @@ vect_build_slp_instance (vec_info *vinfo,
rest, kind, max_tree_size, limit);
return res;
+ }
}
/* Even though the first vector did not all match, we might be able to SLP
(some) of the remainder. FORNOW ignore this possibility. */
}
+ else
+ /* Free the allocated memory. */
+ scalar_stmts.release ();
/* Failed to SLP. */
if (dump_enabled_p ())
^ permalink raw reply [flat|nested] 6+ messages in thread
* [gcc(refs/users/rguenth/heads/vect-force-slp)] Avoid splitting store dataref groups during SLP discovery
@ 2023-11-02 13:59 Richard Biener
0 siblings, 0 replies; 6+ messages in thread
From: Richard Biener @ 2023-11-02 13:59 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:50f5fc2a8ceed0ebee78e290111a936ca101193b
commit 50f5fc2a8ceed0ebee78e290111a936ca101193b
Author: Richard Biener <rguenther@suse.de>
Date: Fri Sep 29 13:13:16 2023 +0200
Avoid splitting store dataref groups during SLP discovery
The following avoids splitting store dataref groups during SLP
discovery but instead forces (eventually single-lane) consecutive
lane SLP discovery for all lanes of the group, creating a VEC_PERM
SLP node merging them so the store will always cover the whole group.
I figured the patched function needs some refactoring so this is
in draft state indenting-wise. With this for example
int x[1024], y[1024], z[1024], w[1024];
void foo (void)
{
for (int i = 0; i < 256; i++)
{
x[4*i+0] = y[2*i+0];
x[4*i+1] = y[2*i+1];
x[4*i+2] = z[i];
x[4*i+3] = w[i];
}
}
which was previously using hybrid SLP can now be fully SLPed and
SSE code generated looks better (but of course you never know,
I didn't actually benchmark). We of course need a VF of four here.
.L2:
movdqa z(%rax), %xmm0
movdqa w(%rax), %xmm4
movdqa y(%rax,%rax), %xmm2
movdqa y+16(%rax,%rax), %xmm1
movdqa %xmm0, %xmm3
punpckhdq %xmm4, %xmm0
punpckldq %xmm4, %xmm3
movdqa %xmm2, %xmm4
shufps $238, %xmm3, %xmm2
movaps %xmm2, x+16(,%rax,4)
movdqa %xmm1, %xmm2
shufps $68, %xmm3, %xmm4
shufps $68, %xmm0, %xmm2
movaps %xmm4, x(,%rax,4)
shufps $238, %xmm0, %xmm1
movaps %xmm2, x+32(,%rax,4)
movaps %xmm1, x+48(,%rax,4)
addq $16, %rax
cmpq $1024, %rax
jne .L2
The extra permute nodes unfortunately sometimes do not behave
nicely wrt vect_is_simple_use since when the source is an
invariant or external there's no def stmt we can fake as
representative, but vect_is_simple_use eventually hands the
caller the scalar operand and its definition. One might
argue using SLP_TREE_OPS and getting an external def would
maybe be more to the point, also since permute optimization
could change whether or not that appears.
* tree-vect-slp.cc (vect_build_slp_instance): Do not split
dataref groups on discovery failure.
Diff:
---
gcc/tree-vect-slp.cc | 171 ++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 168 insertions(+), 3 deletions(-)
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index acc5c3865f9d..fd556fe17b83 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -3425,8 +3425,6 @@ vect_build_slp_instance (vec_info *vinfo,
else
{
/* Failed to SLP. */
- /* Free the allocated memory. */
- scalar_stmts.release ();
}
stmt_vec_info stmt_info = stmt_info_;
@@ -3445,6 +3443,8 @@ vect_build_slp_instance (vec_info *vinfo,
if (is_a <bb_vec_info> (vinfo)
&& (i > 1 && i < group_size))
{
+ /* Free the allocated memory. */
+ scalar_stmts.release ();
tree scalar_type
= TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
@@ -3491,7 +3491,10 @@ vect_build_slp_instance (vec_info *vinfo,
/* For loop vectorization split into arbitrary pieces of size > 1. */
if (is_a <loop_vec_info> (vinfo)
- && (i > 1 && i < group_size)
+ && ((i > 1 && i < group_size)
+ /* For single-lane SLP when only the first lane didn't fail
+ also split to single-lanes. */
+ || (i > 0 && i < group_size && param_vect_single_lane_slp != 0))
&& !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
{
unsigned group1_size = i;
@@ -3500,6 +3503,164 @@ vect_build_slp_instance (vec_info *vinfo,
dump_printf_loc (MSG_NOTE, vect_location,
"Splitting SLP group at stmt %u\n", i);
+ if (param_vect_single_lane_slp != 0)
+ {
+ /* Analyze the stored values and pinch them together with
+ a permute node so we can preserve the whole store group. */
+ auto_vec<slp_tree> rhs_nodes;
+
+ /* Calculate the unrolling factor based on the smallest type. */
+ poly_uint64 unrolling_factor = 1;
+
+ unsigned int start = 0, end = i;
+ while (start < group_size)
+ {
+ gcc_assert (end - start >= 1);
+ vec<stmt_vec_info> substmts;
+ substmts.create (end - start);
+ for (unsigned j = start; j < end; ++j)
+ substmts.quick_push (scalar_stmts[j]);
+ max_nunits = 1;
+ node = vect_build_slp_tree (vinfo, substmts, end - start,
+ &max_nunits,
+ matches, limit, &tree_size, bst_map);
+ if (node)
+ {
+ /* ??? Possibly not safe, but not sure how to check
+ and fail SLP build? */
+ unrolling_factor = force_common_multiple (unrolling_factor,
+ calculate_unrolling_factor
+ (max_nunits, end - start));
+ rhs_nodes.safe_push (node);
+ start = end;
+ end = group_size;
+ }
+ else
+ {
+ substmts.release ();
+ gcc_assert (end - start != 1);
+ /* ??? It really happens that we soft-fail SLP
+ build at a mismatch but the matching part hard-fails
+ later. As we know we arrived here with a group
+ larger than one try a group of size one! */
+ if (!matches[0])
+ end = start + 1;
+ else
+ for (unsigned j = start; j < end; j++)
+ if (!matches[j - start])
+ {
+ end = j;
+ break;
+ }
+ }
+ }
+ /* Now we assume we can build the root SLP node from all
+ stores. */
+ node = vect_create_new_slp_node (scalar_stmts, 1);
+ SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
+ slp_tree perm = vect_create_new_slp_node (rhs_nodes.length (),
+ VEC_PERM_EXPR);
+ SLP_TREE_CHILDREN (node).quick_push (perm);
+ SLP_TREE_LANE_PERMUTATION (perm).create (group_size);
+ SLP_TREE_VECTYPE (perm) = SLP_TREE_VECTYPE (node);
+ SLP_TREE_LANES (perm) = group_size;
+ /* ??? We should set this NULL but that's not expected. */
+ SLP_TREE_REPRESENTATIVE (perm)
+ = SLP_TREE_REPRESENTATIVE (SLP_TREE_CHILDREN (rhs_nodes[0])[0]);
+ for (unsigned j = 0; j < rhs_nodes.length (); ++j)
+ {
+ SLP_TREE_CHILDREN (perm)
+ .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
+ SLP_TREE_REF_COUNT (SLP_TREE_CHILDREN (rhs_nodes[j])[0])++;
+ for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (rhs_nodes[j]).length (); ++k)
+ {
+ /* ??? We should populate SLP_TREE_SCALAR_STMTS
+ or SLP_TREE_SCALAR_OPS but then we might have
+ a mix of both in our children. */
+ SLP_TREE_LANE_PERMUTATION (perm)
+ .quick_push (std::make_pair (j, k));
+ }
+ vect_free_slp_tree (rhs_nodes[j]);
+ }
+
+ /* ??? Now we have a single permute node but when that's
+ fed more than two inputs it's prone to hit the limitation
+ on at most two sources for a VEC_PERM_EXPR. Ideally
+ we'd defer the following to the optimize-slp pass but
+ for now split it here.
+ ??? Optimally we'd produce permute nodes feeding in
+ the same number of lanes from each input and also have
+ the same vector type (only the width will eventually
+ differ here), for now just do "something". */
+ while (SLP_TREE_CHILDREN (perm).length () > 2)
+ {
+ slp_tree b = SLP_TREE_CHILDREN (perm).pop ();
+ slp_tree a = SLP_TREE_CHILDREN (perm).pop ();
+ unsigned n = SLP_TREE_LANES (a) + SLP_TREE_LANES (b);
+ slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
+ SLP_TREE_LANES (permab) = n;
+ SLP_TREE_LANE_PERMUTATION (permab).create (n);
+ SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm); /* ??? */
+ /* ??? We should set this NULL but that's not expected. */
+ SLP_TREE_REPRESENTATIVE (permab)
+ = SLP_TREE_REPRESENTATIVE (perm);
+ SLP_TREE_CHILDREN (permab).quick_push (a);
+ for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
+ SLP_TREE_LANE_PERMUTATION (permab)
+ .quick_push (std::make_pair (0, k));
+ SLP_TREE_CHILDREN (permab).quick_push (b);
+ for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
+ SLP_TREE_LANE_PERMUTATION (permab)
+ .quick_push (std::make_pair (1, k));
+ /* ??? Populate SLP_TREE_SCALAR_STMTS/OPS of permab. */
+ SLP_TREE_CHILDREN (perm).quick_push (permab);
+ for (unsigned k = group_size - n; k < group_size; ++k)
+ SLP_TREE_LANE_PERMUTATION (perm)[k]
+ = std::make_pair (SLP_TREE_CHILDREN (perm).length () - 1,
+ k - (group_size - n));
+ }
+
+ /* Create a new SLP instance. */
+ slp_instance new_instance = XNEW (class _slp_instance);
+ SLP_INSTANCE_TREE (new_instance) = node;
+ SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
+ SLP_INSTANCE_LOADS (new_instance) = vNULL;
+ SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
+ SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
+ SLP_INSTANCE_KIND (new_instance) = kind;
+ new_instance->reduc_phis = NULL;
+ new_instance->cost_vec = vNULL;
+ new_instance->subgraph_entries = vNULL;
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "SLP size %u vs. limit %u.\n",
+ tree_size, max_tree_size);
+
+ vinfo->slp_instances.safe_push (new_instance);
+
+ /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
+ the number of scalar stmts in the root in a few places.
+ Verify that assumption holds. */
+ gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
+ .length () == group_size);
+
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Final SLP tree for instance %p:\n",
+ (void *) new_instance);
+ vect_print_slp_graph (MSG_NOTE, vect_location,
+ SLP_INSTANCE_TREE (new_instance));
+ }
+ return true;
+ }
+ else
+ {
+
+ /* Free the allocated memory. */
+ scalar_stmts.release ();
+
stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
group1_size);
/* Loop vectorization cannot handle gaps in stores, make sure
@@ -3516,11 +3677,15 @@ vect_build_slp_instance (vec_info *vinfo,
rest, kind, max_tree_size, limit);
return res;
+ }
}
/* Even though the first vector did not all match, we might be able to SLP
(some) of the remainder. FORNOW ignore this possibility. */
}
+ else
+ /* Free the allocated memory. */
+ scalar_stmts.release ();
/* Failed to SLP. */
if (dump_enabled_p ())
^ permalink raw reply [flat|nested] 6+ messages in thread
* [gcc(refs/users/rguenth/heads/vect-force-slp)] Avoid splitting store dataref groups during SLP discovery
@ 2023-10-16 12:50 Richard Biener
0 siblings, 0 replies; 6+ messages in thread
From: Richard Biener @ 2023-10-16 12:50 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:2f4a092228dcb79966f5aea8a192f8b6a6ed6979
commit 2f4a092228dcb79966f5aea8a192f8b6a6ed6979
Author: Richard Biener <rguenther@suse.de>
Date: Fri Sep 29 13:13:16 2023 +0200
Avoid splitting store dataref groups during SLP discovery
The following avoids splitting store dataref groups during SLP
discovery but instead forces (eventually single-lane) consecutive
lane SLP discovery for all lanes of the group, creating a VEC_PERM
SLP node merging them so the store will always cover the whole group.
I figured the patched function needs some refactoring so this is
in draft state indenting-wise. With this for example
int x[1024], y[1024], z[1024], w[1024];
void foo (void)
{
for (int i = 0; i < 256; i++)
{
x[4*i+0] = y[2*i+0];
x[4*i+1] = y[2*i+1];
x[4*i+2] = z[i];
x[4*i+3] = w[i];
}
}
which was previously using hybrid SLP can now be fully SLPed and
SSE code generated looks better (but of course you never know,
I didn't actually benchmark). We of course need a VF of four here.
.L2:
movdqa z(%rax), %xmm0
movdqa w(%rax), %xmm4
movdqa y(%rax,%rax), %xmm2
movdqa y+16(%rax,%rax), %xmm1
movdqa %xmm0, %xmm3
punpckhdq %xmm4, %xmm0
punpckldq %xmm4, %xmm3
movdqa %xmm2, %xmm4
shufps $238, %xmm3, %xmm2
movaps %xmm2, x+16(,%rax,4)
movdqa %xmm1, %xmm2
shufps $68, %xmm3, %xmm4
shufps $68, %xmm0, %xmm2
movaps %xmm4, x(,%rax,4)
shufps $238, %xmm0, %xmm1
movaps %xmm2, x+32(,%rax,4)
movaps %xmm1, x+48(,%rax,4)
addq $16, %rax
cmpq $1024, %rax
jne .L2
The extra permute nodes unfortunately sometimes do not behave
nicely wrt vect_is_simple_use since when the source is an
invariant or external there's no def stmt we can fake as
representative but vect_is_simple_use eventually gets the
caller the scalar operand and its definition. One might
argue using SLP_TREE_OPS and getting an external def would
maybe be more to the point, also since permute optimization
could change whether or not that appears.
* tree-vect-slp.cc (vect_build_slp_instance): Do not split
dataref groups on discovery failure.
Diff:
---
gcc/tree-vect-slp.cc | 171 ++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 168 insertions(+), 3 deletions(-)
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 97bd2f3feea2..42a66992a633 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -3349,8 +3349,6 @@ vect_build_slp_instance (vec_info *vinfo,
else
{
/* Failed to SLP. */
- /* Free the allocated memory. */
- scalar_stmts.release ();
}
stmt_vec_info stmt_info = stmt_info_;
@@ -3369,6 +3367,8 @@ vect_build_slp_instance (vec_info *vinfo,
if (is_a <bb_vec_info> (vinfo)
&& (i > 1 && i < group_size))
{
+ /* Free the allocated memory. */
+ scalar_stmts.release ();
tree scalar_type
= TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
@@ -3415,7 +3415,10 @@ vect_build_slp_instance (vec_info *vinfo,
/* For loop vectorization split into arbitrary pieces of size > 1. */
if (is_a <loop_vec_info> (vinfo)
- && (i > 1 && i < group_size)
+ && ((i > 1 && i < group_size)
+ /* For single-lane SLP when only the first lane didn't fail
+ also split to single-lanes. */
+ || (i > 0 && i < group_size && param_vect_single_lane_slp != 0))
&& !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
{
unsigned group1_size = i;
@@ -3424,6 +3427,164 @@ vect_build_slp_instance (vec_info *vinfo,
dump_printf_loc (MSG_NOTE, vect_location,
"Splitting SLP group at stmt %u\n", i);
+ if (param_vect_single_lane_slp != 0)
+ {
+ /* Analyze the stored values and pinch them together with
+ a permute node so we can preserve the whole store group. */
+ auto_vec<slp_tree> rhs_nodes;
+
+ /* Calculate the unrolling factor based on the smallest type. */
+ poly_uint64 unrolling_factor = 1;
+
+ unsigned int start = 0, end = i;
+ while (start < group_size)
+ {
+ gcc_assert (end - start >= 1);
+ vec<stmt_vec_info> substmts;
+ substmts.create (end - start);
+ for (unsigned j = start; j < end; ++j)
+ substmts.quick_push (scalar_stmts[j]);
+ max_nunits = 1;
+ node = vect_build_slp_tree (vinfo, substmts, end - start,
+ &max_nunits,
+ matches, limit, &tree_size, bst_map);
+ if (node)
+ {
+ /* ??? Possibly not safe, but not sure how to check
+ and fail SLP build? */
+ unrolling_factor = force_common_multiple (unrolling_factor,
+ calculate_unrolling_factor
+ (max_nunits, end - start));
+ rhs_nodes.safe_push (node);
+ start = end;
+ end = group_size;
+ }
+ else
+ {
+ substmts.release ();
+ gcc_assert (end - start != 1);
+ /* ??? It really happens that we soft-fail SLP
+ build at a mismatch but the matching part hard-fails
+ later. As we know we arrived here with a group
+ larger than one try a group of size one! */
+ if (!matches[0])
+ end = start + 1;
+ else
+ for (unsigned j = start; j < end; j++)
+ if (!matches[j - start])
+ {
+ end = j;
+ break;
+ }
+ }
+ }
+ /* Now we assume we can build the root SLP node from all
+ stores. */
+ node = vect_create_new_slp_node (scalar_stmts, 1);
+ SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
+ slp_tree perm = vect_create_new_slp_node (rhs_nodes.length (),
+ VEC_PERM_EXPR);
+ SLP_TREE_CHILDREN (node).quick_push (perm);
+ SLP_TREE_LANE_PERMUTATION (perm).create (group_size);
+ SLP_TREE_VECTYPE (perm) = SLP_TREE_VECTYPE (node);
+ SLP_TREE_LANES (perm) = group_size;
+ /* ??? We should set this NULL but that's not expected. */
+ SLP_TREE_REPRESENTATIVE (perm)
+ = SLP_TREE_REPRESENTATIVE (SLP_TREE_CHILDREN (rhs_nodes[0])[0]);
+ for (unsigned j = 0; j < rhs_nodes.length (); ++j)
+ {
+ SLP_TREE_CHILDREN (perm)
+ .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
+ SLP_TREE_REF_COUNT (SLP_TREE_CHILDREN (rhs_nodes[j])[0])++;
+ for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (rhs_nodes[j]).length (); ++k)
+ {
+ /* ??? We should populate SLP_TREE_SCALAR_STMTS
+ or SLP_TREE_SCALAR_OPS but then we might have
+ a mix of both in our children. */
+ SLP_TREE_LANE_PERMUTATION (perm)
+ .quick_push (std::make_pair (j, k));
+ }
+ vect_free_slp_tree (rhs_nodes[j]);
+ }
+
+ /* ??? Now we have a single permute node but when that's
+ fed more than two inputs it's prone to hit the limitation
+ on at most two sources for a VEC_PERM_EXPR. Ideally
+ we'd defer the following to the optimize-slp pass but
+ for now split it here.
+ ??? Optimally we'd produce permute nodes feeding in
+ the same number of lanes from each input and also have
+ the same vector type (only the width will eventually
+ differ here), for now just do "something". */
+ while (SLP_TREE_CHILDREN (perm).length () > 2)
+ {
+ slp_tree b = SLP_TREE_CHILDREN (perm).pop ();
+ slp_tree a = SLP_TREE_CHILDREN (perm).pop ();
+ unsigned n = SLP_TREE_LANES (a) + SLP_TREE_LANES (b);
+ slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
+ SLP_TREE_LANES (permab) = n;
+ SLP_TREE_LANE_PERMUTATION (permab).create (n);
+ SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm); /* ??? */
+ /* ??? We should set this NULL but that's not expected. */
+ SLP_TREE_REPRESENTATIVE (permab)
+ = SLP_TREE_REPRESENTATIVE (perm);
+ SLP_TREE_CHILDREN (permab).quick_push (a);
+ for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
+ SLP_TREE_LANE_PERMUTATION (permab)
+ .quick_push (std::make_pair (0, k));
+ SLP_TREE_CHILDREN (permab).quick_push (b);
+ for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
+ SLP_TREE_LANE_PERMUTATION (permab)
+ .quick_push (std::make_pair (1, k));
+ /* ??? Populate SLP_TREE_SCALAR_STMTS/OPS of permab. */
+ SLP_TREE_CHILDREN (perm).quick_push (permab);
+ for (unsigned k = group_size - n; k < group_size; ++k)
+ SLP_TREE_LANE_PERMUTATION (perm)[k]
+ = std::make_pair (SLP_TREE_CHILDREN (perm).length () - 1,
+ k - (group_size - n));
+ }
+
+ /* Create a new SLP instance. */
+ slp_instance new_instance = XNEW (class _slp_instance);
+ SLP_INSTANCE_TREE (new_instance) = node;
+ SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
+ SLP_INSTANCE_LOADS (new_instance) = vNULL;
+ SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
+ SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
+ SLP_INSTANCE_KIND (new_instance) = kind;
+ new_instance->reduc_phis = NULL;
+ new_instance->cost_vec = vNULL;
+ new_instance->subgraph_entries = vNULL;
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "SLP size %u vs. limit %u.\n",
+ tree_size, max_tree_size);
+
+ vinfo->slp_instances.safe_push (new_instance);
+
+ /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
+ the number of scalar stmts in the root in a few places.
+ Verify that assumption holds. */
+ gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
+ .length () == group_size);
+
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Final SLP tree for instance %p:\n",
+ (void *) new_instance);
+ vect_print_slp_graph (MSG_NOTE, vect_location,
+ SLP_INSTANCE_TREE (new_instance));
+ }
+ return true;
+ }
+ else
+ {
+
+ /* Free the allocated memory. */
+ scalar_stmts.release ();
+
stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
group1_size);
/* Loop vectorization cannot handle gaps in stores, make sure
@@ -3440,11 +3601,15 @@ vect_build_slp_instance (vec_info *vinfo,
rest, kind, max_tree_size, limit);
return res;
+ }
}
/* Even though the first vector did not all match, we might be able to SLP
(some) of the remainder. FORNOW ignore this possibility. */
}
+ else
+ /* Free the allocated memory. */
+ scalar_stmts.release ();
/* Failed to SLP. */
if (dump_enabled_p ())
^ permalink raw reply [flat|nested] 6+ messages in thread
* [gcc(refs/users/rguenth/heads/vect-force-slp)] Avoid splitting store dataref groups during SLP discovery
@ 2023-10-06 7:07 Richard Biener
0 siblings, 0 replies; 6+ messages in thread
From: Richard Biener @ 2023-10-06 7:07 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:d4d841cb500924177f261c987993b3968fce9391
commit d4d841cb500924177f261c987993b3968fce9391
Author: Richard Biener <rguenther@suse.de>
Date: Fri Sep 29 13:13:16 2023 +0200
Avoid splitting store dataref groups during SLP discovery
The following avoids splitting store dataref groups during SLP
discovery but instead forces (eventually single-lane) consecutive
lane SLP discovery for all lanes of the group, creating a VEC_PERM
SLP node merging them so the store will always cover the whole group.
I figured the patched function needs some refactoring so this is
in draft state indenting-wise. With this for example
int x[1024], y[1024], z[1024], w[1024];
void foo (void)
{
for (int i = 0; i < 256; i++)
{
x[4*i+0] = y[2*i+0];
x[4*i+1] = y[2*i+1];
x[4*i+2] = z[i];
x[4*i+3] = w[i];
}
}
which was previously using hybrid SLP can now be fully SLPed and
SSE code generated looks better (but of course you never know,
I didn't actually benchmark). We of course need a VF of four here.
.L2:
movdqa z(%rax), %xmm0
movdqa w(%rax), %xmm4
movdqa y(%rax,%rax), %xmm2
movdqa y+16(%rax,%rax), %xmm1
movdqa %xmm0, %xmm3
punpckhdq %xmm4, %xmm0
punpckldq %xmm4, %xmm3
movdqa %xmm2, %xmm4
shufps $238, %xmm3, %xmm2
movaps %xmm2, x+16(,%rax,4)
movdqa %xmm1, %xmm2
shufps $68, %xmm3, %xmm4
shufps $68, %xmm0, %xmm2
movaps %xmm4, x(,%rax,4)
shufps $238, %xmm0, %xmm1
movaps %xmm2, x+32(,%rax,4)
movaps %xmm1, x+48(,%rax,4)
addq $16, %rax
cmpq $1024, %rax
jne .L2
The extra permute nodes unfortunately sometimes do not behave
nicely wrt vect_is_simple_use since when the source is an
invariant or external there's no def stmt we can fake as
representative but vect_is_simple_use eventually gets the
caller the scalar operand and its definition. One might
argue using SLP_TREE_OPS and getting an external def would
maybe be more to the point, also since permute optimization
could change whether or not that appears.
* tree-vect-slp.cc (vect_build_slp_instance): Do not split
dataref groups on discovery failure.
Diff:
---
gcc/tree-vect-slp.cc | 171 ++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 168 insertions(+), 3 deletions(-)
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 281938c6aa1..08e8418b33e 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -3331,8 +3331,6 @@ vect_build_slp_instance (vec_info *vinfo,
else
{
/* Failed to SLP. */
- /* Free the allocated memory. */
- scalar_stmts.release ();
}
stmt_vec_info stmt_info = stmt_info_;
@@ -3351,6 +3349,8 @@ vect_build_slp_instance (vec_info *vinfo,
if (is_a <bb_vec_info> (vinfo)
&& (i > 1 && i < group_size))
{
+ /* Free the allocated memory. */
+ scalar_stmts.release ();
tree scalar_type
= TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
@@ -3397,7 +3397,10 @@ vect_build_slp_instance (vec_info *vinfo,
/* For loop vectorization split into arbitrary pieces of size > 1. */
if (is_a <loop_vec_info> (vinfo)
- && (i > 1 && i < group_size)
+ && ((i > 1 && i < group_size)
+ /* For single-lane SLP when only the first lane didn't fail
+ also split to single-lanes. */
+ || (i > 0 && i < group_size && param_vect_single_lane_slp != 0))
&& !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
{
unsigned group1_size = i;
@@ -3406,6 +3409,164 @@ vect_build_slp_instance (vec_info *vinfo,
dump_printf_loc (MSG_NOTE, vect_location,
"Splitting SLP group at stmt %u\n", i);
+ if (param_vect_single_lane_slp != 0)
+ {
+ /* Analyze the stored values and pinch them together with
+ a permute node so we can preserve the whole store group. */
+ auto_vec<slp_tree> rhs_nodes;
+
+ /* Calculate the unrolling factor based on the smallest type. */
+ poly_uint64 unrolling_factor = 1;
+
+ unsigned int start = 0, end = i;
+ while (start < group_size)
+ {
+ gcc_assert (end - start >= 1);
+ vec<stmt_vec_info> substmts;
+ substmts.create (end - start);
+ for (unsigned j = start; j < end; ++j)
+ substmts.quick_push (scalar_stmts[j]);
+ max_nunits = 1;
+ node = vect_build_slp_tree (vinfo, substmts, end - start,
+ &max_nunits,
+ matches, limit, &tree_size, bst_map);
+ if (node)
+ {
+ /* ??? Possibly not safe, but not sure how to check
+ and fail SLP build? */
+ unrolling_factor = force_common_multiple (unrolling_factor,
+ calculate_unrolling_factor
+ (max_nunits, end - start));
+ rhs_nodes.safe_push (node);
+ start = end;
+ end = group_size;
+ }
+ else
+ {
+ substmts.release ();
+ gcc_assert (end - start != 1);
+ /* ??? It really happens that we soft-fail SLP
+ build at a mismatch but the matching part hard-fails
+ later. As we know we arrived here with a group
+ larger than one try a group of size one! */
+ if (!matches[0])
+ end = start + 1;
+ else
+ for (unsigned j = start; j < end; j++)
+ if (!matches[j - start])
+ {
+ end = j;
+ break;
+ }
+ }
+ }
+ /* Now we assume we can build the root SLP node from all
+ stores. */
+ node = vect_create_new_slp_node (scalar_stmts, 1);
+ SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
+ slp_tree perm = vect_create_new_slp_node (rhs_nodes.length (),
+ VEC_PERM_EXPR);
+ SLP_TREE_CHILDREN (node).quick_push (perm);
+ SLP_TREE_LANE_PERMUTATION (perm).create (group_size);
+ SLP_TREE_VECTYPE (perm) = SLP_TREE_VECTYPE (node);
+ SLP_TREE_LANES (perm) = group_size;
+ /* ??? We should set this NULL but that's not expected. */
+ SLP_TREE_REPRESENTATIVE (perm)
+ = SLP_TREE_REPRESENTATIVE (SLP_TREE_CHILDREN (rhs_nodes[0])[0]);
+ for (unsigned j = 0; j < rhs_nodes.length (); ++j)
+ {
+ SLP_TREE_CHILDREN (perm)
+ .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
+ SLP_TREE_REF_COUNT (SLP_TREE_CHILDREN (rhs_nodes[j])[0])++;
+ for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (rhs_nodes[j]).length (); ++k)
+ {
+ /* ??? We should populate SLP_TREE_SCALAR_STMTS
+ or SLP_TREE_SCALAR_OPS but then we might have
+ a mix of both in our children. */
+ SLP_TREE_LANE_PERMUTATION (perm)
+ .quick_push (std::make_pair (j, k));
+ }
+ vect_free_slp_tree (rhs_nodes[j]);
+ }
+
+ /* ??? Now we have a single permute node but when that's
+ fed more than two inputs it's prone to hit the limitation
+ on at most two sources for a VEC_PERM_EXPR. Ideally
+ we'd defer the following to the optimize-slp pass but
+ for now split it here.
+ ??? Optimally we'd produce permute nodes feeding in
+ the same number of lanes from each input and also have
+ the same vector type (only the width will eventually
+ differ here), for now just do "something". */
+ while (SLP_TREE_CHILDREN (perm).length () > 2)
+ {
+ slp_tree b = SLP_TREE_CHILDREN (perm).pop ();
+ slp_tree a = SLP_TREE_CHILDREN (perm).pop ();
+ unsigned n = SLP_TREE_LANES (a) + SLP_TREE_LANES (b);
+ slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
+ SLP_TREE_LANES (permab) = n;
+ SLP_TREE_LANE_PERMUTATION (permab).create (n);
+ SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm); /* ??? */
+ /* ??? We should set this NULL but that's not expected. */
+ SLP_TREE_REPRESENTATIVE (permab)
+ = SLP_TREE_REPRESENTATIVE (perm);
+ SLP_TREE_CHILDREN (permab).quick_push (a);
+ for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
+ SLP_TREE_LANE_PERMUTATION (permab)
+ .quick_push (std::make_pair (0, k));
+ SLP_TREE_CHILDREN (permab).quick_push (b);
+ for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
+ SLP_TREE_LANE_PERMUTATION (permab)
+ .quick_push (std::make_pair (1, k));
+ /* ??? Populate SLP_TREE_SCALAR_STMTS/OPS of permab. */
+ SLP_TREE_CHILDREN (perm).quick_push (permab);
+ for (unsigned k = group_size - n; k < group_size; ++k)
+ SLP_TREE_LANE_PERMUTATION (perm)[k]
+ = std::make_pair (SLP_TREE_CHILDREN (perm).length () - 1,
+ k - (group_size - n));
+ }
+
+ /* Create a new SLP instance. */
+ slp_instance new_instance = XNEW (class _slp_instance);
+ SLP_INSTANCE_TREE (new_instance) = node;
+ SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
+ SLP_INSTANCE_LOADS (new_instance) = vNULL;
+ SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
+ SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
+ SLP_INSTANCE_KIND (new_instance) = kind;
+ new_instance->reduc_phis = NULL;
+ new_instance->cost_vec = vNULL;
+ new_instance->subgraph_entries = vNULL;
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "SLP size %u vs. limit %u.\n",
+ tree_size, max_tree_size);
+
+ vinfo->slp_instances.safe_push (new_instance);
+
+ /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
+ the number of scalar stmts in the root in a few places.
+ Verify that assumption holds. */
+ gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
+ .length () == group_size);
+
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Final SLP tree for instance %p:\n",
+ (void *) new_instance);
+ vect_print_slp_graph (MSG_NOTE, vect_location,
+ SLP_INSTANCE_TREE (new_instance));
+ }
+ return true;
+ }
+ else
+ {
+
+ /* Free the allocated memory. */
+ scalar_stmts.release ();
+
stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
group1_size);
/* Loop vectorization cannot handle gaps in stores, make sure
@@ -3422,11 +3583,15 @@ vect_build_slp_instance (vec_info *vinfo,
rest, kind, max_tree_size, limit);
return res;
+ }
}
/* Even though the first vector did not all match, we might be able to SLP
(some) of the remainder. FORNOW ignore this possibility. */
}
+ else
+ /* Free the allocated memory. */
+ scalar_stmts.release ();
/* Failed to SLP. */
if (dump_enabled_p ())
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2024-02-23 7:31 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-10-19 13:29 [gcc(refs/users/rguenth/heads/vect-force-slp)] Avoid splitting store dataref groups during SLP discovery Richard Biener
-- strict thread matches above, loose matches on Subject: below --
2024-02-23 7:31 Richard Biener
2023-11-09 13:03 Richard Biener
2023-11-02 13:59 Richard Biener
2023-10-16 12:50 Richard Biener
2023-10-06 7:07 Richard Biener
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).