* Avoid unnecessary peeling for gaps with LD3
@ 2016-05-20 15:27 Richard Sandiford
2016-05-23 8:29 ` Richard Biener
0 siblings, 1 reply; 2+ messages in thread
From: Richard Sandiford @ 2016-05-20 15:27 UTC (permalink / raw)
To: gcc-patches
vectorizable_load forces peeling for gaps if the vectorization factor
is not a multiple of the group size, since in that case we'd normally load
beyond the original scalar accesses but drop the excess elements as part
of a following permute:
if (loop_vinfo
&& ! STMT_VINFO_STRIDED_P (stmt_info)
&& (GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0
|| (!slp && vf % GROUP_SIZE (vinfo_for_stmt (first_stmt)) != 0)))
This isn't necessary for LOAD_LANES though, since it loads only the
data needed and does the permute itself.
Tested on aarch64-linux-gnu and x86_64-linux-gnu. OK to install?
Thanks,
Richard
gcc/
* tree-vect-stmts.c (vectorizable_load): Reorder checks so that
load_lanes/grouped_load classification comes first. Don't check
whether the vectorization factor is a multiple of the group size
for load_lanes.
gcc/testsuite/
* gcc.dg/vect/vect-load-lanes-peeling-1.c: New test.
Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c
+++ gcc/tree-vect-stmts.c
@@ -6314,6 +6314,17 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
gcc_assert (!nested_in_vect_loop && !STMT_VINFO_GATHER_SCATTER_P (stmt_info));
first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
+ group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
+
+ if (!slp
+ && !PURE_SLP_STMT (stmt_info)
+ && !STMT_VINFO_STRIDED_P (stmt_info))
+ {
+ if (vect_load_lanes_supported (vectype, group_size))
+ load_lanes_p = true;
+ else if (!vect_grouped_load_supported (vectype, group_size))
+ return false;
+ }
/* If this is single-element interleaving with an element distance
that leaves unused vector loads around punt - we at least create
@@ -6341,7 +6352,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
if (loop_vinfo
&& ! STMT_VINFO_STRIDED_P (stmt_info)
&& (GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0
- || (!slp && vf % GROUP_SIZE (vinfo_for_stmt (first_stmt)) != 0)))
+ || (!slp && !load_lanes_p && vf % group_size != 0)))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -6361,8 +6372,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
slp_perm = true;
- group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
-
/* ??? The following is overly pessimistic (as well as the loop
case above) in the case we can statically determine the excess
elements loaded are within the bounds of a decl that is accessed.
@@ -6375,16 +6384,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
return false;
}
- if (!slp
- && !PURE_SLP_STMT (stmt_info)
- && !STMT_VINFO_STRIDED_P (stmt_info))
- {
- if (vect_load_lanes_supported (vectype, group_size))
- load_lanes_p = true;
- else if (!vect_grouped_load_supported (vectype, group_size))
- return false;
- }
-
/* Invalidate assumptions made by dependence analysis when vectorization
on the unrolled body effectively re-orders stmts. */
if (!PURE_SLP_STMT (stmt_info)
Index: gcc/testsuite/gcc.dg/vect/vect-load-lanes-peeling-1.c
===================================================================
--- /dev/null
+++ gcc/testsuite/gcc.dg/vect/vect-load-lanes-peeling-1.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_load_lanes } */
+
+void
+f (int *__restrict a, int *__restrict b)
+{
+ for (int i = 0; i < 96; ++i)
+ a[i] = b[i * 3] + b[i * 3 + 1] + b[i * 3 + 2];
+}
+
+/* { dg-final { scan-tree-dump-not "Data access with gaps" "vect" } } */
+/* { dg-final { scan-tree-dump-not "epilog loop required" "vect" } } */
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: Avoid unnecessary peeling for gaps with LD3
2016-05-20 15:27 Avoid unnecessary peeling for gaps with LD3 Richard Sandiford
@ 2016-05-23 8:29 ` Richard Biener
0 siblings, 0 replies; 2+ messages in thread
From: Richard Biener @ 2016-05-23 8:29 UTC (permalink / raw)
To: GCC Patches, richard.sandiford
On Fri, May 20, 2016 at 5:27 PM, Richard Sandiford
<richard.sandiford@arm.com> wrote:
> vectorizable_load forces peeling for gaps if the vectorization factor
> is not a multiple of the group size, since in that case we'd normally load
> beyond the original scalar accesses but drop the excess elements as part
> of a following permute:
>
> if (loop_vinfo
> && ! STMT_VINFO_STRIDED_P (stmt_info)
> && (GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0
> || (!slp && vf % GROUP_SIZE (vinfo_for_stmt (first_stmt)) != 0)))
>
> This isn't necessary for LOAD_LANES though, since it loads only the
> data needed and does the permute itself.
>
> Tested on aarch64-linux-gnu and x86_64-linux-gnu. OK to install?
Ok.
Thanks,
Richard.
> Thanks,
> Richard
>
>
> gcc/
> * tree-vect-stmts.c (vectorizable_load): Reorder checks so that
> load_lanes/grouped_load classification comes first. Don't check
> whether the vectorization factor is a multiple of the group size
> for load_lanes.
>
> gcc/testsuite/
> * gcc.dg/vect/vect-load-lanes-peeling-1.c: New test.
>
> Index: gcc/tree-vect-stmts.c
> ===================================================================
> --- gcc/tree-vect-stmts.c
> +++ gcc/tree-vect-stmts.c
> @@ -6314,6 +6314,17 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
> gcc_assert (!nested_in_vect_loop && !STMT_VINFO_GATHER_SCATTER_P (stmt_info));
>
> first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
> + group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
> +
> + if (!slp
> + && !PURE_SLP_STMT (stmt_info)
> + && !STMT_VINFO_STRIDED_P (stmt_info))
> + {
> + if (vect_load_lanes_supported (vectype, group_size))
> + load_lanes_p = true;
> + else if (!vect_grouped_load_supported (vectype, group_size))
> + return false;
> + }
>
> /* If this is single-element interleaving with an element distance
> that leaves unused vector loads around punt - we at least create
> @@ -6341,7 +6352,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
> if (loop_vinfo
> && ! STMT_VINFO_STRIDED_P (stmt_info)
> && (GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0
> - || (!slp && vf % GROUP_SIZE (vinfo_for_stmt (first_stmt)) != 0)))
> + || (!slp && !load_lanes_p && vf % group_size != 0)))
> {
> if (dump_enabled_p ())
> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -6361,8 +6372,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
> if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
> slp_perm = true;
>
> - group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
> -
> /* ??? The following is overly pessimistic (as well as the loop
> case above) in the case we can statically determine the excess
> elements loaded are within the bounds of a decl that is accessed.
> @@ -6375,16 +6384,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
> return false;
> }
>
> - if (!slp
> - && !PURE_SLP_STMT (stmt_info)
> - && !STMT_VINFO_STRIDED_P (stmt_info))
> - {
> - if (vect_load_lanes_supported (vectype, group_size))
> - load_lanes_p = true;
> - else if (!vect_grouped_load_supported (vectype, group_size))
> - return false;
> - }
> -
> /* Invalidate assumptions made by dependence analysis when vectorization
> on the unrolled body effectively re-orders stmts. */
> if (!PURE_SLP_STMT (stmt_info)
> Index: gcc/testsuite/gcc.dg/vect/vect-load-lanes-peeling-1.c
> ===================================================================
> --- /dev/null
> +++ gcc/testsuite/gcc.dg/vect/vect-load-lanes-peeling-1.c
> @@ -0,0 +1,13 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-require-effective-target vect_load_lanes } */
> +
> +void
> +f (int *__restrict a, int *__restrict b)
> +{
> + for (int i = 0; i < 96; ++i)
> + a[i] = b[i * 3] + b[i * 3 + 1] + b[i * 3 + 2];
> +}
> +
> +/* { dg-final { scan-tree-dump-not "Data access with gaps" "vect" } } */
> +/* { dg-final { scan-tree-dump-not "epilog loop required" "vect" } } */
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2016-05-23 8:29 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-05-20 15:27 Avoid unnecessary peeling for gaps with LD3 Richard Sandiford
2016-05-23 8:29 ` Richard Biener
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).