* [patch] [autovect] bugfix in outer-loop vectorization
@ 2007-08-01 7:22 Dorit Nuzman
0 siblings, 0 replies; only message in thread
From: Dorit Nuzman @ 2007-08-01 7:22 UTC (permalink / raw)
To: gcc-patches
[-- Attachment #1: Type: text/plain, Size: 1516 bytes --]
We can't allow vectorization of widening reduction patterns like DOT_PROD
and WIDEN_SUM when their results are used in the loop. This is because
these idioms compute N widened results and then reduce them into N/2
results by summing-up pairs of results. So if we need to use (say store)
the original N results we cannot use this idiom. (In innermost-loop
vectorization this cannot happen because a reduction cannot be used in the
loop. However, in outer-loop vectorization, a reduction in the inner-loop
can be used in the outer-loop).
Bootstrapped with vectorization enabled and tested on the vectorizer
testcases on powerpc64-linux. Committed to autovect branch.
dorit
* tree-vect-analyze.c (vect_mark_relevant): Don't mark widening
reduction patterns as relevant if they are used in the outer-loop
(we want to vectorize the original sequence instead).
(vect_mark_stmts_to_be_vectorized): Add an assert.
* tree-vect-transform.c (vect_finalize_reduction): Set a stmt_info.
* gcc.dg/vect/vect-outer-4e.c: New test.
* gcc.dg/vect/vect-outer-4f.c: New test.
* gcc.dg/vect/vect-outer-4g.c: New test.
* gcc.dg/vect/no-section-anchors-vect-outer-4h.c: New test.
* gcc.dg/vect/vect-outer-4i.c: New test.
* gcc.dg/vect/vect-outer-4j.c: New test.
* gcc.dg/vect/vect-outer-4k.c: New test.
* gcc.dg/vect/vect-outer-4l.c: New test.
* gcc.dg/vect/vect-outer-4m.c: New test.
(See attached file: autovectfix.txt)
[-- Attachment #2: autovectfix.txt --]
[-- Type: text/plain, Size: 14654 bytes --]
Index: testsuite/gcc.dg/vect/vect-outer-4g.c
===================================================================
--- testsuite/gcc.dg/vect/vect-outer-4g.c (revision 0)
+++ testsuite/gcc.dg/vect/vect-outer-4g.c (revision 0)
@@ -0,0 +1,54 @@
+/* { dg-require-effective-target vect_int } */
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 40
+#define M 128
+unsigned short in[N+M];
+unsigned int out[N];
+
+/* Outer-loop vectorization. */
+/* Not vectorized due to multiple-types in the inner-loop. */
+
+unsigned int
+foo (){
+ int i,j;
+ unsigned int diff;
+ unsigned int s=0;
+
+ for (i = 0; i < N; i++) {
+ diff = 0;
+ for (j = 0; j < M; j+=8) {
+ diff += in[j+i];
+ }
+ s+=diff;
+ }
+ return s;
+}
+
+int main (void)
+{
+ int i, j;
+ unsigned int diff;
+ unsigned int s=0,sum=0; /* s accumulates the reference sum below, so it must start at 0. */
+
+ check_vect ();
+
+ sum=foo ();
+
+ for (i = 0; i < N; i++) { /* Recompute the expected result with the same loop nest as foo. */
+ diff = 0;
+ for (j = 0; j < M; j+=8) {
+ diff += in[j+i];
+ }
+ s += diff;
+ }
+
+ if (s != sum)
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
Index: testsuite/gcc.dg/vect/vect-outer-4i.c
===================================================================
--- testsuite/gcc.dg/vect/vect-outer-4i.c (revision 0)
+++ testsuite/gcc.dg/vect/vect-outer-4i.c (revision 0)
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+
+#define N 40
+#define M 128
+unsigned char in[N+M];
+unsigned short out[N];
+
+/* Outer-loop vectorization. */
+/* Not vectorized due to multiple-types in the inner-loop. */
+
+unsigned short
+foo (){
+ int i,j;
+ unsigned short diff;
+ unsigned short s=0;
+
+ for (i = 0; i < N; i++) {
+ diff = 0;
+ for (j = 0; j < M; j+=8) {
+ diff += in[j+i];
+ }
+ s+=diff;
+ }
+ return s;
+}
+
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
Index: testsuite/gcc.dg/vect/vect-outer-4j.c
===================================================================
--- testsuite/gcc.dg/vect/vect-outer-4j.c (revision 0)
+++ testsuite/gcc.dg/vect/vect-outer-4j.c (revision 0)
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+
+#define N 40
+#define M 128
+unsigned char in[N+M];
+unsigned short out[N];
+
+/* Outer-loop vectorization. */
+/* Not vectorized due to multiple-types in the inner-loop. */
+
+void
+foo (){
+ int i,j;
+ unsigned short diff;
+
+ for (i = 0; i < N; i++) {
+ diff = 0;
+ for (j = 0; j < M; j+=8) {
+ diff += in[j+i];
+ }
+ out[i]=diff;
+ }
+}
+
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
Index: testsuite/gcc.dg/vect/no-section-anchors-vect-outer-4h.c
===================================================================
--- testsuite/gcc.dg/vect/no-section-anchors-vect-outer-4h.c (revision 0)
+++ testsuite/gcc.dg/vect/no-section-anchors-vect-outer-4h.c (revision 0)
@@ -0,0 +1,47 @@
+/* { dg-require-effective-target vect_int } */
+#include <stdarg.h>
+#include "tree-vect.h"
+
+
+#define N 40
+#define M 128
+unsigned short a[M][N];
+unsigned int out[N];
+
+/* Outer-loop vectorization. */
+
+void
+foo (){
+ int i,j;
+ unsigned int diff;
+
+ for (i = 0; i < N; i++) {
+ for (j = 0; j < M; j++) {
+ a[j][i] = 4;
+ }
+ out[i]=5;
+ }
+}
+
+int main (void)
+{
+ int i, j;
+ check_vect ();
+
+ foo ();
+
+ for (i = 0; i < N; i++) {
+ for (j = 0; j < M; j++) {
+ if (a[j][i] != 4)
+ abort ();
+ }
+ if (out[i] != 5)
+ abort ();
+ }
+
+ return 0;
+}
+
+
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
Index: testsuite/gcc.dg/vect/vect-outer-4k.c
===================================================================
--- testsuite/gcc.dg/vect/vect-outer-4k.c (revision 0)
+++ testsuite/gcc.dg/vect/vect-outer-4k.c (revision 0)
@@ -0,0 +1,54 @@
+/* { dg-require-effective-target vect_int } */
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 40
+#define M 128
+unsigned short in[N+M];
+unsigned int out[N];
+
+/* Outer-loop vectorization. */
+/* Not vectorized due to multiple-types in the inner-loop. */
+
+unsigned int
+foo (){
+ int i,j;
+ unsigned int diff;
+ unsigned int s=0;
+
+ for (i = 0; i < N; i++) {
+ diff = 0;
+ for (j = 0; j < M; j+=8) {
+ diff += in[j+i];
+ }
+ s+=(diff>>3);
+ }
+ return s;
+}
+
+int main (void)
+{
+ int i, j;
+ unsigned int diff;
+ unsigned int s=0,sum=0; /* s accumulates the reference sum below, so it must start at 0. */
+
+ check_vect ();
+
+ sum=foo ();
+
+ for (i = 0; i < N; i++) { /* Recompute the expected result with the same loop nest as foo. */
+ diff = 0;
+ for (j = 0; j < M; j+=8) {
+ diff += in[j+i];
+ }
+ s += (diff>>3);
+ }
+
+ if (s != sum)
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
Index: testsuite/gcc.dg/vect/vect-outer-4l.c
===================================================================
--- testsuite/gcc.dg/vect/vect-outer-4l.c (revision 0)
+++ testsuite/gcc.dg/vect/vect-outer-4l.c (revision 0)
@@ -0,0 +1,56 @@
+/* { dg-require-effective-target vect_int } */
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 40
+#define M 128
+unsigned short in[N+M];
+unsigned int out[N];
+unsigned char arr[N];
+
+/* Outer-loop vectorization. */
+/* Not vectorized due to multiple-types in the inner-loop. */
+
+unsigned int
+foo (){
+ int i,j;
+ unsigned int diff;
+ unsigned int s=0;
+
+ for (i = 0; i < N; i++) {
+ arr[i] = 3;
+ diff = 0;
+ for (j = 0; j < M; j+=8) {
+ diff += in[j+i];
+ }
+ s+=diff;
+ }
+ return s;
+}
+
+int main (void)
+{
+ int i, j;
+ unsigned int diff;
+ unsigned int s=0,sum=0; /* s accumulates the reference sum below, so it must start at 0. */
+
+ check_vect ();
+
+ sum=foo ();
+
+ for (i = 0; i < N; i++) { /* Recompute the expected sum with the same reduction as foo. */
+ diff = 0;
+ for (j = 0; j < M; j+=8) {
+ diff += in[j+i];
+ }
+ s += diff;
+ }
+
+ if (s != sum)
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
Index: testsuite/gcc.dg/vect/vect-outer-4e.c
===================================================================
--- testsuite/gcc.dg/vect/vect-outer-4e.c (revision 0)
+++ testsuite/gcc.dg/vect/vect-outer-4e.c (revision 0)
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+
+#define N 40
+#define M 128
+unsigned int in[N+M];
+unsigned short out[N];
+
+/* Outer-loop vectorization. */
+
+void
+foo (){
+ int i,j;
+ unsigned int diff;
+
+ for (i = 0; i < N; i++) {
+ diff = 0;
+ for (j = 0; j < M; j+=8) {
+ diff += in[j+i];
+ }
+ out[i]=(unsigned short)diff;
+ }
+}
+
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
Index: testsuite/gcc.dg/vect/vect-outer-4m.c
===================================================================
--- testsuite/gcc.dg/vect/vect-outer-4m.c (revision 0)
+++ testsuite/gcc.dg/vect/vect-outer-4m.c (revision 0)
@@ -0,0 +1,54 @@
+/* { dg-require-effective-target vect_int } */
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 40
+#define M 128
+unsigned short in[N+M];
+unsigned int out[N];
+
+/* Outer-loop vectorization. */
+/* Not vectorized due to multiple-types in the inner-loop. */
+
+unsigned int
+foo (){
+ int i,j;
+ unsigned int diff;
+ unsigned int s=0;
+
+ for (i = 0; i < N; i++) {
+ diff = 0;
+ for (j = 0; j < M; j+=8) {
+ diff += in[j+i];
+ }
+ s+=((unsigned short)diff>>3);
+ }
+ return s;
+}
+
+int main (void)
+{
+ int i, j;
+ unsigned int diff;
+ unsigned int s=0,sum=0; /* s accumulates the reference sum below, so it must start at 0. */
+
+ check_vect ();
+
+ sum=foo ();
+
+ for (i = 0; i < N; i++) { /* Recompute the expected result with the same loop nest as foo. */
+ diff = 0;
+ for (j = 0; j < M; j+=8) {
+ diff += in[j+i];
+ }
+ s += ((unsigned short)diff>>3); /* Same narrowing cast as foo, so the reference matches it. */
+ }
+
+ if (s != sum)
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
Index: testsuite/gcc.dg/vect/vect-outer-4f.c
===================================================================
--- testsuite/gcc.dg/vect/vect-outer-4f.c (revision 0)
+++ testsuite/gcc.dg/vect/vect-outer-4f.c (revision 0)
@@ -0,0 +1,49 @@
+/* { dg-require-effective-target vect_int } */
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 40
+#define M 128
+unsigned short in[N+M];
+unsigned int out[N];
+
+/* Outer-loop vectorization. */
+/* Not vectorized due to multiple-types in the inner-loop. */
+
+void
+foo (){
+ int i,j;
+ unsigned int diff;
+
+ for (i = 0; i < N; i++) {
+ diff = 0;
+ for (j = 0; j < M; j+=8) {
+ diff += in[j+i];
+ }
+ out[i]=diff;
+ }
+}
+
+int main (void)
+{
+ int i, j;
+ unsigned int diff;
+
+ check_vect ();
+
+ foo ();
+
+ for (i = 0; i < N; i++) {
+ diff = 0;
+ for (j = 0; j < M; j+=8) {
+ diff += in[j+i];
+ }
+ if (out[i] != diff)
+ abort ();
+ }
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
Index: tree-vect-analyze.c
===================================================================
--- tree-vect-analyze.c (revision 127086)
+++ tree-vect-analyze.c (working copy)
@@ -588,14 +588,14 @@
return false;
}
if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
- && LOOP_VINFO_NITERS (loop_vinfo)
- && TREE_CODE (LOOP_VINFO_NITERS (loop_vinfo)) == COND_EXPR)
- {
- if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
- fprintf (vect_dump,
- "not vectorized: can't create epilog loop 2.");
- return false;
- }
+ && LOOP_VINFO_NITERS (loop_vinfo)
+ && TREE_CODE (LOOP_VINFO_NITERS (loop_vinfo)) == COND_EXPR)
+ {
+ if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
+ fprintf (vect_dump,
+ "not vectorized: can't create epilog loop 2.");
+ return false;
+ }
if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
{
if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
@@ -3039,16 +3039,43 @@
/* This is the last stmt in a sequence that was detected as a
pattern that can potentially be vectorized. Don't mark the stmt
- as relevant/live because it's not going to vectorized.
+ as relevant/live because it's not going to be vectorized.
Instead mark the pattern-stmt that replaces it. */
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "last stmt in pattern. don't mark relevant/live.");
+
pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
- stmt_info = vinfo_for_stmt (pattern_stmt);
- gcc_assert (STMT_VINFO_RELATED_STMT (stmt_info) == stmt);
- save_relevant = STMT_VINFO_RELEVANT (stmt_info);
- save_live_p = STMT_VINFO_LIVE_P (stmt_info);
- stmt = pattern_stmt;
+
+ /* One exception to the above is when the pattern-stmt is an
+ "unordered reduction" operation, whose results are used in the
+ outer-loop, in which case the order of the generated
+ results is important, and therefore we can't vectorize the pattern.
+
+ An "unordered reduction" is a reduction that is vectorized without
+ preserving all the intermediate results, like widen_sum and dot_prod,
+ that produce only N/2 results (by summing up pairs of intermediate
+ results). If these results are actually used (e.g., stored, in an
+ outer-loop), we need to have all N results (and in the right order).
+ Therefore, in such a case, we cannot vectorize the reduction pattern,
+ and need to resort to vectorizing the original stmts. */
+ if ((TREE_CODE (GIMPLE_STMT_OPERAND (pattern_stmt, 1)) == WIDEN_SUM_EXPR
+ || TREE_CODE (GIMPLE_STMT_OPERAND (pattern_stmt,1)) == DOT_PROD_EXPR)
+ && (relevant == vect_used_in_outer
+ || relevant == vect_used_in_outer_by_reduction))
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "skip unordered reduction pattern.");
+ STMT_VINFO_RELATED_STMT (stmt_info) = NULL_TREE;
+ STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
+ }
+ else
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "last stmt in pattern. don't mark relevant/live.");
+ stmt_info = vinfo_for_stmt (pattern_stmt);
+ gcc_assert (STMT_VINFO_RELATED_STMT (stmt_info) == stmt);
+ save_relevant = STMT_VINFO_RELEVANT (stmt_info);
+ save_live_p = STMT_VINFO_LIVE_P (stmt_info);
+ stmt = pattern_stmt;
+ }
}
STMT_VINFO_LIVE_P (stmt_info) |= live_p;
@@ -3391,12 +3418,11 @@
Reduction phis are expected to be used by a reduction stmt, or by
in an outer loop; Other reduction stmts are expected to be
in the loop, and possibly used by a stmt in an outer loop.
- are the expected values of "relevant" for reduction phis/stmts in
- op:
+ Here are the expected values of "relevant" for reduction phis/stmts:
relevance: phi stmt
vect_unused_in_loop ok
- vect_used_in_outer_by_reductio ok ok
+ vect_used_in_outer_by_reduction ok ok
vect_used_in_outer ok ok
vect_used_by_reduction ok
vect_used_in_loop */
@@ -3413,6 +3439,8 @@
case vect_used_in_outer_by_reduction:
case vect_used_in_outer:
+ gcc_assert (TREE_CODE (stmt) != WIDEN_SUM_EXPR
+ && TREE_CODE (stmt) != DOT_PROD_EXPR);
break;
case vect_used_by_reduction:
Index: tree-vect-transform.c
===================================================================
--- tree-vect-transform.c (revision 127086)
+++ tree-vect-transform.c (working copy)
@@ -1956,7 +1956,6 @@
vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
gcc_assert (vec_stmt_for_operand);
vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt_for_operand, 0);
-
return vec_oprnd;
}
@@ -2499,6 +2498,7 @@
}
/* We expect to have found an exit_phi because of loop-closed-ssa form. */
gcc_assert (exit_phi);
+
if (nested_in_vect_loop)
{
stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
@@ -2510,6 +2510,9 @@
epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
+ set_stmt_info (get_stmt_ann (epilog_stmt),
+ new_stmt_vec_info (epilog_stmt, loop_vinfo));
+
if (vect_print_dump_info (REPORT_DETAILS))
{
fprintf (vect_dump, "vector of partial results after inner-loop:");
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2007-08-01 7:22 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-08-01 7:22 [patch] [autovect] bugfix in outer-loop vectorization Dorit Nuzman
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).