Index: testsuite/gcc.dg/vect/vect-outer-4g.c
===================================================================
--- testsuite/gcc.dg/vect/vect-outer-4g.c (revision 0)
+++ testsuite/gcc.dg/vect/vect-outer-4g.c (revision 0)
@@ -0,0 +1,54 @@
+/* { dg-require-effective-target vect_int } */
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 40
+#define M 128
+unsigned short in[N+M];
+unsigned int out[N];
+
+/* Outer-loop vectorization. */
+/* Not vectorized due to multiple-types in the inner-loop. */
+
+unsigned int
+foo (){
+  int i,j;
+  unsigned int diff;
+  unsigned int s=0;
+
+  for (i = 0; i < N; i++) {
+    diff = 0;
+    for (j = 0; j < M; j+=8) {
+      diff += in[j+i];
+    }
+    s+=diff;
+  }
+  return s;
+}
+
+int main (void)
+{
+  int i, j;
+  unsigned int diff;
+  unsigned int s=0,sum=0;  /* s accumulates the reference sum, so it must start at 0.  */
+
+  check_vect ();
+
+  sum=foo ();
+
+  for (i = 0; i < N; i++) {
+    diff = 0;
+    for (j = 0; j < M; j+=8) {
+      diff += in[j+i];
+    }
+    s += diff;
+  }
+
+  if (s != sum)
+    abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
Index: testsuite/gcc.dg/vect/vect-outer-4i.c
===================================================================
--- testsuite/gcc.dg/vect/vect-outer-4i.c (revision 0)
+++ testsuite/gcc.dg/vect/vect-outer-4i.c (revision 0)
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+
+#define N 40
+#define M 128
+unsigned char in[N+M];
+unsigned short out[N];
+
+/* Outer-loop vectorization. */
+/* Not vectorized due to multiple-types in the inner-loop. 
*/
+
+unsigned short
+foo (){
+  int i,j;
+  unsigned short diff;
+  unsigned short s=0;
+
+  for (i = 0; i < N; i++) {
+    diff = 0;
+    for (j = 0; j < M; j+=8) {
+      diff += in[j+i];
+    }
+    s+=diff;
+  }
+  return s;
+}
+
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
Index: testsuite/gcc.dg/vect/vect-outer-4j.c
===================================================================
--- testsuite/gcc.dg/vect/vect-outer-4j.c (revision 0)
+++ testsuite/gcc.dg/vect/vect-outer-4j.c (revision 0)
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+
+#define N 40
+#define M 128
+unsigned char in[N+M];
+unsigned short out[N];
+
+/* Outer-loop vectorization. */
+/* Not vectorized due to multiple-types in the inner-loop. */
+
+void
+foo (){
+  int i,j;
+  unsigned short diff;
+
+  for (i = 0; i < N; i++) {
+    diff = 0;
+    for (j = 0; j < M; j+=8) {
+      diff += in[j+i];
+    }
+    out[i]=diff;
+  }
+}
+
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
Index: testsuite/gcc.dg/vect/no-section-anchors-vect-outer-4h.c
===================================================================
--- testsuite/gcc.dg/vect/no-section-anchors-vect-outer-4h.c (revision 0)
+++ testsuite/gcc.dg/vect/no-section-anchors-vect-outer-4h.c (revision 0)
@@ -0,0 +1,47 @@
+/* { dg-require-effective-target vect_int } */
+#include <stdarg.h>
+#include "tree-vect.h"
+
+
+#define N 40
+#define M 128
+unsigned short a[M][N];
+unsigned int out[N];
+
+/* Outer-loop vectorization. 
*/
+
+void
+foo (){
+  int i,j;
+  unsigned int diff;
+
+  for (i = 0; i < N; i++) {
+    for (j = 0; j < M; j++) {
+      a[j][i] = 4;
+    }
+    out[i]=5;
+  }
+}
+
+int main (void)
+{
+  int i, j;
+  check_vect ();
+
+  foo ();
+
+  for (i = 0; i < N; i++) {
+    for (j = 0; j < M; j++) {
+      if (a[j][i] != 4)
+        abort ();
+    }
+    if (out[i] != 5)
+      abort ();
+  }
+
+  return 0;
+}
+
+
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
Index: testsuite/gcc.dg/vect/vect-outer-4k.c
===================================================================
--- testsuite/gcc.dg/vect/vect-outer-4k.c (revision 0)
+++ testsuite/gcc.dg/vect/vect-outer-4k.c (revision 0)
@@ -0,0 +1,54 @@
+/* { dg-require-effective-target vect_int } */
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 40
+#define M 128
+unsigned short in[N+M];
+unsigned int out[N];
+
+/* Outer-loop vectorization. */
+/* Not vectorized due to multiple-types in the inner-loop. */
+
+unsigned int
+foo (){
+  int i,j;
+  unsigned int diff;
+  unsigned int s=0;
+
+  for (i = 0; i < N; i++) {
+    diff = 0;
+    for (j = 0; j < M; j+=8) {
+      diff += in[j+i];
+    }
+    s+=(diff>>3);
+  }
+  return s;
+}
+
+int main (void)
+{
+  int i, j;
+  unsigned int diff;
+  unsigned int s=0,sum=0;  /* s accumulates the reference sum, so it must start at 0.  */
+
+  check_vect ();
+
+  sum=foo ();
+
+  for (i = 0; i < N; i++) {
+    diff = 0;
+    for (j = 0; j < M; j+=8) {
+      diff += in[j+i];
+    }
+    s += (diff>>3);
+  }
+
+  if (s != sum)
+    abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
Index: testsuite/gcc.dg/vect/vect-outer-4l.c
===================================================================
--- testsuite/gcc.dg/vect/vect-outer-4l.c (revision 0)
+++ testsuite/gcc.dg/vect/vect-outer-4l.c (revision 0)
@@ -0,0 +1,56 @@
+/* { dg-require-effective-target vect_int } */
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 40
+#define M 128
+unsigned short in[N+M];
+unsigned int out[N];
+unsigned char arr[N];
+
+/* Outer-loop vectorization. */
+/* Not vectorized due to multiple-types in the inner-loop. */
+
+unsigned int
+foo (){
+  int i,j;
+  unsigned int diff;
+  unsigned int s=0;
+
+  for (i = 0; i < N; i++) {
+    arr[i] = 3;
+    diff = 0;
+    for (j = 0; j < M; j+=8) {
+      diff += in[j+i];
+    }
+    s+=diff;
+  }
+  return s;
+}
+
+int main (void)
+{
+  int i, j;
+  unsigned int diff;
+  unsigned int s=0,sum=0;  /* s accumulates the reference sum, so it must start at 0.  */
+
+  check_vect ();
+
+  sum=foo ();
+
+  for (i = 0; i < N; i++) {
+    diff = 0;
+    for (j = 0; j < M; j+=8) {
+      diff += in[j+i];
+    }
+    s += diff;
+  }
+
+  if (s != sum)
+    abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
Index: testsuite/gcc.dg/vect/vect-outer-4e.c
===================================================================
--- testsuite/gcc.dg/vect/vect-outer-4e.c (revision 0)
+++ testsuite/gcc.dg/vect/vect-outer-4e.c (revision 0)
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+
+#define N 40
+#define M 128
+unsigned int in[N+M];
+unsigned short out[N];
+
+/* Outer-loop vectorization. */
+
+void
+foo (){
+  int i,j;
+  unsigned int diff;
+
+  for (i = 0; i < N; i++) {
+    diff = 0;
+    for (j = 0; j < M; j+=8) {
+      diff += in[j+i];
+    }
+    out[i]=(unsigned short)diff;
+  }
+}
+
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
Index: testsuite/gcc.dg/vect/vect-outer-4m.c
===================================================================
--- testsuite/gcc.dg/vect/vect-outer-4m.c (revision 0)
+++ testsuite/gcc.dg/vect/vect-outer-4m.c (revision 0)
@@ -0,0 +1,54 @@
+/* { dg-require-effective-target vect_int } */
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 40
+#define M 128
+unsigned short in[N+M];
+unsigned int out[N];
+
+/* Outer-loop vectorization. */
+/* Not vectorized due to multiple-types in the inner-loop. 
*/
+
+unsigned int
+foo (){
+  int i,j;
+  unsigned int diff;
+  unsigned int s=0;
+
+  for (i = 0; i < N; i++) {
+    diff = 0;
+    for (j = 0; j < M; j+=8) {
+      diff += in[j+i];
+    }
+    s+=((unsigned short)diff>>3);
+  }
+  return s;
+}
+
+int main (void)
+{
+  int i, j;
+  unsigned int diff;
+  unsigned int s=0,sum=0;  /* s accumulates the reference sum, so it must start at 0.  */
+
+  check_vect ();
+
+  sum=foo ();
+
+  for (i = 0; i < N; i++) {
+    diff = 0;
+    for (j = 0; j < M; j+=8) {
+      diff += in[j+i];
+    }
+    s += ((unsigned short)diff>>3);  /* Same truncation as foo, so the reference mirrors it.  */
+  }
+
+  if (s != sum)
+    abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
Index: testsuite/gcc.dg/vect/vect-outer-4f.c
===================================================================
--- testsuite/gcc.dg/vect/vect-outer-4f.c (revision 0)
+++ testsuite/gcc.dg/vect/vect-outer-4f.c (revision 0)
@@ -0,0 +1,49 @@
+/* { dg-require-effective-target vect_int } */
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 40
+#define M 128
+unsigned short in[N+M];
+unsigned int out[N];
+
+/* Outer-loop vectorization. */
+/* Not vectorized due to multiple-types in the inner-loop. 
*/ + +void +foo (){ + int i,j; + unsigned int diff; + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < M; j+=8) { + diff += in[j+i]; + } + out[i]=diff; /* out[i] = sum of in[i+j] for j = 0, 8, ..., M-8 */ + } +} + +int main (void) +{ + int i, j; + unsigned int diff; + + check_vect (); + + foo (); + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < M; j+=8) { + diff += in[j+i]; + } + if (out[i] != diff) /* the scalar recomputation must match what foo stored */ + abort (); + } + + return 0; +} + +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ Index: tree-vect-analyze.c =================================================================== --- tree-vect-analyze.c (revision 127086) +++ tree-vect-analyze.c (working copy) @@ -588,14 +588,14 @@ return false; } if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) - && LOOP_VINFO_NITERS (loop_vinfo) - && TREE_CODE (LOOP_VINFO_NITERS (loop_vinfo)) == COND_EXPR) - { - if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)) - fprintf (vect_dump, - "not vectorized: can't create epilog loop 2."); - return false; - } + && LOOP_VINFO_NITERS (loop_vinfo) + && TREE_CODE (LOOP_VINFO_NITERS (loop_vinfo)) == COND_EXPR) + { + if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)) + fprintf (vect_dump, + "not vectorized: can't create epilog loop 2."); + return false; + } if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop))) { if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)) @@ -3039,16 +3039,43 @@ /* This is the last stmt in a sequence that was detected as a pattern that can potentially be vectorized. Don't mark the stmt - as relevant/live because it's not going to vectorized. + as relevant/live because it's not going to be vectorized. Instead mark the pattern-stmt that replaces it. */ - if (vect_print_dump_info (REPORT_DETAILS)) - fprintf (vect_dump, "last stmt in pattern. 
don't mark relevant/live."); + pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info); - stmt_info = vinfo_for_stmt (pattern_stmt); - gcc_assert (STMT_VINFO_RELATED_STMT (stmt_info) == stmt); - save_relevant = STMT_VINFO_RELEVANT (stmt_info); - save_live_p = STMT_VINFO_LIVE_P (stmt_info); - stmt = pattern_stmt; + + /* One exception to the above is when the pattern-stmt is an + "unordered reduction" operation, whose results are used in the + outer-loop, in which case the order of the generated + results is important, and therefore we can't vectorize the pattern. + + An "unordered reduction" is a reduction that is vectorized without + preserving all the intermediate results, like widen_sum and dot_prod, + that produce only N/2 results (by summing up pairs of intermediate + results). If these results are actually used (e.g., stored, in an + outer-loop), we need to have all N results (and in the right order). + Therefore, in such a case, we cannot vectorize the reduction pattern, + and need to resort to vectorizing the original stmts. */ + if ((TREE_CODE (GIMPLE_STMT_OPERAND (pattern_stmt, 1)) == WIDEN_SUM_EXPR + || TREE_CODE (GIMPLE_STMT_OPERAND (pattern_stmt,1)) == DOT_PROD_EXPR) + && (relevant == vect_used_in_outer + || relevant == vect_used_in_outer_by_reduction)) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "skip unordered reduction pattern."); + STMT_VINFO_RELATED_STMT (stmt_info) = NULL_TREE; + STMT_VINFO_IN_PATTERN_P (stmt_info) = false; + } + else + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "last stmt in pattern. 
don't mark relevant/live."); + stmt_info = vinfo_for_stmt (pattern_stmt); + gcc_assert (STMT_VINFO_RELATED_STMT (stmt_info) == stmt); + save_relevant = STMT_VINFO_RELEVANT (stmt_info); + save_live_p = STMT_VINFO_LIVE_P (stmt_info); + stmt = pattern_stmt; + } } STMT_VINFO_LIVE_P (stmt_info) |= live_p; @@ -3391,12 +3418,11 @@ Reduction phis are expected to be used by a reduction stmt, or by in an outer loop; Other reduction stmts are expected to be in the loop, and possibly used by a stmt in an outer loop. - are the expected values of "relevant" for reduction phis/stmts in - op: + Here are the expected values of "relevant" for reduction phis/stmts: relevance: phi stmt vect_unused_in_loop ok - vect_used_in_outer_by_reductio ok ok + vect_used_in_outer_by_reduction ok ok vect_used_in_outer ok ok vect_used_by_reduction ok vect_used_in_loop */ @@ -3413,6 +3439,8 @@ case vect_used_in_outer_by_reduction: case vect_used_in_outer: + gcc_assert (TREE_CODE (stmt) != WIDEN_SUM_EXPR + && TREE_CODE (stmt) != DOT_PROD_EXPR); break; case vect_used_by_reduction: Index: tree-vect-transform.c =================================================================== --- tree-vect-transform.c (revision 127086) +++ tree-vect-transform.c (working copy) @@ -1956,7 +1956,6 @@ vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info); gcc_assert (vec_stmt_for_operand); vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt_for_operand, 0); - return vec_oprnd; } @@ -2499,6 +2498,7 @@ } /* We expect to have found an exit_phi because of loop-closed-ssa form. */ gcc_assert (exit_phi); + if (nested_in_vect_loop) { stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi); @@ -2510,6 +2510,9 @@ epilog_stmt = adjustment_def ? epilog_stmt : new_phi; STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt; + set_stmt_info (get_stmt_ann (epilog_stmt), + new_stmt_vec_info (epilog_stmt, loop_vinfo)); + if (vect_print_dump_info (REPORT_DETAILS)) { fprintf (vect_dump, "vector of partial results after inner-loop:");