Index: testsuite/gcc.dg/vect/vect-widen-mult-sum.c =================================================================== *** testsuite/gcc.dg/vect/vect-widen-mult-sum.c (revision 127202) --- testsuite/gcc.dg/vect/vect-widen-mult-sum.c (working copy) *************** int main (void) *** 42,45 **** --- 42,46 ---- /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_widen_mult_hi_to_si } } } */ + /* { dg-final { scan-tree-dump-times "vect_recog_widen_mult_pattern: detected" 1 "vect" } } */ /* { dg-final { cleanup-tree-dump "vect" } } */ Index: testsuite/gcc.dg/vect/vect-outer-2b.c =================================================================== *** testsuite/gcc.dg/vect/vect-outer-2b.c (revision 0) --- testsuite/gcc.dg/vect/vect-outer-2b.c (revision 0) *************** *** 0 **** --- 1,41 ---- + /* { dg-require-effective-target vect_float } */ + #include + #include "tree-vect.h" + + #define N 40 + float image[2*N][N][N] __attribute__ ((__aligned__(16))); + + void + foo (){ + int i,j,k; + + for (k=0; k + #include "tree-vect.h" + + #define N 40 + #define M 128 + float in[N+M]; + float coeff[M]; + float out[N]; + + /* Outer-loop vectorization. */ + + void + foo (){ + int i,j; + float diff; + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < M; j+=4) { + diff += in[j+i]*coeff[j]; + } + out[i]=diff; + } + } + + int main (void) + { + check_vect (); + int i, j; + float diff; + + for (i = 0; i < M; i++) + coeff[i] = i; + for (i = 0; i < N+M; i++) + in[i] = i; + + foo (); + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < M; j+=4) { + diff += in[j+i]*coeff[j]; + } + if (out[i] != diff) + abort (); + } + + return 0; + } + + /* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" } } */ + /* { dg-final { scan-tree-dump-times "zero step in outer loop." 
1 "vect" } } */ + /* { dg-final { cleanup-tree-dump "vect" } } */ Index: testsuite/gcc.dg/vect/no-scevccp-outer-7.c =================================================================== *** testsuite/gcc.dg/vect/no-scevccp-outer-7.c (revision 0) --- testsuite/gcc.dg/vect/no-scevccp-outer-7.c (revision 0) *************** *** 0 **** --- 1,75 ---- + /* { dg-require-effective-target vect_int } */ + + #include + #include "tree-vect.h" + + #define N 16 + + unsigned short in[N]; + unsigned short coef[N]; + unsigned short a[N]; + + unsigned int + foo (short scale){ + int i; + unsigned short j; + unsigned int sum = 0; + unsigned short sum_j; + + for (i = 0; i < N; i++) { + sum_j = 0; + for (j = 0; j < N; j++) { + sum_j += j; + } + a[i] = sum_j; + sum += ((unsigned int) in[i] * (unsigned int) coef[i]) >> scale; + } + return sum; + } + + unsigned short + bar (void) + { + unsigned short j; + unsigned short sum_j; + + sum_j = 0; + for (j = 0; j < N; j++) { + sum_j += j; + } + + return sum_j; + } + + int main (void) + { + int i; + unsigned short j, sum_j; + unsigned int sum = 0; + unsigned int res; + + check_vect (); + + for (i=0; i> 2; + } + if (res != sum) + abort (); + + return 0; + } + + /* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED." 1 "vect" { target vect_widen_mult_hi_to_si } } } */ + /* { dg-final { scan-tree-dump-times "vect_recog_widen_mult_pattern: detected" 1 "vect" } } */ + /* { dg-final { cleanup-tree-dump "vect" } } */ Index: testsuite/gcc.dg/vect/vect-outer-4g.c =================================================================== *** testsuite/gcc.dg/vect/vect-outer-4g.c (revision 0) --- testsuite/gcc.dg/vect/vect-outer-4g.c (revision 0) *************** *** 0 **** --- 1,70 ---- + /* { dg-require-effective-target vect_int } */ + #include + #include "tree-vect.h" + + #define N 40 + #define M 128 + unsigned short in[N+M]; + unsigned int out[N]; + unsigned char arr[N]; + + /* Outer-loop vectorization. 
*/ + /* Not vectorized due to multiple-types in the inner-loop. */ + + unsigned int + foo (){ + int i,j; + unsigned int diff; + unsigned int s=0; + + for (i = 0; i < N; i++) { + arr[i] = 3; + diff = 0; + for (j = 0; j < M; j+=8) { + diff += in[j+i]; + } + s+=diff; + } + return s; + } + + unsigned int + bar (int i, unsigned int diff, unsigned short *in) + { + int j; + for (j = 0; j < M; j+=8) { + diff += in[j+i]; + } + return diff; + } + + int main (void) + { + int i, j; + unsigned int diff; + unsigned int s=0,sum=0; + + check_vect (); + + for (i = 0; i < N+M; i++) { + in[i] = i; + } + + sum=foo (); + + for (i = 0; i < N; i++) { + arr[i] = 3; + diff = 0; + diff = bar (i, diff, in); + s += diff; + } + + if (s != sum) + abort (); + + return 0; + } + + /* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ + /* { dg-final { scan-tree-dump-times "vect_recog_widen_sum_pattern: not allowed" 1 "vect" } } */ + /* { dg-final { cleanup-tree-dump "vect" } } */ Index: testsuite/gcc.dg/vect/no-scevccp-outer-10.c =================================================================== *** testsuite/gcc.dg/vect/no-scevccp-outer-10.c (revision 0) --- testsuite/gcc.dg/vect/no-scevccp-outer-10.c (revision 0) *************** *** 0 **** --- 1,54 ---- + /* { dg-require-effective-target vect_int } */ + + #include + #include "tree-vect.h" + + #define N 40 + + int a[N]; + int b[N]; + + int + foo (int n){ + int i,j; + int sum,x,y; + + for (i = 0; i < N/2; i++) { + sum = 0; + x = b[2*i]; + y = b[2*i+1]; + for (j = 0; j < n; j++) { + sum += j; + } + a[2*i] = sum + x; + a[2*i+1] = sum + y; + } + } + + int main (void) + { + int i,j; + int sum; + + check_vect (); + + for (i=0; i + #include "tree-vect.h" + + #define N 40 + + int a[N]; + int b[N]; + + int + foo (int n){ + int i,j; + int sum,x,y; + + if (n<=0) + return 0; + + for (i = 0; i < N/2; i++) { + sum = 0; + x = b[2*i]; + y = b[2*i+1]; + j = 0; + do { + sum += j; + } while (++j < n); + a[2*i] = sum + x; + 
a[2*i+1] = sum + y; + } + } + + int main (void) + { + int i,j; + int sum; + + check_vect (); + + for (i=0; i + #include "tree-vect.h" + + #define N 40 + + int a[N]; + + int + foo (){ + int i,j; + int sum; + + for (i = 0; i < N/2; i++) { + sum = 0; + for (j = 0; j < N; j++) { + sum += j; + } + a[2*i] = sum; + a[2*i+1] = 2*sum; + } + } + + int main (void) + { + int i,j; + int sum; + + check_vect (); + + for (i=0; i + #include "tree-vect.h" + + #define N 40 + float image[N][N+1] __attribute__ ((__aligned__(16))); + float out[N]; + + /* Outer-loop vectorization with misaliged accesses in the inner-loop. */ + + void + foo (){ + int i,j; + float diff; + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < N; j++) { + diff += image[j][i]; + } + out[i]=diff; + } + } + + int main (void) + { + check_vect (); + int i, j; + float diff; + + for (i = 0; i < N; i++) { + for (j = 0; j < N; j++) { + image[i][j]=i+j; + } + } + + foo (); + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < N; j++) { + diff += image[j][i]; + } + if (out[i] != diff) + abort (); + } + + return 0; + } + + /* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail vect_no_align } } } */ + /* { dg-final { scan-tree-dump-times "step doesn't divide the vector-size" 2 "vect" } } */ + /* { dg-final { cleanup-tree-dump "vect" } } */ Index: testsuite/gcc.dg/vect/vect-outer-5.c =================================================================== *** testsuite/gcc.dg/vect/vect-outer-5.c (revision 0) --- testsuite/gcc.dg/vect/vect-outer-5.c (revision 0) *************** *** 0 **** --- 1,80 ---- + /* { dg-require-effective-target vect_int } */ + + #include + #include + #include "tree-vect.h" + + #define N 64 + #define MAX 42 + + extern void abort(void); + + int main1 () + { + float A[N] __attribute__ ((__aligned__(16))); + float B[N] __attribute__ ((__aligned__(16))); + float C[N] __attribute__ ((__aligned__(16))); + float D[N] __attribute__ ((__aligned__(16))); + float s; + + int i, j; 
+ + for (i = 0; i < N; i++) + { + A[i] = i; + B[i] = i; + C[i] = i; + D[i] = i; + } + + /* Outer-loop 1: Vectorizable with respect to dependence distance. */ + for (i = 0; i < N-20; i++) + { + s = 0; + for (j=0; j + #include "tree-vect.h" + + #define N 40 + float image[2*N][2*N][N] __attribute__ ((__aligned__(16))); + + void + foo (){ + int i,j,k; + + for (k=0; k + #include "tree-vect.h" + + #define N 40 + + + int + foo (int *a){ + int i,j; + int sum; + + for (i = 0; i < N; i++) { + sum = 0; + for (j = 0; j < N; j++) { + sum += j; + } + a[i] = sum; + } + } + + int main (void) + { + int i,j; + int sum; + int a[N]; + + check_vect (); + + for (i=0; i + #include "../../tree-vect.h" + + #define N 32 + #define M 16 + float in[N+M]; + float coeff[M]; + float out[N]; + float fir_out[N]; + + /* Vectorized. Fixed misaligment in the inner-loop. */ + void foo (){ + int i,j,k; + float diff; + + for (i = 0; i < N; i++) { + out[i] = 0; + } + + for (k = 0; k < 4; k++) { + for (i = 0; i < N; i++) { + diff = 0; + for (j = k; j < M; j+=4) { + diff += in[j+i]*coeff[j]; + } + out[i] += diff; + } + } + } + + /* Vectorized. Changing misalignment in the inner-loop. 
*/ + void fir (){ + int i,j,k; + float diff; + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < M; j++) { + diff += in[j+i]*coeff[j]; + } + fir_out[i] = diff; + } + } + + int main (void) + { + check_vect (); + int i, j; + float diff; + + for (i = 0; i < M; i++) + coeff[i] = i; + for (i = 0; i < N+M; i++) + in[i] = i; + + foo (); + fir (); + + for (i = 0; i < N; i++) { + if (out[i] != fir_out[i]) + abort (); + } + + return 0; + } + + /* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 2 "vect" { xfail vect_no_align } } } */ + /* { dg-final { cleanup-tree-dump "vect" } } */ Index: testsuite/gcc.dg/vect/no-scevccp-outer-11.c =================================================================== *** testsuite/gcc.dg/vect/no-scevccp-outer-11.c (revision 0) --- testsuite/gcc.dg/vect/no-scevccp-outer-11.c (revision 0) *************** *** 0 **** --- 1,50 ---- + /* { dg-require-effective-target vect_int } */ + + #include + #include "tree-vect.h" + + #define N 40 + + int a[N]; + + int + foo (int n){ + int i,j; + int sum; + + for (i = 0; i < n; i++) { + sum = 0; + for (j = 0; j < N; j++) { + sum += j; + } + a[i] = sum; + } + } + + int main (void) + { + int i,j; + int sum; + + check_vect (); + + for (i=0; i + #include "tree-vect.h" + + #define N 40 + + int a[N]; + int b[N]; + + int + foo (int n){ + int i,j; + int sum,x,y; + + if (n<=0) + return 0; + + for (i = 0; i < N/2; i++) { + sum = 0; + x = b[2*i]; + y = b[2*i+1]; + for (j = 0; j < n; j++) { + sum += j; + } + a[2*i] = sum + x; + a[2*i+1] = sum + y; + } + } + + int main (void) + { + int i,j; + int sum; + + check_vect (); + + for (i=0; i + #include "tree-vect.h" + + #define N 64 + + unsigned short a[N]; + unsigned int b[N]; + + int + foo (){ + unsigned short i,j; + unsigned short sum; + + for (i = 0; i < N; i++) { + sum = 0; + for (j = 0; j < N; j++) { + sum += j; + } + a[i] = sum; + b[i] = (unsigned int)sum; + } + } + + int main (void) + { + int i,j; + short sum; + + check_vect (); + + for (i=0; i + #include 
"tree-vect.h" + + #define N 40 + float image[N][N] __attribute__ ((__aligned__(16))); + float out[N]; + + /* Outer-loop vectorization with non-consecutive access. Not vectorized yet. */ + + void + foo (){ + int i,j; + float diff; + + for (i = 0; i < N/2; i++) { + diff = 0; + for (j = 0; j < N; j++) { + diff += image[j][2*i]; + } + out[i]=diff; + } + } + + int main (void) + { + check_vect (); + int i, j; + float diff; + + for (i = 0; i < N; i++) { + for (j = 0; j < N; j++) { + image[i][j]=i+j; + } + } + + foo (); + + for (i = 0; i < N/2; i++) { + diff = 0; + for (j = 0; j < N; j++) { + diff += image[j][2*i]; + } + if (out[i] != diff) + abort (); + } + + return 0; + } + + /* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ + /* { dg-final { scan-tree-dump-times "strided access in outer loop" 2 "vect" } } */ + /* { dg-final { cleanup-tree-dump "vect" } } */ Index: testsuite/gcc.dg/vect/no-scevccp-outer-20.c =================================================================== *** testsuite/gcc.dg/vect/no-scevccp-outer-20.c (revision 0) --- testsuite/gcc.dg/vect/no-scevccp-outer-20.c (revision 0) *************** *** 0 **** --- 1,54 ---- + /* { dg-require-effective-target vect_int } */ + + #include + #include "tree-vect.h" + + #define N 40 + + int a[N]; + int b[N]; + + int + foo (){ + int i,j; + int sum,x,y; + + for (i = 0; i < N/2; i++) { + sum = 0; + x = b[2*i]; + y = b[2*i+1]; + for (j = 0; j < N; j++) { + sum += j; + } + a[2*i] = sum + x; + a[2*i+1] = sum + y; + } + } + + int main (void) + { + int i,j; + int sum; + + check_vect (); + + for (i=0; i + #include + #include "tree-vect.h" + + #define N 64 + #define MAX 42 + + float A[N] __attribute__ ((__aligned__(16))); + float B[N] __attribute__ ((__aligned__(16))); + float C[N] __attribute__ ((__aligned__(16))); + float D[N] __attribute__ ((__aligned__(16))); + extern void abort(void); + + int main1 () + { + float s; + + int i, j; + + for (i = 0; i < 8; i++) + { + s = 0; + for 
(j=0; j<8; j+=4) + s += C[j]; + A[i] = s; + } + + return 0; + } + + int main () + { + int i,j; + float s; + + check_vect (); + + for (i = 0; i < N; i++) + { + A[i] = i; + B[i] = i; + C[i] = i; + D[i] = i; + } + + main1(); + + /* check results: */ + for (i = 0; i < 8; i++) + { + s = 0; + for (j=0; j<8; j+=4) + s += C[j]; + if (A[i] != s) + abort (); + } + + return 0; + } + + /* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" } } */ + /* { dg-final { scan-tree-dump-times "zero step in outer loop." 1 "vect" } } */ + /* { dg-final { cleanup-tree-dump "vect" } } */ Index: testsuite/gcc.dg/vect/no-scevccp-outer-9.c =================================================================== *** testsuite/gcc.dg/vect/no-scevccp-outer-9.c (revision 0) --- testsuite/gcc.dg/vect/no-scevccp-outer-9.c (revision 0) *************** *** 0 **** --- 1,50 ---- + /* { dg-require-effective-target vect_int } */ + + #include + #include "tree-vect.h" + + #define N 40 + + int a[N]; + + int + foo (int n){ + int i,j; + int sum; + + for (i = 0; i < N; i++) { + sum = 0; + for (j = 0; j < n; j++) { + sum += j; + } + a[i] = sum; + } + } + + int main (void) + { + int i,j; + int sum; + + check_vect (); + + for (i=0; i + #include "tree-vect.h" + + #define N 64 + + int a[N]; + short b[N]; + + int + foo (){ + int i,j; + int sum; + + for (i = 0; i < N; i++) { + sum = 0; + for (j = 0; j < N; j++) { + sum += j; + } + a[i] = sum; + b[i] = (short)sum; + } + } + + int main (void) + { + int i,j; + int sum; + + check_vect (); + + foo (); + + /* check results: */ + for (i=0; i + #include "tree-vect.h" + + #define N 40 + float image[N][N+1] __attribute__ ((__aligned__(16))); + float out[N]; + + /* Outer-loop vectorization. 
*/ + + void + foo (){ + int i,j; + float diff; + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < N; j+=4) { + diff += image[j][i]; + } + out[i]=diff; + } + } + + int main (void) + { + check_vect (); + int i, j; + float diff; + + for (i = 0; i < N; i++) { + for (j = 0; j < N; j++) { + image[i][j]=i+j; + } + } + + foo (); + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < N; j+=4) { + diff += image[j][i]; + } + if (out[i] != diff) + abort (); + } + + return 0; + } + + /* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" } } */ + /* { dg-final { cleanup-tree-dump "vect" } } */ Index: testsuite/gcc.dg/vect/vect-outer-fir-lb.c =================================================================== *** testsuite/gcc.dg/vect/vect-outer-fir-lb.c (revision 0) --- testsuite/gcc.dg/vect/vect-outer-fir-lb.c (revision 0) *************** *** 0 **** --- 1,80 ---- + /* { dg-require-effective-target vect_float } */ + + #include + #include "tree-vect.h" + + #define N 40 + #define M 64 + float in[N+M]; + float coeff[M]; + float out[N]; + float fir_out[N]; + + /* Should be vectorized. Fixed misaligment in the inner-loop. */ + /* Currently not vectorized because the loop-count for the inner-loop + has a maybe_zero component. Will be fixed when we incorporate the + "cond_expr in rhs" patch. */ + void foo (){ + int i,j,k; + float diff; + + for (i = 0; i < N; i++) { + out[i] = 0; + } + + for (k = 0; k < 4; k++) { + for (i = 0; i < N; i++) { + diff = 0; + j = k; + + do { + diff += in[j+i]*coeff[j]; + j+=4; + } while (j < M); + + out[i] += diff; + } + } + + } + + /* Vectorized. Changing misalignment in the inner-loop. 
*/ + void fir (){ + int i,j,k; + float diff; + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < M; j++) { + diff += in[j+i]*coeff[j]; + } + fir_out[i] = diff; + } + } + + + int main (void) + { + check_vect (); + int i, j; + float diff; + + for (i = 0; i < M; i++) + coeff[i] = i; + for (i = 0; i < N+M; i++) + in[i] = i; + + foo (); + fir (); + + for (i = 0; i < N; i++) { + if (out[i] != fir_out[i]) + abort (); + } + + return 0; + } + + /* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 2 "vect" { xfail *-*-* } } } */ + /* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail vect_no_align } } } */ + /* { dg-final { cleanup-tree-dump "vect" } } */ Index: testsuite/gcc.dg/vect/no-scevccp-outer-21.c =================================================================== *** testsuite/gcc.dg/vect/no-scevccp-outer-21.c (revision 0) --- testsuite/gcc.dg/vect/no-scevccp-outer-21.c (revision 0) *************** *** 0 **** --- 1,62 ---- + /* { dg-require-effective-target vect_int } */ + + #include + #include "tree-vect.h" + + #define N 40 + + int a[N]; + + int + foo (){ + int i; + unsigned short j; + int sum = 0; + unsigned short sum_j; + + for (i = 0; i < N; i++) { + sum += i; + + sum_j = i; + for (j = 0; j < N; j++) { + sum_j += j; + } + a[i] = sum_j + 5; + } + return sum; + } + + int main (void) + { + int i; + unsigned short j, sum_j; + int sum = 0; + int res; + + check_vect (); + + for (i=0; i + #include "tree-vect.h" + + #define N 16 + + unsigned short in[N]; + + unsigned int + foo (short scale){ + int i; + unsigned short j; + unsigned int sum = 0; + unsigned short sum_j; + + for (i = 0; i < N; i++) { + sum_j = 0; + for (j = 0; j < N; j++) { + sum_j += j; + } + sum += ((unsigned int) in[i] * (unsigned int) sum_j) >> scale; + } + return sum; + } + + unsigned short + bar (void) + { + unsigned short j; + unsigned short sum_j; + sum_j = 0; + for (j = 0; j < N; j++) { + sum_j += j; + } + return sum_j; + } + + int main (void) + { + int i; + 
unsigned short j, sum_j; + unsigned int sum = 0; + unsigned int res; + + check_vect (); + + for (i=0; i> 2; + } + if (res != sum) + abort (); + + return 0; + } + + /* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED." 1 "vect" { target vect_widen_mult_hi_to_si } } } */ + /* { dg-final { cleanup-tree-dump "vect" } } */ Index: testsuite/gcc.dg/vect/no-section-anchors-vect-outer-4h.c =================================================================== *** testsuite/gcc.dg/vect/no-section-anchors-vect-outer-4h.c (revision 0) --- testsuite/gcc.dg/vect/no-section-anchors-vect-outer-4h.c (revision 0) *************** *** 0 **** --- 1,47 ---- + /* { dg-require-effective-target vect_int } */ + #include + #include "tree-vect.h" + + + #define N 40 + #define M 128 + unsigned short a[M][N]; + unsigned int out[N]; + + /* Outer-loop vectorization. */ + + void + foo (){ + int i,j; + unsigned int diff; + + for (i = 0; i < N; i++) { + for (j = 0; j < M; j++) { + a[j][i] = 4; + } + out[i]=5; + } + } + + int main (void) + { + int i, j; + check_vect (); + + foo (); + + for (i = 0; i < N; i++) { + for (j = 0; j < M; j++) { + if (a[j][i] != 4) + abort (); + } + if (out[i] != 5) + abort (); + } + + return 0; + } + + + /* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" } } */ + /* { dg-final { cleanup-tree-dump "vect" } } */ Index: testsuite/gcc.dg/vect/no-scevccp-outer-22.c =================================================================== *** testsuite/gcc.dg/vect/no-scevccp-outer-22.c (revision 0) --- testsuite/gcc.dg/vect/no-scevccp-outer-22.c (revision 0) *************** *** 0 **** --- 1,54 ---- + /* { dg-require-effective-target vect_int } */ + + #include + #include "tree-vect.h" + + #define N 40 + + int a[N]; + + int + foo (int n){ + int i,j; + int sum; + + if (n<=0) + return 0; + + /* inner-loop index j used after the inner-loop */ + for (i = 0; i < N; i++) { + sum = 0; + for (j = 0; j < n; j+=2) { + sum += j; + } + a[i] = sum + j; + } + } + + int main 
(void) + { + int i,j; + int sum; + + check_vect (); + + for (i=0; i + #include "tree-vect.h" + + #define N 40 + + int a[N]; + + int + foo (){ + int i,j; + int sum; + + /* inner-loop step > 1 */ + for (i = 0; i < N; i++) { + sum = 0; + for (j = 0; j < N; j+=2) { + sum += j; + } + a[i] = sum; + } + } + + int main (void) + { + int i,j; + int sum; + + check_vect (); + + for (i=0; i + #include "tree-vect.h" + + #define N 40 + #define M 128 + unsigned short in[N+M]; + unsigned int out[N]; + unsigned char arr[N]; + + /* Outer-loop vectorization. */ + /* Not vectorized due to multiple-types in the inner-loop. */ + + unsigned int + foo (){ + int i,j; + unsigned int diff; + unsigned int s=0; + + for (i = 0; i < N; i++) { + arr[i] = 3; + diff = 0; + for (j = 0; j < M; j+=8) { + diff += in[j+i]; + } + s+=diff; + } + return s; + } + + unsigned int + bar (int i, unsigned int diff, unsigned short *in) + { + int j; + for (j = 0; j < M; j+=8) { + diff += in[j+i]; + } + return diff; + } + + int main (void) + { + int i, j; + unsigned int diff; + unsigned int s=0,sum=0; + + check_vect (); + + for (i = 0; i < N+M; i++) { + in[i] = i; + } + + sum=foo (); + + for (i = 0; i < N; i++) { + arr[i] = 3; + diff = 0; + diff = bar (i, diff, in); + s += diff; + } + + if (s != sum) + abort (); + + return 0; + } + + /* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ + /* { dg-final { scan-tree-dump-times "vect_recog_widen_sum_pattern: not allowed" 1 "vect" } } */ + /* { dg-final { cleanup-tree-dump "vect" } } */ Index: testsuite/gcc.dg/vect/no-scevccp-outer-14.c =================================================================== *** testsuite/gcc.dg/vect/no-scevccp-outer-14.c (revision 0) --- testsuite/gcc.dg/vect/no-scevccp-outer-14.c (revision 0) *************** *** 0 **** --- 1,61 ---- + /* { dg-require-effective-target vect_int } */ + + #include + #include "tree-vect.h" + + #define N 64 + + unsigned short + foo (short scale){ + int i; + unsigned short 
j; + unsigned short sum = 0; + unsigned short sum_j; + + for (i = 0; i < N; i++) { + sum_j = 0; + for (j = 0; j < N; j++) { + sum_j += j; + } + sum += sum_j; + } + return sum; + } + + unsigned short + bar (void) + { + unsigned short j; + unsigned short sum_j; + sum_j = 0; + for (j = 0; j < N; j++) { + sum_j += j; + } + return sum_j; + } + + int main (void) + { + int i; + unsigned short j, sum_j; + unsigned short sum = 0; + unsigned short res; + + check_vect (); + + res = foo (2); + + /* check results: */ + for (i=0; i - #include "tree-vect.h" - - #define N 26 - - int main1 (int X) - { - int s = X; - int i; - - /* vectorization of reduction with induction. - Need -fno-tree-scev-cprop or else the loop is eliminated. */ - for (i = 0; i < N; i++) - s += i; - - return s; - } - - int main (void) - { - int s; - check_vect (); - - s = main1 (3); - if (s != 328) - abort (); - - return 0; - } - - /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ - /* { dg-final { cleanup-tree-dump "vect" } } */ --- 0 ---- Index: testsuite/gcc.dg/vect/vect-outer-1.c =================================================================== *** testsuite/gcc.dg/vect/vect-outer-1.c (revision 0) --- testsuite/gcc.dg/vect/vect-outer-1.c (revision 0) *************** *** 0 **** --- 1,26 ---- + /* { dg-do compile } */ + + #define N 40 + signed short image[N][N] __attribute__ ((__aligned__(16))); + signed short block[N][N] __attribute__ ((__aligned__(16))); + signed short out[N] __attribute__ ((__aligned__(16))); + + /* Can't do outer-loop vectorization because of non-consecutive access. 
*/ + + void + foo (){ + int i,j; + int diff; + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < N; j+=8) { + diff += (image[i][j] - block[i][j]); + } + out[i]=diff; + } + } + + /* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ + /* { dg-final { scan-tree-dump-times "strided access in outer loop" 1 "vect" } } */ + /* { dg-final { cleanup-tree-dump "vect" } } */ Index: testsuite/gcc.dg/vect/no-scevccp-outer-4.c =================================================================== *** testsuite/gcc.dg/vect/no-scevccp-outer-4.c (revision 0) --- testsuite/gcc.dg/vect/no-scevccp-outer-4.c (revision 0) *************** *** 0 **** --- 1,55 ---- + /* { dg-require-effective-target vect_int } */ + + #include + #include "tree-vect.h" + + #define N 40 + + int a[N]; + + /* induction variable k advances through inner and outer loops. */ + + int + foo (int n){ + int i,j,k=0; + int sum; + + if (n<=0) + return 0; + + for (i = 0; i < N; i++) { + sum = 0; + for (j = 0; j < n; j+=2) { + sum += k++; + } + a[i] = sum + j; + } + } + + int main (void) + { + int i,j,k=0; + int sum; + + check_vect (); + + for (i=0; i + #include "tree-vect.h" + + #define N 40 + #define M 128 + float in[N+M]; + float out[N]; + + /* Outer-loop vectorization. 
*/ + + void + foo (){ + int i,j; + float diff; + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < M; j+=4) { + diff += in[j+i]; + } + out[i]=diff; + } + } + + int main (void) + { + check_vect (); + int i, j; + float diff; + + for (i = 0; i < N; i++) + in[i] = i; + + foo (); + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < M; j+=4) { + diff += in[j+i]; + } + if (out[i] != diff) + abort (); + } + + return 0; + } + + /* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" } } */ + /* { dg-final { cleanup-tree-dump "vect" } } */ Index: testsuite/gcc.dg/vect/vect-outer-4l.c =================================================================== *** testsuite/gcc.dg/vect/vect-outer-4l.c (revision 0) --- testsuite/gcc.dg/vect/vect-outer-4l.c (revision 0) *************** *** 0 **** --- 1,70 ---- + /* { dg-require-effective-target vect_int } */ + #include + #include "tree-vect.h" + + #define N 40 + #define M 128 + unsigned short in[N+M]; + unsigned int out[N]; + unsigned char arr[N]; + + /* Outer-loop vectorization. */ + /* Not vectorized due to multiple-types in the inner-loop. 
*/ + + unsigned int + foo (){ + int i,j; + unsigned int diff; + unsigned int s=0; + + for (i = 0; i < N; i++) { + arr[i] = 3; + diff = 0; + for (j = 0; j < M; j+=8) { + diff += in[j+i]; + } + s+=diff; + } + return s; + } + + unsigned int + bar (int i, unsigned int diff, unsigned short *in) + { + int j; + for (j = 0; j < M; j+=8) { + diff += in[j+i]; + } + return diff; + } + + int main (void) + { + int i, j; + unsigned int diff; + unsigned int s=0,sum=0; + + check_vect (); + + for (i = 0; i < N+M; i++) { + in[i] = i; + } + + sum=foo (); + + for (i = 0; i < N; i++) { + arr[i] = 3; + diff = 0; + diff = bar (i, diff, in); + s += diff; + } + + if (s != sum) + abort (); + + return 0; + } + + /* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ + /* { dg-final { scan-tree-dump-times "vect_recog_widen_sum_pattern: not allowed" 1 "vect" } } */ + /* { dg-final { cleanup-tree-dump "vect" } } */ Index: testsuite/gcc.dg/vect/no-scevccp-outer-15.c =================================================================== *** testsuite/gcc.dg/vect/no-scevccp-outer-15.c (revision 0) --- testsuite/gcc.dg/vect/no-scevccp-outer-15.c (revision 0) *************** *** 0 **** --- 1,48 ---- + /* { dg-require-effective-target vect_int } */ + + #include + #include "tree-vect.h" + + #define N 40 + + int a[N]; + + int + foo (int x){ + int i,j; + int sum; + + for (i = 0; i < N; i++) { + sum = 0; + for (j = 0; j < N; j++) { + sum += j; + } + a[i] = sum + i + x; + } + } + + int main (void) + { + int i,j; + int sum; + int aa[N]; + + check_vect (); + + foo (3); + + /* check results: */ + for (i=0; i - #include "tree-vect.h" - - #define N 16 - - int main1 () - { - int arr1[N]; - int k = 0; - int m = 3, i = 0; - - /* Vectorization of induction that is used after the loop. - Currently vectorizable because scev_ccp disconnects the - use-after-the-loop from the iv def inside the loop. 
*/ - - do { - k = k + 2; - arr1[i] = k; - m = m + k; - i++; - } while (i < N); - - /* check results: */ - for (i = 0; i < N; i++) - { - if (arr1[i] != 2+2*i) - abort (); - } - - return m + k; - } - - int main (void) - { - int res; - - check_vect (); - - res = main1 (); - if (res != 32 + 275) - abort (); - - return 0; - } - - /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail *-*-* } } } */ - /* { dg-final { cleanup-tree-dump "vect" } } */ --- 0 ---- Index: testsuite/gcc.dg/vect/vect.exp =================================================================== *** testsuite/gcc.dg/vect/vect.exp (revision 127202) --- testsuite/gcc.dg/vect/vect.exp (working copy) *************** dg-runtest [lsort [glob -nocomplain $src *** 176,183 **** # -fno-tree-scev-cprop set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS lappend DEFAULT_VECTCFLAGS "-fno-tree-scev-cprop" ! dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-tree-scev-cprop-*.\[cS\]]] \ ! "" $DEFAULT_VECTCFLAGS # -fno-tree-dominator-opts set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS --- 176,195 ---- # -fno-tree-scev-cprop set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS lappend DEFAULT_VECTCFLAGS "-fno-tree-scev-cprop" ! dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-scevccp-vect-*.\[cS\]]] \ ! "" $DEFAULT_VECTCFLAGS ! ! # -fno-tree-scev-cprop ! set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS ! lappend DEFAULT_VECTCFLAGS "-fno-tree-scev-cprop" ! dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-scevccp-outer-*.\[cS\]]] \ ! "" $DEFAULT_VECTCFLAGS ! ! # -fno-tree-scev-cprop -fno-tree-reassoc ! set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS ! lappend DEFAULT_VECTCFLAGS "-fno-tree-scev-cprop" "-fno-tree-reassoc" ! dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-scevccp-noreassoc-*.\[cS\]]] \ ! 
"" $DEFAULT_VECTCFLAGS # -fno-tree-dominator-opts set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS Index: testsuite/gcc.dg/vect/vect-outer-2.c =================================================================== *** testsuite/gcc.dg/vect/vect-outer-2.c (revision 0) --- testsuite/gcc.dg/vect/vect-outer-2.c (revision 0) *************** *** 0 **** --- 1,40 ---- + /* { dg-require-effective-target vect_float } */ + #include + #include "tree-vect.h" + + #define N 40 + float image[N][N] __attribute__ ((__aligned__(16))); + float out[N]; + + /* Outer-loop vectorization. */ + + void + foo (){ + int i,j; + + for (i = 0; i < N; i++) { + for (j = 0; j < N; j++) { + image[j][i] = j+i; + } + } + } + + int main (void) + { + check_vect (); + int i, j; + + foo (); + + for (i = 0; i < N; i++) { + for (j = 0; j < N; j++) { + if (image[j][i] != j+i) + abort (); + } + } + + return 0; + } + + /* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" } } */ + /* { dg-final { cleanup-tree-dump "vect" } } */ Index: testsuite/gcc.dg/vect/no-scevccp-outer-5.c =================================================================== *** testsuite/gcc.dg/vect/no-scevccp-outer-5.c (revision 0) --- testsuite/gcc.dg/vect/no-scevccp-outer-5.c (revision 0) *************** *** 0 **** --- 1,53 ---- + /* { dg-require-effective-target vect_int } */ + + #include + #include "tree-vect.h" + + #define N 40 + + int a[N]; + + int + foo (){ + int i,j; + int sum; + + for (i = 0; i < N; i++) { + sum = 0; + for (j = 0; j < N; j++) { + sum += j; + } + a[i] += sum + i; + } + } + + int main (void) + { + int i,j; + int sum; + int aa[N]; + + check_vect (); + + for (i=0; i + #include "tree-vect.h" + + #define N 40 + + int a[N]; + + int + foo (int n){ + int i,j; + int sum; + + if (n<=0) + return 0; + + for (i = 0; i < N; i++) { + sum = 0; + j = 0; + do { + sum += j; + }while (++j < n); + a[i] = sum; + } + } + + int main (void) + { + int i,j; + int sum; + + check_vect (); + + for (i=0; i + #include "tree-vect.h" + 
+ #define N 40 + #define M 128 + unsigned short in[N+M]; + unsigned int out[N]; + + /* Outer-loop vectorization. */ + /* Not vectorized due to multiple-types in the inner-loop. */ + + unsigned int + foo (){ + int i,j; + unsigned int diff; + unsigned int s=0; + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < M; j+=8) { + diff += in[j+i]; + } + s+=((unsigned short)diff>>3); + } + return s; + } + + int main (void) + { + int i, j; + unsigned int diff; + unsigned int s=0,sum=0; + + check_vect (); + + for (i = 0; i < N+M; i++) { + in[i] = i; + } + + sum=foo (); + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < M; j+=8) { + diff += in[j+i]; + } + s += ((unsigned short)diff>>3); + } + + if (s != sum) + abort (); + + return 0; + } + + /* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ + /* { dg-final { cleanup-tree-dump "vect" } } */ Index: testsuite/gcc.dg/vect/no-scevccp-outer-16.c =================================================================== *** testsuite/gcc.dg/vect/no-scevccp-outer-16.c (revision 0) --- testsuite/gcc.dg/vect/no-scevccp-outer-16.c (revision 0) *************** *** 0 **** --- 1,62 ---- + /* { dg-require-effective-target vect_int } */ + + #include + #include "tree-vect.h" + + #define N 40 + + int a[N]; + + int + foo (){ + int i; + unsigned short j; + int sum = 0; + unsigned short sum_j; + + for (i = 0; i < N; i++) { + sum += i; + + sum_j = 0; + for (j = 0; j < N; j++) { + sum_j += j; + } + a[i] = sum_j + 5; + } + return sum; + } + + int main (void) + { + int i; + unsigned short j, sum_j; + int sum = 0; + int res; + + check_vect (); + + for (i=0; i + #include "tree-vect.h" + + #define N 40 + #define M 128 + float in[N+M]; + float coeff[M]; + float out[N]; + float fir_out[N]; + + /* Should be vectorized. Fixed misaligment in the inner-loop. 
*/ + /* Currently not vectorized because we get too many BBs in the inner-loop, + because the compiler doesn't realize that the inner-loop executes at + least once (cause k<4), and so there's no need to create a guard code + to skip the inner-loop in case it doesn't execute. */ + void foo (){ + int i,j,k; + float diff; + + for (i = 0; i < N; i++) { + out[i] = 0; + } + + for (k = 0; k < 4; k++) { + for (i = 0; i < N; i++) { + diff = 0; + for (j = k; j < M; j+=4) { + diff += in[j+i]*coeff[j]; + } + out[i] += diff; + } + } + + } + + /* Vectorized. Changing misalignment in the inner-loop. */ + void fir (){ + int i,j,k; + float diff; + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < M; j++) { + diff += in[j+i]*coeff[j]; + } + fir_out[i] = diff; + } + } + + + int main (void) + { + check_vect (); + int i, j; + float diff; + + for (i = 0; i < M; i++) + coeff[i] = i; + for (i = 0; i < N+M; i++) + in[i] = i; + + foo (); + fir (); + + for (i = 0; i < N; i++) { + if (out[i] != fir_out[i]) + abort (); + } + + return 0; + } + + /* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 2 "vect" { xfail *-*-* } } } */ + /* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail vect_no_align } } } */ + /* { dg-final { cleanup-tree-dump "vect" } } */ Index: testsuite/gcc.dg/vect/vect-outer-2a.c =================================================================== *** testsuite/gcc.dg/vect/vect-outer-2a.c (revision 0) --- testsuite/gcc.dg/vect/vect-outer-2a.c (revision 0) *************** *** 0 **** --- 1,41 ---- + /* { dg-require-effective-target vect_float } */ + #include + #include "tree-vect.h" + + #define N 40 + float image[N][N][N] __attribute__ ((__aligned__(16))); + + void + foo (){ + int i,j,k; + + for (k=0; k + #include "tree-vect.h" + + #define N 40 + float image[N][N] __attribute__ ((__aligned__(16))); + float out[N]; + + /* Outer-loop vectoriation. 
*/ + + void + foo (){ + int i,j; + float diff; + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < N; j++) { + diff += image[j][i]; + } + out[i]=diff; + } + } + + int main (void) + { + check_vect (); + int i, j; + float diff; + + for (i = 0; i < N; i++) { + for (j = 0; j < N; j++) { + image[i][j]=i+j; + } + } + + foo (); + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < N; j++) { + diff += image[j][i]; + } + if (out[i] != diff) + abort (); + } + + return 0; + } + + /* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" } } */ + /* { dg-final { cleanup-tree-dump "vect" } } */ Index: testsuite/gcc.dg/vect/no-scevccp-outer-6.c =================================================================== *** testsuite/gcc.dg/vect/no-scevccp-outer-6.c (revision 0) --- testsuite/gcc.dg/vect/no-scevccp-outer-6.c (revision 0) *************** *** 0 **** --- 1,56 ---- + /* { dg-require-effective-target vect_int } */ + + #include + #include "tree-vect.h" + + #define N 40 + + int + foo (int * __restrict__ b, int k){ + int i,j; + int sum,x; + int a[N]; + + for (i = 0; i < N; i++) { + sum = b[i]; + for (j = 0; j < N; j++) { + sum += j; + } + a[i] = sum; + } + + return a[k]; + } + + int main (void) + { + int i,j; + int sum; + int b[N]; + int a[N]; + + check_vect (); + + for (i=0; i + #include "tree-vect.h" + + #define N 40 + + int a[N]; + + int + foo (int n){ + int i,j; + int sum; + + if (n<=0) + return 0; + + for (i = 0; i < N; i++) { + sum = 0; + for (j = 0; j < n; j++) { + sum += j; + } + a[i] = sum; + } + } + + int main (void) + { + int i,j; + int sum; + + check_vect (); + + for (i=0; i + #include "tree-vect.h" + + #define N 40 + #define M 128 + unsigned short in[N+M]; + unsigned int out[N]; + unsigned char arr[N]; + + /* Outer-loop vectorization. */ + /* Not vectorized due to multiple-types in the inner-loop. 
*/ + + unsigned int + foo (){ + int i,j; + unsigned int diff; + unsigned int s=0; + + for (i = 0; i < N; i++) { + arr[i] = 3; + diff = 0; + for (j = 0; j < M; j+=8) { + diff += in[j+i]; + } + s+=diff; + } + return s; + } + + unsigned int + bar (int i, unsigned int diff, unsigned short *in) + { + int j; + for (j = 0; j < M; j+=8) { + diff += in[j+i]; + } + return diff; + } + + int main (void) + { + int i, j; + unsigned int diff; + unsigned int s=0,sum=0; + + check_vect (); + + for (i = 0; i < N+M; i++) { + in[i] = i; + } + + sum=foo (); + + for (i = 0; i < N; i++) { + arr[i] = 3; + diff = 0; + diff = bar (i, diff, in); + s += diff; + } + + if (s != sum) + abort (); + + return 0; + } + + /* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ + /* { dg-final { scan-tree-dump-times "vect_recog_widen_sum_pattern: not allowed" 1 "vect" } } */ + /* { dg-final { cleanup-tree-dump "vect" } } */ Index: testsuite/gcc.dg/vect/no-scevccp-outer-17.c =================================================================== *** testsuite/gcc.dg/vect/no-scevccp-outer-17.c (revision 0) --- testsuite/gcc.dg/vect/no-scevccp-outer-17.c (revision 0) *************** *** 0 **** --- 1,68 ---- + /* { dg-require-effective-target vect_int } */ + + #include + #include "tree-vect.h" + + #define N 40 + + int a[N]; + int b[N]; + int c[N]; + + int + foo (){ + int i; + unsigned short j; + int sum = 0; + unsigned short sum_j; + + for (i = 0; i < N; i++) { + int diff = b[i] - c[i]; + + sum_j = 0; + for (j = 0; j < N; j++) { + sum_j += j; + } + a[i] = sum_j + 5; + + sum += diff; + } + return sum; + } + + int main (void) + { + int i; + unsigned short j, sum_j; + int sum = 0; + int res; + + check_vect (); + + for (i=0; inum_nodes; i++) { basic_block bb = bbs[i]; tree phi; for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi)) { stmt_ann_t ann = get_stmt_ann (phi); --- 1396,1444 ---- loop_vec_info res; basic_block *bbs; block_stmt_iterator si; ! 
unsigned int i, nbbs; res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info)); + LOOP_VINFO_LOOP (res) = loop; bbs = get_loop_body (loop); ! /* Create/Update stmt_info for all stmts in the loop. */ for (i = 0; i < loop->num_nodes; i++) { basic_block bb = bbs[i]; tree phi; + /* BBs in a nested inner-loop will have been already processed (because + we will have called vect_analyze_loop_form for any nested inner-loop). + Therefore, for stmts in an inner-loop we just want to update the + STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new + loop_info of the outer-loop we are currently considering to vectorize + (instead of the loop_info of the inner-loop). + For stmts in other BBs we need to create a stmt_info from scratch. */ + if (bb->loop_father != loop) + { + /* Inner-loop bb. */ + gcc_assert (loop->inner && bb->loop_father == loop->inner); + for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi)) + { + stmt_vec_info stmt_info = vinfo_for_stmt (phi); + loop_vec_info inner_loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo)); + STMT_VINFO_LOOP_VINFO (stmt_info) = res; + } + for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si)) + { + tree stmt = bsi_stmt (si); + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + loop_vec_info inner_loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo)); + STMT_VINFO_LOOP_VINFO (stmt_info) = res; + } + } + else + { + /* bb in current nest. */ for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi)) { stmt_ann_t ann = get_stmt_ann (phi); *************** new_loop_vec_info (struct loop *loop) *** 1396,1411 **** for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si)) { tree stmt = bsi_stmt (si); ! stmt_ann_t ann; ! ! 
ann = stmt_ann (stmt); set_stmt_info (ann, new_stmt_vec_info (stmt, res)); } } - LOOP_VINFO_LOOP (res) = loop; LOOP_VINFO_BBS (res) = bbs; - LOOP_VINFO_EXIT_COND (res) = NULL; LOOP_VINFO_NITERS (res) = NULL; LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0; LOOP_VINFO_VECTORIZABLE_P (res) = 0; --- 1448,1471 ---- for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si)) { tree stmt = bsi_stmt (si); ! stmt_ann_t ann = stmt_ann (stmt); set_stmt_info (ann, new_stmt_vec_info (stmt, res)); } } + } + + /* CHECKME: We want to visit all BBs before their successors (except for + latch blocks, for which this assertion wouldn't hold). In the simple + case of the loop forms we allow, a dfs order of the BBs would the same + as reversed postorder traversal, so we are safe. */ + + free (bbs); + bbs = XCNEWVEC (basic_block, loop->num_nodes); + nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p, + bbs, loop->num_nodes, loop); + gcc_assert (nbbs == loop->num_nodes); LOOP_VINFO_BBS (res) = bbs; LOOP_VINFO_NITERS (res) = NULL; LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0; LOOP_VINFO_VECTORIZABLE_P (res) = 0; *************** new_loop_vec_info (struct loop *loop) *** 1427,1433 **** stmts in the loop. */ void ! destroy_loop_vec_info (loop_vec_info loop_vinfo) { struct loop *loop; basic_block *bbs; --- 1487,1493 ---- stmts in the loop. */ void ! 
destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts) { struct loop *loop; basic_block *bbs; *************** destroy_loop_vec_info (loop_vec_info loo *** 1443,1448 **** --- 1503,1520 ---- bbs = LOOP_VINFO_BBS (loop_vinfo); nbbs = loop->num_nodes; + if (!clean_stmts) + { + free (LOOP_VINFO_BBS (loop_vinfo)); + free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo)); + free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo)); + VEC_free (tree, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)); + + free (loop_vinfo); + loop->aux = NULL; + return; + } + for (j = 0; j < nbbs; j++) { basic_block bb = bbs[j]; *************** get_vectype_for_scalar_type (tree scalar *** 1586,1608 **** enum dr_alignment_support vect_supportable_dr_alignment (struct data_reference *dr) { ! tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))); enum machine_mode mode = (int) TYPE_MODE (vectype); if (aligned_access_p (dr)) return dr_aligned; /* Possibly unaligned access. */ ! if (DR_IS_READ (dr)) { if (vec_realign_load_optab->handlers[mode].insn_code != CODE_FOR_nothing && (!targetm.vectorize.builtin_mask_for_load || targetm.vectorize.builtin_mask_for_load ())) ! return dr_unaligned_software_pipeline; ! if (movmisalign_optab->handlers[mode].insn_code != CODE_FOR_nothing) - /* Can't software pipeline the loads, but can at least do them. */ return dr_unaligned_supported; } --- 1658,1778 ---- enum dr_alignment_support vect_supportable_dr_alignment (struct data_reference *dr) { ! tree stmt = DR_STMT (dr); ! stmt_vec_info stmt_info = vinfo_for_stmt (stmt); ! 
tree vectype = STMT_VINFO_VECTYPE (stmt_info); enum machine_mode mode = (int) TYPE_MODE (vectype); + struct loop *vect_loop = LOOP_VINFO_LOOP (STMT_VINFO_LOOP_VINFO (stmt_info)); + bool nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt); + bool invariant_in_outerloop = false; if (aligned_access_p (dr)) return dr_aligned; + if (nested_in_vect_loop) + { + tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info); + invariant_in_outerloop = + (tree_int_cst_compare (outerloop_step, size_zero_node) == 0); + } + /* Possibly unaligned access. */ ! ! /* We can choose between using the implicit realignment scheme (generating ! a misaligned_move stmt) and the explicit realignment scheme (generating ! aligned loads with a REALIGN_LOAD). There are two variants to the explicit ! realignment scheme: optimized, and unoptimized. ! We can optimize the realignment only if the step between consecutive ! vector loads is equal to the vector size. Since the vector memory ! accesses advance in steps of VS (Vector Size) in the vectorized loop, it ! is guaranteed that the misalignment amount remains the same throughout the ! execution of the vectorized loop. Therefore, we can create the ! "realignment token" (the permutation mask that is passed to REALIGN_LOAD) ! at the loop preheader. ! ! However, in the case of outer-loop vectorization, when vectorizing a ! memory access in the inner-loop nested within the LOOP that is now being ! vectorized, while it is guaranteed that the misalignment of the ! vectorized memory access will remain the same in different outer-loop ! iterations, it is *not* guaranteed that is will remain the same throughout ! the execution of the inner-loop. This is because the inner-loop advances ! with the original scalar step (and not in steps of VS). If the inner-loop ! step happens to be a multiple of VS, then the misalignment remaines fixed ! and we can use the optimized relaignment scheme. For example: ! ! for (i=0; i; ! vs += va; ! } ! ! 
On the other hand, when vectorizing the i-loop in the following example ! (that implements the same computation as above): ! ! for (k=0; k<4; k++) ! for (i=0; i; ! vs += va; ! v1 = v2; ! } ! } ! } */ ! if (DR_IS_READ (dr)) { if (vec_realign_load_optab->handlers[mode].insn_code != CODE_FOR_nothing && (!targetm.vectorize.builtin_mask_for_load || targetm.vectorize.builtin_mask_for_load ())) ! { ! if (nested_in_vect_loop ! && TREE_INT_CST_LOW (DR_STEP (dr)) != UNITS_PER_SIMD_WORD) ! return dr_explicit_realign; ! else ! return dr_explicit_realign_optimized; ! } if (movmisalign_optab->handlers[mode].insn_code != CODE_FOR_nothing) return dr_unaligned_supported; } *************** vect_is_simple_use (tree operand, loop_v *** 1714,1721 **** { case PHI_NODE: *def = PHI_RESULT (*def_stmt); - gcc_assert (*dt == vect_induction_def || *dt == vect_reduction_def - || *dt == vect_invariant_def); break; case GIMPLE_MODIFY_STMT: --- 1884,1889 ---- *************** supportable_widening_operation (enum tre *** 1756,1761 **** --- 1924,1931 ---- enum tree_code *code1, enum tree_code *code2) { stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info); + struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info); bool ordered_p; enum machine_mode vec_mode; enum insn_code icode1, icode2; *************** supportable_widening_operation (enum tre *** 1778,1786 **** Some targets can take advantage of this and generate more efficient code. For example, targets like Altivec, that support widen_mult using a sequence of {mult_even,mult_odd} generate the following vectors: ! vect1: [res1,res3,res5,res7], vect2: [res2,res4,res6,res8]. */ ! if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_by_reduction) ordered_p = false; else ordered_p = true; --- 1948,1962 ---- Some targets can take advantage of this and generate more efficient code. 
For example, targets like Altivec, that support widen_mult using a sequence of {mult_even,mult_odd} generate the following vectors: ! vect1: [res1,res3,res5,res7], vect2: [res2,res4,res6,res8]. ! ! When vectorizaing outer-loops, we execute the inner-loop sequentially ! (each vectorized inner-loop iteration contributes to VF outer-loop ! iterations in parallel). We therefore don't allow to change the order ! of the computation in the inner-loop during outer-loop vectorization. */ ! if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_by_reduction ! && !nested_in_vect_loop_p (vect_loop, stmt)) ordered_p = false; else ordered_p = true; *************** reduction_code_for_scalar_code (enum tre *** 2004,2011 **** Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized. */ tree ! vect_is_simple_reduction (struct loop *loop, tree phi) { edge latch_e = loop_latch_edge (loop); tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); tree def_stmt, def1, def2; --- 2180,2189 ---- Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized. */ tree ! vect_is_simple_reduction (loop_vec_info loop_info, tree phi) { + struct loop *loop = (bb_for_stmt (phi))->loop_father; + struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info); edge latch_e = loop_latch_edge (loop); tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); tree def_stmt, def1, def2; *************** vect_is_simple_reduction (struct loop *l *** 2018,2023 **** --- 2196,2203 ---- imm_use_iterator imm_iter; use_operand_p use_p; + gcc_assert (loop == vect_loop || flow_loop_nested_p (vect_loop, loop)); + name = PHI_RESULT (phi); nloop_uses = 0; FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name) *************** vect_is_simple_reduction (struct loop *l *** 2129,2136 **** return NULL_TREE; } /* CHECKME: check for !flag_finite_math_only too? */ ! if (SCALAR_FLOAT_TYPE_P (type) && !flag_unsafe_math_optimizations) { /* Changing the order of operations changes the semantics. 
*/ if (vect_print_dump_info (REPORT_DETAILS)) --- 2309,2324 ---- return NULL_TREE; } + /* Generally, when vectorizing a reduction we change the order of the + computation. This may change the behavior of the program in some + cases, so we need to check that this is ok. One exception is when + vectorizing an outer-loop: the inner-loop is executed sequentially, + and therefore vectorizing reductions in the inner-loop durint + outer-loop vectorization is safe. */ + /* CHECKME: check for !flag_finite_math_only too? */ ! if (SCALAR_FLOAT_TYPE_P (type) && !flag_unsafe_math_optimizations ! && !nested_in_vect_loop_p (vect_loop, def_stmt)) { /* Changing the order of operations changes the semantics. */ if (vect_print_dump_info (REPORT_DETAILS)) *************** vect_is_simple_reduction (struct loop *l *** 2140,2146 **** } return NULL_TREE; } ! else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)) { /* Changing the order of operations changes the semantics. */ if (vect_print_dump_info (REPORT_DETAILS)) --- 2328,2335 ---- } return NULL_TREE; } ! else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type) ! && !nested_in_vect_loop_p (vect_loop, def_stmt)) { /* Changing the order of operations changes the semantics. */ if (vect_print_dump_info (REPORT_DETAILS)) *************** vect_is_simple_reduction (struct loop *l *** 2169,2181 **** /* Check that one def is the reduction def, defined by PHI, ! the other def is either defined in the loop by a GIMPLE_MODIFY_STMT, ! or it's an induction (defined by some phi node). */ if (def2 == phi && flow_bb_inside_loop_p (loop, bb_for_stmt (def1)) && (TREE_CODE (def1) == GIMPLE_MODIFY_STMT ! || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_induction_def)) { if (vect_print_dump_info (REPORT_DETAILS)) { --- 2358,2373 ---- /* Check that one def is the reduction def, defined by PHI, ! the other def is either defined in the loop ("vect_loop_def"), ! or it's an induction (defined by a loop-header phi-node). 
*/ if (def2 == phi && flow_bb_inside_loop_p (loop, bb_for_stmt (def1)) && (TREE_CODE (def1) == GIMPLE_MODIFY_STMT ! || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_induction_def ! || (TREE_CODE (def1) == PHI_NODE ! && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_loop_def ! && !is_loop_header_bb_p (bb_for_stmt (def1))))) { if (vect_print_dump_info (REPORT_DETAILS)) { *************** vect_is_simple_reduction (struct loop *l *** 2187,2193 **** else if (def1 == phi && flow_bb_inside_loop_p (loop, bb_for_stmt (def2)) && (TREE_CODE (def2) == GIMPLE_MODIFY_STMT ! || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) == vect_induction_def)) { /* Swap operands (just for simplicity - so that the rest of the code can assume that the reduction variable is always the last (second) --- 2379,2388 ---- else if (def1 == phi && flow_bb_inside_loop_p (loop, bb_for_stmt (def2)) && (TREE_CODE (def2) == GIMPLE_MODIFY_STMT ! || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) == vect_induction_def ! || (TREE_CODE (def2) == PHI_NODE ! && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) == vect_loop_def ! && !is_loop_header_bb_p (bb_for_stmt (def2))))) { /* Swap operands (just for simplicity - so that the rest of the code can assume that the reduction variable is always the last (second) *************** vectorize_loops (void) *** 2326,2332 **** if (!loop) continue; loop_vinfo = loop->aux; ! destroy_loop_vec_info (loop_vinfo); loop->aux = NULL; } --- 2521,2527 ---- if (!loop) continue; loop_vinfo = loop->aux; ! destroy_loop_vec_info (loop_vinfo, true); loop->aux = NULL; } Index: tree-vectorizer.h =================================================================== *** tree-vectorizer.h (revision 127202) --- tree-vectorizer.h (working copy) *************** enum operation_type { *** 53,59 **** enum dr_alignment_support { dr_unaligned_unsupported, dr_unaligned_supported, ! 
dr_unaligned_software_pipeline, dr_aligned }; --- 53,60 ---- enum dr_alignment_support { dr_unaligned_unsupported, dr_unaligned_supported, ! dr_explicit_realign, ! dr_explicit_realign_optimized, dr_aligned }; *************** typedef struct _loop_vec_info { *** 92,100 **** /* The loop basic blocks. */ basic_block *bbs; - /* The loop exit_condition. */ - tree exit_cond; - /* Number of iterations. */ tree num_iters; --- 93,98 ---- *************** typedef struct _loop_vec_info { *** 144,150 **** /* Access Functions. */ #define LOOP_VINFO_LOOP(L) (L)->loop #define LOOP_VINFO_BBS(L) (L)->bbs - #define LOOP_VINFO_EXIT_COND(L) (L)->exit_cond #define LOOP_VINFO_NITERS(L) (L)->num_iters #define LOOP_VINFO_COST_MODEL_MIN_ITERS(L) (L)->min_profitable_iters #define LOOP_VINFO_VECTORIZABLE_P(L) (L)->vectorizable --- 142,147 ---- *************** typedef struct _loop_vec_info { *** 165,170 **** --- 162,180 ---- #define LOOP_VINFO_NITERS_KNOWN_P(L) \ NITERS_KNOWN_P((L)->num_iters) + static inline loop_vec_info + loop_vec_info_for_loop (struct loop *loop) + { + return (loop_vec_info) loop->aux; + } + + static inline bool + nested_in_vect_loop_p (struct loop *loop, tree stmt) + { + return (loop->inner + && (loop->inner == (bb_for_stmt (stmt))->loop_father)); + } + /*-----------------------------------------------------------------*/ /* Info on vectorized defs. */ /*-----------------------------------------------------------------*/ *************** enum stmt_vec_info_type { *** 180,191 **** induc_vec_info_type, type_promotion_vec_info_type, type_demotion_vec_info_type, ! type_conversion_vec_info_type }; /* Indicates whether/how a variable is used in the loop. */ enum vect_relevant { vect_unused_in_loop = 0, /* defs that feed computations that end up (only) in a reduction. These defs may be used by non-reduction stmts, but eventually, any --- 190,204 ---- induc_vec_info_type, type_promotion_vec_info_type, type_demotion_vec_info_type, ! type_conversion_vec_info_type, ! 
loop_exit_ctrl_vec_info_type }; /* Indicates whether/how a variable is used in the loop. */ enum vect_relevant { vect_unused_in_loop = 0, + vect_used_in_outer_by_reduction, + vect_used_in_outer, /* defs that feed computations that end up (only) in a reduction. These defs may be used by non-reduction stmts, but eventually, any *************** typedef struct _stmt_vec_info { *** 232,240 **** data-ref (array/pointer/struct access). A GIMPLE stmt is expected to have at most one such data-ref. **/ ! /* Information about the data-ref (access function, etc). */ struct data_reference *data_ref_info; /* Stmt is part of some pattern (computation idiom) */ bool in_pattern_p; --- 245,262 ---- data-ref (array/pointer/struct access). A GIMPLE stmt is expected to have at most one such data-ref. **/ ! /* Information about the data-ref (access function, etc), ! relative to the inner-most containing loop. */ struct data_reference *data_ref_info; + /* Information about the data-ref relative to this loop + nest (the loop that is being considered for vectorization). 
*/ + tree dr_base_address; + tree dr_init; + tree dr_offset; + tree dr_step; + tree dr_aligned_to; + /* Stmt is part of some pattern (computation idiom) */ bool in_pattern_p; *************** typedef struct _stmt_vec_info { *** 293,298 **** --- 315,327 ---- #define STMT_VINFO_VECTYPE(S) (S)->vectype #define STMT_VINFO_VEC_STMT(S) (S)->vectorized_stmt #define STMT_VINFO_DATA_REF(S) (S)->data_ref_info + + #define STMT_VINFO_DR_BASE_ADDRESS(S) (S)->dr_base_address + #define STMT_VINFO_DR_INIT(S) (S)->dr_init + #define STMT_VINFO_DR_OFFSET(S) (S)->dr_offset + #define STMT_VINFO_DR_STEP(S) (S)->dr_step + #define STMT_VINFO_DR_ALIGNED_TO(S) (S)->dr_aligned_to + #define STMT_VINFO_IN_PATTERN_P(S) (S)->in_pattern_p #define STMT_VINFO_RELATED_STMT(S) (S)->related_stmt #define STMT_VINFO_SAME_ALIGN_REFS(S) (S)->same_align_refs *************** is_pattern_stmt_p (stmt_vec_info stmt_in *** 403,408 **** --- 432,446 ---- return false; } + static inline bool + is_loop_header_bb_p (basic_block bb) + { + if (bb == (bb->loop_father)->header) + return true; + gcc_assert (EDGE_COUNT (bb->preds) == 1); + return false; + } + /*-----------------------------------------------------------------*/ /* Info on data references alignment. */ /*-----------------------------------------------------------------*/ *************** extern tree get_vectype_for_scalar_type *** 462,468 **** extern bool vect_is_simple_use (tree, loop_vec_info, tree *, tree *, enum vect_def_type *); extern bool vect_is_simple_iv_evolution (unsigned, tree, tree *, tree *); ! extern tree vect_is_simple_reduction (struct loop *, tree); extern bool vect_can_force_dr_alignment_p (tree, unsigned int); extern enum dr_alignment_support vect_supportable_dr_alignment (struct data_reference *); --- 500,506 ---- extern bool vect_is_simple_use (tree, loop_vec_info, tree *, tree *, enum vect_def_type *); extern bool vect_is_simple_iv_evolution (unsigned, tree, tree *, tree *); ! 
extern tree vect_is_simple_reduction (loop_vec_info, tree); extern bool vect_can_force_dr_alignment_p (tree, unsigned int); extern enum dr_alignment_support vect_supportable_dr_alignment (struct data_reference *); *************** extern bool supportable_narrowing_operat *** 474,480 **** /* Creation and deletion of loop and stmt info structs. */ extern loop_vec_info new_loop_vec_info (struct loop *loop); ! extern void destroy_loop_vec_info (loop_vec_info); extern stmt_vec_info new_stmt_vec_info (tree stmt, loop_vec_info); --- 512,518 ---- /* Creation and deletion of loop and stmt info structs. */ extern loop_vec_info new_loop_vec_info (struct loop *loop); ! extern void destroy_loop_vec_info (loop_vec_info, bool); extern stmt_vec_info new_stmt_vec_info (tree stmt, loop_vec_info); Index: tree-data-ref.c =================================================================== *** tree-data-ref.c (revision 127202) --- tree-data-ref.c (working copy) *************** dump_ddrs (FILE *file, VEC (ddr_p, heap) *** 489,495 **** /* Expresses EXP as VAR + OFF, where off is a constant. The type of OFF will be ssizetype. */ ! static void split_constant_offset (tree exp, tree *var, tree *off) { tree type = TREE_TYPE (exp), otype; --- 489,495 ---- /* Expresses EXP as VAR + OFF, where off is a constant. The type of OFF will be ssizetype. */ ! 
void split_constant_offset (tree exp, tree *var, tree *off) { tree type = TREE_TYPE (exp), otype; Index: tree-data-ref.h =================================================================== *** tree-data-ref.h (revision 127202) --- tree-data-ref.h (working copy) *************** index_in_loop_nest (int var, VEC (loop_p *** 388,391 **** --- 388,394 ---- /* In lambda-code.c */ bool lambda_transform_legal_p (lambda_trans_matrix, int, VEC (ddr_p, heap) *); + /* In tree-data-refs.c */ + void split_constant_offset (tree , tree *, tree *); + #endif /* GCC_TREE_DATA_REF_H */ Index: tree-vect-analyze.c =================================================================== *** tree-vect-analyze.c (revision 127202) --- tree-vect-analyze.c (working copy) *************** vect_analyze_operations (loop_vec_info l *** 325,330 **** --- 325,348 ---- print_generic_expr (vect_dump, phi, TDF_SLIM); } + if (! is_loop_header_bb_p (bb)) + { + /* inner-loop loop-closed exit phi in outer-loop vectorization + (i.e. a phi in the tail of the outer-loop). + FORNOW: we currently don't support the case that these phis + are not used in the outerloop, cause this case requires + to actually do something here. */ + if (!STMT_VINFO_RELEVANT_P (stmt_info) + || STMT_VINFO_LIVE_P (stmt_info)) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, + "Unsupported loop-closed phi in outer-loop."); + return false; + } + continue; + } + gcc_assert (stmt_info); if (STMT_VINFO_LIVE_P (stmt_info)) *************** vect_analyze_operations (loop_vec_info l *** 398,404 **** break; case vect_reduction_def: ! gcc_assert (relevance == vect_unused_in_loop); break; case vect_induction_def: --- 416,424 ---- break; case vect_reduction_def: ! gcc_assert (relevance == vect_used_in_outer ! || relevance == vect_used_in_outer_by_reduction ! || relevance == vect_unused_in_loop); break; case vect_induction_def: *************** exist_non_indexing_operands_for_use_p (t *** 589,638 **** } ! 
/* Function vect_analyze_scalar_cycles. ! ! Examine the cross iteration def-use cycles of scalar variables, by ! analyzing the loop (scalar) PHIs; Classify each cycle as one of the ! following: invariant, induction, reduction, unknown. ! ! Some forms of scalar cycles are not yet supported. ! ! Example1: reduction: (unsupported yet) ! ! loop1: ! for (i=0; iheader; tree dumy; VEC(tree,heap) *worklist = VEC_alloc (tree, heap, 64); --- 609,625 ---- } ! /* Function vect_analyze_scalar_cycles_1. ! Examine the cross iteration def-use cycles of scalar variables ! in LOOP. LOOP_VINFO represents the loop that is noe being ! considered for vectorization (can be LOOP, or an outer-loop ! enclosing LOOP). */ static void ! vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop) { tree phi; basic_block bb = loop->header; tree dumy; VEC(tree,heap) *worklist = VEC_alloc (tree, heap, 64); *************** vect_analyze_scalar_cycles (loop_vec_inf *** 698,704 **** gcc_assert (is_gimple_reg (SSA_NAME_VAR (def))); gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type); ! reduc_stmt = vect_is_simple_reduction (loop, phi); if (reduc_stmt) { if (vect_print_dump_info (REPORT_DETAILS)) --- 685,691 ---- gcc_assert (is_gimple_reg (SSA_NAME_VAR (def))); gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type); ! reduc_stmt = vect_is_simple_reduction (loop_vinfo, phi); if (reduc_stmt) { if (vect_print_dump_info (REPORT_DETAILS)) *************** vect_analyze_scalar_cycles (loop_vec_inf *** 717,722 **** --- 704,751 ---- } + /* Function vect_analyze_scalar_cycles. + + Examine the cross iteration def-use cycles of scalar variables, by + analyzing the loop-header PHIs of scalar variables; Classify each + cycle as one of the following: invariant, induction, reduction, unknown. + We do that for the loop represented by LOOP_VINFO, and also to its + inner-loop, if exists. 
+ Examples for scalar cycles: + + Example1: reduction: + + loop1: + for (i=0; iinner) + vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner); + } + + /* Function vect_insert_into_interleaving_chain. Insert DRA into the interleaving chain of DRB according to DRA's INIT. */ *************** vect_compute_data_ref_alignment (struct *** 1164,1169 **** --- 1193,1200 ---- { tree stmt = DR_STMT (dr); stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); tree ref = DR_REF (dr); tree vectype; tree base, base_addr; *************** vect_compute_data_ref_alignment (struct *** 1180,1192 **** misalign = DR_INIT (dr); aligned_to = DR_ALIGNED_TO (dr); base_addr = DR_BASE_ADDRESS (dr); base = build_fold_indirect_ref (base_addr); vectype = STMT_VINFO_VECTYPE (stmt_info); alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT); ! if (tree_int_cst_compare (aligned_to, alignment) < 0) { ! if (vect_print_dump_info (REPORT_DETAILS)) { fprintf (vect_dump, "Unknown alignment for access: "); print_generic_expr (vect_dump, base, TDF_SLIM); --- 1211,1252 ---- misalign = DR_INIT (dr); aligned_to = DR_ALIGNED_TO (dr); base_addr = DR_BASE_ADDRESS (dr); + + /* In case the dataref is in an inner-loop of the loop that is being + vectorized (LOOP), we use the base and misalignment information + relative to the outer-loop (LOOP). This is ok only if the misalignment + stays the same throughout the execution of the inner-loop, which is why + we have to check that the stride of the dataref in the inner-loop evenly + divides by the vector size. 
*/ + if (nested_in_vect_loop_p (loop, stmt)) + { + tree step = DR_STEP (dr); + HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step); + + if (dr_step % UNITS_PER_SIMD_WORD == 0) + { + if (vect_print_dump_info (REPORT_ALIGNMENT)) + fprintf (vect_dump, "inner step divides the vector-size."); + misalign = STMT_VINFO_DR_INIT (stmt_info); + aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info); + base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info); + } + else + { + if (vect_print_dump_info (REPORT_ALIGNMENT)) + fprintf (vect_dump, "inner step doesn't divide the vector-size."); + misalign = NULL_TREE; + } + } + base = build_fold_indirect_ref (base_addr); vectype = STMT_VINFO_VECTYPE (stmt_info); alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT); ! if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0) ! || !misalign) { ! if (vect_print_dump_info (REPORT_ALIGNMENT)) { fprintf (vect_dump, "Unknown alignment for access: "); print_generic_expr (vect_dump, base, TDF_SLIM); *************** vect_enhance_data_refs_alignment (loop_v *** 1722,1728 **** 4) all misaligned data refs with a known misalignment are supported, and 5) the number of runtime alignment checks is within reason. */ ! do_versioning = flag_tree_vect_loop_version && (!optimize_size); if (do_versioning) { --- 1782,1791 ---- 4) all misaligned data refs with a known misalignment are supported, and 5) the number of runtime alignment checks is within reason. */ ! do_versioning = ! flag_tree_vect_loop_version ! && (!optimize_size) ! && (!loop->inner); if (do_versioning) { *************** static bool *** 1855,1874 **** vect_analyze_data_ref_access (struct data_reference *dr) { tree step = DR_STEP (dr); - HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step); tree scalar_type = TREE_TYPE (DR_REF (dr)); HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type)); tree stmt = DR_STMT (dr); ! /* For interleaving, STRIDE is STEP counted in elements, i.e., the size of the ! 
interleaving group (including gaps). */ ! HOST_WIDE_INT stride = dr_step / type_size; ! if (!step) { ! if (vect_print_dump_info (REPORT_DETAILS)) ! fprintf (vect_dump, "bad data-ref access"); ! return false; } /* Consecutive? */ if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))) --- 1918,1956 ---- vect_analyze_data_ref_access (struct data_reference *dr) { tree step = DR_STEP (dr); tree scalar_type = TREE_TYPE (DR_REF (dr)); HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type)); tree stmt = DR_STMT (dr); ! stmt_vec_info stmt_info = vinfo_for_stmt (stmt); ! loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); ! struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); ! HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step); ! HOST_WIDE_INT stride; ! /* Don't allow invariant accesses. */ ! if (dr_step == 0) ! return false; ! ! if (nested_in_vect_loop_p (loop, stmt)) { ! /* For the rest of the analysis we use the outer-loop step. */ ! step = STMT_VINFO_DR_STEP (stmt_info); ! dr_step = TREE_INT_CST_LOW (step); ! ! if (dr_step == 0) ! { ! if (vect_print_dump_info (REPORT_ALIGNMENT)) ! fprintf (vect_dump, "zero step in outer loop."); ! if (DR_IS_READ (dr)) ! return true; ! else ! return false; ! } } + + /* For interleaving, STRIDE is STEP counted in elements, i.e., the size of the + interleaving group (including gaps). */ + stride = dr_step / type_size; /* Consecutive? */ if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))) *************** vect_analyze_data_ref_access (struct dat *** 1878,1883 **** --- 1960,1972 ---- return true; } + if (nested_in_vect_loop_p (loop, stmt)) + { + if (vect_print_dump_info (REPORT_ALIGNMENT)) + fprintf (vect_dump, "strided access in outer loop."); + return false; + } + /* Not consecutive access is possible only if it is a part of interleaving. 
*/ if (!DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt))) { *************** vect_analyze_data_refs (loop_vec_info lo *** 2105,2110 **** --- 2194,2201 ---- { tree stmt; stmt_vec_info stmt_info; + basic_block bb; + tree base, offset, init; if (!dr || !DR_REF (dr)) { *************** vect_analyze_data_refs (loop_vec_info lo *** 2112,2137 **** fprintf (vect_dump, "not vectorized: unhandled data-ref "); return false; } ! ! /* Update DR field in stmt_vec_info struct. */ stmt = DR_STMT (dr); stmt_info = vinfo_for_stmt (stmt); - if (STMT_VINFO_DATA_REF (stmt_info)) - { - if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)) - { - fprintf (vect_dump, - "not vectorized: more than one data ref in stmt: "); - print_generic_expr (vect_dump, stmt, TDF_SLIM); - } - return false; - } - STMT_VINFO_DATA_REF (stmt_info) = dr; - /* Check that analysis of the data-ref succeeded. */ if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr) ! || !DR_STEP (dr)) { if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)) { --- 2203,2215 ---- fprintf (vect_dump, "not vectorized: unhandled data-ref "); return false; } ! stmt = DR_STMT (dr); stmt_info = vinfo_for_stmt (stmt); /* Check that analysis of the data-ref succeeded. */ if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr) ! || !DR_STEP (dr)) { if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)) { *************** vect_analyze_data_refs (loop_vec_info lo *** 2158,2164 **** } return false; } ! /* Set vectype for STMT. */ scalar_type = TREE_TYPE (DR_REF (dr)); STMT_VINFO_VECTYPE (stmt_info) = --- 2236,2362 ---- } return false; } ! ! base = unshare_expr (DR_BASE_ADDRESS (dr)); ! offset = unshare_expr (DR_OFFSET (dr)); ! init = unshare_expr (DR_INIT (dr)); ! ! /* Update DR field in stmt_vec_info struct. */ ! bb = bb_for_stmt (stmt); ! ! /* If the dataref is in an inner-loop of the loop that is considered for ! for vectorization, we also want to analyze the access relative to ! 
the outer-loop (DR contains information only relative to the ! inner-most enclosing loop). We do that by building a reference to the ! first location accessed by the inner-loop, and analyze it relative to ! the outer-loop. */ ! if (nested_in_vect_loop_p (loop, stmt)) ! { ! tree outer_step, outer_base, outer_init; ! HOST_WIDE_INT pbitsize, pbitpos; ! tree poffset; ! enum machine_mode pmode; ! int punsignedp, pvolatilep; ! affine_iv base_iv, offset_iv; ! tree dinit; ! ! /* Build a reference to the first location accessed by the ! inner-loop: *(BASE+INNER). (The first location is actually ! BASE+INNER+OFFSET, but we add OFFSET separately later. */ ! tree inner_base = build_fold_indirect_ref ! (fold_build2 (PLUS_EXPR, TREE_TYPE (base), base, init)); ! ! if (vect_print_dump_info (REPORT_DETAILS)) ! { ! fprintf (dump_file, "analyze in outer-loop: "); ! print_generic_expr (dump_file, inner_base, TDF_SLIM); ! } ! ! outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos, ! &poffset, &pmode, &punsignedp, &pvolatilep, false); ! gcc_assert (outer_base != NULL_TREE); ! ! if (pbitpos % BITS_PER_UNIT != 0) ! { ! if (vect_print_dump_info (REPORT_DETAILS)) ! fprintf (dump_file, "failed: bit offset alignment.\n"); ! return false; ! } ! ! outer_base = build_fold_addr_expr (outer_base); ! if (!simple_iv (loop, stmt, outer_base, &base_iv, false)) ! { ! if (vect_print_dump_info (REPORT_DETAILS)) ! fprintf (dump_file, "failed: evolution of base is not affine.\n"); ! return false; ! } ! ! if (offset) ! { ! if (poffset) ! poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, poffset); ! else ! poffset = offset; ! } ! ! if (!poffset) ! { ! offset_iv.base = ssize_int (0); ! offset_iv.step = ssize_int (0); ! } ! else if (!simple_iv (loop, stmt, poffset, &offset_iv, false)) ! { ! if (vect_print_dump_info (REPORT_DETAILS)) ! fprintf (dump_file, "evolution of offset is not affine.\n"); ! return false; ! } ! ! outer_init = ssize_int (pbitpos / BITS_PER_UNIT); ! 
split_constant_offset (base_iv.base, &base_iv.base, &dinit); ! outer_init = size_binop (PLUS_EXPR, outer_init, dinit); ! split_constant_offset (offset_iv.base, &offset_iv.base, &dinit); ! outer_init = size_binop (PLUS_EXPR, outer_init, dinit); ! ! outer_step = size_binop (PLUS_EXPR, ! fold_convert (ssizetype, base_iv.step), ! fold_convert (ssizetype, offset_iv.step)); ! ! STMT_VINFO_DR_STEP (stmt_info) = outer_step; ! /* FIXME: Use canonicalize_base_object_address (base_iv.base); */ ! STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base; ! STMT_VINFO_DR_INIT (stmt_info) = outer_init; ! STMT_VINFO_DR_OFFSET (stmt_info) = ! fold_convert (ssizetype, offset_iv.base); ! STMT_VINFO_DR_ALIGNED_TO (stmt_info) = ! size_int (highest_pow2_factor (offset_iv.base)); ! ! if (dump_file && (dump_flags & TDF_DETAILS)) ! { ! fprintf (dump_file, "\touter base_address: "); ! print_generic_expr (dump_file, STMT_VINFO_DR_BASE_ADDRESS (stmt_info), TDF_SLIM); ! fprintf (dump_file, "\n\touter offset from base address: "); ! print_generic_expr (dump_file, STMT_VINFO_DR_OFFSET (stmt_info), TDF_SLIM); ! fprintf (dump_file, "\n\touter constant offset from base address: "); ! print_generic_expr (dump_file, STMT_VINFO_DR_INIT (stmt_info), TDF_SLIM); ! fprintf (dump_file, "\n\touter step: "); ! print_generic_expr (dump_file, STMT_VINFO_DR_STEP (stmt_info), TDF_SLIM); ! fprintf (dump_file, "\n\touter aligned to: "); ! print_generic_expr (dump_file, STMT_VINFO_DR_ALIGNED_TO (stmt_info), TDF_SLIM); ! } ! } ! ! if (STMT_VINFO_DATA_REF (stmt_info)) ! { ! if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)) ! { ! fprintf (vect_dump, ! "not vectorized: more than one data ref in stmt: "); ! print_generic_expr (vect_dump, stmt, TDF_SLIM); ! } ! return false; ! } ! STMT_VINFO_DATA_REF (stmt_info) = dr; ! /* Set vectype for STMT. 
*/ scalar_type = TREE_TYPE (DR_REF (dr)); STMT_VINFO_VECTYPE (stmt_info) = *************** vect_mark_relevant (VEC(tree,heap) **wor *** 2204,2214 **** /* This is the last stmt in a sequence that was detected as a pattern that can potentially be vectorized. Don't mark the stmt ! as relevant/live because it's not going to vectorized. Instead mark the pattern-stmt that replaces it. */ if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "last stmt in pattern. don't mark relevant/live."); - pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info); stmt_info = vinfo_for_stmt (pattern_stmt); gcc_assert (STMT_VINFO_RELATED_STMT (stmt_info) == stmt); save_relevant = STMT_VINFO_RELEVANT (stmt_info); --- 2402,2414 ---- /* This is the last stmt in a sequence that was detected as a pattern that can potentially be vectorized. Don't mark the stmt ! as relevant/live because it's not going to be vectorized. Instead mark the pattern-stmt that replaces it. */ + + pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info); + if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "last stmt in pattern. don't mark relevant/live."); stmt_info = vinfo_for_stmt (pattern_stmt); gcc_assert (STMT_VINFO_RELATED_STMT (stmt_info) == stmt); save_relevant = STMT_VINFO_RELEVANT (stmt_info); *************** vect_stmt_relevant_p (tree stmt, loop_ve *** 2258,2264 **** *live_p = false; /* cond stmt other than loop exit cond. */ ! if (is_ctrl_stmt (stmt) && (stmt != LOOP_VINFO_EXIT_COND (loop_vinfo))) *relevant = vect_used_in_loop; /* changing memory. */ --- 2458,2465 ---- *live_p = false; /* cond stmt other than loop exit cond. */ ! if (is_ctrl_stmt (stmt) ! && STMT_VINFO_TYPE (vinfo_for_stmt (stmt)) != loop_exit_ctrl_vec_info_type) *relevant = vect_used_in_loop; /* changing memory. */ *************** vect_stmt_relevant_p (tree stmt, loop_ve *** 2315,2320 **** --- 2516,2523 ---- of the respective DEF_STMT is left unchanged. 
- case 2: If STMT is a reduction phi and DEF_STMT is a reduction stmt, we skip DEF_STMT cause it had already been processed. + - case 3: If DEF_STMT and STMT are in different nests, then "relevant" will + be modified accordingly. Return true if everything is as expected. Return false otherwise. */ *************** process_use (tree stmt, tree use, loop_v *** 2325,2331 **** struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); stmt_vec_info dstmt_vinfo; ! basic_block def_bb; tree def, def_stmt; enum vect_def_type dt; --- 2528,2534 ---- struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); stmt_vec_info dstmt_vinfo; ! basic_block bb, def_bb; tree def, def_stmt; enum vect_def_type dt; *************** process_use (tree stmt, tree use, loop_v *** 2346,2362 **** def_bb = bb_for_stmt (def_stmt); if (!flow_bb_inside_loop_p (loop, def_bb)) ! return true; ! /* case 2: A reduction phi defining a reduction stmt (DEF_STMT). DEF_STMT ! must have already been processed, so we just check that everything is as ! expected, and we are done. */ dstmt_vinfo = vinfo_for_stmt (def_stmt); if (TREE_CODE (stmt) == PHI_NODE && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def && TREE_CODE (def_stmt) != PHI_NODE ! && STMT_VINFO_DEF_TYPE (dstmt_vinfo) == vect_reduction_def) { if (STMT_VINFO_IN_PATTERN_P (dstmt_vinfo)) dstmt_vinfo = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (dstmt_vinfo)); gcc_assert (STMT_VINFO_RELEVANT (dstmt_vinfo) < vect_used_by_reduction); --- 2549,2575 ---- def_bb = bb_for_stmt (def_stmt); if (!flow_bb_inside_loop_p (loop, def_bb)) ! { ! if (vect_print_dump_info (REPORT_DETAILS)) ! fprintf (vect_dump, "def_stmt is out of loop."); ! return true; ! } ! /* case 2: A reduction phi (STMT) defined by a reduction stmt (DEF_STMT). ! DEF_STMT must have already been processed, because this should be the ! only way that STMT, which is a reduction-phi, was put in the worklist, ! 
as there should be no other uses for DEF_STMT in the loop. So we just ! check that everything is as expected, and we are done. */ dstmt_vinfo = vinfo_for_stmt (def_stmt); + bb = bb_for_stmt (stmt); if (TREE_CODE (stmt) == PHI_NODE && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def && TREE_CODE (def_stmt) != PHI_NODE ! && STMT_VINFO_DEF_TYPE (dstmt_vinfo) == vect_reduction_def ! && bb->loop_father == def_bb->loop_father) { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "reduc-stmt defining reduc-phi in the same nest."); if (STMT_VINFO_IN_PATTERN_P (dstmt_vinfo)) dstmt_vinfo = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (dstmt_vinfo)); gcc_assert (STMT_VINFO_RELEVANT (dstmt_vinfo) < vect_used_by_reduction); *************** process_use (tree stmt, tree use, loop_v *** 2365,2370 **** --- 2578,2650 ---- return true; } + /* case 3a: outer-loop stmt defining an inner-loop stmt: + outer-loop-header-bb: + d = def_stmt + inner-loop: + stmt # use (d) + outer-loop-tail-bb: + ... */ + if (flow_loop_nested_p (def_bb->loop_father, bb->loop_father)) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "outer-loop def-stmt defining inner-loop stmt."); + switch (relevant) + { + case vect_unused_in_loop: + relevant = (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def) ? + vect_used_by_reduction : vect_unused_in_loop; + break; + case vect_used_in_outer_by_reduction: + relevant = vect_used_by_reduction; + break; + case vect_used_in_outer: + relevant = vect_used_in_loop; + break; + case vect_used_by_reduction: + case vect_used_in_loop: + break; + + default: + gcc_unreachable (); + } + } + + /* case 3b: inner-loop stmt defining an outer-loop stmt: + outer-loop-header-bb: + ... 
+ inner-loop: + d = def_stmt + outer-loop-tail-bb: + stmt # use (d) */ + else if (flow_loop_nested_p (bb->loop_father, def_bb->loop_father)) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "inner-loop def-stmt defining outer-loop stmt."); + switch (relevant) + { + case vect_unused_in_loop: + relevant = (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def) ? + vect_used_in_outer_by_reduction : vect_unused_in_loop; + break; + + case vect_used_in_outer_by_reduction: + case vect_used_in_outer: + break; + + case vect_used_by_reduction: + relevant = vect_used_in_outer_by_reduction; + break; + + case vect_used_in_loop: + relevant = vect_used_in_outer; + break; + + default: + gcc_unreachable (); + } + } + vect_mark_relevant (worklist, def_stmt, relevant, live_p); return true; } *************** vect_mark_stmts_to_be_vectorized (loop_v *** 2473,2497 **** identify stmts that are used solely by a reduction, and therefore the order of the results that they produce does not have to be kept. ! Reduction phis are expected to be used by a reduction stmt; Other ! reduction stmts are expected to be unused in the loop. These are the ! expected values of "relevant" for reduction phis/stmts in the loop: relevance: phi stmt vect_unused_in_loop ok vect_used_by_reduction ok vect_used_in_loop */ if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def) { ! switch (relevant) { case vect_unused_in_loop: gcc_assert (TREE_CODE (stmt) != PHI_NODE); break; case vect_used_by_reduction: if (TREE_CODE (stmt) == PHI_NODE) break; case vect_used_in_loop: default: if (vect_print_dump_info (REPORT_DETAILS)) --- 2753,2790 ---- identify stmts that are used solely by a reduction, and therefore the order of the results that they produce does not have to be kept. ! Reduction phis are expected to be used by a reduction stmt, or by ! in an outer loop; Other reduction stmts are expected to be ! in the loop, and possibly used by a stmt in an outer loop. ! 
Here are the expected values of "relevant" for reduction phis/stmts: relevance: phi stmt vect_unused_in_loop ok + vect_used_in_outer_by_reduction ok ok + vect_used_in_outer ok ok vect_used_by_reduction ok vect_used_in_loop */ if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def) { ! enum vect_relevant tmp_relevant = relevant; ! switch (tmp_relevant) { case vect_unused_in_loop: gcc_assert (TREE_CODE (stmt) != PHI_NODE); + relevant = vect_used_by_reduction; + break; + + case vect_used_in_outer_by_reduction: + case vect_used_in_outer: + gcc_assert (TREE_CODE (stmt) != WIDEN_SUM_EXPR + && TREE_CODE (stmt) != DOT_PROD_EXPR); break; + case vect_used_by_reduction: if (TREE_CODE (stmt) == PHI_NODE) break; + /* fall through */ case vect_used_in_loop: default: if (vect_print_dump_info (REPORT_DETAILS)) *************** vect_mark_stmts_to_be_vectorized (loop_v *** 2499,2505 **** VEC_free (tree, heap, worklist); return false; } - relevant = vect_used_by_reduction; live_p = false; } --- 2792,2797 ---- *************** vect_get_loop_niters (struct loop *loop, *** 2641,2651 **** } /* Function vect_analyze_loop_form. ! Verify the following restrictions (some may be relaxed in the future): ! - it's an inner-most loop ! - number of BBs = 2 (which are the loop header and the latch) - the loop has a pre-header - the loop has a single entry and exit - the loop exit condition is simple enough, and the number of iterations --- 2933,2971 ---- } + /* Function vect_analyze_loop_1. + + Apply a set of analyses on LOOP, and create a loop_vec_info struct + for it. The different analyses will record information in the + loop_vec_info struct. This is a subset of the analyses applied in + vect_analyze_loop, to be applied on an inner-loop nested in the loop + that is now considered for (outer-loop) vectorization. 
*/ + + static loop_vec_info + vect_analyze_loop_1 (struct loop *loop) + { + loop_vec_info loop_vinfo; + + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "===== analyze_loop_nest_1 ====="); + + /* Check the CFG characteristics of the loop (nesting, entry/exit, etc. */ + + loop_vinfo = vect_analyze_loop_form (loop); + if (!loop_vinfo) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "bad inner-loop form."); + return NULL; + } + + return loop_vinfo; + } + + /* Function vect_analyze_loop_form. ! Verify that certain CFG restrictions hold, including: - the loop has a pre-header - the loop has a single entry and exit - the loop exit condition is simple enough, and the number of iterations *************** vect_analyze_loop_form (struct loop *loo *** 2657,2687 **** loop_vec_info loop_vinfo; tree loop_cond; tree number_of_iterations = NULL; if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "=== vect_analyze_loop_form ==="); ! if (loop->inner) { ! if (vect_print_dump_info (REPORT_OUTER_LOOPS)) ! fprintf (vect_dump, "not vectorized: nested loop."); return NULL; } if (!single_exit (loop) - || loop->num_nodes != 2 || EDGE_COUNT (loop->header->preds) != 2) { if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS)) { if (!single_exit (loop)) fprintf (vect_dump, "not vectorized: multiple exits."); - else if (loop->num_nodes != 2) - fprintf (vect_dump, "not vectorized: too many BBs in loop."); else if (EDGE_COUNT (loop->header->preds) != 2) fprintf (vect_dump, "not vectorized: too many incoming edges."); } ! return NULL; } --- 2977,3100 ---- loop_vec_info loop_vinfo; tree loop_cond; tree number_of_iterations = NULL; + loop_vec_info inner_loop_vinfo = NULL; if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "=== vect_analyze_loop_form ==="); ! /* Different restrictions apply when we are considering an inner-most loop, ! vs. an outer (nested) loop. ! (FORNOW. May want to relax some of these restrictions in the future). */ ! ! 
if (!loop->inner) ! { ! /* Inner-most loop. We currently require that the number of BBs is ! exactly 2 (the header and latch). Vectorizable inner-most loops ! look like this: ! ! (pre-header) ! | ! header <--------+ ! | | | ! | +--> latch --+ ! | ! (exit-bb) */ ! ! if (loop->num_nodes != 2) ! { ! if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS)) ! fprintf (vect_dump, "not vectorized: too many BBs in loop."); ! return NULL; ! } ! ! if (empty_block_p (loop->header)) { ! if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS)) ! fprintf (vect_dump, "not vectorized: empty loop."); return NULL; } + } + else + { + struct loop *innerloop = loop->inner; + edge backedge, entryedge; + + /* Nested loop. We currently require that the loop is doubly-nested, + contains a single inner loop, and the number of BBs is exactly 5. + Vectorizable outer-loops look like this: + + (pre-header) + | + header <---+ + | | + inner-loop | + | | + tail ------+ + | + (exit-bb) + + The inner-loop has the properties expected of inner-most loops + as described above. */ + + if ((loop->inner)->inner || (loop->inner)->next) + { + if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS)) + fprintf (vect_dump, "not vectorized: multiple nested loops."); + return NULL; + } + + /* Analyze the inner-loop. 
*/ + inner_loop_vinfo = vect_analyze_loop_1 (loop->inner); + if (!inner_loop_vinfo) + { + if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS)) + fprintf (vect_dump, "not vectorized: Bad inner loop."); + return NULL; + } + + if (loop->num_nodes != 5) + { + if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS)) + fprintf (vect_dump, "not vectorized: too many BBs in loop."); + destroy_loop_vec_info (inner_loop_vinfo, true); + return NULL; + } + + gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2); + backedge = EDGE_PRED (innerloop->header, 1); + entryedge = EDGE_PRED (innerloop->header, 0); + if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch) + { + backedge = EDGE_PRED (innerloop->header, 0); + entryedge = EDGE_PRED (innerloop->header, 1); + } + + if (entryedge->src != loop->header + || !single_exit (innerloop) + || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src) + { + if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS)) + fprintf (vect_dump, "not vectorized: unsupported outerloop form."); + destroy_loop_vec_info (inner_loop_vinfo, true); + return NULL; + } + + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "Considering outer-loop vectorization."); + } if (!single_exit (loop) || EDGE_COUNT (loop->header->preds) != 2) { if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS)) { if (!single_exit (loop)) fprintf (vect_dump, "not vectorized: multiple exits."); else if (EDGE_COUNT (loop->header->preds) != 2) fprintf (vect_dump, "not vectorized: too many incoming edges."); } ! if (inner_loop_vinfo) ! 
destroy_loop_vec_info (inner_loop_vinfo, true); return NULL; } *************** vect_analyze_loop_form (struct loop *loo *** 2694,2699 **** --- 3107,3114 ---- { if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS)) fprintf (vect_dump, "not vectorized: unexpected loop form."); + if (inner_loop_vinfo) + destroy_loop_vec_info (inner_loop_vinfo, true); return NULL; } *************** vect_analyze_loop_form (struct loop *loo *** 2711,2732 **** { if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS)) fprintf (vect_dump, "not vectorized: abnormal loop exit edge."); return NULL; } } - if (empty_block_p (loop->header)) - { - if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS)) - fprintf (vect_dump, "not vectorized: empty loop."); - return NULL; - } - loop_cond = vect_get_loop_niters (loop, &number_of_iterations); if (!loop_cond) { if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS)) fprintf (vect_dump, "not vectorized: complicated exit condition."); return NULL; } --- 3126,3144 ---- { if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS)) fprintf (vect_dump, "not vectorized: abnormal loop exit edge."); + if (inner_loop_vinfo) + destroy_loop_vec_info (inner_loop_vinfo, true); return NULL; } } loop_cond = vect_get_loop_niters (loop, &number_of_iterations); if (!loop_cond) { if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS)) fprintf (vect_dump, "not vectorized: complicated exit condition."); + if (inner_loop_vinfo) + destroy_loop_vec_info (inner_loop_vinfo, true); return NULL; } *************** vect_analyze_loop_form (struct loop *loo *** 2735,2740 **** --- 3147,3154 ---- if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS)) fprintf (vect_dump, "not vectorized: number of iterations cannot be computed."); + if (inner_loop_vinfo) + destroy_loop_vec_info (inner_loop_vinfo, true); return NULL; } *************** vect_analyze_loop_form (struct loop *loo *** 2742,2748 **** { if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS)) fprintf (vect_dump, "Infinite number of iterations."); ! 
return false; } if (!NITERS_KNOWN_P (number_of_iterations)) --- 3156,3164 ---- { if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS)) fprintf (vect_dump, "Infinite number of iterations."); ! if (inner_loop_vinfo) ! destroy_loop_vec_info (inner_loop_vinfo, true); ! return NULL; } if (!NITERS_KNOWN_P (number_of_iterations)) *************** vect_analyze_loop_form (struct loop *loo *** 2757,2768 **** { if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)) fprintf (vect_dump, "not vectorized: number of iterations = 0."); return NULL; } loop_vinfo = new_loop_vec_info (loop); LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations; ! LOOP_VINFO_EXIT_COND (loop_vinfo) = loop_cond; gcc_assert (!loop->aux); loop->aux = loop_vinfo; --- 3173,3191 ---- { if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)) fprintf (vect_dump, "not vectorized: number of iterations = 0."); + if (inner_loop_vinfo) + destroy_loop_vec_info (inner_loop_vinfo, false); return NULL; } loop_vinfo = new_loop_vec_info (loop); LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations; ! ! STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type; ! ! /* CHECKME: May want to keep it around it in the future. */ ! if (inner_loop_vinfo) ! destroy_loop_vec_info (inner_loop_vinfo, false); gcc_assert (!loop->aux); loop->aux = loop_vinfo; *************** vect_analyze_loop (struct loop *loop) *** 2784,2789 **** --- 3207,3221 ---- if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "===== analyze_loop_nest ====="); + if (loop_outer (loop) + && loop_vec_info_for_loop (loop_outer (loop)) + && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop)))) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "outer-loop already vectorized."); + return NULL; + } + /* Check the CFG characteristics of the loop (nesting, entry/exit, etc. 
*/ loop_vinfo = vect_analyze_loop_form (loop); *************** vect_analyze_loop (struct loop *loop) *** 2805,2811 **** { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "bad data references."); ! destroy_loop_vec_info (loop_vinfo); return NULL; } --- 3237,3243 ---- { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "bad data references."); ! destroy_loop_vec_info (loop_vinfo, true); return NULL; } *************** vect_analyze_loop (struct loop *loop) *** 2823,2829 **** { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "unexpected pattern."); ! destroy_loop_vec_info (loop_vinfo); return NULL; } --- 3255,3261 ---- { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "unexpected pattern."); ! destroy_loop_vec_info (loop_vinfo, true); return NULL; } *************** vect_analyze_loop (struct loop *loop) *** 2835,2841 **** { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "bad data alignment."); ! destroy_loop_vec_info (loop_vinfo); return NULL; } --- 3267,3273 ---- { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "bad data alignment."); ! destroy_loop_vec_info (loop_vinfo, true); return NULL; } *************** vect_analyze_loop (struct loop *loop) *** 2844,2850 **** { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "can't determine vectorization factor."); ! destroy_loop_vec_info (loop_vinfo); return NULL; } --- 3276,3282 ---- { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "can't determine vectorization factor."); ! destroy_loop_vec_info (loop_vinfo, true); return NULL; } *************** vect_analyze_loop (struct loop *loop) *** 2856,2862 **** { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "bad data dependence."); ! destroy_loop_vec_info (loop_vinfo); return NULL; } --- 3288,3294 ---- { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "bad data dependence."); ! 
destroy_loop_vec_info (loop_vinfo, true); return NULL; } *************** vect_analyze_loop (struct loop *loop) *** 2868,2874 **** { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "bad data access."); ! destroy_loop_vec_info (loop_vinfo); return NULL; } --- 3300,3306 ---- { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "bad data access."); ! destroy_loop_vec_info (loop_vinfo, true); return NULL; } *************** vect_analyze_loop (struct loop *loop) *** 2880,2886 **** { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "bad data alignment."); ! destroy_loop_vec_info (loop_vinfo); return NULL; } --- 3312,3318 ---- { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "bad data alignment."); ! destroy_loop_vec_info (loop_vinfo, true); return NULL; } *************** vect_analyze_loop (struct loop *loop) *** 2892,2898 **** { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "bad operation or unsupported loop bound."); ! destroy_loop_vec_info (loop_vinfo); return NULL; } --- 3324,3330 ---- { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "bad operation or unsupported loop bound."); ! destroy_loop_vec_info (loop_vinfo, true); return NULL; } Index: tree-vect-patterns.c =================================================================== *** tree-vect-patterns.c (revision 127202) --- tree-vect-patterns.c (working copy) *************** widened_name_p (tree name, tree use_stmt *** 148,154 **** * Return value: A new stmt that will be used to replace the sequence of stmts that constitute the pattern. In this case it will be: WIDEN_DOT_PRODUCT ! */ static tree vect_recog_dot_prod_pattern (tree last_stmt, tree *type_in, tree *type_out) --- 148,161 ---- * Return value: A new stmt that will be used to replace the sequence of stmts that constitute the pattern. In this case it will be: WIDEN_DOT_PRODUCT ! ! Note: The dot-prod idiom is a widening reduction pattern that is ! 
vectorized without preserving all the intermediate results. It ! produces only N/2 (widened) results (by summing up pairs of ! intermediate results) rather than all N results. Therefore, we ! cannot allow this pattern when we want to get all the results and in ! the correct order (as is the case when this computation is in an ! inner-loop nested in an outer-loop that is being vectorized). */ static tree vect_recog_dot_prod_pattern (tree last_stmt, tree *type_in, tree *type_out) *************** vect_recog_dot_prod_pattern (tree last_s *** 160,165 **** --- 167,174 ---- tree type, half_type; tree pattern_expr; tree prod_type; + loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_vinfo); + struct loop *loop = LOOP_VINFO_LOOP (loop_info); if (TREE_CODE (last_stmt) != GIMPLE_MODIFY_STMT) return NULL; *************** vect_recog_dot_prod_pattern (tree last_s *** 242,247 **** --- 251,260 ---- gcc_assert (stmt_vinfo); if (STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_loop_def) return NULL; + /* FORNOW. Can continue analyzing the def-use chain when this stmt is a phi + inside the loop (in case we are analyzing an outer-loop). */ + if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT) + return NULL; expr = GIMPLE_STMT_OPERAND (stmt, 1); if (TREE_CODE (expr) != MULT_EXPR) return NULL; *************** vect_recog_dot_prod_pattern (tree last_s *** 295,300 **** --- 308,323 ---- fprintf (vect_dump, "vect_recog_dot_prod_pattern: detected: "); print_generic_expr (vect_dump, pattern_expr, TDF_SLIM); } + + /* We don't allow changing the order of the computation in the inner-loop + when doing outer-loop vectorization. 
*/ + if (nested_in_vect_loop_p (loop, last_stmt)) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "vect_recog_dot_prod_pattern: not allowed."); + return NULL; + } + return pattern_expr; } *************** vect_recog_pow_pattern (tree last_stmt, *** 521,527 **** * Return value: A new stmt that will be used to replace the sequence of stmts that constitute the pattern. In this case it will be: WIDEN_SUM ! */ static tree vect_recog_widen_sum_pattern (tree last_stmt, tree *type_in, tree *type_out) --- 544,557 ---- * Return value: A new stmt that will be used to replace the sequence of stmts that constitute the pattern. In this case it will be: WIDEN_SUM ! ! Note: The widening-sum idiom is a widening reduction pattern that is ! vectorized without preserving all the intermediate results. It ! produces only N/2 (widened) results (by summing up pairs of ! intermediate results) rather than all N results. Therefore, we ! cannot allow this pattern when we want to get all the results and in ! the correct order (as is the case when this computation is in an ! inner-loop nested in an outer-loop that is being vectorized). */ static tree vect_recog_widen_sum_pattern (tree last_stmt, tree *type_in, tree *type_out) *************** vect_recog_widen_sum_pattern (tree last_ *** 531,536 **** --- 561,568 ---- stmt_vec_info stmt_vinfo = vinfo_for_stmt (last_stmt); tree type, half_type; tree pattern_expr; + loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_vinfo); + struct loop *loop = LOOP_VINFO_LOOP (loop_info); if (TREE_CODE (last_stmt) != GIMPLE_MODIFY_STMT) return NULL; *************** vect_recog_widen_sum_pattern (tree last_ *** 580,585 **** --- 612,627 ---- fprintf (vect_dump, "vect_recog_widen_sum_pattern: detected: "); print_generic_expr (vect_dump, pattern_expr, TDF_SLIM); } + + /* We don't allow changing the order of the computation in the inner-loop + when doing outer-loop vectorization. 
*/ + if (nested_in_vect_loop_p (loop, last_stmt)) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "vect_recog_widen_sum_pattern: not allowed."); + return NULL; + } + return pattern_expr; } Index: tree-vect-transform.c =================================================================== *** tree-vect-transform.c (revision 127202) --- tree-vect-transform.c (working copy) *************** along with GCC; see the file COPYING3. *** 49,62 **** static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *); static tree vect_create_destination_var (tree, tree); static tree vect_create_data_ref_ptr ! (tree, block_stmt_iterator *, tree, tree *, tree *, bool, tree); ! static tree vect_create_addr_base_for_vector_ref (tree, tree *, tree); ! static tree vect_setup_realignment (tree, block_stmt_iterator *, tree *); static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *); static tree vect_get_vec_def_for_operand (tree, tree, tree *); ! static tree vect_init_vector (tree, tree, tree); static void vect_finish_stmt_generation ! (tree stmt, tree vec_stmt, block_stmt_iterator *bsi); static bool vect_is_simple_cond (tree, loop_vec_info); static void update_vuses_to_preheader (tree, struct loop*); static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree); --- 49,62 ---- static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *); static tree vect_create_destination_var (tree, tree); static tree vect_create_data_ref_ptr ! (tree, struct loop*, tree, tree *, tree *, bool, tree, bool *); ! static tree vect_create_addr_base_for_vector_ref ! (tree, tree *, tree, struct loop *); static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *); static tree vect_get_vec_def_for_operand (tree, tree, tree *); ! static tree vect_init_vector (tree, tree, tree, block_stmt_iterator *); static void vect_finish_stmt_generation ! 
(tree stmt, tree vec_stmt, block_stmt_iterator *); static bool vect_is_simple_cond (tree, loop_vec_info); static void update_vuses_to_preheader (tree, struct loop*); static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree); *************** vect_estimate_min_profitable_iters (loop *** 125,130 **** --- 125,131 ---- basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); int nbbs = loop->num_nodes; int byte_misalign; + int innerloop_iters, factor; /* Cost model disabled. */ if (!flag_vect_cost_model) *************** vect_estimate_min_profitable_iters (loop *** 153,163 **** --- 154,173 ---- TODO: Consider assigning different costs to different scalar statements. */ + /* FORNOW. */ + if (loop->inner) + innerloop_iters = 50; /* FIXME */ + for (i = 0; i < nbbs; i++) { block_stmt_iterator si; basic_block bb = bbs[i]; + if (bb->loop_father == loop->inner) + factor = innerloop_iters; + else + factor = 1; + for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si)) { tree stmt = bsi_stmt (si); *************** vect_estimate_min_profitable_iters (loop *** 165,172 **** if (!STMT_VINFO_RELEVANT_P (stmt_info) && !STMT_VINFO_LIVE_P (stmt_info)) continue; ! scalar_single_iter_cost += cost_for_stmt (stmt); ! vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info); vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info); } } --- 175,184 ---- if (!STMT_VINFO_RELEVANT_P (stmt_info) && !STMT_VINFO_LIVE_P (stmt_info)) continue; ! scalar_single_iter_cost += cost_for_stmt (stmt) * factor; ! vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor; ! /* FIXME: for stmts in the inner-loop in outer-loop vectorization, ! some of the "outside" costs are generated inside the outer-loop. */ vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info); } } *************** vect_model_load_cost (stmt_vec_info stmt *** 598,604 **** break; } ! case dr_unaligned_software_pipeline: { int outer_cost = 0; --- 610,628 ---- break; } ! case dr_explicit_realign: ! 
{ ! inner_cost += ncopies * (2*TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST); ! ! /* FIXME: If the misalignment remains fixed across the iterations of ! the containing loop, the following cost should be added to the ! outside costs. */ ! if (targetm.vectorize.builtin_mask_for_load) ! inner_cost += TARG_VEC_STMT_COST; ! ! break; ! } ! case dr_explicit_realign_optimized: { int outer_cost = 0; *************** vect_get_new_vect_var (tree type, enum v *** 695,700 **** --- 719,737 ---- STMT: The statement containing the data reference. NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list. OFFSET: Optional. If supplied, it is be added to the initial address. + LOOP: Specify relative to which loop-nest should the address be computed. + For example, when the dataref is in an inner-loop nested in an + outer-loop that is now being vectorized, LOOP can be either the + outer-loop, or the inner-loop. The first memory location accessed + by the following dataref ('in' points to short): + + for (i=0; iloop_father; ! tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr)); ! tree base_name; tree data_ref_base_var; tree new_base_stmt; tree vec_stmt; tree addr_base, addr_expr; *************** vect_create_addr_base_for_vector_ref (tr *** 722,733 **** tree base_offset = unshare_expr (DR_OFFSET (dr)); tree init = unshare_expr (DR_INIT (dr)); tree vect_ptr_type, addr_expr2; ! ! ! /* Create data_ref_base */ ! data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base_expr), "batmp"); add_referenced_var (data_ref_base_var); ! data_ref_base = force_gimple_operand (data_ref_base_expr, &new_base_stmt, true, data_ref_base_var); append_to_statement_list_force(new_base_stmt, new_stmt_list); --- 760,785 ---- tree base_offset = unshare_expr (DR_OFFSET (dr)); tree init = unshare_expr (DR_INIT (dr)); tree vect_ptr_type, addr_expr2; ! tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))); ! ! gcc_assert (loop); ! if (loop != containing_loop) ! { ! 
loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); ! struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); ! ! gcc_assert (nested_in_vect_loop_p (loop, stmt)); ! ! data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info)); ! base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info)); ! init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info)); ! } ! ! /* Create base_offset */ ! base_name = build_fold_indirect_ref (data_ref_base); ! data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp"); add_referenced_var (data_ref_base_var); ! data_ref_base = force_gimple_operand (data_ref_base, &new_base_stmt, true, data_ref_base_var); append_to_statement_list_force(new_base_stmt, new_stmt_list); *************** vect_create_addr_base_for_vector_ref (tr *** 742,757 **** if (offset) { tree tmp = create_tmp_var (sizetype, "offset"); - tree step; - - /* For interleaved access step we divide STEP by the size of the - interleaving group. */ - if (DR_GROUP_SIZE (stmt_info)) - step = fold_build2 (TRUNC_DIV_EXPR, TREE_TYPE (offset), DR_STEP (dr), - build_int_cst (TREE_TYPE (offset), - DR_GROUP_SIZE (stmt_info))); - else - step = DR_STEP (dr); add_referenced_var (tmp); offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step); --- 794,799 ---- *************** vect_create_addr_base_for_vector_ref (tr *** 800,806 **** 1. STMT: a stmt that references memory. Expected to be of the form GIMPLE_MODIFY_STMT or GIMPLE_MODIFY_STMT . ! 2. BSI: block_stmt_iterator where new stmts can be added. 3. OFFSET (optional): an offset to be added to the initial address accessed by the data-ref in STMT. 4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain --- 842,848 ---- 1. STMT: a stmt that references memory. Expected to be of the form GIMPLE_MODIFY_STMT or GIMPLE_MODIFY_STMT . ! 2. AT_LOOP: the loop where the vector memref is to be created. 3. OFFSET (optional): an offset to be added to the initial address accessed by the data-ref in STMT. 4. 
ONLY_INIT: indicate if vp is to be updated in the loop, or remain *************** vect_create_addr_base_for_vector_ref (tr *** 827,844 **** Return the increment stmt that updates the pointer in PTR_INCR. ! 3. Return the pointer. */ static tree ! vect_create_data_ref_ptr (tree stmt, ! block_stmt_iterator *bsi ATTRIBUTE_UNUSED, tree offset, tree *initial_address, tree *ptr_incr, ! bool only_init, tree type) { tree base_name; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); tree vectype = STMT_VINFO_VECTYPE (stmt_info); tree vect_ptr_type; tree vect_ptr; --- 869,890 ---- Return the increment stmt that updates the pointer in PTR_INCR. ! 3. Set INV_P to true if the access pattern of the data reference in the ! vectorized loop is invariant. Set it to false otherwise. ! ! 4. Return the pointer. */ static tree ! vect_create_data_ref_ptr (tree stmt, struct loop *at_loop, tree offset, tree *initial_address, tree *ptr_incr, ! bool only_init, tree type, bool *inv_p) { tree base_name; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); + struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father; tree vectype = STMT_VINFO_VECTYPE (stmt_info); tree vect_ptr_type; tree vect_ptr; *************** vect_create_data_ref_ptr (tree stmt, *** 846,856 **** tree new_temp; tree vec_stmt; tree new_stmt_list = NULL_TREE; ! edge pe = loop_preheader_edge (loop); basic_block new_bb; tree vect_ptr_init; struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr))); if (vect_print_dump_info (REPORT_DETAILS)) --- 892,922 ---- tree new_temp; tree vec_stmt; tree new_stmt_list = NULL_TREE; ! 
edge pe; basic_block new_bb; tree vect_ptr_init; struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); + tree vptr; + block_stmt_iterator incr_bsi; + bool insert_after; + tree indx_before_incr, indx_after_incr; + tree incr; + tree step; + + /* Check the step (evolution) of the load in LOOP, and record + whether it's invariant. */ + if (nested_in_vect_loop) + step = STMT_VINFO_DR_STEP (stmt_info); + else + step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info)); + + if (tree_int_cst_compare (step, size_zero_node) == 0) + *inv_p = true; + else + *inv_p = false; + /* Create an expression for the first address accessed by this load + in LOOP. */ base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr))); if (vect_print_dump_info (REPORT_DETAILS)) *************** vect_create_data_ref_ptr (tree stmt, *** 893,904 **** var_ann (vect_ptr)->subvars = DR_SUBVARS (dr); /** (3) Calculate the initial address the vector-pointer, and set the vector-pointer to point to it before the loop: **/ /* Create: (&(base[init_val+offset]) in the loop preheader. */ new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list, ! offset); pe = loop_preheader_edge (loop); new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list); gcc_assert (!new_bb); --- 959,1002 ---- var_ann (vect_ptr)->subvars = DR_SUBVARS (dr); + /** Note: If the dataref is in an inner-loop nested in LOOP, and we are + vectorizing LOOP (i.e. outer-loop vectorization), we need to create two + def-use update cycles for the pointer: One relative to the outer-loop + (LOOP), which is what steps (3) and (4) below do. The other is relative + to the inner-loop (which is the inner-most loop containing the dataref), + and this is done by step (5) below. + + When vectorizing inner-most loops, the vectorized loop (LOOP) is also the + inner-most loop, and so steps (3),(4) work the same, and step (5) is + redundant. Steps (3),(4) create the following: + + vp0 = &base_addr; + LOOP: vp1 = phi(vp0,vp2) + ... + ... 
+ vp2 = vp1 + step + goto LOOP + + If there is an inner-loop nested in loop, then step (5) will also be + applied, and an additional update in the inner-loop will be created: + + vp0 = &base_addr; + LOOP: vp1 = phi(vp0,vp2) + ... + inner: vp3 = phi(vp1,vp4) + vp4 = vp3 + inner_step + if () goto inner + ... + vp2 = vp1 + step + if () goto LOOP */ + /** (3) Calculate the initial address the vector-pointer, and set the vector-pointer to point to it before the loop: **/ /* Create: (&(base[init_val+offset]) in the loop preheader. */ + new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list, ! offset, loop); pe = loop_preheader_edge (loop); new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list); gcc_assert (!new_bb); *************** vect_create_data_ref_ptr (tree stmt, *** 913,937 **** gcc_assert (!new_bb); ! /** (4) Handle the updating of the vector-pointer inside the loop: **/ ! if (only_init) /* No update in loop is required. */ { /* Copy the points-to information if it exists. */ if (DR_PTR_INFO (dr)) duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr)); ! return vect_ptr_init; } else { ! block_stmt_iterator incr_bsi; ! bool insert_after; ! tree indx_before_incr, indx_after_incr; ! tree incr; standard_iv_increment_position (loop, &incr_bsi, &insert_after); create_iv (vect_ptr_init, ! fold_convert (vect_ptr_type, TYPE_SIZE_UNIT (vectype)), NULL_TREE, loop, &incr_bsi, insert_after, &indx_before_incr, &indx_after_incr); incr = bsi_stmt (incr_bsi); --- 1011,1041 ---- gcc_assert (!new_bb); ! /** (4) Handle the updating of the vector-pointer inside the loop. ! This is needed when ONLY_INIT is false, and also when AT_LOOP ! is the inner-loop nested in LOOP (during outer-loop vectorization). ! **/ ! if (only_init && at_loop == loop) /* No update in loop is required. */ { /* Copy the points-to information if it exists. */ if (DR_PTR_INFO (dr)) duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr)); ! vptr = vect_ptr_init; } else { ! 
/* The step of the vector pointer is the Vector Size. */ ! tree step = TYPE_SIZE_UNIT (vectype); ! /* One exception to the above is when the scalar step of the load in ! LOOP is zero. In this case the step here is also zero. */ ! if (*inv_p) ! step = size_zero_node; standard_iv_increment_position (loop, &incr_bsi, &insert_after); + create_iv (vect_ptr_init, ! fold_convert (vect_ptr_type, step), NULL_TREE, loop, &incr_bsi, insert_after, &indx_before_incr, &indx_after_incr); incr = bsi_stmt (incr_bsi); *************** vect_create_data_ref_ptr (tree stmt, *** 949,963 **** if (ptr_incr) *ptr_incr = incr; ! return indx_before_incr; } } /* Function bump_vector_ptr ! Increment a pointer (to a vector type) by vector-size. Connect the new ! increment stmt to the existing def-use update-chain of the pointer. The pointer def-use update-chain before this function: DATAREF_PTR = phi (p_0, p_2) --- 1053,1103 ---- if (ptr_incr) *ptr_incr = incr; ! vptr = indx_before_incr; } + + if (!nested_in_vect_loop || only_init) + return vptr; + + + /** (5) Handle the updating of the vector-pointer inside the inner-loop + nested in LOOP, if exists: **/ + + gcc_assert (nested_in_vect_loop); + if (!only_init) + { + standard_iv_increment_position (containing_loop, &incr_bsi, + &insert_after); + create_iv (vptr, fold_convert (vect_ptr_type, DR_STEP (dr)), NULL_TREE, + containing_loop, &incr_bsi, insert_after, &indx_before_incr, + &indx_after_incr); + incr = bsi_stmt (incr_bsi); + set_stmt_info (stmt_ann (incr), new_stmt_vec_info (incr, loop_vinfo)); + + /* Copy the points-to information if it exists. 
*/ + if (DR_PTR_INFO (dr)) + { + duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr)); + duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr)); + } + merge_alias_info (vect_ptr_init, indx_before_incr); + merge_alias_info (vect_ptr_init, indx_after_incr); + if (ptr_incr) + *ptr_incr = incr; + + return indx_before_incr; + } + else + gcc_unreachable (); } /* Function bump_vector_ptr ! Increment a pointer (to a vector type) by vector-size. If requested, ! i.e. if PTR-INCR is given, then also connect the new increment stmt ! to the existing def-use update-chain of the pointer, by modifying ! the PTR_INCR as illustrated below: The pointer def-use update-chain before this function: DATAREF_PTR = phi (p_0, p_2) *************** vect_create_data_ref_ptr (tree stmt, *** 967,984 **** The pointer def-use update-chain after this function: DATAREF_PTR = phi (p_0, p_2) .... ! NEW_DATAREF_PTR = DATAREF_PTR + vector_size .... PTR_INCR: p_2 = NEW_DATAREF_PTR + step Input: DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated in the loop. ! PTR_INCR - the stmt that updates the pointer in each iteration of the loop. ! The increment amount across iterations is also expected to be ! vector_size. BSI - location where the new update stmt is to be placed. STMT - the original scalar memory-access stmt that is being vectorized. Output: Return NEW_DATAREF_PTR as illustrated above. --- 1107,1126 ---- The pointer def-use update-chain after this function: DATAREF_PTR = phi (p_0, p_2) .... ! NEW_DATAREF_PTR = DATAREF_PTR + BUMP .... PTR_INCR: p_2 = NEW_DATAREF_PTR + step Input: DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated in the loop. ! PTR_INCR - optional. The stmt that updates the pointer in each iteration of ! the loop. The increment amount across iterations is expected ! to be vector_size. BSI - location where the new update stmt is to be placed. STMT - the original scalar memory-access stmt that is being vectorized. 
+ BUMP - optional. The offset by which to bump the pointer. If not given, + the offset is assumed to be vector_size. Output: Return NEW_DATAREF_PTR as illustrated above. *************** vect_create_data_ref_ptr (tree stmt, *** 986,992 **** static tree bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi, ! tree stmt) { stmt_vec_info stmt_info = vinfo_for_stmt (stmt); struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); --- 1128,1134 ---- static tree bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi, ! tree stmt, tree bump) { stmt_vec_info stmt_info = vinfo_for_stmt (stmt); struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); *************** bump_vector_ptr (tree dataref_ptr, tree *** 999,1004 **** --- 1141,1149 ---- use_operand_p use_p; tree new_dataref_ptr; + if (bump) + update = bump; + incr_stmt = build_gimple_modify_stmt (ptr_var, build2 (POINTER_PLUS_EXPR, vptr_type, dataref_ptr, update)); *************** bump_vector_ptr (tree dataref_ptr, tree *** 1006,1011 **** --- 1151,1164 ---- GIMPLE_STMT_OPERAND (incr_stmt, 0) = new_dataref_ptr; vect_finish_stmt_generation (stmt, incr_stmt, bsi); + /* Copy the points-to information if it exists. */ + if (DR_PTR_INFO (dr)) + duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr)); + merge_alias_info (new_dataref_ptr, dataref_ptr); + + if (!ptr_incr) + return new_dataref_ptr; + /* Update the vector-pointer's cross-iteration increment. */ FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE) { *************** bump_vector_ptr (tree dataref_ptr, tree *** 1017,1027 **** gcc_assert (tree_int_cst_compare (use, update) == 0); } - /* Copy the points-to information if it exists. 
*/ - if (DR_PTR_INFO (dr)) - duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr)); - merge_alias_info (new_dataref_ptr, dataref_ptr); - return new_dataref_ptr; } --- 1170,1175 ---- *************** vect_create_destination_var (tree scalar *** 1056,1070 **** /* Function vect_init_vector. Insert a new stmt (INIT_STMT) that initializes a new vector variable with ! the vector elements of VECTOR_VAR. Return the DEF of INIT_STMT. It will be ! used in the vectorization of STMT. */ static tree ! vect_init_vector (tree stmt, tree vector_var, tree vector_type) { stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); - loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); - struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); tree new_var; tree init_stmt; tree vec_oprnd; --- 1204,1219 ---- /* Function vect_init_vector. Insert a new stmt (INIT_STMT) that initializes a new vector variable with ! the vector elements of VECTOR_VAR. Place the initialization at BSI if it ! is not NULL. Otherwise, place the initialization at the loop preheader. ! Return the DEF of INIT_STMT. ! It will be used in the vectorization of STMT. */ static tree ! vect_init_vector (tree stmt, tree vector_var, tree vector_type, ! block_stmt_iterator *bsi) { stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); tree new_var; tree init_stmt; tree vec_oprnd; *************** vect_init_vector (tree stmt, tree vector *** 1074,1087 **** new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_"); add_referenced_var (new_var); - init_stmt = build_gimple_modify_stmt (new_var, vector_var); new_temp = make_ssa_name (new_var, init_stmt); GIMPLE_STMT_OPERAND (init_stmt, 0) = new_temp; ! pe = loop_preheader_edge (loop); ! new_bb = bsi_insert_on_edge_immediate (pe, init_stmt); ! 
gcc_assert (!new_bb); if (vect_print_dump_info (REPORT_DETAILS)) { --- 1223,1245 ---- new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_"); add_referenced_var (new_var); init_stmt = build_gimple_modify_stmt (new_var, vector_var); new_temp = make_ssa_name (new_var, init_stmt); GIMPLE_STMT_OPERAND (init_stmt, 0) = new_temp; ! if (bsi) ! vect_finish_stmt_generation (stmt, init_stmt, bsi); ! else ! { ! loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); ! struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); ! ! if (nested_in_vect_loop_p (loop, stmt)) ! loop = loop->inner; ! pe = loop_preheader_edge (loop); ! new_bb = bsi_insert_on_edge_immediate (pe, init_stmt); ! gcc_assert (!new_bb); ! } if (vect_print_dump_info (REPORT_DETAILS)) { *************** vect_init_vector (tree stmt, tree vector *** 1097,1102 **** --- 1255,1261 ---- /* Function get_initial_def_for_induction Input: + STMT - a stmt that performs an induction operation in the loop. IV_PHI - the initial value of the induction variable Output: *************** get_initial_def_for_induction (tree iv_p *** 1115,1122 **** tree vectype = get_vectype_for_scalar_type (scalar_type); int nunits = TYPE_VECTOR_SUBPARTS (vectype); edge pe = loop_preheader_edge (loop); basic_block new_bb; - block_stmt_iterator bsi; tree vec, vec_init, vec_step, t; tree access_fn; tree new_var; --- 1274,1281 ---- tree vectype = get_vectype_for_scalar_type (scalar_type); int nunits = TYPE_VECTOR_SUBPARTS (vectype); edge pe = loop_preheader_edge (loop); + struct loop *iv_loop; basic_block new_bb; tree vec, vec_init, vec_step, t; tree access_fn; tree new_var; *************** get_initial_def_for_induction (tree iv_p *** 1130,1137 **** int ncopies = vf / nunits; tree expr; stmt_vec_info phi_info = vinfo_for_stmt (iv_phi); tree stmts; ! 
tree stmt = NULL_TREE; block_stmt_iterator si; basic_block bb = bb_for_stmt (iv_phi); --- 1289,1301 ---- int ncopies = vf / nunits; tree expr; stmt_vec_info phi_info = vinfo_for_stmt (iv_phi); + bool nested_in_vect_loop = false; tree stmts; ! imm_use_iterator imm_iter; ! use_operand_p use_p; ! tree exit_phi; ! edge latch_e; ! tree loop_arg; block_stmt_iterator si; basic_block bb = bb_for_stmt (iv_phi); *************** get_initial_def_for_induction (tree iv_p *** 1140,1204 **** /* Find the first insertion point in the BB. */ si = bsi_after_labels (bb); - stmt = bsi_stmt (si); ! access_fn = analyze_scalar_evolution (loop, PHI_RESULT (iv_phi)); gcc_assert (access_fn); ! ok = vect_is_simple_iv_evolution (loop->num, access_fn, ! &init_expr, &step_expr); gcc_assert (ok); /* Create the vector that holds the initial_value of the induction. */ ! new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_"); ! add_referenced_var (new_var); ! ! new_name = force_gimple_operand (init_expr, &stmts, false, new_var); ! if (stmts) { ! new_bb = bsi_insert_on_edge_immediate (pe, stmts); ! gcc_assert (!new_bb); } ! ! t = NULL_TREE; ! t = tree_cons (NULL_TREE, new_name, t); ! for (i = 1; i < nunits; i++) { ! tree tmp; ! /* Create: new_name = new_name + step_expr */ ! tmp = fold_build2 (PLUS_EXPR, scalar_type, new_name, step_expr); ! init_stmt = build_gimple_modify_stmt (new_var, tmp); ! new_name = make_ssa_name (new_var, init_stmt); ! GIMPLE_STMT_OPERAND (init_stmt, 0) = new_name; ! new_bb = bsi_insert_on_edge_immediate (pe, init_stmt); ! gcc_assert (!new_bb); ! if (vect_print_dump_info (REPORT_DETAILS)) ! { ! fprintf (vect_dump, "created new init_stmt: "); ! print_generic_expr (vect_dump, init_stmt, TDF_SLIM); ! } ! t = tree_cons (NULL_TREE, new_name, t); } - vec = build_constructor_from_list (vectype, nreverse (t)); - vec_init = vect_init_vector (stmt, vec, vectype); /* Create the vector that holds the step of the induction. */ ! expr = build_int_cst (scalar_type, vf); ! 
new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr); t = NULL_TREE; for (i = 0; i < nunits; i++) t = tree_cons (NULL_TREE, unshare_expr (new_name), t); vec = build_constructor_from_list (vectype, t); ! vec_step = vect_init_vector (stmt, vec, vectype); /* Create the following def-use cycle: loop prolog: ! vec_init = [X, X+S, X+2*S, X+3*S] ! vec_step = [VF*S, VF*S, VF*S, VF*S] loop: vec_iv = PHI ... --- 1304,1410 ---- /* Find the first insertion point in the BB. */ si = bsi_after_labels (bb); ! if (INTEGRAL_TYPE_P (scalar_type)) ! step_expr = build_int_cst (scalar_type, 0); ! else ! step_expr = build_real (scalar_type, dconst0); ! ! /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */ ! if (nested_in_vect_loop_p (loop, iv_phi)) ! { ! nested_in_vect_loop = true; ! iv_loop = loop->inner; ! } ! else ! iv_loop = loop; ! gcc_assert (iv_loop == (bb_for_stmt (iv_phi))->loop_father); ! ! latch_e = loop_latch_edge (iv_loop); ! loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e); ! ! access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi)); gcc_assert (access_fn); ! ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn, ! &init_expr, &step_expr); gcc_assert (ok); + pe = loop_preheader_edge (iv_loop); /* Create the vector that holds the initial_value of the induction. */ ! if (nested_in_vect_loop) { ! /* iv_loop is nested in the loop to be vectorized. init_expr had already ! been created during vectorization of previous stmts; We obtain it from ! the STMT_VINFO_VEC_STMT of the defining stmt. */ ! tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop)); ! vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL); } ! else { ! /* iv_loop is the loop to be vectorized. Create: ! vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */ ! new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_"); ! add_referenced_var (new_var); ! 
new_name = force_gimple_operand (init_expr, &stmts, false, new_var); ! if (stmts) ! { ! new_bb = bsi_insert_on_edge_immediate (pe, stmts); ! gcc_assert (!new_bb); ! } ! t = NULL_TREE; ! t = tree_cons (NULL_TREE, init_expr, t); ! for (i = 1; i < nunits; i++) ! { ! tree tmp; ! /* Create: new_name_i = new_name + step_expr */ ! tmp = fold_build2 (PLUS_EXPR, scalar_type, new_name, step_expr); ! init_stmt = build_gimple_modify_stmt (new_var, tmp); ! new_name = make_ssa_name (new_var, init_stmt); ! GIMPLE_STMT_OPERAND (init_stmt, 0) = new_name; ! ! new_bb = bsi_insert_on_edge_immediate (pe, init_stmt); ! gcc_assert (!new_bb); ! ! if (vect_print_dump_info (REPORT_DETAILS)) ! { ! fprintf (vect_dump, "created new init_stmt: "); ! print_generic_expr (vect_dump, init_stmt, TDF_SLIM); ! } ! t = tree_cons (NULL_TREE, new_name, t); ! } ! /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */ ! vec = build_constructor_from_list (vectype, nreverse (t)); ! vec_init = vect_init_vector (iv_phi, vec, vectype, NULL); } /* Create the vector that holds the step of the induction. */ ! if (nested_in_vect_loop) ! /* iv_loop is nested in the loop to be vectorized. Generate: ! vec_step = [S, S, S, S] */ ! new_name = step_expr; ! else ! { ! /* iv_loop is the loop to be vectorized. Generate: ! vec_step = [VF*S, VF*S, VF*S, VF*S] */ ! expr = build_int_cst (scalar_type, vf); ! new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr); ! } ! t = NULL_TREE; for (i = 0; i < nunits; i++) t = tree_cons (NULL_TREE, unshare_expr (new_name), t); vec = build_constructor_from_list (vectype, t); ! vec_step = vect_init_vector (iv_phi, vec, vectype, NULL); /* Create the following def-use cycle: loop prolog: ! vec_init = ... ! vec_step = ... loop: vec_iv = PHI ... *************** get_initial_def_for_induction (tree iv_p *** 1209,1215 **** /* Create the induction-phi that defines the induction-operand. 
*/ vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"); add_referenced_var (vec_dest); ! induction_phi = create_phi_node (vec_dest, loop->header); set_stmt_info (get_stmt_ann (induction_phi), new_stmt_vec_info (induction_phi, loop_vinfo)); induc_def = PHI_RESULT (induction_phi); --- 1415,1421 ---- /* Create the induction-phi that defines the induction-operand. */ vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"); add_referenced_var (vec_dest); ! induction_phi = create_phi_node (vec_dest, iv_loop->header); set_stmt_info (get_stmt_ann (induction_phi), new_stmt_vec_info (induction_phi, loop_vinfo)); induc_def = PHI_RESULT (induction_phi); *************** get_initial_def_for_induction (tree iv_p *** 1220,1234 **** induc_def, vec_step)); vec_def = make_ssa_name (vec_dest, new_stmt); GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def; ! bsi = bsi_for_stmt (stmt); ! vect_finish_stmt_generation (stmt, new_stmt, &bsi); /* Set the arguments of the phi node: */ ! add_phi_arg (induction_phi, vec_init, loop_preheader_edge (loop)); ! add_phi_arg (induction_phi, vec_def, loop_latch_edge (loop)); ! /* In case the vectorization factor (VF) is bigger than the number of elements that we can fit in a vectype (nunits), we have to generate more than one vector stmt - i.e - we need to "unroll" the vector stmt by a factor VF/nunits. For more details see documentation --- 1426,1441 ---- induc_def, vec_step)); vec_def = make_ssa_name (vec_dest, new_stmt); GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def; ! bsi_insert_before (&si, new_stmt, BSI_SAME_STMT); ! set_stmt_info (get_stmt_ann (new_stmt), ! new_stmt_vec_info (new_stmt, loop_vinfo)); /* Set the arguments of the phi node: */ ! add_phi_arg (induction_phi, vec_init, pe); ! add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop)); ! 
/* In case that vectorization factor (VF) is bigger than the number of elements that we can fit in a vectype (nunits), we have to generate more than one vector stmt - i.e - we need to "unroll" the vector stmt by a factor VF/nunits. For more details see documentation *************** get_initial_def_for_induction (tree iv_p *** 1237,1242 **** --- 1444,1451 ---- if (ncopies > 1) { stmt_vec_info prev_stmt_vinfo; + /* FORNOW. This restriction should be relaxed. */ + gcc_assert (!nested_in_vect_loop); /* Create the vector that holds the step of the induction. */ expr = build_int_cst (scalar_type, nunits); *************** get_initial_def_for_induction (tree iv_p *** 1245,1251 **** for (i = 0; i < nunits; i++) t = tree_cons (NULL_TREE, unshare_expr (new_name), t); vec = build_constructor_from_list (vectype, t); ! vec_step = vect_init_vector (stmt, vec, vectype); vec_def = induc_def; prev_stmt_vinfo = vinfo_for_stmt (induction_phi); --- 1454,1460 ---- for (i = 0; i < nunits; i++) t = tree_cons (NULL_TREE, unshare_expr (new_name), t); vec = build_constructor_from_list (vectype, t); ! vec_step = vect_init_vector (iv_phi, vec, vectype, NULL); vec_def = induc_def; prev_stmt_vinfo = vinfo_for_stmt (induction_phi); *************** get_initial_def_for_induction (tree iv_p *** 1253,1271 **** { tree tmp; ! /* vec_i = vec_prev + vec_{step*nunits} */ tmp = build2 (PLUS_EXPR, vectype, vec_def, vec_step); new_stmt = build_gimple_modify_stmt (NULL_TREE, tmp); vec_def = make_ssa_name (vec_dest, new_stmt); GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def; ! bsi = bsi_for_stmt (stmt); ! vect_finish_stmt_generation (stmt, new_stmt, &bsi); ! STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt; prev_stmt_vinfo = vinfo_for_stmt (new_stmt); } } if (vect_print_dump_info (REPORT_DETAILS)) { fprintf (vect_dump, "transform induction: created def-use cycle:"); --- 1462,1511 ---- { tree tmp; ! 
/* vec_i = vec_prev + vec_step */ tmp = build2 (PLUS_EXPR, vectype, vec_def, vec_step); new_stmt = build_gimple_modify_stmt (NULL_TREE, tmp); vec_def = make_ssa_name (vec_dest, new_stmt); GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def; ! bsi_insert_before (&si, new_stmt, BSI_SAME_STMT); ! set_stmt_info (get_stmt_ann (new_stmt), ! new_stmt_vec_info (new_stmt, loop_vinfo)); STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt; prev_stmt_vinfo = vinfo_for_stmt (new_stmt); } } + if (nested_in_vect_loop) + { + /* Find the loop-closed exit-phi of the induction, and record + the final vector of induction results: */ + exit_phi = NULL; + FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg) + { + if (!flow_bb_inside_loop_p (iv_loop, bb_for_stmt (USE_STMT (use_p)))) + { + exit_phi = USE_STMT (use_p); + break; + } + } + if (exit_phi) + { + stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi); + /* FORNOW. Currently not supporting the case that an inner-loop induction + is not used in the outer-loop (i.e. only outside the outer-loop). */ + gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo) + && !STMT_VINFO_LIVE_P (stmt_vinfo)); + + STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt; + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "vector of inductions after inner-loop:"); + print_generic_expr (vect_dump, new_stmt, TDF_SLIM); + } + } + } + + if (vect_print_dump_info (REPORT_DETAILS)) { fprintf (vect_dump, "transform induction: created def-use cycle:"); *************** vect_get_vec_def_for_operand (tree op, t *** 1301,1307 **** tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo); int nunits = TYPE_VECTOR_SUBPARTS (vectype); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); - struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); tree vec_inv; tree vec_cst; tree t = NULL_TREE; --- 1541,1546 ---- *************** vect_get_vec_def_for_operand (tree op, t *** 1352,1358 **** vector_type = get_vectype_for_scalar_type (TREE_TYPE (op)); vec_cst = build_vector (vector_type, t); ! 
return vect_init_vector (stmt, vec_cst, vector_type); } /* Case 2: operand is defined outside the loop - loop invariant. */ --- 1591,1597 ---- vector_type = get_vectype_for_scalar_type (TREE_TYPE (op)); vec_cst = build_vector (vector_type, t); ! return vect_init_vector (stmt, vec_cst, vector_type, NULL); } /* Case 2: operand is defined outside the loop - loop invariant. */ *************** vect_get_vec_def_for_operand (tree op, t *** 1373,1380 **** /* FIXME: use build_constructor directly. */ vector_type = get_vectype_for_scalar_type (TREE_TYPE (def)); vec_inv = build_constructor_from_list (vector_type, t); ! ! return vect_init_vector (stmt, vec_inv, vector_type); } /* Case 3: operand is defined inside the loop. */ --- 1612,1618 ---- /* FIXME: use build_constructor directly. */ vector_type = get_vectype_for_scalar_type (TREE_TYPE (def)); vec_inv = build_constructor_from_list (vector_type, t); ! return vect_init_vector (stmt, vec_inv, vector_type, NULL); } /* Case 3: operand is defined inside the loop. */ *************** vect_get_vec_def_for_operand (tree op, t *** 1387,1400 **** def_stmt_info = vinfo_for_stmt (def_stmt); vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info); gcc_assert (vec_stmt); ! vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt, 0); return vec_oprnd; } /* Case 4: operand is defined by a loop header phi - reduction */ case vect_reduction_def: { gcc_assert (TREE_CODE (def_stmt) == PHI_NODE); /* Get the def before the loop */ op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop)); --- 1625,1644 ---- def_stmt_info = vinfo_for_stmt (def_stmt); vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info); gcc_assert (vec_stmt); ! if (TREE_CODE (vec_stmt) == PHI_NODE) ! vec_oprnd = PHI_RESULT (vec_stmt); ! else ! 
vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt, 0); return vec_oprnd; } /* Case 4: operand is defined by a loop header phi - reduction */ case vect_reduction_def: { + struct loop *loop; + gcc_assert (TREE_CODE (def_stmt) == PHI_NODE); + loop = (bb_for_stmt (def_stmt))->loop_father; /* Get the def before the loop */ op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop)); *************** vect_get_vec_def_for_operand (tree op, t *** 1406,1413 **** { gcc_assert (TREE_CODE (def_stmt) == PHI_NODE); ! /* Get the def before the loop */ ! return get_initial_def_for_induction (def_stmt); } default: --- 1650,1661 ---- { gcc_assert (TREE_CODE (def_stmt) == PHI_NODE); ! /* Get the def from the vectorized stmt. */ ! def_stmt_info = vinfo_for_stmt (def_stmt); ! vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info); ! gcc_assert (vec_stmt && (TREE_CODE (vec_stmt) == PHI_NODE)); ! vec_oprnd = PHI_RESULT (vec_stmt); ! return vec_oprnd; } default: *************** vect_get_vec_def_for_stmt_copy (enum vec *** 1488,1494 **** vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info); gcc_assert (vec_stmt_for_operand); vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt_for_operand, 0); - return vec_oprnd; } --- 1736,1741 ---- *************** vect_finish_stmt_generation (tree stmt, *** 1504,1510 **** --- 1751,1761 ---- stmt_vec_info stmt_info = vinfo_for_stmt (stmt); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + gcc_assert (stmt == bsi_stmt (*bsi)); + gcc_assert (TREE_CODE (stmt) != LABEL_EXPR); + bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT); + set_stmt_info (get_stmt_ann (vec_stmt), new_stmt_vec_info (vec_stmt, loop_vinfo)); *************** static tree *** 1572,1577 **** --- 1823,1830 ---- get_initial_def_for_reduction (tree stmt, tree init_val, tree *adjustment_def) { stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); tree vectype = STMT_VINFO_VECTYPE 
(stmt_vinfo); int nunits = TYPE_VECTOR_SUBPARTS (vectype); enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1)); *************** get_initial_def_for_reduction (tree stmt *** 1582,1589 **** --- 1835,1848 ---- tree t = NULL_TREE; int i; tree vector_type; + bool nested_in_vect_loop = false; gcc_assert (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)); + if (nested_in_vect_loop_p (loop, stmt)) + nested_in_vect_loop = true; + else + gcc_assert (loop == (bb_for_stmt (stmt))->loop_father); + vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL); switch (code) *************** get_initial_def_for_reduction (tree stmt *** 1591,1597 **** case WIDEN_SUM_EXPR: case DOT_PROD_EXPR: case PLUS_EXPR: ! *adjustment_def = init_val; /* Create a vector of zeros for init_def. */ if (INTEGRAL_TYPE_P (type)) def_for_init = build_int_cst (type, 0); --- 1850,1859 ---- case WIDEN_SUM_EXPR: case DOT_PROD_EXPR: case PLUS_EXPR: ! if (nested_in_vect_loop) ! *adjustment_def = vecdef; ! else ! *adjustment_def = init_val; /* Create a vector of zeros for init_def. */ if (INTEGRAL_TYPE_P (type)) def_for_init = build_int_cst (type, 0); *************** vect_create_epilog_for_reduction (tree v *** 1680,1703 **** tree new_phi; block_stmt_iterator exit_bsi; tree vec_dest; ! tree new_temp; tree new_name; ! tree epilog_stmt; ! tree new_scalar_dest, exit_phi; tree bitsize, bitpos, bytesize; enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1)); ! tree scalar_initial_def; tree vec_initial_def; tree orig_name; imm_use_iterator imm_iter; use_operand_p use_p; ! bool extract_scalar_result; ! tree reduction_op; tree orig_stmt; tree use_stmt; tree operation = GIMPLE_STMT_OPERAND (stmt, 1); int op_type; op_type = TREE_OPERAND_LENGTH (operation); reduction_op = TREE_OPERAND (operation, op_type-1); vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op)); --- 1942,1972 ---- tree new_phi; block_stmt_iterator exit_bsi; tree vec_dest; ! tree new_temp = NULL_TREE; tree new_name; ! 
tree epilog_stmt = NULL_TREE; ! tree new_scalar_dest, exit_phi, new_dest; tree bitsize, bitpos, bytesize; enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1)); ! tree adjustment_def; tree vec_initial_def; tree orig_name; imm_use_iterator imm_iter; use_operand_p use_p; ! bool extract_scalar_result = false; ! tree reduction_op, expr; tree orig_stmt; tree use_stmt; tree operation = GIMPLE_STMT_OPERAND (stmt, 1); + bool nested_in_vect_loop = false; int op_type; + if (nested_in_vect_loop_p (loop, stmt)) + { + loop = loop->inner; + nested_in_vect_loop = true; + } + op_type = TREE_OPERAND_LENGTH (operation); reduction_op = TREE_OPERAND (operation, op_type-1); vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op)); *************** vect_create_epilog_for_reduction (tree v *** 1710,1716 **** the scalar def before the loop, that defines the initial value of the reduction variable. */ vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt, ! &scalar_initial_def); add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop)); /* 1.2 set the loop-latch arg for the reduction-phi: */ --- 1979,1985 ---- the scalar def before the loop, that defines the initial value of the reduction variable. */ vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt, ! &adjustment_def); add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop)); /* 1.2 set the loop-latch arg for the reduction-phi: */ *************** vect_create_epilog_for_reduction (tree v *** 1789,1794 **** --- 2058,2072 ---- bitsize = TYPE_SIZE (scalar_type); bytesize = TYPE_SIZE_UNIT (scalar_type); + + /* In case this is a reduction in an inner-loop while vectorizing an outer + loop - we don't need to extract a single scalar result at the end of the + inner-loop. The final vector of partial results will be used in the + vectorized outer-loop, or reduced to a scalar result at the end of the + outer-loop. 
*/ + if (nested_in_vect_loop) + goto vect_finalize_reduction; + /* 2.3 Create the reduction code, using one of the three schemes described above. */ *************** vect_create_epilog_for_reduction (tree v *** 1935,1940 **** --- 2213,2219 ---- { tree rhs; + gcc_assert (!nested_in_vect_loop); if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "extract scalar result"); *************** vect_create_epilog_for_reduction (tree v *** 1953,1977 **** bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT); } ! /* 2.4 Adjust the final result by the initial value of the reduction variable. (When such adjustment is not needed, then ! 'scalar_initial_def' is zero). ! Create: ! s_out4 = scalar_expr */ ! ! if (scalar_initial_def) { ! tree tmp = build2 (code, scalar_type, new_temp, scalar_initial_def); ! epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, tmp); ! new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp; bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT); } - /* 2.6 Replace uses of s_out0 with uses of s_out3 */ ! /* Find the loop-closed-use at the loop exit of the original scalar result. (The reduction result is expected to have two immediate uses - one at the latch block, and one at the loop exit). */ exit_phi = NULL; --- 2232,2273 ---- bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT); } ! vect_finalize_reduction: ! ! /* 2.5 Adjust the final result by the initial value of the reduction variable. (When such adjustment is not needed, then ! 'adjustment_def' is zero). For example, if code is PLUS we create: ! new_temp = loop_exit_def + adjustment_def */ ! if (adjustment_def) { ! if (nested_in_vect_loop) ! { ! gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE); ! expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def); ! new_dest = vect_create_destination_var (scalar_dest, vectype); ! } ! else ! { ! 
gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE); ! expr = build2 (code, scalar_type, new_temp, adjustment_def); ! new_dest = vect_create_destination_var (scalar_dest, scalar_type); ! } ! epilog_stmt = build_gimple_modify_stmt (new_dest, expr); ! new_temp = make_ssa_name (new_dest, epilog_stmt); GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp; + #if 0 + bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); + #else bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT); + #endif } ! /* 2.6 Handle the loop-exit phi */ ! ! /* Replace uses of s_out0 with uses of s_out3: ! Find the loop-closed-use at the loop exit of the original scalar result. (The reduction result is expected to have two immediate uses - one at the latch block, and one at the loop exit). */ exit_phi = NULL; *************** vect_create_epilog_for_reduction (tree v *** 1985,1990 **** --- 2281,2309 ---- } /* We expect to have found an exit_phi because of loop-closed-ssa form. */ gcc_assert (exit_phi); + + if (nested_in_vect_loop) + { + stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi); + + /* FORNOW. Currently not supporting the case that an inner-loop reduction + is not used in the outer-loop (but only outside the outer-loop). */ + gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo) + && !STMT_VINFO_LIVE_P (stmt_vinfo)); + + epilog_stmt = adjustment_def ? epilog_stmt : new_phi; + STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt; + set_stmt_info (get_stmt_ann (epilog_stmt), + new_stmt_vec_info (epilog_stmt, loop_vinfo)); + + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "vector of partial results after inner-loop:"); + print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); + } + return; + } + /* Replace the uses: */ orig_name = PHI_RESULT (exit_phi); FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) *************** vectorizable_reduction (tree stmt, block *** 2066,2080 **** tree new_stmt = NULL_TREE; int j; gcc_assert (ncopies >= 1); /* 1. 
Is vectorizable reduction? */ /* Not supportable if the reduction variable is used in the loop. */ ! if (STMT_VINFO_RELEVANT_P (stmt_info)) return false; ! if (!STMT_VINFO_LIVE_P (stmt_info)) return false; /* Make sure it was already recognized as a reduction computation. */ --- 2385,2414 ---- tree new_stmt = NULL_TREE; int j; + if (nested_in_vect_loop_p (loop, stmt)) + { + loop = loop->inner; + /* FORNOW. This restriction should be relaxed. */ + if (ncopies > 1) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "multiple types in nested loop."); + return false; + } + } + gcc_assert (ncopies >= 1); /* 1. Is vectorizable reduction? */ /* Not supportable if the reduction variable is used in the loop. */ ! if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer) return false; ! /* Reductions that are not used even in an enclosing outer-loop, ! are expected to be "live" (used out of the loop). */ ! if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop ! && !STMT_VINFO_LIVE_P (stmt_info)) return false; /* Make sure it was already recognized as a reduction computation. */ *************** vectorizable_reduction (tree stmt, block *** 2131,2139 **** gcc_assert (dt == vect_reduction_def); gcc_assert (TREE_CODE (def_stmt) == PHI_NODE); if (orig_stmt) ! gcc_assert (orig_stmt == vect_is_simple_reduction (loop, def_stmt)); else ! gcc_assert (stmt == vect_is_simple_reduction (loop, def_stmt)); if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt))) return false; --- 2465,2473 ---- gcc_assert (dt == vect_reduction_def); gcc_assert (TREE_CODE (def_stmt) == PHI_NODE); if (orig_stmt) ! gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, def_stmt)); else ! 
gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, def_stmt)); if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt))) return false; *************** vectorizable_call (tree stmt, block_stmt *** 2358,2363 **** --- 2692,2698 ---- int nunits_in; int nunits_out; loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type; enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type}; tree new_stmt; *************** vectorizable_call (tree stmt, block_stmt *** 2467,2472 **** --- 2802,2815 ---- needs to be generated. */ gcc_assert (ncopies >= 1); + /* FORNOW. This restriction should be relaxed. */ + if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "multiple types in nested loop."); + return false; + } + if (!vec_stmt) /* transformation not required. */ { STMT_VINFO_TYPE (stmt_info) = call_vec_info_type; *************** vectorizable_call (tree stmt, block_stmt *** 2481,2486 **** --- 2824,2837 ---- if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "transform operation."); + /* FORNOW. This restriction should be relaxed. */ + if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "multiple types in nested loop."); + return false; + } + /* Handle def. 
*/ scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0); vec_dest = vect_create_destination_var (scalar_dest, vectype_out); *************** vectorizable_conversion (tree stmt, bloc *** 2672,2677 **** --- 3023,3029 ---- tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK; tree decl1 = NULL_TREE, decl2 = NULL_TREE; tree new_temp; *************** vectorizable_conversion (tree stmt, bloc *** 2753,2758 **** --- 3105,3118 ---- needs to be generated. */ gcc_assert (ncopies >= 1); + /* FORNOW. This restriction should be relaxed. */ + if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "multiple types in nested loop."); + return false; + } + /* Check the operands of the operation. */ if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0)) { *************** vectorizable_operation (tree stmt, block *** 3094,3099 **** --- 3454,3460 ---- stmt_vec_info stmt_info = vinfo_for_stmt (stmt); tree vectype = STMT_VINFO_VECTYPE (stmt_info); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); enum tree_code code; enum machine_mode vec_mode; tree new_temp; *************** vectorizable_operation (tree stmt, block *** 3112,3117 **** --- 3473,3485 ---- int j; gcc_assert (ncopies >= 1); + /* FORNOW. This restriction should be relaxed. 
*/ + if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "multiple types in nested loop."); + return false; + } if (!STMT_VINFO_RELEVANT_P (stmt_info)) return false; *************** vectorizable_type_demotion (tree stmt, b *** 3374,3379 **** --- 3742,3748 ---- tree vec_oprnd0=NULL, vec_oprnd1=NULL; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); enum tree_code code, code1 = ERROR_MARK; tree new_temp; tree def, def_stmt; *************** vectorizable_type_demotion (tree stmt, b *** 3426,3431 **** --- 3795,3807 ---- ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out; gcc_assert (ncopies >= 1); + /* FORNOW. This restriction should be relaxed. */ + if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "multiple types in nested loop."); + return false; + } if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest)) && INTEGRAL_TYPE_P (TREE_TYPE (op0))) *************** vectorizable_type_promotion (tree stmt, *** 3523,3528 **** --- 3899,3905 ---- tree vec_oprnd0=NULL, vec_oprnd1=NULL; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK; tree decl1 = NULL_TREE, decl2 = NULL_TREE; int op_type; *************** vectorizable_type_promotion (tree stmt, *** 3576,3581 **** --- 3953,3965 ---- ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in; gcc_assert (ncopies >= 1); + /* FORNOW. This restriction should be relaxed. */ + if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "multiple types in nested loop."); + return false; + } if (! 
((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest)) && INTEGRAL_TYPE_P (TREE_TYPE (op0))) *************** vectorizable_store (tree stmt, block_stm *** 3868,3876 **** struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL; tree vectype = STMT_VINFO_VECTYPE (stmt_info); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); enum machine_mode vec_mode; tree dummy; ! enum dr_alignment_support alignment_support_cheme; ssa_op_iter iter; def_operand_p def_p; tree def, def_stmt; --- 4252,4261 ---- struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL; tree vectype = STMT_VINFO_VECTYPE (stmt_info); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); enum machine_mode vec_mode; tree dummy; ! enum dr_alignment_support alignment_support_scheme; ssa_op_iter iter; def_operand_p def_p; tree def, def_stmt; *************** vectorizable_store (tree stmt, block_stm *** 3884,3891 **** --- 4269,4286 ---- bool strided_store = false; unsigned int group_size, i; VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL; + bool inv_p; + gcc_assert (ncopies >= 1); + /* FORNOW. This restriction should be relaxed. */ + if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "multiple types in nested loop."); + return false; + } + if (!STMT_VINFO_RELEVANT_P (stmt_info)) return false; *************** vectorizable_store (tree stmt, block_stm *** 3951,3956 **** --- 4346,4354 ---- DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++; + /* FORNOW */ + gcc_assert (!nested_in_vect_loop_p (loop, stmt)); + /* We vectorize all the stmts of the interleaving group when we reach the last stmt in the group. 
*/ if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt)) *************** vectorizable_store (tree stmt, block_stm *** 3973,3981 **** dr_chain = VEC_alloc (tree, heap, group_size); oprnds = VEC_alloc (tree, heap, group_size); ! alignment_support_cheme = vect_supportable_dr_alignment (first_dr); ! gcc_assert (alignment_support_cheme); ! gcc_assert (alignment_support_cheme == dr_aligned); /* FORNOW */ /* In case the vectorization factor (VF) is bigger than the number of elements that we can fit in a vectype (nunits), we have to generate --- 4371,4379 ---- dr_chain = VEC_alloc (tree, heap, group_size); oprnds = VEC_alloc (tree, heap, group_size); ! alignment_support_scheme = vect_supportable_dr_alignment (first_dr); ! gcc_assert (alignment_support_scheme); ! gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */ /* In case the vectorization factor (VF) is bigger than the number of elements that we can fit in a vectype (nunits), we have to generate *************** vectorizable_store (tree stmt, block_stm *** 4045,4053 **** VEC_quick_push(tree, oprnds, vec_oprnd); next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt)); } ! dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, NULL_TREE, &dummy, &ptr_incr, false, ! TREE_TYPE (vec_oprnd)); } else { --- 4443,4452 ---- VEC_quick_push(tree, oprnds, vec_oprnd); next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt)); } ! dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE, &dummy, &ptr_incr, false, ! TREE_TYPE (vec_oprnd), &inv_p); ! gcc_assert (!inv_p); } else { *************** vectorizable_store (tree stmt, block_stm *** 4065,4071 **** VEC_replace(tree, dr_chain, i, vec_oprnd); VEC_replace(tree, oprnds, i, vec_oprnd); } ! dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt); } if (strided_store) --- 4464,4471 ---- VEC_replace(tree, dr_chain, i, vec_oprnd); VEC_replace(tree, oprnds, i, vec_oprnd); } ! dataref_ptr = ! 
bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE); } if (strided_store) *************** vectorizable_store (tree stmt, block_stm *** 4125,4131 **** if (!next_stmt) break; /* Bump the vector pointer. */ ! dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt); } } --- 4525,4532 ---- if (!next_stmt) break; /* Bump the vector pointer. */ ! dataref_ptr = ! bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE); } } *************** vectorizable_store (tree stmt, block_stm *** 4136,4149 **** /* Function vect_setup_realignment This function is called when vectorizing an unaligned load using ! the dr_unaligned_software_pipeline scheme. This function generates the following code at the loop prolog: p = initial_addr; ! msq_init = *(floor(p)); # prolog load realignment_token = call target_builtin; loop: ! msq = phi (msq_init, ---) The code above sets up a new (vector) pointer, pointing to the first location accessed by STMT, and a "floor-aligned" load using that pointer. --- 4537,4553 ---- /* Function vect_setup_realignment This function is called when vectorizing an unaligned load using ! the dr_explicit_realign[_optimized] scheme. This function generates the following code at the loop prolog: p = initial_addr; ! x msq_init = *(floor(p)); # prolog load realignment_token = call target_builtin; loop: ! x msq = phi (msq_init, ---) ! ! The stmts marked with x are generated only for the case of ! dr_explicit_realign_optimized. The code above sets up a new (vector) pointer, pointing to the first location accessed by STMT, and a "floor-aligned" load using that pointer. *************** vectorizable_store (tree stmt, block_stm *** 4152,4170 **** whose arguments are the result of the prolog-load (created by this function) and the result of a load that takes place in the loop (to be created by the caller to this function). 
The caller to this function uses the phi-result (msq) to create the realignment code inside the loop, and sets up the missing phi argument, as follows: - loop: msq = phi (msq_init, lsq) lsq = *(floor(p')); # load in loop result = realign_load (msq, lsq, realignment_token); Input: STMT - (scalar) load stmt to be vectorized. This load accesses a memory location that may be unaligned. BSI - place where new code is to be inserted. Output: REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load --- 4556,4584 ---- whose arguments are the result of the prolog-load (created by this function) and the result of a load that takes place in the loop (to be created by the caller to this function). + + For the case of dr_explicit_realign_optimizedr: The caller to this function uses the phi-result (msq) to create the realignment code inside the loop, and sets up the missing phi argument, as follows: loop: msq = phi (msq_init, lsq) lsq = *(floor(p')); # load in loop result = realign_load (msq, lsq, realignment_token); + For the case of dr_explicit_realign: + loop: + msq = *(floor(p)); # load in loop + p' = p + (VS-1); + lsq = *(floor(p')); # load in loop + result = realign_load (msq, lsq, realignment_token); + Input: STMT - (scalar) load stmt to be vectorized. This load accesses a memory location that may be unaligned. BSI - place where new code is to be inserted. + ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes + is used. Output: REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load *************** vectorizable_store (tree stmt, block_stm *** 4173,4217 **** static tree vect_setup_realignment (tree stmt, block_stmt_iterator *bsi, ! tree *realignment_token) { stmt_vec_info stmt_info = vinfo_for_stmt (stmt); tree vectype = STMT_VINFO_VECTYPE (stmt_info); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); ! 
edge pe = loop_preheader_edge (loop); tree scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0); tree vec_dest; - tree init_addr; tree inc; tree ptr; tree data_ref; tree new_stmt; basic_block new_bb; ! tree msq_init; tree new_temp; tree phi_stmt; ! tree msq; ! /* 1. Create msq_init = *(floor(p1)) in the loop preheader */ ! vec_dest = vect_create_destination_var (scalar_dest, vectype); ! ptr = vect_create_data_ref_ptr (stmt, bsi, NULL_TREE, &init_addr, &inc, true, ! NULL_TREE); ! data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr); ! new_stmt = build_gimple_modify_stmt (vec_dest, data_ref); ! new_temp = make_ssa_name (vec_dest, new_stmt); ! GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; ! new_bb = bsi_insert_on_edge_immediate (pe, new_stmt); ! gcc_assert (!new_bb); ! msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0); ! copy_virtual_operands (new_stmt, stmt); ! update_vuses_to_preheader (new_stmt, loop); - /* 2. Create permutation mask, if required, in loop preheader. */ if (targetm.vectorize.builtin_mask_for_load) { tree builtin_decl; builtin_decl = targetm.vectorize.builtin_mask_for_load (); new_stmt = build_call_expr (builtin_decl, 1, init_addr); vec_dest = vect_create_destination_var (scalar_dest, --- 4587,4733 ---- static tree vect_setup_realignment (tree stmt, block_stmt_iterator *bsi, ! tree *realignment_token, ! enum dr_alignment_support alignment_support_scheme, ! tree init_addr, ! struct loop **at_loop) { stmt_vec_info stmt_info = vinfo_for_stmt (stmt); tree vectype = STMT_VINFO_VECTYPE (stmt_info); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); ! edge pe; tree scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0); tree vec_dest; tree inc; tree ptr; tree data_ref; tree new_stmt; basic_block new_bb; ! tree msq_init = NULL_TREE; tree new_temp; tree phi_stmt; ! tree msq = NULL_TREE; ! tree stmts = NULL_TREE; ! bool inv_p; ! bool compute_in_loop = false; ! bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); ! 
struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father; ! struct loop *loop_for_initial_load; ! ! gcc_assert (alignment_support_scheme == dr_explicit_realign ! || alignment_support_scheme == dr_explicit_realign_optimized); ! ! /* We need to generate three things: ! 1. the misalignment computation ! 2. the extra vector load (for the optimized realignment scheme). ! 3. the phi node for the two vectors from which the realignment is ! done (for the optimized realignment scheme). ! */ ! ! /* 1. Determine where to generate the misalignment computation. ! ! If INIT_ADDR is NULL_TREE, this indicates that the misalignment ! calculation will be generated by this function, outside the loop (in the ! preheader). Otherwise, INIT_ADDR had already been computed for us by the ! caller, inside the loop. ! ! Background: If the misalignment remains fixed throughout the iterations of ! the loop, then both realignment schemes are applicable, and also the ! misalignment computation can be done outside LOOP. This is because we are ! vectorizing LOOP, and so the memory accesses in LOOP advance in steps that ! are a multiple of VS (the Vector Size), and therefore the misalignment in ! different vectorized LOOP iterations is always the same. ! The problem arises only if the memory access is in an inner-loop nested ! inside LOOP, which is now being vectorized using outer-loop vectorization. ! This is the only case when the misalignment of the memory access may not ! remain fixed thtoughout the iterations of the inner-loop (as exaplained in ! detail in vect_supportable_dr_alignment). In this case, not only is the ! optimized realignment scheme not applicable, but also the misalignment ! computation (and generation of the realignment token that is passed to ! REALIGN_LOAD) have to be done inside the loop. ! ! In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode ! or not, which in turn determines if the misalignment is computed inside ! 
the inner-loop, or outside LOOP. */ ! ! if (init_addr != NULL_TREE) ! { ! compute_in_loop = true; ! gcc_assert (alignment_support_scheme == dr_explicit_realign); ! } ! ! ! /* 2. Determine where to generate the extra vector load. ! ! For the optimized realignment scheme, instead of generating two vector ! loads in each iteration, we generate a single extra vector load in the ! preheader of the loop, and in each iteration reuse the result of the ! vector load from the previous iteration. In case the memory access is in ! an inner-loop nested inside LOOP, which is now being vectorized using ! outer-loop vectorization, we need to determine whether this initial vector ! load should be generated at the preheader of the inner-loop, or can be ! generated at the preheader of LOOP. If the memory access has no evolution ! in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has ! to be generated inside LOOP (in the preheader of the inner-loop). */ ! ! if (nested_in_vect_loop) ! { ! tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info); ! bool invariant_in_outerloop = ! (tree_int_cst_compare (outerloop_step, size_zero_node) == 0); ! loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner); ! } ! else ! loop_for_initial_load = loop; ! if (at_loop) ! *at_loop = loop_for_initial_load; ! /* 3. For the case of the optimized realignment, create the first vector ! load at the loop preheader. */ ! ! if (alignment_support_scheme == dr_explicit_realign_optimized) ! { ! /* Create msq_init = *(floor(p1)) in the loop preheader */ ! ! gcc_assert (!compute_in_loop); ! pe = loop_preheader_edge (loop_for_initial_load); ! vec_dest = vect_create_destination_var (scalar_dest, vectype); ! ptr = vect_create_data_ref_ptr (stmt, loop_for_initial_load, NULL_TREE, ! &init_addr, &inc, true, NULL_TREE, &inv_p); ! data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr); ! new_stmt = build_gimple_modify_stmt (vec_dest, data_ref); ! new_temp = make_ssa_name (vec_dest, new_stmt); ! 
GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; ! new_bb = bsi_insert_on_edge_immediate (pe, new_stmt); ! gcc_assert (!new_bb); ! msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0); ! copy_virtual_operands (new_stmt, stmt); ! update_vuses_to_preheader (new_stmt, loop_for_initial_load); ! } ! ! ! /* 4. Create realignment token using a target builtin, if available. ! It is done either inside the containing loop, or before LOOP (as ! determined above). */ if (targetm.vectorize.builtin_mask_for_load) { tree builtin_decl; + /* Compute INIT_ADDR - the initial addressed accessed by this memref. */ + if (compute_in_loop) + gcc_assert (init_addr); /* already computed by the caller. */ + else + { + /* Generate the INIT_ADDR computation outside LOOP. */ + init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts, + NULL_TREE, loop); + pe = loop_preheader_edge (loop); + new_bb = bsi_insert_on_edge_immediate (pe, stmts); + gcc_assert (!new_bb); + } + builtin_decl = targetm.vectorize.builtin_mask_for_load (); new_stmt = build_call_expr (builtin_decl, 1, init_addr); vec_dest = vect_create_destination_var (scalar_dest, *************** vect_setup_realignment (tree stmt, block *** 4219,4226 **** new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt); new_temp = make_ssa_name (vec_dest, new_stmt); GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; ! new_bb = bsi_insert_on_edge_immediate (pe, new_stmt); ! gcc_assert (!new_bb); *realignment_token = GIMPLE_STMT_OPERAND (new_stmt, 0); /* The result of the CALL_EXPR to this builtin is determined from --- 4735,4751 ---- new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt); new_temp = make_ssa_name (vec_dest, new_stmt); GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; ! ! if (compute_in_loop) ! bsi_insert_before (bsi, new_stmt, BSI_SAME_STMT); ! else ! { ! /* Generate the misalignment computation outside LOOP. */ ! pe = loop_preheader_edge (loop); ! new_bb = bsi_insert_on_edge_immediate (pe, new_stmt); ! gcc_assert (!new_bb); ! } ! 
*realignment_token = GIMPLE_STMT_OPERAND (new_stmt, 0); /* The result of the CALL_EXPR to this builtin is determined from *************** vect_setup_realignment (tree stmt, block *** 4231,4242 **** gcc_assert (TREE_READONLY (builtin_decl)); } ! /* 3. Create msq = phi in loop */ vec_dest = vect_create_destination_var (scalar_dest, vectype); msq = make_ssa_name (vec_dest, NULL_TREE); ! phi_stmt = create_phi_node (msq, loop->header); SSA_NAME_DEF_STMT (msq) = phi_stmt; ! add_phi_arg (phi_stmt, msq_init, loop_preheader_edge (loop)); return msq; } --- 4756,4776 ---- gcc_assert (TREE_READONLY (builtin_decl)); } ! if (alignment_support_scheme == dr_explicit_realign) ! return msq; ! ! gcc_assert (!compute_in_loop); ! gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized); ! ! ! /* 5. Create msq = phi in loop */ ! ! pe = loop_preheader_edge (containing_loop); vec_dest = vect_create_destination_var (scalar_dest, vectype); msq = make_ssa_name (vec_dest, NULL_TREE); ! phi_stmt = create_phi_node (msq, containing_loop->header); SSA_NAME_DEF_STMT (msq) = phi_stmt; ! add_phi_arg (phi_stmt, msq_init, pe); return msq; } *************** vectorizable_load (tree stmt, block_stmt *** 4526,4538 **** stmt_vec_info prev_stmt_info; loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr; tree vectype = STMT_VINFO_VECTYPE (stmt_info); tree new_temp; int mode; tree new_stmt = NULL_TREE; tree dummy; ! 
enum dr_alignment_support alignment_support_cheme; tree dataref_ptr = NULL_TREE; tree ptr_incr; int nunits = TYPE_VECTOR_SUBPARTS (vectype); --- 5060,5074 ---- stmt_vec_info prev_stmt_info; loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father; + bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr; tree vectype = STMT_VINFO_VECTYPE (stmt_info); tree new_temp; int mode; tree new_stmt = NULL_TREE; tree dummy; ! enum dr_alignment_support alignment_support_scheme; tree dataref_ptr = NULL_TREE; tree ptr_incr; int nunits = TYPE_VECTOR_SUBPARTS (vectype); *************** vectorizable_load (tree stmt, block_stmt *** 4541,4550 **** tree msq = NULL_TREE, lsq; tree offset = NULL_TREE; tree realignment_token = NULL_TREE; ! tree phi_stmt = NULL_TREE; VEC(tree,heap) *dr_chain = NULL; bool strided_load = false; tree first_stmt; if (!STMT_VINFO_RELEVANT_P (stmt_info)) return false; --- 5077,5100 ---- tree msq = NULL_TREE, lsq; tree offset = NULL_TREE; tree realignment_token = NULL_TREE; ! tree phi = NULL_TREE; VEC(tree,heap) *dr_chain = NULL; bool strided_load = false; tree first_stmt; + tree scalar_type; + bool inv_p; + bool compute_in_loop = false; + struct loop *at_loop; + + gcc_assert (ncopies >= 1); + + /* FORNOW. This restriction should be relaxed. */ + if (nested_in_vect_loop && ncopies > 1) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "multiple types in nested loop."); + return false; + } if (!STMT_VINFO_RELEVANT_P (stmt_info)) return false; *************** vectorizable_load (tree stmt, block_stmt *** 4577,4582 **** --- 5127,5133 ---- if (!STMT_VINFO_DATA_REF (stmt_info)) return false; + scalar_type = TREE_TYPE (DR_REF (dr)); mode = (int) TYPE_MODE (vectype); /* FORNOW. 
In some cases can vectorize even if data-type not supported *************** vectorizable_load (tree stmt, block_stmt *** 4592,4597 **** --- 5143,5150 ---- if (DR_GROUP_FIRST_DR (stmt_info)) { strided_load = true; + /* FORNOW */ + gcc_assert (! nested_in_vect_loop); /* Check if interleaving is supported. */ if (!vect_strided_load_supported (vectype)) *************** vectorizable_load (tree stmt, block_stmt *** 4630,4638 **** group_size = 1; } ! alignment_support_cheme = vect_supportable_dr_alignment (first_dr); ! gcc_assert (alignment_support_cheme); ! /* In case the vectorization factor (VF) is bigger than the number of elements that we can fit in a vectype (nunits), we have to generate --- 5183,5190 ---- group_size = 1; } ! alignment_support_scheme = vect_supportable_dr_alignment (first_dr); ! gcc_assert (alignment_support_scheme); /* In case the vectorization factor (VF) is bigger than the number of elements that we can fit in a vectype (nunits), we have to generate *************** vectorizable_load (tree stmt, block_stmt *** 4714,4720 **** } Otherwise, the data reference is potentially unaligned on a target that ! does not support unaligned accesses (dr_unaligned_software_pipeline) - then generate the following code, in which the data in each iteration is obtained by two vector loads, one from the previous iteration, and one from the current iteration: --- 5266,5272 ---- } Otherwise, the data reference is potentially unaligned on a target that ! does not support unaligned accesses (dr_explicit_realign_optimized) - then generate the following code, in which the data in each iteration is obtained by two vector loads, one from the previous iteration, and one from the current iteration: *************** vectorizable_load (tree stmt, block_stmt *** 4731,4757 **** msq = lsq; } */ ! if (alignment_support_cheme == dr_unaligned_software_pipeline) ! { ! msq = vect_setup_realignment (first_stmt, bsi, &realignment_token); ! phi_stmt = SSA_NAME_DEF_STMT (msq); ! 
offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1); } prev_stmt_info = NULL; for (j = 0; j < ncopies; j++) { /* 1. Create the vector pointer update chain. */ if (j == 0) ! dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, offset, &dummy, ! &ptr_incr, false, NULL_TREE); else ! dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt); for (i = 0; i < group_size; i++) { /* 2. Create the vector-load in the loop. */ ! switch (alignment_support_cheme) { case dr_aligned: gcc_assert (aligned_access_p (first_dr)); --- 5283,5334 ---- msq = lsq; } */ ! /* If the misalignment remains the same throughout the execution of the ! loop, we can create the init_addr and permutation mask at the loop ! preheader. Otherwise, it needs to be created inside the loop. ! This can only occur when vectorizing memory accesses in the inner-loop ! nested within an outer-loop that is being vectorized. */ ! ! if (nested_in_vect_loop_p (loop, stmt) ! && (TREE_INT_CST_LOW (DR_STEP (dr)) % UNITS_PER_SIMD_WORD != 0)) ! { ! gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized); ! compute_in_loop = true; ! } ! ! if ((alignment_support_scheme == dr_explicit_realign_optimized ! || alignment_support_scheme == dr_explicit_realign) ! && !compute_in_loop) ! { ! msq = vect_setup_realignment (first_stmt, bsi, &realignment_token, ! alignment_support_scheme, NULL_TREE, ! &at_loop); ! if (alignment_support_scheme == dr_explicit_realign_optimized) ! { ! phi = SSA_NAME_DEF_STMT (msq); ! offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1); ! } } + else + at_loop = loop; prev_stmt_info = NULL; for (j = 0; j < ncopies; j++) { /* 1. Create the vector pointer update chain. */ if (j == 0) ! dataref_ptr = vect_create_data_ref_ptr (first_stmt, ! at_loop, offset, ! &dummy, &ptr_incr, false, ! NULL_TREE, &inv_p); else ! dataref_ptr = ! bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE); for (i = 0; i < group_size; i++) { /* 2. Create the vector-load in the loop. */ ! 
switch (alignment_support_scheme) { case dr_aligned: gcc_assert (aligned_access_p (first_dr)); *************** vectorizable_load (tree stmt, block_stmt *** 4762,4775 **** int mis = DR_MISALIGNMENT (first_dr); tree tmis = (mis == -1 ? size_zero_node : size_int (mis)); - gcc_assert (!aligned_access_p (first_dr)); tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT)); data_ref = build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis); break; } ! case dr_unaligned_software_pipeline: ! gcc_assert (!aligned_access_p (first_dr)); data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr); break; default: --- 5339,5377 ---- int mis = DR_MISALIGNMENT (first_dr); tree tmis = (mis == -1 ? size_zero_node : size_int (mis)); tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT)); data_ref = build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis); break; } ! case dr_explicit_realign: ! { ! tree ptr, bump; ! tree vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1); ! ! if (compute_in_loop) ! msq = vect_setup_realignment (first_stmt, bsi, ! &realignment_token, ! dr_explicit_realign, ! dataref_ptr, NULL); ! ! data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr); ! vec_dest = vect_create_destination_var (scalar_dest, vectype); ! new_stmt = build_gimple_modify_stmt (vec_dest, data_ref); ! new_temp = make_ssa_name (vec_dest, new_stmt); ! GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; ! vect_finish_stmt_generation (stmt, new_stmt, bsi); ! copy_virtual_operands (new_stmt, stmt); ! mark_symbols_for_renaming (new_stmt); ! msq = new_temp; ! ! bump = size_binop (MULT_EXPR, vs_minus_1, ! TYPE_SIZE_UNIT (scalar_type)); ! ptr = bump_vector_ptr (dataref_ptr, NULL_TREE, bsi, stmt, bump); ! data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr); ! break; ! } ! 
case dr_explicit_realign_optimized: data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr); break; default: *************** vectorizable_load (tree stmt, block_stmt *** 4783,4811 **** copy_virtual_operands (new_stmt, stmt); mark_symbols_for_renaming (new_stmt); ! /* 3. Handle explicit realignment if necessary/supported. */ ! if (alignment_support_cheme == dr_unaligned_software_pipeline) { - /* Create in loop: - */ lsq = GIMPLE_STMT_OPERAND (new_stmt, 0); if (!realignment_token) realignment_token = dataref_ptr; vec_dest = vect_create_destination_var (scalar_dest, vectype); ! new_stmt = ! build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq, realignment_token); new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt); new_temp = make_ssa_name (vec_dest, new_stmt); GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; vect_finish_stmt_generation (stmt, new_stmt, bsi); ! if (i == group_size - 1 && j == ncopies - 1) ! add_phi_arg (phi_stmt, lsq, loop_latch_edge (loop)); ! msq = lsq; } if (strided_load) VEC_quick_push (tree, dr_chain, new_temp); if (i < group_size - 1) ! dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt); } if (strided_load) --- 5385,5454 ---- copy_virtual_operands (new_stmt, stmt); mark_symbols_for_renaming (new_stmt); ! /* 3. Handle explicit realignment if necessary/supported. Create in ! loop: vec_dest = realign_load (msq, lsq, realignment_token) */ ! if (alignment_support_scheme == dr_explicit_realign_optimized ! || alignment_support_scheme == dr_explicit_realign) { lsq = GIMPLE_STMT_OPERAND (new_stmt, 0); if (!realignment_token) realignment_token = dataref_ptr; vec_dest = vect_create_destination_var (scalar_dest, vectype); ! new_stmt = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq, ! realignment_token); new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt); new_temp = make_ssa_name (vec_dest, new_stmt); GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; vect_finish_stmt_generation (stmt, new_stmt, bsi); ! ! 
if (alignment_support_scheme == dr_explicit_realign_optimized) ! { ! if (i == group_size - 1 && j == ncopies - 1) ! add_phi_arg (phi, lsq, loop_latch_edge (containing_loop)); ! msq = lsq; ! } } + + /* 4. Handle invariant-load. */ + if (inv_p) + { + gcc_assert (!strided_load); + gcc_assert (nested_in_vect_loop_p (loop, stmt)); + if (j == 0) + { + int k; + tree t = NULL_TREE; + tree vec_inv, bitpos, bitsize = TYPE_SIZE (scalar_type); + + /* CHECKME: bitpos depends on endianess? */ + bitpos = bitsize_zero_node; + vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp, + bitsize, bitpos); + BIT_FIELD_REF_UNSIGNED (vec_inv) = + TYPE_UNSIGNED (scalar_type); + vec_dest = + vect_create_destination_var (scalar_dest, NULL_TREE); + new_stmt = build_gimple_modify_stmt (vec_dest, vec_inv); + new_temp = make_ssa_name (vec_dest, new_stmt); + GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; + vect_finish_stmt_generation (stmt, new_stmt, bsi); + + for (k = nunits - 1; k >= 0; --k) + t = tree_cons (NULL_TREE, new_temp, t); + /* FIXME: use build_constructor directly. */ + vec_inv = build_constructor_from_list (vectype, t); + new_temp = vect_init_vector (stmt, vec_inv, vectype, bsi); + new_stmt = SSA_NAME_DEF_STMT (new_temp); + } + else + gcc_unreachable (); /* FORNOW; FIXME. */ + } + if (strided_load) VEC_quick_push (tree, dr_chain, new_temp); if (i < group_size - 1) ! dataref_ptr = ! bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE); } if (strided_load) *************** vectorizable_live_operation (tree stmt, *** 4842,4847 **** --- 5485,5491 ---- tree operation; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); int i; int op_type; tree op; *************** vectorizable_live_operation (tree stmt, *** 4859,4864 **** --- 5503,5512 ---- if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME) return false; + /* FORNOW. CHECKME. 
*/ + if (nested_in_vect_loop_p (loop, stmt)) + return false; + operation = GIMPLE_STMT_OPERAND (stmt, 1); op_type = TREE_OPERAND_LENGTH (operation); *************** vect_gen_niters_for_prolog_loop (loop_ve *** 5643,5650 **** else { tree new_stmts = NULL_TREE; ! tree start_addr = ! vect_create_addr_base_for_vector_ref (dr_stmt, &new_stmts, NULL_TREE); tree ptr_type = TREE_TYPE (start_addr); tree size = TYPE_SIZE (ptr_type); tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1); --- 6291,6298 ---- else { tree new_stmts = NULL_TREE; ! tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt, ! &new_stmts, NULL_TREE, loop); tree ptr_type = TREE_TYPE (start_addr); tree size = TYPE_SIZE (ptr_type); tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1); *************** static tree *** 5817,5822 **** --- 6465,6471 ---- vect_create_cond_for_align_checks (loop_vec_info loop_vinfo, tree *cond_expr_stmt_list) { + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); VEC(tree,heap) *may_misalign_stmts = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo); tree ref_stmt, tmp; *************** vect_create_cond_for_align_checks (loop_ *** 5852,5859 **** /* create: addr_tmp = (int)(address_of_first_vector) */ addr_base = vect_create_addr_base_for_vector_ref (ref_stmt, ! &new_stmt_list, ! NULL_TREE); if (new_stmt_list != NULL_TREE) append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list); --- 6501,6507 ---- /* create: addr_tmp = (int)(address_of_first_vector) */ addr_base = vect_create_addr_base_for_vector_ref (ref_stmt, ! &new_stmt_list, NULL_TREE, loop); if (new_stmt_list != NULL_TREE) append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list); *************** vect_transform_loop (loop_vec_info loop_ *** 6067,6074 **** fprintf (vect_dump, "------>vectorizing statement: "); print_generic_expr (vect_dump, stmt, TDF_SLIM); } stmt_info = vinfo_for_stmt (stmt); ! 
gcc_assert (stmt_info); if (!STMT_VINFO_RELEVANT_P (stmt_info) && !STMT_VINFO_LIVE_P (stmt_info)) { --- 6715,6732 ---- fprintf (vect_dump, "------>vectorizing statement: "); print_generic_expr (vect_dump, stmt, TDF_SLIM); } + stmt_info = vinfo_for_stmt (stmt); ! ! /* vector stmts created in the outer-loop during vectorization of ! stmts in an inner-loop may not have a stmt_info, and do not ! need to be vectorized. */ ! if (!stmt_info) ! { ! bsi_next (&si); ! continue; ! } ! if (!STMT_VINFO_RELEVANT_P (stmt_info) && !STMT_VINFO_LIVE_P (stmt_info)) { *************** vect_transform_loop (loop_vec_info loop_ *** 6140,6143 **** --- 6798,6803 ---- if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)) fprintf (vect_dump, "LOOP VECTORIZED."); + if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOOPS)) + fprintf (vect_dump, "OUTER LOOP VECTORIZED."); }