diff --git a/gcc/Makefile.in b/gcc/Makefile.in index 0bde7ac..5002598 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -1522,6 +1522,7 @@ OBJS = \ tree-ssa-live.o \ tree-ssa-loop-ch.o \ tree-ssa-loop-im.o \ + tree-ssa-loop-interchange.o \ tree-ssa-loop-ivcanon.o \ tree-ssa-loop-ivopts.o \ tree-ssa-loop-manip.o \ diff --git a/gcc/common.opt b/gcc/common.opt index 1581ca8..5babe3f 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -2530,6 +2530,10 @@ ftree-loop-distribute-patterns Common Report Var(flag_tree_loop_distribute_patterns) Optimization Enable loop distribution for patterns transformed into a library call. +ftree-loop-interchange +Common Report Var(flag_tree_loop_interchange) Optimization +Enable loop interchange on trees. + ftree-loop-im Common Report Var(flag_tree_loop_im) Init(1) Optimization Enable loop invariant motion on trees. diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index e4cacf2..1825be8 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -8498,6 +8498,25 @@ ENDDO @end smallexample and the initialization loop is transformed into a call to memset zero. +@item -ftree-loop-interchange +@opindex ftree-loop-interchange +Perform loop interchange outside of graphite. This flag can improve cache +performance on loop nest and allow further loop optimizations, like +vectorization, to take place. For example, the loop +@smallexample +for (int i = 0; i < N; i++) + for (int j = 0; j < N; j++) + for (int k = 0; k < N; k++) + c[i][j] = c[i][j] + a[i][k]*b[k][j]; +@end smallexample +is transformed to +@smallexample +for (int i = 0; i < N; i++) + for (int k = 0; k < N; k++) + for (int j = 0; j < N; j++) + c[i][j] = c[i][j] + a[i][k]*b[k][j]; +@end smallexample + @item -ftree-loop-im @opindex ftree-loop-im Perform loop invariant motion on trees. This pass moves only invariants that diff --git a/gcc/params.def b/gcc/params.def index 805302b..c50a1fe 100644 --- a/gcc/params.def +++ b/gcc/params.def @@ -780,6 +780,20 @@ DEFPARAM (PARAM_L2_CACHE_SIZE, "The size of L2 cache.", 512, 0, 0) +/* Maximum number of statements in loop nest for loop interchange. */ + +DEFPARAM (PARAM_LOOP_INTERCHANGE_MAX_NUM_STMTS, + "loop-interchange-max-num-stmts", + "The maximum number of stmts in loop nest for loop interchange.", + 64, 0, 0) + +/* Minimum stride ratio for loop interchange to be profitiable. */ + +DEFPARAM (PARAM_LOOP_INTERCHANGE_STRIDE_RATIO, + "loop-interchange-stride-ratio", + "The minimum stride ratio for loop interchange to be profitable", + 2, 0, 0) + /* Whether we should use canonical types rather than deep "structural" type checking. Setting this value to 1 (the default) improves compilation performance in the C++ and Objective-C++ front end; diff --git a/gcc/passes.def b/gcc/passes.def index 00e75d2..2e38c6f 100644 --- a/gcc/passes.def +++ b/gcc/passes.def @@ -278,6 +278,7 @@ along with GCC; see the file COPYING3. If not see NEXT_PASS (pass_cd_dce); NEXT_PASS (pass_iv_canon); NEXT_PASS (pass_loop_distribution); + NEXT_PASS (pass_linterchange); NEXT_PASS (pass_copy_prop); NEXT_PASS (pass_graphite); PUSH_INSERT_PASSES_WITHIN (pass_graphite) diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-1.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-1.c new file mode 100644 index 0000000..f2392e3 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-1.c @@ -0,0 +1,50 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -ftree-loop-interchange -fdump-tree-linterchange-details" } */ + +/* Copied from graphite/interchange-4.c */ + +#define DEBUG 0 +#if DEBUG +#include +#endif + +double u[1782225]; + +static int __attribute__((noinline)) +foo (int N, int *res) +{ + int i, j; + double sum = 0; + for (i = 0; i < N; i++) + for (j = 0; j < N; j++) + sum = sum + u[i + 1335 * j]; + + for (i = 0; i < N; i++) + u[1336 * i] *= 2; + + *res = sum + N + u[1336 * 2] + u[1336]; +} + +extern void abort (); + +int +main (void) +{ + int i, j, res; + + for (i = 0; i < 1782225; i++) + u[i] = 2; + + foo (1335, &res); + +#if DEBUG + fprintf (stderr, "res = %d \n", res); +#endif + + if (res != 3565793) + abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "Loop_pair is interchanged" 1 "linterchange"} } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-10.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-10.c new file mode 100644 index 0000000..610610b --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-10.c @@ -0,0 +1,43 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -ftree-loop-interchange -fdump-tree-linterchange-details" } */ + +#define M 256 +int a[M][M], b[M][M]; +int __attribute__((noinline)) +double_reduc (int n) +{ + int sum = 0; + for (int j = 0; j < n; j++) + { + for (int i = 0; i < n; i++) + sum = sum + a[i][j]*b[i][j]; + } + return sum; +} + +extern void abort (); + +static void __attribute__((noinline)) +init (int i) +{ + for (int j = 0; j < M; j++) + { + a[i][j] = i; + b[i][j] = j; + } +} + +int main (void) +{ + for (int i = 0; i < M; ++i) + init (i); + + int sum = double_reduc (M); + + if (sum != 1065369600) + abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "Loop_pair is interchanged" 1 "linterchange" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-2.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-2.c new file mode 100644 index 0000000..8341a22 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-2.c @@ -0,0 +1,58 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -ftree-loop-interchange -fdump-tree-linterchange-details" } */ + +/* Copied from graphite/interchange-5.c */ + +#define DEBUG 0 +#if DEBUG +#include +#endif + +#define N 100 +#define M 1111 +int A[N][M]; + +static int __attribute__((noinline)) +foo (void) +{ + int i, j; + + for( i = 0; i < M; i++) + for( j = 0; j < N; j++) + A[j][i] = 5 * A[j][i]; + + return A[0][0] + A[N-1][M-1]; +} + +extern void abort (); + +static void __attribute__((noinline)) +init (int i) +{ + int j; + + for (j = 0; j < M; j++) + A[i][j] = 2; +} + +int +main (void) +{ + int i, j, res; + + for (i = 0; i < N; i++) + init (i); + + res = foo (); + +#if DEBUG + fprintf (stderr, "res = %d \n", res); +#endif + + if (res != 20) + abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "Loop_pair is interchanged" 1 "linterchange"} } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-3.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-3.c new file mode 100644 index 0000000..ca2a114 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-3.c @@ -0,0 +1,59 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -ftree-loop-interchange -fdump-tree-linterchange-details" } */ + +/* Copied from graphite/interchange-6.c */ + +#define DEBUG 0 +#if DEBUG +#include +#endif + +#define N 100 +#define M 200 + +static int __attribute__((noinline)) +foo (int A[N][M]) +{ + int i, j; + + /* This loop should be interchanged. */ + for(j = 0; j < M; j++) + for(i = 0; i < N; i++) + A[i][j] = A[i][j] + A[i][j]; + + return A[0][0] + A[N-1][M-1]; +} + +extern void abort (); + +static void __attribute__((noinline)) +init (int *arr, int i) +{ + int j; + + for (j = 0; j < M; j++) + arr[j] = 2; +} + +int +main (void) +{ + int A[N][M]; + int i, j, res; + + for (i = 0; i < N; i++) + init (A[i], i); + + res = foo (A); + +#if DEBUG + fprintf (stderr, "res = %d \n", res); +#endif + + if (res != 8) + abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "Loop_pair is interchanged" 1 "linterchange"} } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-4.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-4.c new file mode 100644 index 0000000..ff820f3 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-4.c @@ -0,0 +1,50 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -ftree-loop-interchange -fdump-tree-linterchange-details" } */ + +/* Copied from graphite/interchange-7.c */ + +#define DEBUG 0 +#if DEBUG +#include +#endif + +#define N 111 +#define M 1111 + +static int __attribute__((noinline)) +foo (double *a) +{ + int i,j; + int r = 0; + + for (i = 0; i < N; ++i) + for (j = 0; j < M; ++j) + r += a[j * N + i]; + + return r; +} + +extern void abort (); + +int +main (void) +{ + double A[N*M]; + int i, res; + + for (i = 0; i < N*M; i++) + A[i] = 2; + + res = foo (A); + +#if DEBUG + fprintf (stderr, "res = %d \n", res); +#endif + + if (res != 246642) + abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "Loop_pair is interchanged" 1 "linterchange"} } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-5.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-5.c new file mode 100644 index 0000000..706da88 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-5.c @@ -0,0 +1,71 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -ftree-loop-interchange -fdump-tree-linterchange-details" } */ + +#define M 256 +int a[M][M], b[M][M], c[M][M], d[M][M]; +void __attribute__((noinline)) +matrix_mul_1 (int n) +{ + for (int i = 0; i < n; i++) + for (int j = 0; j < n; j++) + for (int k = 0; k < n; k++) + c[i][j] = c[i][j] + a[i][k]*b[k][j]; +} + +void __attribute__((noinline)) +matrix_mul_2 (int n) +{ + for (int i = 0; i < n; i++) + { + for (int j = 0; j < n; j++) + { + for (int k = 0; k < n; k++) + d[i][j] = d[i][j] + a[i][k]*b[k][j]; + + asm volatile ("" ::: "memory"); + } + asm volatile ("" ::: "memory"); + } +} + +extern void abort (); + +static void __attribute__((noinline)) +init (int i) +{ + for (int j = 0; j < M; j++) + { + a[i][j] = i; + b[i][j] = j; + c[i][j] = 0; + d[i][j] = 0; + } +} + +static int __attribute__((noinline)) +check (int i) +{ + for (int j = 0; j < M; j++) + if (c[i][j] != d[i][j]) + return 0; + + return 1; +} + +int main (void) +{ + for (int i = 0; i < M; ++i) + init (i); + + matrix_mul_1 (M); + matrix_mul_2 (M); + + for (int i = 0; i < M; ++i) + if (!check (i)) + abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "Loop_pair is interchanged" 1 "linterchange" } } */ +/* { dg-final { scan-tree-dump-times "Loop_pair is not interchanged" 1 "linterchange" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-6.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-6.c new file mode 100644 index 0000000..97555ed --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-6.c @@ -0,0 +1,70 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -ftree-loop-interchange -fdump-tree-linterchange-details" } */ + +#define M 256 +int a[M][M], b[M][M], c[M][M], d[M][M]; +void __attribute__((noinline)) +matrix_mul_1 (int n) +{ + for (int j = 0; j < n; j++) + for (int k = 0; k < n; k++) + for (int i = 0; i < n; i++) + c[i][j] = c[i][j] + a[i][k]*b[k][j]; +} + +void __attribute__((noinline)) +matrix_mul_2 (int n) +{ + for (int i = 0; i < n; i++) + { + for (int j = 0; j < n; j++) + { + for (int k = 0; k < n; k++) + d[i][j] = d[i][j] + a[i][k]*b[k][j]; + + asm volatile ("" ::: "memory"); + } + asm volatile ("" ::: "memory"); + } +} + +extern void abort (); + +static void __attribute__((noinline)) +init (int i) +{ + for (int j = 0; j < M; j++) + { + a[i][j] = i; + b[i][j] = j; + c[i][j] = 0; + d[i][j] = 0; + } +} + +static int __attribute__((noinline)) +check (int i) +{ + for (int j = 0; j < M; j++) + if (c[i][j] != d[i][j]) + return 0; + + return 1; +} + +int main (void) +{ + for (int i = 0; i < M; ++i) + init (i); + + matrix_mul_1 (M); + matrix_mul_2 (M); + + for (int i = 0; i < M; ++i) + if (!check (i)) + abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "Loop_pair is interchanged" 2 "linterchange" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-7.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-7.c new file mode 100644 index 0000000..b93ca78 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-7.c @@ -0,0 +1,70 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -ftree-loop-interchange -fdump-tree-linterchange-details" } */ + +#define M 256 +int a[M][M], b[M][M], c[M][M], d[M][M]; +void __attribute__((noinline)) +matrix_mul_1 (int n) +{ + for (int k = 0; k < n; k++) + for (int j = 0; j < n; j++) + for (int i = 0; i < n; i++) + c[i][j] = c[i][j] + a[i][k]*b[k][j]; +} + +void __attribute__((noinline)) +matrix_mul_2 (int n) +{ + for (int i = 0; i < n; i++) + { + for (int j = 0; j < n; j++) + { + for (int k = 0; k < n; k++) + d[i][j] = d[i][j] + a[i][k]*b[k][j]; + + asm volatile ("" ::: "memory"); + } + asm volatile ("" ::: "memory"); + } +} + +extern void abort (); + +static void __attribute__((noinline)) +init (int i) +{ + for (int j = 0; j < M; j++) + { + a[i][j] = i; + b[i][j] = j; + c[i][j] = 0; + d[i][j] = 0; + } +} + +static int __attribute__((noinline)) +check (int i) +{ + for (int j = 0; j < M; j++) + if (c[i][j] != d[i][j]) + return 0; + + return 1; +} + +int main (void) +{ + for (int i = 0; i < M; ++i) + init (i); + + matrix_mul_1 (M); + matrix_mul_2 (M); + + for (int i = 0; i < M; ++i) + if (!check (i)) + abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "Loop_pair is interchanged" 2 "linterchange" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-8.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-8.c new file mode 100644 index 0000000..29f5917 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-8.c @@ -0,0 +1,70 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -ftree-loop-interchange -fdump-tree-linterchange-details" } */ + +#define M 256 +int a[M][M], b[M][M], c[M][M], d[M][M]; +void __attribute__((noinline)) +matrix_mul_1 (int n) +{ + for (int i = 0; i < n; i++) + for (int k = 0; k < n; k++) + for (int j = 0; j < n; j++) + c[i][j] = c[i][j] + a[i][k]*b[k][j]; +} + +void __attribute__((noinline)) +matrix_mul_2 (int n) +{ + for (int i = 0; i < n; i++) + { + for (int j = 0; j < n; j++) + { + for (int k = 0; k < n; k++) + d[i][j] = d[i][j] + a[i][k]*b[k][j]; + + asm volatile ("" ::: "memory"); + } + asm volatile ("" ::: "memory"); + } +} + +extern void abort (); + +static void __attribute__((noinline)) +init (int i) +{ + for (int j = 0; j < M; j++) + { + a[i][j] = i; + b[i][j] = j; + c[i][j] = 0; + d[i][j] = 0; + } +} + +static int __attribute__((noinline)) +check (int i) +{ + for (int j = 0; j < M; j++) + if (c[i][j] != d[i][j]) + return 0; + + return 1; +} + +int main (void) +{ + for (int i = 0; i < M; ++i) + init (i); + + matrix_mul_1 (M); + matrix_mul_2 (M); + + for (int i = 0; i < M; ++i) + if (!check (i)) + abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-not "Loop_pair is interchanged" "linterchange" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-9.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-9.c new file mode 100644 index 0000000..d6a3f5c --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-interchange-9.c @@ -0,0 +1,62 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -ftree-loop-interchange -fdump-tree-linterchange-details" } */ + +#define M 256 +int a[M][M], b[M][M], c[M], d[M]; +void __attribute__((noinline)) +simple_reduc_1 (int n) +{ + for (int j = 0; j < n; j++) + { + int sum = c[j]; + for (int i = 0; i < n; i++) + sum = sum + a[i][j]*b[i][j]; + + c[j] = sum; + } +} + +void __attribute__((noinline)) +simple_reduc_2 (int n) +{ + for (int j = 0; j < n; j++) + { + int sum = d[j]; + for (int i = 0; i < n; i++) + sum = sum + a[i][j]*b[i][j]; + + asm volatile ("" ::: "memory"); + d[j] = sum; + } +} + +extern void abort (); + +static void __attribute__((noinline)) +init (int i) +{ + c[i] = 0; + d[i] = 0; + for (int j = 0; j < M; j++) + { + a[i][j] = i; + b[i][j] = j; + } +} + +int main (void) +{ + for (int i = 0; i < M; ++i) + init (i); + + simple_reduc_1 (M); + simple_reduc_2 (M); + + for (int i = 0; i < M; ++i) + if (c[i] != d[i]) + abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "Loop_pair is interchanged" 1 "linterchange" } } */ diff --git a/gcc/timevar.def b/gcc/timevar.def index 8cec6af..730a1dc 100644 --- a/gcc/timevar.def +++ b/gcc/timevar.def @@ -184,6 +184,7 @@ DEFTIMEVAR (TV_TREE_LOOP , "tree loop optimization") DEFTIMEVAR (TV_TREE_NOLOOP , "loopless fn") DEFTIMEVAR (TV_TREE_LOOP_BOUNDS , "tree loop bounds") DEFTIMEVAR (TV_LIM , "tree loop invariant motion") +DEFTIMEVAR (TV_LINTERCHANGE , "tree loop interchange") DEFTIMEVAR (TV_TREE_LOOP_IVCANON , "tree canonical iv") DEFTIMEVAR (TV_SCEV_CONST , "scev constant prop") DEFTIMEVAR (TV_TREE_LOOP_UNSWITCH , "tree loop unswitching") diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index 9f76d82..fe47736 100644 --- a/gcc/tree-pass.h +++ b/gcc/tree-pass.h @@ -367,6 +367,7 @@ extern gimple_opt_pass *make_pass_tree_loop (gcc::context *ctxt); extern gimple_opt_pass *make_pass_tree_no_loop (gcc::context *ctxt); extern gimple_opt_pass *make_pass_tree_loop_init (gcc::context *ctxt); extern gimple_opt_pass *make_pass_lim (gcc::context *ctxt); +extern gimple_opt_pass *make_pass_linterchange (gcc::context *ctxt); extern gimple_opt_pass *make_pass_tree_unswitch (gcc::context *ctxt); extern gimple_opt_pass *make_pass_loop_split (gcc::context *ctxt); extern gimple_opt_pass *make_pass_predcom (gcc::context *ctxt); diff --git a/gcc/tree-ssa-loop-interchange.cc b/gcc/tree-ssa-loop-interchange.cc new file mode 100644 index 0000000..c835d02 --- /dev/null +++ b/gcc/tree-ssa-loop-interchange.cc @@ -0,0 +1,2015 @@ +/* Loop invariant motion. + Copyright (C) 2017 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 3, or (at your option) any +later version. + +GCC is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "tree.h" +#include "gimple.h" +#include "tree-pass.h" +#include "ssa.h" +#include "gimple-pretty-print.h" +#include "fold-const.h" +#include "gimplify.h" +#include "gimple-iterator.h" +#include "cfgloop.h" +#include "params.h" +#include "tree-scalar-evolution.h" +#include "tree-ssa-loop-manip.h" +#include "tree-ssa-loop-niter.h" +#include "tree-ssa-loop-ivopts.h" +#include "tree-data-ref.h" + +/* This pass performs loop interchange: for example, the loop nest + + for (int j = 0; j < N; j++) + for (int k = 0; k < N; k++) + for (int i = 0; i < N; i++) + c[i][j] = c[i][j] + a[i][k]*b[k][j]; + + is transformed to + + for (int i = 0; i < N; i++) + for (int j = 0; j < N; j++) + for (int k = 0; k < N; k++) + c[i][j] = c[i][j] + a[i][k]*b[k][j]; + + This pass implements loop interchange in the following steps: + + 1) Find perfect loop nest for each innermost loop and compute data + dependence relations for it. For above example, loop nest is + . + 2) From innermost to outermost loop, this pass tries to interchange + each loop pair. For above case, it firstly tries to interchange + and loop nest becomes . + Then it tries to interchange and loop nest becomes + . The overall effect is to move innermost + loop to the outermost position. For loop pair + to be interchanged, we: + 3) Check if data dependence relations are valid for loop interchange. + 4) Check if both loops can be interchanged in terms of transformation. + 5) Check if interchanging the two loops is profitable. + 6) Interchange the two loops by mapping induction variables. + + This pass also handles reductions in loop nest. So far we only support + simple reduction of inner loop and double reduction of the loop nest. */ + +/* Maximum number of stmts in each loop that should be interchanged. */ +#define MAX_NUM_STMT (PARAM_VALUE (PARAM_LOOP_INTERCHANGE_MAX_NUM_STMTS)) + +/* Default size for each array dimension. */ +#define AVG_DIM_SIZE (PARAM_VALUE (PARAM_AVG_LOOP_NITER)) + +/* Comparison ratio of access stride between inner/outer loops to be + interchanged. This is the minimum stride ratio for loop interchange + to be profitable. */ +#define OUTER_STRIDE_RATIO (PARAM_VALUE (PARAM_LOOP_INTERCHANGE_STRIDE_RATIO)) +/* The same as above, but we require higher ratio for interchanging the + innermost two loops. */ +#define INNER_STRIDE_RATIO ((OUTER_STRIDE_RATIO) + 1) + +/* Structure recording loop induction variable. */ +typedef struct induction +{ + /* IV itself. */ + tree var; + /* IV's base and step part of SCEV. */ + tree base; + tree step; + /* Mapped IV variabled used for interchanging loops. */ + tree mapped_var; +}*induction_p; + +/* Enum type for loop reduction variable. */ +enum reduction_type +{ + UNKNOWN_RTYPE = 0, + SIMPLE_RTYPE, + DOUBLE_RTYPE +}; + +/* Structure recording loop reduction variable. */ +typedef struct reduction +{ + /* Reduction itself. */ + tree var; + /* PHI node defining reduction variable. */ + gphi *phi; + /* Init and next variables of the reduction. */ + tree init; + tree next; + /* Lcssa PHI node if reduction is used outside of its definition loop. */ + gphi *lcssa_phi; + /* Single use of reduction variable. This is generally but not necessarily + the stmt defining next variable of reduction. */ + gimple *single_use; + /* Stmts defining init and next. */ + gimple *producer; + gimple *consumer; + /* If init is loaded from memory, this is the loading memory reference. */ + tree init_ref; + /* If reduction is finally stored to memory, this is the stored memory + reference. */ + tree fini_ref; + enum reduction_type type; +}*reduction_p; + + +/* Dump reduction RE. */ + +static void +dump_reduction (reduction_p re) +{ + if (re->type == SIMPLE_RTYPE) + fprintf (dump_file, " Simple reduction: "); + else if (re->type == DOUBLE_RTYPE) + fprintf (dump_file, " Double reduction: "); + else + fprintf (dump_file, " Unknown reduction: "); + + print_gimple_stmt (dump_file, re->phi, 0); +} + +/* Dump LOOP's induction IV. */ +static void +dump_induction (struct loop *loop, induction_p iv) +{ + fprintf (dump_file, " Induction: "); + print_generic_expr (dump_file, iv->var, TDF_SLIM); + fprintf (dump_file, " = {"); + print_generic_expr (dump_file, iv->base, TDF_SLIM); + fprintf (dump_file, ", "); + print_generic_expr (dump_file, iv->step, TDF_SLIM); + fprintf (dump_file, "}_%d\n", loop->num); +} + +/* Loop candidate for interchange. */ + +class loop_cand +{ +public: + loop_cand (struct loop *, struct loop *); + ~loop_cand (); + + reduction_p find_reduction_by_init (tree); + reduction_p find_reduction_by_stmt (gimple *); + void classify_simple_reduction (reduction_p); + bool analyze_iloop_reduction_var (tree); + bool analyze_oloop_reduction_var (loop_cand *, tree); + bool analyze_reduction_var (loop_cand *, tree); + bool analyze_induction_var (tree, tree); + bool analyze_carried_vars (loop_cand *); + bool analyze_lcssa_phis (void); + bool can_interchange_p (loop_cand *); + bool unsupported_operation (basic_block, loop_cand *); + void undo_simple_reduction (reduction_p); + void eliminate_dead_code (void); + + friend class tree_loop_interchange; +private: + /* The loop itself. */ + struct loop *loop; + /* The outer loop of loop nest for interchange. */ + struct loop *nest; + /* Vector of induction variables in loop. */ + vec inductions; + /* Vector of reduction variables in loop. */ + vec reductions; + /* Lcssa PHI nodes of this loop. */ + vec lcssa_nodes; + /* # of iterations of this loop. */ + tree niters; + /* Single exit edge of this loop. */ + edge exit; + /* Basic blocks of this loop. */ + basic_block *bbs; + /* # of stmts of this loop. */ + int num_stmts; +}; + +/* Constructor. */ + +loop_cand::loop_cand (struct loop *loop, struct loop *nest) + : loop (loop), nest (nest), + niters (unshare_expr (number_of_latch_executions (loop))), + exit (single_exit (loop)), bbs (get_loop_body (loop)), num_stmts (0) +{ + inductions.create (3); + reductions.create (3); + lcssa_nodes.create (3); +} + +/* Destructor. */ + +loop_cand::~loop_cand () +{ + induction_p iv; + for (unsigned i = 0; inductions.iterate (i, &iv); ++i) + free (iv); + + reduction_p re; + for (unsigned i = 0; reductions.iterate (i, &re); ++i) + free (iv); + + inductions.release (); + reductions.release (); + lcssa_nodes.release (); + free (bbs); +} + +/* Return single use stmt of VAR in LOOP, otherwise return NULL. */ + +static gimple * +single_use_in_loop (tree var, struct loop *loop) +{ + gimple *stmt, *res = NULL; + use_operand_p use_p; + imm_use_iterator iterator; + + FOR_EACH_IMM_USE_FAST (use_p, iterator, var) + { + stmt = USE_STMT (use_p); + if (is_gimple_debug (stmt)) + continue; + + basic_block bb = gimple_bb (stmt); + gcc_assert (bb != NULL); + if (!flow_bb_inside_loop_p (loop, bb)) + continue; + + if (res) + return NULL; + + res = stmt; + } + return res; +} + +/* Return true if E is unsupported in loop interchange, i.e, E is a complex + edge or part of irreducible loop. */ + +static inline bool +unsupported_edge (edge e) +{ + return (e->flags & (EDGE_COMPLEX | EDGE_IRREDUCIBLE_LOOP)); +} + +/* Return true if PHI is unsupported in loop interchange, i.e, PHI contains + ssa var appearing in any abnormal phi node. */ + +static inline bool +unsupported_phi_node (gphi *phi) +{ + if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (PHI_RESULT (phi))) + return true; + + for (unsigned i = 0; i < gimple_phi_num_args (phi); ++i) + { + tree arg = PHI_ARG_DEF (phi, i); + if (TREE_CODE (arg) == SSA_NAME + && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (arg)) + return true; + } + + return false; +} + +/* Return reduction whose init variable is VAR, otherwise return NULL. */ + +reduction_p +loop_cand::find_reduction_by_init (tree var) +{ + reduction_p re; + + for (unsigned i = 0; reductions.iterate (i, &re); ++i) + if (re->init == var || operand_equal_p (re->init, var, 0)) + return re; + + return NULL; +} + +/* Return the reduction if STMT is one of its lcssa PHI, producer or consumer + stmt. */ + +reduction_p +loop_cand::find_reduction_by_stmt (gimple *stmt) +{ + gphi *phi = NULL; + reduction_p re; + + if (is_a (stmt)) + phi = as_a (stmt); + + for (unsigned i = 0; reductions.iterate (i, &re); ++i) + if ((phi != NULL && phi == re->lcssa_phi) + || (stmt == re->producer || stmt == re->consumer)) + return re;; + + return NULL; +} + +/* Return true if all stmts in BB can be supported by loop interchange, + otherwise return false. ILOOP is not NULL if this loop_cand is the + outer loop in loop nest. */ + +bool +loop_cand::unsupported_operation (basic_block bb, loop_cand *iloop) +{ + int bb_num_stmts = 0; + gphi_iterator psi; + gimple_stmt_iterator gsi; + + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + if (is_gimple_debug (stmt)) + continue; + + if (gimple_has_volatile_ops (stmt) + || gimple_has_side_effects (stmt)) + return false; + + bb_num_stmts++; + if (is_gimple_call (stmt)) + { + int cflags = gimple_call_flags (stmt); + /* Only support const/pure calls. */ + if (!(cflags & (ECF_CONST | ECF_PURE))) + return false; + + /* In basic block of outer loop, the call should be cheap since + it will be moved to inner loop. */ + if (iloop != NULL + && !gimple_inexpensive_call_p (as_a (stmt))) + return false; + + continue; + } + + if (!iloop || !gimple_vuse (stmt)) + continue; + + /* Support stmt accessing memory in outer loop only if it is for inner + loop's reduction. */ + if (iloop->find_reduction_by_stmt (stmt)) + continue; + + tree lhs; + /* Or it's invariant memory reference and only used by inner loop. */ + if (gimple_assign_single_p (stmt) + && (lhs = gimple_assign_lhs (stmt)) != NULL_TREE + && TREE_CODE (lhs) == SSA_NAME + && single_use_in_loop (lhs, iloop->loop)) + continue; + + return false; + } + num_stmts += bb_num_stmts; + + /* Allow PHI nodes in any basic block of inner loop, or PHI nodes in + (outer) loop's header. */ + if (!iloop || bb == loop->header) + return true; + + for (psi = gsi_start_phis (bb); !gsi_end_p (psi); gsi_next (&psi)) + { + gphi *phi = psi.phi (); + + if (unsupported_phi_node (phi)) + return false; + + if (virtual_operand_p (PHI_RESULT (phi))) + continue; + + /* For outer loop, we only support PHI in loop header and lcssa PHI + of inner loop's reduction. */ + if (!iloop->find_reduction_by_stmt (phi)) + return false; + } + return true; +} + +/* Return true if current loop_cand be interchanged. ILOOP is not NULL if + current loop_cand is outer loop in loop nest. */ + +bool +loop_cand::can_interchange_p (loop_cand *iloop) +{ + /* For now we only support at most one reduction. */ + unsigned allowed_reduction_num = 1; + + /* Only support reduction if the loop nest to be interchanged is the + innermostin two loops. */ + if ((iloop == NULL && loop->inner != NULL) + || (iloop != NULL && iloop->loop->inner != NULL)) + allowed_reduction_num = 0; + + if (reductions.length () > allowed_reduction_num + || (reductions.length () == 1 + && reductions[0]->type == UNKNOWN_RTYPE)) + return false; + + /* Only support lcssa PHI node which is for reduction. */ + if (lcssa_nodes.length () > allowed_reduction_num) + return false; + + /* Check basic blocks other than loop header/exit. */ + for (unsigned i = 0; i < loop->num_nodes; i++) + { + basic_block bb = bbs[i]; + + /* Skip basic blocks of inner loops. */ + if (bb->loop_father != loop) + continue; + + /* Check if basic block has any unsupported operation. */ + if (!unsupported_operation (bb, iloop)) + return false; + + /* Check if loop has too many stmts. */ + if (num_stmts > MAX_NUM_STMT) + return false; + } + + return true; +} + +/* Classify if reduction RE is a simple one. */ + +void +loop_cand::classify_simple_reduction (reduction_p re) +{ + gimple *producer, *consumer; + enum tree_code code; + tree lhs, rhs; + + /* Check init variable of reduction and how it is initialized. */ + if (TREE_CODE (re->init) == SSA_NAME) + { + producer = SSA_NAME_DEF_STMT (re->init); + re->producer = producer; + basic_block bb = gimple_bb (producer); + if (!bb || bb->loop_father != nest) + return; + + if (!is_gimple_assign (producer)) + return; + + code = gimple_assign_rhs_code (producer); + if (get_gimple_rhs_class (code) != GIMPLE_SINGLE_RHS) + return; + + if ((lhs = gimple_assign_lhs (producer)) == NULL_TREE + || lhs != re->init) + return; + + if ((rhs = gimple_assign_rhs1 (producer)) == NULL_TREE + || !REFERENCE_CLASS_P (rhs)) + return; + + re->init_ref = rhs; + } + else if (!CONSTANT_CLASS_P (re->init)) + return; + + /* TODO: Don't support constant initializer yet. */ + if (re->init_ref == NULL_TREE) + return; + + /* Check how reduction variable is used. Note usually reduction variable + is used outside of its defining loop, we don't require that in terms + loop interchange. */ + if (re->lcssa_phi == NULL) + consumer = single_use_in_loop (re->next, loop); + else + consumer = single_use_in_loop (PHI_RESULT (re->lcssa_phi), nest); + + if (consumer == NULL) + return; + + re->consumer = consumer; + + if (!is_gimple_assign (consumer)) + return; + + code = gimple_assign_rhs_code (consumer); + if (get_gimple_rhs_class (code) != GIMPLE_SINGLE_RHS) + return; + + if ((rhs = gimple_assign_rhs1 (consumer)) == NULL_TREE + || rhs != PHI_RESULT (re->lcssa_phi)) + return; + + if ((lhs = gimple_assign_lhs (consumer)) == NULL_TREE + || !REFERENCE_CLASS_P (lhs)) + return; + + re->fini_ref = lhs; + + /* Require memory references in producer and consumer are the same so + that we can undo reduction during interchange. */ + if (re->init_ref && !operand_equal_p (re->init_ref, re->fini_ref, 0)) + return; + + re->type = SIMPLE_RTYPE; +} + +/* Analyze reduction variable VAR. ILOOP is not NULL if current loop_cand + is outer loop in loop nest. Return true if analysis succeeds. */ + +bool +loop_cand::analyze_reduction_var (loop_cand *iloop, tree var) +{ + if (iloop != NULL) + return analyze_oloop_reduction_var (iloop, var); + else + return analyze_iloop_reduction_var (var); +} + +/* Analyze reduction variable VAR for inner loop of the loop nest to be + interchanged. Return true if analysis succeeds. */ + +bool +loop_cand::analyze_iloop_reduction_var (tree var) +{ + gphi *phi = as_a (SSA_NAME_DEF_STMT (var)); + gphi *lcssa_phi = NULL, *use_phi; + tree init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop)); + tree next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop)); + edge e = single_exit (loop); + reduction_p re; + gimple *stmt, *next_def, *single_use = NULL; + use_operand_p use_p; + imm_use_iterator iterator; + basic_block bb; + + if (TREE_CODE (next) != SSA_NAME) + return false; + + next_def = SSA_NAME_DEF_STMT (next); + bb = gimple_bb (next_def); + if (!bb || !flow_bb_inside_loop_p (loop, bb)) + return false; + + /* In restricted reduction, the var is (and must be) used in defining + the updated var. The process can be depicted as below: + + var ;; = PHI + | + | + v + +---------------------+ + | reduction operators | <-- other operands + +---------------------+ + | + | + v + next + + In terms loop interchange, we don't change how NEXT is computed based + on VAR and OTHER OPERANDS. In case of double reduction in loop nest + to be interchanged, we don't changed it at all. In the case of simple + reduction in inner loop, we only make change how VAR/NEXT is loaded or + stored. With these conditions, we can relax restrictions on reduction + in a way that reduction operation is seen as black box. In general, + we can ignore reassociation of reduction operator; we can handle fake + reductions in which VAR is not even used to compute NEXT. */ + FOR_EACH_IMM_USE_FAST (use_p, iterator, var) + { + stmt = USE_STMT (use_p); + if (is_gimple_debug (stmt)) + continue; + + bb = gimple_bb (stmt); + if (!bb || !flow_bb_inside_loop_p (loop, bb)) + return false; + + if (single_use != NULL) + return false; + + single_use = stmt; + } + + if (single_use != next_def + && !stmt_dominates_stmt_p (single_use, next_def)) + return false; + + FOR_EACH_IMM_USE_FAST (use_p, iterator, next) + { + stmt = USE_STMT (use_p); + if (is_gimple_debug (stmt)) + continue; + + bb = gimple_bb (stmt); + if (!bb) + return false; + + /* Or else it's used in PHI itself. */ + use_phi = NULL; + if (is_a (stmt) + && (use_phi = as_a (stmt)) != NULL + && use_phi == phi) + continue; + + if (use_phi != NULL + && lcssa_phi == NULL + && bb == e->dest + && PHI_ARG_DEF_FROM_EDGE (use_phi, e) == next) + lcssa_phi = use_phi; + else + return false; + } + re = XCNEW (struct reduction); + re->var = var; + re->init = init; + re->next = next; + re->phi = phi; + re->lcssa_phi = lcssa_phi; + re->single_use = single_use; + + classify_simple_reduction (re); + + if (dump_file && (dump_flags & TDF_DETAILS)) + dump_reduction (re); + + reductions.safe_push (re); + return true; +} + +/* Analyze reduction variable VAR for outer loop of the loop nest to be + interchanged. ILOOP is not NULL and points to inner loop. For the + moment, we only support double reduction for outer loop, like: + + for (int i = 0; i < n; i++) + { + int sum = 0; + + for (int j = 0; j < n; j++) // outer loop + for (int k = 0; k < n; k++) // inner loop + sum += a[i][k]*b[k][j]; + + s[i] = sum; + } + + Note the innermost two loops are the loop nest to be interchanged. + Return true if analysis succeeds. */ + +bool +loop_cand::analyze_oloop_reduction_var (loop_cand *iloop, tree var) +{ + gphi *phi = as_a (SSA_NAME_DEF_STMT (var)); + gphi *lcssa_phi = NULL, *use_phi; + tree init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop)); + tree next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop)); + edge e = single_exit (loop); + reduction_p re; + gimple *stmt, *next_def; + use_operand_p use_p; + imm_use_iterator iterator; + basic_block bb; + + if (TREE_CODE (next) != SSA_NAME) + return false; + + next_def = SSA_NAME_DEF_STMT (next); + bb = gimple_bb (next_def); + if (!bb || !flow_bb_inside_loop_p (loop, bb)) + return false; + + /* Find inner loop's simple reduction that uses var as initializer. */ + reduction_p inner_re = iloop->find_reduction_by_init (var); + if (inner_re == NULL + || inner_re->type != UNKNOWN_RTYPE + || inner_re->producer != phi) + return false; + + /* In case of double reduction, outer loop's reduction should be updated + by inner loop's simple reduction. */ + if (next_def != inner_re->lcssa_phi) + return false; + + /* Outer loop's reduction should only be used to initialize inner loop's + simple reduction. */ + FOR_EACH_IMM_USE_FAST (use_p, iterator, var) + { + stmt = USE_STMT (use_p); + if (is_gimple_debug (stmt)) + continue; + + bb = gimple_bb (stmt); + if (!bb || !flow_bb_inside_loop_p (loop, bb)) + return false; + + if (! is_a (stmt) + || (use_phi = as_a (stmt)) == NULL + || use_phi != inner_re->phi) + return false; + } + + /* Check this reduction is correctly used outside of loop via lcssa phi. */ + FOR_EACH_IMM_USE_FAST (use_p, iterator, next) + { + stmt = USE_STMT (use_p); + if (is_gimple_debug (stmt)) + continue; + + bb = gimple_bb (stmt); + if (!bb) + return false; + + /* Or else it's used in PHI itself. */ + use_phi = NULL; + if (is_a (stmt) + && (use_phi = as_a (stmt)) != NULL + && use_phi == phi) + continue; + + if (lcssa_phi == NULL + && use_phi != NULL + && bb == e->dest + && PHI_ARG_DEF_FROM_EDGE (use_phi, e) == next) + lcssa_phi = use_phi; + else + return false; + } + + re = XCNEW (struct reduction); + re->var = var; + re->init = init; + re->next = next; + re->phi = phi; + re->lcssa_phi = lcssa_phi; + re->type = DOUBLE_RTYPE; + inner_re->type = DOUBLE_RTYPE; + + if (dump_file && (dump_flags & TDF_DETAILS)) + dump_reduction (re); + + reductions.safe_push (re); + return true; +} + +/* Return true if VAR is induction variable of current loop whose scev is + specified by CHREC. */ + +bool +loop_cand::analyze_induction_var (tree var, tree chrec) +{ + /* Var is loop invariant, though it's unlikely to happen. */ + if (tree_does_not_contain_chrecs (chrec)) + { + struct induction *iv = XCNEW (struct induction); + iv->var = var; + iv->base = chrec; + iv->step = build_int_cst (TREE_TYPE (chrec), 0); + inductions.safe_push (iv); + return true; + } + + if (TREE_CODE (chrec) != POLYNOMIAL_CHREC + || CHREC_VARIABLE (chrec) != (unsigned) loop->num + || tree_contains_chrecs (CHREC_LEFT (chrec), NULL) + || tree_contains_chrecs (CHREC_RIGHT (chrec), NULL)) + return false; + + struct induction *iv = XCNEW (struct induction); + iv->var = var; + iv->base = CHREC_LEFT (chrec); + iv->step = CHREC_RIGHT (chrec); + + if (dump_file && (dump_flags & TDF_DETAILS)) + dump_induction (loop, iv); + + inductions.safe_push (iv); + return true; +} + +/* Return true if all loop carried variables defined in loop header can + be successfully analyzed. */ + +bool +loop_cand::analyze_carried_vars (loop_cand *iloop) +{ + basic_block before_loop = block_before_loop (nest); + gphi_iterator gsi; + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "\nLoop(%d) carried vars:\n", loop->num); + + for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi)) + { + gphi *phi = gsi.phi (); + + if (unsupported_phi_node (phi)) + return false; + + tree var = PHI_RESULT (phi); + if (virtual_operand_p (var)) + continue; + + tree chrec = analyze_scalar_evolution (loop, var); + chrec = instantiate_scev (before_loop, loop, chrec); + + /* Analyze var as reduction variable. */ + if (chrec_contains_undetermined (chrec) + || chrec_contains_symbols_defined_in_loop (chrec, nest->num)) + { + if (!analyze_reduction_var (iloop, var)) + return false; + } + /* Analyze var as induction variable. */ + else if (!analyze_induction_var (var, chrec)) + return false; + } + + return true; +} + +/* Return TRUE if loop closed PHI nodes can be analyzed successfully. */ + +bool +loop_cand::analyze_lcssa_phis (void) +{ + edge e = single_exit (loop); + gphi_iterator gsi; + + for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi)) + { + gphi *phi = gsi.phi (); + + if (unsupported_phi_node (phi)) + return false; + + if (virtual_operand_p (PHI_RESULT (phi))) + continue; + + /* TODO: We only support lcssa phi for reduction for now. */ + if (!find_reduction_by_stmt (phi)) + return false; + } + + return true; +} + +/* Given inner loop with simple reduction as below: + + for (i = 0; i < N; i++) + for (j = 0; j < N; j++) + { + int red = c[i][j]; // producer + for (k = 0; k < N; k++) + red += a[i][k] * b[k][j]; + + c[i][j] = red; // consumer + } + + This function undo the reduction and generates below loop nest: + + for (i = 0; i < N; i++) + for (j = 0; j < N; j++) + { + for (k = 0; k < N; k++) + c[i][j] += a[i][k] * b[k][j]; + } + + This basically reverts transformation done by LIM or PRE. */ + +void +loop_cand::undo_simple_reduction (reduction_p re) +{ + gimple *phi = SSA_NAME_DEF_STMT (re->var); + gimple_stmt_iterator gsi, from; + + /* Move producer stmt into inner loop, just before its use. */ + if (gimple_vuse (re->producer)) + gimple_set_vuse (re->producer, NULL_TREE); + gimple_assign_set_lhs (re->producer, re->var); + from = gsi_for_stmt (re->producer); + gsi = gsi_for_stmt (re->single_use); + gsi_move_before (&from, &gsi); + + /* Delete loop header PHI node of reduction. */ + gsi = gsi_for_stmt (phi); + gsi_remove (&gsi, true); + + /* Move consumer stmt into inner loop, just after its def. */ + if (gimple_vdef (re->consumer)) + gimple_set_vuse (re->consumer, NULL_TREE); + if (gimple_vuse (re->consumer)) + gimple_set_vuse (re->consumer, NULL_TREE); + gimple_assign_set_rhs1 (re->consumer, re->next); + from = gsi_for_stmt (re->consumer); + gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (re->next)); + gsi_move_after (&from, &gsi); + + /* Delete loop closed PHI node of reduction. */ + gsi = gsi_for_stmt (re->lcssa_phi); + gsi_remove (&gsi, true); +} + +/* Eliminate dead code after loop interchange. */ + +void +loop_cand::eliminate_dead_code (void) +{ + /* Check basic blocks other than loop header/exit. */ + for (unsigned i = 0; i < loop->num_nodes; i++) + { + basic_block bb = bbs[i]; + + /* Skip basic blocks of inner loops. */ + if (bb->loop_father != loop) + continue; + + for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);) + { + tree lhs; + gimple *stmt = gsi_stmt (gsi); + + /* Given copy propagation is done during interchange, we can + simply check zero uses of var and eliminate it. */ + if (is_gimple_assign (stmt) + && !gimple_vuse (stmt) + && !gimple_has_volatile_ops (stmt) + && !gimple_has_side_effects (stmt) + && (lhs = gimple_assign_lhs (stmt)) != NULL_TREE + && TREE_CODE (lhs) == SSA_NAME + && has_zero_uses (lhs)) + gsi_remove (&gsi, true); + else + gsi_next (&gsi); + } + } +} + +/* Class for loop interchange transformation. */ + +class tree_loop_interchange +{ +public: + tree_loop_interchange (vec loop_nest, + vec datarefs, vec ddrs) + : loop_nest (loop_nest), datarefs (datarefs), + ddrs (ddrs), niters_iv_var (NULL_TREE), unshare_datarefs_p (true) { } + ~tree_loop_interchange () { + free_dependence_relations (ddrs); + free_data_refs (datarefs); + loop_nest.release (); + } + bool interchange (); + +private: + void update_data_refs (loop_cand &, loop_cand &); + void update_data_deps (unsigned, unsigned); + bool valid_data_dependences (unsigned, unsigned); + bool can_interchange_loops (loop_cand &, loop_cand &); + void interchange_loops (loop_cand &, loop_cand &); + void interchange_reductions (loop_cand &, loop_cand &); + void interchange_inductions (loop_cand &, loop_cand &); + void map_inductions_to_loop (loop_cand &, loop_cand &); + void move_code_to_inner_loop (struct loop *, struct loop *, basic_block *); + + /* Vector of the loop nest. */ + vec loop_nest; + /* Vector of data references in loop nest. */ + vec datarefs; + /* Vector of data dependence relations in loop nest. */ + vec ddrs; + + /* We create new IV which is only used in loop's exit condition check. + In case of 3-level loop nest interchange, when we interchange the + innermost two loops, new IV created in the middle level loop does + not need to be preserved in interchanging the outermost two loops + later. We record the IV so that it can be skipped. */ + tree niters_iv_var; + /* Due to current implementation of data dependence analysis, access + functions are shared by all data dependence relations. After loop + interchange, we need to update data reference/dependence according + to interchanged loops. During updating, this flag will be checked + and DR_ACCESS_FNs will be unshared if it's true. */ + bool unshare_datarefs_p; +}; + +/* Given ILOOP and OLOOP representing two loops in loop nest to be + interchanged, this function decomposes access function ACCESS_FN. + This function does below things: + + 1) Look into ACCESS_FN and record the corresponding polynomial + chrec of ILOOP/OLOOP in ILOOP_CHREC/OLOOP_CHREC. + 2) Record access STRIDE in ILOOP_STRIDE or OLOOP_STRIDE if they + are not NULL. + 4) Record index of dimension DIM in iloop_dim or oloop_dim if + they are not NULL. + + For example, given below ACCESS_FN: + + {{base, step1}_oloop, step2}_iloop + + This function decomposes it into: + + ILOOP_CHREC: {{base, step1}_oloop, step2}_iloop + OLOOP_CHREC: {base, step1}_oloop + + and sets other arguments properly. */ + +static void +decompose_chrecs (unsigned iloop_num, unsigned oloop_num, + tree access_fn, unsigned dim, tree stride, + tree *iloop_chrec, tree *iloop_stride, unsigned *iloop_dim, + tree *oloop_chrec, tree *oloop_stride, unsigned *oloop_dim) +{ + do { + unsigned var = CHREC_VARIABLE (access_fn); + if (var == iloop_num) + { + gcc_assert (*iloop_chrec == NULL_TREE); + *iloop_chrec = access_fn; + if (iloop_stride) + *iloop_stride = stride; + if (iloop_dim) + *iloop_dim = dim; + } + if (var == oloop_num) + { + gcc_assert (*oloop_chrec == NULL_TREE); + *oloop_chrec = access_fn; + if (oloop_stride) + *oloop_stride = stride; + if (oloop_dim) + *oloop_dim = dim; + } + access_fn = CHREC_LEFT (access_fn); + } while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC + && (*iloop_chrec == NULL_TREE || *oloop_chrec == NULL_TREE)); +} + +/* Update data refs to keep them valid after interchanging ILOOP/OLOOP. */ + +void +tree_loop_interchange::update_data_refs (loop_cand &iloop, loop_cand &oloop) +{ + struct data_reference *dr; + for (unsigned i = 0; datarefs.iterate (i, &dr); ++i) + { + unsigned iloop_dim = DR_NUM_DIMENSIONS (dr); + unsigned oloop_dim = DR_NUM_DIMENSIONS (dr); + tree iloop_chrec = NULL_TREE, oloop_chrec = NULL_TREE; + + for (unsigned j = 0; j < DR_NUM_DIMENSIONS (dr); ++j) + { + if (TREE_CODE (DR_ACCESS_FN (dr, j)) != POLYNOMIAL_CHREC) + continue; + + /* Unshare access functions for the first time. */ + if (unshare_datarefs_p) + DR_ACCESS_FN (dr, j) = unshare_expr (DR_ACCESS_FN (dr, j)); + + /* Find the corresponding CHRECs for ILOOP and OLOOP. */ + tree access_fn = DR_ACCESS_FN (dr, j); + decompose_chrecs (iloop.loop->num, oloop.loop->num, + access_fn, j, NULL, + &iloop_chrec, NULL, &iloop_dim, + &oloop_chrec, NULL, &oloop_dim); + } + if (iloop_chrec && oloop_chrec) + { + if (iloop_dim != oloop_dim) + { + /* For data reference with independent CHRECs for both loops, + swap the loop information. For example, + (data_ref ... + {base1, step1}_iloop + {base2, step2}_oloop) + is transformed into: + (data_ref ... + {base1, step1}_oloop + {base2, step2}_iloop). */ + std::swap (CHREC_VAR (iloop_chrec), CHREC_VAR (oloop_chrec)); + } + else + { + /* For data reference with multivariate CHREC for the two loops, + swap step part of CHREC. For example, + (data_ref ... + {{base1, step1}_oloop, step2}_iloop) + is transformed into: + (data_ref ... + {{base1, step2}_oloop, step1}_iloop). */ + std::swap (CHREC_RIGHT (iloop_chrec), CHREC_RIGHT (oloop_chrec)); + } + } + /* For data reference without CHREC for either one of ILOOP/OLOOP, set + the loop information to the other loop. This works because we only + interchange consecutive loops in loop nest. */ + else if (iloop_chrec) + { + tree type = TREE_TYPE (CHREC_VAR (iloop_chrec)); + CHREC_VAR (iloop_chrec) = build_int_cst (type, oloop.loop->num); + } + else if (oloop_chrec) + { + tree type = TREE_TYPE (CHREC_VAR (oloop_chrec)); + CHREC_VAR (oloop_chrec) = build_int_cst (type, iloop.loop->num); + } + } + unshare_datarefs_p = false; +} + +/* Update data dependence relations after interchanging loops. INNER/OUTER + gives index of interchanged loops in loop nest, they are used to access + DIR_MATRIX. */ + +void +tree_loop_interchange::update_data_deps (unsigned inner, unsigned outer) +{ + struct data_dependence_relation *ddr; + + for (unsigned i = 0; ddrs.iterate (i, &ddr); ++i) + { + /* Skip no-dependence case. */ + if (DDR_ARE_DEPENDENT (ddr) == chrec_known) + continue; + + for (unsigned j = 0; j < DDR_NUM_DIR_VECTS (ddr); ++j) + { + lambda_vector dir_vect = DDR_DIR_VECT (ddr, j); + std::swap (dir_vect[inner], dir_vect[outer]); + } + } +} + +/* Check data dependence relations, return TRUE if it's valid to interchange + two loops specified by INNER/OUTER. Theoretically, interchanging the two + loops is valid only if direct vector, after interchanging, doesn't have + '>' as the leftmost non-'=' direction. Practically, this function have + been conservative here by not checking some valid cases. */ + +bool +tree_loop_interchange::valid_data_dependences (unsigned inner, unsigned outer) +{ + struct data_dependence_relation *ddr; + + for (unsigned i = 0; ddrs.iterate (i, &ddr); ++i) + { + /* Skip no-dependence case. */ + if (DDR_ARE_DEPENDENT (ddr) == chrec_known) + continue; + + for (unsigned j = 0; j < DDR_NUM_DIR_VECTS (ddr); ++j) + { + lambda_vector dist_vect = DDR_DIST_VECT (ddr, j); + unsigned level = dependence_level (dist_vect, loop_nest.length ()); + + /* If there is no carried dependence. */ + if (level == 0) + continue; + + level --; + /* Skip case which has '>' as the leftmost direction. */ + if (!lambda_vector_lexico_pos (dist_vect, level)) + return false; + + /* If dependence is carried by outer loop of the two loops for + interchange. */ + if (level < outer) + continue; + + lambda_vector dir_vect = DDR_DIR_VECT (ddr, j); + /* If directions at both inner/outer levels are the same. */ + if (dir_vect[inner] == dir_vect[outer]) + continue; + + /* Be conservative, skip case if either direction at inner/outer + levels is not '=' or '<'. */ + if (dir_vect[inner] != dir_equal + && dir_vect[inner] != dir_positive + && dir_vect[inner] != dir_independent + && dir_vect[inner] != dir_positive_or_equal) + return false; + + if (dir_vect[outer] != dir_equal + && dir_vect[outer] != dir_positive + && dir_vect[outer] != dir_independent + && dir_vect[outer] != dir_positive_or_equal) + return false; + } + } + + return true; +} + +/* Return true if ILOOP and OLOOP can be interchanged in terms of code + transformation. */ + +bool +tree_loop_interchange::can_interchange_loops (loop_cand &iloop, + loop_cand &oloop) +{ + return (iloop.analyze_carried_vars (NULL) + && iloop.analyze_lcssa_phis () + && oloop.analyze_carried_vars (&iloop) + && oloop.analyze_lcssa_phis () + && iloop.can_interchange_p (NULL) + && oloop.can_interchange_p (&iloop)); +} + +/* Compute and return overall access stride given CHREC and step STRIDE. */ + +static inline unsigned HOST_WIDE_INT +access_stride (tree chrec, tree stride) +{ + unsigned HOST_WIDE_INT uhwi_stride, uhwi_step; + + if (tree_fits_uhwi_p (CHREC_RIGHT (chrec))) + uhwi_step = tree_to_uhwi (CHREC_RIGHT (chrec)); + else + uhwi_step = AVG_DIM_SIZE; + + gcc_assert (tree_fits_uhwi_p (stride)); + uhwi_stride = tree_to_uhwi (stride); + + return uhwi_step * uhwi_stride; +} + +/* Given LOOP, strip off its inner loop's chrec from ACCESS_FN and return + true for stripped case. */ + +static bool +strip_sub_loop_access_fn (struct loop *loop, tree *access_fn) +{ + bool sub_loop_p = false; + do { + struct loop *sub_loop = get_loop (cfun, CHREC_VARIABLE (*access_fn)); + + if (!flow_loop_nested_p (loop, sub_loop)) + break; + + sub_loop_p = true; + *access_fn = CHREC_LEFT (*access_fn); + } while (TREE_CODE (*access_fn) == POLYNOMIAL_CHREC); + + return sub_loop_p; +} + +/* Interchange niters info of ILOOP and OLOOP while reset any other niters + estimates information for now. */ + +static inline void +interchange_nb_iterations (struct loop *iloop, struct loop *oloop) +{ + tree nb_iterations = oloop->nb_iterations; + + oloop->any_upper_bound = false; + oloop->any_likely_upper_bound = false; + free_numbers_of_iterations_estimates (oloop); + + oloop->nb_iterations = iloop->nb_iterations; + + iloop->any_upper_bound = false; + iloop->any_likely_upper_bound = false; + free_numbers_of_iterations_estimates (iloop); + + iloop->nb_iterations = nb_iterations; +} + +/* Interchange two loops specified by ILOOP and OLOOP. */ + +void +tree_loop_interchange::interchange_loops (loop_cand &iloop, loop_cand &oloop) +{ + interchange_reductions (iloop, oloop); + interchange_inductions (iloop, oloop); + + interchange_nb_iterations (iloop.loop, oloop.loop); + + iloop.eliminate_dead_code (); +} + +/* Interchange transformation for reductions of ILOOP and OLOOP. We only + support two types reductions for now: + 1) simple reduction of inner loop. + 2) double reduction of loop nest. + For simple reduction, we simply undo it by moving producer/consumer to + inner loop; for double reduction, we don't need to do anything. */ + +void +tree_loop_interchange::interchange_reductions (loop_cand &iloop, + loop_cand &oloop) +{ + unsigned i; + reduction_p re; + + for (i = 0; iloop.reductions.iterate (i, &re); ++i) + { + if (re->type == DOUBLE_RTYPE) + continue; + + /* Undo simple reductions. */ + iloop.undo_simple_reduction (re); + } + + for (i = 0; oloop.reductions.iterate (i, &re); ++i) + if (re->type != DOUBLE_RTYPE) + gcc_unreachable (); +} + +/* Interchange transformation for inductions of ILOOP and OLOOP. */ + +void +tree_loop_interchange::interchange_inductions (loop_cand &iloop, + loop_cand &oloop) +{ + /* Map outer loop's IV to inner loop. */ + map_inductions_to_loop (oloop, iloop); + /* Map inner loop's IV to outer loop. */ + map_inductions_to_loop (iloop, oloop); + + /* Create canonical IV for both loops. Note canonical IV for outer/inner + loop is actually from inner/outer loop. Also we record the new IV + created for the outer loop so that it can be skipped in later loop + interchange. */ + create_canonical_iv (oloop.loop, single_exit (oloop.loop), iloop.niters, + &niters_iv_var); + create_canonical_iv (iloop.loop, single_exit (iloop.loop), oloop.niters); +} + +/* Map induction variables of SRC loop to TGT loop. The function firstly + creates the same IV of SRC loop in TGT loop, then deletes the original + IV and re-initialize it using the newly created IV. For example, loop + nest: + + for (i = 0; i < N; i++) + for (j = 0; j < M; j++) + { + //use of i; + //use of j; + } + + will be transformed into: + + for (jj = 0; jj < M; jj++) + for (ii = 0; ii < N; ii++) + { + //use of ii; + //use of jj; + } + + after loop interchange. */ + +void +tree_loop_interchange::map_inductions_to_loop (loop_cand &src, loop_cand &tgt) +{ + induction_p iv; + edge e = single_exit (tgt.loop); + gimple_stmt_iterator incr_pos = gsi_last_bb (e->src), gsi; + bool move_code_p = flow_loop_nested_p (src.loop, tgt.loop); + + /* Move src's code to tgt loop. This is necessary when src is the outer + loop and tgt is the inner loop. */ + if (move_code_p) + move_code_to_inner_loop (src.loop, tgt.loop, src.bbs); + + /* Map source loop's IV to target loop. */ + for (unsigned i = 0; src.inductions.iterate (i, &iv); ++i) + { + gimple *stmt = SSA_NAME_DEF_STMT (iv->var); + gcc_assert (is_a (stmt)); + + /* Delete var definition of the original IV's in the source loop. */ + gsi = gsi_for_stmt (stmt); + gsi_remove (&gsi, true); + + /* No need to map PHI to target loop if it is created in previous + loop interchange. */ + if (niters_iv_var == iv->var) + { + gcc_assert (!move_code_p); + continue; + } + + /* Map the IV by creating the same one in target loop. */ + tree base = unshare_expr (iv->base), step = unshare_expr (iv->step); + create_iv (base, step, SSA_NAME_VAR (iv->var), + tgt.loop, &incr_pos, false, &iv->mapped_var, NULL); + + /* Replace uses of the original IV var with newly created IV var. */ + use_operand_p imm_use_p; + imm_use_iterator iterator; + FOR_EACH_IMM_USE_STMT (stmt, iterator, iv->var) + FOR_EACH_IMM_USE_ON_STMT (imm_use_p, iterator) + SET_USE (imm_use_p, iv->mapped_var); + } +} + +/* Compute the insert position at inner loop when moving code from outer + loop to inner one. */ + +static inline void +insert_pos_at_inner_loop (struct loop *outer, struct loop *inner, + basic_block bb, gimple_stmt_iterator *pos) +{ + if (bb == outer->header || bb == outer->latch) + { + /* Move code from header/latch to header/latch. */ + *pos = gsi_after_labels (inner->header); + } + else + { + /* Otherwise, simply move to exit->src. */ + edge e = single_exit (inner); + *pos = gsi_last_bb (e->src); + } +} + +/* Move stmts of outer loop to inner loop. */ + +void +tree_loop_interchange::move_code_to_inner_loop (struct loop *outer, + struct loop *inner, + basic_block *outer_bbs) +{ + unsigned int i; + edge oloop_exit = single_exit (outer); + gimple_stmt_iterator insert_pos, gsi; + + for (i = 0; i < outer->num_nodes; i++) + { + basic_block bb = outer_bbs[i]; + + /* Skip basic blocks of inner loop. */ + if (flow_bb_inside_loop_p (inner, bb)) + continue; + + insert_pos_at_inner_loop (outer, inner, bb, &insert_pos); + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);) + { + gimple *stmt = gsi_stmt (gsi); + if (gimple_code (stmt) == GIMPLE_LABEL) + { + gsi_next (&gsi); + continue; + } + + if (oloop_exit->src == bb + && stmt == gsi_stmt (gsi_last_bb (oloop_exit->src))) + { + gsi_next (&gsi); + continue; + } + + if (gimple_vuse (stmt)) + gimple_set_vuse (stmt, NULL_TREE); + if (gimple_vdef (stmt)) + gimple_set_vdef (stmt, NULL_TREE); + + gsi_move_before (&gsi, &insert_pos); + } + } +} + +/* Return true if interchanging ILOOP/OLOOP is profitable. The function + computes and compares three types information from all DATAREFS: + 1) Access stride for ILOOP and OLOOP. + 2) Number of invariant memory references with respect to ILOOP before + and after loop interchange. + 3) Flags indicating if all memory references access sequential memory + in ILOOP, before and after loop interchange. + This function also dumps information if DUMP_INFO_P is true. */ + +static bool +should_interchange_loops (struct loop *iloop, struct loop *oloop, + vec datarefs, + bool dump_info_p = true) +{ + unsigned HOST_WIDE_INT ratio; + unsigned i, j, num_old_inv_drs = 0, num_new_inv_drs = 0; + struct data_reference *dr; + bool all_seq_dr_before_p = true, all_seq_dr_after_p = true; + widest_int iloop_strides = 0, oloop_strides = 0; + + if (dump_info_p && dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "\nData ref strides:\n\tmem_ref:\t\tiloop\toloop\n"); + + for (i = 0; datarefs.iterate (i, &dr); ++i) + { + bool sub_loop_p = false; + tree ref = DR_REF (dr), stride; + tree iloop_chrec = NULL_TREE, iloop_stride = NULL_TREE; + tree oloop_chrec = NULL_TREE, oloop_stride = NULL_TREE; + + /* Get CHREC and the corresponding stride for ILOOP/OLOOP. */ + for (j = 0; j < DR_NUM_DIMENSIONS (dr); ++j) + { + while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF) + ref = TREE_OPERAND (ref, 0); + + stride = integer_one_node; + if (TREE_CODE (ref) == ARRAY_REF) + { + stride = TYPE_SIZE_UNIT (TREE_TYPE (ref)); + ref = TREE_OPERAND (ref, 0); + } + + tree access_fn = DR_ACCESS_FN (dr, j); + if (TREE_CODE (access_fn) != POLYNOMIAL_CHREC) + continue; + + sub_loop_p |= strip_sub_loop_access_fn (iloop, &access_fn); + if (TREE_CODE (access_fn) != POLYNOMIAL_CHREC) + continue; + + decompose_chrecs (iloop->num, oloop->num, + access_fn, j, stride, + &iloop_chrec, &iloop_stride, NULL, + &oloop_chrec, &oloop_stride, NULL); + } + + if (!iloop_chrec && !oloop_chrec) + continue; + + /* If stride is unknown for inner or outer loop. */ + if ((iloop_chrec != NULL + && (iloop_stride == NULL_TREE + || !tree_fits_uhwi_p (iloop_stride))) + || (oloop_chrec != NULL + && (oloop_stride == NULL_TREE + || !tree_fits_uhwi_p (oloop_stride)))) + continue; + + /* Compute overall access strides for ILOOP. */ + unsigned HOST_WIDE_INT t1 = 0, t2 = 0; + if (iloop_chrec) + { + t1 = access_stride (iloop_chrec, iloop_stride); + iloop_strides = wi::add (iloop_strides, t1); + } + else if (!sub_loop_p) + num_old_inv_drs++; + + /* Compute overall access strides for OLOOP. */ + if (oloop_chrec) + { + t2 = access_stride (oloop_chrec, oloop_stride); + oloop_strides = wi::add (oloop_strides, t2); + } + else if (!sub_loop_p) + num_new_inv_drs++; + + /* Track if all data references access sequential memory before and + after loop interchange. */ + if (sub_loop_p) + { + /* Data ref can't be sequential if it evaluates wrto any sub loop + of inner loop. */ + all_seq_dr_before_p = false; + all_seq_dr_after_p = false; + } + else if ((stride = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)))) != NULL_TREE + && tree_fits_uhwi_p (stride)) + { + /* Check if data ref access sequential memory wrto inner loop. + Note invariant is considered sequential. */ + unsigned HOST_WIDE_INT uhwi_stride = tree_to_uhwi (stride); + if (t1 != 0 && t1 != uhwi_stride) + all_seq_dr_before_p = false; + if (t2 != 0 && t2 != uhwi_stride) + all_seq_dr_after_p = false; + } + + if (dump_info_p && dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "\t"); + print_generic_expr (dump_file, DR_REF (dr), TDF_SLIM); + fprintf (dump_file, ":\t\t" HOST_WIDE_INT_PRINT_DEC + "\t" HOST_WIDE_INT_PRINT_DEC "\n", t1, t2); + } + } + + if (dump_info_p && dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "\toverall:\t\t"); + print_decu (iloop_strides, dump_file); + fprintf (dump_file, "\t"); + print_decu (oloop_strides, dump_file); + fprintf (dump_file, "\n"); + + fprintf (dump_file, "Invariant data ref: before(%d), after(%d)\n", + num_old_inv_drs, num_new_inv_drs); + fprintf (dump_file, "Consecutive stride: before(%s), after(%s)\n", + all_seq_dr_before_p ? "true" : "false", + all_seq_dr_after_p ? "true" : "false"); + } + + /* We use different stride comparison ratio for interchanging innermost + two loops or not. The idea is to be conservative in interchange for + the innermost loops. */ + ratio = !iloop->inner ? INNER_STRIDE_RATIO : OUTER_STRIDE_RATIO; + /* Do interchange if it gives better data locality behavior. */ + if (wi::gtu_p (iloop_strides, wi::mul (oloop_strides, ratio))) + return true; + if (wi::gtu_p (iloop_strides, oloop_strides)) + { + /* Or it creates more invariant memory references. */ + if ((!all_seq_dr_before_p || all_seq_dr_after_p) + && num_new_inv_drs > num_old_inv_drs) + return true; + /* Or it makes all memory references sequential. */ + if (num_new_inv_drs >= num_old_inv_drs + && !all_seq_dr_before_p && all_seq_dr_after_p) + return true; + } + + return false; +} + +/* Try to interchange inner loop of a loop nest to outer level. */ + +bool +tree_loop_interchange::interchange () +{ + bool changed_p = false; + /* In each iteration we try to interchange I-th loop with (I+1)-th loop. + The overall effect is to push inner loop to outermost level in whole + loop nest. */ + for (unsigned i = loop_nest.length (); i > 1; --i) + { + unsigned inner = i - 1, outer = i - 2; + + /* Check validity for loop interchange. */ + if (!valid_data_dependences (inner, outer)) + break; + + loop_cand iloop (loop_nest[inner], loop_nest[outer]); + loop_cand oloop (loop_nest[outer], loop_nest[outer]); + + /* Check if we can do transformation for loop interchange. */ + if (!can_interchange_loops (iloop, oloop)) + break; + + /* Check profitability for loop interchange. */ + if (should_interchange_loops (iloop.loop, oloop.loop, datarefs)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, + "Loop_pair is interchanged\n\n", + oloop.loop->num, iloop.loop->num); + + interchange_loops (iloop, oloop); + /* Update data structures for further loop interchange. */ + update_data_refs (iloop, oloop); + update_data_deps (inner, outer); + changed_p = true; + } + else + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, + "Loop_pair is not interchanged\n\n", + oloop.loop->num, iloop.loop->num); + } + } + + return changed_p; +} + + +/* Loop interchange pass. */ + +namespace { + +const pass_data pass_data_linterchange = +{ + GIMPLE_PASS, /* type */ + "linterchange", /* name */ + OPTGROUP_LOOP, /* optinfo_flags */ + TV_LINTERCHANGE, /* tv_id */ + PROP_cfg, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + 0, /* todo_flags_finish */ +}; + +class pass_linterchange : public gimple_opt_pass +{ +public: + pass_linterchange (gcc::context *ctxt) + : gimple_opt_pass (pass_data_linterchange, ctxt) + {} + + /* opt_pass methods: */ + opt_pass * clone () { return new pass_linterchange (m_ctxt); } + virtual bool gate (function *) { return flag_tree_loop_interchange; } + virtual unsigned int execute (function *); + +}; // class pass_linterchange + + +/* Return true if LOOP has proper form for interchange. */ + +static bool +proper_loop_form_for_interchange (struct loop *loop) +{ + edge e0, e1, exit; + + /* Don't interchange if loop has unsupported information for the moment. */ + if (loop->safelen > 0 + || loop->constraints != 0 + || loop->can_be_parallel + || loop->dont_vectorize + || loop->force_vectorize + || loop->in_oacc_kernels_region + || loop->orig_loop_num != 0 + || loop->simduid != NULL_TREE) + return false; + + /* Don't interchange if outer loop has basic block other than header, + exit->src and latch. In general, only below form of loop nest: + header<---+ + | | + v | + INNER_LOOP | + | | + v | + exit--->latch + is supported. */ + if (loop->inner != NULL + && loop->num_nodes != loop->inner->num_nodes + 3) + return false; + + if ((exit = single_dom_exit (loop)) == NULL) + return false; + + /* Check control flow on loop header/exit blocks. */ + if (loop->header == exit->src + && (EDGE_COUNT (loop->header->preds) != 2 + || EDGE_COUNT (loop->header->succs) != 2)) + return false; + else if (loop->header != exit->src + && (EDGE_COUNT (loop->header->preds) != 2 + || !single_succ_p (loop->header) + || unsupported_edge (single_succ_edge (loop->header)) + || EDGE_COUNT (exit->src->succs) != 2 + || !single_pred_p (exit->src) + || unsupported_edge (single_pred_edge (exit->src)))) + return false; + + e0 = EDGE_PRED (loop->header, 0); + e1 = EDGE_PRED (loop->header, 1); + if (unsupported_edge (e0) || unsupported_edge (e1) + || (e0->src != loop->latch && e1->src != loop->latch) + || (e0->src->loop_father == loop && e1->src->loop_father == loop)) + return false; + + e0 = EDGE_SUCC (exit->src, 0); + e1 = EDGE_SUCC (exit->src, 1); + if (unsupported_edge (e0) || unsupported_edge (e1) + || (e0->dest != loop->latch && e1->dest != loop->latch) + || (e0->dest->loop_father == loop && e1->dest->loop_father == loop)) + return false; + + return true; +} + +/* Return true if any two adjacent loops in loop nest OUTERMOST_LOOP should + be interchanged by looking into all DATAREFS. INNERMOST_LOOP is the + innermost loop of this loop nest. */ + +static bool +should_interchange_loop_nest (struct loop *outermost_loop, + struct loop *innermost_loop, + vec datarefs) +{ + struct loop *inner = innermost_loop, *outer; + + /* Check if any two adjacent loops should be interchanged. */ + while (inner != outermost_loop) + { + outer = loop_outer (inner); + + if (should_interchange_loops (inner, outer, datarefs, false)) + return true; + + inner = outer; + } + + return false; +} + +/* Given loop nest LOOP_NEST and data references DATAREFS, compute data + dependences for loop interchange and store it in DDRS. Note we compute + dependences directly rather than call generic interface so that we can + return on unknown dependence instantly. */ + +static bool +tree_loop_interchange_compute_ddrs (vec loop_nest, + vec datarefs, + vec *ddrs) +{ + struct data_reference *a, *b; + + for (unsigned i = 0; datarefs.iterate (i, &a); ++i) + for (unsigned j = i + 1; datarefs.iterate (j, &b); ++j) + if (DR_IS_WRITE (a) || DR_IS_WRITE (b)) + { + ddr_p ddr = initialize_data_dependence_relation (a, b, loop_nest); + ddrs->safe_push (ddr); + compute_affine_dependence (ddr, loop_nest[0]); + + /* Give up if ddr is unknown dependence or classic direct vector + is not available. */ + if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know + || (DDR_ARE_DEPENDENT (ddr) == NULL_TREE + && DDR_NUM_DIR_VECTS (ddr) == 0)) + return false; + } + + return true; +} + +/* Prune DATAREFS by removing any data reference not inside of LOOP. */ + +static inline void +prune_datarefs_not_in_loop (struct loop *loop, vec datarefs) +{ + struct data_reference *dr; + + for (unsigned i = 0; datarefs.iterate (i, &dr);) + if (flow_bb_inside_loop_p (loop, gimple_bb (DR_STMT (dr)))) + i++; + else + datarefs.ordered_remove (i); +} + +/* Given loop nest like , the function strips off outer + loops if it forms non-rectangle loop nest. The outermost loop of + the rest rectangle loop nest or NULL is returned. */ + +struct loop * +prune_non_rectangle_loop_nest (struct loop *iloop, struct loop *oloop) +{ + if (!oloop || !iloop || oloop == iloop) + return NULL; + + struct loop *loop1 = iloop; + + while (loop1 != NULL && flow_loop_nested_p (oloop, loop1)) + { + tree niters = number_of_latch_executions (loop1); + struct loop *loop2 = loop_outer (loop1); + + while (loop2 != NULL + && (loop2 == oloop || flow_loop_nested_p (oloop, loop2))) + { + /* Strip off the outermost loop if it isn't rectangle loop nest. */ + if (chrec_contains_symbols_defined_in_loop (niters, loop2->num)) + { + oloop = loop2->inner; + break; + } + + loop2 = loop_outer (loop2); + } + if (oloop == iloop) + return NULL; + + loop1 = loop_outer (loop1); + } + + return oloop; +} + +/* Given innermost LOOP, return true if perfect loop nest can be found and + data dependences can be computed. If succeed, record the perfect loop + nest in LOOP_NEST; record all data references in DATAREFS and record all + data dependence relations in DDRS. */ + +static bool +prepare_perfect_loop_nest (struct loop *loop, vec *loop_nest, + vec *datarefs, vec *ddrs) +{ + tree niters; + struct loop *start_loop = NULL, *innermost_loop = loop; + + /* Find loop nest from the innermost loop. */ + while (loop->num != 0 && loop->inner == start_loop) + { + if (!proper_loop_form_for_interchange (loop)) + break; + + /* Loop must have determined niters. */ + niters = number_of_latch_executions (loop); + if (!niters || chrec_contains_undetermined (niters)) + break; + + start_loop = loop; + /* If this loop has sibling loop, the father loop won't be in perfect + loop nest. */ + if (loop->next != NULL) + break; + + loop = loop_outer (loop); + } + + start_loop = prune_non_rectangle_loop_nest (innermost_loop, start_loop); + + if (!start_loop || !start_loop->inner) + return false; + + datarefs->create (20); + if (find_data_references_in_loop (start_loop, datarefs) == chrec_dont_know + /* Check if there are too many of data references. */ + || ((int) datarefs->length () + > PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS)) + /* Check if loop nest should be interchanged. */ + || !should_interchange_loop_nest (start_loop, innermost_loop, *datarefs)) + { + free_data_refs (*datarefs); + return false; + } + + /* Check if data dependences can be computed for loop nest starting from + start_loop. */ + loop = start_loop; + loop_nest->create (3); + do { + ddrs->create (20); + loop_nest->truncate (0); + + /* Strip data references not in loop nest starting from start_loop. */ + if (loop != start_loop) + prune_datarefs_not_in_loop (start_loop, *datarefs); + + if (find_loop_nest (start_loop, loop_nest) + && tree_loop_interchange_compute_ddrs (*loop_nest, *datarefs, ddrs)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, + "\nConsider loop interchange for loop_nest<%d - %d>\n", + start_loop->num, innermost_loop->num); + + return true; + } + + free_dependence_relations (*ddrs); + /* Try to compute data dependences with the outermost loop stripped. */ + loop = start_loop; + start_loop = start_loop->inner; + } while (start_loop && start_loop->inner); + + loop_nest->release (); + free_data_refs (*datarefs); + return false; +} + +/* Main entry for loop interchange pass. */ + +unsigned int +pass_linterchange::execute (function *fun) +{ + if (number_of_loops (fun) <= 2) + return 0; + + bool changed_p = false;; + struct loop *loop; + vec loop_nest; + vec datarefs; + vec ddrs; + + FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST) + if (prepare_perfect_loop_nest (loop, &loop_nest, &datarefs, &ddrs)) + { + tree_loop_interchange loop_interchange (loop_nest, datarefs, ddrs); + changed_p |= loop_interchange.interchange (); + } + + if (changed_p) + scev_reset_htab (); + + return changed_p ? (TODO_update_ssa_only_virtuals) : 0; +} + +} // anon namespace + +gimple_opt_pass * +make_pass_linterchange (gcc::context *ctxt) +{ + return new pass_linterchange (ctxt); +} diff --git a/gcc/tree-ssa-loop-ivcanon.c b/gcc/tree-ssa-loop-ivcanon.c index efb199a..56eff1c 100644 --- a/gcc/tree-ssa-loop-ivcanon.c +++ b/gcc/tree-ssa-loop-ivcanon.c @@ -76,10 +76,13 @@ enum unroll_level }; /* Adds a canonical induction variable to LOOP iterating NITER times. EXIT - is the exit edge whose condition is replaced. */ + is the exit edge whose condition is replaced. The ssa versions of the new + IV before and after increment will be stored in VAR_BEFORE and VAR_AFTER + if they are not NULL. */ -static void -create_canonical_iv (struct loop *loop, edge exit, tree niter) +void +create_canonical_iv (struct loop *loop, edge exit, tree niter, + tree *var_before = NULL, tree *var_after = NULL) { edge in; tree type, var; @@ -112,7 +115,9 @@ create_canonical_iv (struct loop *loop, edge exit, tree niter) create_iv (niter, build_int_cst (type, -1), NULL_TREE, loop, - &incr_at, false, NULL, &var); + &incr_at, false, var_before, &var); + if (var_after) + *var_after = var; cmp = (exit->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR; gimple_cond_set_code (cond, cmp); diff --git a/gcc/tree-ssa-loop-ivopts.h b/gcc/tree-ssa-loop-ivopts.h index f8f31e9..cfa1fbe 100644 --- a/gcc/tree-ssa-loop-ivopts.h +++ b/gcc/tree-ssa-loop-ivopts.h @@ -31,4 +31,6 @@ extern bool expr_invariant_in_loop_p (struct loop *, tree); bool may_be_nonaddressable_p (tree expr); void tree_ssa_iv_optimize (void); +void create_canonical_iv (struct loop *, edge, tree, + tree * = NULL, tree * = NULL); #endif /* GCC_TREE_SSA_LOOP_IVOPTS_H */