From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 2102) id 741163858410; Wed, 17 Nov 2021 08:18:05 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 741163858410 Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: Frederik Harwath To: gcc-cvs@gcc.gnu.org Subject: [gcc/devel/omp/gcc-11] openacc: Add further kernels tests X-Act-Checkin: gcc X-Git-Author: Frederik Harwath X-Git-Refname: refs/heads/devel/omp/gcc-11 X-Git-Oldrev: dce1c32758dbdba9b09ba4f9aae10b85cb0e8a55 X-Git-Newrev: e0ea0bc003fc931078a1dcfb1097362a9484e9ce Message-Id: <20211117081805.741163858410@sourceware.org> Date: Wed, 17 Nov 2021 08:18:05 +0000 (GMT) X-BeenThere: gcc-cvs@gcc.gnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Gcc-cvs mailing list List-Unsubscribe: , List-Archive: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 17 Nov 2021 08:18:05 -0000 https://gcc.gnu.org/g:e0ea0bc003fc931078a1dcfb1097362a9484e9ce commit e0ea0bc003fc931078a1dcfb1097362a9484e9ce Author: Frederik Harwath Date: Tue Nov 16 16:17:15 2021 +0100 openacc: Add further kernels tests Add some copies of tests to continue covering the old "parloops"-based "kernels" implementation - until it gets removed from GCC - and add further tests for the new Graphite-based implementation. libgomp/ChangeLog: * testsuite/libgomp.oacc-fortran/parallel-loop-auto-reduction-2.f90: New test. gcc/testsuite/ChangeLog: * c-c++-common/goacc/classify-kernels-unparallelized-graphite.c: New test. * c-c++-common/goacc/classify-kernels-unparallelized-parloops.c: New test. * c-c++-common/goacc/kernels-decompose-1-parloops.c: New test. * c-c++-common/goacc/kernels-reduction-parloops.c: New test. * c-c++-common/goacc/loop-auto-reductions.c: New test. * c-c++-common/goacc/note-parallelism-1-kernels-loop-auto-parloops.c: New test. * c-c++-common/goacc/note-parallelism-kernels-loops-1.c: New test. 
* c-c++-common/goacc/note-parallelism-kernels-loops-parloops.c: New test. * gfortran.dg/goacc/classify-kernels-unparallelized-parloops.f95: New test. * gfortran.dg/goacc/kernels-conversion.f95: New test. * gfortran.dg/goacc/kernels-decompose-1-parloops.f95: New test. * gfortran.dg/goacc/kernels-decompose-parloops-2.f95: New test. * gfortran.dg/goacc/kernels-loop-data-parloops-2.f95: New test. * gfortran.dg/goacc/kernels-loop-parloops-2.f95: New test. * gfortran.dg/goacc/kernels-loop-parloops.f95: New test. * gfortran.dg/goacc/kernels-reductions.f90: New test. Diff: --- .../classify-kernels-unparallelized-graphite.c | 41 ++++++ .../classify-kernels-unparallelized-parloops.c | 47 +++++++ .../goacc/kernels-decompose-1-parloops.c | 125 +++++++++++++++++ .../goacc/kernels-reduction-parloops.c | 36 +++++ .../c-c++-common/goacc/loop-auto-reductions.c | 22 +++ ...note-parallelism-1-kernels-loop-auto-parloops.c | 128 +++++++++++++++++ .../goacc/note-parallelism-kernels-loops-1.c | 61 ++++++++ .../note-parallelism-kernels-loops-parloops.c | 53 +++++++ .../classify-kernels-unparallelized-parloops.f95 | 44 ++++++ .../gfortran.dg/goacc/kernels-conversion.f95 | 52 +++++++ .../goacc/kernels-decompose-1-parloops.f95 | 121 ++++++++++++++++ .../goacc/kernels-decompose-parloops-2.f95 | 154 +++++++++++++++++++++ .../goacc/kernels-loop-data-parloops-2.f95 | 52 +++++++ .../gfortran.dg/goacc/kernels-loop-parloops-2.f95 | 45 ++++++ .../gfortran.dg/goacc/kernels-loop-parloops.f95 | 39 ++++++ .../gfortran.dg/goacc/kernels-reductions.f90 | 37 +++++ .../parallel-loop-auto-reduction-2.f90 | 98 +++++++++++++ 17 files changed, 1155 insertions(+) diff --git a/gcc/testsuite/c-c++-common/goacc/classify-kernels-unparallelized-graphite.c b/gcc/testsuite/c-c++-common/goacc/classify-kernels-unparallelized-graphite.c new file mode 100644 index 00000000000..77f4524907a --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/classify-kernels-unparallelized-graphite.c @@ -0,0 +1,41 @@ +/* Check offloaded 
function's attributes and classification for unparallelized + OpenACC 'kernels' with Graphite kernels handling (default). */ + +/* { dg-additional-options "-O2" } + { dg-additional-options "-fno-openacc-kernels-annotate-loops" } + { dg-additional-options "-fopt-info-optimized-omp" } + { dg-additional-options "-fopt-info-note-omp" } + { dg-additional-options "-fdump-tree-ompexp" } + { dg-additional-options "-fdump-tree-graphite-details" } + { dg-additional-options "-fdump-tree-oaccloops1" } + { dg-additional-options "-fdump-tree-omp_oacc_kernels_decompose-details" } */ + +#define N 1024 + +extern unsigned int *__restrict a; +extern unsigned int *__restrict b; +extern unsigned int *__restrict c; + +extern unsigned int f (unsigned int); +#pragma acc routine (f) seq + +void KERNELS () +{ +#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N]) + for (unsigned int i = 0; i < N; i++) /* { dg-message "note: beginning .Graphite. part in OpenACC .kernels. region" } */ + /* An "extern"al mapping of loop iterations/array indices makes the loop + unparallelizable. */ + c[i] = a[f (i)] + b[f (i)]; /* { dg-optimized "assigned OpenACC seq loop parallelism" } */ +} + +/* Check the offloaded function's attributes. + { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc parallel_kernels_graphite, omp target entrypoint\\)\\)" 1 "ompexp" } } */ + +/* Check that Graphite can handle neither the original nor the offloaded region. + { dg-final { scan-tree-dump-times "number of SCoPs: 0" 2 "graphite" } } */ + +/* Check the offloaded function's classification and compute dimensions (will + always be 1 x 1 x 1 for non-offloading compilation). 
+ { dg-final { scan-tree-dump-times "(?n)Function is parallel_kernels_graphite OpenACC kernels offload" 1 "oaccloops1" } } + { dg-final { scan-tree-dump-times "(?n)Compute dimensions \\\[1, 1, 1\\\]" 1 "oaccloops1" } } + { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(1, 1, 1\\), oacc parallel_kernels_graphite, omp target entrypoint\\)\\)" 1 "oaccloops1" } } */ diff --git a/gcc/testsuite/c-c++-common/goacc/classify-kernels-unparallelized-parloops.c b/gcc/testsuite/c-c++-common/goacc/classify-kernels-unparallelized-parloops.c new file mode 100644 index 00000000000..252ab8eb87b --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/classify-kernels-unparallelized-parloops.c @@ -0,0 +1,47 @@ +/* Check offloaded function's attributes and classification for unparallelized + OpenACC 'kernels' with "parloops" handling. */ + +/* { dg-additional-options "-O2" } + { dg-additional-options "--param openacc-kernels=decompose-parloops" } + { dg-additional-options "-fno-openacc-kernels-annotate-loops" } + { dg-additional-options "-fopt-info-note-optimized-omp" } + { dg-additional-options "-fdump-tree-ompexp" } + { dg-additional-options "-fdump-tree-parloops1-all" } + { dg-additional-options "-fdump-tree-oaccloops1" } + { dg-additional-options "-fdump-tree-omp_oacc_kernels_decompose-details" } */ + +/* { dg-additional-options "-Wopenacc-parallelism" } for testing/documenting + aspects of that functionality. */ + +#define N 1024 + +extern unsigned int *__restrict a; +extern unsigned int *__restrict b; +extern unsigned int *__restrict c; + +extern unsigned int f (unsigned int); +#pragma acc routine (f) seq + +void KERNELS () +{ +#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N]) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */ + for (unsigned int i = 0; i < N; i++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. 
region" } */ + /* An "extern"al mapping of loop iterations/array indices makes the loop + unparallelizable. */ + c[i] = a[f (i)] + b[f (i)]; +} + +/* Check the offloaded function's attributes. + { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels, omp target entrypoint\\)\\)" 1 "ompexp" } } */ + +/* Check that exactly one OpenACC kernels construct is analyzed, and that it + can't be parallelized. + { dg-final { scan-tree-dump-times "FAILED:" 1 "parloops1" } } + { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } + { dg-final { scan-tree-dump-not "SUCCESS: may be parallelized" "parloops1" } } */ + +/* Check the offloaded function's classification and compute dimensions (will + always be 1 x 1 x 1 for non-offloading compilation). + { dg-final { scan-tree-dump-times "(?n)Function is unparallelized OpenACC kernels offload" 1 "oaccloops1" } } + { dg-final { scan-tree-dump-times "(?n)Compute dimensions \\\[1, 1, 1\\\]" 1 "oaccloops1" } } + { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(1, 1, 1\\), oacc kernels, omp target entrypoint\\)\\)" 1 "oaccloops1" } } */ diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-decompose-1-parloops.c b/gcc/testsuite/c-c++-common/goacc/kernels-decompose-1-parloops.c new file mode 100644 index 00000000000..76d528a6d8e --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/kernels-decompose-1-parloops.c @@ -0,0 +1,125 @@ +/* Test OpenACC .kernels. region decomposition with + "split-parloops" handling. */ +/* { dg-additional-options "--param openacc-kernels=decompose-parloops" } */ +/* { dg-additional-options "-fno-openacc-kernels-annotate-loops" } */ +/* { dg-additional-options "-fopt-info-omp-all" } */ +/* { dg-additional-options "-Wopenacc-parallelism" } */ +/* { dg-additional-options "-O2" } for "parloops". */ + +/* See also "../../gfortran.dg/goacc/kernels-decompose-1.f95". 
*/ + +#pragma acc routine gang +extern int +f_g (int); + +#pragma acc routine worker +extern int +f_w (int); + +#pragma acc routine vector +extern int +f_v (int); + +#pragma acc routine seq +extern int +f_s (int); + +int +main () +{ + int x, y, z; +#define N 10 + int a[N], b[N], c[N]; + +#pragma acc kernels + { + x = 0; /* { dg-message "note: beginning .gang-single. part in OpenACC .kernels. region" } */ + y = x < 10; + z = x++; + ; + } + +#pragma acc kernels /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */ + for (int i = 0; i < N; i++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */ + a[i] = 0; + +#pragma acc kernels loop /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */ + /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */ + for (int i = 0; i < N; i++) + b[i] = a[N - i - 1]; + +#pragma acc kernels + { +#pragma acc loop /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */ + /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */ + for (int i = 0; i < N; i++) + b[i] = a[N - i - 1]; + +#pragma acc loop /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */ + /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */ + for (int i = 0; i < N; i++) + c[i] = a[i] * b[i]; + + a[z] = 0; /* { dg-message "note: beginning .gang-single. part in OpenACC .kernels. region" } */ + +#pragma acc loop /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */ + /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. 
for analysis" "" { target *-*-* } .-1 } */ + for (int i = 0; i < N; i++) + c[i] += a[i]; + +#pragma acc loop seq /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */ + /* { dg-message "note: parallelized loop nest in OpenACC .kernels. region" "" { target *-*-* } .-1 } */ + for (int i = 0 + 1; i < N; i++) + c[i] += c[i - 1]; + } + +#pragma acc kernels /* { dg-message "optimized: assigned OpenACC worker vector loop parallelism" } */ + { +#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */ + /* { dg-message "note: parallelized loop nest in OpenACC .kernels. region" "" { target *-*-* } .-1 } */ + for (int i = 0; i < N; ++i) +#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC worker loop parallelism" } */ + for (int j = 0; j < N; ++j) +#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */ + /* { dg-warning "insufficient partitioning available to parallelize loop" "" { target *-*-* } .-1 } */ + for (int k = 0; k < N; ++k) + a[(i + j + k) % N] + = b[j] + + f_v (c[k]); /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */ + + //TODO Should the following turn into "gang-single" instead of "parloops"? + //TODO The problem is that the first STMT is "if (y <= 4) goto ; else goto ;", thus "parloops". + if (y < 5) +#pragma acc loop independent /* { dg-missed "unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" } */ + for (int j = 0; j < N; ++j) + b[j] = f_w (c[j]); + } + +#pragma acc kernels /* { dg-warning "region contains gang partitioned code but is not gang partitioned" } */ + { + /* { dg-message "note: beginning .gang-single. part in OpenACC .kernels. 
region" "" { target *-*-* } .+1 } */ + y = f_g (a[5]); /* { dg-message "optimized: assigned OpenACC gang worker vector loop parallelism" } */ + +#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */ + /* { dg-message "note: parallelized loop nest in OpenACC .kernels. region" "" { target *-*-* } .-1 } */ + for (int j = 0; j < N; ++j) + b[j] = y + f_w (c[j]); /* { dg-message "optimized: assigned OpenACC worker vector loop parallelism" } */ + } + +#pragma acc kernels + { + y = 3; /* { dg-message "note: beginning .gang-single. part in OpenACC .kernels. region" } */ + +#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC gang worker loop parallelism" } */ + /* { dg-message "note: parallelized loop nest in OpenACC .kernels. region" "" { target *-*-* } .-1 } */ + for (int j = 0; j < N; ++j) + b[j] = y + f_v (c[j]); /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */ + + z = 2; /* { dg-message "note: beginning .gang-single. part in OpenACC .kernels. region" } */ + } + +#pragma acc kernels /* { dg-message "note: beginning .gang-single. part in OpenACC .kernels. region" } */ + ; + + return 0; +} diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-reduction-parloops.c b/gcc/testsuite/c-c++-common/goacc/kernels-reduction-parloops.c new file mode 100644 index 00000000000..1449f7a066d --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/kernels-reduction-parloops.c @@ -0,0 +1,36 @@ +/* { dg-additional-options "--param=openacc-kernels=parloops" } as this is + specifically testing "parloops" handling. 
*/ +/* { dg-additional-options "-O2" } */ +/* { dg-additional-options "-fno-openacc-kernels-annotate-loops" } */ +/* { dg-additional-options "-fdump-tree-parloops1-all" } */ +/* { dg-additional-options "-fdump-tree-optimized" } */ + +#include <stdlib.h> + +#define n 10000 + +unsigned int a[n]; + +void __attribute__((noinline,noclone)) +foo (void) +{ + int i; + unsigned int sum = 1; + +#pragma acc kernels copyin (a[0:n]) copy (sum) + { + for (i = 0; i < n; ++i) + sum += a[i]; + } + + if (sum != 5001) + abort (); +} + +/* Check that only one loop is analyzed, and that it can be parallelized. */ +/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */ +/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint, noclone, noinline\\)\\)" 1 "parloops1" } } */ +/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */ + +/* Check that the loop has been split off into a function. 
*/ +/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*._omp_fn.0" 1 "optimized" } } */ diff --git a/gcc/testsuite/c-c++-common/goacc/loop-auto-reductions.c b/gcc/testsuite/c-c++-common/goacc/loop-auto-reductions.c new file mode 100644 index 00000000000..4d033ccff2d --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/loop-auto-reductions.c @@ -0,0 +1,22 @@ +/* { dg-additional-options "-O2" } */ +/* { dg-additional-options "-fdump-tree-graphite-details" } */ + +#include <stdlib.h> + +#define n 10000 + +unsigned int a[n]; + +void __attribute__((noinline,noclone)) +foo (void) +{ + int i; + unsigned int sum = 1; + +#pragma acc parallel copyin (a[0:n]) + { +#pragma acc loop auto reduction(+:sum) /* { dg-message "optimized: assigned OpenACC gang vector loop parallelism"} */ + for (i = 0; i < n; ++i) + sum += a[i]; + } +} diff --git a/gcc/testsuite/c-c++-common/goacc/note-parallelism-1-kernels-loop-auto-parloops.c b/gcc/testsuite/c-c++-common/goacc/note-parallelism-1-kernels-loop-auto-parloops.c new file mode 100644 index 00000000000..4889c398c06 --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/note-parallelism-1-kernels-loop-auto-parloops.c @@ -0,0 +1,128 @@ +/* Test the output of "-fopt-info-optimized-omp" for an OpenACC 'kernels' + construct containing 'loop' constructs with explicit or implicit 'auto' + clause that are handled by "parloops". */ + +/* { dg-additional-options "--param openacc-kernels=decompose-parloops" } */ +/* { dg-additional-options "-fopt-info-optimized-omp" } */ +/* { dg-additional-options "-fopt-info-note-omp" } */ + +//TODO update accordingly +/* See also "../../gfortran.dg/goacc/note-parallelism.f90". */ + +int +main () +{ + int x, y, z; + +#pragma acc kernels + /* Strangely indented to keep this similar to other test cases. */ + { +#pragma acc loop + /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. 
for analysis" "" { target *-*-* } .-1 } */ + for (x = 0; x < 10; x++) + ; + +#pragma acc loop auto gang /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */ + /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */ + for (x = 0; x < 10; x++) + ; + +#pragma acc loop auto worker /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */ + /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */ + for (x = 0; x < 10; x++) + ; + +#pragma acc loop auto vector /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */ + /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */ + for (x = 0; x < 10; x++) + ; + +#pragma acc loop auto gang vector /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */ + /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */ + for (x = 0; x < 10; x++) + ; + +#pragma acc loop auto gang worker /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */ + /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */ + for (x = 0; x < 10; x++) + ; + +#pragma acc loop auto worker vector /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */ + /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */ + for (x = 0; x < 10; x++) + ; + +#pragma acc loop auto gang worker vector /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */ + /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. 
for analysis" "" { target *-*-* } .-1 } */ + for (x = 0; x < 10; x++) + ; + +#pragma acc loop auto gang /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */ + /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */ + for (x = 0; x < 10; x++) +#pragma acc loop auto worker /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */ + for (y = 0; y < 10; y++) +#pragma acc loop auto vector /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */ + for (z = 0; z < 10; z++) + ; + +#pragma acc loop auto + /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */ + for (x = 0; x < 10; x++) + ; + +#pragma acc loop auto + /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */ + for (x = 0; x < 10; x++) +#pragma acc loop auto + for (y = 0; y < 10; y++) + ; + +#pragma acc loop auto + /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */ + for (x = 0; x < 10; x++) +#pragma acc loop auto + for (y = 0; y < 10; y++) +#pragma acc loop auto + for (z = 0; z < 10; z++) + ; + +#pragma acc loop + /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */ + for (x = 0; x < 10; x++) +#pragma acc loop auto + for (y = 0; y < 10; y++) +#pragma acc loop auto + for (z = 0; z < 10; z++) + ; + +#pragma acc loop auto + /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */ + for (x = 0; x < 10; x++) +#pragma acc loop + for (y = 0; y < 10; y++) +#pragma acc loop auto + for (z = 0; z < 10; z++) + ; + +#pragma acc loop auto + /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. 
for analysis" "" { target *-*-* } .-1 } */ + for (x = 0; x < 10; x++) +#pragma acc loop auto + for (y = 0; y < 10; y++) +#pragma acc loop + for (z = 0; z < 10; z++) + ; + +#pragma acc loop + /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */ + for (x = 0; x < 10; x++) +#pragma acc loop auto + for (y = 0; y < 10; y++) +#pragma acc loop + for (z = 0; z < 10; z++) + ; + } + + return 0; +} diff --git a/gcc/testsuite/c-c++-common/goacc/note-parallelism-kernels-loops-1.c b/gcc/testsuite/c-c++-common/goacc/note-parallelism-kernels-loops-1.c new file mode 100644 index 00000000000..0cd2b9de174 --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/note-parallelism-kernels-loops-1.c @@ -0,0 +1,61 @@ +/* Test the output of "-fopt-info-optimized-omp" for an OpenACC "kernels" + construct containing loops. */ + +/* { dg-additional-options "-fopt-info-optimized-omp" } */ +/* { dg-additional-options "-fopt-info-note-omp" } */ +/* { dg-additional-options "-O2" } */ + +//TODO update accordingly +/* See also "../../gfortran.dg/goacc/note-parallelism.f90". */ + +int +main () +{ + int x, y, z; + +#pragma acc kernels + for (x = 0; x < 10; x++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */ + /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .Graphite. for analysis" "" { target *-*-* } .-1 } */ + ; + +#pragma acc kernels + for (x = 0; x < 10; x++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */ + /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .Graphite. for analysis" "" { target *-*-* } .-1 } */ + ; + +#pragma acc kernels + for (x = 0; x < 10; x++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */ + /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .Graphite. 
for analysis" "" { target *-*-* } .-1 } */ + for (y = 0; y < 10; y++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */ + for (z = 0; z < 10; z++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */ + ; + +#pragma acc kernels + for (x = 0; x < 10; x++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */ + /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .Graphite. for analysis" "" { target *-*-* } .-1 } */ + ; + +#pragma acc kernels + for (x = 0; x < 10; x++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */ + /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .Graphite. for analysis" "" { target *-*-* } .-1 } */ + for (y = 0; y < 10; y++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */ + ; + +#pragma acc kernels + for (x = 0; x < 10; x++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */ + /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .Graphite. for analysis" "" { target *-*-* } .-1 } */ + for (y = 0; y < 10; y++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */ + for (z = 0; z < 10; z++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */ + ; + +#pragma acc kernels + for (x = 0; x < 10; x++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */ \ + /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .Graphite. for analysis" "" { target *-*-* } .-1 } */ + for (y = 0; y < 10; y++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */ + for (z = 0; z < 10; z++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */ + ; + + return 0; +} + +/* { dg-prune-output ".auto. 
loop cannot be parallel" } */ diff --git a/gcc/testsuite/c-c++-common/goacc/note-parallelism-kernels-loops-parloops.c b/gcc/testsuite/c-c++-common/goacc/note-parallelism-kernels-loops-parloops.c new file mode 100644 index 00000000000..a3fea483a95 --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/note-parallelism-kernels-loops-parloops.c @@ -0,0 +1,53 @@ +/* Test the output of "-fopt-info-optimized-omp" for an OpenACC 'kernels' + construct containing loops. */ + +/* { dg-additional-options "--param openacc-kernels=decompose-parloops" } */ +/* { dg-additional-options "-fno-openacc-kernels-annotate-loops" } */ +/* { dg-additional-options "-fopt-info-optimized-omp" } */ +/* { dg-additional-options "-fopt-info-note-omp" } */ +/* { dg-additional-options "-fdump-tree-omp_oacc_kernels_decompose-details" } */ +// TODO update accordingly +/* See also "../../gfortran.dg/goacc/note-parallelism.f90". */ + +int +main () +{ + int x, y, z; + +#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */ + for (x = 0; x < 10; x++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */ + ; + +#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */ + for (x = 0; x < 10; x++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */ + ; + +#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */ + for (x = 0; x < 10; x++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */ + for (y = 0; y < 10; y++) + for (z = 0; z < 10; z++) + ; + +#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */ + for (x = 0; x < 10; x++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */ + ; + +#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */ + for (x = 0; x < 10; x++) /* { dg-message "note: beginning .parloops. 
part in OpenACC .kernels. region" } */ + for (y = 0; y < 10; y++) + ; + +#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */ + for (x = 0; x < 10; x++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */ + for (y = 0; y < 10; y++) + for (z = 0; z < 10; z++) + ; + +#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */ + for (x = 0; x < 10; x++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */ + for (y = 0; y < 10; y++) + for (z = 0; z < 10; z++) + ; + + return 0; +} diff --git a/gcc/testsuite/gfortran.dg/goacc/classify-kernels-unparallelized-parloops.f95 b/gcc/testsuite/gfortran.dg/goacc/classify-kernels-unparallelized-parloops.f95 new file mode 100644 index 00000000000..c9e24449db1 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/goacc/classify-kernels-unparallelized-parloops.f95 @@ -0,0 +1,44 @@ +! Check offloaded function's attributes and classification for unparallelized +! OpenACC kernels that are handled by "parloops". + +! { dg-additional-options "--param openacc-kernels=decompose-parloops" } +! { dg-additional-options "-O2" } +! { dg-additional-options "-fno-openacc-kernels-annotate-loops" } +! { dg-additional-options "-fopt-info-optimized-note-omp" } +! { dg-additional-options "-fdump-tree-ompexp" } +! { dg-additional-options "-fdump-tree-parloops1-all" } +! { dg-additional-options "-fdump-tree-oaccloops1" } + +program main + implicit none + integer, parameter :: n = 1024 + integer, dimension (0:n-1) :: a, b, c + integer :: i + + ! A function call in a data-reference makes the loop unparallelizable + integer, external :: f + + call setup(a, b) + + !$acc kernels copyin (a(0:n-1), b(0:n-1)) copyout (c(0:n-1)) ! { dg-message "optimized: assigned OpenACC seq loop parallelism" } + do i = 0, n - 1 + ! { dg-message "note: beginning .parloops. part in OpenACC .kernels. 
region" "" { target *-*-* } .-1 } + c(i) = a(f (i)) + b(f (i)) + end do + !$acc end kernels +end program main + +! Check the offloaded function's attributes. +! { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels, omp target entrypoint\\)\\)" 1 "ompexp" } } + +! Check that exactly one OpenACC kernels construct is analyzed, and that it +! can't be parallelized. +! { dg-final { scan-tree-dump-times "FAILED:" 1 "parloops1" } } +! { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } +! { dg-final { scan-tree-dump-not "SUCCESS: may be parallelized" "parloops1" } } + +! Check the offloaded function's classification and compute dimensions (will +! always be 1 x 1 x 1 for non-offloading compilation). +! { dg-final { scan-tree-dump-times "(?n)Function is unparallelized OpenACC kernels offload" 1 "oaccloops1" } } +! { dg-final { scan-tree-dump-times "(?n)Compute dimensions \\\[1, 1, 1\\\]" 1 "oaccloops1" } } +! { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(1, 1, 1\\), oacc kernels, omp target entrypoint\\)\\)" 1 "oaccloops1" } } diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f95 new file mode 100644 index 00000000000..fe287c38c38 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f95 @@ -0,0 +1,52 @@ +! { dg-additional-options "-fdump-tree-omp_oacc_kernels_decompose" } + +program main + implicit none + integer, parameter :: N = 1024 + integer, dimension (1:N) :: a + integer :: i, sum + + !$acc kernels copyin(a(1:N)) copy(sum) + + ! converted to "oacc_kernels" + !$acc loop + do i = 1, N + sum = sum + a(i) + end do + + ! converted to "oacc_parallel_kernels_gang_single" + sum = sum + 1 + a(1) = a(1) + 1 + + ! converted to "oacc_parallel_kernels_parallelized" + !$acc loop independent + do i = 1, N + sum = sum + a(i) + end do + + ! 
converted to "oacc_kernels" + if (sum .gt. 10) then + !$acc loop + do i = 1, N + sum = sum + a(i) + end do + end if + + ! converted to "oacc_kernels" + !$acc loop auto + do i = 1, N + sum = sum + a(i) + end do + + !$acc end kernels +end program main + +! Check that the kernels region is split into a data region and enclosed +! parallel regions. +! { dg-final { scan-tree-dump-times "oacc_data_kernels" 1 "omp_oacc_kernels_decompose" } } +! { dg-final { scan-tree-dump-times "oacc_parallel_kernels_graphite " 5 "omp_oacc_kernels_decompose" } } + +! Each of the parallel regions is async, and there is a final call to +! __builtin_GOACC_wait. +! { dg-final { scan-tree-dump-times "oacc_parallel_kernels_graphite async\\(-1\\)" 5 "omp_oacc_kernels_decompose" } } +! { dg-final { scan-tree-dump-times "__builtin_GOACC_wait" 1 "omp_oacc_kernels_decompose" } } diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-decompose-1-parloops.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-decompose-1-parloops.f95 new file mode 100644 index 00000000000..3ecf84da836 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/goacc/kernels-decompose-1-parloops.f95 @@ -0,0 +1,121 @@ +! Test OpenACC 'kernels' construct decomposition with "decompose-parloops" +! handling + +! { dg-additional-options "--param openacc-kernels=decompose-parloops" } +! { dg-additional-options "-fopt-info-optimized-omp" } +! { dg-additional-options "-Wopenacc-parallelism" } +! { dg-additional-options "-O2" } for "parloops". + +! See also "../../c-c++-common/goacc/kernels-decompose-1.c". 
+ +program main + implicit none + + integer, external :: f_g + !$acc routine (f_g) gang + integer, external :: f_w + !$acc routine (f_w) worker + integer, external :: f_v + !$acc routine (f_v) vector + integer, external :: f_s + !$acc routine (f_s) seq + + integer :: i, j, k + integer :: x, y, z + logical :: y_l + integer, parameter :: N = 10 + integer :: a(N), b(N), c(N) + + !$acc kernels + x = 0 + y = 0 + y_l = x < 10 + z = x + x = x + 1 + !$acc end kernels + + !$acc kernels + do i = 1, N + ! { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } .-1 } + a(i) = 0 + end do + !$acc end kernels + + !$acc kernels loop ! { dg-optimized "assigned OpenACC gang loop parallelism" } + do i = 1, N + b(i) = a(N - i + 1) + end do + + !$acc kernels + !$acc loop ! { dg-optimized "assigned OpenACC gang loop parallelism" } + do i = 1, N + b(i) = a(N - i + 1) + end do + + !$acc loop ! { dg-optimized "assigned OpenACC gang loop parallelism" } + do i = 1, N + c(i) = a(i) * b(i) + end do + + a(z) = 0 + + !$acc loop ! { dg-optimized "assigned OpenACC gang loop parallelism" } + do i = 1, N + c(i) = c(i) + a(i) + end do + + !$acc loop seq ! { dg-optimized "assigned OpenACC seq loop parallelism" } + do i = 1 + 1, N + c(i) = c(i) + c(i - 1) + end do + !$acc end kernels + + !$acc kernels ! { dg-optimized "assigned OpenACC worker vector loop parallelism" } + !$acc loop independent ! { dg-optimized "assigned OpenACC gang loop parallelism" } + do i = 1, N + !$acc loop independent ! { dg-optimized "assigned OpenACC worker loop parallelism" } + do j = 1, N + !$acc loop independent ! { dg-optimized "assigned OpenACC seq loop parallelism" } + ! { dg-warning "insufficient partitioning available to parallelize loop" "" { target *-*-* } .-1 } + ! { dg-bogus "optimized: assigned OpenACC vector loop parallelism" "" { target *-*-* } .-2 } + do k = 1, N + a(1 + mod(i + j + k, N)) & + = b(j) & + + f_v (c(k)) ! 
{ dg-optimized "assigned OpenACC vector loop parallelism" } + end do + end do + end do + + !TODO Should the following turn into "gang-single" instead of "parloops"? + !TODO The problem is that the first STMT is "if (y <= 4) goto ; else goto ;", thus "parloops". + if (y < 5) then + !$acc loop independent + do j = 1, N + b(j) = f_w (c(j)) + end do + end if + !$acc end kernels + + !$acc kernels ! { dg-warning "region contains gang partitioned code but is not gang partitioned" } + y = f_g (a(5)) ! { dg-optimized "assigned OpenACC gang worker vector loop parallelism" } + + !$acc loop independent ! { dg-optimized "assigned OpenACC gang loop parallelism" } + do j = 1, N + b(j) = y + f_w (c(j)) ! { dg-optimized "assigned OpenACC worker vector loop parallelism" } + end do + !$acc end kernels + + !$acc kernels + y = 3 + + !$acc loop independent ! { dg-optimized "assigned OpenACC gang worker loop parallelism" } + do j = 1, N + b(j) = y + f_v (c(j)) ! { dg-optimized "assigned OpenACC vector loop parallelism" } + end do + + z = 2 + !$acc end kernels + + !$acc kernels + !$acc end kernels +end program main diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-decompose-parloops-2.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-decompose-parloops-2.f95 new file mode 100644 index 00000000000..fc126ea5e03 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/goacc/kernels-decompose-parloops-2.f95 @@ -0,0 +1,154 @@ +! Test OpenACC 'kernels' construct decomposition. + +! { dg-additional-options "-fopenacc-kernels-annotate-loops" } +! { dg-additional-options "-fopt-info-omp-all" } +! { dg-additional-options "--param=openacc-kernels=decompose-parloops" } +! { dg-additional-options "-O2" } for 'parloops'. + +! { dg-additional-options "-Wopenacc-parallelism" } for testing/documenting +! aspects of that functionality. + +! See also '../../c-c++-common/goacc/kernels-decompose-2.c'. + +! It's only with Tcl 8.5 (released in 2007) that "the variable 'varName' +! 
passed to 'incr' may be unset, and in that case, it will be set to [...]", +! so to maintain compatibility with earlier Tcl releases, we manually +! initialize counter variables: +! { dg-line l_dummy[variable c_loop_i 0 c_loop_j 0 c_loop_k 0 c_part 0] } +! { dg-message "dummy" "" { target iN-VAl-Id } l_dummy } to avoid +! "WARNING: dg-line var l_dummy defined, but not used". + +program main + implicit none + + integer, external :: f_g + !$acc routine (f_g) gang + integer, external :: f_w + !$acc routine (f_w) worker + integer, external :: f_v + !$acc routine (f_v) vector + integer, external :: f_s + !$acc routine (f_s) seq + + integer :: i, j, k + integer :: x, y, z + logical :: y_l + integer, parameter :: N = 10 + integer :: a(N), b(N), c(N) + + !$acc kernels + x = 0 ! { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" } + y = 0 + y_l = x < 10 + z = x + x = x + 1 + ; + !$acc end kernels + + !$acc kernels + do i = 1, N ! { dg-line l_loop_i[incr c_loop_i] } + ! { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i } + ! { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i } + a(i) = 0 + end do + !$acc end kernels + + !$acc kernels loop ! { dg-line l_loop_i[incr c_loop_i] } + ! { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i } + ! { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i } + do i = 1, N + b(i) = a(N - i + 1) + end do + + !$acc kernels + !$acc loop ! { dg-line l_loop_i[incr c_loop_i] } + ! { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i } + ! { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i } + do i = 1, N + b(i) = a(N - i + 1) + end do + + !$acc loop ! 
{ dg-line l_loop_i[incr c_loop_i] } + ! { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i } + ! { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i } + do i = 1, N + c(i) = a(i) * b(i) + end do + + a(z) = 0 + + !$acc loop ! { dg-line l_loop_i[incr c_loop_i] } + ! { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i } + ! { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i } + do i = 1, N + c(i) = c(i) + a(i) + end do + + !$acc loop seq ! { dg-line l_loop_i[incr c_loop_i] } + ! { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_i$c_loop_i } + ! { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i } + do i = 1 + 1, N + c(i) = c(i) + c(i - 1) + end do + !$acc end kernels + + !$acc kernels ! { dg-optimized "assigned OpenACC worker vector loop parallelism" } + !$acc loop independent ! { dg-line l_loop_i[incr c_loop_i] } + ! { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_i$c_loop_i } + ! { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i } + do i = 1, N + !$acc loop independent ! { dg-line l_loop_j[incr c_loop_j] } + ! { dg-optimized "assigned OpenACC worker loop parallelism" "" { target *-*-* } l_loop_j$c_loop_j } + do j = 1, N + !$acc loop independent ! { dg-line l_loop_k[incr c_loop_k] } + ! { dg-warning "insufficient partitioning available to parallelize loop" "" { target *-*-* } l_loop_k$c_loop_k } + ! { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_k$c_loop_k } + do k = 1, N + a(1 + mod(i + j + k, N)) & + = b(j) & + + f_v (c(k)) ! 
{ dg-optimized "assigned OpenACC vector loop parallelism" } + end do + end do + end do + + !TODO Should the following turn into "gang-single" instead of "parloops"? + !TODO The problem is that the first STMT is 'if (y <= 4) goto ; else goto ;', thus "parloops". + if (y < 5) then ! { dg-message "note: beginning 'parloops' part in OpenACC 'kernels' region" } + !$acc loop independent ! { dg-line l_loop_j[incr c_loop_j] } + ! { dg-missed "unparallelized loop nest in OpenACC 'kernels' region: it's executed conditionally" "" { target *-*-* } l_loop_j$c_loop_j } + do j = 1, N + b(j) = f_w (c(j)) + end do + end if + !$acc end kernels + + !$acc kernels + ! { dg-bogus "\[Ww\]arning: region contains gang partitioned code but is not gang partitioned" "TODO 'kernels'" { xfail *-*-* } .-1 } + y = f_g (a(5)) ! { dg-line l_part[incr c_part] } + !TODO If such a construct is placed in its own part (like it is, here), can't this actually use gang parallelism, instead of "gang-single"? + ! { dg-optimized "assigned OpenACC gang worker vector loop parallelism" "" { target *-*-* } l_part$c_part } + + !$acc loop independent ! { dg-line l_loop_j[incr c_loop_j] } + ! { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_j$c_loop_j } + ! { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } l_loop_j$c_loop_j } + do j = 1, N + b(j) = y + f_w (c(j)) ! { dg-optimized "assigned OpenACC worker vector loop parallelism" } + end do + !$acc end kernels + + !$acc kernels + y = 3 + + !$acc loop independent ! { dg-line l_loop_j[incr c_loop_j] } + ! { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_j$c_loop_j } + ! { dg-optimized "assigned OpenACC gang worker loop parallelism" "" { target *-*-* } l_loop_j$c_loop_j } + do j = 1, N + b(j) = y + f_v (c(j)) !
{ dg-optimized "assigned OpenACC vector loop parallelism" } + end do + + z = 2 + !$acc end kernels + + !$acc kernels + !$acc end kernels +end program main diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-parloops-2.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-parloops-2.f95 new file mode 100644 index 00000000000..c92ad4ccf6f --- /dev/null +++ b/gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-parloops-2.f95 @@ -0,0 +1,52 @@ +! { dg-additional-options "--param=openacc-kernels=decompose-parloops" } as this is +! specifically testing "parloops" handling. +! { dg-additional-options "-O2" } +! { dg-additional-options "-fopenacc-kernels-annotate-loops" } +! { dg-additional-options "-fdump-tree-parloops1-all" } +! { dg-additional-options "-fdump-tree-optimized" } + +program main + implicit none + integer, parameter :: n = 1024 + integer, dimension (0:n-1) :: a, b, c + integer :: i, ii + + !$acc data copyout (a(0:n-1)) + !$acc kernels present (a(0:n-1)) + do i = 0, n - 1 + a(i) = i * 2 + end do + !$acc end kernels + !$acc end data + + !$acc data copyout (b(0:n-1)) + !$acc kernels present (b(0:n-1)) + do i = 0, n -1 + b(i) = i * 4 + end do + !$acc end kernels + !$acc end data + + !$acc data copyin (a(0:n-1), b(0:n-1)) copyout (c(0:n-1)) + !$acc kernels present (a(0:n-1), b(0:n-1), c(0:n-1)) + do ii = 0, n - 1 + c(ii) = a(ii) + b(ii) + end do + !$acc end kernels + !$acc end data + + do i = 0, n - 1 + if (c(i) .ne. a(i) + b(i)) STOP 1 + end do + +end program main + +! Check that only three loops are analyzed, and that all can be parallelized. +! { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops1" } } +! { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 3 "parloops1" } } +! { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } + +! Check that the loop has been split off into a function. +! 
{ dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.0 " 1 "optimized" } } +! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.1 " 1 "optimized" } } +! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.2 " 1 "optimized" } } diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-loop-parloops-2.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-loop-parloops-2.f95 new file mode 100644 index 00000000000..634445ad4a1 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/goacc/kernels-loop-parloops-2.f95 @@ -0,0 +1,45 @@ +! { dg-additional-options "--param openacc-kernels=decompose-parloops" } as this is +! specifically testing "parloops" handling. +! { dg-additional-options "-O2" } +! { dg-additional-options "-fdump-tree-parloops1-all" } +! { dg-additional-options "-fdump-tree-optimized" } + +program main + implicit none + integer, parameter :: n = 1024 + integer, dimension (0:n-1) :: a, b, c + integer :: i, ii + + !$acc kernels copyout (a(0:n-1)) + do i = 0, n - 1 + a(i) = i * 2 + end do + !$acc end kernels + + !$acc kernels copyout (b(0:n-1)) + do i = 0, n -1 + b(i) = i * 4 + end do + !$acc end kernels + + !$acc kernels copyin (a(0:n-1), b(0:n-1)) copyout (c(0:n-1)) + do ii = 0, n - 1 + c(ii) = a(ii) + b(ii) + end do + !$acc end kernels + + do i = 0, n - 1 + if (c(i) .ne. a(i) + b(i)) STOP 1 + end do + +end program main + +! Check that only three loops are analyzed, and that all can be parallelized. +! { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops1" } } +! { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 3 "parloops1" } } +! { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } + +! Check that the loop has been split off into a function. +! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.0 " 1 "optimized" } } +! 
{ dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.1 " 1 "optimized" } } +! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.2 " 1 "optimized" } } diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-loop-parloops.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-loop-parloops.f95 new file mode 100644 index 00000000000..c6fa14f5920 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/goacc/kernels-loop-parloops.f95 @@ -0,0 +1,39 @@ +! { dg-additional-options "--param openacc-kernels=decompose-parloops" } as this is +! specifically testing "parloops" handling. +! { dg-additional-options "-O2" } +! { dg-additional-options "-fdump-tree-parloops1-all" } +! { dg-additional-options "-fdump-tree-optimized" } + +program main + implicit none + integer, parameter :: n = 1024 + integer, dimension (0:n-1) :: a, b, c + integer :: i, ii + + do i = 0, n - 1 + a(i) = i * 2 + end do + + do i = 0, n -1 + b(i) = i * 4 + end do + + !$acc kernels copyin (a(0:n-1), b(0:n-1)) copyout (c(0:n-1)) + do ii = 0, n - 1 + c(ii) = a(ii) + b(ii) + end do + !$acc end kernels + + do i = 0, n - 1 + if (c(i) .ne. a(i) + b(i)) STOP 1 + end do + +end program main + +! Check that only one loop is analyzed, and that it can be parallelized. +! { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } +! { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } +! { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } + +! Check that the loop has been split off into a function. +! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.0 " 1 "optimized" } } diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-reductions.f90 b/gcc/testsuite/gfortran.dg/goacc/kernels-reductions.f90 new file mode 100644 index 00000000000..2036395bf59 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/goacc/kernels-reductions.f90 @@ -0,0 +1,37 @@ +! 
{ dg-additional-options "--param openacc-kernels=decompose" } + +! A regression test checking that the reduction clause lowering does +! not fail if a subroutine argument is used as a reduction variable in +! a kernels region. + +! This was fine ... +subroutine reduction_var_not_argument(res) + real res + real tmp + integer i + + !$acc kernels + !$acc loop reduction(+:tmp) + do i=0,n-1 + tmp = tmp + 1 + end do + !$acc end kernels + + res = tmp +end subroutine reduction_var_not_argument + +! ... but this led to problems because ARG +! was a pointer type that did not get dereferenced. +subroutine reduction_var_as_argument(arg) + real arg + integer i + + !$acc kernels + !$acc loop reduction(+:arg) + do i=0,n-1 + arg = arg + 1 + end do + !$acc end kernels +end subroutine reduction_var_as_argument + + diff --git a/libgomp/testsuite/libgomp.oacc-fortran/parallel-loop-auto-reduction-2.f90 b/libgomp/testsuite/libgomp.oacc-fortran/parallel-loop-auto-reduction-2.f90 new file mode 100644 index 00000000000..0e9da426d99 --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-fortran/parallel-loop-auto-reduction-2.f90 @@ -0,0 +1,98 @@ +! Check that the Graphite-based "auto" loop and "kernels" handling +! is able to assign the parallelism dimensions correctly for a simple +! loop-nest with reductions. All loops should be parallelized. + +! { dg-additional-options "-O2 -g" } +! { dg-additional-options "-foffload=-fdump-tree-oaccloops1-details" } +! { dg-additional-options "-foffload=-fopt-info-optimized" } +! { dg-additional-options "-fdump-tree-oaccloops1-details" } +! { dg-additional-options "-fopt-info-optimized" } + +module test + implicit none + + integer, parameter :: n = 10000 + integer :: a(n,n) + integer :: sums(n,n) + +contains + function sum_loop_auto() result(sum) + integer :: i, j + integer :: sum, max_val + + sum = 0 + max_val = 0 + + !$acc parallel copyin (a) reduction(+:sum) + !$acc loop auto reduction(+:sum) reduction(max:max_val) ! 
{ dg-optimized "assigned OpenACC gang worker loop parallelism" } + ! { dg-optimized ".auto. loop can be parallel" "" { target *-*-* } .-1 } + do i = 1,size (a, 1) + !$acc loop auto reduction(max:max_val) ! { dg-optimized "assigned OpenACC vector loop parallelism" } + ! { dg-optimized ".auto. loop can be parallel" "" { target *-*-* } .-1 } + do j = 1,size(a, 2) + max_val = a(i,j) + end do + sum = sum + max_val + end do + !$acc end parallel + end function sum_loop_auto + + function sum_kernels() result(sum) + integer :: i, j + integer :: sum, max_val + + sum = 0 + max_val = 0 + + !$acc kernels + ! { dg-optimized {'map\(force_tofrom:max_val [^)]+\)' optimized to 'map\(to:max_val [^)]+\)'} "" { target *-*-* } .-1 } + !$acc loop reduction(+:sum) reduction(max:max_val) ! { dg-optimized "assigned OpenACC gang worker loop parallelism" } + ! { dg-optimized ".auto. loop can be parallel" "" { target *-*-* } .-1 } + ! { dg-optimized "forwarded loop nest in OpenACC .kernels. construct to .Graphite." "" { target *-*-* } .-2 } + do i = 1,size (a, 1) + !$acc loop reduction(max:max_val) ! { dg-optimized "assigned OpenACC vector loop parallelism" } + ! { dg-optimized ".auto. loop can be parallel" "" { target *-*-* } .-1 } + do j = 1,size(a, 2) + max_val = a(i,j) + end do + sum = sum + max_val + end do + !$acc end kernels + end function sum_kernels +end module test + +program main + use test + + implicit none + + integer :: result, i, j + + ! We sum the maxima of n rows, each containing numbers + ! 1..n + integer, parameter :: expected_sum = n * n + + do i = 1, size (a, 1) ! { dg-optimized "loop nest optimized" } + do j = 1, size (a, 2) + a(i, j) = j + end do + end do + + + result = sum_loop_auto() + if (result /= expected_sum) then + write (*, *) "Wrong result:", result + call abort() + endif + + result = sum_kernels() + if (result /= expected_sum) then + write (*, *) "Wrong result:", result + call abort() + endif +end program main + +! 
This ensures that the dg-optimized assertions above hold for both +! compilers because the output goes to stderr and the dump file. +! { dg-final { scan-offload-tree-dump-times "optimized: assigned OpenACC .*? parallelism" 4 "oaccloops1" } } +! { dg-final { scan-tree-dump-times "optimized: assigned OpenACC .*? parallelism" 4 "oaccloops1" } }