From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 17138 invoked by alias); 19 May 2017 11:03:13 -0000 Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Archive: List-Post: List-Help: Sender: gcc-patches-owner@gcc.gnu.org Received: (qmail 17111 invoked by uid 89); 19 May 2017 11:03:11 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=-24.5 required=5.0 tests=AWL,BAYES_00,GIT_PATCH_0,GIT_PATCH_1,GIT_PATCH_2,GIT_PATCH_3,RCVD_IN_DNSWL_NONE,SPF_PASS,URIBL_RED autolearn=ham version=3.3.2 spammy=1524 X-HELO: relay1.mentorg.com Received: from relay1.mentorg.com (HELO relay1.mentorg.com) (192.94.38.131) by sourceware.org (qpsmtpd/0.93/v0.84-503-g423c35a) with ESMTP; Fri, 19 May 2017 11:03:08 +0000 Received: from nat-ies.mentorg.com ([192.94.31.2] helo=svr-ies-mbx-01.mgc.mentorg.com) by relay1.mentorg.com with esmtp id 1dBfgb-0005fQ-7E from Thomas_Schwinge@mentor.com ; Fri, 19 May 2017 04:03:09 -0700 Received: from hertz.schwinge.homeip.net (137.202.0.87) by svr-ies-mbx-01.mgc.mentorg.com (139.181.222.1) with Microsoft SMTP Server (TLS) id 15.0.1210.3; Fri, 19 May 2017 12:03:05 +0100 From: Thomas Schwinge To: , Jakub Jelinek Subject: Re: Runtime checking of OpenACC parallelism dimensions clauses In-Reply-To: <87ziej37kq.fsf@hertz.schwinge.homeip.net> References: <87ziej37kq.fsf@hertz.schwinge.homeip.net> User-Agent: Notmuch/0.9-101-g81dad07 (http://notmuchmail.org) Emacs/24.5.1 (x86_64-pc-linux-gnu) Date: Fri, 19 May 2017 11:03:00 -0000 Message-ID: <871srl153s.fsf@hertz.schwinge.homeip.net> MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable X-ClientProxiedBy: svr-ies-mbx-01.mgc.mentorg.com (139.181.222.1) To svr-ies-mbx-01.mgc.mentorg.com (139.181.222.1) X-SW-Source: 2017-05/txt/msg01549.txt.bz2 Hi! Ping. On Thu, 11 May 2017 14:24:05 +0200, I wrote: > OK for trunk? >=20 > commit 0ba48b4faf85420fbe12971afdd6e0afe70778bb > Author: Thomas Schwinge > Date: Fri May 5 16:41:59 2017 +0200 >=20 > Runtime checking of OpenACC parallelism dimensions clauses >=20=20=20=20=20 > libgomp/ > * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Rewrit= e. > * testsuite/lib/libgomp.exp > (check_effective_target_openacc_nvidia_accel_configured): New > proc. > * testsuite/libgomp.oacc-c++/c++.exp (check_effective_target_= c) > (check_effective_target_c++): New procs. > * testsuite/libgomp.oacc-c/c.exp (check_effective_target_c) > (check_effective_target_c++): Likewise. > --- > libgomp/testsuite/lib/libgomp.exp | 12 + > libgomp/testsuite/libgomp.oacc-c++/c++.exp | 7 + > .../libgomp.oacc-c-c++-common/parallel-dims.c | 523 +++++++++++++++= +++++- > libgomp/testsuite/libgomp.oacc-c/c.exp | 7 + > 4 files changed, 537 insertions(+), 12 deletions(-) >=20 > diff --git libgomp/testsuite/lib/libgomp.exp libgomp/testsuite/lib/libgom= p.exp > index 5e47872..62ee2e3 100644 > --- libgomp/testsuite/lib/libgomp.exp > +++ libgomp/testsuite/lib/libgomp.exp > @@ -358,6 +358,18 @@ proc check_effective_target_offload_device_shared_as= { } { > } ] > } >=20=20 > +# Return 1 if configured for nvptx offloading. > + > +proc check_effective_target_openacc_nvidia_accel_configured { } { > + global offload_targets > + if { ![string match "*,nvptx,*" ",$offload_targets,"] } { > + return 0 > + } > + # PR libgomp/65099: Currently, we only support offloading in 64-bit > + # configurations. > + return [is-effective-target lp64] > +} > + > # Return 1 if at least one nvidia board is present. >=20=20 > proc check_effective_target_openacc_nvidia_accel_present { } { > diff --git libgomp/testsuite/libgomp.oacc-c++/c++.exp libgomp/testsuite/l= ibgomp.oacc-c++/c++.exp > index 608b298..9beadd6 100644 > --- libgomp/testsuite/libgomp.oacc-c++/c++.exp > +++ libgomp/testsuite/libgomp.oacc-c++/c++.exp > @@ -4,6 +4,13 @@ load_lib libgomp-dg.exp > load_gcc_lib gcc-dg.exp > load_gcc_lib torture-options.exp >=20=20 > +proc check_effective_target_c { } { > + return 0 > +} > +proc check_effective_target_c++ { } { > + return 1 > +} > + > global shlib_ext >=20=20 > set shlib_ext [get_shlib_extension] > diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c li= bgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c > index f5766a4..d8af546 100644 > --- libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c > +++ libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c > @@ -1,25 +1,524 @@ > -/* { dg-do run { target openacc_nvidia_accel_selected } } */ > +/* OpenACC parallelism dimensions clauses: num_gangs, num_workers, > + vector_length. */ > + > +#include > +#include > + > +/* TODO: "(int) acc_device_*" casts because of the C++ acc_on_device wra= pper > + not behaving as expected for -O0. */ > +#pragma acc routine seq > +static unsigned int __attribute__ ((optimize ("O2"))) acc_gang () > +{ > + if (acc_on_device ((int) acc_device_host)) > + return 0; > + else if (acc_on_device ((int) acc_device_nvidia)) > + { > + unsigned int r; > + asm volatile ("mov.u32 %0,%%ctaid.x;" : "=3Dr" (r)); > + return r; > + } > + else > + __builtin_abort (); > +} > + > +#pragma acc routine seq > +static unsigned int __attribute__ ((optimize ("O2"))) acc_worker () > +{ > + if (acc_on_device ((int) acc_device_host)) > + return 0; > + else if (acc_on_device ((int) acc_device_nvidia)) > + { > + unsigned int r; > + asm volatile ("mov.u32 %0,%%tid.y;" : "=3Dr" (r)); > + return r; > + } > + else > + __builtin_abort (); > +} > + > +#pragma acc routine seq > +static unsigned int __attribute__ ((optimize ("O2"))) acc_vector () > +{ > + if (acc_on_device ((int) acc_device_host)) > + return 0; > + else if (acc_on_device ((int) acc_device_nvidia)) > + { > + unsigned int r; > + asm volatile ("mov.u32 %0,%%tid.x;" : "=3Dr" (r)); > + return r; > + } > + else > + __builtin_abort (); > +} >=20=20 > -/* Worker and vector size checks. Picked an outrageously large > - value. */ >=20=20 > int main () > { > - int dummy[10]; > + acc_init (acc_device_default); >=20=20 > -#pragma acc parallel num_workers (2<<20) /* { dg-error "using num_worker= s" } */ > + /* Non-positive value. */ > + > + /* GR, WS, VS. */ > + { > +#define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" = { target c } } */ > + int gangs_actual =3D GANGS; > + int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vec= tors_max; > + gangs_min =3D workers_min =3D vectors_min =3D INT_MAX; > + gangs_max =3D workers_max =3D vectors_max =3D INT_MIN; > +#pragma acc parallel copy (gangs_actual) \ > + reduction (min: gangs_min, workers_min, vectors_min) reduction (max: g= angs_max, workers_max, vectors_max) \ > + num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive"= "" { target c++ } } */ > + { > + /* We're actually executing with num_gangs (1). */ > + gangs_actual =3D 1; > + for (int i =3D 100 * gangs_actual; i > -100 * gangs_actual; --i) > + { > + /* . */ > +#if 0 > + gangs_min =3D gangs_max =3D acc_gang (); > + workers_min =3D workers_max =3D acc_worker (); > + vectors_min =3D vectors_max =3D acc_vector (); > +#else > + int gangs =3D acc_gang (); > + gangs_min =3D (gangs_min < gangs) ? gangs_min : gangs; > + gangs_max =3D (gangs_max > gangs) ? gangs_max : gangs; > + int workers =3D acc_worker (); > + workers_min =3D (workers_min < workers) ? workers_min : workers; > + workers_max =3D (workers_max > workers) ? workers_max : workers; > + int vectors =3D acc_vector (); > + vectors_min =3D (vectors_min < vectors) ? vectors_min : vectors; > + vectors_max =3D (vectors_max > vectors) ? vectors_max : vectors; > +#endif > + } > + } > + if (gangs_actual !=3D 1) > + __builtin_abort (); > + if (gangs_min !=3D 0 || gangs_max !=3D gangs_actual - 1 > + || workers_min !=3D 0 || workers_max !=3D 0 > + || vectors_min !=3D 0 || vectors_max !=3D 0) > + __builtin_abort (); > +#undef GANGS > + } > + > + /* GP, WS, VS. */ > + { > +#define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" = { target c } } */ > + int gangs_actual =3D GANGS; > + int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vec= tors_max; > + gangs_min =3D workers_min =3D vectors_min =3D INT_MAX; > + gangs_max =3D workers_max =3D vectors_max =3D INT_MIN; > +#pragma acc parallel copy (gangs_actual) \ > + num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive"= "" { target c++ } } */ > + { > + /* We're actually executing with num_gangs (1). */ > + gangs_actual =3D 1; > +#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_mi= n) reduction (max: gangs_max, workers_max, vectors_max) > + for (int i =3D 100 * gangs_actual; i > -100 * gangs_actual; --i) > + { > + gangs_min =3D gangs_max =3D acc_gang (); > + workers_min =3D workers_max =3D acc_worker (); > + vectors_min =3D vectors_max =3D acc_vector (); > + } > + } > + if (gangs_actual !=3D 1) > + __builtin_abort (); > + if (gangs_min !=3D 0 || gangs_max !=3D gangs_actual - 1 > + || workers_min !=3D 0 || workers_max !=3D 0 > + || vectors_min !=3D 0 || vectors_max !=3D 0) > + __builtin_abort (); > +#undef GANGS > + } > + > + /* GR, WP, VS. */ > + { > +#define WORKERS 0 /* { dg-warning "'num_workers' value must be positive"= "" { target c } } */ > + int workers_actual =3D WORKERS; > + int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vec= tors_max; > + gangs_min =3D workers_min =3D vectors_min =3D INT_MAX; > + gangs_max =3D workers_max =3D vectors_max =3D INT_MIN; > +#pragma acc parallel copy (workers_actual) \ > + num_workers (WORKERS) /* { dg-warning "'num_workers' value must be pos= itive" "" { target c++ } } */ > + { > + /* We're actually executing with num_workers (1). */ > + workers_actual =3D 1; > +#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_= min) reduction (max: gangs_max, workers_max, vectors_max) > + for (int i =3D 100 * workers_actual; i > -100 * workers_actual; --= i) > + { > + gangs_min =3D gangs_max =3D acc_gang (); > + workers_min =3D workers_max =3D acc_worker (); > + vectors_min =3D vectors_max =3D acc_vector (); > + } > + } > + if (workers_actual !=3D 1) > + __builtin_abort (); > + if (gangs_min !=3D 0 || gangs_max !=3D 0 > + || workers_min !=3D 0 || workers_max !=3D workers_actual - 1 > + || vectors_min !=3D 0 || vectors_max !=3D 0) > + __builtin_abort (); > +#undef WORKERS > + } > + > + /* GR, WS, VP. */ > + { > +#define VECTORS 0 /* { dg-warning "'vector_length' value must be positiv= e" "" { target c } } */ > + int vectors_actual =3D VECTORS; > + int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vec= tors_max; > + gangs_min =3D workers_min =3D vectors_min =3D INT_MAX; > + gangs_max =3D workers_max =3D vectors_max =3D INT_MIN; > +#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector= _length \\(32\\), ignoring 1" "" { target openacc_nvidia_accel_configured }= } */ \ > + vector_length (VECTORS) /* { dg-warning "'vector_length' value must be= positive" "" { target c++ } } */ > + { > + /* We're actually executing with vector_length (1), just the GCC n= vptx > + back end enforces vector_length (32). */ > + if (acc_on_device (acc_device_nvidia)) > + vectors_actual =3D 32; > + else > + vectors_actual =3D 1; > +#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_= min) reduction (max: gangs_max, workers_max, vectors_max) > + for (int i =3D 100 * vectors_actual; i > -100 * vectors_actual; --= i) > + { > + gangs_min =3D gangs_max =3D acc_gang (); > + workers_min =3D workers_max =3D acc_worker (); > + vectors_min =3D vectors_max =3D acc_vector (); > + } > + } > + if (acc_get_device_type () =3D=3D acc_device_nvidia) > + { > + if (vectors_actual !=3D 32) > + __builtin_abort (); > + } > + else > + if (vectors_actual !=3D 1) > + __builtin_abort (); > + if (gangs_min !=3D 0 || gangs_max !=3D 0 > + || workers_min !=3D 0 || workers_max !=3D 0 > + || vectors_min !=3D 0 || vectors_max !=3D vectors_actual - 1) > + __builtin_abort (); > +#undef VECTORS > + } > + > + > + /* High value. */ > +=20=20 > + /* GR, WS, VS. */ > + { > + /* There is no actual limit for the number of gangs, so we try with a > + rather high value. */ > + int gangs =3D 12345; > + int gangs_actual =3D gangs; > + int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vec= tors_max; > + gangs_min =3D workers_min =3D vectors_min =3D INT_MAX; > + gangs_max =3D workers_max =3D vectors_max =3D INT_MIN; > +#pragma acc parallel copy (gangs_actual) \ > + reduction (min: gangs_min, workers_min, vectors_min) reduction (max: g= angs_max, workers_max, vectors_max) \ > + num_gangs (gangs) > + { > + if (acc_on_device (acc_device_host)) > + { > + /* We're actually executing with num_gangs (1). */ > + gangs_actual =3D 1; > + } > + /* As we're executing GR not GP, don't multiply with a "gangs_actu= al" > + factor. */ > + for (int i =3D 100 /* * gangs_actual */; i > -100 /* * gangs_actua= l */; --i) > + { > + gangs_min =3D gangs_max =3D acc_gang (); > + workers_min =3D workers_max =3D acc_worker (); > + vectors_min =3D vectors_max =3D acc_vector (); > + } > + } > + if (gangs_actual < 1) > + __builtin_abort (); > + if (gangs_min !=3D 0 || gangs_max !=3D gangs_actual - 1 > + || workers_min !=3D 0 || workers_max !=3D 0 > + || vectors_min !=3D 0 || vectors_max !=3D 0) > + __builtin_abort (); > + } > + > + /* GP, WS, VS. */ > + { > + /* There is no actual limit for the number of gangs, so we try with a > + rather high value. */ > + int gangs =3D 12345; > + int gangs_actual =3D gangs; > + int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vec= tors_max; > + gangs_min =3D workers_min =3D vectors_min =3D INT_MAX; > + gangs_max =3D workers_max =3D vectors_max =3D INT_MIN; > +#pragma acc parallel copy (gangs_actual) \ > + num_gangs (gangs) > + { > + if (acc_on_device (acc_device_host)) > + { > + /* We're actually executing with num_gangs (1). */ > + gangs_actual =3D 1; > + } > +#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_mi= n) reduction (max: gangs_max, workers_max, vectors_max) > + for (int i =3D 100 * gangs_actual; i > -100 * gangs_actual; --i) > + { > + gangs_min =3D gangs_max =3D acc_gang (); > + workers_min =3D workers_max =3D acc_worker (); > + vectors_min =3D vectors_max =3D acc_vector (); > + } > + } > + if (gangs_actual < 1) > + __builtin_abort (); > + if (gangs_min !=3D 0 || gangs_max !=3D gangs_actual - 1 > + || workers_min !=3D 0 || workers_max !=3D 0 > + || vectors_min !=3D 0 || vectors_max !=3D 0) > + __builtin_abort (); > + } > + > + /* GR, WP, VS. */ > + { > + /* We try with an outrageously large value. */ > +#define WORKERS 2 << 20 > + int workers_actual =3D WORKERS; > + int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vec= tors_max; > + gangs_min =3D workers_min =3D vectors_min =3D INT_MAX; > + gangs_max =3D workers_max =3D vectors_max =3D INT_MIN; > +#pragma acc parallel copy (workers_actual) /* { dg-warning "using num_wo= rkers \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_configur= ed } } */ \ > + num_workers (WORKERS) > + { > + if (acc_on_device (acc_device_host)) > + { > + /* We're actually executing with num_workers (1). */ > + workers_actual =3D 1; > + } > + else if (acc_on_device (acc_device_nvidia)) > + { > + /* The GCC nvptx back end enforces num_workers (32). */ > + workers_actual =3D 32; > + } > + else > + __builtin_abort (); > +#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_= min) reduction (max: gangs_max, workers_max, vectors_max) > + for (int i =3D 100 * workers_actual; i > -100 * workers_actual; --= i) > + { > + gangs_min =3D gangs_max =3D acc_gang (); > + workers_min =3D workers_max =3D acc_worker (); > + vectors_min =3D vectors_max =3D acc_vector (); > + } > + } > + if (workers_actual < 1) > + __builtin_abort (); > + if (gangs_min !=3D 0 || gangs_max !=3D 0 > + || workers_min !=3D 0 || workers_max !=3D workers_actual - 1 > + || vectors_min !=3D 0 || vectors_max !=3D 0) > + __builtin_abort (); > +#undef WORKERS > + } > + > + /* GR, WP, VS. */ > + { > + /* We try with an outrageously large value. */ > + int workers =3D 2 << 20; > + /* For nvptx offloading, this one will not result in "using num_work= ers > + (32), ignoring runtime setting", and will in fact try to launch w= ith > + "num_workers (workers)", which will run into "libgomp: cuLaunchKe= rnel > + error: invalid argument". So, limit ourselves here. */ > + if (acc_get_device_type () =3D=3D acc_device_nvidia) > + workers =3D 32; > + int workers_actual =3D workers; > + int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vec= tors_max; > + gangs_min =3D workers_min =3D vectors_min =3D INT_MAX; > + gangs_max =3D workers_max =3D vectors_max =3D INT_MIN; > +#pragma acc parallel copy (workers_actual) \ > + num_workers (workers) > + { > + if (acc_on_device (acc_device_host)) > + { > + /* We're actually executing with num_workers (1). */ > + workers_actual =3D 1; > + } > + else if (acc_on_device (acc_device_nvidia)) > + { > + /* We're actually executing with num_workers (32). */ > + /* workers_actual =3D 32; */ > + } > + else > + __builtin_abort (); > +#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_= min) reduction (max: gangs_max, workers_max, vectors_max) > + for (int i =3D 100 * workers_actual; i > -100 * workers_actual; --= i) > + { > + gangs_min =3D gangs_max =3D acc_gang (); > + workers_min =3D workers_max =3D acc_worker (); > + vectors_min =3D vectors_max =3D acc_vector (); > + } > + } > + if (workers_actual < 1) > + __builtin_abort (); > + if (gangs_min !=3D 0 || gangs_max !=3D 0 > + || workers_min !=3D 0 || workers_max !=3D workers_actual - 1 > + || vectors_min !=3D 0 || vectors_max !=3D 0) > + __builtin_abort (); > + } > + > + /* GR, WS, VP. */ > { > -#pragma acc loop worker > - for (int i =3D 0; i < 10; i++) > - dummy[i] =3D i; > + /* We try with an outrageously large value. */ > +#define VECTORS 2 << 20 > + int vectors_actual =3D VECTORS; > + int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vec= tors_max; > + gangs_min =3D workers_min =3D vectors_min =3D INT_MAX; > + gangs_max =3D workers_max =3D vectors_max =3D INT_MIN; > +#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector= _length \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_config= ured } } */ \ > + vector_length (VECTORS) > + { > + if (acc_on_device (acc_device_host)) > + { > + /* We're actually executing with vector_length (1). */ > + vectors_actual =3D 1; > + } > + else if (acc_on_device (acc_device_nvidia)) > + { > + /* The GCC nvptx back end enforces vector_length (32). */ > + vectors_actual =3D 32; > + } > + else > + __builtin_abort (); > +#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_= min) reduction (max: gangs_max, workers_max, vectors_max) > + for (int i =3D 100 * vectors_actual; i > -100 * vectors_actual; --= i) > + { > + gangs_min =3D gangs_max =3D acc_gang (); > + workers_min =3D workers_max =3D acc_worker (); > + vectors_min =3D vectors_max =3D acc_vector (); > + } > + } > + if (vectors_actual < 1) > + __builtin_abort (); > + if (gangs_min !=3D 0 || gangs_max !=3D 0 > + || workers_min !=3D 0 || workers_max !=3D 0 > + || vectors_min !=3D 0 || vectors_max !=3D vectors_actual - 1) > + __builtin_abort (); > +#undef VECTORS > } >=20=20 > -#pragma acc parallel vector_length (2<<20) /* { dg-error "using vector_l= ength" } */ > + /* GR, WS, VP. */ > { > -#pragma acc loop vector > - for (int i =3D 0; i < 10; i++) > - dummy[i] =3D i; > + /* We try with an outrageously large value. */ > + int vectors =3D 2 << 20; > + int vectors_actual =3D vectors; > + int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vec= tors_max; > + gangs_min =3D workers_min =3D vectors_min =3D INT_MAX; > + gangs_max =3D workers_max =3D vectors_max =3D INT_MIN; > +#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector= _length \\(32\\), ignoring runtime setting" "" { target openacc_nvidia_acce= l_configured } } */ \ > + vector_length (vectors) > + { > + if (acc_on_device (acc_device_host)) > + { > + /* We're actually executing with vector_length (1). */ > + vectors_actual =3D 1; > + } > + else if (acc_on_device (acc_device_nvidia)) > + { > + /* The GCC nvptx back end enforces vector_length (32). */ > + vectors_actual =3D 32; > + } > + else > + __builtin_abort (); > +#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_= min) reduction (max: gangs_max, workers_max, vectors_max) > + for (int i =3D 100 * vectors_actual; i > -100 * vectors_actual; --= i) > + { > + gangs_min =3D gangs_max =3D acc_gang (); > + workers_min =3D workers_max =3D acc_worker (); > + vectors_min =3D vectors_max =3D acc_vector (); > + } > + } > + if (vectors_actual < 1) > + __builtin_abort (); > + if (gangs_min !=3D 0 || gangs_max !=3D 0 > + || workers_min !=3D 0 || workers_max !=3D 0 > + || vectors_min !=3D 0 || vectors_max !=3D vectors_actual - 1) > + __builtin_abort (); > } >=20=20 > + > + /* Composition of GP, WP, VP. */ > + { > + int gangs =3D 12345; > + /* With nvptx offloading, multi-level reductions apparently are very= slow > + in the following case. So, limit ourselves here. */ > + if (acc_get_device_type () =3D=3D acc_device_nvidia) > + gangs =3D 3; > + int gangs_actual =3D gangs; > +#define WORKERS 3 > + int workers_actual =3D WORKERS; > +#define VECTORS 11 > + int vectors_actual =3D VECTORS; > + int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vec= tors_max; > + gangs_min =3D workers_min =3D vectors_min =3D INT_MAX; > + gangs_max =3D workers_max =3D vectors_max =3D INT_MIN; > +#pragma acc parallel copy (gangs_actual, workers_actual, vectors_actual)= /* { dg-warning "using vector_length \\(32\\), ignoring 11" "" { target op= enacc_nvidia_accel_configured } } */ \ > + num_gangs (gangs) \ > + num_workers (WORKERS) \ > + vector_length (VECTORS) > + { > + if (acc_on_device (acc_device_host)) > + { > + /* We're actually executing with num_gangs (1), num_workers (1), > + vector_length (1). */ > + gangs_actual =3D 1; > + workers_actual =3D 1; > + vectors_actual =3D 1; > + } > + else if (acc_on_device (acc_device_nvidia)) > + { > + /* The GCC nvptx back end enforces vector_length (32). */ > + vectors_actual =3D 32; > + } > + else > + __builtin_abort (); > +#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_mi= n) reduction (max: gangs_max, workers_max, vectors_max) > + for (int i =3D 100 * gangs_actual; i > -100 * gangs_actual; --i) > +#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_= min) reduction (max: gangs_max, workers_max, vectors_max) > + for (int j =3D 100 * workers_actual; j > -100 * workers_actual; --j) > +#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_= min) reduction (max: gangs_max, workers_max, vectors_max) > + for (int k =3D 100 * vectors_actual; k > -100 * vectors_actual; --k) > + { > + gangs_min =3D gangs_max =3D acc_gang (); > + workers_min =3D workers_max =3D acc_worker (); > + vectors_min =3D vectors_max =3D acc_vector (); > + } > + } > + if (gangs_min !=3D 0 || gangs_max !=3D gangs_actual - 1 > + || workers_min !=3D 0 || workers_max !=3D workers_actual - 1 > + || vectors_min !=3D 0 || vectors_max !=3D vectors_actual - 1) > + __builtin_abort (); > +#undef VECTORS > +#undef WORKERS > + } > + > + > + /* We can't test parallelized OpenACC kernels constructs in this way: = use of > + the acc_gang, acc_worker, acc_vector functions will make the constr= uct > + unparallelizable. */ > + > + > + /* Unparallelized OpenACC kernels constructs must get launched as 1 x = 1 x 1 > + kernels. */ > + { > + int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vec= tors_max; > + gangs_min =3D workers_min =3D vectors_min =3D INT_MAX; > + gangs_max =3D workers_max =3D vectors_max =3D INT_MIN; > +#pragma acc kernels > + { > + /* This is to make the OpenACC kernels construct unparallelizable.= */ > + asm volatile ("" : : : "memory"); > + > +#pragma acc loop reduction (min: gangs_min, workers_min, vectors_min) re= duction (max: gangs_max, workers_max, vectors_max) > + for (int i =3D 100; i > -100; --i) > + { > + gangs_min =3D gangs_max =3D acc_gang (); > + workers_min =3D workers_max =3D acc_worker (); > + vectors_min =3D vectors_max =3D acc_vector (); > + } > + } > + if (gangs_min !=3D 0 || gangs_max !=3D 1 - 1 > + || workers_min !=3D 0 || workers_max !=3D 1 - 1 > + || vectors_min !=3D 0 || vectors_max !=3D 1 - 1) > + __builtin_abort (); > + } > + > + > return 0; > } > diff --git libgomp/testsuite/libgomp.oacc-c/c.exp libgomp/testsuite/libgo= mp.oacc-c/c.exp > index b509a10..4475bf5 100644 > --- libgomp/testsuite/libgomp.oacc-c/c.exp > +++ libgomp/testsuite/libgomp.oacc-c/c.exp > @@ -15,6 +15,13 @@ load_lib libgomp-dg.exp > load_gcc_lib gcc-dg.exp > load_gcc_lib torture-options.exp >=20=20 > +proc check_effective_target_c { } { > + return 1 > +} > +proc check_effective_target_c++ { } { > + return 0 > +} > + > # Initialize dg. > dg-init > torture-init Gr=C3=BC=C3=9Fe Thomas