From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <gcc-patches-return-454039-listarch-gcc-patches=gcc.gnu.org@gcc.gnu.org>
Received: (qmail 17138 invoked by alias); 19 May 2017 11:03:13 -0000
Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm
Precedence: bulk
List-Id: <gcc-patches.gcc.gnu.org>
List-Archive: <http://gcc.gnu.org/ml/gcc-patches/>
List-Post: <mailto:gcc-patches@gcc.gnu.org>
List-Help: <mailto:gcc-patches-help@gcc.gnu.org>
Sender: gcc-patches-owner@gcc.gnu.org
Received: (qmail 17111 invoked by uid 89); 19 May 2017 11:03:11 -0000
Authentication-Results: sourceware.org; auth=none
X-Virus-Found: No
X-Spam-SWARE-Status: No, score=-24.5 required=5.0 tests=AWL,BAYES_00,GIT_PATCH_0,GIT_PATCH_1,GIT_PATCH_2,GIT_PATCH_3,RCVD_IN_DNSWL_NONE,SPF_PASS,URIBL_RED autolearn=ham version=3.3.2 spammy=1524
X-HELO: relay1.mentorg.com
Received: from relay1.mentorg.com (HELO relay1.mentorg.com) (192.94.38.131) by sourceware.org (qpsmtpd/0.93/v0.84-503-g423c35a) with ESMTP; Fri, 19 May 2017 11:03:08 +0000
Received: from nat-ies.mentorg.com ([192.94.31.2] helo=svr-ies-mbx-01.mgc.mentorg.com)	by relay1.mentorg.com with esmtp 	id 1dBfgb-0005fQ-7E from Thomas_Schwinge@mentor.com ; Fri, 19 May 2017 04:03:09 -0700
Received: from hertz.schwinge.homeip.net (137.202.0.87) by svr-ies-mbx-01.mgc.mentorg.com (139.181.222.1) with Microsoft SMTP Server (TLS) id 15.0.1210.3; Fri, 19 May 2017 12:03:05 +0100
From: Thomas Schwinge <thomas@codesourcery.com>
To: <gcc-patches@gcc.gnu.org>, Jakub Jelinek <jakub@redhat.com>
Subject: Re: Runtime checking of OpenACC parallelism dimensions clauses
In-Reply-To: <87ziej37kq.fsf@hertz.schwinge.homeip.net>
References: <87ziej37kq.fsf@hertz.schwinge.homeip.net>
User-Agent: Notmuch/0.9-101-g81dad07 (http://notmuchmail.org) Emacs/24.5.1 (x86_64-pc-linux-gnu)
Date: Fri, 19 May 2017 11:03:00 -0000
Message-ID: <871srl153s.fsf@hertz.schwinge.homeip.net>
MIME-Version: 1.0
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: quoted-printable
X-ClientProxiedBy: svr-ies-mbx-01.mgc.mentorg.com (139.181.222.1) To svr-ies-mbx-01.mgc.mentorg.com (139.181.222.1)
X-SW-Source: 2017-05/txt/msg01549.txt.bz2

Hi!

Ping.

On Thu, 11 May 2017 14:24:05 +0200, I wrote:
> OK for trunk?
>=20
> commit 0ba48b4faf85420fbe12971afdd6e0afe70778bb
> Author: Thomas Schwinge <thomas@codesourcery.com>
> Date:   Fri May 5 16:41:59 2017 +0200
>=20
>     Runtime checking of OpenACC parallelism dimensions clauses
>=20=20=20=20=20
>             libgomp/
>             * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Rewrit=
e.
>             * testsuite/lib/libgomp.exp
>             (check_effective_target_openacc_nvidia_accel_configured): New
>             proc.
>             * testsuite/libgomp.oacc-c++/c++.exp (check_effective_target_=
c)
>             (check_effective_target_c++): New procs.
>             * testsuite/libgomp.oacc-c/c.exp (check_effective_target_c)
>             (check_effective_target_c++): Likewise.
> ---
>  libgomp/testsuite/lib/libgomp.exp                  |  12 +
>  libgomp/testsuite/libgomp.oacc-c++/c++.exp         |   7 +
>  .../libgomp.oacc-c-c++-common/parallel-dims.c      | 523 +++++++++++++++=
+++++-
>  libgomp/testsuite/libgomp.oacc-c/c.exp             |   7 +
>  4 files changed, 537 insertions(+), 12 deletions(-)
>=20
> diff --git libgomp/testsuite/lib/libgomp.exp libgomp/testsuite/lib/libgom=
p.exp
> index 5e47872..62ee2e3 100644
> --- libgomp/testsuite/lib/libgomp.exp
> +++ libgomp/testsuite/lib/libgomp.exp
> @@ -358,6 +358,18 @@ proc check_effective_target_offload_device_shared_as=
 { } {
>      } ]
>  }
>=20=20
> +# Return 1 if configured for nvptx offloading.
> +
> +proc check_effective_target_openacc_nvidia_accel_configured { } {
> +    global offload_targets
> +    if { ![string match "*,nvptx,*" ",$offload_targets,"] } {
> +        return 0
> +    }
> +    # PR libgomp/65099: Currently, we only support offloading in 64-bit
> +    # configurations.
> +    return [is-effective-target lp64]
> +}
> +
>  # Return 1 if at least one nvidia board is present.
>=20=20
>  proc check_effective_target_openacc_nvidia_accel_present { } {
> diff --git libgomp/testsuite/libgomp.oacc-c++/c++.exp libgomp/testsuite/l=
ibgomp.oacc-c++/c++.exp
> index 608b298..9beadd6 100644
> --- libgomp/testsuite/libgomp.oacc-c++/c++.exp
> +++ libgomp/testsuite/libgomp.oacc-c++/c++.exp
> @@ -4,6 +4,13 @@ load_lib libgomp-dg.exp
>  load_gcc_lib gcc-dg.exp
>  load_gcc_lib torture-options.exp
>=20=20
> +proc check_effective_target_c { } {
> +    return 0
> +}
> +proc check_effective_target_c++ { } {
> +    return 1
> +}
> +
>  global shlib_ext
>=20=20
>  set shlib_ext [get_shlib_extension]
> diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c li=
bgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
> index f5766a4..d8af546 100644
> --- libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
> +++ libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
> @@ -1,25 +1,524 @@
> -/* { dg-do run { target openacc_nvidia_accel_selected } } */
> +/* OpenACC parallelism dimensions clauses: num_gangs, num_workers,
> +   vector_length.  */
> +
> +#include <limits.h>
> +#include <openacc.h>
> +
> +/* TODO: "(int) acc_device_*" casts because of the C++ acc_on_device wra=
pper
> +   not behaving as expected for -O0.  */
> +#pragma acc routine seq
> +static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
> +{
> +  if (acc_on_device ((int) acc_device_host))
> +    return 0;
> +  else if (acc_on_device ((int) acc_device_nvidia))
> +    {
> +      unsigned int r;
> +      asm volatile ("mov.u32 %0,%%ctaid.x;" : "=3Dr" (r));
> +      return r;
> +    }
> +  else
> +    __builtin_abort ();
> +}
> +
> +#pragma acc routine seq
> +static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
> +{
> +  if (acc_on_device ((int) acc_device_host))
> +    return 0;
> +  else if (acc_on_device ((int) acc_device_nvidia))
> +    {
> +      unsigned int r;
> +      asm volatile ("mov.u32 %0,%%tid.y;" : "=3Dr" (r));
> +      return r;
> +    }
> +  else
> +    __builtin_abort ();
> +}
> +
> +#pragma acc routine seq
> +static unsigned int __attribute__ ((optimize ("O2"))) acc_vector ()
> +{
> +  if (acc_on_device ((int) acc_device_host))
> +    return 0;
> +  else if (acc_on_device ((int) acc_device_nvidia))
> +    {
> +      unsigned int r;
> +      asm volatile ("mov.u32 %0,%%tid.x;" : "=3Dr" (r));
> +      return r;
> +    }
> +  else
> +    __builtin_abort ();
> +}
>=20=20
> -/* Worker and vector size checks.  Picked an outrageously large
> -   value. */
>=20=20
>  int main ()
>  {
> -  int dummy[10];
> +  acc_init (acc_device_default);
>=20=20
> -#pragma acc parallel num_workers (2<<20) /* { dg-error "using num_worker=
s" } */
> +  /* Non-positive value.  */
> +
> +  /* GR, WS, VS.  */
> +  {
> +#define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" =
{ target c } } */
> +    int gangs_actual =3D GANGS;
> +    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vec=
tors_max;
> +    gangs_min =3D workers_min =3D vectors_min =3D INT_MAX;
> +    gangs_max =3D workers_max =3D vectors_max =3D INT_MIN;
> +#pragma acc parallel copy (gangs_actual) \
> +  reduction (min: gangs_min, workers_min, vectors_min) reduction (max: g=
angs_max, workers_max, vectors_max) \
> +  num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive"=
 "" { target c++ } } */
> +    {
> +      /* We're actually executing with num_gangs (1).  */
> +      gangs_actual =3D 1;
> +      for (int i =3D 100 * gangs_actual; i > -100 * gangs_actual; --i)
> +	{
> +	  /* <https://gcc.gnu.org/PR80547>.  */
> +#if 0
> +	  gangs_min =3D gangs_max =3D acc_gang ();
> +	  workers_min =3D workers_max =3D acc_worker ();
> +	  vectors_min =3D vectors_max =3D acc_vector ();
> +#else
> +	  int gangs =3D acc_gang ();
> +	  gangs_min =3D (gangs_min < gangs) ? gangs_min : gangs;
> +	  gangs_max =3D (gangs_max > gangs) ? gangs_max : gangs;
> +	  int workers =3D acc_worker ();
> +	  workers_min =3D (workers_min < workers) ? workers_min : workers;
> +	  workers_max =3D (workers_max > workers) ? workers_max : workers;
> +	  int vectors =3D acc_vector ();
> +	  vectors_min =3D (vectors_min < vectors) ? vectors_min : vectors;
> +	  vectors_max =3D (vectors_max > vectors) ? vectors_max : vectors;
> +#endif
> +	}
> +    }
> +    if (gangs_actual !=3D 1)
> +      __builtin_abort ();
> +    if (gangs_min !=3D 0 || gangs_max !=3D gangs_actual - 1
> +	|| workers_min !=3D 0 || workers_max !=3D 0
> +	|| vectors_min !=3D 0 || vectors_max !=3D 0)
> +      __builtin_abort ();
> +#undef GANGS
> +  }
> +
> +  /* GP, WS, VS.  */
> +  {
> +#define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" =
{ target c } } */
> +    int gangs_actual =3D GANGS;
> +    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vec=
tors_max;
> +    gangs_min =3D workers_min =3D vectors_min =3D INT_MAX;
> +    gangs_max =3D workers_max =3D vectors_max =3D INT_MIN;
> +#pragma acc parallel copy (gangs_actual) \
> +  num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive"=
 "" { target c++ } } */
> +    {
> +      /* We're actually executing with num_gangs (1).  */
> +      gangs_actual =3D 1;
> +#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_mi=
n) reduction (max: gangs_max, workers_max, vectors_max)
> +      for (int i =3D 100 * gangs_actual; i > -100 * gangs_actual; --i)
> +	{
> +	  gangs_min =3D gangs_max =3D acc_gang ();
> +	  workers_min =3D workers_max =3D acc_worker ();
> +	  vectors_min =3D vectors_max =3D acc_vector ();
> +	}
> +    }
> +    if (gangs_actual !=3D 1)
> +      __builtin_abort ();
> +    if (gangs_min !=3D 0 || gangs_max !=3D gangs_actual - 1
> +	|| workers_min !=3D 0 || workers_max !=3D 0
> +	|| vectors_min !=3D 0 || vectors_max !=3D 0)
> +      __builtin_abort ();
> +#undef GANGS
> +  }
> +
> +  /* GR, WP, VS.  */
> +  {
> +#define WORKERS 0 /* { dg-warning "'num_workers' value must be positive"=
 "" { target c } } */
> +    int workers_actual =3D WORKERS;
> +    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vec=
tors_max;
> +    gangs_min =3D workers_min =3D vectors_min =3D INT_MAX;
> +    gangs_max =3D workers_max =3D vectors_max =3D INT_MIN;
> +#pragma acc parallel copy (workers_actual) \
> +  num_workers (WORKERS) /* { dg-warning "'num_workers' value must be pos=
itive" "" { target c++ } } */
> +    {
> +      /* We're actually executing with num_workers (1).  */
> +      workers_actual =3D 1;
> +#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_=
min) reduction (max: gangs_max, workers_max, vectors_max)
> +      for (int i =3D 100 * workers_actual; i > -100 * workers_actual; --=
i)
> +	{
> +	  gangs_min =3D gangs_max =3D acc_gang ();
> +	  workers_min =3D workers_max =3D acc_worker ();
> +	  vectors_min =3D vectors_max =3D acc_vector ();
> +	}
> +    }
> +    if (workers_actual !=3D 1)
> +      __builtin_abort ();
> +    if (gangs_min !=3D 0 || gangs_max !=3D 0
> +	|| workers_min !=3D 0 || workers_max !=3D workers_actual - 1
> +	|| vectors_min !=3D 0 || vectors_max !=3D 0)
> +      __builtin_abort ();
> +#undef WORKERS
> +  }
> +
> +  /* GR, WS, VP.  */
> +  {
> +#define VECTORS 0 /* { dg-warning "'vector_length' value must be positiv=
e" "" { target c } } */
> +    int vectors_actual =3D VECTORS;
> +    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vec=
tors_max;
> +    gangs_min =3D workers_min =3D vectors_min =3D INT_MAX;
> +    gangs_max =3D workers_max =3D vectors_max =3D INT_MIN;
> +#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector=
_length \\(32\\), ignoring 1" "" { target openacc_nvidia_accel_configured }=
 } */ \
> +  vector_length (VECTORS) /* { dg-warning "'vector_length' value must be=
 positive" "" { target c++ } } */
> +    {
> +      /* We're actually executing with vector_length (1), just the GCC n=
vptx
> +	 back end enforces vector_length (32).  */
> +      if (acc_on_device (acc_device_nvidia))
> +	vectors_actual =3D 32;
> +      else
> +	vectors_actual =3D 1;
> +#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_=
min) reduction (max: gangs_max, workers_max, vectors_max)
> +      for (int i =3D 100 * vectors_actual; i > -100 * vectors_actual; --=
i)
> +	{
> +	  gangs_min =3D gangs_max =3D acc_gang ();
> +	  workers_min =3D workers_max =3D acc_worker ();
> +	  vectors_min =3D vectors_max =3D acc_vector ();
> +	}
> +    }
> +    if (acc_get_device_type () =3D=3D acc_device_nvidia)
> +      {
> +	if (vectors_actual !=3D 32)
> +	  __builtin_abort ();
> +      }
> +    else
> +      if (vectors_actual !=3D 1)
> +	__builtin_abort ();
> +    if (gangs_min !=3D 0 || gangs_max !=3D 0
> +	|| workers_min !=3D 0 || workers_max !=3D 0
> +	|| vectors_min !=3D 0 || vectors_max !=3D vectors_actual - 1)
> +      __builtin_abort ();
> +#undef VECTORS
> +  }
> +
> +
> +  /* High value.  */
> +=20=20
> +  /* GR, WS, VS.  */
> +  {
> +    /* There is no actual limit for the number of gangs, so we try with a
> +       rather high value.  */
> +    int gangs =3D 12345;
> +    int gangs_actual =3D gangs;
> +    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vec=
tors_max;
> +    gangs_min =3D workers_min =3D vectors_min =3D INT_MAX;
> +    gangs_max =3D workers_max =3D vectors_max =3D INT_MIN;
> +#pragma acc parallel copy (gangs_actual) \
> +  reduction (min: gangs_min, workers_min, vectors_min) reduction (max: g=
angs_max, workers_max, vectors_max) \
> +  num_gangs (gangs)
> +    {
> +      if (acc_on_device (acc_device_host))
> +	{
> +	  /* We're actually executing with num_gangs (1).  */
> +	  gangs_actual =3D 1;
> +	}
> +      /* As we're executing GR not GP, don't multiply with a "gangs_actu=
al"
> +	 factor.  */
> +      for (int i =3D 100 /* * gangs_actual */; i > -100 /* * gangs_actua=
l */; --i)
> +	{
> +	  gangs_min =3D gangs_max =3D acc_gang ();
> +	  workers_min =3D workers_max =3D acc_worker ();
> +	  vectors_min =3D vectors_max =3D acc_vector ();
> +	}
> +    }
> +    if (gangs_actual < 1)
> +      __builtin_abort ();
> +    if (gangs_min !=3D 0 || gangs_max !=3D gangs_actual - 1
> +	|| workers_min !=3D 0 || workers_max !=3D 0
> +	|| vectors_min !=3D 0 || vectors_max !=3D 0)
> +      __builtin_abort ();
> +  }
> +
> +  /* GP, WS, VS.  */
> +  {
> +    /* There is no actual limit for the number of gangs, so we try with a
> +       rather high value.  */
> +    int gangs =3D 12345;
> +    int gangs_actual =3D gangs;
> +    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vec=
tors_max;
> +    gangs_min =3D workers_min =3D vectors_min =3D INT_MAX;
> +    gangs_max =3D workers_max =3D vectors_max =3D INT_MIN;
> +#pragma acc parallel copy (gangs_actual) \
> +  num_gangs (gangs)
> +    {
> +      if (acc_on_device (acc_device_host))
> +	{
> +	  /* We're actually executing with num_gangs (1).  */
> +	  gangs_actual =3D 1;
> +	}
> +#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_mi=
n) reduction (max: gangs_max, workers_max, vectors_max)
> +      for (int i =3D 100 * gangs_actual; i > -100 * gangs_actual; --i)
> +	{
> +	  gangs_min =3D gangs_max =3D acc_gang ();
> +	  workers_min =3D workers_max =3D acc_worker ();
> +	  vectors_min =3D vectors_max =3D acc_vector ();
> +	}
> +    }
> +    if (gangs_actual < 1)
> +      __builtin_abort ();
> +    if (gangs_min !=3D 0 || gangs_max !=3D gangs_actual - 1
> +	|| workers_min !=3D 0 || workers_max !=3D 0
> +	|| vectors_min !=3D 0 || vectors_max !=3D 0)
> +      __builtin_abort ();
> +  }
> +
> +  /* GR, WP, VS.  */
> +  {
> +    /* We try with an outrageously large value. */
> +#define WORKERS 2 << 20
> +    int workers_actual =3D WORKERS;
> +    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vec=
tors_max;
> +    gangs_min =3D workers_min =3D vectors_min =3D INT_MAX;
> +    gangs_max =3D workers_max =3D vectors_max =3D INT_MIN;
> +#pragma acc parallel copy (workers_actual) /* { dg-warning "using num_wo=
rkers \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_configur=
ed } } */ \
> +  num_workers (WORKERS)
> +    {
> +      if (acc_on_device (acc_device_host))
> +	{
> +	  /* We're actually executing with num_workers (1).  */
> +	  workers_actual =3D 1;
> +	}
> +      else if (acc_on_device (acc_device_nvidia))
> +	{
> +	  /* The GCC nvptx back end enforces num_workers (32).  */
> +	  workers_actual =3D 32;
> +	}
> +      else
> +	__builtin_abort ();
> +#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_=
min) reduction (max: gangs_max, workers_max, vectors_max)
> +      for (int i =3D 100 * workers_actual; i > -100 * workers_actual; --=
i)
> +	{
> +	  gangs_min =3D gangs_max =3D acc_gang ();
> +	  workers_min =3D workers_max =3D acc_worker ();
> +	  vectors_min =3D vectors_max =3D acc_vector ();
> +	}
> +    }
> +    if (workers_actual < 1)
> +      __builtin_abort ();
> +    if (gangs_min !=3D 0 || gangs_max !=3D 0
> +	|| workers_min !=3D 0 || workers_max !=3D workers_actual - 1
> +	|| vectors_min !=3D 0 || vectors_max !=3D 0)
> +      __builtin_abort ();
> +#undef WORKERS
> +  }
> +
> +  /* GR, WP, VS.  */
> +  {
> +    /* We try with an outrageously large value. */
> +    int workers =3D 2 << 20;
> +    /* For nvptx offloading, this one will not result in "using num_work=
ers
> +       (32), ignoring runtime setting", and will in fact try to launch w=
ith
> +       "num_workers (workers)", which will run into "libgomp: cuLaunchKe=
rnel
> +       error: invalid argument".  So, limit ourselves here.  */
> +    if (acc_get_device_type () =3D=3D acc_device_nvidia)
> +      workers =3D 32;
> +    int workers_actual =3D workers;
> +    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vec=
tors_max;
> +    gangs_min =3D workers_min =3D vectors_min =3D INT_MAX;
> +    gangs_max =3D workers_max =3D vectors_max =3D INT_MIN;
> +#pragma acc parallel copy (workers_actual) \
> +  num_workers (workers)
> +    {
> +      if (acc_on_device (acc_device_host))
> +	{
> +	  /* We're actually executing with num_workers (1).  */
> +	  workers_actual =3D 1;
> +	}
> +      else if (acc_on_device (acc_device_nvidia))
> +	{
> +	  /* We're actually executing with num_workers (32).  */
> +	  /* workers_actual =3D 32; */
> +	}
> +      else
> +	__builtin_abort ();
> +#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_=
min) reduction (max: gangs_max, workers_max, vectors_max)
> +      for (int i =3D 100 * workers_actual; i > -100 * workers_actual; --=
i)
> +	{
> +	  gangs_min =3D gangs_max =3D acc_gang ();
> +	  workers_min =3D workers_max =3D acc_worker ();
> +	  vectors_min =3D vectors_max =3D acc_vector ();
> +	}
> +    }
> +    if (workers_actual < 1)
> +      __builtin_abort ();
> +    if (gangs_min !=3D 0 || gangs_max !=3D 0
> +	|| workers_min !=3D 0 || workers_max !=3D workers_actual - 1
> +	|| vectors_min !=3D 0 || vectors_max !=3D 0)
> +      __builtin_abort ();
> +  }
> +
> +  /* GR, WS, VP.  */
>    {
> -#pragma acc loop worker
> -    for (int  i =3D 0; i < 10; i++)
> -      dummy[i] =3D i;
> +    /* We try with an outrageously large value. */
> +#define VECTORS 2 << 20
> +    int vectors_actual =3D VECTORS;
> +    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vec=
tors_max;
> +    gangs_min =3D workers_min =3D vectors_min =3D INT_MAX;
> +    gangs_max =3D workers_max =3D vectors_max =3D INT_MIN;
> +#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector=
_length \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_config=
ured } } */ \
> +  vector_length (VECTORS)
> +    {
> +      if (acc_on_device (acc_device_host))
> +	{
> +	  /* We're actually executing with vector_length (1).  */
> +	  vectors_actual =3D 1;
> +	}
> +      else if (acc_on_device (acc_device_nvidia))
> +	{
> +	  /* The GCC nvptx back end enforces vector_length (32).  */
> +	  vectors_actual =3D 32;
> +	}
> +      else
> +	__builtin_abort ();
> +#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_=
min) reduction (max: gangs_max, workers_max, vectors_max)
> +      for (int i =3D 100 * vectors_actual; i > -100 * vectors_actual; --=
i)
> +	{
> +	  gangs_min =3D gangs_max =3D acc_gang ();
> +	  workers_min =3D workers_max =3D acc_worker ();
> +	  vectors_min =3D vectors_max =3D acc_vector ();
> +	}
> +    }
> +    if (vectors_actual < 1)
> +      __builtin_abort ();
> +    if (gangs_min !=3D 0 || gangs_max !=3D 0
> +	|| workers_min !=3D 0 || workers_max !=3D 0
> +	|| vectors_min !=3D 0 || vectors_max !=3D vectors_actual - 1)
> +      __builtin_abort ();
> +#undef VECTORS
>    }
>=20=20
> -#pragma acc parallel vector_length (2<<20) /* { dg-error "using vector_l=
ength" } */
> +  /* GR, WS, VP.  */
>    {
> -#pragma acc loop vector
> -    for (int  i =3D 0; i < 10; i++)
> -      dummy[i] =3D i;
> +    /* We try with an outrageously large value. */
> +    int vectors =3D 2 << 20;
> +    int vectors_actual =3D vectors;
> +    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vec=
tors_max;
> +    gangs_min =3D workers_min =3D vectors_min =3D INT_MAX;
> +    gangs_max =3D workers_max =3D vectors_max =3D INT_MIN;
> +#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector=
_length \\(32\\), ignoring runtime setting" "" { target openacc_nvidia_acce=
l_configured } } */ \
> +  vector_length (vectors)
> +    {
> +      if (acc_on_device (acc_device_host))
> +	{
> +	  /* We're actually executing with vector_length (1).  */
> +	  vectors_actual =3D 1;
> +	}
> +      else if (acc_on_device (acc_device_nvidia))
> +	{
> +	  /* The GCC nvptx back end enforces vector_length (32).  */
> +	  vectors_actual =3D 32;
> +	}
> +      else
> +	__builtin_abort ();
> +#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_=
min) reduction (max: gangs_max, workers_max, vectors_max)
> +      for (int i =3D 100 * vectors_actual; i > -100 * vectors_actual; --=
i)
> +	{
> +	  gangs_min =3D gangs_max =3D acc_gang ();
> +	  workers_min =3D workers_max =3D acc_worker ();
> +	  vectors_min =3D vectors_max =3D acc_vector ();
> +	}
> +    }
> +    if (vectors_actual < 1)
> +      __builtin_abort ();
> +    if (gangs_min !=3D 0 || gangs_max !=3D 0
> +	|| workers_min !=3D 0 || workers_max !=3D 0
> +	|| vectors_min !=3D 0 || vectors_max !=3D vectors_actual - 1)
> +      __builtin_abort ();
>    }
>=20=20
> +
> +  /* Composition of GP, WP, VP.  */
> +  {
> +    int gangs =3D 12345;
> +    /* With nvptx offloading, multi-level reductions apparently are very=
 slow
> +       in the following case.  So, limit ourselves here.  */
> +    if (acc_get_device_type () =3D=3D acc_device_nvidia)
> +      gangs =3D 3;
> +    int gangs_actual =3D gangs;
> +#define WORKERS 3
> +    int workers_actual =3D WORKERS;
> +#define VECTORS 11
> +    int vectors_actual =3D VECTORS;
> +    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vec=
tors_max;
> +    gangs_min =3D workers_min =3D vectors_min =3D INT_MAX;
> +    gangs_max =3D workers_max =3D vectors_max =3D INT_MIN;
> +#pragma acc parallel copy (gangs_actual, workers_actual, vectors_actual)=
 /* { dg-warning "using vector_length \\(32\\), ignoring 11" "" { target op=
enacc_nvidia_accel_configured } } */ \
> +  num_gangs (gangs) \
> +  num_workers (WORKERS) \
> +  vector_length (VECTORS)
> +    {
> +      if (acc_on_device (acc_device_host))
> +	{
> +	  /* We're actually executing with num_gangs (1), num_workers (1),
> +	     vector_length (1).  */
> +	  gangs_actual =3D 1;
> +	  workers_actual =3D 1;
> +	  vectors_actual =3D 1;
> +	}
> +      else if (acc_on_device (acc_device_nvidia))
> +	{
> +	  /* The GCC nvptx back end enforces vector_length (32).  */
> +	  vectors_actual =3D 32;
> +	}
> +      else
> +	__builtin_abort ();
> +#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_mi=
n) reduction (max: gangs_max, workers_max, vectors_max)
> +      for (int i =3D 100 * gangs_actual; i > -100 * gangs_actual; --i)
> +#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_=
min) reduction (max: gangs_max, workers_max, vectors_max)
> +	for (int j =3D 100 * workers_actual; j > -100 * workers_actual; --j)
> +#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_=
min) reduction (max: gangs_max, workers_max, vectors_max)
> +	  for (int k =3D 100 * vectors_actual; k > -100 * vectors_actual; --k)
> +	    {
> +	      gangs_min =3D gangs_max =3D acc_gang ();
> +	      workers_min =3D workers_max =3D acc_worker ();
> +	      vectors_min =3D vectors_max =3D acc_vector ();
> +	    }
> +    }
> +    if (gangs_min !=3D 0 || gangs_max !=3D gangs_actual - 1
> +	|| workers_min !=3D 0 || workers_max !=3D workers_actual - 1
> +	|| vectors_min !=3D 0 || vectors_max !=3D vectors_actual - 1)
> +      __builtin_abort ();
> +#undef VECTORS
> +#undef WORKERS
> +  }
> +
> +
> +  /* We can't test parallelized OpenACC kernels constructs in this way: =
use of
> +     the acc_gang, acc_worker, acc_vector functions will make the constr=
uct
> +     unparallelizable.  */
> +
> +
> +  /* Unparallelized OpenACC kernels constructs must get launched as 1 x =
1 x 1
> +     kernels.  */
> +  {
> +    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vec=
tors_max;
> +    gangs_min =3D workers_min =3D vectors_min =3D INT_MAX;
> +    gangs_max =3D workers_max =3D vectors_max =3D INT_MIN;
> +#pragma acc kernels
> +    {
> +      /* This is to make the OpenACC kernels construct unparallelizable.=
  */
> +      asm volatile ("" : : : "memory");
> +
> +#pragma acc loop reduction (min: gangs_min, workers_min, vectors_min) re=
duction (max: gangs_max, workers_max, vectors_max)
> +      for (int i =3D 100; i > -100; --i)
> +	{
> +	  gangs_min =3D gangs_max =3D acc_gang ();
> +	  workers_min =3D workers_max =3D acc_worker ();
> +	  vectors_min =3D vectors_max =3D acc_vector ();
> +	}
> +    }
> +    if (gangs_min !=3D 0 || gangs_max !=3D 1 - 1
> +	|| workers_min !=3D 0 || workers_max !=3D 1 - 1
> +	|| vectors_min !=3D 0 || vectors_max !=3D 1 - 1)
> +      __builtin_abort ();
> +  }
> +
> +
>    return 0;
>  }
> diff --git libgomp/testsuite/libgomp.oacc-c/c.exp libgomp/testsuite/libgo=
mp.oacc-c/c.exp
> index b509a10..4475bf5 100644
> --- libgomp/testsuite/libgomp.oacc-c/c.exp
> +++ libgomp/testsuite/libgomp.oacc-c/c.exp
> @@ -15,6 +15,13 @@ load_lib libgomp-dg.exp
>  load_gcc_lib gcc-dg.exp
>  load_gcc_lib torture-options.exp
>=20=20
> +proc check_effective_target_c { } {
> +    return 1
> +}
> +proc check_effective_target_c++ { } {
> +    return 0
> +}
> +
>  # Initialize dg.
>  dg-init
>  torture-init


Gr=C3=BC=C3=9Fe
 Thomas