From: Tom de Vries <Tom_deVries@mentor.com>
To: Cesar Philippidis <cesar@codesourcery.com>,
"gcc-patches@gcc.gnu.org" <gcc-patches@gcc.gnu.org>
Subject: Re: [og7] vector_length extension part 4: target hooks and automatic parallelism
Date: Thu, 05 Apr 2018 16:32:00 -0000 [thread overview]
Message-ID: <bf45b83e-e7d9-bc65-a9a1-4439fa3520ee@mentor.com> (raw)
In-Reply-To: <0e3891f9-aec9-2d34-f58f-6927c821d00d@codesourcery.com>
[-- Attachment #1: Type: text/plain, Size: 420 bytes --]
On 03/02/2018 08:18 PM, Cesar Philippidis wrote:
> The attached patch adjusts the existing goacc validate_dims target hook
> and introduces a new goacc adjust_parallelism target hook.
The attached patch now just introduces the nvptx_adjust_parallelism
target hook implementation, which enables test-cases to start using the
feature.
Built x86_64 with nvptx accelerator and tested libgomp.
Committed.
Thanks,
- Tom
[-- Attachment #2: 0002-nvptx-Enable-large-vectors.patch --]
[-- Type: text/x-patch, Size: 13493 bytes --]
[nvptx] Enable large vectors
2018-04-05 Cesar Philippidis <cesar@codesourcery.com>
Tom de Vries <tom@codesourcery.com>
* omp-offload.c (oacc_get_default_dim): New function.
* omp-offload.h (oacc_get_default_dim): Declare.
* config/nvptx/nvptx.c (NVPTX_GOACC_VL_WARP): Define.
(nvptx_goacc_needs_vl_warp): New function.
(nvptx_goacc_validate_dims): Take larger vector lengths into
account.
(nvptx_adjust_parallelism): New function.
(TARGET_GOACC_ADJUST_PARALLELISM): Define.
(populate_offload_attrs): Handle the situation where the default
runtime geometry has not been initialized yet for reductions.
* testsuite/libgomp.oacc-c-c++-common/vector-length-128-1.c: Expect
vector length to be 128.
* testsuite/libgomp.oacc-c-c++-common/vector-length-128-10.c: Same.
* testsuite/libgomp.oacc-c-c++-common/vector-length-128-2.c: Same.
* testsuite/libgomp.oacc-c-c++-common/vred2d-128.c: Same.
* testsuite/libgomp.oacc-fortran/gemm.f90: Same.
---
gcc/config/nvptx/nvptx.c | 148 +++++++++++++++++++--
gcc/omp-offload.c | 7 +
gcc/omp-offload.h | 2 +
.../vector-length-128-1.c | 5 +-
.../vector-length-128-10.c | 1 -
.../vector-length-128-2.c | 5 +-
.../libgomp.oacc-c-c++-common/vred2d-128.c | 2 -
libgomp/testsuite/libgomp.oacc-fortran/gemm.f90 | 1 -
8 files changed, 153 insertions(+), 18 deletions(-)
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 51bd69d..595413a 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -71,6 +71,7 @@
#include "fold-const.h"
#include "intl.h"
#include "tree-hash-traits.h"
+#include "omp-offload.h"
/* This file should be included last. */
#include "target-def.h"
@@ -4634,15 +4635,20 @@ populate_offload_attrs (offload_attrs *oa)
if (oa->vector_length == 0)
{
/* FIXME: Need a more graceful way to handle large vector
- lengths in OpenACC routines. */
+ lengths in OpenACC routines and also -fopenacc-dims. */
if (!lookup_attribute ("omp target entrypoint",
DECL_ATTRIBUTES (current_function_decl)))
oa->vector_length = PTX_WARP_SIZE;
- else
+ else if (PTX_VECTOR_LENGTH != PTX_WARP_SIZE)
oa->vector_length = PTX_VECTOR_LENGTH;
}
if (oa->num_workers == 0)
- oa->max_workers = PTX_CTA_SIZE / oa->vector_length;
+ {
+ if (oa->vector_length == 0)
+ oa->max_workers = PTX_WORKER_LENGTH;
+ else
+ oa->max_workers = PTX_CTA_SIZE / oa->vector_length;
+ }
else
oa->max_workers = oa->num_workers;
}
@@ -5193,6 +5199,19 @@ nvptx_simt_vf ()
return PTX_WARP_SIZE;
}
+#define NVPTX_GOACC_VL_WARP "nvptx vl warp"
+
+/* Return true if the offloaded function needs a vector_length of
+ PTX_WARP_SIZE. */
+
+static bool
+nvptx_goacc_needs_vl_warp ()
+{
+ tree attr = lookup_attribute (NVPTX_GOACC_VL_WARP,
+ DECL_ATTRIBUTES (current_function_decl));
+ return attr != NULL_TREE;
+}
+
/* Validate compute dimensions of an OpenACC offload or routine, fill
in non-unity defaults. FN_LEVEL indicates the level at which a
routine might spawn a loop. It is negative for non-routines. If
@@ -5201,6 +5220,14 @@ nvptx_simt_vf ()
static bool
nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
{
+ int default_vector_length = PTX_VECTOR_LENGTH;
+
+ /* For capability reasons, fall back to vl = 32 for runtime values. */
+ if (dims[GOMP_DIM_VECTOR] == 0)
+ default_vector_length = PTX_WARP_SIZE;
+ else if (decl)
+ default_vector_length = oacc_get_default_dim (GOMP_DIM_VECTOR);
+
/* Detect if a function is unsuitable for offloading. */
if (!flag_offload_force && decl)
{
@@ -5225,18 +5252,20 @@ nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
bool changed = false;
- /* The vector size must be 32, unless this is a SEQ routine. */
+ /* The vector size must be a positive multiple of the warp size,
+ unless this is a SEQ routine. */
if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
&& dims[GOMP_DIM_VECTOR] >= 0
- && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
+ && (dims[GOMP_DIM_VECTOR] % 32 != 0
+ || dims[GOMP_DIM_VECTOR] == 0))
{
if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0)
warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
dims[GOMP_DIM_VECTOR]
? G_("using vector_length (%d), ignoring %d")
: G_("using vector_length (%d), ignoring runtime setting"),
- PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
- dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
+ default_vector_length, dims[GOMP_DIM_VECTOR]);
+ dims[GOMP_DIM_VECTOR] = default_vector_length;
changed = true;
}
@@ -5250,16 +5279,77 @@ nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
changed = true;
}
+ /* Ensure that num_worker * vector_length <= cta size. */
+ if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE)
+ {
+ warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
+ G_("using vector_length (%d), ignoring %d"),
+ default_vector_length, dims[GOMP_DIM_VECTOR]);
+ dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
+ changed = true;
+ }
+
+ /* vector_length must not exceed PTX_CTA_SIZE. */
+ if (dims[GOMP_DIM_VECTOR] >= PTX_CTA_SIZE)
+ {
+ int new_vector = PTX_CTA_SIZE;
+ if (decl)
+ new_vector = default_vector_length;
+ warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
+ G_("using vector_length (%d), ignoring %d"),
+ new_vector, dims[GOMP_DIM_VECTOR]);
+ dims[GOMP_DIM_VECTOR] = new_vector;
+ changed = true;
+ }
+
+ /* Set vector_length to default_vector_length if there are a sufficient
+ number of free threads in the CTA. */
+ if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] <= 0)
+ {
+ if (dims[GOMP_DIM_WORKER] * default_vector_length <= PTX_CTA_SIZE)
+ dims[GOMP_DIM_VECTOR] = default_vector_length;
+ else if (dims[GOMP_DIM_WORKER] * PTX_WARP_SIZE <= PTX_CTA_SIZE)
+ dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
+ else
+ error_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION,
+ "vector_length must be at least 32");
+ changed = true;
+ }
+
+ /* Specify a default vector_length. */
+ if (dims[GOMP_DIM_VECTOR] < 0)
+ {
+ dims[GOMP_DIM_VECTOR] = default_vector_length;
+ changed = true;
+ }
+
+ if (nvptx_goacc_needs_vl_warp () && dims[GOMP_DIM_VECTOR] != PTX_WARP_SIZE)
+ {
+ dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
+ changed = true;
+ }
+
if (!decl)
{
- dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
+ bool new_vector = false;
+ if (dims[GOMP_DIM_VECTOR] <= 1)
+ {
+ dims[GOMP_DIM_VECTOR] = default_vector_length;
+ new_vector = true;
+ }
if (dims[GOMP_DIM_WORKER] < 0)
dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
if (dims[GOMP_DIM_GANG] < 0)
dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
+ if (new_vector
+ && dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE)
+ dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
changed = true;
}
+ gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
+ gcc_assert (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] <= PTX_CTA_SIZE);
+
return changed;
}
@@ -5279,6 +5369,45 @@ nvptx_dim_limit (int axis)
return 0;
}
+/* Adjust the parallelism available to a loop given vector_length
+ associated with the offloaded function. */
+
+static unsigned
+nvptx_adjust_parallelism (unsigned inner_mask, unsigned outer_mask)
+{
+ if (nvptx_goacc_needs_vl_warp ())
+ return inner_mask;
+
+ bool wv = (inner_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+ && (inner_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR));
+ offload_attrs oa;
+
+ populate_offload_attrs (&oa);
+
+ if (oa.vector_length == PTX_WARP_SIZE)
+ return inner_mask;
+
+ /* FIXME: This is overly conservative; worker and vector loop will
+ eventually be combined. */
+ if (wv)
+ return inner_mask & ~GOMP_DIM_MASK (GOMP_DIM_WORKER);
+
+ /* It's difficult to guarantee that warps in large vector_lengths
+ will remain convergent when a vector loop is nested inside a
+ worker loop. Therefore, fall back to setting vector_length to
+ PTX_WARP_SIZE. Hopefully this condition may be relaxed for
+ sm_70+ targets. */
+ if ((inner_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
+ && (outer_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
+ {
+ tree attr = tree_cons (get_identifier (NVPTX_GOACC_VL_WARP), NULL_TREE,
+ DECL_ATTRIBUTES (current_function_decl));
+ DECL_ATTRIBUTES (current_function_decl) = attr;
+ }
+
+ return inner_mask;
+}
+
/* Determine whether fork & joins are needed. */
static bool
@@ -6169,6 +6298,9 @@ nvptx_set_current_function (tree fndecl)
#undef TARGET_GOACC_DIM_LIMIT
#define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
+#undef TARGET_GOACC_ADJUST_PARALLELISM
+#define TARGET_GOACC_ADJUST_PARALLELISM nvptx_adjust_parallelism
+
#undef TARGET_GOACC_FORK_JOIN
#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
diff --git a/gcc/omp-offload.c b/gcc/omp-offload.c
index ed17160..66c6212 100644
--- a/gcc/omp-offload.c
+++ b/gcc/omp-offload.c
@@ -551,6 +551,13 @@ oacc_xform_tile (gcall *call)
static int oacc_default_dims[GOMP_DIM_MAX];
static int oacc_min_dims[GOMP_DIM_MAX];
+int
+oacc_get_default_dim (int dim)
+{
+ gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
+ return oacc_default_dims[dim];
+}
+
/* Parse the default dimension parameter. This is a set of
:-separated optional compute dimensions. Each dimension is either
a positive integer, or '-' for a dynamic value computed at
diff --git a/gcc/omp-offload.h b/gcc/omp-offload.h
index 528448b..014ee52 100644
--- a/gcc/omp-offload.h
+++ b/gcc/omp-offload.h
@@ -22,6 +22,8 @@ along with GCC; see the file COPYING3. If not see
#ifndef GCC_OMP_DEVICE_H
#define GCC_OMP_DEVICE_H
+extern int oacc_get_default_dim (int dim);
+
extern GTY(()) vec<tree, va_gc> *offload_funcs;
extern GTY(()) vec<tree, va_gc> *offload_vars;
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-1.c
index fab5b0d..18d77cc 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-1.c
@@ -33,7 +33,6 @@ main (void)
return 0;
}
-/* { dg-prune-output "using vector_length \\(32\\), ignoring 128" } */
-/* { dg-final { scan-offload-tree-dump "__attribute__\\(\\(oacc function \\(1, 1, 32\\)" "oaccdevlow" } } */
-/* { dg-output "nvptx_exec: kernel main\\\$_omp_fn\\\$0: launch gangs=1, workers=1, vectors=32" } */
+/* { dg-final { scan-offload-tree-dump "__attribute__\\(\\(oacc function \\(1, 1, 128\\)" "oaccdevlow" } } */
+/* { dg-output "nvptx_exec: kernel main\\\$_omp_fn\\\$0: launch gangs=1, workers=1, vectors=128" } */
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-10.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-10.c
index e46b5cf..0658cfd 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-10.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-10.c
@@ -37,4 +37,3 @@ main (void)
return 0;
}
-/* { dg-prune-output "using vector_length \\(32\\), ignoring 128" } */
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-2.c
index cc6fd55..2ab6499 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-2.c
@@ -34,7 +34,6 @@ main (void)
return 0;
}
-/* { dg-prune-output "using vector_length \\(32\\), ignoring 128" } */
-/* { dg-final { scan-offload-tree-dump "__attribute__\\(\\(oacc function \\(1, 1, 32\\)" "oaccdevlow" } } */
-/* { dg-output "nvptx_exec: kernel main\\\$_omp_fn\\\$0: launch gangs=1, workers=1, vectors=32" } */
+/* { dg-final { scan-offload-tree-dump "__attribute__\\(\\(oacc function \\(1, 1, 128\\)" "oaccdevlow" } } */
+/* { dg-output "nvptx_exec: kernel main\\\$_omp_fn\\\$0: launch gangs=1, workers=1, vectors=128" } */
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/vred2d-128.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/vred2d-128.c
index 1dc5fe0..318c0e6 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/vred2d-128.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/vred2d-128.c
@@ -42,8 +42,6 @@ gentest (test3, "acc parallel loop gang worker vector_length (128)",
gentest (test4, "acc parallel loop",
"acc loop reduction(+:t1) reduction(-:t2)")
-/* { dg-prune-output "using vector_length \\(32\\), ignoring 128" } */
-
int
main ()
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/gemm.f90 b/libgomp/testsuite/libgomp.oacc-fortran/gemm.f90
index 62b8a45..ad67dce 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/gemm.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/gemm.f90
@@ -39,7 +39,6 @@ subroutine openacc_sgemm_128 (m, n, k, alpha, a, b, beta, c)
real :: temp
!$acc parallel loop copy(c(1:m,1:n)) copyin(a(1:k,1:m),b(1:k,1:n)) vector_length (128)
- ! { dg-prune-output "using vector_length \\(32\\), ignoring 128" }
do j = 1, n
!$acc loop
do i = 1, m
next prev parent reply other threads:[~2018-04-05 16:32 UTC|newest]
Thread overview: 50+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-03-01 21:17 [og7] vector_length extension part 1: generalize function and variable names Cesar Philippidis
2018-03-02 16:55 ` [og7] vector_length extension part 2: Generalize state propagation and synchronization Cesar Philippidis
2018-03-21 17:16 ` Tom de Vries
2018-03-22 8:05 ` Cesar Philippidis
2018-03-22 14:16 ` Tom de Vries
2018-03-22 14:35 ` Cesar Philippidis
2018-03-22 14:24 ` Tom de Vries
2018-03-22 15:18 ` Cesar Philippidis
2018-03-22 16:20 ` Tom de Vries
2018-03-22 17:26 ` Cesar Philippidis
2018-03-22 17:58 ` Tom de Vries
2018-03-22 19:32 ` Cesar Philippidis
2018-03-23 8:56 ` Tom de Vries
2018-03-23 14:35 ` Tom de Vries
2018-03-22 15:04 ` Tom de Vries
2018-03-22 17:14 ` Cesar Philippidis
2018-03-22 17:47 ` Tom de Vries
2018-03-22 17:48 ` Cesar Philippidis
2018-03-22 18:00 ` Tom de Vries
2018-03-23 13:14 ` Tom de Vries
2018-03-23 13:16 ` Tom de Vries
2018-03-23 14:18 ` Tom de Vries
2018-03-23 16:30 ` Tom de Vries
2018-03-30 1:50 ` Tom de Vries
2018-03-30 14:48 ` Tom de Vries
2018-03-30 15:06 ` Cesar Philippidis
2018-03-30 15:35 ` Tom de Vries
2018-04-05 16:33 ` Tom de Vries
2018-04-03 14:52 ` [nvptx] Use MAX, MIN, ROUND_UP macros Tom de Vries
2018-04-03 15:00 ` [og7] vector_length extension part 2: Generalize state propagation and synchronization Tom de Vries
2018-04-05 14:06 ` Tom de Vries
2018-04-05 14:14 ` Tom de Vries
2018-03-02 17:51 ` [og7] vector_length extension part 3: reductions Cesar Philippidis
2018-04-05 14:07 ` Tom de Vries
2018-04-05 16:26 ` Tom de Vries
2018-03-02 19:18 ` [og7] vector_length extension part 4: target hooks and automatic parallelism Cesar Philippidis
2018-03-21 15:55 ` Tom de Vries
2018-03-21 20:28 ` Cesar Philippidis
2018-03-26 14:25 ` Tom de Vries
2018-03-26 14:37 ` Cesar Philippidis
2018-03-26 16:52 ` Tom de Vries
2018-03-27 12:16 ` Tom de Vries
2018-03-26 17:13 ` Tom de Vries
2018-04-05 16:32 ` Tom de Vries [this message]
2018-03-02 20:47 ` [og7] vector_length extension part 5: libgomp and tests Cesar Philippidis
2018-03-16 13:50 ` Thomas Schwinge
2018-03-27 13:00 ` Tom de Vries
2018-04-05 16:36 ` Tom de Vries
2018-03-09 15:29 ` [og7] vector_length extension part 1: generalize function and variable names Thomas Schwinge
2018-03-09 15:31 ` Cesar Philippidis
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=bf45b83e-e7d9-bc65-a9a1-4439fa3520ee@mentor.com \
--to=tom_devries@mentor.com \
--cc=cesar@codesourcery.com \
--cc=gcc-patches@gcc.gnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).