public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r12-2819] amdgcn: Enable OpenACC worker partitioning for AMD GCN
@ 2021-08-09 13:20 Thomas Schwinge
0 siblings, 0 replies; only message in thread
From: Thomas Schwinge @ 2021-08-09 13:20 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:c408512e1f7ca07e07794dc13fd6dfd9d2d7e998
commit r12-2819-gc408512e1f7ca07e07794dc13fd6dfd9d2d7e998
Author: Julian Brown <julian@codesourcery.com>
Date: Tue Mar 2 04:20:13 2021 -0800
amdgcn: Enable OpenACC worker partitioning for AMD GCN
gcc/
* config/gcn/gcn.c (gcn_init_builtins): Override decls for
BUILT_IN_GOACC_SINGLE_START, BUILT_IN_GOACC_SINGLE_COPY_START,
BUILT_IN_GOACC_SINGLE_COPY_END and BUILT_IN_GOACC_BARRIER.
(gcn_goacc_validate_dims): Turn on worker partitioning unconditionally.
(gcn_fork_join): Update comment.
* config/gcn/gcn.opt (flag_worker_partitioning): Remove.
(macc_experimental_workers): Remove unused option.
libgomp/
* plugin/plugin-gcn.c (gcn_exec): Change default number of workers to
16.
* testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c
[acc_device_radeon]: Update.
* testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c
[ACC_DEVICE_TYPE_radeon]: Likewise.
* testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
[acc_device_radeon]: Likewise.
* testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c
[ACC_DEVICE_TYPE_radeon]: Likewise.
* testsuite/libgomp.oacc-fortran/optional-reduction.f90: XFAIL for
'openacc_radeon_accel_selected' and '-O0'.
* testsuite/libgomp.oacc-fortran/reduction-7.f90: Likewise.
Co-Authored-By: Kwok Cheung Yeung <kcy@codesourcery.com>
Co-Authored-By: Thomas Schwinge <thomas@codesourcery.com>
Diff:
---
gcc/config/gcn/gcn.c | 15 +++------------
gcc/config/gcn/gcn.opt | 5 -----
libgomp/plugin/plugin-gcn.c | 3 +--
.../libgomp.oacc-c-c++-common/acc_prof-kernels-1.c | 3 ---
.../libgomp.oacc-c-c++-common/loop-dim-default.c | 4 +---
.../testsuite/libgomp.oacc-c-c++-common/parallel-dims.c | 12 ++++--------
.../testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c | 7 ++++---
.../testsuite/libgomp.oacc-fortran/optional-reduction.f90 | 3 +++
libgomp/testsuite/libgomp.oacc-fortran/reduction-7.f90 | 3 +++
9 files changed, 19 insertions(+), 36 deletions(-)
diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c
index 87af5d18f42..9df28277498 100644
--- a/gcc/config/gcn/gcn.c
+++ b/gcc/config/gcn/gcn.c
@@ -3712,8 +3712,6 @@ gcn_init_builtins (void)
TREE_NOTHROW (gcn_builtin_decls[i]) = 1;
}
-/* FIXME: remove the ifdef once OpenACC support is merged upstream. */
-#ifdef BUILT_IN_GOACC_SINGLE_START
/* These builtins need to take/return an LDS pointer: override the generic
versions here. */
@@ -3730,7 +3728,6 @@ gcn_init_builtins (void)
set_builtin_decl (BUILT_IN_GOACC_BARRIER,
gcn_builtin_decls[GCN_BUILTIN_ACC_BARRIER], false);
-#endif
}
/* Implement TARGET_INIT_LIBFUNCS. */
@@ -5019,11 +5016,7 @@ gcn_goacc_validate_dims (tree decl, int dims[], int fn_level,
unsigned /*used*/)
{
bool changed = false;
-
- /* FIXME: remove -facc-experimental-workers when they're ready. */
- int max_workers = flag_worker_partitioning ? 16 : 1;
-
- gcc_assert (!flag_worker_partitioning);
+ const int max_workers = 16;
/* The vector size must appear to be 64, to the user, unless this is a
SEQ routine. The real, internal value is always 1, which means use
@@ -5060,8 +5053,7 @@ gcn_goacc_validate_dims (tree decl, int dims[], int fn_level,
{
dims[GOMP_DIM_VECTOR] = GCN_DEFAULT_VECTORS;
if (dims[GOMP_DIM_WORKER] < 0)
- dims[GOMP_DIM_WORKER] = (flag_worker_partitioning
- ? GCN_DEFAULT_WORKERS : 1);
+ dims[GOMP_DIM_WORKER] = GCN_DEFAULT_WORKERS;
if (dims[GOMP_DIM_GANG] < 0)
dims[GOMP_DIM_GANG] = GCN_DEFAULT_GANGS;
changed = true;
@@ -5126,8 +5118,7 @@ static bool
gcn_fork_join (gcall *ARG_UNUSED (call), const int *ARG_UNUSED (dims),
bool ARG_UNUSED (is_fork))
{
- /* GCN does not use the fork/join concept invented for NVPTX.
- Instead we use standard autovectorization. */
+ /* GCN does not need to expand fork/join markers at the RTL level. */
return false;
}
diff --git a/gcc/config/gcn/gcn.opt b/gcc/config/gcn/gcn.opt
index b2b10b0794c..6faacca42bb 100644
--- a/gcc/config/gcn/gcn.opt
+++ b/gcc/config/gcn/gcn.opt
@@ -62,11 +62,6 @@ bool flag_bypass_init_error = false
mbypass-init-error
Target RejectNegative Var(flag_bypass_init_error)
-bool flag_worker_partitioning = false
-
-macc-experimental-workers
-Target Var(flag_worker_partitioning) Init(0)
-
int stack_size_opt = -1
mstack-size=
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index f26d7361106..9e7377c91f9 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -3038,8 +3038,7 @@ gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs,
64 gangs matches a typical Fiji device. */
if (dims[0] == 0) dims[0] = get_cu_count (kernel->agent); /* Gangs. */
- /* NOTE: Until support for middle-end worker partitioning is merged, force 'num_workers (1)'. */
- if (/*TODO dims[1] == 0*/ true) dims[1] = 1; /* Workers. */
+ if (dims[1] == 0) dims[1] = 16; /* Workers. */
/* The incoming dimensions are expressed in terms of gangs, workers, and
vectors. The HSA dimensions are expressed in terms of "work-items",
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c
index 6c136c26c93..ad33f72e2fb 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c
@@ -93,9 +93,6 @@ static void cb_enqueue_launch_start (acc_prof_info *prof_info, acc_event_info *e
}
if (num_workers < 1)
assert (event_info->launch_event.num_workers >= 1);
- /* GCN currently enforces 'num_workers (1)'. */
- else if (acc_device_type == acc_device_radeon)
- assert (event_info->launch_event.num_workers == 1);
else
{
#ifdef __OPTIMIZE__
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c
index ca771646655..419bc33ad53 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c
@@ -132,9 +132,7 @@ int main ()
/* AMD GCN uses the autovectorizer for the vector dimension: the use
of a function call in vector-partitioned code in this test is not
currently supported. */
- /* AMD GCN does not currently support multiple workers. This should be
- set to 16 when that changes. */
- return test_1 (16, 1, 1);
+ return test_1 (16, 16, 1);
#else
return test_1 (16, 16, 32);
#endif
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
index fe0dacd5aac..9392e1d88c5 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
@@ -261,9 +261,8 @@ int main ()
}
else if (acc_on_device (acc_device_radeon))
{
- /* The GCC GCN back end is limited to num_workers (16).
- Temporarily set this to 1 until multiple workers are permitted. */
- workers_actual = 1; // 16;
+ /* The GCC GCN back end is limited to num_workers (16). */
+ workers_actual = 16;
}
else
__builtin_abort ();
@@ -313,9 +312,8 @@ int main ()
}
else if (acc_on_device (acc_device_radeon))
{
- /* The GCC GCN back end is limited to num_workers (16).
- Temporarily set this to 1 until multiple workers are permitted. */
- workers_actual = 1; // 16;
+ /* The GCC GCN back end is limited to num_workers (16). */
+ workers_actual = 16;
}
else
__builtin_abort ();
@@ -465,8 +463,6 @@ int main ()
}
else if (acc_on_device (acc_device_radeon))
{
- /* Temporary setting, until multiple workers are permitted. */
- workers_actual = 1;
/* See above comments about GCN vectors_actual. */
vectors_actual = 1;
}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c
index 624ec24e437..4f88b1c0779 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c
@@ -5,12 +5,13 @@
#include <openacc.h>
#include <gomp-constants.h>
+#define NUM_WORKERS 16
#ifdef ACC_DEVICE_TYPE_radeon
-/* Temporarily set this to 1 until multiple workers are permitted. */
-#define NUM_WORKERS 1
+/* AMD GCN uses the autovectorizer for the vector dimension: the use
+ of a function call in vector-partitioned code in this test is not
+ currently supported. */
#define NUM_VECTORS 1
#else
-#define NUM_WORKERS 16
#define NUM_VECTORS 32
#endif
#define WIDTH 64
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/optional-reduction.f90 b/libgomp/testsuite/libgomp.oacc-fortran/optional-reduction.f90
index 29f92c0d4c3..69b69b66c71 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/optional-reduction.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/optional-reduction.f90
@@ -4,6 +4,9 @@
! { dg-do run }
+!TODO
+! { dg-xfail-run-if TODO { openacc_radeon_accel_selected && { ! __OPTIMIZE__ } } }
+
program optional_reduction
implicit none
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/reduction-7.f90 b/libgomp/testsuite/libgomp.oacc-fortran/reduction-7.f90
index 8cffac93a22..a8b0c60e420 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/reduction-7.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/reduction-7.f90
@@ -1,5 +1,8 @@
! { dg-do run }
+!TODO
+! { dg-xfail-run-if TODO { openacc_radeon_accel_selected && { ! __OPTIMIZE__ } } }
+
! subroutine reduction with private and firstprivate variables
program reduction
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2021-08-09 13:20 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-09 13:20 [gcc r12-2819] amdgcn: Enable OpenACC worker partitioning for AMD GCN Thomas Schwinge
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).