From: Nathan Sidwell <nathan@acm.org>
To: Jakub Jelinek <jakub@redhat.com>
Cc: GCC Patches <gcc-patches@gcc.gnu.org>
Subject: [Openacc] Adjust automatic loop partitioning
Date: Fri, 29 Apr 2016 14:01:00 -0000 [thread overview]
Message-ID: <b0e06bc4-1414-6858-5ca9-a0782fa70094@acm.org> (raw)
[-- Attachment #1: Type: text/plain, Size: 571 bytes --]
Jakub,
Currently, automatic loop partitioning assigns axes from the innermost loop
outwards -- that was the simplest thing to implement. A better algorithm is to
assign the outermost loop to the outermost available axis, and then assign the
remaining loops from the innermost loop outwards. That way we (generally) get
gang partitioning on the outermost loop. Just inside that we'll get
non-partitioned loops if the nest is too deep, and the two innermost nested
loops will get worker and vector partitioning.
This patch has been on the gomp4 branch for a while. OK for trunk?
nathan
[-- Attachment #2: trunk-gang.patch --]
[-- Type: text/x-patch, Size: 7883 bytes --]
2016-04-29 Nathan Sidwell <nathan@codesourcery.com>
gcc/
* omp-low.c (struct oacc_loop): Add 'inner' field.
(new_oacc_loop_raw): Initialize it to zero.
(oacc_loop_fixed_partitions): Initialize it.
(oacc_loop_auto_partitions): Partition outermost loop to outermost
available partitioning.
gcc/testsuite/
* c-c++-common/goacc/loop-auto-1.c: Adjust expected warnings.
libgomp/
* testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: Adjust
expected partitioning.
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c (revision 235511)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c (working copy)
@@ -103,9 +103,11 @@ int vector_1 (int *ary, int size)
#pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
{
+#pragma acc loop gang
+ for (int jx = 0; jx < 1; jx++)
#pragma acc loop auto
- for (int ix = 0; ix < size; ix++)
- ary[ix] = place ();
+ for (int ix = 0; ix < size; ix++)
+ ary[ix] = place ();
}
return check (ary, size, 0, 0, 1);
@@ -118,7 +120,7 @@ int vector_2 (int *ary, int size)
#pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
{
#pragma acc loop worker
- for (int jx = 0; jx < size / 64; jx++)
+ for (int jx = 0; jx < size / 64; jx++)
#pragma acc loop auto
for (int ix = 0; ix < 64; ix++)
ary[ix + jx * 64] = place ();
@@ -133,30 +135,16 @@ int worker_1 (int *ary, int size)
#pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
{
+#pragma acc loop gang
+ for (int kx = 0; kx < 1; kx++)
#pragma acc loop auto
- for (int jx = 0; jx < size / 64; jx++)
+ for (int jx = 0; jx < size / 64; jx++)
#pragma acc loop vector
- for (int ix = 0; ix < 64; ix++)
- ary[ix + jx * 64] = place ();
- }
-
- return check (ary, size, 0, 1, 1);
-}
-
-int worker_2 (int *ary, int size)
-{
- clear (ary, size);
-
-#pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
- {
-#pragma acc loop auto
- for (int jx = 0; jx < size / 64; jx++)
-#pragma acc loop auto
- for (int ix = 0; ix < 64; ix++)
- ary[ix + jx * 64] = place ();
+ for (int ix = 0; ix < 64; ix++)
+ ary[ix + jx * 64] = place ();
}
- return check (ary, size, 0, 1, 1);
+ return check (ary, size, 0, 1, 1);
}
int gang_1 (int *ary, int size)
@@ -193,6 +181,22 @@ int gang_2 (int *ary, int size)
return check (ary, size, 1, 1, 1);
}
+int gang_3 (int *ary, int size)
+{
+ clear (ary, size);
+
+#pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
+ {
+#pragma acc loop auto
+ for (int jx = 0; jx < size / 64; jx++)
+#pragma acc loop auto
+ for (int ix = 0; ix < 64; ix++)
+ ary[ix + jx * 64] = place ();
+ }
+
+ return check (ary, size, 1, 0, 1);
+}
+
#define N (32*32*32)
int main ()
{
@@ -214,13 +218,13 @@ int main ()
if (worker_1 (ary, N))
return 1;
- if (worker_2 (ary, N))
- return 1;
if (gang_1 (ary, N))
return 1;
if (gang_2 (ary, N))
return 1;
+ if (gang_3 (ary, N))
+ return 1;
return 0;
}
Index: gcc/omp-low.c
===================================================================
--- gcc/omp-low.c (revision 235511)
+++ gcc/omp-low.c (working copy)
@@ -241,6 +241,7 @@ struct oacc_loop
tree routine; /* Pseudo-loop enclosing a routine. */
unsigned mask; /* Partitioning mask. */
+ unsigned inner; /* Partitioning of inner loops. */
unsigned flags; /* Partitioning flags. */
unsigned ifns; /* Contained loop abstraction functions. */
tree chunk_size; /* Chunk size. */
@@ -18921,7 +18922,7 @@ new_oacc_loop_raw (oacc_loop *parent, lo
memset (loop->tails, 0, sizeof (loop->tails));
loop->routine = NULL_TREE;
- loop->mask = loop->flags = 0;
+ loop->mask = loop->flags = loop->inner = 0;
loop->ifns = 0;
loop->chunk_size = 0;
loop->head_end = NULL;
@@ -19449,8 +19450,11 @@ oacc_loop_fixed_partitions (oacc_loop *l
mask_all |= this_mask;
if (loop->child)
- mask_all |= oacc_loop_fixed_partitions (loop->child,
- outer_mask | this_mask);
+ {
+ loop->inner = oacc_loop_fixed_partitions (loop->child,
+ outer_mask | this_mask);
+ mask_all |= loop->inner;
+ }
if (loop->sibling)
mask_all |= oacc_loop_fixed_partitions (loop->sibling, outer_mask);
@@ -19466,7 +19470,7 @@ oacc_loop_fixed_partitions (oacc_loop *l
static unsigned
oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask)
{
- unsigned inner_mask = 0;
+ bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
bool noisy = true;
#ifdef ACCEL_COMPILER
@@ -19475,16 +19479,33 @@ oacc_loop_auto_partitions (oacc_loop *lo
noisy = false;
#endif
+ if (assign && outer_mask < GOMP_DIM_MASK (GOMP_DIM_MAX - 1))
+ {
+ /* Allocate the outermost loop at the outermost available
+ level. */
+ unsigned this_mask = outer_mask + 1;
+
+ if (!(this_mask & loop->inner))
+ loop->mask = this_mask;
+ }
+
if (loop->child)
- inner_mask |= oacc_loop_auto_partitions (loop->child,
- outer_mask | loop->mask);
+ {
+ unsigned child_mask = outer_mask | loop->mask;
+
+ if (loop->mask || assign)
+ child_mask |= GOMP_DIM_MASK (GOMP_DIM_MAX);
- if ((loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT))
+ loop->inner = oacc_loop_auto_partitions (loop->child, child_mask);
+ }
+
+ if (assign && !loop->mask)
{
+ /* Allocate the loop at the innermost available level. */
unsigned this_mask = 0;
/* Determine the outermost partitioning used within this loop. */
- this_mask = inner_mask | GOMP_DIM_MASK (GOMP_DIM_MAX);
+ this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
this_mask = (this_mask & -this_mask);
/* Pick the partitioning just inside that one. */
@@ -19497,17 +19518,20 @@ oacc_loop_auto_partitions (oacc_loop *lo
warning_at (loop->loc, 0,
"insufficient partitioning available to parallelize loop");
- if (dump_file)
- fprintf (dump_file, "Auto loop %s:%d assigned %d\n",
- LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
- this_mask);
-
loop->mask = this_mask;
}
- inner_mask |= loop->mask;
+
+ if (assign && dump_file)
+ fprintf (dump_file, "Auto loop %s:%d assigned %d\n",
+ LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
+ loop->mask);
+
+ unsigned inner_mask = 0;
if (loop->sibling)
inner_mask |= oacc_loop_auto_partitions (loop->sibling, outer_mask);
+
+ inner_mask |= loop->inner | loop->mask;
return inner_mask;
}
Index: gcc/testsuite/c-c++-common/goacc/loop-auto-1.c
===================================================================
--- gcc/testsuite/c-c++-common/goacc/loop-auto-1.c (revision 235511)
+++ gcc/testsuite/c-c++-common/goacc/loop-auto-1.c (working copy)
@@ -186,10 +186,10 @@ void Worker (void)
for (int jx = 0; jx < 10; jx++) {}
}
-#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+#pragma acc loop auto
for (int ix = 0; ix < 10; ix++)
{
-#pragma acc loop auto
+#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
for (int jx = 0; jx < 10; jx++)
{
#pragma acc loop auto
@@ -214,10 +214,10 @@ void Vector (void)
#pragma acc loop auto
for (int ix = 0; ix < 10; ix++) {}
-#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+#pragma acc loop auto
for (int ix = 0; ix < 10; ix++)
{
-#pragma acc loop auto
+#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
for (int jx = 0; jx < 10; jx++) {}
}
}
next prev reply other threads:[~2016-04-29 14:01 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2016-01-22 22:11 [gomp4] gang partitioning Nathan Sidwell
2016-01-22 21:59 ` [gomp4] fix atomic tests Nathan Sidwell
2016-04-29 14:01 ` Nathan Sidwell [this message]
2016-05-02 7:15 ` [Openacc] Adjust automatic loop partitioning Jakub Jelinek
2016-05-03 10:35 ` Thomas Schwinge
2016-05-04 17:25 ` [PATCH] tail merge ICE Nathan Sidwell
2016-05-06 10:32 ` Richard Biener
2016-06-10 10:25 ` Thomas Schwinge
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=b0e06bc4-1414-6858-5ca9-a0782fa70094@acm.org \
--to=nathan@acm.org \
--cc=gcc-patches@gcc.gnu.org \
--cc=jakub@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).