public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [gomp4] gang partitioning
@ 2016-01-22 22:11 Nathan Sidwell
  2016-01-22 21:59 ` [gomp4] fix atomic tests Nathan Sidwell
  0 siblings, 1 reply; 8+ messages in thread
From: Nathan Sidwell @ 2016-01-22 22:11 UTC (permalink / raw)
  To: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 492 bytes --]

I've committed this patch to the gomp4 branch.  It changes the auto-partitioning
logic to allocate the outermost loop to the outermost available partitioning
level.  For instance, gang partitioning will be used for the outermost loop of a
parallel region.  Innermost loops remain partitioned at the innermost
available level.

This means that if we run out of available partitions, we will have parallelized
the outermost loop and the innermost loops, rather than just the innermost loops.

nathan

[-- Attachment #2: gomp4-auto-gang.patch --]
[-- Type: text/x-patch, Size: 7954 bytes --]

2016-01-22  Nathan Sidwell  <nathan@codesourcery.com>

	gcc/
	* omp-low.c (struct oacc_loop): Add 'inner' field.
	(new_oacc_loop_raw): Initialize it to zero.
	(oacc_loop_fixed_partitions): Initialize it.
	(oacc_loop_auto_partitions): Partition outermost loop to outermost
	available partitioning.

	gcc/testsuite/
	* c-c++-common/goacc/loop-auto-1.c: Adjust expected warnings.

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: Adjust
	expected partitioning.

Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c	(revision 232749)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c	(working copy)
@@ -102,9 +102,11 @@ int vector_1 (int *ary, int size)
   
 #pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
   {
+#pragma acc loop gang
+    for (int jx = 0; jx < 1; jx++)
 #pragma acc loop auto
-    for (int ix = 0; ix < size; ix++)
-      ary[ix] = place ();
+      for (int ix = 0; ix < size; ix++)
+	ary[ix] = place ();
   }
 
   return check (ary, size, 0, 0, 1);
@@ -117,7 +119,7 @@ int vector_2 (int *ary, int size)
 #pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
   {
 #pragma acc loop worker
-    for (int jx = 0; jx <  size  / 64; jx++)
+    for (int jx = 0; jx < size  / 64; jx++)
 #pragma acc loop auto
       for (int ix = 0; ix < 64; ix++)
 	ary[ix + jx * 64] = place ();
@@ -132,30 +134,16 @@ int worker_1 (int *ary, int size)
   
 #pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
   {
+#pragma acc loop gang
+    for (int kx = 0; kx < 1; kx++)
 #pragma acc loop auto
-    for (int jx = 0; jx <  size  / 64; jx++)
+      for (int jx = 0; jx <  size  / 64; jx++)
 #pragma acc loop vector
-      for (int ix = 0; ix < 64; ix++)
-	ary[ix + jx * 64] = place ();
-  }
-
-  return check (ary, size, 0, 1, 1);
-}
-
-int worker_2 (int *ary, int size)
-{
-  clear (ary, size);
-  
-#pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
-  {
-#pragma acc loop auto
-    for (int jx = 0; jx <  size  / 64; jx++)
-#pragma acc loop auto
-      for (int ix = 0; ix < 64; ix++)
-	ary[ix + jx * 64] = place ();
+	for (int ix = 0; ix < 64; ix++)
+	  ary[ix + jx * 64] = place ();
   }
 
-  return check (ary, size, 0, 1, 1);
+  return check (ary, size, 0,  1, 1);
 }
 
 int gang_1 (int *ary, int size)
@@ -192,6 +180,22 @@ int gang_2 (int *ary, int size)
   return check (ary, size, 1, 1, 1);
 }
 
+int gang_3 (int *ary, int size)
+{
+  clear (ary, size);
+  
+#pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
+  {
+#pragma acc loop auto
+    for (int jx = 0; jx <  size  / 64; jx++)
+#pragma acc loop auto
+      for (int ix = 0; ix < 64; ix++)
+	ary[ix + jx * 64] = place ();
+  }
+
+  return check (ary, size, 1, 0, 1);
+}
+
 #define N (32*32*32)
 int main ()
 {
@@ -213,13 +217,13 @@ int main ()
 
   if (worker_1 (ary,  N))
     return 1;
-  if (worker_2 (ary,  N))
-    return 1;
   
   if (gang_1 (ary,  N))
     return 1;
   if (gang_2 (ary,  N))
     return 1;
+  if (gang_3 (ary,  N))
+    return 1;
 
   return 0;
 }
Index: gcc/omp-low.c
===================================================================
--- gcc/omp-low.c	(revision 232749)
+++ gcc/omp-low.c	(working copy)
@@ -249,8 +249,9 @@ struct oacc_loop
   tree routine;  /* Pseudo-loop enclosing a routine.  */
 
   unsigned mask;   /* Partitioning mask.  */
-  unsigned flags;   /* Partitioning flags.  */
-  tree chunk_size;   /* Chunk size.  */
+  unsigned inner;  /* Partitioning of inner loops.  */
+  unsigned flags;  /* Partitioning flags.  */
+  tree chunk_size; /* Chunk size.  */
   gcall *head_end; /* Final marker of head sequence.  */
 };
 
@@ -19434,7 +19435,7 @@ new_oacc_loop_raw (oacc_loop *parent, lo
   memset (loop->tails, 0, sizeof (loop->tails));
   loop->routine = NULL_TREE;
 
-  loop->mask = loop->flags = 0;
+  loop->mask = loop->flags = loop->inner = 0;
   loop->chunk_size = 0;
   loop->head_end = NULL;
 
@@ -19941,8 +19942,11 @@ oacc_loop_fixed_partitions (oacc_loop *l
   mask_all |= this_mask;
   
   if (loop->child)
-    mask_all |= oacc_loop_fixed_partitions (loop->child,
-					    outer_mask | this_mask);
+    {
+      loop->inner = oacc_loop_fixed_partitions (loop->child,
+						outer_mask | this_mask); 
+      mask_all |= loop->inner;
+    }
 
   if (loop->sibling)
     mask_all |= oacc_loop_fixed_partitions (loop->sibling, outer_mask);
@@ -19958,7 +19962,7 @@ oacc_loop_fixed_partitions (oacc_loop *l
 static unsigned
 oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask)
 {
-  unsigned inner_mask = 0;
+  bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
   bool noisy = true;
 
 #ifdef ACCEL_COMPILER
@@ -19967,16 +19971,33 @@ oacc_loop_auto_partitions (oacc_loop *lo
   noisy = false;
 #endif
 
+  if (assign && outer_mask < GOMP_DIM_MASK (GOMP_DIM_MAX - 1))
+    {
+      /* Allocate the outermost loop at the outermost available
+	 level.  */
+      unsigned this_mask = outer_mask + 1;
+
+      if (!(this_mask & loop->inner))
+	loop->mask = this_mask;
+    }
+
   if (loop->child)
-    inner_mask |= oacc_loop_auto_partitions (loop->child,
-					     outer_mask | loop->mask);
+    {
+      unsigned child_mask = outer_mask | loop->mask;
+
+      if (loop->mask || assign)
+	child_mask |= GOMP_DIM_MASK (GOMP_DIM_MAX);
 
-  if ((loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT))
+      loop->inner = oacc_loop_auto_partitions (loop->child, child_mask);
+    }
+
+  if (assign && !loop->mask)
     {
+      /* Allocate the loop at the innermost available level.  */
       unsigned this_mask = 0;
       
       /* Determine the outermost partitioning used within this loop. */
-      this_mask = inner_mask | GOMP_DIM_MASK (GOMP_DIM_MAX);
+      this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
       this_mask = (this_mask & -this_mask);
 
       /* Pick the partitioning just inside that one.  */
@@ -19989,17 +20010,20 @@ oacc_loop_auto_partitions (oacc_loop *lo
 	warning_at (loop->loc, 0,
 		    "insufficient partitioning available to parallelize loop");
 
-      if (dump_file)
-	fprintf (dump_file, "Auto loop %s:%d assigned %d\n",
-		 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
-		 this_mask);
-
       loop->mask = this_mask;
     }
-  inner_mask |= loop->mask;
+
+  if (assign && dump_file)
+    fprintf (dump_file, "Auto loop %s:%d assigned %d\n",
+	     LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
+	     loop->mask);
+
+  unsigned inner_mask = 0;
   
   if (loop->sibling)
     inner_mask |= oacc_loop_auto_partitions (loop->sibling, outer_mask);
+  
+  inner_mask |= loop->inner | loop->mask;
 
   return inner_mask;
 }
Index: gcc/testsuite/c-c++-common/goacc/loop-auto-1.c
===================================================================
--- gcc/testsuite/c-c++-common/goacc/loop-auto-1.c	(revision 232749)
+++ gcc/testsuite/c-c++-common/goacc/loop-auto-1.c	(working copy)
@@ -186,10 +186,10 @@ void Worker (void)
 	for (int jx = 0; jx < 10; jx++) {}
       }
 
-#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+#pragma acc loop auto
     for (int ix = 0; ix < 10; ix++)
       {
-#pragma acc loop auto
+#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
 	for (int jx = 0; jx < 10; jx++)
 	  {
 #pragma acc loop auto
@@ -214,10 +214,10 @@ void Vector (void)
 #pragma acc loop auto
     for (int ix = 0; ix < 10; ix++) {}
 
-#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+#pragma acc loop auto
     for (int ix = 0; ix < 10; ix++)
       {
-#pragma acc loop auto
+#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
 	for (int jx = 0; jx < 10; jx++) {}
       }
 }

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2016-06-10 10:25 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-01-22 22:11 [gomp4] gang partitioning Nathan Sidwell
2016-01-22 21:59 ` [gomp4] fix atomic tests Nathan Sidwell
2016-04-29 14:01   ` [Openacc] Adjust automatic loop partitioning Nathan Sidwell
2016-05-02  7:15     ` Jakub Jelinek
2016-05-03 10:35     ` Thomas Schwinge
2016-05-04 17:25       ` [PATCH] tail merge ICE Nathan Sidwell
2016-05-06 10:32         ` Richard Biener
2016-06-10 10:25         ` Thomas Schwinge

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).