public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [gomp4.1] Doacross library implementation
@ 2015-09-24 20:04 Jakub Jelinek
  2015-09-29 18:33 ` [gomp4.1] Fixup handling of doacross loops with noreturn body Jakub Jelinek
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Jakub Jelinek @ 2015-09-24 20:04 UTC (permalink / raw)
  To: gcc-patches; +Cc: Torvald Riegel, Aldy Hernandez, Richard Henderson

Hi!

This patch implements DOACROSS in the library, so far only as busy waiting
and even without exponential (or some guess based on distance) backoff.
Torvald, can you please have a look at it, if I got all the atomics / memory
models right?  The testcase obviously is not a good benchmark, we'll need
some more realistic one.  But obviously when asking for oversubscription, it
is quite expensive.  The question is how to implement a non-busy waiting
fallback, whether we put some mutex and queue guarded by the mutex into the
same (or some other?) cache-line, or just use atomics to queue it and how to
make it cheap for the case where busy waiting is sufficient.  I'd say
it should be sufficient to implement non-busy waiting in the flattened
variant.

As for the compiler side, I'll first adjust for the pending ticket (which
changes meaning of the ordered(n) clause if collapse(m) m > 1 is present),
then there is a bug with ordered loops that have noreturn body (need to add
some edge for that case and condition checking), lastprivate also needs
checking for all the cases, and finally more thinking on the conservative
dependence folding, where there are just too many issues unresolved right
now.

2015-09-24  Jakub Jelinek  <jakub@redhat.com>

	* gimplify.c (gimplify_omp_for): Don't adjust lastprivate
	on ordered loops above collapse.
	* omp-low.c (expand_omp_ordered_source): Rewritten to pass
	address of an array of indices.
	(expand_omp_ordered_source_sink): Create the VAR_DECL for it.
	(expand_omp_for_ordered_loops): Initialize and update the
	array elements.
	(expand_omp_for_generic): Likewise.  Move counts array one
	element back, so that collapsed loops are multiplied by correct
	counts.
	(lower_omp_ordered): Avoid the conservative dependence folding
	for now, it has too many issues.
	* omp-builtins.def (BUILT_IN_GOMP_DOACROSS_POST): Change
	type to BT_FN_VOID_PTR.
gcc/testsuite/
	* gcc.dg/gomp/sink-fold-1.c: Xfail.
	* gcc.dg/gomp/sink-fold-2.c: Likewise.
libgomp/
	* ordered.c: Include string.h and doacross.h.
	(gomp_doacross_init): New function.
	(GOMP_doacross_wait): Implement.
	(GOMP_doacross_post): Likewise.  Change arguments to
	pointer to long array.
	* loop.c (gomp_loop_doacross_static_start,
	gomp_loop_doacross_dynamic_start,
	gomp_loop_doacross_guided_start): Call gomp_doacross_init.
	* libgomp_g.h (GOMP_doacross_post): Adjust prototype.
	* libgomp.h (struct gomp_doacross_work_share): New type.
	(struct gomp_work_share): Put ordered_team_ids into anonymous
	union with new doacross field.
	* config/linux/doacross.h: New file.
	* config/posix/doacross.h: New file.
	* testsuite/libgomp.c/doacross-1.c: New test.

--- gcc/gimplify.c.jj	2015-09-18 18:38:17.000000000 +0200
+++ gcc/gimplify.c	2015-09-24 19:05:43.607556246 +0200
@@ -7788,6 +7788,10 @@ gimplify_omp_for (tree *expr_p, gimple_s
 						 (OMP_FOR_INIT (for_stmt))
 					       * 2);
     }
+  int collapse = 1;
+  c = find_omp_clause (OMP_FOR_CLAUSES (for_stmt), OMP_CLAUSE_COLLAPSE);
+  if (c)
+    collapse = tree_to_shwi (OMP_CLAUSE_COLLAPSE_EXPR (c));
   for (i = 0; i < TREE_VEC_LENGTH (OMP_FOR_INIT (for_stmt)); i++)
     {
       t = TREE_VEC_ELT (OMP_FOR_INIT (for_stmt), i);
@@ -8104,8 +8108,9 @@ gimplify_omp_for (tree *expr_p, gimple_s
 	  OMP_CLAUSE_LINEAR_STEP (c2) = OMP_CLAUSE_LINEAR_STEP (c);
 	}
 
-      if ((var != decl || TREE_VEC_LENGTH (OMP_FOR_INIT (for_stmt)) > 1)
-	  && orig_for_stmt == for_stmt)
+      if ((var != decl || collapse > 1)
+	  && orig_for_stmt == for_stmt
+	  && i < collapse)
 	{
 	  for (c = OMP_FOR_CLAUSES (for_stmt); c ; c = OMP_CLAUSE_CHAIN (c))
 	    if (((OMP_CLAUSE_CODE (c) == OMP_CLAUSE_LASTPRIVATE
--- gcc/omp-low.c.jj	2015-09-18 18:38:17.000000000 +0200
+++ gcc/omp-low.c	2015-09-24 18:06:31.174495644 +0200
@@ -7071,26 +7071,11 @@ static void
 expand_omp_ordered_source (gimple_stmt_iterator *gsi, struct omp_for_data *fd,
 			   tree *counts, location_t loc)
 {
-  auto_vec<tree, 10> args;
   enum built_in_function source_ix = BUILT_IN_GOMP_DOACROSS_POST;
-  tree t;
-  int i;
-
-  for (i = fd->collapse - 1; i < fd->collapse + fd->ordered - 1; i++)
-    if (i == fd->collapse - 1 && fd->collapse > 1)
-      args.quick_push (fd->loop.v);
-    else if (counts[i])
-      args.safe_push (counts[i]);
-    else
-      {
-	t = fold_build2_loc (loc, MINUS_EXPR, TREE_TYPE (fd->loops[i].v),
-			     fd->loops[i].v, fd->loops[i].n1);
-	t = fold_convert_loc (loc, fd->iter_type, t);
-	t = force_gimple_operand_gsi (gsi, t, true, NULL_TREE,
-				      true, GSI_SAME_STMT);
-	args.safe_push (t);
-      }
-  gimple g = gimple_build_call_vec (builtin_decl_explicit (source_ix), args);
+  gimple g
+    = gimple_build_call (builtin_decl_explicit (source_ix), 1,
+			 build_fold_addr_expr (counts[fd->collapse
+						      + fd->ordered - 1]));
   gimple_set_location (g, loc);
   gsi_insert_before (gsi, g, GSI_SAME_STMT);
 }
@@ -7276,6 +7261,9 @@ expand_omp_ordered_source_sink (struct o
       counts[i] = NULL_TREE;
     else
       counts[i] = create_tmp_var (fd->iter_type, ".orditer");
+  tree atype = build_array_type_nelts (fd->iter_type, fd->ordered);
+  counts[fd->collapse + fd->ordered - 1] = create_tmp_var (atype, ".orditera");
+  TREE_ADDRESSABLE (counts[fd->collapse + fd->ordered - 1]) = 1;
 
   for (inner = region->inner; inner; inner = inner->next)
     if (inner->type == GIMPLE_OMP_ORDERED)
@@ -7315,6 +7303,11 @@ expand_omp_for_ordered_loops (struct omp
 	  tree type = TREE_TYPE (fd->loops[i].v);
 	  tree n1 = fold_convert (type, fd->loops[i].n1);
 	  expand_omp_build_assign (&gsi, fd->loops[i].v, n1);
+	  tree aref = build4 (ARRAY_REF, fd->iter_type,
+			      counts[fd->collapse + fd->ordered - 1],
+			      size_int (i - fd->collapse + 1),
+			      NULL_TREE, NULL_TREE);
+	  expand_omp_build_assign (&gsi, aref, build_zero_cst (fd->iter_type));
 	}
       return NULL;
     }
@@ -7328,6 +7321,11 @@ expand_omp_for_ordered_loops (struct omp
       if (counts[i])
 	expand_omp_build_assign (&gsi, counts[i],
 				 build_zero_cst (fd->iter_type));
+      tree aref = build4 (ARRAY_REF, fd->iter_type,
+			  counts[fd->collapse + fd->ordered - 1],
+			  size_int (i - fd->collapse + 1),
+			  NULL_TREE, NULL_TREE);
+      expand_omp_build_assign (&gsi, aref, build_zero_cst (fd->iter_type));
       if (!gsi_end_p (gsi))
 	gsi_prev (&gsi);
       else
@@ -7350,7 +7348,20 @@ expand_omp_for_ordered_loops (struct omp
 	  t = fold_build2 (PLUS_EXPR, fd->iter_type, counts[i],
 			   build_int_cst (fd->iter_type, 1));
 	  expand_omp_build_assign (&gsi, counts[i], t);
+	  t = counts[i];
 	}
+      else
+	{
+	  t = fold_build2 (MINUS_EXPR, TREE_TYPE (fd->loops[i].v),
+			   fd->loops[i].v, fd->loops[i].n1);
+	  t = fold_convert (fd->iter_type, t);
+	  t = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE,
+					true, GSI_SAME_STMT);
+	}
+      aref = build4 (ARRAY_REF, fd->iter_type,
+		     counts[fd->collapse + fd->ordered - 1],
+		     size_int (i - fd->collapse + 1), NULL_TREE, NULL_TREE);
+      expand_omp_build_assign (&gsi, aref, t);
       gsi_prev (&gsi);
       edge e2 = split_block (cont_bb, gsi_stmt (gsi));
       basic_block new_header = e2->dest;
@@ -7525,7 +7536,7 @@ expand_omp_for_generic (struct omp_regio
       basic_block zero_iter1_bb = NULL, zero_iter2_bb = NULL, l2_dom_bb = NULL;
 
       counts = XALLOCAVEC (tree, fd->collapse
-				 + (fd->ordered ? fd->ordered - 1 : 0));
+				 + (fd->ordered ? fd->ordered - 1 + 1 : 0));
       expand_omp_for_init_counts (fd, &gsi, entry_bb, counts,
 				  zero_iter1_bb, first_zero_iter1,
 				  zero_iter2_bb, first_zero_iter2, l2_dom_bb);
@@ -7910,17 +7921,50 @@ expand_omp_for_generic (struct omp_regio
     expand_omp_for_init_vars (fd, &gsi, counts, inner_stmt, startvar);
 
   if (fd->ordered)
-    expand_omp_ordered_source_sink (region, fd, counts, cont_bb);
-  cont_bb = expand_omp_for_ordered_loops (fd, counts, cont_bb, l1_bb);
-  if (fd->ordered && counts[fd->collapse - 1])
     {
-      gcc_assert (fd->collapse == 1);
+      /* Until now, counts array contained number of iterations or
+	 variable containing it for ith loop.  From now on, we need
+	 those counts only for collapsed loops, and only for the 2nd
+	 till the last collapsed one.  Move those one element earlier,
+	 we'll use counts[fd->collapse - 1] for the first source/sink
+	 iteration counter and so on and counts[fd->collapse + fd->ordered - 1]
+	 as the array holding the current counter values for
+	 depend(source).  */
+      if (fd->collapse > 1)
+	memmove (counts, counts + 1, (fd->collapse - 1) * sizeof (counts[0]));
+      expand_omp_ordered_source_sink (region, fd, counts, cont_bb);
+      cont_bb = expand_omp_for_ordered_loops (fd, counts, cont_bb, l1_bb);
+      if (counts[fd->collapse - 1])
+	{
+	  gcc_assert (fd->collapse == 1);
+	  gsi = gsi_last_bb (l0_bb);
+	  expand_omp_build_assign (&gsi, counts[fd->collapse - 1],
+				   istart0, true);
+	  gsi = gsi_last_bb (cont_bb);
+	  t = fold_build2 (PLUS_EXPR, fd->iter_type, counts[fd->collapse - 1],
+			   build_int_cst (fd->iter_type, 1));
+	  expand_omp_build_assign (&gsi, counts[fd->collapse - 1], t);
+	  tree aref = build4 (ARRAY_REF, fd->iter_type,
+			      counts[fd->collapse + fd->ordered - 1],
+			      size_zero_node, NULL_TREE, NULL_TREE);
+	  expand_omp_build_assign (&gsi, aref, counts[fd->collapse - 1]);
+	  t = counts[fd->collapse - 1];
+	}
+      else if (fd->collapse > 1)
+	t = fd->loop.v;
+      else
+	{
+	  t = fold_build2 (MINUS_EXPR, TREE_TYPE (fd->loops[0].v),
+			   fd->loops[0].v, fd->loops[0].n1);
+	  t = fold_convert (fd->iter_type, t);
+	}
       gsi = gsi_last_bb (l0_bb);
-      expand_omp_build_assign (&gsi, counts[fd->collapse - 1], istart0, true);
-      gsi = gsi_last_bb (cont_bb);
-      t = fold_build2 (PLUS_EXPR, fd->iter_type, counts[fd->collapse - 1],
-		       build_int_cst (fd->iter_type, 1));
-      expand_omp_build_assign (&gsi, counts[fd->collapse - 1], t);
+      tree aref = build4 (ARRAY_REF, fd->iter_type,
+			  counts[fd->collapse + fd->ordered - 1],
+			  size_zero_node, NULL_TREE, NULL_TREE);
+      t = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE,
+				    false, GSI_CONTINUE_LINKING);
+      expand_omp_build_assign (&gsi, aref, t, true);
     }
 
   if (!broken_loop)
@@ -7946,6 +7990,24 @@ expand_omp_for_generic (struct omp_regio
 	  assign_stmt = gimple_build_assign (vback, t);
 	  gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
 
+	  if (fd->ordered && counts[fd->collapse - 1] == NULL_TREE)
+	    {
+	      if (fd->collapse > 1)
+		t = fd->loop.v;
+	      else
+		{
+		  t = fold_build2 (MINUS_EXPR, TREE_TYPE (fd->loops[0].v),
+				   fd->loops[0].v, fd->loops[0].n1);
+		  t = fold_convert (fd->iter_type, t);
+		}
+	      tree aref = build4 (ARRAY_REF, fd->iter_type,
+				  counts[fd->collapse + fd->ordered - 1],
+				  size_zero_node, NULL_TREE, NULL_TREE);
+	      t = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE,
+					    true, GSI_SAME_STMT);
+	      expand_omp_build_assign (&gsi, aref, t);
+	    }
+
 	  t = build2 (fd->loop.cond_code, boolean_type_node,
 		      DECL_P (vback) && TREE_ADDRESSABLE (vback) ? t : vback,
 		      iend);
@@ -7986,6 +8048,14 @@ expand_omp_for_generic (struct omp_regio
   if (gimple_omp_return_lhs (gsi_stmt (gsi)))
     gimple_call_set_lhs (call_stmt, gimple_omp_return_lhs (gsi_stmt (gsi)));
   gsi_insert_after (&gsi, call_stmt, GSI_SAME_STMT);
+  if (fd->ordered)
+    {
+      tree arr = counts[fd->collapse + fd->ordered - 1];
+      tree clobber = build_constructor (TREE_TYPE (arr), NULL);
+      TREE_THIS_VOLATILE (clobber) = 1;
+      gsi_insert_after (&gsi, gimple_build_assign (arr, clobber),
+			GSI_SAME_STMT);
+    }
   gsi_remove (&gsi, true);
 
   /* Connect the new blocks.  */
@@ -12801,7 +12871,12 @@ lower_omp_ordered (gimple_stmt_iterator
   if (find_omp_clause (gimple_omp_ordered_clauses (ord_stmt),
 		       OMP_CLAUSE_DEPEND))
     {
-      lower_omp_ordered_clauses (gsi_p, ord_stmt, ctx);
+      /* FIXME: This needs to be moved to the expansion to verify various
+	 conditions only testable on cfg with dominators computed, and also
+	 all the depend clauses to be merged still might need to be available
+	 for the runtime checks.  */
+      if (0)
+	lower_omp_ordered_clauses (gsi_p, ord_stmt, ctx);
       return;
     }
 
--- gcc/omp-builtins.def.jj	2015-09-17 09:23:39.000000000 +0200
+++ gcc/omp-builtins.def	2015-09-24 13:33:02.542783166 +0200
@@ -247,7 +247,7 @@ DEF_GOMP_BUILTIN (BUILT_IN_GOMP_ORDERED_
 DEF_GOMP_BUILTIN (BUILT_IN_GOMP_ORDERED_END, "GOMP_ordered_end",
 		  BT_FN_VOID, ATTR_NOTHROW_LEAF_LIST)
 DEF_GOMP_BUILTIN (BUILT_IN_GOMP_DOACROSS_POST, "GOMP_doacross_post",
-		  BT_FN_VOID_LONG_VAR, ATTR_NOTHROW_LEAF_LIST)
+		  BT_FN_VOID_PTR, ATTR_NOTHROW_LEAF_LIST)
 DEF_GOMP_BUILTIN (BUILT_IN_GOMP_DOACROSS_WAIT, "GOMP_doacross_wait",
 		  BT_FN_VOID_LONG_VAR, ATTR_NOTHROW_LEAF_LIST)
 DEF_GOMP_BUILTIN (BUILT_IN_GOMP_PARALLEL, "GOMP_parallel",
--- gcc/testsuite/gcc.dg/gomp/sink-fold-1.c.jj	2015-09-18 18:34:26.000000000 +0200
+++ gcc/testsuite/gcc.dg/gomp/sink-fold-1.c	2015-09-24 19:35:17.463625989 +0200
@@ -28,4 +28,4 @@ funk ()
     }
 }
 
-/* { dg-final { scan-tree-dump-times "omp ordered depend\\(sink:i-2,j-2,k\\+2\\)" 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "omp ordered depend\\(sink:i-2,j-2,k\\+2\\)" 1 "omplower" { xfail *-*-* } } } */
--- gcc/testsuite/gcc.dg/gomp/sink-fold-2.c.jj	2015-08-24 14:32:06.000000000 +0200
+++ gcc/testsuite/gcc.dg/gomp/sink-fold-2.c	2015-09-24 19:37:06.054038766 +0200
@@ -11,8 +11,8 @@ funk ()
   for (i=0; i < N; i += 3)
     for (j=0; j < N; ++j)
     {
-#pragma omp ordered depend(sink:i-8,j-1) /* { dg-warning "ignoring sink clause with offset that is not a multiple" } */
-#pragma omp ordered depend(sink:i+3,j-1) /* { dg-error "first offset must be in opposite direction" } */
+#pragma omp ordered depend(sink:i-8,j-1) /* { dg-warning "ignoring sink clause with offset that is not a multiple" "" { xfail *-*-* } } */
+#pragma omp ordered depend(sink:i+3,j-1) /* { dg-error "first offset must be in opposite direction" "" { xfail *-*-* } } */
         bar();
 #pragma omp ordered depend(source)
     }
--- libgomp/ordered.c.jj	2015-09-18 18:36:42.000000000 +0200
+++ libgomp/ordered.c	2015-09-24 18:20:28.286244397 +0200
@@ -27,6 +27,8 @@
 
 #include "libgomp.h"
 #include <stdarg.h>
+#include <string.h>
+#include "doacross.h"
 
 
 /* This function is called when first allocating an iteration block.  That
@@ -252,14 +254,146 @@ GOMP_ordered_end (void)
 {
 }
 
+/* DOACROSS initialization.  */
+
+#define MAX_COLLAPSED_BITS (__SIZEOF_LONG__ * __CHAR_BIT__)
+
+void
+gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+  struct gomp_work_share *ws = thr->ts.work_share;
+  unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
+  unsigned long ent, num_ents, elt_sz, shift_sz;
+  struct gomp_doacross_work_share *doacross;
+
+  if (team == NULL || team->nthreads == 1)
+    return;
+
+  for (i = 0; i < ncounts; i++)
+    {
+      /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
+      if (counts[i] == 0)
+	return;
+
+      if (num_bits <= MAX_COLLAPSED_BITS)
+	{
+	  unsigned int this_bits;
+	  if (counts[i] == 1)
+	    this_bits = 1;
+	  else
+	    this_bits = __SIZEOF_LONG__ * __CHAR_BIT__
+			- __builtin_clzl (counts[i] - 1);
+	  if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
+	    {
+	      bits[i] = this_bits;
+	      num_bits += this_bits;
+	    }
+	  else
+	    num_bits = MAX_COLLAPSED_BITS + 1;
+	}
+    }
+
+  if (ws->sched == GFS_STATIC)
+    num_ents = team->nthreads;
+  else
+    num_ents = (counts[0] - 1) / chunk_size + 1;
+  if (num_bits <= MAX_COLLAPSED_BITS)
+    {
+      elt_sz = sizeof (unsigned long);
+      shift_sz = ncounts * sizeof (unsigned int);
+    }
+  else
+    {
+      elt_sz = sizeof (unsigned long) * ncounts;
+      shift_sz = 0;
+    }
+  elt_sz = (elt_sz + 63) & ~63UL;
+
+  doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
+			  + shift_sz);
+  doacross->chunk_size = chunk_size;
+  doacross->elt_sz = elt_sz;
+  doacross->ncounts = ncounts;
+  doacross->flattened = false;
+  doacross->boundary = 0;
+  doacross->array = (unsigned char *)
+		    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
+		     & ~(uintptr_t) 63);
+  if (num_bits <= MAX_COLLAPSED_BITS)
+    {
+      unsigned int shift_count = 0;
+      doacross->flattened = true;
+      for (i = ncounts; i > 0; i--)
+	{
+	  doacross->shift_counts[i - 1] = shift_count;
+	  shift_count += bits[i - 1];
+	}
+      for (ent = 0; ent < num_ents; ent++)
+	*(unsigned long *) (doacross->array + ent * elt_sz) = 0;
+    }
+  else
+    for (ent = 0; ent < num_ents; ent++)
+      memset (doacross->array + ent * elt_sz, '\0',
+	      sizeof (unsigned long) * ncounts);
+  if (ws->sched == GFS_STATIC && chunk_size == 0)
+    {
+      unsigned long q = counts[0] / num_ents;
+      unsigned long t = counts[0] % num_ents;
+      doacross->boundary = t * (q + 1);
+      doacross->q = q;
+      doacross->t = t;
+    }
+  ws->doacross = doacross;
+}
+
 /* DOACROSS POST operation.  */
 
 void
-GOMP_doacross_post (long first, ...)
+GOMP_doacross_post (long *counts)
 {
-  va_list ap;
-  va_start (ap, first);
-  va_end (ap);
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_work_share *ws = thr->ts.work_share;
+  struct gomp_doacross_work_share *doacross = ws->doacross;
+  unsigned long ent;
+  unsigned int i;
+
+  if (__builtin_expect (doacross == NULL, 0))
+    {
+      __sync_synchronize ();
+      return;
+    }
+
+  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
+    ent = thr->ts.team_id;
+  else
+    ent = counts[0] / doacross->chunk_size;
+  unsigned long *array = (unsigned long *) (doacross->array
+					    + ent * doacross->elt_sz);
+
+  if (__builtin_expect (doacross->flattened, 1))
+    {
+      unsigned long flattened
+	= (unsigned long) counts[0] << doacross->shift_counts[0];
+
+      for (i = 1; i < doacross->ncounts; i++)
+	flattened |= (unsigned long) counts[i]
+		     << doacross->shift_counts[i];
+      flattened++;
+      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
+	__atomic_thread_fence (MEMMODEL_RELEASE);
+      else
+	__atomic_store_n (array, flattened, MEMMODEL_RELEASE);
+      return;
+    }
+
+  __atomic_thread_fence (MEMMODEL_ACQUIRE);
+  for (i = doacross->ncounts; i-- > 0; )
+    {
+      if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
+	__atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
+    }
 }
 
 /* DOACROSS WAIT operation.  */
@@ -267,7 +401,81 @@ GOMP_doacross_post (long first, ...)
 void
 GOMP_doacross_wait (long first, ...)
 {
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_work_share *ws = thr->ts.work_share;
+  struct gomp_doacross_work_share *doacross = ws->doacross;
   va_list ap;
-  va_start (ap, first);
-  va_end (ap);
+  unsigned long ent;
+  unsigned int i;
+
+  if (__builtin_expect (doacross == NULL, 0))
+    {
+      __sync_synchronize ();
+      return;
+    }
+
+  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
+    {
+      if (ws->chunk_size == 0)
+	{
+	  if (first < doacross->boundary)
+	    ent = first / (doacross->q + 1);
+	  else
+	    ent = (first - doacross->boundary) / doacross->q
+		  + doacross->t;
+	}
+      else
+	ent = first / ws->chunk_size % thr->ts.team->nthreads;
+    }
+  else
+    ent = first / doacross->chunk_size;
+  unsigned long *array = (unsigned long *) (doacross->array
+					    + ent * doacross->elt_sz);
+
+  if (__builtin_expect (doacross->flattened, 1))
+    {
+      unsigned long flattened
+	= (unsigned long) first << doacross->shift_counts[0];
+      unsigned long cur;
+
+      va_start (ap, first);
+      for (i = 1; i < doacross->ncounts; i++)
+	flattened |= (unsigned long) va_arg (ap, long)
+		     << doacross->shift_counts[i];
+      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
+      if (flattened < cur)
+	{
+	  __atomic_thread_fence (MEMMODEL_RELEASE);
+	  va_end (ap);
+	  return;
+	}
+      doacross_spin (array, flattened, cur);
+      __atomic_thread_fence (MEMMODEL_RELEASE);
+      va_end (ap);
+      return;
+    }
+
+  do
+    {
+      va_start (ap, first);
+      for (i = 0; i < doacross->ncounts; i++)
+	{
+	  unsigned long thisv
+	    = (unsigned long) (i ? va_arg (ap, long) : first) + 1;
+	  unsigned long cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
+	  if (thisv < cur)
+	    {
+	      i = doacross->ncounts;
+	      break;
+	    }
+	  if (thisv > cur)
+	    break;
+	}
+      va_end (ap);
+      if (i == doacross->ncounts)
+	break;
+      cpu_relax ();
+    }
+  while (1);
+  __sync_synchronize ();
 }
--- libgomp/loop.c.jj	2015-09-16 14:21:10.000000000 +0200
+++ libgomp/loop.c	2015-09-22 15:29:40.016583743 +0200
@@ -306,7 +306,7 @@ gomp_loop_doacross_static_start (unsigne
     {
       gomp_loop_init (thr->ts.work_share, 0, counts[0], 1,
 		      GFS_STATIC, chunk_size);
-      /* gomp_ordered_static_init (); */
+      gomp_doacross_init (ncounts, counts, chunk_size);
       gomp_work_share_init_done ();
     }
 
@@ -324,6 +324,7 @@ gomp_loop_doacross_dynamic_start (unsign
     {
       gomp_loop_init (thr->ts.work_share, 0, counts[0], 1,
 		      GFS_DYNAMIC, chunk_size);
+      gomp_doacross_init (ncounts, counts, chunk_size);
       gomp_work_share_init_done ();
     }
 
@@ -349,6 +350,7 @@ gomp_loop_doacross_guided_start (unsigne
     {
       gomp_loop_init (thr->ts.work_share, 0, counts[0], 1,
 		      GFS_GUIDED, chunk_size);
+      gomp_doacross_init (ncounts, counts, chunk_size);
       gomp_work_share_init_done ();
     }
 
--- libgomp/libgomp_g.h.jj	2015-09-17 09:25:23.000000000 +0200
+++ libgomp/libgomp_g.h	2015-09-24 13:33:32.726324481 +0200
@@ -177,7 +177,7 @@ extern bool GOMP_loop_ull_ordered_runtim
 
 extern void GOMP_ordered_start (void);
 extern void GOMP_ordered_end (void);
-extern void GOMP_doacross_post (long, ...);
+extern void GOMP_doacross_post (long *);
 extern void GOMP_doacross_wait (long, ...);
 
 /* parallel.c */
--- libgomp/libgomp.h.jj	2015-09-08 10:18:44.000000000 +0200
+++ libgomp/libgomp.h	2015-09-23 12:25:51.866847300 +0200
@@ -78,6 +78,36 @@ enum gomp_schedule_type
   GFS_AUTO
 };
 
+struct gomp_doacross_work_share
+{
+  union {
+    /* chunk_size copy, as ws->chunk_size is multiplied by incr for
+       GFS_DYNAMIC.  */
+    long chunk_size;
+    /* For schedule(static,0) this is the number
+       of iterations assigned to the last thread, i.e. number of
+       iterations / number of threads.  */
+    long q;
+  };
+  /* Size of each array entry (padded to cache line size).  */
+  unsigned long elt_sz;
+  /* Number of dimensions in sink vectors.  */
+  unsigned int ncounts;
+  /* True if the iterations can be flattened.  */
+  bool flattened;
+  /* Actual array (of elt_sz sized units), aligned to cache line size.
+     This is indexed by team_id for GFS_STATIC and outermost iteration
+     / chunk_size for other schedules.  */
+  unsigned char *array;
+  /* These two are only used for schedule(static,0).  */
+  /* This one is number of iterations % number of threads.  */
+  long t;
+  /* And this one is cached t * (q + 1).  */
+  long boundary;
+  /* Array of shift counts for each dimension if they can be flattened.  */
+  unsigned int shift_counts[];
+};
+
 struct gomp_work_share
 {
   /* This member records the SCHEDULE clause to be used for this construct.
@@ -109,13 +139,18 @@ struct gomp_work_share
     };
   };
 
-  /* This is a circular queue that details which threads will be allowed
-     into the ordered region and in which order.  When a thread allocates
-     iterations on which it is going to work, it also registers itself at
-     the end of the array.  When a thread reaches the ordered region, it
-     checks to see if it is the one at the head of the queue.  If not, it
-     blocks on its RELEASE semaphore.  */
-  unsigned *ordered_team_ids;
+  union {
+    /* This is a circular queue that details which threads will be allowed
+       into the ordered region and in which order.  When a thread allocates
+       iterations on which it is going to work, it also registers itself at
+       the end of the array.  When a thread reaches the ordered region, it
+       checks to see if it is the one at the head of the queue.  If not, it
+       blocks on its RELEASE semaphore.  */
+    unsigned *ordered_team_ids;
+
+    /* This is a pointer to DOACROSS work share data.  */
+    struct gomp_doacross_work_share *doacross;
+  };
 
   /* This is the number of threads that have registered themselves in
      the circular queue ordered_team_ids.  */
@@ -647,6 +682,7 @@ extern void gomp_ordered_next (void);
 extern void gomp_ordered_static_init (void);
 extern void gomp_ordered_static_next (void);
 extern void gomp_ordered_sync (void);
+extern void gomp_doacross_init (unsigned, long *, long);
 
 /* parallel.c */
 
--- libgomp/config/linux/doacross.h.jj	2015-09-23 12:10:32.039275447 +0200
+++ libgomp/config/linux/doacross.h	2015-09-23 12:33:03.861556163 +0200
@@ -0,0 +1,57 @@
+/* Copyright (C) 2015 Free Software Foundation, Inc.
+   Contributed by Jakub Jelinek <jakub@redhat.com>.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This is a Linux specific implementation of doacross spinning.  */
+
+#ifndef GOMP_DOACROSS_H
+#define GOMP_DOACROSS_H 1
+
+#include "libgomp.h"
+#include <errno.h>
+#include "wait.h"
+
+#ifdef HAVE_ATTRIBUTE_VISIBILITY
+# pragma GCC visibility push(hidden)
+#endif
+
+static inline void doacross_spin (unsigned long *addr, unsigned long expected,
+				  unsigned long cur)
+{
+  /* FIXME: back off depending on how large expected - cur is.  */
+  do
+    {
+      cpu_relax ();
+      cur = __atomic_load_n (addr, MEMMODEL_RELAXED);
+      if (expected < cur)
+	return;
+    }
+  while (1);
+}
+
+#ifdef HAVE_ATTRIBUTE_VISIBILITY
+# pragma GCC visibility pop
+#endif
+
+#endif /* GOMP_DOACROSS_H */
--- libgomp/config/posix/doacross.h.jj	2015-09-23 12:17:53.217834221 +0200
+++ libgomp/config/posix/doacross.h	2015-09-24 10:51:18.310081801 +0200
@@ -0,0 +1,62 @@
+/* Copyright (C) 2015 Free Software Foundation, Inc.
+   Contributed by Jakub Jelinek <jakub@redhat.com>.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This is a generic implementation of doacross spinning.  */
+
+#ifndef GOMP_DOACROSS_H
+#define GOMP_DOACROSS_H 1
+
+#include "libgomp.h"
+#include <errno.h>
+
+#ifdef HAVE_ATTRIBUTE_VISIBILITY
+# pragma GCC visibility push(hidden)
+#endif
+
+static inline void
+cpu_relax (void)
+{
+  __asm volatile ("" : : : "memory");
+}
+
+static inline void doacross_spin (unsigned long *addr, unsigned long expected,
+				  unsigned long cur)
+{
+  /* FIXME: back off depending on how large expected - cur is.  */
+  do
+    {
+      cpu_relax ();
+      cur = __atomic_load_n (addr, MEMMODEL_RELAXED);
+      if (expected < cur)
+	return;
+    }
+  while (1);
+}
+
+#ifdef HAVE_ATTRIBUTE_VISIBILITY
+# pragma GCC visibility pop
+#endif
+
+#endif /* GOMP_DOACROSS_H */
--- libgomp/testsuite/libgomp.c/doacross-1.c.jj	2015-09-23 13:38:24.270726352 +0200
+++ libgomp/testsuite/libgomp.c/doacross-1.c	2015-09-24 19:32:40.371922136 +0200
@@ -0,0 +1,182 @@
+extern void abort (void);
+
+#define N 256
+int a[N], b[N / 16][8][4], c[N / 32][8][8];
+volatile int d, e;
+
+int
+main ()
+{
+  int i, j, k, l, m;
+  #pragma omp parallel private (l)
+  {
+    #pragma omp for schedule(static, 1) ordered (1) nowait
+    for (i = 0; i < N; i++)
+      {
+	#pragma omp atomic write
+	a[i] = 1;
+	#pragma omp ordered depend(sink: i - 1)
+	if (i)
+	  {
+	    #pragma omp atomic read
+	    l = a[i - 1];
+	    if (l < 2)
+	      abort ();
+	  }
+	#pragma omp atomic write
+	a[i] = 2;
+	if (i < N - 1)
+	  {
+	    #pragma omp atomic read
+	    l = a[i + 1];
+	    if (l == 3)
+	      abort ();
+	  }
+	#pragma omp ordered depend(source)
+	#pragma omp atomic write
+	a[i] = 3;
+      }
+    #pragma omp for schedule(static, 0) ordered (3) nowait
+    for (i = 2; i < N / 16 - 1; i++)
+      for (j = 0; j < 8; j += 2)
+	for (k = 1; k <= 3; k++)
+	  {
+	    #pragma omp atomic write
+	    b[i][j][k] = 1;
+	    #pragma omp ordered depend(sink: i, j - 2, k - 1) \
+				depend(sink: i - 2, j - 2, k + 1)
+	    #pragma omp ordered depend(sink: i - 3, j + 2, k - 2)
+	    if (j >= 2 && k > 1)
+	      {
+		#pragma omp atomic read
+		l = b[i][j - 2][k - 1];
+		if (l < 2)
+		  abort ();
+	      }
+	    #pragma omp atomic write
+	    b[i][j][k] = 2;
+	    if (i >= 4 && j >= 2 && k < 3)
+	      {
+		#pragma omp atomic read
+		l = b[i - 2][j - 2][k + 1];
+		if (l < 2)
+		  abort ();
+	      }
+	    if (i >= 5 && j < N / 16 - 3 && k == 3)
+	      {
+		#pragma omp atomic read
+		l = b[i - 3][j + 2][k - 2];
+		if (l < 2)
+		  abort ();
+	      }
+	    #pragma omp ordered depend(source)
+	    #pragma omp atomic write
+	    b[i][j][k] = 3;
+	  }
+#define A(n) int n;
+#define B(n) A(n##0) A(n##1) A(n##2) A(n##3)
+#define C(n) B(n##0) B(n##1) B(n##2) B(n##3)
+#define D(n) C(n##0) C(n##1) C(n##2) C(n##3)
+    D(m)
+#undef A
+    #pragma omp for collapse (2) ordered(60) schedule(dynamic, 15)
+    for (i = 0; i < N / 32; i++)
+      for (j = 7; j > 1; j--)
+	for (k = 6; k >= 0; k -= 2)
+#define A(n) for (n = 4; n < 5; n++)
+	  D(m)
+#undef A
+	    {
+	      #pragma omp atomic write
+	      c[i][j][k] = 1;
+#define A(n) ,n
+#define E(n) C(n##0) C(n##1) C(n##2) B(n##30) B(n##31) A(n##320) A(n##321)
+	      #pragma omp ordered depend (sink: i, j, k + 2 E(m)) \
+				  depend (sink:i - 2, j + 1, k - 4 E(m)) \
+				  depend(sink: i - 1, j - 2, k - 2 E(m))
+	      if (k <= 4)
+		{
+		  l = c[i][j][k + 2];
+		  if (l < 2)
+		    abort ();
+		}
+	      #pragma omp atomic write
+	      c[i][j][k] = 2;
+	      if (i >= 2 && j < 7 && k >= 4)
+		{
+		  l = c[i - 2][j + 1][k - 4];
+		  if (l < 2)
+		    abort ();
+		}
+	      if (i >= 1 && j >= 4 && k >= 2)
+		{
+		  l = c[i - 1][j - 2][k - 2];
+		  if (l < 2)
+		    abort ();
+		}
+	      #pragma omp ordered depend (source)
+	      #pragma omp atomic write
+	      c[i][j][k] = 3;
+	    }
+
+    #pragma omp for collapse(2) ordered(3) lastprivate (i, j, k)
+    for (i = 0; i < d + 1; i++)
+      for (j = d + 1; j >= 0; j--)
+	for (k = 0; k < d; k++)
+	  for (l = 0; l < d + 2; l++)
+	    {
+	      #pragma omp ordered depend (source)
+	      #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, l)
+	      if (!e)
+		abort ();
+	    }
+    #pragma omp single
+    {
+      if (i != 1 || j != -1 || k != 0)
+	abort ();
+      i = 8; j = 9; k = 10;
+    }
+    #pragma omp for collapse(2) ordered(3) lastprivate (i, j, k, m)
+    for (i = 0; i < d + 1; i++)
+      for (j = d + 1; j >= 0; j--)
+	for (k = 0; k < d + 2; k++)
+	  for (m = 0; m < d; m++)
+	    {
+	      #pragma omp ordered depend (source)
+	      #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, m)
+	      if (!e)
+		abort ();
+	    }
+    #pragma omp single
+    if (i != 1 || j != -1 || k != 2 || m != 0)
+      abort ();
+    #pragma omp for collapse(2) ordered(3) nowait
+    for (i = 0; i < d + 1; i++)
+      for (j = d; j > 0; j--)
+	for (k = 0; k < d + 2; k++)
+	  for (l = 0; l < d + 4; l++)
+	    {
+	      #pragma omp ordered depend (source)
+	      #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, l)
+	      if (!e)
+		abort ();
+	    }
+    #pragma omp for nowait
+    for (i = 0; i < N; i++)
+      if (a[i] != 3)
+	abort ();
+    #pragma omp for collapse(2) private(k) nowait
+    for (i = 0; i < N / 16; i++)
+      for (j = 0; j < 8; j++)
+	for (k = 0; k < 4; k++)
+	  if (b[i][j][k] != 3 * (i >= 2 && i < N / 16 - 1 && (j & 1) == 0 && k >= 1))
+	    abort ();
+    #pragma omp for collapse(3) nowait
+    for (i = 0; i < N / 32; i++)
+      for (j = 0; j < 8; j++)
+	for (k = 0; k < 8; k++)
+	  if (c[i][j][k] != 3 * (j >= 2 && (k & 1) == 0))
+	    abort ();
+  }
+  return 0;
+}

	Jakub

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2015-10-08 14:26 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-09-24 20:04 [gomp4.1] Doacross library implementation Jakub Jelinek
2015-09-29 18:33 ` [gomp4.1] Fixup handling of doacross loops with noreturn body Jakub Jelinek
2015-10-01 12:08 ` [gomp4.1] Fixup doacross lastprivate handling Jakub Jelinek
2015-10-08 12:48 ` [gomp4.1] Doacross library implementation Torvald Riegel
2015-10-08 14:26   ` Aldy Hernandez

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).