public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH/ fortran] PR35423 omp workshare - first patch
@ 2008-06-28 18:00 Vasilis Liaskovitis
  2008-07-30 21:43 ` Jakub Jelinek
  0 siblings, 1 reply; 3+ messages in thread
From: Vasilis Liaskovitis @ 2008-06-28 18:00 UTC (permalink / raw)
  To: fortran, gcc-patches; +Cc: Jakub Jelinek

[-- Attachment #1: Type: text/plain, Size: 1786 bytes --]

Hi,

attached is a first patch implementing parallelization for some
statements within OMP WORKSHARE constructs.

This patch:
- parallelizes array assignments handled by the scalarizer by building
an OMP_FOR
- parallelizes simple where and where..else constructs in the same way.
- clusters all other code (i.e. statements that are not workshared)
into OMP SINGLE blocks
Currently, barriers are placed after every single block or omp_for
workshared construct.

Let me know if someone is interested in reviewing. Any
comments/feedback welcome.

Speedups for the attached bm-*f90 programs using the patched version
with "-O2 -fopenmp" and 1-8 threads:

			1	2	4	8
bm-array	 1.00	 1.98	3.92   7.38
bm-where	1.00	1.94   3.6    7.46

unpatched version has single-thread performance for all threads, since
everything is currently wrapped in omp single. With the patch, the
workshare construct performs the same as as a standard omp do loop for
an array assignment.

bootstrapped and regtested on x86_64 against trunk. Also attached are
some new testcases for workshare constructs.

I have also done some work on the following 2 items, let me know if
you are interested in a patch:
- do dependence analysis between statements so that openmp barriers
are placed only where needed to satisfy dependences. Currently there
is a barrier at the end of every workshared OMP_FOR loop and at the
end of every OMP_SINGLE block.
- workshare array assignments handled with builtin memcpy, memset (see
http://gcc.gnu.org/ml/gcc/2008-04/msg00232.html)

worksharing for intrinsic functions (e.g. SUM, COUNT) is not
implemented. I had some issues debugging this a while back (see thread
above) and haven't looked further into it. I don't plan to do this
unless someone expresses interest.

thanks,

- Vasilis

[-- Attachment #2: pr35423.diff --]
[-- Type: application/octet-stream, Size: 11778 bytes --]

Index: gcc/fortran/trans-expr.c
===================================================================
--- gcc/fortran/trans-expr.c	(revision 137213)
+++ gcc/fortran/trans-expr.c	(working copy)
@@ -4376,6 +4376,10 @@
   rss = NULL;
   if (lss != gfc_ss_terminator)
     {
+
+      /*allow the scalarizer to workshare array assignments*/
+      if(ws_data.workshareflag) ws_data.scalarizer_workshare = true;
+
       /* The assignment needs scalarization.  */
       lss_section = lss;
 
Index: gcc/fortran/trans-array.c
===================================================================
--- gcc/fortran/trans-array.c	(revision 137213)
+++ gcc/fortran/trans-array.c	(working copy)
@@ -2630,41 +2630,85 @@
   tree tmp;
   tree loopbody;
   tree exit_label;
+  tree stmt;
+  tree init;
+  tree incr;
 
-  loopbody = gfc_finish_block (pbody);
+  if (ws_data.workshareflag && ws_data.scalarizer_workshare && (n == loop->dimen-1))
+  {
+    /* we create an openmp for construct for the outermost scalarized loop*/
+    init = make_tree_vec (1);
+    cond = make_tree_vec (1);
+    incr = make_tree_vec (1);
+    /* Cycle statement is implemented with a goto.  Exit statement must not be
+       present for this loop.  */
+    exit_label = gfc_build_label_decl (NULL_TREE);
+    TREE_USED (exit_label) = 1;
 
-  /* Initialize the loopvar.  */
-  gfc_add_modify_expr (&loop->code[n], loop->loopvar[n], loop->from[n]);
+    /* Label for cycle statements (if needed).  */
+    tmp = build1_v (LABEL_EXPR, exit_label);
+    gfc_add_expr_to_block (pbody, tmp);
 
-  exit_label = gfc_build_label_decl (NULL_TREE);
+    stmt = make_node (OMP_FOR);
 
-  /* Generate the loop body.  */
-  gfc_init_block (&block);
+    TREE_TYPE (stmt) = void_type_node;
+    OMP_FOR_BODY (stmt) = loopbody = gfc_finish_block (pbody);
 
-  /* The exit condition.  */
-  cond = fold_build2 (GT_EXPR, boolean_type_node,
-		      loop->loopvar[n], loop->to[n]);
-  tmp = build1_v (GOTO_EXPR, exit_label);
-  TREE_USED (exit_label) = 1;
-  tmp = build3_v (COND_EXPR, cond, tmp, build_empty_stmt ());
-  gfc_add_expr_to_block (&block, tmp);
+    OMP_FOR_CLAUSES (stmt) = ws_data.workshare_clauses;
+    /* Initialize the loopvar.  */
+    TREE_VEC_ELT (init, 0) = build2_v (GIMPLE_MODIFY_STMT, loop->loopvar[n], loop->from[n]);
+    OMP_FOR_INIT (stmt) = init; 
+    /* The exit condition.  */
+    TREE_VEC_ELT (cond, 0) = build2 (LE_EXPR, boolean_type_node, loop->loopvar[n], loop->to[n]);
+    OMP_FOR_COND (stmt) = cond;
+    /* Increment the loopvar.  */
+    tmp = build2 (PLUS_EXPR, gfc_array_index_type,
+	loop->loopvar[n], gfc_index_one_node);
+    TREE_VEC_ELT (incr, 0) = fold_build2 (GIMPLE_MODIFY_STMT,
+	void_type_node, loop->loopvar[n], tmp);
+    OMP_FOR_INCR (stmt) = incr;
 
-  /* The main body.  */
-  gfc_add_expr_to_block (&block, loopbody);
+    ws_data.curr_singleunit = false;
+    gfc_add_expr_to_block (&loop->code[n], stmt);
+  }
+  else
+  {
+    loopbody = gfc_finish_block (pbody);
 
-  /* Increment the loopvar.  */
-  tmp = fold_build2 (PLUS_EXPR, gfc_array_index_type,
-		     loop->loopvar[n], gfc_index_one_node);
-  gfc_add_modify_expr (&block, loop->loopvar[n], tmp);
+    /* Initialize the loopvar.  */
+    gfc_add_modify_expr (&loop->code[n], loop->loopvar[n], loop->from[n]);
 
-  /* Build the loop.  */
-  tmp = gfc_finish_block (&block);
-  tmp = build1_v (LOOP_EXPR, tmp);
-  gfc_add_expr_to_block (&loop->code[n], tmp);
+    exit_label = gfc_build_label_decl (NULL_TREE);
 
-  /* Add the exit label.  */
-  tmp = build1_v (LABEL_EXPR, exit_label);
-  gfc_add_expr_to_block (&loop->code[n], tmp);
+    /* Generate the loop body.  */
+    gfc_init_block (&block);
+
+    /* The exit condition.  */
+    cond = fold_build2 (GT_EXPR, boolean_type_node,
+		       loop->loopvar[n], loop->to[n]);
+    tmp = build1_v (GOTO_EXPR, exit_label);
+    TREE_USED (exit_label) = 1;
+    tmp = build3_v (COND_EXPR, cond, tmp, build_empty_stmt ());
+    gfc_add_expr_to_block (&block, tmp);
+
+    /* The main body.  */
+    gfc_add_expr_to_block (&block, loopbody);
+
+    /* Increment the loopvar.  */
+    tmp = fold_build2 (PLUS_EXPR, gfc_array_index_type,
+		       loop->loopvar[n], gfc_index_one_node);
+    gfc_add_modify_expr (&block, loop->loopvar[n], tmp);
+
+    /* Build the loop.  */
+    tmp = gfc_finish_block (&block);
+    tmp = build1_v (LOOP_EXPR, tmp);
+    gfc_add_expr_to_block (&loop->code[n], tmp);
+
+    /* Add the exit label.  */
+    tmp = build1_v (LABEL_EXPR, exit_label);
+    gfc_add_expr_to_block (&loop->code[n], tmp);
+  }
+  
 }
 
 
Index: gcc/fortran/trans-openmp.c
===================================================================
--- gcc/fortran/trans-openmp.c	(revision 137213)
+++ gcc/fortran/trans-openmp.c	(working copy)
@@ -35,6 +35,7 @@
 #include "trans-const.h"
 #include "arith.h"
 
+omp_workshare_data ws_data;
 
 /* True if OpenMP should privatize what this DECL points to rather
    than the DECL itself.  */
@@ -1518,6 +1519,7 @@
   return stmt;
 }
 
+
 static tree
 gfc_trans_omp_task (gfc_code *code)
 {
@@ -1546,8 +1548,181 @@
 static tree
 gfc_trans_omp_workshare (gfc_code *code, gfc_omp_clauses *clauses)
 {
-  /* XXX */
-  return gfc_trans_omp_single (code, clauses);
+  tree res, tmp, omp_clauses = NULL_TREE, stmt;
+  stmtblock_t block, *pblock = NULL;
+  stmtblock_t singleblock; 
+  gfc_omp_clauses ws_clauses;
+  tree workshare_clauses_wait;
+
+  bool singleblock_inprogress = false;
+
+  /* build tree node for default omp clauses and for nowait omp clauses*/
+  if (clauses->sched_kind == OMP_SCHED_NONE)
+    clauses->sched_kind = OMP_SCHED_STATIC;
+  omp_clauses = gfc_trans_omp_clauses (pblock, clauses, code->loc);
+  
+  /* build tree node for worksharing do clauses*/
+  memset (&ws_clauses, 0, sizeof (ws_clauses));
+  ws_clauses.sched_kind = OMP_SCHED_STATIC;
+  ws_clauses.chunk_size = clauses->chunk_size;
+  ws_clauses.ordered = clauses->ordered;
+  /* worksharing only outermost loops */
+  ws_clauses.collapse = 1; 
+  /* no depenedence analysis yet, we wait after each single 
+     block or wokshared gfc_code */
+  ws_clauses.nowait = false;
+
+  workshare_clauses_wait = gfc_trans_omp_clauses (pblock, &ws_clauses, code->loc);
+  
+  code = code->block->next; 
+
+  pushlevel (0);
+
+  if (!code)
+    return build_empty_stmt ();
+
+  gfc_start_block (&block);
+  pblock = █
+
+  ws_data.workshareflag = true;
+  ws_data.prev_singleunit = false;
+
+
+  /* Translate statements one by one to GIMPLE trees until we reach 
+     the end of the workshare construct.  Adjacent gfc_codes that 
+     are a single unit of work are clustered and encapsulated in a 
+     single OMP_SINGLE construct.  */
+  for (; code; code = code->next)
+    {
+      if (code->here != 0)
+	{
+	  res = gfc_trans_label_here (code);
+	  gfc_add_expr_to_block (pblock, res);
+	}
+
+      /* no dependence analysis yet, we wait at the end of every 
+         single block or workshared construct */
+      if (code->next == NULL) 
+        ws_data.workshare_clauses = omp_clauses;
+      else
+        ws_data.workshare_clauses = workshare_clauses_wait;
+
+      /* by default, every gfc_code is a single unit of work */
+      ws_data.curr_singleunit = true;
+      ws_data.scalarizer_workshare = false;
+
+      switch (code->op)
+	{
+	case EXEC_NOP:
+	  res = NULL_TREE;
+	  break;
+
+	case EXEC_ASSIGN:
+	  res = gfc_trans_assign (code);
+	  break;
+
+	case EXEC_POINTER_ASSIGN:
+	  res = gfc_trans_pointer_assign (code);
+	  break;
+
+	case EXEC_INIT_ASSIGN:
+	  res = gfc_trans_init_assign (code);
+	  break;
+
+	case EXEC_FORALL:
+	  res = gfc_trans_forall (code);
+	  break;
+
+	case EXEC_WHERE:
+	  res = gfc_trans_where (code);
+	  break;
+
+	case EXEC_OMP_ATOMIC:
+	case EXEC_OMP_PARALLEL:
+	  res = gfc_trans_omp_directive (code);
+          break;
+
+	case EXEC_OMP_BARRIER:
+	case EXEC_OMP_CRITICAL:
+	  res = gfc_trans_omp_directive (code);
+          /* barriers and critical regions are executed by each thread */
+          ws_data.curr_singleunit = false;
+	  break;
+	
+	default:
+	  internal_error ("gfc_trans_omp_workshare(): Bad statement code");
+	}
+
+      gfc_set_backend_locus (&code->loc);
+      
+      if (res != NULL_TREE && ! IS_EMPTY_STMT (res))
+      {
+	if (TREE_CODE (res) == STATEMENT_LIST)
+	  annotate_all_with_locus (&res, input_location);
+	else
+	  SET_EXPR_LOCATION (res, input_location);
+
+	if(ws_data.prev_singleunit)
+	{
+
+	  if(ws_data.curr_singleunit)
+	  {
+	    /* add current gfc_code to single block */
+	    gfc_add_expr_to_block (&singleblock, res);
+	  }
+	  else 
+	  {
+	    /* finish single block and add it to pblock */
+	    tmp = gfc_finish_block (&singleblock);
+	    tmp = build2_v (OMP_SINGLE, tmp, ws_data.workshare_clauses);
+	    gfc_add_expr_to_block (pblock, tmp);
+	    /* add current gfc_code to pblock */
+	    gfc_add_expr_to_block (pblock, res);
+	    singleblock_inprogress = false;
+	  }
+	}
+	else 
+	{
+	  if(ws_data.curr_singleunit)
+	  {
+	    /* start single block */
+	    gfc_init_block (&singleblock);
+	    gfc_add_expr_to_block (&singleblock, res);
+	    singleblock_inprogress = true;
+	  }
+	  else
+	    /* Add the new statement to the block.  */
+	    gfc_add_expr_to_block (pblock, res);
+	}
+	ws_data.prev_singleunit = ws_data.curr_singleunit;
+      }
+    }
+  /* finish remaining SINGLE block, if we were in the middle of one */
+  if(singleblock_inprogress)
+    {
+      /* finish single block and add it to pblock */
+      tmp = gfc_finish_block (&singleblock);
+      tmp = build2_v (OMP_SINGLE, tmp, omp_clauses);
+      gfc_add_expr_to_block (pblock, tmp);
+      singleblock_inprogress = false;
+    }
+
+  stmt = gfc_finish_block (pblock);
+  if (TREE_CODE (stmt) != BIND_EXPR)
+    {
+      if (!IS_EMPTY_STMT (stmt))
+	{
+	  tree bindblock = poplevel (1, 0, 0);
+	  stmt = build3_v (BIND_EXPR, NULL, stmt, bindblock);
+	}
+      else
+	poplevel (0, 0, 0);
+    }
+  else
+    poplevel (0, 0, 0);
+
+  ws_data.workshareflag = false;
+  return stmt;
 }
 
 tree
@@ -1591,3 +1766,4 @@
       gcc_unreachable ();
     }
 }
+
Index: gcc/fortran/trans-stmt.c
===================================================================
--- gcc/fortran/trans-stmt.c	(revision 137213)
+++ gcc/fortran/trans-stmt.c	(working copy)
@@ -39,6 +39,7 @@
 #include "arith.h"
 #include "dependency.h"
 
+
 typedef struct iter_info
 {
   tree var;
@@ -3584,6 +3585,8 @@
   gfc_ss *edss = 0;
   gfc_ss *esss = 0;
 
+  /*allow the scalarizer to workshare simple where loops*/
+  if(ws_data.workshareflag) ws_data.scalarizer_workshare = true;
   cond = cblock->expr;
   tdst = cblock->next->expr;
   tsrc = cblock->next->expr2;
Index: gcc/fortran/trans.c
===================================================================
--- gcc/fortran/trans.c	(revision 137213)
+++ gcc/fortran/trans.c	(working copy)
@@ -1170,6 +1170,7 @@
 void
 gfc_generate_code (gfc_namespace * ns)
 {
+  ws_data.workshareflag = false;
   if (ns->is_block_data)
     {
       gfc_generate_block_data (ns);
Index: gcc/fortran/trans.h
===================================================================
--- gcc/fortran/trans.h	(revision 137213)
+++ gcc/fortran/trans.h	(working copy)
@@ -745,5 +745,15 @@
 extern const char gfc_msg_fault[];
 extern const char gfc_msg_wrong_return[];
 
+typedef struct omp_workshare_data {
+  bool workshareflag; /* true if in a workshare construct */
+  bool prev_singleunit; /*true if previous gfc_code in workshare construct is not workshared*/
+  bool curr_singleunit; /*true if current gfc_code in workshare construct is not workshared*/
+  bool scalarizer_workshare; /*true if scalarizer should parallelize loops*/
+  tree workshare_clauses;
+}
+omp_workshare_data;
 
+extern omp_workshare_data ws_data;
+
 #endif /* GFC_TRANS_H */

[-- Attachment #3: pr35423.changelog --]
[-- Type: application/octet-stream, Size: 864 bytes --]


2008-06-23  Vasilis Liaskovitis <vliaskov@gmail.com>

        * trans.h: Add omp_workshare_data data structure definition for worksharing
        * trans-openmp.c (gfc_trans_omp_workshare): Change gfc_trans_omp_workshare. 
        Treat all gfc_codes as a single unit of work unless worksharing is enabled
        for a particular gfc_code. Adjacent single units of work are clustered in 
        one SINGLE openmp construct. 
        * trans-array.c (gfc_trans_scalarized_loop_end): Create OMP_FOR loop for 
        outermost loop if worksharing is toggled 
        * trans-expr.c (gfc_trans_assignment_1): toggle worksharing flag for array 
        assignments 
        * trans-stmt.c (gfc_trans_where_3): toggle worksharing flag for simple 
        where and where..else statements 
        * trans.c (gfc_generate_code): initialize worksharing flag to false 

[-- Attachment #4: pr35423-testcases.tar.gz --]
[-- Type: application/x-gzip, Size: 786 bytes --]

[-- Attachment #5: bm-pr35423-array.f90 --]
[-- Type: application/octet-stream, Size: 389 bytes --]


! { dg-do compile }
      PROGRAM A11
      PARAMETER (N=512)
      REAL AA(N,N), BB(N,N)
      CALL A11_1(AA, BB, N)
      END PROGRAM A11

      SUBROUTINE A11_1(AA, BB, N)
      REAL AA(N,N), BB(N,N)
      INTEGER N
      INTEGER I
      DO I=1,262144
!$OMP PARALLEL 
!$OMP WORKSHARE 
            AA = BB
!$OMP END WORKSHARE
!$OMP END PARALLEL
      END DO
      END SUBROUTINE A11_1


[-- Attachment #6: bm-pr35423-where.f90 --]
[-- Type: application/octet-stream, Size: 492 bytes --]


! { dg-do compile }
      PROGRAM A11
      PARAMETER (N=512)
      REAL AA(N,N), BB(N,N)
      CALL A11_1(AA, BB, N)
      END PROGRAM A11

      SUBROUTINE A11_1(AA, BB, N)
      REAL AA(N,N), BB(N,N)
      INTEGER N
      INTEGER I,J
      DO I=1,N
      DO J=1,N
        BB(I,I) = 2.0 
      END DO
      END DO
      DO I=1,262144
!$OMP PARALLEL 
!$OMP WORKSHARE 
            WHERE (BB .ne. 0) AA = 1 / BB
!$OMP END WORKSHARE
!$OMP END PARALLEL
      END DO
      END SUBROUTINE A11_1


^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH/ fortran] PR35423 omp workshare - first patch
  2008-06-28 18:00 [PATCH/ fortran] PR35423 omp workshare - first patch Vasilis Liaskovitis
@ 2008-07-30 21:43 ` Jakub Jelinek
  2008-08-08  6:12   ` Vasilis Liaskovitis
  0 siblings, 1 reply; 3+ messages in thread
From: Jakub Jelinek @ 2008-07-30 21:43 UTC (permalink / raw)
  To: Vasilis Liaskovitis; +Cc: fortran, gcc-patches

Hi!

On Sat, Jun 28, 2008 at 12:52:34PM -0500, Vasilis Liaskovitis wrote:
> attached is a first patch implementing parallelization for some
> statements within OMP WORKSHARE constructs.

Sorry for the delay, your mail was accidentally caught by SPAM filtering,
so I only discovered it much later when looking why some mails got lost.

First of all, do you have a Copyright assignment on file?  That's the
precondition of getting the code accepted.

The patch generally looks good, but there are a couple of issues:

1) I don't see why ws_data.prev_singleunit can't be a local variable in
   gfc_trans_omp_workshare, no need to make it a field in the global var
2) ws_data.workshare_clauses is problematic; the gfc_trans_omp_workshare
   argument to gfc_trans_omp_workshare will only contain nowait or nothing,
   so you should just pass that info around as a bool in ws_data, and
   let both OMP_FOR and OMP_SINGLE create their own clauses tree chain
   - it shouldn't be shared anyway and for OMP_SINGLE most of the
   OMP_FOR-ish clauses shouldn't appear.  BTW, no need to add collapse(1)
   clause, that's the default if no collapse clause is present.
3) ws_data as a global variable is problematic with nesting.  Say
!$omp workshare
  a(:) = b(:)
  !$omp parallel
  c(:) = d(:)
  !$omp workshare
  e(:) = f(:)
  !$omp end workshare
  !$omp end parallel
  g(:) = h(:)
!$omp end workshare
   This could be solved e.g. by copying ws_data into a temporary variable,
   clearing it and restoring afterwards around gfc_trans_omp_directive
   for EXEC_OMP_PARALLEL*.
4) EXEC_OMP_BARRIER must not appear in !$omp workshare, so you shouldn't
   handle it there
5) /* barriers and critical regions are executed by each thread */
   ws_data.curr_singleunit = false;
   for EXEC_OMP_CRITICAL is wrong, the critical region is a single unit
   of work and as such executed just by one of the threads.  But
   you want to clear or save/clear/restore some ws_data fields
   around gfc_trans_omp_directive for EXEC_OMP_CRITICAL too
   - the whole critical is single unit of work, so OMP_FOR shouldn't
   be created in it.
6) you only handle EXEC_OMP_PARALLEL in gfc_trans_omp_workshare,
   but you should handle EXEC_OMP_PARALLEL_{DO,SECTIONS,WORKSHARE}
   the same way
7) please watch formatting
   - space between if and (
   - < 80 column lines
   - {} for if are idented 2 more spaces from if, the body another
     2
   - comments should be sentences, starting with capital letter
     and ending with ., followed by 2 spaces, not just one
   - single statement then or else blocks for if don't need {}s around it
8) now that trunk has been tuplified, you'll need to change slightly
   the creation of OMP_FOR (GIMPLE_MODIFY_STMT is gone)

> I have also done some work on the following 2 items, let me know if
> you are interested in a patch:
> - do dependence analysis between statements so that openmp barriers
> are placed only where needed to satisfy dependences. Currently there
> is a barrier at the end of every workshared OMP_FOR loop and at the
> end of every OMP_SINGLE block.

Sure, this is very much desirable, but should be probably handled after
your initial patch is committed.  After the dependence analysis is in,
also estimating how expensive a singleunit is would be desirable too,
and using that together with the dependency analysis to decide what
should be merged into an OMP_SINGLE vs. what should be in different
OMP_SINGLEs (or whether say OMP_SECTIONS should be created instead,
scheduling the various single units of work as separate sections,
as long as there aren't dependencies in between them).
Say EXEC_OMP_PARALLEL* can be always predicted to be expensive and
so should be parallelized separately, scalar assignments are cheap and
can be always merged, etc.

> - workshare array assignments handled with builtin memcpy, memset (see
> http://gcc.gnu.org/ml/gcc/2008-04/msg00232.html)

Depends whether it will be profitable or not.

	Jakub

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH/ fortran] PR35423 omp workshare - first patch
  2008-07-30 21:43 ` Jakub Jelinek
@ 2008-08-08  6:12   ` Vasilis Liaskovitis
  0 siblings, 0 replies; 3+ messages in thread
From: Vasilis Liaskovitis @ 2008-08-08  6:12 UTC (permalink / raw)
  To: Jakub Jelinek, fortran, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 6077 bytes --]

Hi,

On Wed, Jul 30, 2008 at 12:15 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> Hi!
>
> On Sat, Jun 28, 2008 at 12:52:34PM -0500, Vasilis Liaskovitis wrote:
>> attached is a first patch implementing parallelization for some
>> statements within OMP WORKSHARE constructs.
>
> Sorry for the delay, your mail was accidentally caught by SPAM filtering,
> so I only discovered it much later when looking why some mails got lost.
>
> First of all, do you have a Copyright assignment on file?  That's the
> precondition of getting the code accepted.

thanks for reviewing. I don't have a copyright assignment - I sent a
request for forms to assign@gnu.org just now.

> The patch generally looks good, but there are a couple of issues:
>
> 1) I don't see why ws_data.prev_singleunit can't be a local variable in
>   gfc_trans_omp_workshare, no need to make it a field in the global var

you are obviously right, fixed.

> 2) ws_data.workshare_clauses is problematic; the gfc_trans_omp_workshare
>   argument to gfc_trans_omp_workshare will only contain nowait or nothing,
>   so you should just pass that info around as a bool in ws_data, and
>   let both OMP_FOR and OMP_SINGLE create their own clauses tree chain
>   - it shouldn't be shared anyway and for OMP_SINGLE most of the
>   OMP_FOR-ish clauses shouldn't appear.  BTW, no need to add collapse(1)
>   clause, that's the default if no collapse clause is present.

I agree that the nowait boolean value is the only information needed
to be passed around in ws_data. However, since I create OMP_FOR
clauses in different files (e.g. gfc_scalarized_loop_end in
trans-array.c), I 'd need more info to construct the actual tree
clauses there (for example, arguments to gfc_trans_omp_clauses are not
present in the aforementioned function).

The attached patch creates two tree clauses in trans-openmp.c, one for
OMP_FOR clauses and one for OMP_SINGLE clauses i.e. single and for
clauses now do not share clauses. ws_data only passes around the
OMP_FOR clause, since SINGLE clauses are always created locally in
gfc_trans_omp_workshare . The tree chains of workshare FOR and SINGLE
clauses remain the same throughout the handling of the workshare
construct (with dependence analysis we 'd need to pick between a
nowait and wait flavour for both types of clauses as we walk the
gfc_codes). If you still think that more flexibility building tree
chains is going to be needed, let me know. it may require some other
changes.

collapse(1) is no longer added to ompfor clauses. chunk_size and
ordered are also removed, since the clauses argument is guaranteed to
not contain them.

> 3) ws_data as a global variable is problematic with nesting.  Say
> !$omp workshare
>  a(:) = b(:)
>  !$omp parallel
>  c(:) = d(:)
>  !$omp workshare
>  e(:) = f(:)
>  !$omp end workshare
>  !$omp end parallel
>  g(:) = h(:)
> !$omp end workshare
>   This could be solved e.g. by copying ws_data into a temporary variable,
>   clearing it and restoring afterwards around gfc_trans_omp_directive
>   for EXEC_OMP_PARALLEL*.

I am not 100% clear on what the scope of WORKSHARE should be with
regards to a nested PARALLEL region. In my understanding of omp spec
3.0 (section 2.5.4), given the following code

!$OMP PARALLEL
!$OMP WORKSHARE
!$OMP PARALLEL
            SHR = SHR + 1
            AA(:) = BB(:)
!$OMP END PARALLEL
!$OMP END WORKSHARE
!$OMP END PARALLEL

- the inner PARALLEL region should be treated as a single unit of work
by the thread team bound to the WORKSHARE construct (this is the outer
parallel team).
- Workshare semantics do not apply to the statements of the inner
parallel region. So,  the scalar&array assignment are executed by all
threads of the inner parallel team and the array assignment should not
be workshared.

If these are correct assumptions, the new patch should be ok: as you
proposed, the ws_data structure is copied out, reset to 0, and
restored around gfc_trans_omp_directive for EXEC_OMP_PARALLEL*. (these
are done with memset, memcpy though the data structure is rather
small. If desired I can change it to per-field reset/copy)

> 4) EXEC_OMP_BARRIER must not appear in !$omp workshare, so you shouldn't
>   handle it there

done.

> 5) /* barriers and critical regions are executed by each thread */
>   ws_data.curr_singleunit = false;
>   for EXEC_OMP_CRITICAL is wrong, the critical region is a single unit
>   of work and as such executed just by one of the threads.  But
>   you want to clear or save/clear/restore some ws_data fields
>   around gfc_trans_omp_directive for EXEC_OMP_CRITICAL too
>   - the whole critical is single unit of work, so OMP_FOR shouldn't
>   be created in it.

Similarly, ws_data is now copied, reset and restored around
gfc_trans_omp_directive for OMP_CRITICAL. This ensures that e.g. an
OMP_FOR is not created for an array assignment inside OMP_CRITICAL .

We can still get OMP_FOR for a parallel workshared team within OMP_CRITICAL :

!$OMP PARALLEL
!$OMP WORKSHARE
!$OMP CRITICAL
!$OMP PARALLEL
!$OMP WORKSHARE
            DD(:) = BB(:)
!$OMP END WORKSHARE
!$OMP END PARALLEL
!$OMP END CRITICAL
!$OMP END WORKSHARE
!$OMP END PARALLEL

but I think this is the intended behaviour here.

If the above sounds right, the handling of OMP_PARALLEL* and
OMP_CRITICAL is identical. I have merged their handling in the patch.

> 6) you only handle EXEC_OMP_PARALLEL in gfc_trans_omp_workshare,
>   but you should handle EXEC_OMP_PARALLEL_{DO,SECTIONS,WORKSHARE}
>   the same way

done.

> 7) please watch formatting
>   - space between if and (
>   - < 80 column lines
>   - {} for if are idented 2 more spaces from if, the body another
>     2
>   - comments should be sentences, starting with capital letter
>     and ending with ., followed by 2 spaces, not just one
>   - single statement then or else blocks for if don't need {}s around it

sorry for these,  fixed most but there may still be more

> 8) now that trunk has been tuplified, you'll need to change slightly
>   the creation of OMP_FOR (GIMPLE_MODIFY_STMT is gone)

done, now uses MODIFY_EXPR


thanks,

- Vasilis

[-- Attachment #2: pr35423-b.diff --]
[-- Type: application/octet-stream, Size: 12449 bytes --]

Index: gcc/fortran/trans-array.c
===================================================================
--- gcc/fortran/trans-array.c	(revision 138760)
+++ gcc/fortran/trans-array.c	(working copy)
@@ -2634,41 +2634,88 @@
   tree tmp;
   tree loopbody;
   tree exit_label;
+  tree stmt;
+  tree init;
+  tree incr;
 
-  loopbody = gfc_finish_block (pbody);
+  if (ws_data.workshareflag && ws_data.scalarizer_workshare 
+      && (n == loop->dimen-1))
+    {
+      /* We create an OMP_FOR construct for the outermost scalarized loop.  */
+      init = make_tree_vec (1);
+      cond = make_tree_vec (1);
+      incr = make_tree_vec (1);
+      /* Cycle statement is implemented with a goto.  Exit statement must not 
+         be present for this loop.  */
+      exit_label = gfc_build_label_decl (NULL_TREE);
+      TREE_USED (exit_label) = 1;
 
-  /* Initialize the loopvar.  */
-  gfc_add_modify (&loop->code[n], loop->loopvar[n], loop->from[n]);
+      /* Label for cycle statements (if needed).  */
+      tmp = build1_v (LABEL_EXPR, exit_label);
+      gfc_add_expr_to_block (pbody, tmp);
 
-  exit_label = gfc_build_label_decl (NULL_TREE);
+      stmt = make_node (OMP_FOR);
 
-  /* Generate the loop body.  */
-  gfc_init_block (&block);
+      TREE_TYPE (stmt) = void_type_node;
+      OMP_FOR_BODY (stmt) = loopbody = gfc_finish_block (pbody);
 
-  /* The exit condition.  */
-  cond = fold_build2 (GT_EXPR, boolean_type_node,
-		      loop->loopvar[n], loop->to[n]);
-  tmp = build1_v (GOTO_EXPR, exit_label);
-  TREE_USED (exit_label) = 1;
-  tmp = build3_v (COND_EXPR, cond, tmp, build_empty_stmt ());
-  gfc_add_expr_to_block (&block, tmp);
+      OMP_FOR_CLAUSES (stmt) = ws_data.for_clauses;
+      /* Initialize the loopvar.  */
+      TREE_VEC_ELT (init, 0) = build2_v (MODIFY_EXPR, loop->loopvar[n], 
+                                         loop->from[n]);
+      OMP_FOR_INIT (stmt) = init; 
+      /* The exit condition.  */
+      TREE_VEC_ELT (cond, 0) = build2 (LE_EXPR, boolean_type_node, 
+                                       loop->loopvar[n], loop->to[n]);
+      OMP_FOR_COND (stmt) = cond;
+      /* Increment the loopvar.  */
+      tmp = build2 (PLUS_EXPR, gfc_array_index_type,
+	  loop->loopvar[n], gfc_index_one_node);
+      TREE_VEC_ELT (incr, 0) = fold_build2 (MODIFY_EXPR,
+	  void_type_node, loop->loopvar[n], tmp);
+      OMP_FOR_INCR (stmt) = incr;
 
-  /* The main body.  */
-  gfc_add_expr_to_block (&block, loopbody);
+      ws_data.curr_singleunit = false;
+      gfc_add_expr_to_block (&loop->code[n], stmt);
+    }
+  else
+    {
+      loopbody = gfc_finish_block (pbody);
 
-  /* Increment the loopvar.  */
-  tmp = fold_build2 (PLUS_EXPR, gfc_array_index_type,
-		     loop->loopvar[n], gfc_index_one_node);
-  gfc_add_modify (&block, loop->loopvar[n], tmp);
+      /* Initialize the loopvar.  */
+      gfc_add_modify (&loop->code[n], loop->loopvar[n], loop->from[n]);
 
-  /* Build the loop.  */
-  tmp = gfc_finish_block (&block);
-  tmp = build1_v (LOOP_EXPR, tmp);
-  gfc_add_expr_to_block (&loop->code[n], tmp);
+      exit_label = gfc_build_label_decl (NULL_TREE);
 
-  /* Add the exit label.  */
-  tmp = build1_v (LABEL_EXPR, exit_label);
-  gfc_add_expr_to_block (&loop->code[n], tmp);
+      /* Generate the loop body.  */
+      gfc_init_block (&block);
+
+      /* The exit condition.  */
+      cond = fold_build2 (GT_EXPR, boolean_type_node,
+		         loop->loopvar[n], loop->to[n]);
+      tmp = build1_v (GOTO_EXPR, exit_label);
+      TREE_USED (exit_label) = 1;
+      tmp = build3_v (COND_EXPR, cond, tmp, build_empty_stmt ());
+      gfc_add_expr_to_block (&block, tmp);
+
+      /* The main body.  */
+      gfc_add_expr_to_block (&block, loopbody);
+
+      /* Increment the loopvar.  */
+      tmp = fold_build2 (PLUS_EXPR, gfc_array_index_type,
+		         loop->loopvar[n], gfc_index_one_node);
+      gfc_add_modify (&block, loop->loopvar[n], tmp);
+
+      /* Build the loop.  */
+      tmp = gfc_finish_block (&block);
+      tmp = build1_v (LOOP_EXPR, tmp);
+      gfc_add_expr_to_block (&loop->code[n], tmp);
+
+      /* Add the exit label.  */
+      tmp = build1_v (LABEL_EXPR, exit_label);
+      gfc_add_expr_to_block (&loop->code[n], tmp);
+    }
+  
 }
 
 
Index: gcc/fortran/trans-expr.c
===================================================================
--- gcc/fortran/trans-expr.c	(revision 138760)
+++ gcc/fortran/trans-expr.c	(working copy)
@@ -4423,6 +4423,10 @@
   rss = NULL;
   if (lss != gfc_ss_terminator)
     {
+
+      /* Allow the scalarizer to workshare array assignments.  */
+      if (ws_data.workshareflag) ws_data.scalarizer_workshare = true;
+
       /* The assignment needs scalarization.  */
       lss_section = lss;
 
Index: gcc/fortran/trans-openmp.c
===================================================================
--- gcc/fortran/trans-openmp.c	(revision 138760)
+++ gcc/fortran/trans-openmp.c	(working copy)
@@ -35,6 +35,7 @@
 #include "trans-const.h"
 #include "arith.h"
 
+omp_workshare_data ws_data;
 
 /* True if OpenMP should privatize what this DECL points to rather
    than the DECL itself.  */
@@ -1543,8 +1544,194 @@
 static tree
 gfc_trans_omp_workshare (gfc_code *code, gfc_omp_clauses *clauses)
 {
-  /* XXX */
-  return gfc_trans_omp_single (code, clauses);
+  tree res, tmp, omp_clauses = NULL_TREE, stmt;
+  tree ompfor_clauses_wait, ompsingle_clauses_wait, ompsingle_clauses;
+  stmtblock_t block, *pblock = NULL;
+  stmtblock_t singleblock; 
+  gfc_omp_clauses for_clauses, single_clauses;
+  omp_workshare_data temp_ws_data;
+  bool singleblock_inprogress = false;
+  bool prev_singleunit; /* True if previous gfc_code in workshare construct 
+                           is not workshared.  */
+
+  /* Build tree node for default omp clauses.  */
+  if (clauses->sched_kind == OMP_SCHED_NONE)
+    clauses->sched_kind = OMP_SCHED_STATIC;
+  omp_clauses = gfc_trans_omp_clauses (pblock, clauses, code->loc);
+  
+  /* Build tree node for OMP_FOR clauses.  */
+  memset (&for_clauses, 0, sizeof (for_clauses));
+  for_clauses.sched_kind = OMP_SCHED_STATIC;
+  /* No dependence analysis yet, we wait after each wokshared gfc_code.  */
+  for_clauses.nowait = false;
+  ompfor_clauses_wait = gfc_trans_omp_clauses (pblock, &for_clauses, 
+                                               code->loc);
+
+  /* Build tree node for OMP_SINGLE clauses.  */
+  memset (&single_clauses, 0, sizeof (single_clauses));
+  /* No dependence analysis yet, we wait after each single block.  */
+  single_clauses.nowait = false;
+  ompsingle_clauses_wait = gfc_trans_omp_clauses (pblock, &single_clauses, 
+                                                  code->loc);
+
+  code = code->block->next; 
+
+  pushlevel (0);
+
+  if (!code)
+    return build_empty_stmt ();
+
+  gfc_start_block (&block);
+  pblock = &block;
+
+  ws_data.workshareflag = true;
+  prev_singleunit = false;
+
+
+  /* Translate statements one by one to trees until we reach 
+     the end of the workshare construct.  Adjacent gfc_codes that 
+     are a single unit of work are clustered and encapsulated in a 
+     single OMP_SINGLE construct.  */
+  for (; code; code = code->next)
+    {
+      if (code->here != 0)
+	{
+	  res = gfc_trans_label_here (code);
+	  gfc_add_expr_to_block (pblock, res);
+	}
+
+      /* No dependence analysis, use for clauses with wait. 
+         If this is the last gfc_code, use default omp_clauses.  */
+      if (code->next == NULL) 
+        ws_data.for_clauses = omp_clauses;
+      else
+        ws_data.for_clauses = ompfor_clauses_wait;
+      /* No dependence analysis, use single clauses with wait. 
+         If this is the last gfc_code, use default omp_clauses.  */
+      if (code->next == NULL) 
+        ompsingle_clauses = omp_clauses;
+      else
+        ompsingle_clauses = ompsingle_clauses_wait;
+
+      /* By default, every gfc_code is a single unit of work.  */
+      ws_data.curr_singleunit = true;
+      ws_data.scalarizer_workshare = false;
+
+      switch (code->op)
+	{
+	case EXEC_NOP:
+	  res = NULL_TREE;
+	  break;
+
+	case EXEC_ASSIGN:
+	  res = gfc_trans_assign (code);
+	  break;
+
+	case EXEC_POINTER_ASSIGN:
+	  res = gfc_trans_pointer_assign (code);
+	  break;
+
+	case EXEC_INIT_ASSIGN:
+	  res = gfc_trans_init_assign (code);
+	  break;
+
+	case EXEC_FORALL:
+	  res = gfc_trans_forall (code);
+	  break;
+
+	case EXEC_WHERE:
+	  res = gfc_trans_where (code);
+	  break;
+
+	case EXEC_OMP_ATOMIC:
+	  res = gfc_trans_omp_directive (code);
+          break;
+
+	case EXEC_OMP_PARALLEL:
+	case EXEC_OMP_PARALLEL_DO:
+	case EXEC_OMP_PARALLEL_SECTIONS:
+	case EXEC_OMP_PARALLEL_WORKSHARE:
+	case EXEC_OMP_CRITICAL:
+          memcpy (&temp_ws_data, &ws_data, sizeof(ws_data));
+          memset (&ws_data, 0, sizeof (ws_data));
+          res = gfc_trans_omp_directive (code);
+          memcpy (&ws_data, &temp_ws_data, sizeof(ws_data));
+          break;
+	
+	default:
+	  internal_error ("gfc_trans_omp_workshare(): Bad statement code");
+	}
+
+      gfc_set_backend_locus (&code->loc);
+      
+      if (res != NULL_TREE && ! IS_EMPTY_STMT (res))
+      {
+	if (TREE_CODE (res) == STATEMENT_LIST)
+	  tree_annotate_all_with_location (&res, input_location);
+	else
+	  SET_EXPR_LOCATION (res, input_location);
+
+	if (prev_singleunit)
+	{
+
+	  if (ws_data.curr_singleunit)
+	  {
+	    /* Add current gfc_code to single block.  */
+	    gfc_add_expr_to_block (&singleblock, res);
+	  }
+	  else 
+	  {
+	    /* Finish single block and add it to pblock.  */
+	    tmp = gfc_finish_block (&singleblock);
+	    tmp = build2_v (OMP_SINGLE, tmp, ompsingle_clauses);
+	    gfc_add_expr_to_block (pblock, tmp);
+	    /* Add current gfc_code to pblock.  */
+	    gfc_add_expr_to_block (pblock, res);
+	    singleblock_inprogress = false;
+	  }
+	}
+	else 
+	{
+	  if (ws_data.curr_singleunit)
+	  {
+	    /* Start single block.  */
+	    gfc_init_block (&singleblock);
+	    gfc_add_expr_to_block (&singleblock, res);
+	    singleblock_inprogress = true;
+	  }
+	  else
+	    /* Add the new statement to the block.  */
+	    gfc_add_expr_to_block (pblock, res);
+	}
+	prev_singleunit = ws_data.curr_singleunit;
+      }
+    }
+  /* Finish remaining SINGLE block, if we were in the middle of one.  */
+  if (singleblock_inprogress)
+    {
+      /* Finish single block and add it to pblock.  */
+      tmp = gfc_finish_block (&singleblock);
+      tmp = build2_v (OMP_SINGLE, tmp, omp_clauses);
+      gfc_add_expr_to_block (pblock, tmp);
+      singleblock_inprogress = false;
+    }
+
+  stmt = gfc_finish_block (pblock);
+  if (TREE_CODE (stmt) != BIND_EXPR)
+    {
+      if (!IS_EMPTY_STMT (stmt))
+	{
+	  tree bindblock = poplevel (1, 0, 0);
+	  stmt = build3_v (BIND_EXPR, NULL, stmt, bindblock);
+	}
+      else
+	poplevel (0, 0, 0);
+    }
+  else
+    poplevel (0, 0, 0);
+
+  ws_data.workshareflag = false;
+  return stmt;
 }
 
 tree
Index: gcc/fortran/trans-stmt.c
===================================================================
--- gcc/fortran/trans-stmt.c	(revision 138760)
+++ gcc/fortran/trans-stmt.c	(working copy)
@@ -3585,6 +3585,8 @@
   gfc_ss *edss = 0;
   gfc_ss *esss = 0;
 
+  /* Allow the scalarizer to workshare simple where loops.  */
+  if (ws_data.workshareflag) ws_data.scalarizer_workshare = true;
   cond = cblock->expr;
   tdst = cblock->next->expr;
   tsrc = cblock->next->expr2;
Index: gcc/fortran/trans.c
===================================================================
--- gcc/fortran/trans.c	(revision 138760)
+++ gcc/fortran/trans.c	(working copy)
@@ -1192,6 +1192,7 @@
 void
 gfc_generate_code (gfc_namespace * ns)
 {
+  memset (&ws_data, 0, sizeof (ws_data));
   if (ns->is_block_data)
     {
       gfc_generate_block_data (ns);
Index: gcc/fortran/trans.h
===================================================================
--- gcc/fortran/trans.h	(revision 138760)
+++ gcc/fortran/trans.h	(working copy)
@@ -743,5 +743,14 @@
 extern const char gfc_msg_fault[];
 extern const char gfc_msg_wrong_return[];
 
+typedef struct omp_workshare_data {
+  bool workshareflag; /* True if in a workshare construct.  */
+  bool curr_singleunit; /* True if current gfc_code in workshare construct 
+                           is not workshared.  */
+  bool scalarizer_workshare; /* True if scalarizer should workshare loops.  */
+  tree for_clauses;  
+}
+omp_workshare_data;
 
+extern omp_workshare_data ws_data;
 #endif /* GFC_TRANS_H */

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2008-08-07 23:58 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-06-28 18:00 [PATCH/ fortran] PR35423 omp workshare - first patch Vasilis Liaskovitis
2008-07-30 21:43 ` Jakub Jelinek
2008-08-08  6:12   ` Vasilis Liaskovitis

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).