public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH, gomp4] Propagate independent clause for OpenACC kernels pass
@ 2015-07-14  5:46 Chung-Lin Tang
  2015-07-14  7:00 ` Jakub Jelinek
  2015-07-14  9:36 ` [PATCH, gomp4] Propagate independent clause for OpenACC kernels pass Thomas Schwinge
  0 siblings, 2 replies; 7+ messages in thread
From: Chung-Lin Tang @ 2015-07-14  5:46 UTC (permalink / raw)
  To: gcc-patches, Tom de Vries; +Cc: Thomas Schwinge, Jakub Jelinek

[-- Attachment #1: Type: text/plain, Size: 1174 bytes --]

Hi Tom,
this patch provides a 'bool independent' field in struct loop, which
will be switched on by an "independent" clause in a #pragma acc loop directive.
I assume you'll be wiring it to the kernels parloops pass in a followup patch.

Note: there are already a few other similar fields in struct loop, namely
'safelen' and 'can_be_parallel', used by OMP simd safelen and GRAPHITE respectively.
The intention and/or setting of these fields are all a bit different, so I've
decided to add a new bool for OpenACC.

Tested and committed to gomp-4_0-branch.

Chung-Lin

2015-07-14  Chung-Lin Tang  <cltang@codesourcery.com>

        * cfgloop.h (struct loop): Add 'bool marked_independent' field.
        * gimplify.c (gimplify_scan_omp_clauses): Keep OMP_CLAUSE_INDEPENDENT.
        * omp-low.c (struct omp_region): Add 'int kind' and
        'bool independent' fields.
        (expand_omp_for): Set 'marked_independent' field for loop
        corresponding to region.
        (find_omp_for_region_data): New function.
        (find_omp_target_region_data): Set kind field.
        (build_omp_regions_1): Call find_omp_for_region_data() for
        GIMPLE_OMP_FOR statements.

[-- Attachment #2: acc-loop-independent.patch --]
[-- Type: text/x-patch, Size: 4392 bytes --]

Index: cfgloop.h
===================================================================
--- cfgloop.h	(revision 225758)
+++ cfgloop.h	(working copy)
@@ -194,6 +194,10 @@ struct GTY ((chain_next ("%h.next"))) loop {
   /* True if the loop is part of an oacc kernels region.  */
   bool in_oacc_kernels_region;
 
+  /* True if loop is tagged as having independent iterations by user,
+     e.g. the OpenACC independent clause.  */
+  bool marked_independent;
+
   /* For SIMD loops, this is a unique identifier of the loop, referenced
      by IFN_GOMP_SIMD_VF, IFN_GOMP_SIMD_LANE and IFN_GOMP_SIMD_LAST_LANE
      builtins.  */
Index: gimplify.c
===================================================================
--- gimplify.c	(revision 225758)
+++ gimplify.c	(working copy)
@@ -6602,7 +6602,6 @@ gimplify_scan_omp_clauses (tree *list_p, gimple_se
 	  break;
 
 	case OMP_CLAUSE_DEVICE_RESIDENT:
-	case OMP_CLAUSE_INDEPENDENT:
 	  remove = true;
 	  break;
 
@@ -6612,6 +6611,7 @@ gimplify_scan_omp_clauses (tree *list_p, gimple_se
 	case OMP_CLAUSE_COLLAPSE:
 	case OMP_CLAUSE_AUTO:
 	case OMP_CLAUSE_SEQ:
+	case OMP_CLAUSE_INDEPENDENT:
 	case OMP_CLAUSE_MERGEABLE:
 	case OMP_CLAUSE_PROC_BIND:
 	case OMP_CLAUSE_SAFELEN:
Index: omp-low.c
===================================================================
--- omp-low.c	(revision 225758)
+++ omp-low.c	(working copy)
@@ -136,8 +136,16 @@ struct omp_region
   /* True if this is nested inside an OpenACC kernels construct.  */
   bool inside_kernels_p;
 
+  /* Records a generic kind field.  */
+  int kind;
+
   /* For an OpenACC loop, the level of parallelism requested.  */
   int gwv_this;
+
+  /* For an OpenACC loop directive, true if has the 'independent' clause.  */
+  bool independent;
+
+  tree broadcast_array;
 };
 
 /* Context structure.  Used to store information about each parallel
@@ -8273,8 +8281,15 @@ expand_omp_for (struct omp_region *region, gimple
     loops_state_set (LOOPS_NEED_FIXUP);
 
   if (region->inside_kernels_p)
-    expand_omp_for_generic (region, &fd, BUILT_IN_NONE, BUILT_IN_NONE,
-			    inner_stmt);
+    {
+      expand_omp_for_generic (region, &fd, BUILT_IN_NONE, BUILT_IN_NONE,
+			      inner_stmt);
+      if (region->independent && region->cont->loop_father)
+	{
+	  struct loop *loop = region->cont->loop_father; 
+	  loop->marked_independent = true;
+	}
+    }
   else if (gimple_omp_for_kind (fd.for_stmt) & GF_OMP_FOR_SIMD)
     expand_omp_simd (region, &fd);
   else if (gimple_omp_for_kind (fd.for_stmt) == GF_OMP_FOR_KIND_CILKFOR)
@@ -9943,6 +9958,34 @@ find_omp_for_region_gwv (gimple stmt)
   return tmp;
 }
 
+static void
+find_omp_for_region_data (struct omp_region *region, gomp_for *stmt)
+{
+  region->gwv_this = find_omp_for_region_gwv (stmt);
+  region->kind = gimple_omp_for_kind (stmt);
+
+  if (region->kind == GF_OMP_FOR_KIND_OACC_LOOP)
+    {
+      struct omp_region *target_region = region->outer;
+      while (target_region
+	     && target_region->type != GIMPLE_OMP_TARGET)
+	target_region = target_region->outer;
+      if (!target_region)
+	return;
+
+      tree clauses = gimple_omp_for_clauses (stmt);
+
+      if (target_region->kind == GF_OMP_TARGET_KIND_OACC_PARALLEL
+	  && !find_omp_clause (clauses, OMP_CLAUSE_SEQ))
+	/* In OpenACC parallel constructs, 'independent' is implied on all
+	   loop directives without a 'seq' clause.  */
+	region->independent = true;
+      else if (target_region->kind == GF_OMP_TARGET_KIND_OACC_KERNELS
+	       && find_omp_clause (clauses, OMP_CLAUSE_INDEPENDENT))
+	region->independent = true;
+    }
+}
+
 /* Fill in additional data for a region REGION associated with an
    OMP_TARGET STMT.  */
 
@@ -9960,6 +10003,7 @@ find_omp_target_region_data (struct omp_region *re
     region->gwv_this |= OACC_LOOP_MASK (OACC_worker);
   if (find_omp_clause (clauses, OMP_CLAUSE_VECTOR_LENGTH))
     region->gwv_this |= OACC_LOOP_MASK (OACC_vector);
+  region->kind = gimple_omp_target_kind (stmt);
 }
 
 /* Helper for build_omp_regions.  Scan the dominator tree starting at
@@ -10046,7 +10090,7 @@ build_omp_regions_1 (basic_block bb, struct omp_re
 		}
 	    }
 	  else if (code == GIMPLE_OMP_FOR)
-	    region->gwv_this = find_omp_for_region_gwv (stmt);
+	    find_omp_for_region_data (region, as_a <gomp_for *> (stmt));
 	  /* ..., this directive becomes the parent for a new region.  */
 	  if (region)
 	    parent = region;

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH, gomp4] Propagate independent clause for OpenACC kernels pass
  2015-07-14  5:46 [PATCH, gomp4] Propagate independent clause for OpenACC kernels pass Chung-Lin Tang
@ 2015-07-14  7:00 ` Jakub Jelinek
  2015-07-14  9:35   ` Chung-Lin Tang
  2015-07-14  9:36 ` [PATCH, gomp4] Propagate independent clause for OpenACC kernels pass Thomas Schwinge
  1 sibling, 1 reply; 7+ messages in thread
From: Jakub Jelinek @ 2015-07-14  7:00 UTC (permalink / raw)
  To: Chung-Lin Tang; +Cc: gcc-patches, Tom de Vries, Thomas Schwinge

On Tue, Jul 14, 2015 at 01:46:04PM +0800, Chung-Lin Tang wrote:
> this patch provides a 'bool independent' field in struct loop, which
> will be switched on by an "independent" clause in a #pragma acc loop directive.
> I assume you'll be wiring it to the kernels parloops pass in a followup patch.
> 
> Note: there are already a few other similar fields in struct loop, namely
> 'safelen' and 'can_be_parallel', used by OMP simd safelen and GRAPHITE respectively.
> The intention and/or setting of these fields are all a bit different, so I've
> decided to add a new bool for OpenACC.

How is it different though?  Can you cite exact definition of the
independent clause vs. safelen (set to INT_MAX)?
The OpenMP definition is:
"A SIMD loop has logical iterations numbered 0,1,...,N-1 where N is the
number of loop iterations, and the logical numbering denotes the sequence in which the iterations would
be executed if the associated loop(s) were executed with no SIMD instructions. If the safelen
clause is used then no two iterations executed concurrently with SIMD instructions can have a
greater distance in the logical iteration space than its value."
...
"Lexical forward dependencies in the iterations of the
original loop must be preserved within each SIMD chunk."

So e.g. safelen >= 32 means for PTX you can safely implement it by
running up to 32 consecutive iterations by all threads in the warp
(assuming code that for some reason must be run by a single thread
(e.g. calls to functions that are marked so that they expect to be run
by the first thread in a warp initially) is run sequentially by increasing
iterator), but it doesn't mean the iterations have no dependencies in between
them whatsoever (see the above note about lexical forward dependencies),
so you can't parallelize it by assigning different iterations to different
threads outside of warp (or pthread_create created threads).
So if OpenACC independent means there are no dependencies in between
iterations, the OpenMP counterpart here is #pragma omp for simd schedule (auto)
or #pragma omp distribute parallel for simd schedule (auto).

	Jakub

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH, gomp4] Propagate independent clause for OpenACC kernels pass
  2015-07-14  7:00 ` Jakub Jelinek
@ 2015-07-14  9:35   ` Chung-Lin Tang
  2015-07-14  9:49     ` Jakub Jelinek
  0 siblings, 1 reply; 7+ messages in thread
From: Chung-Lin Tang @ 2015-07-14  9:35 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: gcc-patches, Tom de Vries, Thomas Schwinge

On 15/7/14 3:00 PM, Jakub Jelinek wrote:
> On Tue, Jul 14, 2015 at 01:46:04PM +0800, Chung-Lin Tang wrote:
>> this patch provides a 'bool independent' field in struct loop, which
>> will be switched on by an "independent" clause in a #pragma acc loop directive.
>> I assume you'll be wiring it to the kernels parloops pass in a followup patch.
>>
>> Note: there are already a few other similar fields in struct loop, namely
>> 'safelen' and 'can_be_parallel', used by OMP simd safelen and GRAPHITE respectively.
>> The intention and/or setting of these fields are all a bit different, so I've
>> decided to add a new bool for OpenACC.
> 
> How is it different though?  Can you cite exact definition of the
> independent clause vs. safelen (set to INT_MAX)?
> The OpenMP definition is:
> "A SIMD loop has logical iterations numbered 0,1,...,N-1 where N is the
> number of loop iterations, and the logical numbering denotes the sequence in which the iterations would
> be executed if the associated loop(s) were executed with no SIMD instructions. If the safelen
> clause is used then no two iterations executed concurrently with SIMD instructions can have a
> greater distance in the logical iteration space than its value."
> ...
> "Lexical forward dependencies in the iterations of the
> original loop must be preserved within each SIMD chunk."

The wording of OpenACC independent is more simple:
"... the independent clause tells the implementation that the iterations of this loop
are data-independent with respect to each other." -- OpenACC spec 2.7.9

I would say this implies even more relaxed conditions than OpenMP simd safelen,
essentially saying that the compiler doesn't even need dependence analysis; just
assume independence of iterations.

> So e.g. safelen >= 32 means for PTX you can safely implement it by
> running up to 32 consecutive iterations by all threads in the warp
> (assuming code that for some reason must be run by a single thread
> (e.g. calls to functions that are marked so that they expect to be run
> by the first thread in a warp initially) is run sequentially by increasing
> iterator), but it doesn't mean the iterations have no dependencies in between
> them whatsoever (see the above note about lexical forward dependencies),
> so you can't parallelize it by assigning different iterations to different
> threads outside of warp (or pthread_create created threads).

> So if OpenACC independent means there are no dependencies in between
> iterations, the OpenMP counterpart here is #pragma omp for simd schedule (auto)
> or #pragma omp distribute parallel for simd schedule (auto).

schedule(auto) appears to correspond to the OpenACC 'auto' clause, or
what is implied in a kernels compute construct, but I'm not sure it implies
no dependencies between iterations?

Putting aside the semantic issues, as of currently safelen>0 turns on a certain amount of
vectorization code that we are not currently using (and not likely at all for nvptx).
Right now, we're just trying to pass the new flag to a kernels tree-parloops based pass.

Maybe this can all be reconciled later in a more precise way, e.g. have flags that correspond
specifically to phases of internal compiler passes (and selected by needs of the accel target),
instead of ones that are "sort of" associated with high-level language features.

Chung-Lin

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH, gomp4] Propagate independent clause for OpenACC kernels pass
  2015-07-14  5:46 [PATCH, gomp4] Propagate independent clause for OpenACC kernels pass Chung-Lin Tang
  2015-07-14  7:00 ` Jakub Jelinek
@ 2015-07-14  9:36 ` Thomas Schwinge
  2015-07-14 11:05   ` Thomas Schwinge
  1 sibling, 1 reply; 7+ messages in thread
From: Thomas Schwinge @ 2015-07-14  9:36 UTC (permalink / raw)
  To: Chung-Lin Tang; +Cc: Jakub Jelinek, gcc-patches, Tom de Vries

[-- Attachment #1: Type: text/plain, Size: 2620 bytes --]

Hi Chung-Lin!

On Tue, 14 Jul 2015 13:46:04 +0800, Chung-Lin Tang <cltang@codesourcery.com> wrote:
> this patch provides a 'bool independent' field in struct loop, which
> will be switched on by an "independent" clause in a #pragma acc loop directive.

Thanks!


This patch has been developed in context of OpenACC kernels constructs,
but, is there anything still to be done regarding OpenACC parallel
constructs?  That is, are we currently *using* the "independent yes/no"
information appropriately for these?


>         * omp-low.c (struct omp_region): Add 'int kind' and
>         'bool independent' fields.

> --- omp-low.c	(revision 225758)
> +++ omp-low.c	(working copy)
> @@ -136,8 +136,16 @@ struct omp_region
>    /* True if this is nested inside an OpenACC kernels construct.  */
>    bool inside_kernels_p;
>  
> +  /* Records a generic kind field.  */
> +  int kind;
> +
>    /* For an OpenACC loop, the level of parallelism requested.  */
>    int gwv_this;
> +
> +  /* For an OpenACC loop directive, true if has the 'independent' clause.  */
> +  bool independent;
> +
> +  tree broadcast_array;
>  };

I'm assuming just a patch conflict resolution hiccup; committed in
r225767:

commit 4a10c97b741bbc3d7278779337d5c0bfea8297c2
Author: tschwinge <tschwinge@138bc75d-0d04-0410-961f-82ee72b054a4>
Date:   Tue Jul 14 09:30:02 2015 +0000

    Code cleanup
    
    	gcc/
    	* omp-low.c (struct omp_region): Remove broadcast_array member.
    
    ... that mistakenly reappeared in r225759, after having been removed in
    r225647.
    
    git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/gomp-4_0-branch@225767 138bc75d-0d04-0410-961f-82ee72b054a4
---
 gcc/ChangeLog.gomp | 4 ++++
 gcc/omp-low.c      | 2 --
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git gcc/ChangeLog.gomp gcc/ChangeLog.gomp
index dfe0c95..d7459d0 100644
--- gcc/ChangeLog.gomp
+++ gcc/ChangeLog.gomp
@@ -1,3 +1,7 @@
+2015-07-14  Thomas Schwinge  <thomas@codesourcery.com>
+
+	* omp-low.c (struct omp_region): Remove broadcast_array member.
+
 2015-07-14  Tom de Vries  <tom@codesourcery.com>
 
 	* tree-parloops.c (parallelize_loops): Use marked_independent flag in
diff --git gcc/omp-low.c gcc/omp-low.c
index fc6c7a9..0419dcd 100644
--- gcc/omp-low.c
+++ gcc/omp-low.c
@@ -144,8 +144,6 @@ struct omp_region
 
   /* For an OpenACC loop directive, true if has the 'independent' clause.  */
   bool independent;
-
-  tree broadcast_array;
 };
 
 /* Context structure.  Used to store information about each parallel


Grüße,
 Thomas

[-- Attachment #2: Type: application/pgp-signature, Size: 472 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH, gomp4] Propagate independent clause for OpenACC kernels pass
  2015-07-14  9:35   ` Chung-Lin Tang
@ 2015-07-14  9:49     ` Jakub Jelinek
  2015-07-22 17:06       ` [gomp4, committed] Set safelen to INT_MAX for oacc independent pragma Tom de Vries
  0 siblings, 1 reply; 7+ messages in thread
From: Jakub Jelinek @ 2015-07-14  9:49 UTC (permalink / raw)
  To: Chung-Lin Tang; +Cc: gcc-patches, Tom de Vries, Thomas Schwinge

On Tue, Jul 14, 2015 at 05:35:28PM +0800, Chung-Lin Tang wrote:
> The wording of OpenACC independent is more simple:
> "... the independent clause tells the implementation that the iterations of this loop
> are data-independent with respect to each other." -- OpenACC spec 2.7.9
> 
> I would say this implies even more relaxed conditions than OpenMP simd safelen,
> essentially saying that the compiler doesn't even need dependence analysis; just
> assume independence of iterations.

safelen is also saying that the compiler doesn't even need dependence
analysis.  It is just that only some transformations of the loop are ok
without dependence analysis, others need to be with dependence analysis.
Classical vectorization optimizations (instead of doing one iteration
at a time you can do up to safelen consecutive iterations together) for the
first statement in the loop, then second statement, etc. are ok without
dependence analysis, but e.g. reversing the loop and running first the last
iteration and so on up to first, or running the iterations in random orders
is not ok.

> > So if OpenACC independent means there are no dependencies in between
> > iterations, the OpenMP counterpart here is #pragma omp for simd schedule (auto)
> > or #pragma omp distribute parallel for simd schedule (auto).
> 
> schedule(auto) appears to correspond to the OpenACC 'auto' clause, or
> what is implied in a kernels compute construct, but I'm not sure it implies
> no dependencies between iterations?

By the schedule(auto) I meant that the user tells the compiler it can
parallelize the loop with whatever schedule it wants.  Other schedules are
quite well defined, if the team has that many threads, which of the thread
gets which iteration, so user could rely on a particular parallelization and
the loop iterations still could not be 100% independent.  With
schedule(auto) you say it is up to the compiler to schedule them, thus they
really have to be all independent.

> Putting aside the semantic issues, as of currently safelen>0 turns on a certain amount of
> vectorization code that we are not currently using (and not likely at all for nvptx).
> Right now, we're just trying to pass the new flag to a kernels tree-parloops based pass.

In any case, when setting your flag you should also set safelen = INT_MAX,
as the OpenACC independent implies that you can vectorize the loop with any
vectorization factor without performing dependency analysis on the loop.
OpenACC is (hopefully) not just about PTX and most other targets will want
to vectorize such loops.

	Jakub

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH, gomp4] Propagate independent clause for OpenACC kernels pass
  2015-07-14  9:36 ` [PATCH, gomp4] Propagate independent clause for OpenACC kernels pass Thomas Schwinge
@ 2015-07-14 11:05   ` Thomas Schwinge
  0 siblings, 0 replies; 7+ messages in thread
From: Thomas Schwinge @ 2015-07-14 11:05 UTC (permalink / raw)
  To: Chung-Lin Tang; +Cc: Jakub Jelinek, gcc-patches, Tom de Vries

[-- Attachment #1: Type: text/plain, Size: 4861 bytes --]

Hi!

On Tue, 14 Jul 2015 11:36:24 +0200, I wrote:
> On Tue, 14 Jul 2015 13:46:04 +0800, Chung-Lin Tang <cltang@codesourcery.com> wrote:
> > this patch provides a 'bool independent' field in struct loop, which
> > will be switched on by an "independent" clause in a #pragma acc loop directive.
> 
> Thanks!
> 
> 
> This patch has been developed in context of OpenACC kernels constructs,
> but, is there anything still to be done regarding OpenACC parallel
> constructs?  That is, are we currently *using* the "independent yes/no"
> information appropriately for these?

Tom mentioned:

| openacc spec:
| ...
| 2.7.9 independent clause
| In a kernels construct, the independent clause tells the implementation 
| that the iterations of this loop are data-independent with respect to 
| each other. This allows the implementation to generate code to execute 
| the iterations in parallel with no synchronization.
| In a parallel construct, the independent clause is implied on all loop 
| directives without a seq clause.
| ...
|
| I think you're sort of asking if the seq clause has been implemented.
|
| openacc spec:
| ...
| 2.7.5 seq clause
| The seq clause specifies that the associated loop or loops are to be 
| executed sequentially by the accelerator. This clause will override any 
| automatic parallelization or vectorization.
| ...

Thanks, and right, I also realized that.  ;-)

Yet, my request is still to properly "document" this.  That is, if the
idea is that gcc/omp-low.c:scan_omp_for makes sure to emit a diagnostic
for the invalid combination of a gang/worker/vector clause together with
a seq clause, then in combination with the code newly added to
gcc/omp-low.c:find_omp_for_region_data to set region->independent = true
inside OpenACC parallel constructs, can't we then
assert(region->independent == true) in expand_omp_for_static_chunk and
expand_omp_for_static_nochunk?

So far it looks to me as if for OpenACC parallel constructs, we only have
a producer of region->independent = true, but no consumer.  Even if the
latter one is "implicit", we should "document" (using an assertion) this
in some way, for clarity.

While looking at that code, are we convinced that the diagnostic
machinery and subsequent handling will do the right thing in such cases:

    [...]
    #pragma acc loop [gang/worker/vector]
      for [...]
    #pragma acc loop seq
        for[...]

To me it looks (but I have not verified) that in such cases, the inner
region's ctx->gwv_this will have been initialized from the outer_ctx, so
to some combination of gang/worker/vector, and will that then be used to
parallelize the inner loop, which shouldn't be done, as I'm understanding
this?

It seems to be as if this gang/worker/vector/seq/independent clause
handling code that is currently distributed over
gcc/omp-low.c:scan_omp_for and gcc/omp-low.c:find_omp_for_region_data
(and, worse, also the front ends; see below) should be merged into one
place.

gcc/fortran/openmp.c:resolve_oacc_loop_blocks emits a diagnostic for seq
with independent; gcc/omp-low.c:scan_omp_for doesn't.  Should it?  Then,
why do we need this Fortran-specific checking code?  Should the
additional checking being done there get moved to OMP lowering?

In the C and C++ front ends, there's related checking code for those
clause attached to OpenACC routine constructs; not sure if that checking
should also be handled during OMP lowering, in one place?  I couldn't
find similar code in the Fortran front end (but didn't look very hard).

This is reminiscent of the discussion started in
<http://news.gmane.org/find-root.php?message_id=%3C87mw0zirnq.fsf%40schwinge.name%3E>
and following messages, about using
gcc/omp-low.c:check_omp_nesting_restrictions to do such checking (which
Cesar has not yet followed up on, as far as I know).

A few more points:

Does OMP_CLAUSE_INDEPENDENT need to be handled in
gcc/c-family/c-omp.c:c_oacc_split_loop_clauses?

While looking at that, ;-) again a few more things...  Are others clauses
missing to be handled there: tile, device_type (probably not; has all
been processed before?)?  Why is the firstprivate clause being duplicated
for loop_clauses/non_loop_clauses?  In OpenACC, there is no firstprivate
clause for loop constructs.  Why is the private clause being duplicated
for loop_clauses/non_loop_clauses?  My understanding is that the private
clause in a combined OpenACC parallel loop construct is to be handled as
if it were attached to the loop construct (... which is in contrast to
firstprivate -- yes OpenACC 2.0a is a bit assymetric in that regard).

What about the corresponding Fortran code,
gcc/fortran/trans-openmp.c:gfc_trans_oacc_combined_directive?

Do we have test cases to cover all this?


Grüße,
 Thomas

[-- Attachment #2: Type: application/pgp-signature, Size: 472 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [gomp4, committed] Set safelen to INT_MAX for oacc independent pragma
  2015-07-14  9:49     ` Jakub Jelinek
@ 2015-07-22 17:06       ` Tom de Vries
  0 siblings, 0 replies; 7+ messages in thread
From: Tom de Vries @ 2015-07-22 17:06 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Chung-Lin Tang, gcc-patches, Tom de Vries, Thomas Schwinge

[-- Attachment #1: Type: text/plain, Size: 2968 bytes --]

[ was; Re: [PATCH, gomp4] Propagate independent clause for OpenACC 
kernels pass ]

On 14/07/15 11:48, Jakub Jelinek wrote:
> On Tue, Jul 14, 2015 at 05:35:28PM +0800, Chung-Lin Tang wrote:
>> The wording of OpenACC independent is more simple:
>> "... the independent clause tells the implementation that the iterations of this loop
>> are data-independent with respect to each other." -- OpenACC spec 2.7.9
>>
>> I would say this implies even more relaxed conditions than OpenMP simd safelen,
>> essentially saying that the compiler doesn't even need dependence analysis; just
>> assume independence of iterations.
>
> safelen is also saying that the compiler doesn't even need dependence
> analysis.  It is just that only some transformations of the loop are ok
> without dependence analysis, others need to be with dependence analysis.
> Classical vectorization optimizations (instead of doing one iteration
> at a time you can do up to safelen consecutive iterations together) for the
> first statement in the loop, then second statement, etc. are ok without
> dependence analysis, but e.g. reversing the loop and running first the last
> iteration and so on up to first, or running the iterations in random orders
> is not ok.
>
>>> So if OpenACC independent means there are no dependencies in between
>>> iterations, the OpenMP counterpart here is #pragma omp for simd schedule (auto)
>>> or #pragma omp distribute parallel for simd schedule (auto).
>>
>> schedule(auto) appears to correspond to the OpenACC 'auto' clause, or
>> what is implied in a kernels compute construct, but I'm not sure it implies
>> no dependencies between iterations?
>
> By the schedule(auto) I meant that the user tells the compiler it can
> parallelize the loop with whatever schedule it wants.  Other schedules are
> quite well defined, if the team has that many threads, which of the thread
> gets which iteration, so user could rely on a particular parallelization and
> the loop iterations still could not be 100% independent.  With
> schedule(auto) you say it is up to the compiler to schedule them, thus they
> really have to be all independent.
>
>> Putting aside the semantic issues, as of currently safelen>0 turns on a certain amount of
>> vectorization code that we are not currently using (and not likely at all for nvptx).
>> Right now, we're just trying to pass the new flag to a kernels tree-parloops based pass.
>
> In any case, when setting your flag you should also set safelen = INT_MAX,
> as the OpenACC independent implies that you can vectorize the loop with any
> vectorization factor without performing dependency analysis on the loop.
> OpenACC is (hopefully) not just about PTX and most other targets will want
> to vectorize such loops.
>

This patch sets safelen to INT_MAX for loops marked with the independent 
clause on the openacc loop directive.

Build and reg-tested on x86_64 with nvidia accelerator.

Committed to gomp-4_0-branch.

Thanks,
- Tom


[-- Attachment #2: 0001-Set-safelen-to-INT_MAX-for-oacc-independent-pragma.patch --]
[-- Type: text/x-patch, Size: 650 bytes --]

Set safelen to INT_MAX for oacc independent pragma

2015-07-22  Tom de Vries  <tom@codesourcery.com>

	* omp-low.c (expand_omp_for): Set loop->safelen to INT_MAX if
	marked_independent.
---
 gcc/omp-low.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index 0419dcd..65c6321 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -8286,6 +8286,7 @@ expand_omp_for (struct omp_region *region, gimple inner_stmt)
 	{
 	  struct loop *loop = region->cont->loop_father; 
 	  loop->marked_independent = true;
+	  loop->safelen = INT_MAX;
 	}
     }
   else if (gimple_omp_for_kind (fd.for_stmt) & GF_OMP_FOR_SIMD)
-- 
1.9.1


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2015-07-22 17:05 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-07-14  5:46 [PATCH, gomp4] Propagate independent clause for OpenACC kernels pass Chung-Lin Tang
2015-07-14  7:00 ` Jakub Jelinek
2015-07-14  9:35   ` Chung-Lin Tang
2015-07-14  9:49     ` Jakub Jelinek
2015-07-22 17:06       ` [gomp4, committed] Set safelen to INT_MAX for oacc independent pragma Tom de Vries
2015-07-14  9:36 ` [PATCH, gomp4] Propagate independent clause for OpenACC kernels pass Thomas Schwinge
2015-07-14 11:05   ` Thomas Schwinge

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).