public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [patch, fortran, RFC] Interchange indices for FORALL and DO CONCURRENT if profitable
@ 2017-10-27 22:39 Thomas Koenig
  2017-10-27 22:39 ` Steve Kargl
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: Thomas Koenig @ 2017-10-27 22:39 UTC (permalink / raw)
  To: fortran, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 4225 bytes --]

Hello world,

this is a draft patch which interchanges the indices for FORALL and
DO CONCURRENT loops for cases like PR 82471, where code like

   DO CONCURRENT( K=1:N, J=1:M, I=1:L)
      C(I,J,K) = A(I,J,K) + B(I,J,K)
   END DO

led to very poor code because of stride issues.  Currently,
Graphite is not able to do this.

Without the patch, the code above is translated as


         i.7 = 1;
         count.10 = 512;
         while (1)
           {
             if (ANNOTATE_EXPR <count.10 <= 0, ivdep>) goto L.4;
             j.6 = 1;
             count.9 = 512;
             while (1)
               {
                 if (ANNOTATE_EXPR <count.9 <= 0, ivdep>) goto L.3;
                 k.5 = 1;
                 count.8 = 512;
                 while (1)
                   {
                     if (ANNOTATE_EXPR <count.8 <= 0, ivdep>) goto L.2;
                     (*(real(kind=4)[0:] * restrict) c.data)[((c.offset 
+ (integer(kind=8)) k.5 * c.dim[2].stride) + (integer(kind=8)) j.6 * 
c.dim[1].stride) + (integer(kind=8)) i.7] = (*(real(kind=4)[0:] * 
restrict) a.data)[((a.offset + (integer(kind=8)) k.5 * a.dim[2].stride) 
+ (integer(kind=8)) j.6 * a.dim[1].stride) + (integer(kind=8)) i.7] + 
(*(real(kind=4)[0:] * restrict) b.data)[((b.offset + (integer(kind=8)) 
k.5 * b.dim[2].stride) + (integer(kind=8)) j.6 * b.dim[1].stride) + 
(integer(kind=8)) i.7];
                     L.1:;
                     k.5 = k.5 + 1;
                     count.8 = count.8 + -1;
                   }
                 L.2:;
                 j.6 = j.6 + 1;
                 count.9 = count.9 + -1;
               }
             L.3:;
             i.7 = i.7 + 1;
             count.10 = count.10 + -1;
           }
         L.4:;

so the innermost loop has the biggest stride. With the patch
(and front-end optimization turned on), this is turned into

         k.7 = 1;
         count.10 = 512;
         while (1)
           {
             if (ANNOTATE_EXPR <count.10 <= 0, ivdep>) goto L.4;
             j.6 = 1;
             count.9 = 512;
             while (1)
               {
                 if (ANNOTATE_EXPR <count.9 <= 0, ivdep>) goto L.3;
                 i.5 = 1;
                 count.8 = 512;
                 while (1)
                   {
                     if (ANNOTATE_EXPR <count.8 <= 0, ivdep>) goto L.2;
                     (*(real(kind=4)[0:] * restrict) c.data)[((c.offset 
+ (integer(kind=8)) k.7 * c.dim[2].stride) + (integer(kind=8)) j.6 * 
c.dim[1].stride) + (integer(kind=8)) i.5] = (*(real(kind=4)[0:] * 
restrict) a.data)[((a.offset + (integer(kind=8)) k.7 * a.dim[2].stride) 
+ (integer(kind=8)) j.6 * a.dim[1].stride) + (integer(kind=8)) i.5] + 
(*(real(kind=4)[0:] * restrict) b.data)[((b.offset + (integer(kind=8)) 
k.7 * b.dim[2].stride) + (integer(kind=8)) j.6 * b.dim[1].stride) + 
(integer(kind=8)) i.5];
                     L.1:;
                     i.5 = i.5 + 1;
                     count.8 = count.8 + -1;
                   }
                 L.2:;
                 j.6 = j.6 + 1;
                 count.9 = count.9 + -1;
               }
             L.3:;
             k.7 = k.7 + 1;
             count.10 = count.10 + -1;
           }
         L.4:;

so the innermost loop is the one that gets traversed with unity stride
(the way it should have been done).

Although the algorithm here is quite simple, it resolves the issues
raised in the PR so far, and it definitely worth fixing.

If we do this kind of thing, it might also be possible to
interchange normal DO loops in a similar way (which Graphite also
cannot do at the moment, at least not if the bounds are variables).

So, comments? Suggestions? Other ideas? Any ideas how to write
a test case? Any volunteers to re-implement Graphite in the
Fortran front end (didn't think so) or to make Graphite catch
this sort of pattern (which it currently doesn't) instead?

Regards

	Thomas

2017-10-27  Thomas Koenig  <tkoenig@gcc.gnu.org>

         * frontend-passes.c (index_interchange): New funciton,
         prototype.
         (optimize_namespace): Call index_interchange.
         (ind_type): New function.
         (has_var): New function.
         (index_cost): New function.
         (loop_comp): New function.

[-- Attachment #2: p2.diff --]
[-- Type: text/x-patch, Size: 4592 bytes --]

Index: frontend-passes.c
===================================================================
--- frontend-passes.c	(Revision 253872)
+++ frontend-passes.c	(Arbeitskopie)
@@ -55,6 +55,7 @@ static gfc_expr* check_conjg_transpose_variable (g
 						 bool *);
 static bool has_dimen_vector_ref (gfc_expr *);
 static int matmul_temp_args (gfc_code **, int *,void *data);
+static int index_interchange (gfc_code **, int*, void *);
 
 #ifdef CHECKING_P
 static void check_locus (gfc_namespace *);
@@ -1385,6 +1386,9 @@ optimize_namespace (gfc_namespace *ns)
 		       NULL);
     }
 
+  gfc_code_walker (&ns->code, index_interchange, dummy_expr_callback,
+		   NULL);
+
   /* BLOCKs are handled in the expression walker below.  */
   for (ns = ns->contained; ns; ns = ns->sibling)
     {
@@ -4225,6 +4229,157 @@ inline_matmul_assign (gfc_code **c, int *walk_subt
   return 0;
 }
 
+
+/* Code for index interchange for loops which are grouped together in DO
+   CONCURRENT or FORALL statements.  This is currently only applied if the
+   iterations are grouped together in a single statement.
+
+   For this transformation, tt is assumed that memory access in strides is
+   expensive, and that loops which access later indices (which access memory
+   in bigger strides) shoud be moved to the first loops.
+
+   For this, a loop over all the statements is executed, counting the times
+   that the loop iteration values are acessed in each index.  The loop
+   indices are then sorted to minimize access to later indces from inner
+   loops.  */
+
+/* Type for holding index information.  */
+
+typepedef struct {
+  gfc_symbol *sym;
+  gfc_forall_iterator *fa;
+  int num;
+  int n[GFC_MAX_DIMENSIONS];
+} ind_type;
+
+/* Callback function to determine if an expression is the 
+   corresponding variable.  */
+
+static int
+has_var (gfc_expr **e, int *walk_subtrees ATTRIBUTE_UNUSED, void *data)
+{
+  gfc_expr *expr = *e;
+  gfc_symbol *sym;
+
+  if (expr->expr_type != EXPR_VARIABLE)
+    return 0;
+
+  sym = (gfc_symbol *) data;
+  return sym == expr->symtree->n.sym;
+}
+
+/* Callback function to calculate the cost of a certain index.  */
+
+static int
+index_cost (gfc_expr **e, int *walk_subtrees ATTRIBUTE_UNUSED,
+	    void *data)
+{
+  ind_type *ind;
+  gfc_expr *expr;
+  gfc_array_ref *ar;
+  gfc_ref *ref;
+  int i,j;
+
+  expr = *e;
+  if (expr->expr_type != EXPR_VARIABLE)
+    return 0;
+
+  ar = NULL;
+  for (ref = expr->ref; ref; ref = ref->next)
+    {
+      if (ref->type == REF_ARRAY)
+	{
+	  ar = &ref->u.ar;
+	  break;
+	}
+    }
+  if (ar == NULL || ar->type != AR_ELEMENT)
+    return 0;
+
+  ind = (ind_type *) data;
+  for (i = 0; i < ar->dimen; i++)
+    {
+      for (j=0; ind[j].sym != NULL; j++)
+	{
+	  if (gfc_expr_walker (&ar->start[i], has_var, (void *) (ind[j].sym)))
+	      ind[j].n[i]++;
+	}
+    }
+  return 0;
+}
+
+/* Callback function for qsort, to sort the loop indices. */
+
+static int
+loop_comp (const void *e1, const void *e2)
+{
+  const ind_type *i1 = (const ind_type *) e1;
+  const ind_type *i2 = (const ind_type *) e2;
+  int i;
+
+  for (i=GFC_MAX_DIMENSIONS-1; i >= 0; i--)
+    {
+      if (i1->n[i] != i2->n[i])
+	return i1->n[i] - i2->n[i];
+    }
+  /* All other things being equal, let's not change the ordering.  */
+  return i2->num - i1->num;
+}
+
+/* Main function to do the index interchange.  */
+
+static int
+index_interchange (gfc_code **c, int *walk_subtrees ATTRIBUTE_UNUSED,
+		  void *data ATTRIBUTE_UNUSED)
+{
+  gfc_code *co;
+  co = *c;
+  int n_iter;
+  gfc_forall_iterator *fa;
+  ind_type *ind;
+  int i, j;
+  
+  if (co->op != EXEC_FORALL && co->op != EXEC_DO_CONCURRENT)
+    return 0;
+
+  n_iter = 0;
+  for (fa = co->ext.forall_iterator; fa; fa = fa->next)
+    n_iter ++;
+
+  /* Nothing to reorder. */
+  if (n_iter < 2)
+    return 0;
+
+  ind = XALLOCAVEC (ind_type, n_iter + 1);
+
+  i = 0;
+  for (fa = co->ext.forall_iterator; fa; fa = fa->next)
+    {
+      ind[i].sym = fa->var->symtree->n.sym;
+      ind[i].fa = fa;
+      for (j=0; j<GFC_MAX_DIMENSIONS; j++)
+	ind[i].n[j] = 0;
+      ind[i].num = i;
+      i++;
+    }
+  ind[n_iter].sym = NULL;
+  ind[n_iter].fa = NULL;
+
+  gfc_code_walker (c, gfc_dummy_code_callback, index_cost, (void *) ind);
+  qsort ((void *) ind, n_iter, sizeof (ind_type), loop_comp);
+
+  /* Do the actual index interchange.  */
+  co->ext.forall_iterator = fa = ind[0].fa;
+  for (i=1; i<n_iter; i++)
+    {
+      fa->next = ind[i].fa;
+      fa = fa->next;
+    }
+  fa->next = NULL;
+
+  return 0;
+}
+
 #define WALK_SUBEXPR(NODE) \
   do							\
     {							\

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2017-10-30 20:40 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-10-27 22:39 [patch, fortran, RFC] Interchange indices for FORALL and DO CONCURRENT if profitable Thomas Koenig
2017-10-27 22:39 ` Steve Kargl
2017-10-28 12:54   ` Thomas Koenig
2017-10-30 20:50     ` Steve Kargl
2017-10-28 17:15 ` Richard Biener
2017-10-29 17:28 ` Thomas Koenig

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).