[gcc r11-3691] openmp: Improve composite simd vectorization

public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed

* [gcc r11-3691] openmp: Improve composite simd vectorization
@ 2020-10-07  8:51 Jakub Jelinek
  0 siblings, 0 replies; only message in thread
From: Jakub Jelinek @ 2020-10-07  8:51 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:83f565ed4f37e550e1d40f7b6cf0b5845f29a9c7

commit r11-3691-g83f565ed4f37e550e1d40f7b6cf0b5845f29a9c7
Author: Jakub Jelinek <jakub@redhat.com>
Date:   Wed Oct 7 10:49:37 2020 +0200

    openmp: Improve composite simd vectorization
    
    > > I was really hoping bbs 4 and 5 would be one loop (the one I set safelen
    > > and force_vectorize etc. for) and that basic blocks 6 and 7 would be
    > > together with that inner loop another loop, but apparently loop discovery
    > > thinks it is just one loop.
    > > Any ideas what I'm doing wrong or is there any way how to make it two loops
    > > (that would also survive all the cfg cleanups until vectorization)?
    >
    > The early CFG looks like we have a common header with two latches
    > so it boils down to how we disambiguate those in the end (we seem
    > to unify the latches via a forwarder).  IIRC OMP lowering builds
    > loops itself, could it not do the appropriate disambiguation itself?
    
    I realized I emit the same stmts on both paths (before goto doit; and before
    falling through to it), at least the MIN_EXPR and PLUS_EXPR.  So by forcing
    an extra bb there which does those two, and placing the "doit" label before
    it, the innermost loop doesn't have multiple latches anymore and so is
    vectorized fine.
    
    2020-10-07  Jakub Jelinek  <jakub@redhat.com>
    
            * omp-expand.c (expand_omp_simd): Don't emit MIN_EXPR and PLUS_EXPR
            at the end of entry_bb and innermost init_bb, instead force arguments
            for MIN_EXPR into temporaries in both cases and jump to a new bb that
            performs MIN_EXPR and PLUS_EXPR.
    
            * gcc.dg/gomp/simd-2.c: New test.
            * gcc.dg/gomp/simd-3.c: New test.

Diff:
---
 gcc/omp-expand.c                   | 19 +++++++++++---
 gcc/testsuite/gcc.dg/gomp/simd-2.c | 51 ++++++++++++++++++++++++++++++++++++++
 gcc/testsuite/gcc.dg/gomp/simd-3.c | 51 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 117 insertions(+), 4 deletions(-)

diff --git a/gcc/omp-expand.c b/gcc/omp-expand.c
index 99cb4f9dda4..0d3008994e8 100644
--- a/gcc/omp-expand.c
+++ b/gcc/omp-expand.c
@@ -6347,6 +6347,7 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
   tree n2var = NULL_TREE;
   tree n2v = NULL_TREE;
   tree *nonrect_bounds = NULL;
+  tree min_arg1 = NULL_TREE, min_arg2 = NULL_TREE;
   if (fd->collapse > 1)
     {
       if (broken_loop || gimple_omp_for_combined_into_p (fd->for_stmt))
@@ -6406,9 +6407,10 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
 			     fold_convert (itype, fd->loops[i].step));
 	  t = fold_convert (type, t);
 	  tree t2 = fold_build2 (MINUS_EXPR, type, n2, n1);
-	  t = fold_build2 (MIN_EXPR, type, t2, t);
-	  t = fold_build2 (PLUS_EXPR, type, fd->loop.v, t);
-	  expand_omp_build_assign (&gsi, n2var, t);
+	  min_arg1 = create_tmp_var (type);
+	  expand_omp_build_assign (&gsi, min_arg1, t2);
+	  min_arg2 = create_tmp_var (type);
+	  expand_omp_build_assign (&gsi, min_arg2, t);
 	}
       else
 	{
@@ -6815,7 +6817,16 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
 		}
 	      else
 		t = counts[i + 1];
-	      t = fold_build2 (MIN_EXPR, type, t2, t);
+	      expand_omp_build_assign (&gsi, min_arg1, t2);
+	      expand_omp_build_assign (&gsi, min_arg2, t);
+	      e = split_block (init_bb, last_stmt (init_bb));
+	      gsi = gsi_after_labels (e->dest);
+	      init_bb = e->dest;
+	      remove_edge (FALLTHRU_EDGE (entry_bb));
+	      make_edge (entry_bb, init_bb, EDGE_FALLTHRU);
+	      set_immediate_dominator (CDI_DOMINATORS, init_bb, entry_bb);
+	      set_immediate_dominator (CDI_DOMINATORS, l1_bb, init_bb);
+	      t = fold_build2 (MIN_EXPR, type, min_arg1, min_arg2);
 	      t = fold_build2 (PLUS_EXPR, type, fd->loop.v, t);
 	      expand_omp_build_assign (&gsi, n2var, t);
 	    }
diff --git a/gcc/testsuite/gcc.dg/gomp/simd-2.c b/gcc/testsuite/gcc.dg/gomp/simd-2.c
new file mode 100644
index 00000000000..7ac3eb4444a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/gomp/simd-2.c
@@ -0,0 +1,51 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fopenmp -fdump-tree-vect-details" } */
+/* { dg-additional-options "-mavx" { target avx } } */
+/* { dg-final { scan-tree-dump-times "vectorized \[1-9]\[0-9]* loops in function" 5 "vect" } } */
+
+int a[10000][128];
+
+void
+foo (void)
+{
+  #pragma omp for simd schedule (simd: dynamic, 32) collapse(2)
+  for (int i = 0; i < 10000; i++)
+    for (int j = 0; j < 128; j++)
+      a[i][j] += 3;
+}
+
+void
+bar (void)
+{
+  #pragma omp parallel for simd schedule (simd: dynamic, 32) collapse(2)
+  for (int i = 0; i < 10000; i++)
+    for (int j = 0; j < 128; j++)
+      a[i][j] += 3;
+}
+
+void
+baz (void)
+{
+  #pragma omp distribute parallel for simd schedule (simd: dynamic, 32) collapse(2)
+  for (int i = 0; i < 10000; i++)
+    for (int j = 0; j < 128; j++)
+      a[i][j] += 3;
+}
+
+void
+qux (void)
+{
+  #pragma omp distribute simd dist_schedule (static, 128) collapse(2)
+  for (int i = 0; i < 10000; i++)
+    for (int j = 0; j < 128; j++)
+      a[i][j] += 3;
+}
+
+void
+corge (void)
+{
+  #pragma omp taskloop simd collapse(2)
+  for (int i = 0; i < 10000; i++)
+    for (int j = 0; j < 128; j++)
+      a[i][j] += 3;
+}
diff --git a/gcc/testsuite/gcc.dg/gomp/simd-3.c b/gcc/testsuite/gcc.dg/gomp/simd-3.c
new file mode 100644
index 00000000000..13e1346da03
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/gomp/simd-3.c
@@ -0,0 +1,51 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fopenmp -fdump-tree-vect-details" } */
+/* { dg-additional-options "-mavx" { target avx } } */
+/* { dg-final { scan-tree-dump-times "vectorized \[1-9]\[0-9]* loops in function" 5 "vect" } } */
+
+int a[1024][1024];
+
+void
+foo (void)
+{
+  #pragma omp for simd collapse(2)
+  for (int i = 0; i < 1024; i++)
+    for (int j = 0; j < i; j++)
+      a[i][j] += 3;
+}
+
+void
+bar (void)
+{
+  #pragma omp parallel for simd collapse(2)
+  for (int i = 0; i < 1024; i++)
+    for (int j = 0; j < i; j++)
+      a[i][j] += 3;
+}
+
+void
+baz (void)
+{
+  #pragma omp distribute parallel for simd collapse(2)
+  for (int i = 0; i < 1024; i++)
+    for (int j = 0; j < i; j++)
+      a[i][j] += 3;
+}
+
+void
+qux (void)
+{
+  #pragma omp distribute simd collapse(2)
+  for (int i = 0; i < 1024; i++)
+    for (int j = 0; j < i; j++)
+      a[i][j] += 3;
+}
+
+void
+corge (void)
+{
+  #pragma omp taskloop simd collapse(2)
+  for (int i = 0; i < 1024; i++)
+    for (int j = 0; j < i; j++)
+      a[i][j] += 3;
+}


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2020-10-07  8:51 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-10-07  8:51 [gcc r11-3691] openmp: Improve composite simd vectorization Jakub Jelinek

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).