public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [patch] [autovect] bugfix in outer-loop vectorization
@ 2007-08-01  7:22 Dorit Nuzman
  0 siblings, 0 replies; only message in thread
From: Dorit Nuzman @ 2007-08-01  7:22 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1516 bytes --]


We can't allow vectorization of widening reduction patterns like DOT_PROD
and WIDEN_SUM when their results are used in the loop. This is because
these idioms compute N widened results and then reduce them into N/2
results by summing up pairs of results. So if we need to use (say, store)
the original N results, we cannot use this idiom. (In innermost-loop
vectorization this cannot happen because a reduction cannot be used in the
loop. However, in outer-loop vectorization, a reduction in the inner-loop
can be used in the outer-loop.)

Bootstrapped with vectorization enabled and tested on the vectorizer
testcases on powerpc64-linux. Committed to autovect branch.

dorit

        * tree-vect-analyze.c (vect_mark_relevant): Don't mark widening
        reduction patterns as relevant if they are used in the outer-loop
        (we want to vectorize the original sequence instead).
        (vect_mark_stmts_to_be_vectorized): Add an assert.
        * tree-vect-transform.c (vect_finalize_reduction): Create and set a
        stmt_vec_info for the epilog stmt.

        * gcc.dg/vect/vect-outer-4e.c: New test.
        * gcc.dg/vect/vect-outer-4f.c: New test.
        * gcc.dg/vect/vect-outer-4g.c: New test.
        * gcc.dg/vect/no-section-anchors-vect-outer-4h.c: New test.
        * gcc.dg/vect/vect-outer-4i.c: New test.
        * gcc.dg/vect/vect-outer-4j.c: New test.
        * gcc.dg/vect/vect-outer-4k.c: New test.
        * gcc.dg/vect/vect-outer-4l.c: New test.
        * gcc.dg/vect/vect-outer-4m.c: New test.

(See attached file: autovectfix.txt)

[-- Attachment #2: autovectfix.txt --]
[-- Type: text/plain, Size: 14654 bytes --]

Index: testsuite/gcc.dg/vect/vect-outer-4g.c
===================================================================
--- testsuite/gcc.dg/vect/vect-outer-4g.c	(revision 0)
+++ testsuite/gcc.dg/vect/vect-outer-4g.c	(revision 0)
@@ -0,0 +1,54 @@
+/* { dg-require-effective-target vect_int } */
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 40
+#define M 128
+unsigned short in[N+M];
+unsigned int out[N];
+
+/* Outer-loop vectorization. */
+/* Not vectorized due to multiple-types in the inner-loop.  */
+
+unsigned int
+foo (){
+  int i,j;
+  unsigned int diff;
+  unsigned int s=0;
+
+  for (i = 0; i < N; i++) {
+    diff = 0;
+    for (j = 0; j < M; j+=8) {
+      diff += in[j+i];
+    }
+    s+=diff;
+  }
+  return s;
+}
+
+int main (void)
+{
+  int i, j;
+  unsigned int diff;
+  unsigned int s,sum=0;
+
+  check_vect ();
+
+  sum=foo ();
+
+  for (i = 0; i < N; i++) {
+    diff = 0;
+    for (j = 0; j < M; j+=8) {
+      diff += in[j+i];
+    }
+    s += diff;
+  }
+
+  if (s != sum)
+    abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
Index: testsuite/gcc.dg/vect/vect-outer-4i.c
===================================================================
--- testsuite/gcc.dg/vect/vect-outer-4i.c	(revision 0)
+++ testsuite/gcc.dg/vect/vect-outer-4i.c	(revision 0)
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+
+#define N 40
+#define M 128
+unsigned char in[N+M];
+unsigned short out[N];
+
+/* Outer-loop vectorization. */
+/* Not vectorized due to multiple-types in the inner-loop.  */
+
+unsigned short
+foo (){
+  int i,j;
+  unsigned short diff;
+  unsigned short s=0;
+
+  for (i = 0; i < N; i++) {
+    diff = 0;
+    for (j = 0; j < M; j+=8) {
+      diff += in[j+i];
+    }
+    s+=diff;
+  }
+  return s;
+}
+
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
Index: testsuite/gcc.dg/vect/vect-outer-4j.c
===================================================================
--- testsuite/gcc.dg/vect/vect-outer-4j.c	(revision 0)
+++ testsuite/gcc.dg/vect/vect-outer-4j.c	(revision 0)
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+
+#define N 40
+#define M 128
+unsigned char in[N+M];
+unsigned short out[N];
+
+/* Outer-loop vectorization. */
+/* Not vectorized due to multiple-types in the inner-loop.  */
+
+void
+foo (){
+  int i,j;
+  unsigned short diff;
+
+  for (i = 0; i < N; i++) {
+    diff = 0;
+    for (j = 0; j < M; j+=8) {
+      diff += in[j+i];
+    }
+    out[i]=diff;
+  }
+}
+
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
Index: testsuite/gcc.dg/vect/no-section-anchors-vect-outer-4h.c
===================================================================
--- testsuite/gcc.dg/vect/no-section-anchors-vect-outer-4h.c	(revision 0)
+++ testsuite/gcc.dg/vect/no-section-anchors-vect-outer-4h.c	(revision 0)
@@ -0,0 +1,47 @@
+/* { dg-require-effective-target vect_int } */
+#include <stdarg.h>
+#include "tree-vect.h"
+
+
+#define N 40
+#define M 128
+unsigned short a[M][N];
+unsigned int out[N];
+
+/* Outer-loop vectorization. */
+
+void
+foo (){
+  int i,j;
+  unsigned int diff;
+
+  for (i = 0; i < N; i++) {
+    for (j = 0; j < M; j++) {
+      a[j][i] = 4;
+    }
+    out[i]=5;
+  }
+}
+
+int main (void)
+{
+  int i, j;
+  check_vect ();
+
+  foo ();
+
+  for (i = 0; i < N; i++) {
+    for (j = 0; j < M; j++) {
+      if (a[j][i] != 4)
+        abort ();
+    }
+    if (out[i] != 5)
+      abort ();
+  }
+
+  return 0;
+}
+
+
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
Index: testsuite/gcc.dg/vect/vect-outer-4k.c
===================================================================
--- testsuite/gcc.dg/vect/vect-outer-4k.c	(revision 0)
+++ testsuite/gcc.dg/vect/vect-outer-4k.c	(revision 0)
@@ -0,0 +1,54 @@
+/* { dg-require-effective-target vect_int } */
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 40
+#define M 128
+unsigned short in[N+M];
+unsigned int out[N];
+
+/* Outer-loop vectorization. */
+/* Not vectorized due to multiple-types in the inner-loop.  */
+
+unsigned int
+foo (){
+  int i,j;
+  unsigned int diff;
+  unsigned int s=0;
+
+  for (i = 0; i < N; i++) {
+    diff = 0;
+    for (j = 0; j < M; j+=8) {
+      diff += in[j+i];
+    }
+    s+=(diff>>3);
+  }
+  return s;
+}
+
+int main (void)
+{
+  int i, j;
+  unsigned int diff;
+  unsigned int s,sum=0;
+
+  check_vect ();
+
+  sum=foo ();
+
+  for (i = 0; i < N; i++) {
+    diff = 0;
+    for (j = 0; j < M; j+=8) {
+      diff += in[j+i];
+    }
+    s += (diff>>3);
+  }
+
+  if (s != sum)
+    abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* }  } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
Index: testsuite/gcc.dg/vect/vect-outer-4l.c
===================================================================
--- testsuite/gcc.dg/vect/vect-outer-4l.c	(revision 0)
+++ testsuite/gcc.dg/vect/vect-outer-4l.c	(revision 0)
@@ -0,0 +1,56 @@
+/* { dg-require-effective-target vect_int } */
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 40
+#define M 128
+unsigned short in[N+M];
+unsigned int out[N];
+unsigned char arr[N];
+
+/* Outer-loop vectorization. */
+/* Not vectorized due to multiple-types in the inner-loop.  */
+
+unsigned int
+foo (){
+  int i,j;
+  unsigned int diff;
+  unsigned int s=0;
+
+  for (i = 0; i < N; i++) {
+    arr[i] = 3;
+    diff = 0;
+    for (j = 0; j < M; j+=8) {
+      diff += in[j+i];
+    }
+    s+=diff;
+  }
+  return s;
+}
+
+int main (void)
+{
+  int i, j;
+  unsigned int diff;
+  unsigned int s,sum=0;
+
+  check_vect ();
+
+  sum=foo ();
+
+  for (i = 0; i < N; i++) {
+    diff = 0;
+    for (j = 0; j < M; j+=8) {
+      diff += in[j+i];
+    }
+    s += diff;
+  }
+
+  if (s != sum)
+    abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
Index: testsuite/gcc.dg/vect/vect-outer-4e.c
===================================================================
--- testsuite/gcc.dg/vect/vect-outer-4e.c	(revision 0)
+++ testsuite/gcc.dg/vect/vect-outer-4e.c	(revision 0)
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+
+#define N 40
+#define M 128
+unsigned int in[N+M];
+unsigned short out[N];
+
+/* Outer-loop vectorization. */
+
+void
+foo (){
+  int i,j;
+  unsigned int diff;
+
+  for (i = 0; i < N; i++) {
+    diff = 0;
+    for (j = 0; j < M; j+=8) {
+      diff += in[j+i];
+    }
+    out[i]=(unsigned short)diff;
+  }
+}
+
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
Index: testsuite/gcc.dg/vect/vect-outer-4m.c
===================================================================
--- testsuite/gcc.dg/vect/vect-outer-4m.c	(revision 0)
+++ testsuite/gcc.dg/vect/vect-outer-4m.c	(revision 0)
@@ -0,0 +1,54 @@
+/* { dg-require-effective-target vect_int } */
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 40
+#define M 128
+unsigned short in[N+M];
+unsigned int out[N];
+
+/* Outer-loop vectorization. */
+/* Not vectorized due to multiple-types in the inner-loop.  */
+
+unsigned int
+foo (){
+  int i,j;
+  unsigned int diff;
+  unsigned int s=0;
+
+  for (i = 0; i < N; i++) {
+    diff = 0;
+    for (j = 0; j < M; j+=8) {
+      diff += in[j+i];
+    }
+    s+=((unsigned short)diff>>3);
+  }
+  return s;
+}
+
+int main (void)
+{
+  int i, j;
+  unsigned int diff;
+  unsigned int s,sum=0;
+
+  check_vect ();
+
+  sum=foo ();
+
+  for (i = 0; i < N; i++) {
+    diff = 0;
+    for (j = 0; j < M; j+=8) {
+      diff += in[j+i];
+    }
+    s += (diff>>3);
+  }
+
+  if (s != sum)
+    abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect"  { xfail *-*-* } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
Index: testsuite/gcc.dg/vect/vect-outer-4f.c
===================================================================
--- testsuite/gcc.dg/vect/vect-outer-4f.c	(revision 0)
+++ testsuite/gcc.dg/vect/vect-outer-4f.c	(revision 0)
@@ -0,0 +1,49 @@
+/* { dg-require-effective-target vect_int } */
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 40
+#define M 128
+unsigned short in[N+M];
+unsigned int out[N];
+
+/* Outer-loop vectorization. */
+/* Not vectorized due to multiple-types in the inner-loop.  */
+
+void
+foo (){
+  int i,j;
+  unsigned int diff;
+
+  for (i = 0; i < N; i++) {
+    diff = 0;
+    for (j = 0; j < M; j+=8) {
+      diff += in[j+i];
+    }
+    out[i]=diff;
+  }
+}
+
+int main (void)
+{
+  int i, j;
+  unsigned int diff;
+
+  check_vect ();
+
+  foo ();
+
+  for (i = 0; i < N; i++) {
+    diff = 0;
+    for (j = 0; j < M; j+=8) {
+      diff += in[j+i];
+    }
+    if (out[i] != diff)
+      abort ();
+  }
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
Index: tree-vect-analyze.c
===================================================================
--- tree-vect-analyze.c	(revision 127086)
+++ tree-vect-analyze.c	(working copy)
@@ -588,14 +588,14 @@
           return false;
         }
       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-          && LOOP_VINFO_NITERS (loop_vinfo)
-          && TREE_CODE (LOOP_VINFO_NITERS (loop_vinfo)) == COND_EXPR)
-        {
-          if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
-            fprintf (vect_dump,
-                    "not vectorized: can't create epilog loop 2.");
-          return false;
-        }
+	  && LOOP_VINFO_NITERS (loop_vinfo)
+	  && TREE_CODE (LOOP_VINFO_NITERS (loop_vinfo)) == COND_EXPR)
+	{
+	  if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
+	    fprintf (vect_dump,
+		     "not vectorized: can't create epilog loop 2.");
+	    return false;
+	}
       if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
         {
           if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
@@ -3039,16 +3039,43 @@
 
       /* This is the last stmt in a sequence that was detected as a 
          pattern that can potentially be vectorized.  Don't mark the stmt
-         as relevant/live because it's not going to vectorized.
+         as relevant/live because it's not going to be vectorized.
          Instead mark the pattern-stmt that replaces it.  */
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "last stmt in pattern. don't mark relevant/live.");
+
       pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
-      stmt_info = vinfo_for_stmt (pattern_stmt);
-      gcc_assert (STMT_VINFO_RELATED_STMT (stmt_info) == stmt);
-      save_relevant = STMT_VINFO_RELEVANT (stmt_info);
-      save_live_p = STMT_VINFO_LIVE_P (stmt_info);
-      stmt = pattern_stmt;
+
+      /* One exception to the above is when the pattern-stmt is an
+	 "unordered reduction" operation, whose results are used in the
+	 outer-loop, in which case the order of the generated 
+	 results is important, and therefore we can't vectorize the pattern. 
+
+	 An "unordered reduction" is a reduction that is vectorized without 
+	 preserving all the intermediate results, like widen_sum and dot_prod, 
+	 that produce only N/2 results (by summing up pairs of intermediate 
+	 results). If these results are actually used (e.g., stored, in an 
+	 outer-loop), we need to have all N results (and in the right order). 
+	 Therefore, in such a case, we cannot vectorize the reduction pattern,
+	 and need to resort to vectorizing the original stmts.  */
+      if ((TREE_CODE (GIMPLE_STMT_OPERAND (pattern_stmt, 1)) == WIDEN_SUM_EXPR
+	   || TREE_CODE (GIMPLE_STMT_OPERAND (pattern_stmt,1)) == DOT_PROD_EXPR)
+	  && (relevant == vect_used_in_outer 
+	      || relevant == vect_used_in_outer_by_reduction))
+        {
+	  if (vect_print_dump_info (REPORT_DETAILS))
+	    fprintf (vect_dump, "skip unordered reduction pattern.");
+	  STMT_VINFO_RELATED_STMT (stmt_info) = NULL_TREE;
+	  STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
+	}
+      else
+	{
+	  if (vect_print_dump_info (REPORT_DETAILS))
+	    fprintf (vect_dump, "last stmt in pattern. don't mark relevant/live.");
+	  stmt_info = vinfo_for_stmt (pattern_stmt);
+	  gcc_assert (STMT_VINFO_RELATED_STMT (stmt_info) == stmt);
+	  save_relevant = STMT_VINFO_RELEVANT (stmt_info);
+	  save_live_p = STMT_VINFO_LIVE_P (stmt_info);
+	  stmt = pattern_stmt;
+	}
     }
 
   STMT_VINFO_LIVE_P (stmt_info) |= live_p;
@@ -3391,12 +3418,11 @@
 	 Reduction phis are expected to be used by a reduction stmt, or by
 	 in an outer loop;  Other reduction stmts are expected to be
 	 in the loop, and possibly used by a stmt in an outer loop. 
-	are the expected values of "relevant" for reduction phis/stmts in
-	op:
+	 Here are the expected values of "relevant" for reduction phis/stmts:
 
 	 relevance:				phi	stmt
 	 vect_unused_in_loop				ok
-	 vect_used_in_outer_by_reductio		ok	ok
+	 vect_used_in_outer_by_reduction	ok	ok
 	 vect_used_in_outer			ok	ok
 	 vect_used_by_reduction			ok
 	 vect_used_in_loop 						  */
@@ -3413,6 +3439,8 @@
 
 	    case vect_used_in_outer_by_reduction:
 	    case vect_used_in_outer:
+	      gcc_assert (TREE_CODE (stmt) != WIDEN_SUM_EXPR
+			  && TREE_CODE (stmt) != DOT_PROD_EXPR);
 	      break;
 
 	    case vect_used_by_reduction:
Index: tree-vect-transform.c
===================================================================
--- tree-vect-transform.c	(revision 127086)
+++ tree-vect-transform.c	(working copy)
@@ -1956,7 +1956,6 @@
   vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
   gcc_assert (vec_stmt_for_operand);
   vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt_for_operand, 0);
-
   return vec_oprnd;
 }
 
@@ -2499,6 +2498,7 @@
     }
   /* We expect to have found an exit_phi because of loop-closed-ssa form.  */
   gcc_assert (exit_phi);
+
   if (nested_in_vect_loop)
     {
       stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
@@ -2510,6 +2510,9 @@
 
       epilog_stmt = adjustment_def ? epilog_stmt :  new_phi;
       STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
+      set_stmt_info (get_stmt_ann (epilog_stmt),
+                     new_stmt_vec_info (epilog_stmt, loop_vinfo));
+
       if (vect_print_dump_info (REPORT_DETAILS))
         {
           fprintf (vect_dump, "vector of partial results after inner-loop:");

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2007-08-01  7:22 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-08-01  7:22 [patch] [autovect] bugfix in outer-loop vectorization Dorit Nuzman

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).