public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] Fixing PR59006 and PR58921 by delaying loop invariant hoisting in vectorizer.
@ 2013-11-23 14:13 Cong Hou
  2013-11-27 11:37 ` Richard Biener
  0 siblings, 1 reply; 15+ messages in thread
From: Cong Hou @ 2013-11-23 14:13 UTC (permalink / raw)
  To: GCC Patches; +Cc: Richard Biener

[-- Attachment #1: Type: text/plain, Size: 9240 bytes --]

Hi

Currently in GCC vectorization, some loop invariant may be detected
after aliasing checks, which can be hoisted outside of the loop. The
current method in GCC may break the information built during the
analysis phase, causing some crash (see PR59006 and PR58921).

This patch improves the loop invariant hoisting by delaying it until
all statements are vectorized, thereby keeping all built information.
But those loop invariant statements won't be vectorized, and if a
variable is defined by one of those loop invariant, it is treated as
an external definition.

Bootstrapped and testes on an x86-64 machine.


thanks,
Cong



diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 2c0554b..0614bab 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,18 @@
+2013-11-22  Cong Hou  <congh@google.com>
+
+ PR tree-optimization/58921
+ PR tree-optimization/59006
+ * tree-vectorizer.h (struct _stmt_vec_info): New data member
+ loop_invariant.
+ * tree-vect-loop-manip.c (vect_loop_versioning): Delay hoisting loop
+ invariants until all statements are vectorized.
+ * tree-vect-loop.c (vect_hoist_loop_invariants): New functions.
+ (vect_transform_loop): Hoist loop invariants after all statements
+ are vectorized.  Do not vectorize loop invariants stmts.
+ * tree-vect-stmts.c (vect_get_vec_def_for_operand): Treat a loop
+ invariant as an external definition.
+ (new_stmt_vec_info): Initialize new data member.
+
 2013-11-12  Jeff Law  <law@redhat.com>

  * tree-ssa-threadedge.c (thread_around_empty_blocks): New
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 09c7f20..447625b 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,10 @@
+2013-11-22  Cong Hou  <congh@google.com>
+
+ PR tree-optimization/58921
+ PR tree-optimization/59006
+ * gcc.dg/vect/pr58921.c: New test.
+ * gcc.dg/vect/pr59006.c: New test.
+
 2013-11-12  Balaji V. Iyer  <balaji.v.iyer@intel.com>

  * gcc.dg/cilk-plus/cilk-plus.exp: Added a check for LTO before running
diff --git a/gcc/testsuite/gcc.dg/vect/pr58921.c
b/gcc/testsuite/gcc.dg/vect/pr58921.c
new file mode 100644
index 0000000..ee3694a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr58921.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+
+int a[7];
+int b;
+
+void
+fn1 ()
+{
+  for (; b; b++)
+    a[b] = ((a[b] <= 0) == (a[0] != 0));
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/pr59006.c
b/gcc/testsuite/gcc.dg/vect/pr59006.c
new file mode 100644
index 0000000..95d90a9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr59006.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+
+int a[8], b;
+
+void fn1 (void)
+{
+  int c;
+  for (; b; b++)
+    {
+      int d = a[b];
+      c = a[0] ? d : 0;
+      a[b] = c;
+    }
+}
+
+void fn2 ()
+{
+  for (; b <= 0; b++)
+    a[b] = a[0] || b;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index 15227856..3adc73d 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -2448,8 +2448,12 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
   FOR_EACH_SSA_TREE_OPERAND (var, stmt, iter, SSA_OP_USE)
     {
       gimple def = SSA_NAME_DEF_STMT (var);
+      stmt_vec_info def_stmt_info;
+
       if (!gimple_nop_p (def)
-  && flow_bb_inside_loop_p (loop, gimple_bb (def)))
+  && flow_bb_inside_loop_p (loop, gimple_bb (def))
+  && !((def_stmt_info = vinfo_for_stmt (def))
+ && STMT_VINFO_LOOP_INVARIANT_P (def_stmt_info)))
  {
   hoist = false;
   break;
@@ -2458,21 +2462,8 @@ vect_loop_versioning (loop_vec_info loop_vinfo,

   if (hoist)
     {
-      if (dr)
- gimple_set_vuse (stmt, NULL);
-
-      gsi_remove (&si, false);
-      gsi_insert_on_edge_immediate (loop_preheader_edge (loop),
-    stmt);
-
-      if (dump_enabled_p ())
- {
-  dump_printf_loc
-      (MSG_NOTE, vect_location,
-       "hoisting out of the vectorized loop: ");
-  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
-  dump_printf (MSG_NOTE, "\n");
- }
+      STMT_VINFO_LOOP_INVARIANT_P (stmt_info) = true;
+      gsi_next (&si);
       continue;
     }
  }
@@ -2481,6 +2472,7 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
  }
     }

+
   /* End loop-exit-fixes after versioning.  */

   if (cond_expr_stmt_list)
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 292e771..148f9f1 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -5572,6 +5572,49 @@ vect_loop_kill_debug_uses (struct loop *loop,
gimple stmt)
     }
 }

+/* Find all loop invariants detected after alias checks, and hoist them
+   before the loop preheader.  */
+
+static void
+vect_hoist_loop_invariants (loop_vec_info loop_vinfo)
+{
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
+  gimple_seq loop_invariants = NULL;
+
+  for (int i = 0; i < (int)loop->num_nodes; i++)
+    {
+      basic_block bb = bbs[i];
+      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);)
+ {
+  gimple stmt = gsi_stmt (si);
+  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
+  if (stmt_vinfo && STMT_VINFO_LOOP_INVARIANT_P (stmt_vinfo))
+    {
+      if (gimple_has_mem_ops (stmt))
+ gimple_set_vuse (stmt, NULL);
+
+      gsi_remove (&si, false);
+      gimple_seq_add_stmt (&loop_invariants, stmt);
+
+      if (dump_enabled_p ())
+ {
+  dump_printf_loc
+    (MSG_NOTE, vect_location,
+     "hoisting out of the vectorized loop: ");
+  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
+  dump_printf (MSG_NOTE, "\n");
+ }
+    }
+  else
+    gsi_next (&si);
+ }
+    }
+  basic_block pre_header = loop_preheader_edge (loop)->src;
+  gcc_assert (EDGE_COUNT (pre_header->preds) == 1);
+  gsi_insert_seq_on_edge_immediate (EDGE_PRED (pre_header, 0),
loop_invariants);
+}
+
 /* Function vect_transform_loop.

    The analysis phase has determined that the loop is vectorizable.
@@ -5835,6 +5878,15 @@ vect_transform_loop (loop_vec_info loop_vinfo)
  transform_pattern_stmt = false;
             }

+          /* If stmt is a loop invariant (detected after alias checks),
+             do not generate the vectorized stmt for it as it will be
+             hoisted later.  */
+  if (STMT_VINFO_LOOP_INVARIANT_P (stmt_info))
+    {
+      gsi_next (&si);
+      continue;
+    }
+
   gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
   nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
                                                STMT_VINFO_VECTYPE (stmt_info));
@@ -5910,6 +5962,9 @@ vect_transform_loop (loop_vec_info loop_vinfo)
  }        /* stmts in BB */
     } /* BBs in loop */

+  /* Hoist all loop invariants.  */
+  vect_hoist_loop_invariants (loop_vinfo);
+
   slpeel_make_loop_iterate_ntimes (loop, ratio);

   /* Reduce loop iterations by the vectorization factor.  */
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index b0e0fa9..3e15372 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1362,6 +1362,18 @@ vect_get_vec_def_for_operand (tree op, gimple
stmt, tree *scalar_def)
         }
     }

+  /* After alias checks, some loop invariants may be detected, and we won't
+     generate vectorized stmts for them.  We only hoist them after all stmts
+     are vectorized.  Here if we meet a loop invariant, we need to assume it
+     is already hoisted before the loop.  We do this by setting the def-type
+     to vect_external_def.  */
+  if (def_stmt && dt == vect_internal_def)
+    {
+      stmt_vec_info stmt_vinfo = vinfo_for_stmt (def_stmt);
+      if (stmt_vinfo && STMT_VINFO_LOOP_INVARIANT_P (stmt_vinfo))
+ dt = vect_external_def;
+    }
+
   switch (dt)
     {
     /* Case 1: operand is a constant.  */
@@ -6083,6 +6095,7 @@ new_stmt_vec_info (gimple stmt, loop_vec_info loop_vinfo,
   STMT_VINFO_BB_VINFO (res) = bb_vinfo;
   STMT_VINFO_RELEVANT (res) = vect_unused_in_scope;
   STMT_VINFO_LIVE_P (res) = false;
+  STMT_VINFO_LOOP_INVARIANT_P (res) = false;
   STMT_VINFO_VECTYPE (res) = NULL;
   STMT_VINFO_VEC_STMT (res) = NULL;
   STMT_VINFO_VECTORIZABLE (res) = true;
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index bbd50e1..2c230f9 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -516,6 +516,10 @@ typedef struct _stmt_vec_info {
      used outside the loop.  */
   bool live;

+  /* Indicates whether this stmt is a loop invariant, which can be hoisted.
+     A stmt may become loop invariant after alias checks.  */
+  bool loop_invariant;
+
   /* Stmt is part of some pattern (computation idiom)  */
   bool in_pattern_p;

@@ -623,6 +627,7 @@ typedef struct _stmt_vec_info {
 #define STMT_VINFO_BB_VINFO(S)             (S)->bb_vinfo
 #define STMT_VINFO_RELEVANT(S)             (S)->relevant
 #define STMT_VINFO_LIVE_P(S)               (S)->live
+#define STMT_VINFO_LOOP_INVARIANT_P(S)     (S)->loop_invariant
 #define STMT_VINFO_VECTYPE(S)              (S)->vectype
 #define STMT_VINFO_VEC_STMT(S)             (S)->vectorized_stmt
 #define STMT_VINFO_VECTORIZABLE(S)         (S)->vectorizable

[-- Attachment #2: patch-fix-hoist.txt --]
[-- Type: text/plain, Size: 8736 bytes --]

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 2c0554b..0614bab 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,18 @@
+2013-11-22  Cong Hou  <congh@google.com>
+
+	PR tree-optimization/58921
+	PR tree-optimization/59006
+	* tree-vectorizer.h (struct _stmt_vec_info): New data member
+	loop_invariant.
+	* tree-vect-loop-manip.c (vect_loop_versioning): Delay hoisting loop
+	invariants until all statements are vectorized.
+	* tree-vect-loop.c (vect_hoist_loop_invariants): New functions.
+	(vect_transform_loop): Hoist loop invariants after all statements
+	are vectorized.  Do not vectorize loop invariants stmts.
+	* tree-vect-stmts.c (vect_get_vec_def_for_operand): Treat a loop
+	invariant as an external definition.
+	(new_stmt_vec_info): Initialize new data member.
+
 2013-11-12  Jeff Law  <law@redhat.com>
 
 	* tree-ssa-threadedge.c (thread_around_empty_blocks): New
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 09c7f20..447625b 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,10 @@
+2013-11-22  Cong Hou  <congh@google.com>
+
+	PR tree-optimization/58921
+	PR tree-optimization/59006
+	* gcc.dg/vect/pr58921.c: New test.
+	* gcc.dg/vect/pr59006.c: New test.
+
 2013-11-12  Balaji V. Iyer  <balaji.v.iyer@intel.com>
 
 	* gcc.dg/cilk-plus/cilk-plus.exp: Added a check for LTO before running
diff --git a/gcc/testsuite/gcc.dg/vect/pr58921.c b/gcc/testsuite/gcc.dg/vect/pr58921.c
new file mode 100644
index 0000000..ee3694a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr58921.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+
+int a[7];
+int b;
+
+void
+fn1 ()
+{
+  for (; b; b++)
+    a[b] = ((a[b] <= 0) == (a[0] != 0));
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/pr59006.c b/gcc/testsuite/gcc.dg/vect/pr59006.c
new file mode 100644
index 0000000..95d90a9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr59006.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+
+int a[8], b;
+
+void fn1 (void)
+{
+  int c;
+  for (; b; b++)
+    {
+      int d = a[b];
+      c = a[0] ? d : 0;
+      a[b] = c;
+    }
+}
+
+void fn2 ()
+{
+  for (; b <= 0; b++)
+    a[b] = a[0] || b;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index 15227856..3adc73d 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -2448,8 +2448,12 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
 		  FOR_EACH_SSA_TREE_OPERAND (var, stmt, iter, SSA_OP_USE)
 		    {
 		      gimple def = SSA_NAME_DEF_STMT (var);
+		      stmt_vec_info def_stmt_info;
+
 		      if (!gimple_nop_p (def)
-			  && flow_bb_inside_loop_p (loop, gimple_bb (def)))
+			  && flow_bb_inside_loop_p (loop, gimple_bb (def))
+			  && !((def_stmt_info = vinfo_for_stmt (def))
+				&& STMT_VINFO_LOOP_INVARIANT_P (def_stmt_info)))
 			{
 			  hoist = false;
 			  break;
@@ -2458,21 +2462,8 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
 
 		  if (hoist)
 		    {
-		      if (dr)
-			gimple_set_vuse (stmt, NULL);
-
-		      gsi_remove (&si, false);
-		      gsi_insert_on_edge_immediate (loop_preheader_edge (loop),
-						    stmt);
-
-		      if (dump_enabled_p ())
-			{
-			  dump_printf_loc
-			      (MSG_NOTE, vect_location,
-			       "hoisting out of the vectorized loop: ");
-			  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
-			  dump_printf (MSG_NOTE, "\n");
-			}
+		      STMT_VINFO_LOOP_INVARIANT_P (stmt_info) = true;
+		      gsi_next (&si);
 		      continue;
 		    }
 		}
@@ -2481,6 +2472,7 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
 	}
     }
 
+
   /* End loop-exit-fixes after versioning.  */
 
   if (cond_expr_stmt_list)
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 292e771..148f9f1 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -5572,6 +5572,49 @@ vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
     }
 }
 
+/* Find all loop invariants detected after alias checks, and hoist them
+   before the loop preheader.  */
+
+static void
+vect_hoist_loop_invariants (loop_vec_info loop_vinfo)
+{
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
+  gimple_seq loop_invariants = NULL;
+
+  for (int i = 0; i < (int)loop->num_nodes; i++)
+    {
+      basic_block bb = bbs[i];
+      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);)
+	{
+	  gimple stmt = gsi_stmt (si);
+	  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
+	  if (stmt_vinfo && STMT_VINFO_LOOP_INVARIANT_P (stmt_vinfo))
+	    {
+	      if (gimple_has_mem_ops (stmt))
+		gimple_set_vuse (stmt, NULL);
+
+	      gsi_remove (&si, false);
+	      gimple_seq_add_stmt (&loop_invariants, stmt);
+
+	      if (dump_enabled_p ())
+		{
+		  dump_printf_loc
+		    (MSG_NOTE, vect_location,
+		     "hoisting out of the vectorized loop: ");
+		  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
+		  dump_printf (MSG_NOTE, "\n");
+		}
+	    }
+	  else
+	    gsi_next (&si);
+	}
+    }
+  basic_block pre_header = loop_preheader_edge (loop)->src;
+  gcc_assert (EDGE_COUNT (pre_header->preds) == 1);
+  gsi_insert_seq_on_edge_immediate (EDGE_PRED (pre_header, 0), loop_invariants);
+}
+
 /* Function vect_transform_loop.
 
    The analysis phase has determined that the loop is vectorizable.
@@ -5835,6 +5878,15 @@ vect_transform_loop (loop_vec_info loop_vinfo)
 		transform_pattern_stmt = false;
             }
 
+          /* If stmt is a loop invariant (detected after alias checks),
+             do not generate the vectorized stmt for it as it will be
+             hoisted later.  */
+	  if (STMT_VINFO_LOOP_INVARIANT_P (stmt_info))
+	    {
+	      gsi_next (&si);
+	      continue;
+	    }
+
 	  gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
 	  nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
                                                STMT_VINFO_VECTYPE (stmt_info));
@@ -5910,6 +5962,9 @@ vect_transform_loop (loop_vec_info loop_vinfo)
 	}		        /* stmts in BB */
     }				/* BBs in loop */
 
+  /* Hoist all loop invariants.  */
+  vect_hoist_loop_invariants (loop_vinfo);
+
   slpeel_make_loop_iterate_ntimes (loop, ratio);
 
   /* Reduce loop iterations by the vectorization factor.  */
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index b0e0fa9..3e15372 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1362,6 +1362,18 @@ vect_get_vec_def_for_operand (tree op, gimple stmt, tree *scalar_def)
         }
     }
 
+  /* After alias checks, some loop invariants may be detected, and we won't
+     generate vectorized stmts for them.  We only hoist them after all stmts
+     are vectorized.  Here if we meet a loop invariant, we need to assume it
+     is already hoisted before the loop.  We do this by setting the def-type
+     to vect_external_def.  */
+  if (def_stmt && dt == vect_internal_def)
+    {
+      stmt_vec_info stmt_vinfo = vinfo_for_stmt (def_stmt);
+      if (stmt_vinfo && STMT_VINFO_LOOP_INVARIANT_P (stmt_vinfo))
+	dt = vect_external_def;
+    }
+
   switch (dt)
     {
     /* Case 1: operand is a constant.  */
@@ -6083,6 +6095,7 @@ new_stmt_vec_info (gimple stmt, loop_vec_info loop_vinfo,
   STMT_VINFO_BB_VINFO (res) = bb_vinfo;
   STMT_VINFO_RELEVANT (res) = vect_unused_in_scope;
   STMT_VINFO_LIVE_P (res) = false;
+  STMT_VINFO_LOOP_INVARIANT_P (res) = false;
   STMT_VINFO_VECTYPE (res) = NULL;
   STMT_VINFO_VEC_STMT (res) = NULL;
   STMT_VINFO_VECTORIZABLE (res) = true;
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index bbd50e1..2c230f9 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -516,6 +516,10 @@ typedef struct _stmt_vec_info {
      used outside the loop.  */
   bool live;
 
+  /* Indicates whether this stmt is a loop invariant, which can be hoisted.
+     A stmt may become loop invariant after alias checks.  */
+  bool loop_invariant;
+
   /* Stmt is part of some pattern (computation idiom)  */
   bool in_pattern_p;
 
@@ -623,6 +627,7 @@ typedef struct _stmt_vec_info {
 #define STMT_VINFO_BB_VINFO(S)             (S)->bb_vinfo
 #define STMT_VINFO_RELEVANT(S)             (S)->relevant
 #define STMT_VINFO_LIVE_P(S)               (S)->live
+#define STMT_VINFO_LOOP_INVARIANT_P(S)     (S)->loop_invariant
 #define STMT_VINFO_VECTYPE(S)              (S)->vectype
 #define STMT_VINFO_VEC_STMT(S)             (S)->vectorized_stmt
 #define STMT_VINFO_VECTORIZABLE(S)         (S)->vectorizable

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] Fixing PR59006 and PR58921 by delaying loop invariant hoisting in vectorizer.
  2013-11-23 14:13 [PATCH] Fixing PR59006 and PR58921 by delaying loop invariant hoisting in vectorizer Cong Hou
@ 2013-11-27 11:37 ` Richard Biener
  2013-11-27 12:45   ` Jakub Jelinek
  2013-11-28  4:39   ` Cong Hou
  0 siblings, 2 replies; 15+ messages in thread
From: Richard Biener @ 2013-11-27 11:37 UTC (permalink / raw)
  To: Cong Hou; +Cc: GCC Patches

On Fri, 22 Nov 2013, Cong Hou wrote:

> Hi
> 
> Currently in GCC vectorization, some loop invariant may be detected
> after aliasing checks, which can be hoisted outside of the loop. The
> current method in GCC may break the information built during the
> analysis phase, causing some crash (see PR59006 and PR58921).
> 
> This patch improves the loop invariant hoisting by delaying it until
> all statements are vectorized, thereby keeping all built information.
> But those loop invariant statements won't be vectorized, and if a
> variable is defined by one of those loop invariant, it is treated as
> an external definition.
> 
> Bootstrapped and testes on an x86-64 machine.

Hmm.  I'm still thinking that we should handle this during the regular
transform step.

Like with the following incomplete patch.  Missing is adjusting
the rest of the vectorizable_* functions to handle the case where all defs
are dt_external or constant by setting their own STMT_VINFO_DEF_TYPE to
dt_external.  From the gcc.dg/vect/pr58508.c we get only 4 hoists
instead of 8 because of this (I think).

Also gcc.dg/vect/pr52298.c ICEs for yet unanalyzed reason.

I can take over the bug if you like.

Thanks,
Richard.

Index: gcc/tree-vect-data-refs.c
===================================================================
*** gcc/tree-vect-data-refs.c	(revision 205435)
--- gcc/tree-vect-data-refs.c	(working copy)
*************** again:
*** 3668,3673 ****
--- 3668,3682 ----
  	    }
  	  STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
  	}
+       else if (loop_vinfo
+ 	       && integer_zerop (DR_STEP (dr)))
+ 	{
+ 	  /* All loads from a non-varying address will be disambiguated
+ 	     by data-ref analysis or via a runtime alias check and thus
+ 	     they will become invariant.  Force them to be vectorized
+ 	     as external.  */
+ 	  STMT_VINFO_DEF_TYPE (stmt_info) = vect_external_def;
+ 	}
      }
  
    /* If we stopped analysis at the first dataref we could not analyze
Index: gcc/tree-vect-loop-manip.c
===================================================================
*** gcc/tree-vect-loop-manip.c	(revision 205435)
--- gcc/tree-vect-loop-manip.c	(working copy)
*************** vect_loop_versioning (loop_vec_info loop
*** 2269,2275 ****
  
    /* Extract load statements on memrefs with zero-stride accesses.  */
  
!   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
      {
        /* In the loop body, we iterate each statement to check if it is a load.
  	 Then we check the DR_STEP of the data reference.  If DR_STEP is zero,
--- 2269,2275 ----
  
    /* Extract load statements on memrefs with zero-stride accesses.  */
  
!   if (0 && LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
      {
        /* In the loop body, we iterate each statement to check if it is a load.
  	 Then we check the DR_STEP of the data reference.  If DR_STEP is zero,
Index: gcc/tree-vect-loop.c
===================================================================
*** gcc/tree-vect-loop.c	(revision 205435)
--- gcc/tree-vect-loop.c	(working copy)
*************** vect_transform_loop (loop_vec_info loop_
*** 5995,6000 ****
--- 5995,6020 ----
  		}
  	    }
  
+ 	  /* If the stmt is loop invariant simply move it.  */
+ 	  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_external_def)
+ 	    {
+ 	      if (dump_enabled_p ())
+ 		{
+ 		  dump_printf_loc (MSG_NOTE, vect_location,
+ 				   "hoisting out of the vectorized loop: ");
+ 		  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
+ 		  dump_printf (MSG_NOTE, "\n");
+ 		}
+ 	      gsi_remove (&si, false);
+ 	      if (gimple_vuse (stmt))
+ 		gimple_set_vuse (stmt, NULL);
+ 	      basic_block new_bb;
+ 	      new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (loop),
+ 						     stmt);
+ 	      gcc_assert (!new_bb);
+ 	      continue;
+ 	    }
+ 
  	  /* -------- vectorize statement ------------ */
  	  if (dump_enabled_p ())
  	    dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
Index: gcc/tree-vect-stmts.c
===================================================================
*** gcc/tree-vect-stmts.c	(revision 205435)
--- gcc/tree-vect-stmts.c	(working copy)
*************** vectorizable_operation (gimple stmt, gim
*** 3497,3502 ****
--- 3497,3503 ----
    vec<tree> vec_oprnds2 = vNULL;
    tree vop0, vop1, vop2;
    bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
+   bool all_ops_external = true;
    int vf;
  
    if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
*************** vectorizable_operation (gimple stmt, gim
*** 3557,3562 ****
--- 3558,3565 ----
                           "use not simple.\n");
        return false;
      }
+   if (dt[0] != vect_external_def && dt[0] != vect_constant_def)
+     all_ops_external = false;
    /* If op0 is an external or constant def use a vector type with
       the same size as the output vector type.  */
    if (!vectype)
*************** vectorizable_operation (gimple stmt, gim
*** 3593,3598 ****
--- 3596,3603 ----
                               "use not simple.\n");
  	  return false;
  	}
+       if (dt[1] != vect_external_def && dt[1] != vect_constant_def)
+ 	all_ops_external = false;
      }
    if (op_type == ternary_op)
      {
*************** vectorizable_operation (gimple stmt, gim
*** 3605,3610 ****
--- 3610,3623 ----
                               "use not simple.\n");
  	  return false;
  	}
+       if (dt[2] != vect_external_def && dt[2] != vect_constant_def)
+ 	all_ops_external = false;
+     }
+ 
+   if (all_ops_external && loop_vinfo)
+     {
+       STMT_VINFO_DEF_TYPE (stmt_info) = vect_external_def;
+       return true;
      }
  
    if (loop_vinfo)
*************** vect_analyze_stmt (gimple stmt, bool *ne
*** 5771,5779 ****
                       || relevance == vect_unused_in_scope));
           break;
  
        case vect_induction_def:
        case vect_constant_def:
-       case vect_external_def:
        case vect_unknown_def_type:
        default:
          gcc_unreachable ();
--- 5784,5795 ----
                       || relevance == vect_unused_in_scope));
           break;
  
+       case vect_external_def:
+ 	/* If we decided a stmt is invariant don't bother to vectorize it.  */
+ 	return true;
+ 
        case vect_induction_def:
        case vect_constant_def:
        case vect_unknown_def_type:
        default:
          gcc_unreachable ();


> 
> thanks,
> Cong
> 
> 
> 
> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index 2c0554b..0614bab 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,3 +1,18 @@
> +2013-11-22  Cong Hou  <congh@google.com>
> +
> + PR tree-optimization/58921
> + PR tree-optimization/59006
> + * tree-vectorizer.h (struct _stmt_vec_info): New data member
> + loop_invariant.
> + * tree-vect-loop-manip.c (vect_loop_versioning): Delay hoisting loop
> + invariants until all statements are vectorized.
> + * tree-vect-loop.c (vect_hoist_loop_invariants): New functions.
> + (vect_transform_loop): Hoist loop invariants after all statements
> + are vectorized.  Do not vectorize loop invariants stmts.
> + * tree-vect-stmts.c (vect_get_vec_def_for_operand): Treat a loop
> + invariant as an external definition.
> + (new_stmt_vec_info): Initialize new data member.
> +
>  2013-11-12  Jeff Law  <law@redhat.com>
> 
>   * tree-ssa-threadedge.c (thread_around_empty_blocks): New
> diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
> index 09c7f20..447625b 100644
> --- a/gcc/testsuite/ChangeLog
> +++ b/gcc/testsuite/ChangeLog
> @@ -1,3 +1,10 @@
> +2013-11-22  Cong Hou  <congh@google.com>
> +
> + PR tree-optimization/58921
> + PR tree-optimization/59006
> + * gcc.dg/vect/pr58921.c: New test.
> + * gcc.dg/vect/pr59006.c: New test.
> +
>  2013-11-12  Balaji V. Iyer  <balaji.v.iyer@intel.com>
> 
>   * gcc.dg/cilk-plus/cilk-plus.exp: Added a check for LTO before running
> diff --git a/gcc/testsuite/gcc.dg/vect/pr58921.c
> b/gcc/testsuite/gcc.dg/vect/pr58921.c
> new file mode 100644
> index 0000000..ee3694a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/pr58921.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_int } */
> +
> +int a[7];
> +int b;
> +
> +void
> +fn1 ()
> +{
> +  for (; b; b++)
> +    a[b] = ((a[b] <= 0) == (a[0] != 0));
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
> +/* { dg-final { cleanup-tree-dump "vect" } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/pr59006.c
> b/gcc/testsuite/gcc.dg/vect/pr59006.c
> new file mode 100644
> index 0000000..95d90a9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/pr59006.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_int } */
> +
> +int a[8], b;
> +
> +void fn1 (void)
> +{
> +  int c;
> +  for (; b; b++)
> +    {
> +      int d = a[b];
> +      c = a[0] ? d : 0;
> +      a[b] = c;
> +    }
> +}
> +
> +void fn2 ()
> +{
> +  for (; b <= 0; b++)
> +    a[b] = a[0] || b;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
> +/* { dg-final { cleanup-tree-dump "vect" } } */
> diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
> index 15227856..3adc73d 100644
> --- a/gcc/tree-vect-loop-manip.c
> +++ b/gcc/tree-vect-loop-manip.c
> @@ -2448,8 +2448,12 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
>    FOR_EACH_SSA_TREE_OPERAND (var, stmt, iter, SSA_OP_USE)
>      {
>        gimple def = SSA_NAME_DEF_STMT (var);
> +      stmt_vec_info def_stmt_info;
> +
>        if (!gimple_nop_p (def)
> -  && flow_bb_inside_loop_p (loop, gimple_bb (def)))
> +  && flow_bb_inside_loop_p (loop, gimple_bb (def))
> +  && !((def_stmt_info = vinfo_for_stmt (def))
> + && STMT_VINFO_LOOP_INVARIANT_P (def_stmt_info)))
>   {
>    hoist = false;
>    break;
> @@ -2458,21 +2462,8 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
> 
>    if (hoist)
>      {
> -      if (dr)
> - gimple_set_vuse (stmt, NULL);
> -
> -      gsi_remove (&si, false);
> -      gsi_insert_on_edge_immediate (loop_preheader_edge (loop),
> -    stmt);
> -
> -      if (dump_enabled_p ())
> - {
> -  dump_printf_loc
> -      (MSG_NOTE, vect_location,
> -       "hoisting out of the vectorized loop: ");
> -  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
> -  dump_printf (MSG_NOTE, "\n");
> - }
> +      STMT_VINFO_LOOP_INVARIANT_P (stmt_info) = true;
> +      gsi_next (&si);
>        continue;
>      }
>   }
> @@ -2481,6 +2472,7 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
>   }
>      }
> 
> +
>    /* End loop-exit-fixes after versioning.  */
> 
>    if (cond_expr_stmt_list)
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index 292e771..148f9f1 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -5572,6 +5572,49 @@ vect_loop_kill_debug_uses (struct loop *loop,
> gimple stmt)
>      }
>  }
> 
> +/* Find all loop invariants detected after alias checks, and hoist them
> +   before the loop preheader.  */
> +
> +static void
> +vect_hoist_loop_invariants (loop_vec_info loop_vinfo)
> +{
> +  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> +  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
> +  gimple_seq loop_invariants = NULL;
> +
> +  for (int i = 0; i < (int)loop->num_nodes; i++)
> +    {
> +      basic_block bb = bbs[i];
> +      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);)
> + {
> +  gimple stmt = gsi_stmt (si);
> +  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
> +  if (stmt_vinfo && STMT_VINFO_LOOP_INVARIANT_P (stmt_vinfo))
> +    {
> +      if (gimple_has_mem_ops (stmt))
> + gimple_set_vuse (stmt, NULL);
> +
> +      gsi_remove (&si, false);
> +      gimple_seq_add_stmt (&loop_invariants, stmt);
> +
> +      if (dump_enabled_p ())
> + {
> +  dump_printf_loc
> +    (MSG_NOTE, vect_location,
> +     "hoisting out of the vectorized loop: ");
> +  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
> +  dump_printf (MSG_NOTE, "\n");
> + }
> +    }
> +  else
> +    gsi_next (&si);
> + }
> +    }
> +  basic_block pre_header = loop_preheader_edge (loop)->src;
> +  gcc_assert (EDGE_COUNT (pre_header->preds) == 1);
> +  gsi_insert_seq_on_edge_immediate (EDGE_PRED (pre_header, 0),
> loop_invariants);
> +}
> +
>  /* Function vect_transform_loop.
> 
>     The analysis phase has determined that the loop is vectorizable.
> @@ -5835,6 +5878,15 @@ vect_transform_loop (loop_vec_info loop_vinfo)
>   transform_pattern_stmt = false;
>              }
> 
> +          /* If stmt is a loop invariant (detected after alias checks),
> +             do not generate the vectorized stmt for it as it will be
> +             hoisted later.  */
> +  if (STMT_VINFO_LOOP_INVARIANT_P (stmt_info))
> +    {
> +      gsi_next (&si);
> +      continue;
> +    }
> +
>    gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
>    nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
>                                                 STMT_VINFO_VECTYPE (stmt_info));
> @@ -5910,6 +5962,9 @@ vect_transform_loop (loop_vec_info loop_vinfo)
>   }        /* stmts in BB */
>      } /* BBs in loop */
> 
> +  /* Hoist all loop invariants.  */
> +  vect_hoist_loop_invariants (loop_vinfo);
> +
>    slpeel_make_loop_iterate_ntimes (loop, ratio);
> 
>    /* Reduce loop iterations by the vectorization factor.  */
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index b0e0fa9..3e15372 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -1362,6 +1362,18 @@ vect_get_vec_def_for_operand (tree op, gimple
> stmt, tree *scalar_def)
>          }
>      }
> 
> +  /* After alias checks, some loop invariants may be detected, and we won't
> +     generate vectorized stmts for them.  We only hoist them after all stmts
> +     are vectorized.  Here if we meet a loop invariant, we need to assume it
> +     is already hoisted before the loop.  We do this by setting the def-type
> +     to vect_external_def.  */
> +  if (def_stmt && dt == vect_internal_def)
> +    {
> +      stmt_vec_info stmt_vinfo = vinfo_for_stmt (def_stmt);
> +      if (stmt_vinfo && STMT_VINFO_LOOP_INVARIANT_P (stmt_vinfo))
> + dt = vect_external_def;
> +    }
> +
>    switch (dt)
>      {
>      /* Case 1: operand is a constant.  */
> @@ -6083,6 +6095,7 @@ new_stmt_vec_info (gimple stmt, loop_vec_info loop_vinfo,
>    STMT_VINFO_BB_VINFO (res) = bb_vinfo;
>    STMT_VINFO_RELEVANT (res) = vect_unused_in_scope;
>    STMT_VINFO_LIVE_P (res) = false;
> +  STMT_VINFO_LOOP_INVARIANT_P (res) = false;
>    STMT_VINFO_VECTYPE (res) = NULL;
>    STMT_VINFO_VEC_STMT (res) = NULL;
>    STMT_VINFO_VECTORIZABLE (res) = true;
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index bbd50e1..2c230f9 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -516,6 +516,10 @@ typedef struct _stmt_vec_info {
>       used outside the loop.  */
>    bool live;
> 
> +  /* Indicates whether this stmt is a loop invariant, which can be hoisted.
> +     A stmt may become loop invariant after alias checks.  */
> +  bool loop_invariant;
> +
>    /* Stmt is part of some pattern (computation idiom)  */
>    bool in_pattern_p;
> 
> @@ -623,6 +627,7 @@ typedef struct _stmt_vec_info {
>  #define STMT_VINFO_BB_VINFO(S)             (S)->bb_vinfo
>  #define STMT_VINFO_RELEVANT(S)             (S)->relevant
>  #define STMT_VINFO_LIVE_P(S)               (S)->live
> +#define STMT_VINFO_LOOP_INVARIANT_P(S)     (S)->loop_invariant
>  #define STMT_VINFO_VECTYPE(S)              (S)->vectype
>  #define STMT_VINFO_VEC_STMT(S)             (S)->vectorized_stmt
>  #define STMT_VINFO_VECTORIZABLE(S)         (S)->vectorizable
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE / SUSE Labs
SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
GF: Jeff Hawn, Jennifer Guild, Felix Imend"orffer

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] Fixing PR59006 and PR58921 by delaying loop invariant hoisting in vectorizer.
  2013-11-27 11:37 ` Richard Biener
@ 2013-11-27 12:45   ` Jakub Jelinek
  2013-11-27 14:26     ` Richard Biener
  2014-01-13 13:38     ` Richard Biener
  2013-11-28  4:39   ` Cong Hou
  1 sibling, 2 replies; 15+ messages in thread
From: Jakub Jelinek @ 2013-11-27 12:45 UTC (permalink / raw)
  To: Richard Biener; +Cc: Cong Hou, GCC Patches

On Wed, Nov 27, 2013 at 10:53:56AM +0100, Richard Biener wrote:
> Hmm.  I'm still thinking that we should handle this during the regular
> transform step.

I wonder if it can't be done instead just in vectorizable_load,
if LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo) and the load is
invariant, just emit the (broadcasted) load not inside of the loop, but on
the loop preheader edge.

> *** gcc/tree-vect-data-refs.c	(revision 205435)
> --- gcc/tree-vect-data-refs.c	(working copy)
> *************** again:
> *** 3668,3673 ****
> --- 3668,3682 ----
>   	    }
>   	  STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
>   	}
> +       else if (loop_vinfo
> + 	       && integer_zerop (DR_STEP (dr)))
> + 	{
> + 	  /* All loads from a non-varying address will be disambiguated
> + 	     by data-ref analysis or via a runtime alias check and thus
> + 	     they will become invariant.  Force them to be vectorized
> + 	     as external.  */
> + 	  STMT_VINFO_DEF_TYPE (stmt_info) = vect_external_def;
> + 	}

I think this is unsafe for simd loops.
I'd say:
int a[1024], b[1024];

int foo (void)
{
  int i;
  #pragma omp simd safelen(8)
  for (i = 0; i < 1024; i++)
    {
      a[i] = i;
      b[i] = a[0];
    }
}

is valid testcase, the loop behaves the same if you execute it
sequentially, or vectorize using SIMD (hardware or emulated) instructions
with vectorization factor of 2, 4 or 8, as long as you do all memory
operations (either using scalar insns or simd instructions) in the order
they were written, which I believe the vectorizer right now handles
correctly, but the hoisting this patch wants to perform is not fine,
unless data ref analysis would prove that it can't alias.  For non-simd
loops we of course perform that data ref analysis and either version for
alias, or prove that the drs can't alias, but for simd loops we take as
given that the loop is safe to be vectorized.  It is, but not for hoisting.

So, the above say with emulated SIMD is safe to be executed as:
  for (i = 0; i < 1024; i += 8)
    {
      int tmp;
      for (tmp = i; tmp < i + 8; tmp++)
	a[tmp] = tmp;
      for (tmp = i; tmp < i + 8; tmp++)
	b[tmp] = a[0];
    }
but not as:
  int tempa[8], tmp;
  /* Hoisted HW or emulated load + splat.  */
  for (tmp = 0; tmp < 8; tmp++)
    tempa[tmp] = a[0];
  for (i = 0; i < 1024; i += 8)
    {
      for (tmp = i; tmp < i + 8; tmp++)
	a[tmp] = tmp;
      for (tmp = i; tmp < i + 8; tmp++)
	b[tmp] = tempa[tmp];
    }

	Jakub

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] Fixing PR59006 and PR58921 by delaying loop invariant hoisting in vectorizer.
  2013-11-27 12:45   ` Jakub Jelinek
@ 2013-11-27 14:26     ` Richard Biener
  2013-11-27 15:35       ` Jakub Jelinek
  2014-01-13 13:38     ` Richard Biener
  1 sibling, 1 reply; 15+ messages in thread
From: Richard Biener @ 2013-11-27 14:26 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Cong Hou, GCC Patches

On Wed, 27 Nov 2013, Jakub Jelinek wrote:

> On Wed, Nov 27, 2013 at 10:53:56AM +0100, Richard Biener wrote:
> > Hmm.  I'm still thinking that we should handle this during the regular
> > transform step.
> 
> I wonder if it can't be done instead just in vectorizable_load,
> if LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo) and the load is
> invariant, just emit the (broadcasted) load not inside of the loop, but on
> the loop preheader edge.

It is safe even for !LOOP_REQUIRES_VERSIONING_FOR_ALIAS.  It's just
a missed optimization I even noted when originally implementing
support for invariant loads ...

> > *** gcc/tree-vect-data-refs.c	(revision 205435)
> > --- gcc/tree-vect-data-refs.c	(working copy)
> > *************** again:
> > *** 3668,3673 ****
> > --- 3668,3682 ----
> >   	    }
> >   	  STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
> >   	}
> > +       else if (loop_vinfo
> > + 	       && integer_zerop (DR_STEP (dr)))
> > + 	{
> > + 	  /* All loads from a non-varying address will be disambiguated
> > + 	     by data-ref analysis or via a runtime alias check and thus
> > + 	     they will become invariant.  Force them to be vectorized
> > + 	     as external.  */
> > + 	  STMT_VINFO_DEF_TYPE (stmt_info) = vect_external_def;
> > + 	}
> 
> I think this is unsafe for simd loops.
> I'd say:
> int a[1024], b[1024];
> 
> int foo (void)
> {
>   int i;
>   #pragma omp simd safelen(8)
>   for (i = 0; i < 1024; i++)
>     {
>       a[i] = i;
>       b[i] = a[0];
>     }
> }
> 
> is valid testcase, the loop behaves the same if you execute it
> sequentially, or vectorize using SIMD (hardware or emulated) instructions
> with vectorization factor of 2, 4 or 8, as long as you do all memory
> operations (either using scalar insns or simd instructions) in the order
> they were written, which I believe the vectorizer right now handles
> correctly, but the hoisting this patch wants to perform is not fine,
> unless data ref analysis would prove that it can't alias.  For non-simd
> loops we of course perform that data ref analysis and either version for
> alias, or prove that the drs can't alias, but for simd loops we take as
> given that the loop is safe to be vectorized.  It is, but not for hoisting.

Ick.  I hate this behind-the-back stuff - so safelen doesn't mean
that a[i] and a[0] do not alias.  Note that this will break with
SLP stuff at least as that will re-order reads/writes.  Not sure
how safelen applies to SLP though.  That is

    a[i] = i;
    b[i] = a[0];
    a[i+1] = i+1;
    b[i+1] = a[1];

will eventually end up re-ordering reads/writes in non-obvious
ways.

> So, the above say with emulated SIMD is safe to be executed as:
>   for (i = 0; i < 1024; i += 8)
>     {
>       int tmp;
>       for (tmp = i; tmp < i + 8; tmp++)
> 	a[tmp] = tmp;
>       for (tmp = i; tmp < i + 8; tmp++)
> 	b[tmp] = a[0];
>     }
> but not as:
>   int tempa[8], tmp;
>   /* Hoisted HW or emulated load + splat.  */
>   for (tmp = 0; tmp < 8; tmp++)
>     tempa[tmp] = a[0];
>   for (i = 0; i < 1024; i += 8)
>     {
>       for (tmp = i; tmp < i + 8; tmp++)
> 	a[tmp] = tmp;
>       for (tmp = i; tmp < i + 8; tmp++)
> 	b[tmp] = tempa[tmp];
>     }
> 
> 	Jakub
> 
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE / SUSE Labs
SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
GF: Jeff Hawn, Jennifer Guild, Felix Imend"orffer

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] Fixing PR59006 and PR58921 by delaying loop invariant hoisting in vectorizer.
  2013-11-27 14:26     ` Richard Biener
@ 2013-11-27 15:35       ` Jakub Jelinek
  2013-11-27 16:08         ` Richard Biener
  0 siblings, 1 reply; 15+ messages in thread
From: Jakub Jelinek @ 2013-11-27 15:35 UTC (permalink / raw)
  To: Richard Biener; +Cc: Cong Hou, GCC Patches

On Wed, Nov 27, 2013 at 12:54:14PM +0100, Richard Biener wrote:
> On Wed, 27 Nov 2013, Jakub Jelinek wrote:
> 
> > On Wed, Nov 27, 2013 at 10:53:56AM +0100, Richard Biener wrote:
> > > Hmm.  I'm still thinking that we should handle this during the regular
> > > transform step.
> > 
> > I wonder if it can't be done instead just in vectorizable_load,
> > if LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo) and the load is
> > invariant, just emit the (broadcasted) load not inside of the loop, but on
> > the loop preheader edge.
> 
> It is safe even for !LOOP_REQUIRES_VERSIONING_FOR_ALIAS.  It's just
> a missed optimization I even noted when originally implementing
> support for invariant loads ...

True, but only for non-simd loops, or if we proved it by looking at all
relevant LOOP_VINFO_DDRSs.  But, if it is not a simd loop, and
not !LOOP_REQUIRES_VERSIONING_FOR_ALIAS, wouldn't previous optimizations
hoist the load before the loop already?

> Ick.  I hate this behind-the-back stuff - so safelen doesn't mean
> that a[i] and a[0] do not alias.

My initial understanding of the SIMD loops was also that it allows the
the up to safelen consecutive iterations to be randomly reordered or
intermixed without affecting valid programs, but further mails from Tobias
and others on this topic plus testcases changed my understanding of it.

Note that we don't purge LOOP_VINFO_DDRSs in any way for loop->safelen,
just don't add versioning for aliasor punt if there is some possible (or
proven) aliasing.  Perhaps we could add a bool flag to loop_vinfo which
would tell us whether the loop has no data dependencies at all (i.e.
either for non-safelen is !LOOP_REQUIRES_VERSIONING_FOR_ALIAS, or
with safelen non-zero would be !LOOP_REQUIRES_VERSIONING_FOR_ALIAS).
Then we could hoist if that flag is set or
LOOP_REQUIRES_VERSIONING_FOR_ALIAS (because then the runtime test
checks the dependency).

> Note that this will break with
> SLP stuff at least as that will re-order reads/writes.  Not sure
> how safelen applies to SLP though.  That is
> 
>     a[i] = i;
>     b[i] = a[0];
>     a[i+1] = i+1;
>     b[i+1] = a[1];
> 
> will eventually end up re-ordering reads/writes in non-obvious
> ways.

You mean SLP inside of loop vectorization, right?  Because for normal SLP
outside of loop vectorizer simdlen is ignored and normal data ref is
performed without any bypassing.

	Jakub

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] Fixing PR59006 and PR58921 by delaying loop invariant hoisting in vectorizer.
  2013-11-27 15:35       ` Jakub Jelinek
@ 2013-11-27 16:08         ` Richard Biener
  0 siblings, 0 replies; 15+ messages in thread
From: Richard Biener @ 2013-11-27 16:08 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Cong Hou, GCC Patches

On Wed, 27 Nov 2013, Jakub Jelinek wrote:

> On Wed, Nov 27, 2013 at 12:54:14PM +0100, Richard Biener wrote:
> > On Wed, 27 Nov 2013, Jakub Jelinek wrote:
> > 
> > > On Wed, Nov 27, 2013 at 10:53:56AM +0100, Richard Biener wrote:
> > > > Hmm.  I'm still thinking that we should handle this during the regular
> > > > transform step.
> > > 
> > > I wonder if it can't be done instead just in vectorizable_load,
> > > if LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo) and the load is
> > > invariant, just emit the (broadcasted) load not inside of the loop, but on
> > > the loop preheader edge.
> > 
> > It is safe even for !LOOP_REQUIRES_VERSIONING_FOR_ALIAS.  It's just
> > a missed optimization I even noted when originally implementing
> > support for invariant loads ...
> 
> True, but only for non-simd loops, or if we proved it by looking at all
> relevant LOOP_VINFO_DDRSs.  But, if it is not a simd loop, and
> not !LOOP_REQUIRES_VERSIONING_FOR_ALIAS, wouldn't previous optimizations
> hoist the load before the loop already?

Well - there is the case of

int g;
int *p;

  for (;;)
    {
      *p = g;
    }

where we know in the _vectorized_ loop body that they cannot alias
(because we have at least two vector elements).  But it seems
we lost that optimization at some point as

int g;
void foo (int *p, int n)
{
  int i;
  for (i = 0; i < n; ++i)
    *p = g;
}

uses alias versioning.  Bah ;)  Maybe I'm just dreaming that
I implemented that as well.

> > Ick.  I hate this behind-the-back stuff - so safelen doesn't mean
> > that a[i] and a[0] do not alias.
> 
> My initial understanding of the SIMD loops was also that it allows the
> the up to safelen consecutive iterations to be randomly reordered or
> intermixed without affecting valid programs, but further mails from Tobias
> and others on this topic plus testcases changed my understanding of it.
> 
> Note that we don't purge LOOP_VINFO_DDRSs in any way for loop->safelen,
> just don't add versioning for aliasor punt if there is some possible (or
> proven) aliasing.  Perhaps we could add a bool flag to loop_vinfo which
> would tell us whether the loop has no data dependencies at all (i.e.
> either for non-safelen is !LOOP_REQUIRES_VERSIONING_FOR_ALIAS, or
> with safelen non-zero would be !LOOP_REQUIRES_VERSIONING_FOR_ALIAS).
> Then we could hoist if that flag is set or
> LOOP_REQUIRES_VERSIONING_FOR_ALIAS (because then the runtime test
> checks the dependency).
> 
> > Note that this will break with
> > SLP stuff at least as that will re-order reads/writes.  Not sure
> > how safelen applies to SLP though.  That is
> > 
> >     a[i] = i;
> >     b[i] = a[0];
> >     a[i+1] = i+1;
> >     b[i+1] = a[1];
> > 
> > will eventually end up re-ordering reads/writes in non-obvious
> > ways.
> 
> You mean SLP inside of loop vectorization, right?  Because for normal SLP
> outside of loop vectorizer simdlen is ignored and normal data ref is
> performed without any bypassing.

Yes, SLP inside of a loop.

Richard.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] Fixing PR59006 and PR58921 by delaying loop invariant hoisting in vectorizer.
  2013-11-27 11:37 ` Richard Biener
  2013-11-27 12:45   ` Jakub Jelinek
@ 2013-11-28  4:39   ` Cong Hou
  1 sibling, 0 replies; 15+ messages in thread
From: Cong Hou @ 2013-11-28  4:39 UTC (permalink / raw)
  To: Richard Biener; +Cc: GCC Patches

On Wed, Nov 27, 2013 at 1:53 AM, Richard Biener <rguenther@suse.de> wrote:
> On Fri, 22 Nov 2013, Cong Hou wrote:
>
>> Hi
>>
>> Currently in GCC vectorization, some loop invariant may be detected
>> after aliasing checks, which can be hoisted outside of the loop. The
>> current method in GCC may break the information built during the
>> analysis phase, causing some crash (see PR59006 and PR58921).
>>
>> This patch improves the loop invariant hoisting by delaying it until
>> all statements are vectorized, thereby keeping all built information.
>> But those loop invariant statements won't be vectorized, and if a
>> variable is defined by one of those loop invariant, it is treated as
>> an external definition.
>>
>> Bootstrapped and testes on an x86-64 machine.
>
> Hmm.  I'm still thinking that we should handle this during the regular
> transform step.
>
> Like with the following incomplete patch.  Missing is adjusting
> the rest of the vectorizable_* functions to handle the case where all defs
> are dt_external or constant by setting their own STMT_VINFO_DEF_TYPE to
> dt_external.  From the gcc.dg/vect/pr58508.c we get only 4 hoists
> instead of 8 because of this (I think).
>
> Also gcc.dg/vect/pr52298.c ICEs for yet unanalyzed reason.
>
> I can take over the bug if you like.
>
> Thanks,
> Richard.
>
> Index: gcc/tree-vect-data-refs.c
> ===================================================================
> *** gcc/tree-vect-data-refs.c   (revision 205435)
> --- gcc/tree-vect-data-refs.c   (working copy)
> *************** again:
> *** 3668,3673 ****
> --- 3668,3682 ----
>             }
>           STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
>         }
> +       else if (loop_vinfo
> +              && integer_zerop (DR_STEP (dr)))
> +       {
> +         /* All loads from a non-varying address will be disambiguated
> +            by data-ref analysis or via a runtime alias check and thus
> +            they will become invariant.  Force them to be vectorized
> +            as external.  */
> +         STMT_VINFO_DEF_TYPE (stmt_info) = vect_external_def;
> +       }
>       }
>
>     /* If we stopped analysis at the first dataref we could not analyze


I agree that setting the statement that loads a data-ref with zero
step as vect_external_def early at this point is a good idea. This
avoids two loop analyses seeing inconsistent def-info if we do this
later. Note with this change the following loop in PR59006 will not be
vectorized:


int a[8], b;

void fn1(void) {
  int c;
  for (; b; b++) {
    int d = a[b];
    c = a[0] ? d : 0;
    a[b] = c;
  }
}

This is because the load to a[0] is now treated as an external def, in
which case vectype cannot be found for the condition of the
conditional expression, while vectorizable_condition requires that
comp_vectype should be set properly. We can treat it as a missed
optimization.



> Index: gcc/tree-vect-loop-manip.c
> ===================================================================
> *** gcc/tree-vect-loop-manip.c  (revision 205435)
> --- gcc/tree-vect-loop-manip.c  (working copy)
> *************** vect_loop_versioning (loop_vec_info loop
> *** 2269,2275 ****
>
>     /* Extract load statements on memrefs with zero-stride accesses.  */
>
> !   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
>       {
>         /* In the loop body, we iterate each statement to check if it is a load.
>          Then we check the DR_STEP of the data reference.  If DR_STEP is zero,
> --- 2269,2275 ----
>
>     /* Extract load statements on memrefs with zero-stride accesses.  */
>
> !   if (0 && LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
>       {
>         /* In the loop body, we iterate each statement to check if it is a load.
>          Then we check the DR_STEP of the data reference.  If DR_STEP is zero,
> Index: gcc/tree-vect-loop.c
> ===================================================================
> *** gcc/tree-vect-loop.c        (revision 205435)
> --- gcc/tree-vect-loop.c        (working copy)
> *************** vect_transform_loop (loop_vec_info loop_
> *** 5995,6000 ****
> --- 5995,6020 ----
>                 }
>             }
>
> +         /* If the stmt is loop invariant simply move it.  */
> +         if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_external_def)
> +           {
> +             if (dump_enabled_p ())
> +               {
> +                 dump_printf_loc (MSG_NOTE, vect_location,
> +                                  "hoisting out of the vectorized loop: ");
> +                 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
> +                 dump_printf (MSG_NOTE, "\n");
> +               }
> +             gsi_remove (&si, false);
> +             if (gimple_vuse (stmt))
> +               gimple_set_vuse (stmt, NULL);
> +             basic_block new_bb;
> +             new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (loop),
> +                                                    stmt);
> +             gcc_assert (!new_bb);
> +             continue;
> +           }
> +
>           /* -------- vectorize statement ------------ */
>           if (dump_enabled_p ())
>             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
> Index: gcc/tree-vect-stmts.c
> ===================================================================
> *** gcc/tree-vect-stmts.c       (revision 205435)
> --- gcc/tree-vect-stmts.c       (working copy)
> *************** vectorizable_operation (gimple stmt, gim
> *** 3497,3502 ****
> --- 3497,3503 ----
>     vec<tree> vec_oprnds2 = vNULL;
>     tree vop0, vop1, vop2;
>     bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
> +   bool all_ops_external = true;
>     int vf;
>
>     if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
> *************** vectorizable_operation (gimple stmt, gim
> *** 3557,3562 ****
> --- 3558,3565 ----
>                            "use not simple.\n");
>         return false;
>       }
> +   if (dt[0] != vect_external_def && dt[0] != vect_constant_def)
> +     all_ops_external = false;
>     /* If op0 is an external or constant def use a vector type with
>        the same size as the output vector type.  */
>     if (!vectype)
> *************** vectorizable_operation (gimple stmt, gim
> *** 3593,3598 ****
> --- 3596,3603 ----
>                                "use not simple.\n");
>           return false;
>         }
> +       if (dt[1] != vect_external_def && dt[1] != vect_constant_def)
> +       all_ops_external = false;
>       }
>     if (op_type == ternary_op)
>       {
> *************** vectorizable_operation (gimple stmt, gim
> *** 3605,3610 ****
> --- 3610,3623 ----
>                                "use not simple.\n");
>           return false;
>         }
> +       if (dt[2] != vect_external_def && dt[2] != vect_constant_def)
> +       all_ops_external = false;
> +     }
> +
> +   if (all_ops_external && loop_vinfo)
> +     {
> +       STMT_VINFO_DEF_TYPE (stmt_info) = vect_external_def;
> +       return true;
>       }
>
>     if (loop_vinfo)
> *************** vect_analyze_stmt (gimple stmt, bool *ne
> *** 5771,5779 ****
>                        || relevance == vect_unused_in_scope));
>            break;
>
>         case vect_induction_def:
>         case vect_constant_def:
> -       case vect_external_def:
>         case vect_unknown_def_type:
>         default:
>           gcc_unreachable ();
> --- 5784,5795 ----
>                        || relevance == vect_unused_in_scope));
>            break;
>
> +       case vect_external_def:
> +       /* If we decided a stmt is invariant don't bother to vectorize it.  */
> +       return true;
> +
>         case vect_induction_def:
>         case vect_constant_def:
>         case vect_unknown_def_type:
>         default:
>           gcc_unreachable ();
>

In this manner all other loop invariants are detected in the
transformation phase. I am not sure if it is tedious to do this in
every vectorizable_* function. We could also do this just after
detecting those invariant loads during data-ref analysis. What do you
think?

Please also feel free to take this bug. I have a coming trip and could
not work on it in five days.

Thank you very much!


Cong


>
>>
>> thanks,
>> Cong
>>
>>
>>
>> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
>> index 2c0554b..0614bab 100644
>> --- a/gcc/ChangeLog
>> +++ b/gcc/ChangeLog
>> @@ -1,3 +1,18 @@
>> +2013-11-22  Cong Hou  <congh@google.com>
>> +
>> + PR tree-optimization/58921
>> + PR tree-optimization/59006
>> + * tree-vectorizer.h (struct _stmt_vec_info): New data member
>> + loop_invariant.
>> + * tree-vect-loop-manip.c (vect_loop_versioning): Delay hoisting loop
>> + invariants until all statements are vectorized.
>> + * tree-vect-loop.c (vect_hoist_loop_invariants): New functions.
>> + (vect_transform_loop): Hoist loop invariants after all statements
>> + are vectorized.  Do not vectorize loop invariants stmts.
>> + * tree-vect-stmts.c (vect_get_vec_def_for_operand): Treat a loop
>> + invariant as an external definition.
>> + (new_stmt_vec_info): Initialize new data member.
>> +
>>  2013-11-12  Jeff Law  <law@redhat.com>
>>
>>   * tree-ssa-threadedge.c (thread_around_empty_blocks): New
>> diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
>> index 09c7f20..447625b 100644
>> --- a/gcc/testsuite/ChangeLog
>> +++ b/gcc/testsuite/ChangeLog
>> @@ -1,3 +1,10 @@
>> +2013-11-22  Cong Hou  <congh@google.com>
>> +
>> + PR tree-optimization/58921
>> + PR tree-optimization/59006
>> + * gcc.dg/vect/pr58921.c: New test.
>> + * gcc.dg/vect/pr59006.c: New test.
>> +
>>  2013-11-12  Balaji V. Iyer  <balaji.v.iyer@intel.com>
>>
>>   * gcc.dg/cilk-plus/cilk-plus.exp: Added a check for LTO before running
>> diff --git a/gcc/testsuite/gcc.dg/vect/pr58921.c
>> b/gcc/testsuite/gcc.dg/vect/pr58921.c
>> new file mode 100644
>> index 0000000..ee3694a
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/vect/pr58921.c
>> @@ -0,0 +1,15 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target vect_int } */
>> +
>> +int a[7];
>> +int b;
>> +
>> +void
>> +fn1 ()
>> +{
>> +  for (; b; b++)
>> +    a[b] = ((a[b] <= 0) == (a[0] != 0));
>> +}
>> +
>> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
>> +/* { dg-final { cleanup-tree-dump "vect" } } */
>> diff --git a/gcc/testsuite/gcc.dg/vect/pr59006.c
>> b/gcc/testsuite/gcc.dg/vect/pr59006.c
>> new file mode 100644
>> index 0000000..95d90a9
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/vect/pr59006.c
>> @@ -0,0 +1,24 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target vect_int } */
>> +
>> +int a[8], b;
>> +
>> +void fn1 (void)
>> +{
>> +  int c;
>> +  for (; b; b++)
>> +    {
>> +      int d = a[b];
>> +      c = a[0] ? d : 0;
>> +      a[b] = c;
>> +    }
>> +}
>> +
>> +void fn2 ()
>> +{
>> +  for (; b <= 0; b++)
>> +    a[b] = a[0] || b;
>> +}
>> +
>> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
>> +/* { dg-final { cleanup-tree-dump "vect" } } */
>> diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
>> index 15227856..3adc73d 100644
>> --- a/gcc/tree-vect-loop-manip.c
>> +++ b/gcc/tree-vect-loop-manip.c
>> @@ -2448,8 +2448,12 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
>>    FOR_EACH_SSA_TREE_OPERAND (var, stmt, iter, SSA_OP_USE)
>>      {
>>        gimple def = SSA_NAME_DEF_STMT (var);
>> +      stmt_vec_info def_stmt_info;
>> +
>>        if (!gimple_nop_p (def)
>> -  && flow_bb_inside_loop_p (loop, gimple_bb (def)))
>> +  && flow_bb_inside_loop_p (loop, gimple_bb (def))
>> +  && !((def_stmt_info = vinfo_for_stmt (def))
>> + && STMT_VINFO_LOOP_INVARIANT_P (def_stmt_info)))
>>   {
>>    hoist = false;
>>    break;
>> @@ -2458,21 +2462,8 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
>>
>>    if (hoist)
>>      {
>> -      if (dr)
>> - gimple_set_vuse (stmt, NULL);
>> -
>> -      gsi_remove (&si, false);
>> -      gsi_insert_on_edge_immediate (loop_preheader_edge (loop),
>> -    stmt);
>> -
>> -      if (dump_enabled_p ())
>> - {
>> -  dump_printf_loc
>> -      (MSG_NOTE, vect_location,
>> -       "hoisting out of the vectorized loop: ");
>> -  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
>> -  dump_printf (MSG_NOTE, "\n");
>> - }
>> +      STMT_VINFO_LOOP_INVARIANT_P (stmt_info) = true;
>> +      gsi_next (&si);
>>        continue;
>>      }
>>   }
>> @@ -2481,6 +2472,7 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
>>   }
>>      }
>>
>> +
>>    /* End loop-exit-fixes after versioning.  */
>>
>>    if (cond_expr_stmt_list)
>> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
>> index 292e771..148f9f1 100644
>> --- a/gcc/tree-vect-loop.c
>> +++ b/gcc/tree-vect-loop.c
>> @@ -5572,6 +5572,49 @@ vect_loop_kill_debug_uses (struct loop *loop,
>> gimple stmt)
>>      }
>>  }
>>
>> +/* Find all loop invariants detected after alias checks, and hoist them
>> +   before the loop preheader.  */
>> +
>> +static void
>> +vect_hoist_loop_invariants (loop_vec_info loop_vinfo)
>> +{
>> +  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
>> +  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
>> +  gimple_seq loop_invariants = NULL;
>> +
>> +  for (int i = 0; i < (int)loop->num_nodes; i++)
>> +    {
>> +      basic_block bb = bbs[i];
>> +      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);)
>> + {
>> +  gimple stmt = gsi_stmt (si);
>> +  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
>> +  if (stmt_vinfo && STMT_VINFO_LOOP_INVARIANT_P (stmt_vinfo))
>> +    {
>> +      if (gimple_has_mem_ops (stmt))
>> + gimple_set_vuse (stmt, NULL);
>> +
>> +      gsi_remove (&si, false);
>> +      gimple_seq_add_stmt (&loop_invariants, stmt);
>> +
>> +      if (dump_enabled_p ())
>> + {
>> +  dump_printf_loc
>> +    (MSG_NOTE, vect_location,
>> +     "hoisting out of the vectorized loop: ");
>> +  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
>> +  dump_printf (MSG_NOTE, "\n");
>> + }
>> +    }
>> +  else
>> +    gsi_next (&si);
>> + }
>> +    }
>> +  basic_block pre_header = loop_preheader_edge (loop)->src;
>> +  gcc_assert (EDGE_COUNT (pre_header->preds) == 1);
>> +  gsi_insert_seq_on_edge_immediate (EDGE_PRED (pre_header, 0),
>> loop_invariants);
>> +}
>> +
>>  /* Function vect_transform_loop.
>>
>>     The analysis phase has determined that the loop is vectorizable.
>> @@ -5835,6 +5878,15 @@ vect_transform_loop (loop_vec_info loop_vinfo)
>>   transform_pattern_stmt = false;
>>              }
>>
>> +          /* If stmt is a loop invariant (detected after alias checks),
>> +             do not generate the vectorized stmt for it as it will be
>> +             hoisted later.  */
>> +  if (STMT_VINFO_LOOP_INVARIANT_P (stmt_info))
>> +    {
>> +      gsi_next (&si);
>> +      continue;
>> +    }
>> +
>>    gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
>>    nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
>>                                                 STMT_VINFO_VECTYPE (stmt_info));
>> @@ -5910,6 +5962,9 @@ vect_transform_loop (loop_vec_info loop_vinfo)
>>   }        /* stmts in BB */
>>      } /* BBs in loop */
>>
>> +  /* Hoist all loop invariants.  */
>> +  vect_hoist_loop_invariants (loop_vinfo);
>> +
>>    slpeel_make_loop_iterate_ntimes (loop, ratio);
>>
>>    /* Reduce loop iterations by the vectorization factor.  */
>> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
>> index b0e0fa9..3e15372 100644
>> --- a/gcc/tree-vect-stmts.c
>> +++ b/gcc/tree-vect-stmts.c
>> @@ -1362,6 +1362,18 @@ vect_get_vec_def_for_operand (tree op, gimple
>> stmt, tree *scalar_def)
>>          }
>>      }
>>
>> +  /* After alias checks, some loop invariants may be detected, and we won't
>> +     generate vectorized stmts for them.  We only hoist them after all stmts
>> +     are vectorized.  Here if we meet a loop invariant, we need to assume it
>> +     is already hoisted before the loop.  We do this by setting the def-type
>> +     to vect_external_def.  */
>> +  if (def_stmt && dt == vect_internal_def)
>> +    {
>> +      stmt_vec_info stmt_vinfo = vinfo_for_stmt (def_stmt);
>> +      if (stmt_vinfo && STMT_VINFO_LOOP_INVARIANT_P (stmt_vinfo))
>> + dt = vect_external_def;
>> +    }
>> +
>>    switch (dt)
>>      {
>>      /* Case 1: operand is a constant.  */
>> @@ -6083,6 +6095,7 @@ new_stmt_vec_info (gimple stmt, loop_vec_info loop_vinfo,
>>    STMT_VINFO_BB_VINFO (res) = bb_vinfo;
>>    STMT_VINFO_RELEVANT (res) = vect_unused_in_scope;
>>    STMT_VINFO_LIVE_P (res) = false;
>> +  STMT_VINFO_LOOP_INVARIANT_P (res) = false;
>>    STMT_VINFO_VECTYPE (res) = NULL;
>>    STMT_VINFO_VEC_STMT (res) = NULL;
>>    STMT_VINFO_VECTORIZABLE (res) = true;
>> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
>> index bbd50e1..2c230f9 100644
>> --- a/gcc/tree-vectorizer.h
>> +++ b/gcc/tree-vectorizer.h
>> @@ -516,6 +516,10 @@ typedef struct _stmt_vec_info {
>>       used outside the loop.  */
>>    bool live;
>>
>> +  /* Indicates whether this stmt is a loop invariant, which can be hoisted.
>> +     A stmt may become loop invariant after alias checks.  */
>> +  bool loop_invariant;
>> +
>>    /* Stmt is part of some pattern (computation idiom)  */
>>    bool in_pattern_p;
>>
>> @@ -623,6 +627,7 @@ typedef struct _stmt_vec_info {
>>  #define STMT_VINFO_BB_VINFO(S)             (S)->bb_vinfo
>>  #define STMT_VINFO_RELEVANT(S)             (S)->relevant
>>  #define STMT_VINFO_LIVE_P(S)               (S)->live
>> +#define STMT_VINFO_LOOP_INVARIANT_P(S)     (S)->loop_invariant
>>  #define STMT_VINFO_VECTYPE(S)              (S)->vectype
>>  #define STMT_VINFO_VEC_STMT(S)             (S)->vectorized_stmt
>>  #define STMT_VINFO_VECTORIZABLE(S)         (S)->vectorizable
>>
>
> --
> Richard Biener <rguenther@suse.de>
> SUSE / SUSE Labs
> SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
> GF: Jeff Hawn, Jennifer Guild, Felix Imend"orffer

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] Fixing PR59006 and PR58921 by delaying loop invariant hoisting in vectorizer.
  2013-11-27 12:45   ` Jakub Jelinek
  2013-11-27 14:26     ` Richard Biener
@ 2014-01-13 13:38     ` Richard Biener
  2014-01-13 13:45       ` Jakub Jelinek
                         ` (2 more replies)
  1 sibling, 3 replies; 15+ messages in thread
From: Richard Biener @ 2014-01-13 13:38 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Cong Hou, GCC Patches

On Wed, 27 Nov 2013, Jakub Jelinek wrote:

> On Wed, Nov 27, 2013 at 10:53:56AM +0100, Richard Biener wrote:
> > Hmm.  I'm still thinking that we should handle this during the regular
> > transform step.
> 
> I wonder if it can't be done instead just in vectorizable_load,
> if LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo) and the load is
> invariant, just emit the (broadcasted) load not inside of the loop, but on
> the loop preheader edge.

So this implements this suggestion, XFAILing the no longer handled cases.
For example we get

  _94 = *b_8(D);
  vect_cst_.18_95 = {_94, _94, _94, _94};
  _99 = prolog_loop_adjusted_niters.9_132 * 4;
  vectp_a.22_98 = a_6(D) + _99;
  ivtmp.43_77 = (unsigned long) vectp_a.22_98;

  <bb 13>:
  # ivtmp.41_67 = PHI <ivtmp.41_70(3), 0(12)>
  # ivtmp.43_71 = PHI <ivtmp.43_69(3), ivtmp.43_77(12)>
  vect__10.19_97 = vect_cst_.18_95 + { 1, 1, 1, 1 };
  _76 = (void *) ivtmp.43_71;
  MEM[base: _76, offset: 0B] = vect__10.19_97;

...

instead of having hoisted *b_8 + 1 as scalar computation.  Not sure
why LIM doesn't hoist the vector variant later.

vect__10.19_97 = vect_cst_.18_95 + vect_cst_.20_96;
  invariant up to level 1, cost 1.

ah, the cost thing.  Should be "improved" to see that hoisting
reduces the number of live SSA names in the loop.

Eventually lower_vector_ssa could optimize vector to scalar
code again ... (ick).

Bootstrap / regtest running on x86_64.

Comments?

Thanks,
Richard.

2014-01-13  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/58921
	PR tree-optimization/59006
	* tree-vect-loop-manip.c (vect_loop_versioning): Remove code
	hoisting invariant stmts.
	* tree-vect-stmts.c (vectorizable_load): Insert the splat of
	invariant loads on the preheader edge if possible.

	* gcc.dg/torture/pr58921.c: New testcase.
	* gcc.dg/torture/pr59006.c: Likewise.
	* gcc.dg/vect/pr58508.c: XFAIL no longer handled cases.

Index: gcc/tree-vect-loop-manip.c
===================================================================
*** gcc/tree-vect-loop-manip.c	(revision 206576)
--- gcc/tree-vect-loop-manip.c	(working copy)
*************** vect_loop_versioning (loop_vec_info loop
*** 2435,2507 ****
  	}
      }
  
- 
-   /* Extract load statements on memrefs with zero-stride accesses.  */
- 
-   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
-     {
-       /* In the loop body, we iterate each statement to check if it is a load.
- 	 Then we check the DR_STEP of the data reference.  If DR_STEP is zero,
- 	 then we will hoist the load statement to the loop preheader.  */
- 
-       basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
-       int nbbs = loop->num_nodes;
- 
-       for (int i = 0; i < nbbs; ++i)
- 	{
- 	  for (gimple_stmt_iterator si = gsi_start_bb (bbs[i]);
- 	       !gsi_end_p (si);)
- 	    {
- 	      gimple stmt = gsi_stmt (si);
- 	      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
- 	      struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
- 
- 	      if (is_gimple_assign (stmt)
- 		  && (!dr
- 		      || (DR_IS_READ (dr) && integer_zerop (DR_STEP (dr)))))
- 		{
- 		  bool hoist = true;
- 		  ssa_op_iter iter;
- 		  tree var;
- 
- 		  /* We hoist a statement if all SSA uses in it are defined
- 		     outside of the loop.  */
- 		  FOR_EACH_SSA_TREE_OPERAND (var, stmt, iter, SSA_OP_USE)
- 		    {
- 		      gimple def = SSA_NAME_DEF_STMT (var);
- 		      if (!gimple_nop_p (def)
- 			  && flow_bb_inside_loop_p (loop, gimple_bb (def)))
- 			{
- 			  hoist = false;
- 			  break;
- 			}
- 		    }
- 
- 		  if (hoist)
- 		    {
- 		      if (dr)
- 			gimple_set_vuse (stmt, NULL);
- 
- 		      gsi_remove (&si, false);
- 		      gsi_insert_on_edge_immediate (loop_preheader_edge (loop),
- 						    stmt);
- 
- 		      if (dump_enabled_p ())
- 			{
- 			  dump_printf_loc
- 			      (MSG_NOTE, vect_location,
- 			       "hoisting out of the vectorized loop: ");
- 			  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
- 			  dump_printf (MSG_NOTE, "\n");
- 			}
- 		      continue;
- 		    }
- 		}
- 	      gsi_next (&si);
- 	    }
- 	}
-     }
- 
    /* End loop-exit-fixes after versioning.  */
  
    if (cond_expr_stmt_list)
--- 2435,2440 ----
Index: gcc/tree-vect-stmts.c
===================================================================
*** gcc/tree-vect-stmts.c	(revision 206576)
--- gcc/tree-vect-stmts.c	(working copy)
*************** vectorizable_load (gimple stmt, gimple_s
*** 6368,6378 ****
  	      /* 4. Handle invariant-load.  */
  	      if (inv_p && !bb_vinfo)
  		{
- 		  gimple_stmt_iterator gsi2 = *gsi;
  		  gcc_assert (!grouped_load);
! 		  gsi_next (&gsi2);
! 		  new_temp = vect_init_vector (stmt, scalar_dest,
! 					       vectype, &gsi2);
  		  new_stmt = SSA_NAME_DEF_STMT (new_temp);
  		}
  
--- 6368,6402 ----
  	      /* 4. Handle invariant-load.  */
  	      if (inv_p && !bb_vinfo)
  		{
  		  gcc_assert (!grouped_load);
! 		  /* If we have versioned for aliasing then we are sure
! 		     this is a loop invariant load and thus we can insert
! 		     it on the preheader edge.  */
! 		  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
! 		    {
! 		      if (dump_enabled_p ())
! 			{
! 			  dump_printf_loc (MSG_NOTE, vect_location,
! 					   "hoisting out of the vectorized "
! 					   "loop: ");
! 			  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
! 			  dump_printf (MSG_NOTE, "\n");
! 			}
! 		      tree tem = copy_ssa_name (scalar_dest, NULL);
! 		      gsi_insert_on_edge_immediate
! 			(loop_preheader_edge (loop),
! 			 gimple_build_assign (tem,
! 					      unshare_expr
! 					        (gimple_assign_rhs1 (stmt))));
! 		      new_temp = vect_init_vector (stmt, tem, vectype, NULL);
! 		    }
! 		  else
! 		    {
! 		      gimple_stmt_iterator gsi2 = *gsi;
! 		      gsi_next (&gsi2);
! 		      new_temp = vect_init_vector (stmt, scalar_dest,
! 						   vectype, &gsi2);
! 		    }
  		  new_stmt = SSA_NAME_DEF_STMT (new_temp);
  		}
  
Index: gcc/testsuite/gcc.dg/torture/pr58921.c
===================================================================
*** gcc/testsuite/gcc.dg/torture/pr58921.c	(revision 0)
--- gcc/testsuite/gcc.dg/torture/pr58921.c	(working copy)
***************
*** 0 ****
--- 1,17 ----
+ /* { dg-do compile } */
+ 
+ int a[7];
+ int b;
+ 
+ void
+ fn1 ()
+ {
+   for (; b; b++)
+     a[b] = ((a[b] <= 0) == (a[0] != 0));
+ }
+ 
+ int
+ main ()
+ {
+   return 0;
+ }
Index: gcc/testsuite/gcc.dg/torture/pr59006.c
===================================================================
*** gcc/testsuite/gcc.dg/torture/pr59006.c	(revision 0)
--- gcc/testsuite/gcc.dg/torture/pr59006.c	(working copy)
***************
*** 0 ****
--- 1,13 ----
+ /* { dg-do compile } */
+ 
+ int a[8], b;
+ void fn1(void)
+ {
+   int c;
+   for (; b; b++)
+     {
+       int d = a[b];
+       c = a[0] ? d : 0;
+       a[b] = c;
+     }
+ }
Index: gcc/testsuite/gcc.dg/vect/pr58508.c
===================================================================
*** gcc/testsuite/gcc.dg/vect/pr58508.c	(revision 206576)
--- gcc/testsuite/gcc.dg/vect/pr58508.c	(working copy)
*************** void test5 (int* a, int* b)
*** 66,70 ****
      }
  }
  
! /* { dg-final { scan-tree-dump-times "hoist" 8 "vect" { xfail vect_no_align } } } */
  /* { dg-final { cleanup-tree-dump "vect" } } */
--- 66,71 ----
      }
  }
  
! /* { dg-final { scan-tree-dump-times "hoist" 8 "vect" { xfail *-*-* } } } */
! /* { dg-final { scan-tree-dump-times "hoist" 3 "vect" { xfail vect_no_align } } } */
  /* { dg-final { cleanup-tree-dump "vect" } } */

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] Fixing PR59006 and PR58921 by delaying loop invariant hoisting in vectorizer.
  2014-01-13 13:38     ` Richard Biener
@ 2014-01-13 13:45       ` Jakub Jelinek
  2014-01-14  3:42       ` Cong Hou
  2014-01-16 14:19       ` H.J. Lu
  2 siblings, 0 replies; 15+ messages in thread
From: Jakub Jelinek @ 2014-01-13 13:45 UTC (permalink / raw)
  To: Richard Biener; +Cc: Cong Hou, GCC Patches

On Mon, Jan 13, 2014 at 02:37:38PM +0100, Richard Biener wrote:
> 2014-01-13  Richard Biener  <rguenther@suse.de>
> 
> 	PR tree-optimization/58921
> 	PR tree-optimization/59006
> 	* tree-vect-loop-manip.c (vect_loop_versioning): Remove code
> 	hoisting invariant stmts.
> 	* tree-vect-stmts.c (vectorizable_load): Insert the splat of
> 	invariant loads on the preheader edge if possible.
> 
> 	* gcc.dg/torture/pr58921.c: New testcase.
> 	* gcc.dg/torture/pr59006.c: Likewise.
> 	* gcc.dg/vect/pr58508.c: XFAIL no longer handled cases.

Looks good to me.  If you want, I can add another bool to loop_vinfo, which
would say if in the vectorized loop could be aliasing preventing the
hoisting (i.e. set to false always, unless the loop->simdlen > 0, when it
would be set if we would without loop->simdlen > 0 use versioning for alias
or punting, but loop->simdlen > 0 resulted in vectorization of the loop
anyway).  Then, as a follow-up we could use that predicate instead of
LOOP_REQUIRES_VERSIONING_FOR_ALIAS in vectorizable_load.

	Jakub

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] Fixing PR59006 and PR58921 by delaying loop invariant hoisting in vectorizer.
  2014-01-13 13:38     ` Richard Biener
  2014-01-13 13:45       ` Jakub Jelinek
@ 2014-01-14  3:42       ` Cong Hou
  2014-01-14  9:01         ` Richard Biener
  2014-01-16 14:19       ` H.J. Lu
  2 siblings, 1 reply; 15+ messages in thread
From: Cong Hou @ 2014-01-14  3:42 UTC (permalink / raw)
  To: Richard Biener; +Cc: Jakub Jelinek, GCC Patches

I noticed that LIM could not hoist vector invariant, and that is why
my first implementation tries to hoist them all.

In addition, there are two disadvantages of hoisting invariant load +
lim method:

First, for some instructions the scalar version is faster than the
vector version, and in this case hoisting scalar instructions before
vectorization is better. Those instructions include data
packing/unpacking, integer multiplication with SSE2, etc..

Second, it may use more SIMD registers.

The following code shows a simple example:

char *a, *b, *c;
for (int i = 0; i < N; ++i)
  a[i] = b[0] * c[0] + a[i];

Vectorizing b[0]*c[0] is worse than loading the result of b[0]*c[0]
into a vector.


thanks,
Cong


On Mon, Jan 13, 2014 at 5:37 AM, Richard Biener <rguenther@suse.de> wrote:
> On Wed, 27 Nov 2013, Jakub Jelinek wrote:
>
>> On Wed, Nov 27, 2013 at 10:53:56AM +0100, Richard Biener wrote:
>> > Hmm.  I'm still thinking that we should handle this during the regular
>> > transform step.
>>
>> I wonder if it can't be done instead just in vectorizable_load,
>> if LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo) and the load is
>> invariant, just emit the (broadcasted) load not inside of the loop, but on
>> the loop preheader edge.
>
> So this implements this suggestion, XFAILing the no longer handled cases.
> For example we get
>
>   _94 = *b_8(D);
>   vect_cst_.18_95 = {_94, _94, _94, _94};
>   _99 = prolog_loop_adjusted_niters.9_132 * 4;
>   vectp_a.22_98 = a_6(D) + _99;
>   ivtmp.43_77 = (unsigned long) vectp_a.22_98;
>
>   <bb 13>:
>   # ivtmp.41_67 = PHI <ivtmp.41_70(3), 0(12)>
>   # ivtmp.43_71 = PHI <ivtmp.43_69(3), ivtmp.43_77(12)>
>   vect__10.19_97 = vect_cst_.18_95 + { 1, 1, 1, 1 };
>   _76 = (void *) ivtmp.43_71;
>   MEM[base: _76, offset: 0B] = vect__10.19_97;
>
> ...
>
> instead of having hoisted *b_8 + 1 as scalar computation.  Not sure
> why LIM doesn't hoist the vector variant later.
>
> vect__10.19_97 = vect_cst_.18_95 + vect_cst_.20_96;
>   invariant up to level 1, cost 1.
>
> ah, the cost thing.  Should be "improved" to see that hoisting
> reduces the number of live SSA names in the loop.
>
> Eventually lower_vector_ssa could optimize vector to scalar
> code again ... (ick).
>
> Bootstrap / regtest running on x86_64.
>
> Comments?
>
> Thanks,
> Richard.
>
> 2014-01-13  Richard Biener  <rguenther@suse.de>
>
>         PR tree-optimization/58921
>         PR tree-optimization/59006
>         * tree-vect-loop-manip.c (vect_loop_versioning): Remove code
>         hoisting invariant stmts.
>         * tree-vect-stmts.c (vectorizable_load): Insert the splat of
>         invariant loads on the preheader edge if possible.
>
>         * gcc.dg/torture/pr58921.c: New testcase.
>         * gcc.dg/torture/pr59006.c: Likewise.
>         * gcc.dg/vect/pr58508.c: XFAIL no longer handled cases.
>
> Index: gcc/tree-vect-loop-manip.c
> ===================================================================
> *** gcc/tree-vect-loop-manip.c  (revision 206576)
> --- gcc/tree-vect-loop-manip.c  (working copy)
> *************** vect_loop_versioning (loop_vec_info loop
> *** 2435,2507 ****
>         }
>       }
>
> -
> -   /* Extract load statements on memrefs with zero-stride accesses.  */
> -
> -   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
> -     {
> -       /* In the loop body, we iterate each statement to check if it is a load.
> -        Then we check the DR_STEP of the data reference.  If DR_STEP is zero,
> -        then we will hoist the load statement to the loop preheader.  */
> -
> -       basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
> -       int nbbs = loop->num_nodes;
> -
> -       for (int i = 0; i < nbbs; ++i)
> -       {
> -         for (gimple_stmt_iterator si = gsi_start_bb (bbs[i]);
> -              !gsi_end_p (si);)
> -           {
> -             gimple stmt = gsi_stmt (si);
> -             stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
> -             struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
> -
> -             if (is_gimple_assign (stmt)
> -                 && (!dr
> -                     || (DR_IS_READ (dr) && integer_zerop (DR_STEP (dr)))))
> -               {
> -                 bool hoist = true;
> -                 ssa_op_iter iter;
> -                 tree var;
> -
> -                 /* We hoist a statement if all SSA uses in it are defined
> -                    outside of the loop.  */
> -                 FOR_EACH_SSA_TREE_OPERAND (var, stmt, iter, SSA_OP_USE)
> -                   {
> -                     gimple def = SSA_NAME_DEF_STMT (var);
> -                     if (!gimple_nop_p (def)
> -                         && flow_bb_inside_loop_p (loop, gimple_bb (def)))
> -                       {
> -                         hoist = false;
> -                         break;
> -                       }
> -                   }
> -
> -                 if (hoist)
> -                   {
> -                     if (dr)
> -                       gimple_set_vuse (stmt, NULL);
> -
> -                     gsi_remove (&si, false);
> -                     gsi_insert_on_edge_immediate (loop_preheader_edge (loop),
> -                                                   stmt);
> -
> -                     if (dump_enabled_p ())
> -                       {
> -                         dump_printf_loc
> -                             (MSG_NOTE, vect_location,
> -                              "hoisting out of the vectorized loop: ");
> -                         dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
> -                         dump_printf (MSG_NOTE, "\n");
> -                       }
> -                     continue;
> -                   }
> -               }
> -             gsi_next (&si);
> -           }
> -       }
> -     }
> -
>     /* End loop-exit-fixes after versioning.  */
>
>     if (cond_expr_stmt_list)
> --- 2435,2440 ----
> Index: gcc/tree-vect-stmts.c
> ===================================================================
> *** gcc/tree-vect-stmts.c       (revision 206576)
> --- gcc/tree-vect-stmts.c       (working copy)
> *************** vectorizable_load (gimple stmt, gimple_s
> *** 6368,6378 ****
>               /* 4. Handle invariant-load.  */
>               if (inv_p && !bb_vinfo)
>                 {
> -                 gimple_stmt_iterator gsi2 = *gsi;
>                   gcc_assert (!grouped_load);
> !                 gsi_next (&gsi2);
> !                 new_temp = vect_init_vector (stmt, scalar_dest,
> !                                              vectype, &gsi2);
>                   new_stmt = SSA_NAME_DEF_STMT (new_temp);
>                 }
>
> --- 6368,6402 ----
>               /* 4. Handle invariant-load.  */
>               if (inv_p && !bb_vinfo)
>                 {
>                   gcc_assert (!grouped_load);
> !                 /* If we have versioned for aliasing then we are sure
> !                    this is a loop invariant load and thus we can insert
> !                    it on the preheader edge.  */
> !                 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
> !                   {
> !                     if (dump_enabled_p ())
> !                       {
> !                         dump_printf_loc (MSG_NOTE, vect_location,
> !                                          "hoisting out of the vectorized "
> !                                          "loop: ");
> !                         dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
> !                         dump_printf (MSG_NOTE, "\n");
> !                       }
> !                     tree tem = copy_ssa_name (scalar_dest, NULL);
> !                     gsi_insert_on_edge_immediate
> !                       (loop_preheader_edge (loop),
> !                        gimple_build_assign (tem,
> !                                             unshare_expr
> !                                               (gimple_assign_rhs1 (stmt))));
> !                     new_temp = vect_init_vector (stmt, tem, vectype, NULL);
> !                   }
> !                 else
> !                   {
> !                     gimple_stmt_iterator gsi2 = *gsi;
> !                     gsi_next (&gsi2);
> !                     new_temp = vect_init_vector (stmt, scalar_dest,
> !                                                  vectype, &gsi2);
> !                   }
>                   new_stmt = SSA_NAME_DEF_STMT (new_temp);
>                 }
>
> Index: gcc/testsuite/gcc.dg/torture/pr58921.c
> ===================================================================
> *** gcc/testsuite/gcc.dg/torture/pr58921.c      (revision 0)
> --- gcc/testsuite/gcc.dg/torture/pr58921.c      (working copy)
> ***************
> *** 0 ****
> --- 1,17 ----
> + /* { dg-do compile } */
> +
> + int a[7];
> + int b;
> +
> + void
> + fn1 ()
> + {
> +   for (; b; b++)
> +     a[b] = ((a[b] <= 0) == (a[0] != 0));
> + }
> +
> + int
> + main ()
> + {
> +   return 0;
> + }
> Index: gcc/testsuite/gcc.dg/torture/pr59006.c
> ===================================================================
> *** gcc/testsuite/gcc.dg/torture/pr59006.c      (revision 0)
> --- gcc/testsuite/gcc.dg/torture/pr59006.c      (working copy)
> ***************
> *** 0 ****
> --- 1,13 ----
> + /* { dg-do compile } */
> +
> + int a[8], b;
> + void fn1(void)
> + {
> +   int c;
> +   for (; b; b++)
> +     {
> +       int d = a[b];
> +       c = a[0] ? d : 0;
> +       a[b] = c;
> +     }
> + }
> Index: gcc/testsuite/gcc.dg/vect/pr58508.c
> ===================================================================
> *** gcc/testsuite/gcc.dg/vect/pr58508.c (revision 206576)
> --- gcc/testsuite/gcc.dg/vect/pr58508.c (working copy)
> *************** void test5 (int* a, int* b)
> *** 66,70 ****
>       }
>   }
>
> ! /* { dg-final { scan-tree-dump-times "hoist" 8 "vect" { xfail vect_no_align } } } */
>   /* { dg-final { cleanup-tree-dump "vect" } } */
> --- 66,71 ----
>       }
>   }
>
> ! /* { dg-final { scan-tree-dump-times "hoist" 8 "vect" { xfail *-*-* } } } */
> ! /* { dg-final { scan-tree-dump-times "hoist" 3 "vect" { xfail vect_no_align } } } */
>   /* { dg-final { cleanup-tree-dump "vect" } } */

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] Fixing PR59006 and PR58921 by delaying loop invariant hoisting in vectorizer.
  2014-01-14  3:42       ` Cong Hou
@ 2014-01-14  9:01         ` Richard Biener
  2014-01-14 10:51           ` Richard Biener
  2014-01-14 13:17           ` Jakub Jelinek
  0 siblings, 2 replies; 15+ messages in thread
From: Richard Biener @ 2014-01-14  9:01 UTC (permalink / raw)
  To: Cong Hou; +Cc: Jakub Jelinek, GCC Patches

On Mon, 13 Jan 2014, Cong Hou wrote:

> I noticed that LIM could not hoist vector invariant, and that is why
> my first implementation tries to hoist them all.

Yes, I filed PR59786 for this.  I'll see if I can come up with
a fix suitable for stage3.

> In addition, there are two disadvantages of hoisting invariant load +
> lim method:
> 
> First, for some instructions the scalar version is faster than the
> vector version, and in this case hoisting scalar instructions before
> vectorization is better. Those instructions include data
> packing/unpacking, integer multiplication with SSE2, etc..
> 
> Second, it may use more SIMD registers.
> 
> The following code shows a simple example:
> 
> char *a, *b, *c;
> for (int i = 0; i < N; ++i)
>   a[i] = b[0] * c[0] + a[i];
> 
> Vectorizing b[0]*c[0] is worse than loading the result of b[0]*c[0]
> into a vector.

Yes.  I've tried with adjusting the vec_def_type as in the prototype
patch I sent before christmas but that's quite intrusive for this
stage.  You could argue that performing invariant motion is not
really the vectorizers main task and that a combination of hoisting
only the load, later LIM hoisting the rest and then tree-vect-generic.c
demoting vector ops to scalar ops (unimplemented, but also a useful
general optimization) would work as well.

That said, we should definitely have a second look for 4.10.  For now
hoisting the load is an improvement over 4.8 (at least I hope so ;))
which needs to be good enough for 4.9.

I had to fix a latent bug to cure some testsuite fallout so the following
is what I ended up committing.

Jakub, adding the new flag is ok with me.

Thanks,
Richard.

2014-01-14  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/58921
	PR tree-optimization/59006
	* tree-vect-loop-manip.c (vect_loop_versioning): Remove code
	hoisting invariant stmts.
	* tree-vect-stmts.c (vectorizable_load): Insert the splat of
	invariant loads on the preheader edge if possible.

	* gcc.dg/torture/pr58921.c: New testcase.
	* gcc.dg/torture/pr59006.c: Likewise.
	* gcc.dg/vect/pr58508.c: XFAIL no longer handled cases.

Index: gcc/tree-vect-loop-manip.c
===================================================================
*** gcc/tree-vect-loop-manip.c	(revision 206576)
--- gcc/tree-vect-loop-manip.c	(working copy)
*************** vect_loop_versioning (loop_vec_info loop
*** 2435,2507 ****
  	}
      }
  
- 
-   /* Extract load statements on memrefs with zero-stride accesses.  */
- 
-   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
-     {
-       /* In the loop body, we iterate each statement to check if it is a load.
- 	 Then we check the DR_STEP of the data reference.  If DR_STEP is zero,
- 	 then we will hoist the load statement to the loop preheader.  */
- 
-       basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
-       int nbbs = loop->num_nodes;
- 
-       for (int i = 0; i < nbbs; ++i)
- 	{
- 	  for (gimple_stmt_iterator si = gsi_start_bb (bbs[i]);
- 	       !gsi_end_p (si);)
- 	    {
- 	      gimple stmt = gsi_stmt (si);
- 	      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
- 	      struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
- 
- 	      if (is_gimple_assign (stmt)
- 		  && (!dr
- 		      || (DR_IS_READ (dr) && integer_zerop (DR_STEP (dr)))))
- 		{
- 		  bool hoist = true;
- 		  ssa_op_iter iter;
- 		  tree var;
- 
- 		  /* We hoist a statement if all SSA uses in it are defined
- 		     outside of the loop.  */
- 		  FOR_EACH_SSA_TREE_OPERAND (var, stmt, iter, SSA_OP_USE)
- 		    {
- 		      gimple def = SSA_NAME_DEF_STMT (var);
- 		      if (!gimple_nop_p (def)
- 			  && flow_bb_inside_loop_p (loop, gimple_bb (def)))
- 			{
- 			  hoist = false;
- 			  break;
- 			}
- 		    }
- 
- 		  if (hoist)
- 		    {
- 		      if (dr)
- 			gimple_set_vuse (stmt, NULL);
- 
- 		      gsi_remove (&si, false);
- 		      gsi_insert_on_edge_immediate (loop_preheader_edge (loop),
- 						    stmt);
- 
- 		      if (dump_enabled_p ())
- 			{
- 			  dump_printf_loc
- 			      (MSG_NOTE, vect_location,
- 			       "hoisting out of the vectorized loop: ");
- 			  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
- 			  dump_printf (MSG_NOTE, "\n");
- 			}
- 		      continue;
- 		    }
- 		}
- 	      gsi_next (&si);
- 	    }
- 	}
-     }
- 
    /* End loop-exit-fixes after versioning.  */
  
    if (cond_expr_stmt_list)
--- 2435,2440 ----
Index: gcc/tree-vect-stmts.c
===================================================================
*** gcc/tree-vect-stmts.c	(revision 206576)
--- gcc/tree-vect-stmts.c	(working copy)
*************** vectorizable_load (gimple stmt, gimple_s
*** 6368,6379 ****
  	      /* 4. Handle invariant-load.  */
  	      if (inv_p && !bb_vinfo)
  		{
- 		  gimple_stmt_iterator gsi2 = *gsi;
  		  gcc_assert (!grouped_load);
! 		  gsi_next (&gsi2);
! 		  new_temp = vect_init_vector (stmt, scalar_dest,
! 					       vectype, &gsi2);
  		  new_stmt = SSA_NAME_DEF_STMT (new_temp);
  		}
  
  	      if (negative)
--- 6368,6406 ----
  	      /* 4. Handle invariant-load.  */
  	      if (inv_p && !bb_vinfo)
  		{
  		  gcc_assert (!grouped_load);
! 		  /* If we have versioned for aliasing then we are sure
! 		     this is a loop invariant load and thus we can insert
! 		     it on the preheader edge.  */
! 		  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
! 		    {
! 		      if (dump_enabled_p ())
! 			{
! 			  dump_printf_loc (MSG_NOTE, vect_location,
! 					   "hoisting out of the vectorized "
! 					   "loop: ");
! 			  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
! 			  dump_printf (MSG_NOTE, "\n");
! 			}
! 		      tree tem = copy_ssa_name (scalar_dest, NULL);
! 		      gsi_insert_on_edge_immediate
! 			(loop_preheader_edge (loop),
! 			 gimple_build_assign (tem,
! 					      unshare_expr
! 					        (gimple_assign_rhs1 (stmt))));
! 		      new_temp = vect_init_vector (stmt, tem, vectype, NULL);
! 		    }
! 		  else
! 		    {
! 		      gimple_stmt_iterator gsi2 = *gsi;
! 		      gsi_next (&gsi2);
! 		      new_temp = vect_init_vector (stmt, scalar_dest,
! 						   vectype, &gsi2);
! 		    }
  		  new_stmt = SSA_NAME_DEF_STMT (new_temp);
+ 		  set_vinfo_for_stmt (new_stmt,
+ 				      new_stmt_vec_info (new_stmt, loop_vinfo,
+ 							 bb_vinfo));
  		}
  
  	      if (negative)
Index: gcc/testsuite/gcc.dg/torture/pr58921.c
===================================================================
*** gcc/testsuite/gcc.dg/torture/pr58921.c	(revision 0)
--- gcc/testsuite/gcc.dg/torture/pr58921.c	(working copy)
***************
*** 0 ****
--- 1,17 ----
+ /* { dg-do compile } */
+ 
+ int a[7];
+ int b;
+ 
+ void
+ fn1 ()
+ {
+   for (; b; b++)
+     a[b] = ((a[b] <= 0) == (a[0] != 0));
+ }
+ 
+ int
+ main ()
+ {
+   return 0;
+ }
Index: gcc/testsuite/gcc.dg/torture/pr59006.c
===================================================================
*** gcc/testsuite/gcc.dg/torture/pr59006.c	(revision 0)
--- gcc/testsuite/gcc.dg/torture/pr59006.c	(working copy)
***************
*** 0 ****
--- 1,13 ----
+ /* { dg-do compile } */
+ 
+ int a[8], b;
+ void fn1(void)
+ {
+   int c;
+   for (; b; b++)
+     {
+       int d = a[b];
+       c = a[0] ? d : 0;
+       a[b] = c;
+     }
+ }
Index: gcc/testsuite/gcc.dg/vect/pr58508.c
===================================================================
*** gcc/testsuite/gcc.dg/vect/pr58508.c	(revision 206576)
--- gcc/testsuite/gcc.dg/vect/pr58508.c	(working copy)
*************** void test5 (int* a, int* b)
*** 66,70 ****
      }
  }
  
! /* { dg-final { scan-tree-dump-times "hoist" 8 "vect" { xfail vect_no_align } } } */
  /* { dg-final { cleanup-tree-dump "vect" } } */
--- 66,71 ----
      }
  }
  
! /* { dg-final { scan-tree-dump-times "hoist" 8 "vect" { xfail *-*-* } } } */
! /* { dg-final { scan-tree-dump-times "hoist" 3 "vect" { xfail vect_no_align } } } */
  /* { dg-final { cleanup-tree-dump "vect" } } */

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] Fixing PR59006 and PR58921 by delaying loop invariant hoisting in vectorizer.
  2014-01-14  9:01         ` Richard Biener
@ 2014-01-14 10:51           ` Richard Biener
  2014-01-14 13:17           ` Jakub Jelinek
  1 sibling, 0 replies; 15+ messages in thread
From: Richard Biener @ 2014-01-14 10:51 UTC (permalink / raw)
  To: Cong Hou; +Cc: Jakub Jelinek, GCC Patches

On Tue, 14 Jan 2014, Richard Biener wrote:

> On Mon, 13 Jan 2014, Cong Hou wrote:
> 
> > I noticed that LIM could not hoist vector invariant, and that is why
> > my first implementation tries to hoist them all.
> 
> Yes, I filed PR59786 for this.  I'll see if I can come up with
> a fix suitable for stage3.
> 
> > In addition, there are two disadvantages of hoisting invariant load +
> > lim method:
> > 
> > First, for some instructions the scalar version is faster than the
> > vector version, and in this case hoisting scalar instructions before
> > vectorization is better. Those instructions include data
> > packing/unpacking, integer multiplication with SSE2, etc..
> > 
> > Second, it may use more SIMD registers.
> > 
> > The following code shows a simple example:
> > 
> > char *a, *b, *c;
> > for (int i = 0; i < N; ++i)
> >   a[i] = b[0] * c[0] + a[i];
> > 
> > Vectorizing b[0]*c[0] is worse than loading the result of b[0]*c[0]
> > into a vector.
> 
> Yes.  I've tried with adjusting the vec_def_type as in the prototype
> patch I sent before christmas but that's quite intrusive for this
> stage.  You could argue that performing invariant motion is not
> really the vectorizers main task and that a combination of hoisting
> only the load, later LIM hoisting the rest and then tree-vect-generic.c
> demoting vector ops to scalar ops (unimplemented, but also a useful
> general optimization) would work as well.

For example with the untested following.  Not sure if the LIM change
is appropriate at this stage (it's handling of "cost" is weird, and
in other places of the compiler we simply aggressively hoist
invariants and expect RTL to fixup register pressure issues).

The lowering change looks more like sth for forwprop but that
runs quite late after vectorization.  tree-vect-generic could
at least factor in whether the target has a scalar op of that
kind and whether that is maybe more expensive (though trading
two vector splats for one is very likely offsetting that).  It
also would need to consider the case where this moves a vector
splat inside a loop when handling the testcases we talk about
without improved invariant motion.

Any comments?  Anything we want to fix before 4.9?  The
testcases are optimized by RTL invariant motion but they
perform a vector addition.  For example

void test1 (int* a, int* b)
{
  int i;
  for (i = 0; i < 100000; ++i)
    a[i] = *b + 1;
}

gets

.L7:
        movd    (%rsi), %xmm1
        leaq    (%rdi,%rdx,4), %rdx
        xorl    %eax, %eax
        pshufd  $0, %xmm1, %xmm0
        paddd   .LC0(%rip), %xmm0
        .p2align 4,,10
        .p2align 3
.L4:
        addl    $1, %eax
        addq    $16, %rdx
        movaps  %xmm0, -16(%rdx)
        cmpl    %eax, %ecx
        ja      .L4

instead of

.L7:
        movl    (%rsi), %eax
        leaq    (%rdi,%rdx,4), %rdx
        addl    $1, %eax
        movl    %eax, -12(%rsp)
        xorl    %eax, %eax
        movd    -12(%rsp), %xmm1
        pshufd  $0, %xmm1, %xmm0
        .p2align 4,,10
        .p2align 3
.L4:
        addl    $1, %eax
        addq    $16, %rdx
        movaps  %xmm0, -16(%rdx)
        cmpl    %eax, %ecx
        ja      .L4

which because of the by default disabled inter-unit moves looks
even more expensive.  With inter-unit moves we get

.L7:
        movl    (%rsi), %eax
        leaq    (%rdi,%rdx,4), %rdx
        addl    $1, %eax
        movd    %eax, %xmm0
        xorl    %eax, %eax
        pshufd  $0, %xmm0, %xmm0
        .p2align 4,,10
        .p2align 3
.L4:
        addl    $1, %eax
        movaps  %xmm0, (%rdx)
        addq    $16, %rdx
        cmpl    %eax, %ecx
        ja      .L4

not sure if the avoided constant pool load offsets the inter-unit
move here (depends on the kind of pipeline constraints that has,
the above is with corei7 tuning).

It looks to me that demoting vector to scalar ops might be
better performed at RTL level?  Plus the reverse op as seen
from the above example where it isn't all clear which
variant is better (which probably depends quite some bit
on the CPU architecture).

Thanks,
Richard.

Index: gcc/tree-ssa-loop-im.c
===================================================================
*** gcc/tree-ssa-loop-im.c	(revision 206599)
--- gcc/tree-ssa-loop-im.c	(working copy)
*************** stmt_cost (gimple stmt)
*** 533,538 ****
--- 533,541 ----
        return 0;
  
      default:
+       /* All vector operations are expensive.  */
+       if (VECTOR_TYPE_P (gimple_expr_type (stmt)))
+ 	return LIM_EXPENSIVE;
        return 1;
      }
  }
Index: gcc/tree-vect-generic.c
===================================================================
*** gcc/tree-vect-generic.c	(revision 206599)
--- gcc/tree-vect-generic.c	(working copy)
*************** lower_vec_perm (gimple_stmt_iterator *gs
*** 1335,1340 ****
--- 1335,1357 ----
    update_stmt (gsi_stmt (*gsi));
  }
  
+ /* If OP is a uniform vector return the element it is a splat from.  */
+ 
+ static tree
+ ssa_uniform_vector_p (tree op)
+ {
+   if (TREE_CODE (op) == VECTOR_CST
+       || TREE_CODE (op) == CONSTRUCTOR)
+     return uniform_vector_p (op);
+   if (TREE_CODE (op) == SSA_NAME)
+     {
+       gimple def_stmt = SSA_NAME_DEF_STMT (op);
+       if (gimple_assign_single_p (def_stmt))
+ 	return uniform_vector_p (gimple_assign_rhs1 (def_stmt));
+     }
+   return NULL_TREE;
+ }
+ 
  /* Process one statement.  If we identify a vector operation, expand it.  */
  
  static void
*************** expand_vector_operations_1 (gimple_stmt_
*** 1388,1393 ****
--- 1405,1430 ----
    if (TREE_CODE (type) != VECTOR_TYPE)
      return;
  
+   if (code == NEGATE_EXPR
+       || code == PLUS_EXPR
+       || code == MINUS_EXPR
+       || code == MULT_EXPR)
+     {
+       tree srhs1, srhs2 = NULL_TREE;
+       if ((srhs1 = ssa_uniform_vector_p (rhs1)) != NULL_TREE
+ 	  && (rhs2 == NULL_TREE
+ 	      || (srhs2 = ssa_uniform_vector_p (rhs2)) != NULL_TREE))
+ 	{
+ 	  tree slhs = make_ssa_name (TREE_TYPE (srhs1), NULL);
+ 	  gimple repl = gimple_build_assign_with_ops (code, slhs, srhs1, srhs2);
+ 	  gsi_insert_before (gsi, repl, GSI_SAME_STMT);
+ 	  gimple_assign_set_rhs_from_tree (gsi,
+ 					   build_vector_from_val (type, slhs));
+ 	  update_stmt (stmt);
+ 	  return;
+ 	}
+     }
+ 
    if (code == NOP_EXPR
        || code == FLOAT_EXPR
        || code == FIX_TRUNC_EXPR
*************** expand_vector_operations_1 (gimple_stmt_
*** 1433,1447 ****
        if (VECTOR_INTEGER_TYPE_P (TREE_TYPE (rhs2)))
          {
            tree first;
!           gimple def_stmt;
! 
!           if ((TREE_CODE (rhs2) == VECTOR_CST
! 	       && (first = uniform_vector_p (rhs2)) != NULL_TREE)
! 	      || (TREE_CODE (rhs2) == SSA_NAME
! 		  && (def_stmt = SSA_NAME_DEF_STMT (rhs2))
! 		  && gimple_assign_single_p (def_stmt)
! 		  && (first = uniform_vector_p
! 		      (gimple_assign_rhs1 (def_stmt))) != NULL_TREE))
              {
                gimple_assign_set_rhs2 (stmt, first);
                update_stmt (stmt);
--- 1470,1476 ----
        if (VECTOR_INTEGER_TYPE_P (TREE_TYPE (rhs2)))
          {
            tree first;
!           if ((first = ssa_uniform_vector_p (rhs2)) != NULL_TREE)
              {
                gimple_assign_set_rhs2 (stmt, first);
                update_stmt (stmt);

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] Fixing PR59006 and PR58921 by delaying loop invariant hoisting in vectorizer.
  2014-01-14  9:01         ` Richard Biener
  2014-01-14 10:51           ` Richard Biener
@ 2014-01-14 13:17           ` Jakub Jelinek
  2014-01-14 13:37             ` Richard Biener
  1 sibling, 1 reply; 15+ messages in thread
From: Jakub Jelinek @ 2014-01-14 13:17 UTC (permalink / raw)
  To: Richard Biener; +Cc: Cong Hou, GCC Patches

On Tue, Jan 14, 2014 at 10:01:06AM +0100, Richard Biener wrote:
> Jakub, adding the new flag is ok with me.

So like this?

2014-01-14  Jakub Jelinek  <jakub@redhat.com>

	* tree-vectorizer.h (struct _loop_vec_info): Add no_data_dependencies
	field.
	(LOOP_VINFO_NO_DATA_DEPENDENCIES): Define.
	* tree-vect-data-refs.c (vect_analyze_data_ref_dependence): Clear it
	when not giving up or versioning for alias only because of
	loop->safelen.
	(vect_analyze_data_ref_dependences): Set to true.
	* tree-vect-stmts.c (vectorizable_load): Use
	LOOP_VINFO_NO_DATA_DEPENDENCIES instead of
	LOOP_REQUIRES_VERSIONING_FOR_ALIAS.

--- gcc/tree-vectorizer.h.jj	2014-01-03 11:40:57.000000000 +0100
+++ gcc/tree-vectorizer.h	2014-01-14 13:10:00.477989924 +0100
@@ -347,6 +347,25 @@ typedef struct _loop_vec_info {
      fix it up.  */
   bool operands_swapped;
 
+  /* True if there are no loop carried data dependencies in the loop.
+     If loop->safelen <= 1, then this is always true, either the loop
+     didn't have any loop carried data dependencies, or the loop is being
+     vectorized guarded with some runtime alias checks, or couldn't
+     be vectorized at all, but then this field shouldn't be used.
+     For loop->safelen >= 2, the user has asserted that there are no
+     backward dependencies, but there still could be loop carried forward
+     dependencies in such loops.  This flag will be false if normal
+     vectorizer data dependency analysis would fail or require versioning
+     for alias, but because of loop->safelen >= 2 it has been vectorized
+     even without versioning for alias.  E.g. in:
+     #pragma omp simd
+     for (int i = 0; i < m; i++)
+       a[i] = a[i + k] * c;
+     (or #pragma simd or #pragma ivdep) we can vectorize this and it will
+     DTRT even for k > 0 && k < m, but without safelen we would not
+     vectorize this, so this field would be false.  */
+  bool no_data_dependencies;
+
   /* If if-conversion versioned this loop before conversion, this is the
      loop version without if-conversion.  */
   struct loop *scalar_loop;
@@ -385,6 +404,7 @@ typedef struct _loop_vec_info {
 #define LOOP_VINFO_PEELING_FOR_GAPS(L)     (L)->peeling_for_gaps
 #define LOOP_VINFO_OPERANDS_SWAPPED(L)     (L)->operands_swapped
 #define LOOP_VINFO_PEELING_FOR_NITER(L)    (L)->peeling_for_niter
+#define LOOP_VINFO_NO_DATA_DEPENDENCIES(L) (L)->no_data_dependencies
 #define LOOP_VINFO_SCALAR_LOOP(L)	   (L)->scalar_loop
 
 #define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \
--- gcc/tree-vect-data-refs.c.jj	2014-01-10 00:38:26.000000000 +0100
+++ gcc/tree-vect-data-refs.c	2014-01-14 13:12:06.056342116 +0100
@@ -244,6 +244,7 @@ vect_analyze_data_ref_dependence (struct
 	{
 	  if (loop->safelen < *max_vf)
 	    *max_vf = loop->safelen;
+	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 	  return false;
 	}
 
@@ -291,6 +292,7 @@ vect_analyze_data_ref_dependence (struct
 	{
 	  if (loop->safelen < *max_vf)
 	    *max_vf = loop->safelen;
+	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 	  return false;
 	}
 
@@ -447,6 +449,7 @@ vect_analyze_data_ref_dependences (loop_
     dump_printf_loc (MSG_NOTE, vect_location,
                      "=== vect_analyze_data_ref_dependences ===\n");
 
+  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 				&LOOP_VINFO_DDRS (loop_vinfo),
 				LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
--- gcc/tree-vect-stmts.c.jj	2014-01-14 10:33:21.000000000 +0100
+++ gcc/tree-vect-stmts.c	2014-01-14 13:14:15.157677243 +0100
@@ -6381,10 +6381,11 @@ vectorizable_load (gimple stmt, gimple_s
 	      if (inv_p && !bb_vinfo)
 		{
 		  gcc_assert (!grouped_load);
-		  /* If we have versioned for aliasing then we are sure
-		     this is a loop invariant load and thus we can insert
-		     it on the preheader edge.  */
-		  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
+		  /* If we have versioned for aliasing or the loop doesn't
+		     have any data dependencies that would preclude this,
+		     then we are sure this is a loop invariant load and
+		     thus we can insert it on the preheader edge.  */
+		  if (LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo))
 		    {
 		      if (dump_enabled_p ())
 			{


	Jakub

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] Fixing PR59006 and PR58921 by delaying loop invariant hoisting in vectorizer.
  2014-01-14 13:17           ` Jakub Jelinek
@ 2014-01-14 13:37             ` Richard Biener
  0 siblings, 0 replies; 15+ messages in thread
From: Richard Biener @ 2014-01-14 13:37 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Cong Hou, GCC Patches

On Tue, 14 Jan 2014, Jakub Jelinek wrote:

> On Tue, Jan 14, 2014 at 10:01:06AM +0100, Richard Biener wrote:
> > Jakub, adding the new flag is ok with me.
> 
> So like this?

Ok if it passes testing.

Thanks,
Richard.

> 2014-01-14  Jakub Jelinek  <jakub@redhat.com>
> 
> 	* tree-vectorizer.h (struct _loop_vec_info): Add no_data_dependencies
> 	field.
> 	(LOOP_VINFO_NO_DATA_DEPENDENCIES): Define.
> 	* tree-vect-data-refs.c (vect_analyze_data_ref_dependence): Clear it
> 	when not giving up or versioning for alias only because of
> 	loop->safelen.
> 	(vect_analyze_data_ref_dependences): Set to true.
> 	* tree-vect-stmts.c (vectorizable_load): Use
> 	LOOP_VINFO_NO_DATA_DEPENDENCIES instead of
> 	LOOP_REQUIRES_VERSIONING_FOR_ALIAS.
> 
> --- gcc/tree-vectorizer.h.jj	2014-01-03 11:40:57.000000000 +0100
> +++ gcc/tree-vectorizer.h	2014-01-14 13:10:00.477989924 +0100
> @@ -347,6 +347,25 @@ typedef struct _loop_vec_info {
>       fix it up.  */
>    bool operands_swapped;
>  
> +  /* True if there are no loop carried data dependencies in the loop.
> +     If loop->safelen <= 1, then this is always true, either the loop
> +     didn't have any loop carried data dependencies, or the loop is being
> +     vectorized guarded with some runtime alias checks, or couldn't
> +     be vectorized at all, but then this field shouldn't be used.
> +     For loop->safelen >= 2, the user has asserted that there are no
> +     backward dependencies, but there still could be loop carried forward
> +     dependencies in such loops.  This flag will be false if normal
> +     vectorizer data dependency analysis would fail or require versioning
> +     for alias, but because of loop->safelen >= 2 it has been vectorized
> +     even without versioning for alias.  E.g. in:
> +     #pragma omp simd
> +     for (int i = 0; i < m; i++)
> +       a[i] = a[i + k] * c;
> +     (or #pragma simd or #pragma ivdep) we can vectorize this and it will
> +     DTRT even for k > 0 && k < m, but without safelen we would not
> +     vectorize this, so this field would be false.  */
> +  bool no_data_dependencies;
> +
>    /* If if-conversion versioned this loop before conversion, this is the
>       loop version without if-conversion.  */
>    struct loop *scalar_loop;
> @@ -385,6 +404,7 @@ typedef struct _loop_vec_info {
>  #define LOOP_VINFO_PEELING_FOR_GAPS(L)     (L)->peeling_for_gaps
>  #define LOOP_VINFO_OPERANDS_SWAPPED(L)     (L)->operands_swapped
>  #define LOOP_VINFO_PEELING_FOR_NITER(L)    (L)->peeling_for_niter
> +#define LOOP_VINFO_NO_DATA_DEPENDENCIES(L) (L)->no_data_dependencies
>  #define LOOP_VINFO_SCALAR_LOOP(L)	   (L)->scalar_loop
>  
>  #define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \
> --- gcc/tree-vect-data-refs.c.jj	2014-01-10 00:38:26.000000000 +0100
> +++ gcc/tree-vect-data-refs.c	2014-01-14 13:12:06.056342116 +0100
> @@ -244,6 +244,7 @@ vect_analyze_data_ref_dependence (struct
>  	{
>  	  if (loop->safelen < *max_vf)
>  	    *max_vf = loop->safelen;
> +	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
>  	  return false;
>  	}
>  
> @@ -291,6 +292,7 @@ vect_analyze_data_ref_dependence (struct
>  	{
>  	  if (loop->safelen < *max_vf)
>  	    *max_vf = loop->safelen;
> +	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
>  	  return false;
>  	}
>  
> @@ -447,6 +449,7 @@ vect_analyze_data_ref_dependences (loop_
>      dump_printf_loc (MSG_NOTE, vect_location,
>                       "=== vect_analyze_data_ref_dependences ===\n");
>  
> +  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
>    if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
>  				&LOOP_VINFO_DDRS (loop_vinfo),
>  				LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
> --- gcc/tree-vect-stmts.c.jj	2014-01-14 10:33:21.000000000 +0100
> +++ gcc/tree-vect-stmts.c	2014-01-14 13:14:15.157677243 +0100
> @@ -6381,10 +6381,11 @@ vectorizable_load (gimple stmt, gimple_s
>  	      if (inv_p && !bb_vinfo)
>  		{
>  		  gcc_assert (!grouped_load);
> -		  /* If we have versioned for aliasing then we are sure
> -		     this is a loop invariant load and thus we can insert
> -		     it on the preheader edge.  */
> -		  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
> +		  /* If we have versioned for aliasing or the loop doesn't
> +		     have any data dependencies that would preclude this,
> +		     then we are sure this is a loop invariant load and
> +		     thus we can insert it on the preheader edge.  */
> +		  if (LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo))
>  		    {
>  		      if (dump_enabled_p ())
>  			{
> 
> 
> 	Jakub
> 
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE / SUSE Labs
SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
GF: Jeff Hawn, Jennifer Guild, Felix Imend"orffer

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] Fixing PR59006 and PR58921 by delaying loop invariant hoisting in vectorizer.
  2014-01-13 13:38     ` Richard Biener
  2014-01-13 13:45       ` Jakub Jelinek
  2014-01-14  3:42       ` Cong Hou
@ 2014-01-16 14:19       ` H.J. Lu
  2 siblings, 0 replies; 15+ messages in thread
From: H.J. Lu @ 2014-01-16 14:19 UTC (permalink / raw)
  To: Richard Biener; +Cc: Jakub Jelinek, Cong Hou, GCC Patches

On Mon, Jan 13, 2014 at 5:37 AM, Richard Biener <rguenther@suse.de> wrote:
> On Wed, 27 Nov 2013, Jakub Jelinek wrote:
>
>> On Wed, Nov 27, 2013 at 10:53:56AM +0100, Richard Biener wrote:
>> > Hmm.  I'm still thinking that we should handle this during the regular
>> > transform step.
>>
>> I wonder if it can't be done instead just in vectorizable_load,
>> if LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo) and the load is
>> invariant, just emit the (broadcasted) load not inside of the loop, but on
>> the loop preheader edge.
>
> So this implements this suggestion, XFAILing the no longer handled cases.
> For example we get
>
>   _94 = *b_8(D);
>   vect_cst_.18_95 = {_94, _94, _94, _94};
>   _99 = prolog_loop_adjusted_niters.9_132 * 4;
>   vectp_a.22_98 = a_6(D) + _99;
>   ivtmp.43_77 = (unsigned long) vectp_a.22_98;
>
>   <bb 13>:
>   # ivtmp.41_67 = PHI <ivtmp.41_70(3), 0(12)>
>   # ivtmp.43_71 = PHI <ivtmp.43_69(3), ivtmp.43_77(12)>
>   vect__10.19_97 = vect_cst_.18_95 + { 1, 1, 1, 1 };
>   _76 = (void *) ivtmp.43_71;
>   MEM[base: _76, offset: 0B] = vect__10.19_97;
>
> ...
>
> instead of having hoisted *b_8 + 1 as scalar computation.  Not sure
> why LIM doesn't hoist the vector variant later.
>
> vect__10.19_97 = vect_cst_.18_95 + vect_cst_.20_96;
>   invariant up to level 1, cost 1.
>
> ah, the cost thing.  Should be "improved" to see that hoisting
> reduces the number of live SSA names in the loop.
>
> Eventually lower_vector_ssa could optimize vector to scalar
> code again ... (ick).
>
> Bootstrap / regtest running on x86_64.
>
> Comments?
>
> Thanks,
> Richard.
>
> 2014-01-13  Richard Biener  <rguenther@suse.de>
>
>         PR tree-optimization/58921
>         PR tree-optimization/59006
>         * tree-vect-loop-manip.c (vect_loop_versioning): Remove code
>         hoisting invariant stmts.
>         * tree-vect-stmts.c (vectorizable_load): Insert the splat of
>         invariant loads on the preheader edge if possible.
>

This caused:

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=59841

H.J.

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2014-01-16 14:19 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-11-23 14:13 [PATCH] Fixing PR59006 and PR58921 by delaying loop invariant hoisting in vectorizer Cong Hou
2013-11-27 11:37 ` Richard Biener
2013-11-27 12:45   ` Jakub Jelinek
2013-11-27 14:26     ` Richard Biener
2013-11-27 15:35       ` Jakub Jelinek
2013-11-27 16:08         ` Richard Biener
2014-01-13 13:38     ` Richard Biener
2014-01-13 13:45       ` Jakub Jelinek
2014-01-14  3:42       ` Cong Hou
2014-01-14  9:01         ` Richard Biener
2014-01-14 10:51           ` Richard Biener
2014-01-14 13:17           ` Jakub Jelinek
2014-01-14 13:37             ` Richard Biener
2014-01-16 14:19       ` H.J. Lu
2013-11-28  4:39   ` Cong Hou

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).