public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] Simple optimization for MASK_STORE.
@ 2015-05-06 14:04 Yuri Rumyantsev
  2015-05-08  9:27 ` Richard Biener
  0 siblings, 1 reply; 33+ messages in thread
From: Yuri Rumyantsev @ 2015-05-06 14:04 UTC (permalink / raw)
  To: gcc-patches, Igor Zamyatin, Richard Biener

[-- Attachment #1: Type: text/plain, Size: 1193 bytes --]

Hi All,

Here is a patch which gives us a significant speed-up on HASWELL for
tests containing masked stores. The main goal of the patch is to
avoid the HW hazard for maskmove instructions by inserting an
additional check for a zero mask and putting all masked store statements
into a separate block on the false edge. All MASK_STORE statements having
the same mask are put into one block. Any comments will be appreciated.

ChangeLog:
2015-05-06  Yuri Rumyantsev  <ysrumyan@gmail.com>

* cfgloop.h (has_mask_store): Add new field to struct loop.
* config/i386/i386.c: Include files stringpool.h and tree-ssanames.h.
(ix86_vectorize_zero_vector): New function.
(TARGET_VECTORIZE_ZERO_VECTOR): New target macro.
* doc/tm.texi.in: Add @hook TARGET_VECTORIZE_ZERO_VECTOR.
* doc/tm.texi: Updated.
* target.def (zero_vector): New DEFHOOK.
* tree-if-conv.c (predicate_mem_writes): Set has_mask_store for loop.
* tree-vect-stmts.c: Include tree-into-ssa.h.
(optimize_mask_stores): New function.
* tree-vectorizer.c (vectorize_loops): Zero has_mask_store field for
non-vectorized loops and invoke optimize_mask_stores function.

gcc/testsuite/ChangeLog:
* gcc.target/i386/avx2-vect-mask-store-move1.c: New test.

[-- Attachment #2: patch.1 --]
[-- Type: application/octet-stream, Size: 10647 bytes --]

Index: cfgloop.h
===================================================================
--- cfgloop.h	(revision 222288)
+++ cfgloop.h	(working copy)
@@ -195,6 +195,9 @@
   /* True if we should try harder to vectorize this loop.  */
   bool force_vectorize;
 
+  /* True if this loop contains masked stores.  */
+  bool has_mask_store;
+
   /* For SIMD loops, this is a unique identifier of the loop, referenced
      by IFN_GOMP_SIMD_VF, IFN_GOMP_SIMD_LANE and IFN_GOMP_SIMD_LAST_LANE
      builtins.  */
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 222288)
+++ config/i386/i386.c	(working copy)
@@ -115,6 +115,8 @@
 #include "tree-iterator.h"
 #include "tree-chkp.h"
 #include "rtl-chkp.h"
+#include "stringpool.h"
+#include "tree-ssanames.h"
 
 static rtx legitimize_dllimport_symbol (rtx, bool);
 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
@@ -40988,6 +40990,47 @@
   return ix86_get_builtin (code);
 }
 
+/* Return true if the vector type of SOURCE is supported by builtin ptest.
+   NAME is the lhs of the created ptest call.  All created statements are
+   added to GS.  */
+
+static bool
+ix86_vectorize_zero_vector (tree source, tree name, gimple_seq *gs)
+{
+  tree type = TREE_TYPE (source);
+  gimple stmt;
+  enum ix86_builtins code;
+  tree decl, new_type, conv_expr, vec_tmp; 
+
+  gcc_assert (VECTOR_TYPE_P (type));
+  if (!TARGET_AVX)
+    return false;
+
+  switch (tree_to_uhwi (TYPE_SIZE (type)))
+    {
+    case 128:
+      code = IX86_BUILTIN_PTESTZ;
+      break;
+    case 256:
+      if (!TARGET_AVX2)
+	return false;
+      code = IX86_BUILTIN_PTESTZ256;
+      break; 
+    default:
+      return false;
+    }
+  decl = ix86_builtin_decl (code, true);
+  new_type = get_same_sized_vectype (long_long_integer_type_node, type);
+  conv_expr = build1 (VIEW_CONVERT_EXPR, new_type, source);
+  vec_tmp = make_ssa_name (new_type);
+  stmt = gimple_build_assign (vec_tmp, conv_expr);
+  gimple_seq_add_stmt (gs, stmt);
+  stmt = gimple_build_call (decl, 2, vec_tmp, vec_tmp);
+  gimple_call_set_lhs (stmt, name);
+  gimple_seq_add_stmt (gs, stmt);
+  return true;
+}
+
 /* Returns a code for a target-specific builtin that implements
    reciprocal of the function, or NULL_TREE if not available.  */
 
@@ -51765,6 +51808,9 @@
 #undef TARGET_VECTORIZE_BUILTIN_GATHER
 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
 
+#undef TARGET_VECTORIZE_ZERO_VECTOR
+#define TARGET_VECTORIZE_ZERO_VECTOR ix86_vectorize_zero_vector
+
 #undef TARGET_BUILTIN_RECIPROCAL
 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
 
Index: doc/tm.texi.in
===================================================================
--- doc/tm.texi.in	(revision 222288)
+++ doc/tm.texi.in	(working copy)
@@ -4241,6 +4241,8 @@
 
 @hook TARGET_VECTORIZE_BUILTIN_GATHER
 
+@hook TARGET_VECTORIZE_ZERO_VECTOR
+
 @hook TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
 
 @hook TARGET_SIMD_CLONE_ADJUST
Index: target.def
===================================================================
--- target.def	(revision 222288)
+++ target.def	(working copy)
@@ -1801,6 +1801,19 @@
  (const_tree mem_vectype, const_tree index_type, int scale),
  NULL)
 
+/* Returns true if target supports zero vector predicate for given vector
+   type.  NOTE(review): default hook_bool_rtx_false may not match; confirm.  */
+DEFHOOK
+(zero_vector,
+ "This hook should return boolean true if target supports zero vector\n\
+predicate.  @var{source} is the compared vector, @var{name} is ssa_name\n\
+containing boolean value true if all vector elements are zero and produced\n\
+statements are saved in @var{gs}.\n\
+The default is @code{false} which means that target does not support it.",
+ bool,
+ (tree source, tree name, gimple_seq *gs),
+ hook_bool_rtx_false)
+
 /* Target function to initialize the cost model for a loop or block.  */
 DEFHOOK
 (init_cost,
Index: testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c
===================================================================
--- testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c	(revision 0)
+++ testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c	(working copy)
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target avx2 } */
+/* { dg-options "-mavx2 -O3 -fopenmp-simd -fdump-tree-vect-details" } */
+
+#define N 128
+extern int c[N];
+extern float a1[N], a2[N];
+
+void foo()
+{
+  int i;
+  for (i=0; i<N; i++)
+    if (c[i])
+      a1[i] += a2[i];
+}
+
+/* { dg-final { scan-tree-dump-times "Move MASK_STORE" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */

Index: tree-if-conv.c
===================================================================
--- tree-if-conv.c	(revision 222288)
+++ tree-if-conv.c	(working copy)
@@ -2172,9 +2172,12 @@
 		gimple_call_set_lhs (new_stmt, lhs);
 	      }
 	    else
-	      new_stmt
-		= gimple_build_call_internal (IFN_MASK_STORE, 4, addr, ptr,
-					      mask, rhs);
+	      {
+		new_stmt
+		  = gimple_build_call_internal (IFN_MASK_STORE, 4, addr, ptr,
+						mask, rhs);
+		loop->has_mask_store = true;		
+	      }
 	    gsi_replace (&gsi, new_stmt, true);
 	  }
 	else if (gimple_vdef (stmt))
Index: tree-vect-stmts.c
===================================================================
--- tree-vect-stmts.c	(revision 222288)
+++ tree-vect-stmts.c	(working copy)
@@ -88,6 +88,7 @@
 #include "ipa-ref.h"
 #include "cgraph.h"
 #include "builtins.h"
+#include "tree-into-ssa.h"
 
 /* For lang_hooks.types.type_for_mode.  */
 #include "langhooks.h"
@@ -8205,3 +8206,129 @@
   interm_types->release ();
   return false;
 }
+
+/* Try to remove the HW hazard related to masked stores in LOOP: the address
+   of a masked store can be illegal if the mask is zero.  Put all masked
+   store statements with the same mask into a new bb guarded by a check
+   on zero mask.  */
+
+void
+optimize_mask_stores (struct loop *loop)
+{
+  basic_block bb = loop->header;
+  gimple_stmt_iterator gsi;
+  gimple stmt;
+  auto_vec<gimple> worklist;
+
+  if (loop->dont_vectorize || !loop->has_mask_store
+      || loop->num_nodes > 2)
+    return;
+
+  /* This flag won't be used anymore.  */
+  loop->has_mask_store = false;
+
+  /* Collect all IFN_MASK_STORE calls found in the loop header.  */
+  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      stmt = gsi_stmt (gsi);
+      if (is_gimple_call (stmt)
+	  && gimple_call_internal_p (stmt)
+	  && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
+	worklist.safe_push (stmt);
+    }
+
+  if (worklist.is_empty ())
+    return;
+
+  /* Loop has masked stores.  */
+  while (!worklist.is_empty ())
+    {
+      gimple last, def_stmt;
+      edge e, efalse;
+      tree mask, val, vdef;
+      basic_block store_bb, join_bb;
+      gimple_stmt_iterator gsi_to;
+      gimple_seq gs;
+      tree arg3;
+
+      last = worklist.pop ();
+      mask = gimple_call_arg (last, 2);
+      /* Loop was not vectorized if mask does not have vector type.  */
+      if (!VECTOR_TYPE_P (TREE_TYPE (mask)))
+	return;
+      val = make_ssa_name (integer_type_node);
+      gs = NULL;
+      /* Skip statement with unsupported check on zero mask.  */
+
+      if (!targetm.vectorize.zero_vector (mask, val, &gs))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf (MSG_NOTE, "Target does not support ptest!\n");
+	  continue;
+	}
+      /* Split BB after LAST and create STORE_BB for the masked stores.  */
+      e = split_block (bb, last);
+      join_bb = e->dest;
+      store_bb = create_empty_bb (bb);
+      add_bb_to_loop (store_bb, loop);
+      e->flags = EDGE_TRUE_VALUE;
+      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
+      store_bb->frequency = bb->frequency / 2;
+      efalse->probability = REG_BR_PROB_BASE / 2;
+      make_edge (store_bb, join_bb, EDGE_FALLTHRU);
+      if (dom_info_available_p (CDI_DOMINATORS))
+	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
+
+      /* Create zero test condition.  */
+      stmt = gimple_build_cond (NE_EXPR, val, integer_zero_node,
+				NULL_TREE, NULL_TREE);
+      gcc_assert (gs != NULL);
+      gimple_seq_add_stmt (&gs, stmt);
+      gsi = gsi_last_bb (bb);
+      gsi_insert_seq_after (&gsi, gs, GSI_SAME_STMT);
+
+      /* Put all masked stores with the same mask to STORE_BB.  */
+      while (true)
+	{
+	  vdef = gimple_vdef (last);
+	  if (vdef && TREE_CODE (vdef) == SSA_NAME)
+	    mark_virtual_operand_for_renaming (vdef);
+
+	  /* Move masked store to STORE_BB.  */
+	  gsi = gsi_for_stmt (last);
+	  gsi_to = gsi_start_bb (store_bb);
+	  gsi_move_before (&gsi, &gsi_to);
+	  if (dump_enabled_p ())
+	    {
+	      dump_printf (MSG_NOTE, "Move MASK_STORE to bb#%d\n",
+			   store_bb->index);
+	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);	  
+	    }
+	  /* Put definition statement of stored value in STORE_BB
+	     if possible.  */
+	  arg3 = gimple_call_arg (last, 3);
+	  if (TREE_CODE (arg3) == SSA_NAME && has_single_use (arg3))
+	    {
+	      def_stmt = SSA_NAME_DEF_STMT (arg3);
+	      /* Move def_stmt to STORE_BB if it is in the same bb.  */
+	      if (gimple_bb (def_stmt) == bb)
+		{
+		  if (dump_enabled_p ())
+		    {
+		      dump_printf (MSG_NOTE, "Move stmt to bb#%d\n",
+				   store_bb->index);
+		      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);	  
+		    }
+		  gsi = gsi_for_stmt (def_stmt);
+		  gsi_to = gsi_start_bb (store_bb);
+		  gsi_move_before (&gsi, &gsi_to);
+		} 	  
+	    }
+	    /* Put other masked stores with the same mask to STORE_BB.  */
+	    if (worklist.is_empty ()
+		|| gimple_call_arg (worklist.last (), 2) != mask)
+	      break;
+	    last = worklist.pop ();
+	}
+    }
+}
Index: tree-vectorizer.c
===================================================================
--- tree-vectorizer.c	(revision 222288)
+++ tree-vectorizer.c	(working copy)
@@ -454,7 +454,10 @@
 	loop->aux = loop_vinfo;
 
 	if (!loop_vinfo || !LOOP_VINFO_VECTORIZABLE_P (loop_vinfo))
-	  continue;
+	  {
+	    loop->has_mask_store = false;
+	    continue;
+	  }
 
         if (!dbg_cnt (vect_loop))
 	  break;
@@ -556,6 +559,7 @@
       loop_vinfo = (loop_vec_info) loop->aux;
       destroy_loop_vec_info (loop_vinfo, true);
       loop->aux = NULL;
+      optimize_mask_stores (loop);
     }
 
   free_stmt_vec_info_vec ();
Index: tree-vectorizer.h
===================================================================
--- tree-vectorizer.h	(revision 222288)
+++ tree-vectorizer.h	(working copy)
@@ -1084,6 +1084,7 @@
 extern tree vect_create_addr_base_for_vector_ref (gimple, gimple_seq *,
 						  tree, struct loop *,
 						  tree = NULL_TREE);
+extern void optimize_mask_stores (struct loop *);
 
 /* In tree-vect-loop.c.  */
 /* FORNOW: Used in tree-parloops.c.  */

^ permalink raw reply	[flat|nested] 33+ messages in thread

end of thread, other threads:[~2015-11-19 15:20 UTC | newest]

Thread overview: 33+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-05-06 14:04 [PATCH] Simple optimization for MASK_STORE Yuri Rumyantsev
2015-05-08  9:27 ` Richard Biener
2015-05-08 18:43   ` Jeff Law
2015-05-08 19:16     ` Richard Biener
2015-05-20 14:10   ` Yuri Rumyantsev
2015-05-29 14:28     ` Yuri Rumyantsev
2015-06-09 12:15     ` Richard Biener
2015-06-18 15:41       ` Yuri Rumyantsev
2015-07-07 13:55         ` Yuri Rumyantsev
2015-07-10  5:51         ` Jeff Law
2015-07-20 15:26           ` Yuri Rumyantsev
2015-07-21 13:59             ` Richard Biener
2015-07-23 20:32             ` Jeff Law
2015-07-24  9:04               ` Yuri Rumyantsev
2015-07-24  9:24               ` Richard Biener
2015-07-24 19:26                 ` Jeff Law
2015-07-27  9:04                   ` Richard Biener
2015-08-06 11:07                     ` Yuri Rumyantsev
2015-08-13 11:40                       ` Yuri Rumyantsev
2015-08-13 11:46                         ` Richard Biener
2015-11-02 15:24                           ` Yuri Rumyantsev
2015-11-05 15:49                             ` Yuri Rumyantsev
2015-11-06 12:56                             ` Richard Biener
2015-11-06 13:29                               ` Yuri Rumyantsev
2015-11-10 12:33                                 ` Richard Biener
2015-11-10 12:48                                   ` Ilya Enkovich
2015-11-10 14:46                                     ` Richard Biener
2015-11-10 14:56                                       ` Ilya Enkovich
2015-11-10 17:02                                         ` Mike Stump
2015-11-11  9:18                                         ` Richard Biener
2015-11-11 13:13                                           ` Yuri Rumyantsev
2015-11-12 13:59                                             ` Richard Biener
2015-11-19 15:20                                               ` Yuri Rumyantsev

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).