public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] Simple optimization for MASK_STORE.
@ 2015-05-06 14:04 Yuri Rumyantsev
  2015-05-08  9:27 ` Richard Biener
  0 siblings, 1 reply; 33+ messages in thread
From: Yuri Rumyantsev @ 2015-05-06 14:04 UTC (permalink / raw)
  To: gcc-patches, Igor Zamyatin, Richard Biener

[-- Attachment #1: Type: text/plain, Size: 1193 bytes --]

Hi All,

Here is a patch which gives us significant speed-up on HASWELL for
test containing masked stores. The main goal of that patch is attempt
to avoid HW hazard for maskmove instructions through inserting
additional check on zero mask and putting all masked store statements
into separate block on false edge.All MASK_STORE statements having the
same mask put into one block. Any comments will be appreciate.

ChangeLog:
2015-05-06  Yuri Rumyantsev  <ysrumyan@gmail.com>

* cfgloop.h (has_mask_store): Add new field to struct loop.
* config/i386/i386.c: Include files stringpool.h and tree-ssanames.h.
(ix86_vectorize_zero_vector): New function.
(TARGET_VECTORIZE_ZERO_VECTOR): New target macro
* doc/tm.texi.in: Add @hook TARGET_VECTORIZE_ZERO_VECTOR.
* doc/tm.texi: Updated.
* target.def (zero_vector): New DEFHOOK.
* tree-if-conv.c (predicate_mem_writes): Set has_mask_store for loop.
* tree-vect-stmts.c : Include tree-into-ssa.h.
(optimize_mask_stores): New function.
* tree-vectorizer.c (vectorize_loops): Zero has_mask_store field for
non-vectorized loops and invoke optimize_mask_stores function.

gcc/testsuite/ChangeLog:
* gcc.target/i386/avx2-vect-mask-store-move1.c: New test.

[-- Attachment #2: patch.1 --]
[-- Type: application/octet-stream, Size: 10647 bytes --]

Index: cfgloop.h
===================================================================
--- cfgloop.h	(revision 222288)
+++ cfgloop.h	(working copy)
@@ -195,6 +195,9 @@
   /* True if we should try harder to vectorize this loop.  */
   bool force_vectorize;
 
+  /* True if this loop contains masked stores.  */
+  bool has_mask_store;
+
   /* For SIMD loops, this is a unique identifier of the loop, referenced
      by IFN_GOMP_SIMD_VF, IFN_GOMP_SIMD_LANE and IFN_GOMP_SIMD_LAST_LANE
      builtins.  */
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 222288)
+++ config/i386/i386.c	(working copy)
@@ -115,6 +115,8 @@
 #include "tree-iterator.h"
 #include "tree-chkp.h"
 #include "rtl-chkp.h"
+#include "stringpool.h"
+#include "tree-ssanames.h"
 
 static rtx legitimize_dllimport_symbol (rtx, bool);
 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
@@ -40988,6 +40990,47 @@
   return ix86_get_builtin (code);
 }
 
+/* Returns true if given vector type is supported by builtin ptest.
+   NAME is lhs of created ptest call. All created statements are added
+   to GS.  */
+
+static bool
+ix86_vectorize_zero_vector (tree source, tree name, gimple_seq *gs)
+{
+  tree type = TREE_TYPE (source);
+  gimple stmt;
+  enum ix86_builtins code;
+  tree decl, new_type, conv_expr, vec_tmp; 
+
+  gcc_assert (VECTOR_TYPE_P (type));
+  if (!TARGET_AVX)
+    return false;
+
+  switch (tree_to_uhwi (TYPE_SIZE (type)))
+    {
+    case 128:
+      code = IX86_BUILTIN_PTESTZ;
+      break;
+    case 256:
+      if (!TARGET_AVX2)
+	return false;
+      code = IX86_BUILTIN_PTESTZ256;
+      break; 
+    default:
+      return false;
+    }
+  decl = ix86_builtin_decl (code, true);
+  new_type = get_same_sized_vectype (long_long_integer_type_node, type);
+  conv_expr = build1 (VIEW_CONVERT_EXPR, new_type, source);
+  vec_tmp = make_ssa_name (new_type);
+  stmt = gimple_build_assign (vec_tmp, conv_expr);
+  gimple_seq_add_stmt (gs, stmt);
+  stmt = gimple_build_call (decl, 2, vec_tmp, vec_tmp);
+  gimple_call_set_lhs (stmt, name);
+  gimple_seq_add_stmt (gs, stmt);
+  return true;
+}
+
 /* Returns a code for a target-specific builtin that implements
    reciprocal of the function, or NULL_TREE if not available.  */
 
@@ -51765,6 +51808,9 @@
 #undef TARGET_VECTORIZE_BUILTIN_GATHER
 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
 
+#undef TARGET_VECTORIZE_ZERO_VECTOR
+#define TARGET_VECTORIZE_ZERO_VECTOR ix86_vectorize_zero_vector
+
 #undef TARGET_BUILTIN_RECIPROCAL
 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
 
Index: doc/tm.texi.in
===================================================================
--- doc/tm.texi.in	(revision 222288)
+++ doc/tm.texi.in	(working copy)
@@ -4241,6 +4241,8 @@
 
 @hook TARGET_VECTORIZE_BUILTIN_GATHER
 
+@hook TARGET_VECTORIZE_ZERO_VECTOR
+
 @hook TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
 
 @hook TARGET_SIMD_CLONE_ADJUST
Index: target.def
===================================================================
--- target.def	(revision 222288)
+++ target.def	(working copy)
@@ -1801,6 +1801,19 @@
  (const_tree mem_vectype, const_tree index_type, int scale),
  NULL)
 
+/* Returns true if target support zero vector predicate for given vector
+   type.  */
+DEFHOOK
+(zero_vector,
+ "This hook should return boolean true if target supports zero vector\n\
+prdicate.  @var{source} is the compared vector, @var{name} is ssa_name\n\
+containing boolean value true if all vector elements are zero and produced\n\
+statements are saved in @var{gs}.\n\
+The default is @code{false} which means that target does not support it.",
+ bool,
+ (tree source, tree name, gimple_seq *gs),
+ hook_bool_rtx_false)
+
 /* Target function to initialize the cost model for a loop or block.  */
 DEFHOOK
 (init_cost,
Index: testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c
===================================================================
--- testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c	(revision 0)
+++ testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c	(working copy)
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target avx2 } */
+/* { dg-options "-mavx2 -O3 -fopenmp-simd -fdump-tree-vect-details" } */
+
+#define N 128
+extern int c[N];
+extern float a1[N], a2[N];
+
+void foo()
+{
+  int i;
+  for (i=0; i<N; i++)
+    if (c[i])
+      a1[i] += a2[i];
+}
+
+/* { dg-final { scan-tree-dump-times "Move MASK_STORE" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */

Index: tree-if-conv.c
===================================================================
--- tree-if-conv.c	(revision 222288)
+++ tree-if-conv.c	(working copy)
@@ -2172,9 +2172,12 @@
 		gimple_call_set_lhs (new_stmt, lhs);
 	      }
 	    else
-	      new_stmt
-		= gimple_build_call_internal (IFN_MASK_STORE, 4, addr, ptr,
-					      mask, rhs);
+	      {
+		new_stmt
+		  = gimple_build_call_internal (IFN_MASK_STORE, 4, addr, ptr,
+						mask, rhs);
+		loop->has_mask_store = true;		
+	      }
 	    gsi_replace (&gsi, new_stmt, true);
 	  }
 	else if (gimple_vdef (stmt))
Index: tree-vect-stmts.c
===================================================================
--- tree-vect-stmts.c	(revision 222288)
+++ tree-vect-stmts.c	(working copy)
@@ -88,6 +88,7 @@
 #include "ipa-ref.h"
 #include "cgraph.h"
 #include "builtins.h"
+#include "tree-into-ssa.h"
 
 /* For lang_hooks.types.type_for_mode.  */
 #include "langhooks.h"
@@ -8205,3 +8206,129 @@
   interm_types->release ();
   return false;
 }
+
+/* The code below is trying to remove HW hazzard related to masked stores:
+   the address of masked store can be illegal if a mask is zero.
+   It put all masked stores statements with the same mask into the new bb
+   with a check on zero mask.  */
+
+void
+optimize_mask_stores (struct loop *loop)
+{
+  basic_block bb = loop->header;
+  gimple_stmt_iterator gsi;
+  gimple stmt;
+  auto_vec<gimple> worklist;
+
+  if (loop->dont_vectorize || !loop->has_mask_store
+      || loop->num_nodes > 2)
+    return;
+
+  /* This flag won't be used anymore.  */
+  loop->has_mask_store = false;
+
+  /* Pick up all masked stores in loop if any.  */
+  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      stmt = gsi_stmt (gsi);
+      if (is_gimple_call (stmt)
+	  && gimple_call_internal_p (stmt)
+	  && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
+	worklist.safe_push (stmt);
+    }
+
+  if (worklist.is_empty ())
+    return;
+
+  /* Loop has masked stores.  */
+  while (!worklist.is_empty ())
+    {
+      gimple last, def_stmt;
+      edge e, efalse;
+      tree mask, val, vdef;
+      basic_block store_bb, join_bb;
+      gimple_stmt_iterator gsi_to;
+      gimple_seq gs;
+      tree arg3;
+
+      last = worklist.pop ();
+      mask = gimple_call_arg (last, 2);
+      /* Loop was not vectorized if mask does not have vector type.  */
+      if (!VECTOR_TYPE_P (TREE_TYPE (mask)))
+	return;
+      val = make_ssa_name (integer_type_node);
+      gs = NULL;
+      /* Skip statemnt with unsupported check on zero mask.  */
+
+      if (!targetm.vectorize.zero_vector (mask, val, &gs))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf (MSG_NOTE, "Target does not support ptest!\n");
+	  continue;
+	}
+      /* Create new bb.  */
+      e = split_block (bb, last);
+      join_bb = e->dest;
+      store_bb = create_empty_bb (bb);
+      add_bb_to_loop (store_bb, loop);
+      e->flags = EDGE_TRUE_VALUE;
+      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
+      store_bb->frequency = bb->frequency / 2;
+      efalse->probability = REG_BR_PROB_BASE / 2;
+      make_edge (store_bb, join_bb, EDGE_FALLTHRU);
+      if (dom_info_available_p (CDI_DOMINATORS))
+	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
+
+      /* Create zero test condition.  */
+      stmt = gimple_build_cond (NE_EXPR, val, integer_zero_node,
+				NULL_TREE, NULL_TREE);
+      gcc_assert (gs != NULL);
+      gimple_seq_add_stmt (&gs, stmt);
+      gsi = gsi_last_bb (bb);
+      gsi_insert_seq_after (&gsi, gs, GSI_SAME_STMT);
+
+      /* Put all masked stores with the same mask to STORE_BB.  */
+      while (true)
+	{
+	  vdef = gimple_vdef (last);
+	  if (vdef && TREE_CODE (vdef) == SSA_NAME)
+	    mark_virtual_operand_for_renaming (vdef);
+
+	  /* Move masked store to STORE_BB.  */
+	  gsi = gsi_for_stmt (last);
+	  gsi_to = gsi_start_bb (store_bb);
+	  gsi_move_before (&gsi, &gsi_to);
+	  if (dump_enabled_p ())
+	    {
+	      dump_printf (MSG_NOTE, "Move MASK_STORE to bb#%d\n",
+			   store_bb->index);
+	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);	  
+	    }
+	  /* Put definition statement of stored value in STORE_BB
+	     if possible.  */
+	  arg3 = gimple_call_arg (last, 3);
+	  if (TREE_CODE (arg3) == SSA_NAME && has_single_use (arg3))
+	    {
+	      def_stmt = SSA_NAME_DEF_STMT (arg3);
+	      /* Move def_stmt to STORE_BB if it is in the same bb.  */
+	      if (gimple_bb (def_stmt) == bb)
+		{
+		  if (dump_enabled_p ())
+		    {
+		      dump_printf (MSG_NOTE, "Move stmt to bb#%d\n",
+				   store_bb->index);
+		      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);	  
+		    }
+		  gsi = gsi_for_stmt (def_stmt);
+		  gsi_to = gsi_start_bb (store_bb);
+		  gsi_move_before (&gsi, &gsi_to);
+		} 	  
+	    }
+	    /* Put other masked stores with the same mask to STORE_BB.  */
+	    if (worklist.is_empty ()
+		|| gimple_call_arg (worklist.last (), 2) != mask)
+	      break;
+	    last = worklist.pop ();
+	}
+    }
+}
Index: tree-vectorizer.c
===================================================================
--- tree-vectorizer.c	(revision 222288)
+++ tree-vectorizer.c	(working copy)
@@ -454,7 +454,10 @@
 	loop->aux = loop_vinfo;
 
 	if (!loop_vinfo || !LOOP_VINFO_VECTORIZABLE_P (loop_vinfo))
-	  continue;
+	  {
+	    loop->has_mask_store = false;
+	    continue;
+	  }
 
         if (!dbg_cnt (vect_loop))
 	  break;
@@ -556,6 +559,7 @@
       loop_vinfo = (loop_vec_info) loop->aux;
       destroy_loop_vec_info (loop_vinfo, true);
       loop->aux = NULL;
+      optimize_mask_stores (loop);
     }
 
   free_stmt_vec_info_vec ();
Index: tree-vectorizer.h
===================================================================
--- tree-vectorizer.h	(revision 222288)
+++ tree-vectorizer.h	(working copy)
@@ -1084,6 +1084,7 @@
 extern tree vect_create_addr_base_for_vector_ref (gimple, gimple_seq *,
 						  tree, struct loop *,
 						  tree = NULL_TREE);
+extern void optimize_mask_stores (struct loop *);
 
 /* In tree-vect-loop.c.  */
 /* FORNOW: Used in tree-parloops.c.  */

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-05-06 14:04 [PATCH] Simple optimization for MASK_STORE Yuri Rumyantsev
@ 2015-05-08  9:27 ` Richard Biener
  2015-05-08 18:43   ` Jeff Law
  2015-05-20 14:10   ` Yuri Rumyantsev
  0 siblings, 2 replies; 33+ messages in thread
From: Richard Biener @ 2015-05-08  9:27 UTC (permalink / raw)
  To: Yuri Rumyantsev; +Cc: gcc-patches, Igor Zamyatin

On Wed, May 6, 2015 at 4:04 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
> Hi All,
>
> Here is a patch which gives us significant speed-up on HASWELL for
> test containing masked stores. The main goal of that patch is attempt
> to avoid HW hazard for maskmove instructions through inserting
> additional check on zero mask and putting all masked store statements
> into separate block on false edge.All MASK_STORE statements having the
> same mask put into one block. Any comments will be appreciate.

Hmm.  I'm not very happy with this "optimization" happening at the
GIMPLE level - it feels more like a mdreorg thing...

The testcase you add doesn't end up with invalid addresses - so what's
the testcase you are inventing this for?

Looking into the implementation I don't see where you are validating
data dependences of any sort but you are moving stores (and possibly
loads when sinking definition stmts of stored values).  The code-sinking
part should be handled by the existing pass.  Your simple testcase
contains a single masked store, so why does simply conditionalizing
each masked store in mdreorg not work?  It's a hazard (hopefully
fixed eventually), thus not really worth optimizing 100%.

The target hook name is awful.

You don't need a extra flag in struct loop - the vectorizer scans all
insns so it can perfectly well re-compute it.

What this all feels like is more like a un-if-conversion pass which might
be useful for aggressively if-converted vectorized code as well (thus
lots of vec_cond expressions for example).

Richard.

> ChangeLog:
> 2015-05-06  Yuri Rumyantsev  <ysrumyan@gmail.com>
>
> * cfgloop.h (has_mask_store): Add new field to struct loop.
> * config/i386/i386.c: Include files stringpool.h and tree-ssanames.h.
> (ix86_vectorize_zero_vector): New function.
> (TARGET_VECTORIZE_ZERO_VECTOR): New target macro
> * doc/tm.texi.in: Add @hook TARGET_VECTORIZE_ZERO_VECTOR.
> * doc/tm.texi: Updated.
> * target.def (zero_vector): New DEFHOOK.
> * tree-if-conv.c (predicate_mem_writes): Set has_mask_store for loop.
> * tree-vect-stmts.c : Include tree-into-ssa.h.
> (optimize_mask_stores): New function.
> * tree-vectorizer.c (vectorize_loops): Zero has_mask_store field for
> non-vectorized loops and invoke optimize_mask_stores function.
>
> gcc/testsuite/ChangeLog:
> * gcc.target/i386/avx2-vect-mask-store-move1.c: New test.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-05-08  9:27 ` Richard Biener
@ 2015-05-08 18:43   ` Jeff Law
  2015-05-08 19:16     ` Richard Biener
  2015-05-20 14:10   ` Yuri Rumyantsev
  1 sibling, 1 reply; 33+ messages in thread
From: Jeff Law @ 2015-05-08 18:43 UTC (permalink / raw)
  To: Richard Biener, Yuri Rumyantsev; +Cc: gcc-patches, Igor Zamyatin

On 05/08/2015 03:27 AM, Richard Biener wrote:
> On Wed, May 6, 2015 at 4:04 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>> Hi All,
>>
>> Here is a patch which gives us significant speed-up on HASWELL for
>> test containing masked stores. The main goal of that patch is attempt
>> to avoid HW hazard for maskmove instructions through inserting
>> additional check on zero mask and putting all masked store statements
>> into separate block on false edge.All MASK_STORE statements having the
>> same mask put into one block. Any comments will be appreciate.
>
> Hmm.  I'm not very happy with this "optimization" happening at the
> GIMPLE level - it feels more like a mdreorg thing...
I haven't looked at the patch, but it sounds like just specialization 
using a runtime check to select between the implementations which is 
fairly natural to do at the gimple level.

Jeff

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-05-08 18:43   ` Jeff Law
@ 2015-05-08 19:16     ` Richard Biener
  0 siblings, 0 replies; 33+ messages in thread
From: Richard Biener @ 2015-05-08 19:16 UTC (permalink / raw)
  To: Jeff Law, Yuri Rumyantsev; +Cc: gcc-patches, Igor Zamyatin

On May 8, 2015 8:43:15 PM GMT+02:00, Jeff Law <law@redhat.com> wrote:
>On 05/08/2015 03:27 AM, Richard Biener wrote:
>> On Wed, May 6, 2015 at 4:04 PM, Yuri Rumyantsev <ysrumyan@gmail.com>
>wrote:
>>> Hi All,
>>>
>>> Here is a patch which gives us significant speed-up on HASWELL for
>>> test containing masked stores. The main goal of that patch is
>attempt
>>> to avoid HW hazard for maskmove instructions through inserting
>>> additional check on zero mask and putting all masked store
>statements
>>> into separate block on false edge.All MASK_STORE statements having
>the
>>> same mask put into one block. Any comments will be appreciate.
>>
>> Hmm.  I'm not very happy with this "optimization" happening at the
>> GIMPLE level - it feels more like a mdreorg thing...
>I haven't looked at the patch, but it sounds like just specialization 
>using a runtime check to select between the implementations which is 
>fairly natural to do at the gimple level.

Well, sure.  Which is why I suggested a un-ifcvt pass instead.

Anyway, the implementation has multiple issues.

Richard.

>Jeff


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-05-08  9:27 ` Richard Biener
  2015-05-08 18:43   ` Jeff Law
@ 2015-05-20 14:10   ` Yuri Rumyantsev
  2015-05-29 14:28     ` Yuri Rumyantsev
  2015-06-09 12:15     ` Richard Biener
  1 sibling, 2 replies; 33+ messages in thread
From: Yuri Rumyantsev @ 2015-05-20 14:10 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, Igor Zamyatin

[-- Attachment #1: Type: text/plain, Size: 1470 bytes --]

Hi All,

Here is updated patch to optimize mask stores. The main goal of it is
to avoid execution of mask store if its mask is zero vector since
loads that follow it can be blocked.
The following changes were done:
  1.  A test on sink legality was added - it simply prohibits to cross
statements with non-null vdef or vuse.
  2. New phi node is created in join block for moved MASK_STORE statements.
  3. Test was changed to check that 2 MASK_STORE statements are not
moved to the same block.
  4. New field was added to loop_vec_info structure to mark loops
having MASK_STORE's.

Any comments will be appreciated.
Yuri.

2015-05-20  Yuri Rumyantsev  <ysrumyan@gmail.com>

* config/i386/i386.c: Include files stringpool.h and tree-ssanames.h.
(ix86_vectorize_is_zero_vector): New function.
(TARGET_VECTORIZE_IS_ZERO_VECTOR): New target macro
* doc/tm.texi.in: Add @hook TARGET_VECTORIZE_IS_ZERO_VECTOR.
* doc/tm.texi: Updated.
* target.def (is_zero_vector): New DEFHOOK.
* tree-vect-stmts.c : Include tree-into-ssa.h.
(vectorizable_mask_load_store): Initialize has_mask_store field.
(is_valid_sink): New function.
(optimize_mask_stores): New function.
* tree-vectorizer.c (vectorize_loops): Invoke optimaze_mask_stores for
loops having masked stores.
* tree-vectorizer.h (loop_vec_info): Add new has_mask_store field and
correspondent macros.
(optimize_mask_stores): Update prototype.

gcc/testsuite/ChangeLog:
* gcc.target/i386/avx2-vect-mask-store-move1.c: New test.

[-- Attachment #2: patch.2 --]
[-- Type: application/octet-stream, Size: 12544 bytes --]

Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 222288)
+++ config/i386/i386.c	(working copy)
@@ -115,6 +115,8 @@
 #include "tree-iterator.h"
 #include "tree-chkp.h"
 #include "rtl-chkp.h"
+#include "stringpool.h"
+#include "tree-ssanames.h"
 
 static rtx legitimize_dllimport_symbol (rtx, bool);
 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
@@ -40988,6 +40990,47 @@
   return ix86_get_builtin (code);
 }
 
+/* Returns true if SOURCE type is supported by builtin ptest.
+   NAME is lhs of created ptest call.  All created statements are added
+   to GS.  */
+
+static bool
+ix86_vectorize_build_zero_vector_test (tree source, tree name, gimple_seq *gs)
+{
+  tree type = TREE_TYPE (source);
+  gimple stmt;
+  enum ix86_builtins code;
+  tree decl, new_type, conv_expr, vec_tmp;
+
+  gcc_assert (VECTOR_TYPE_P (type));
+  if (!TARGET_AVX)
+    return false;
+
+  switch (tree_to_uhwi (TYPE_SIZE (type)))
+    {
+    case 128:
+      code = IX86_BUILTIN_PTESTZ;
+      break;
+    case 256:
+      if (!TARGET_AVX2)
+	return false;
+      code = IX86_BUILTIN_PTESTZ256;
+      break;
+    default:
+      return false;
+    }
+  decl = ix86_builtin_decl (code, true);
+  new_type = get_same_sized_vectype (long_long_integer_type_node, type);
+  conv_expr = build1 (VIEW_CONVERT_EXPR, new_type, source);
+  vec_tmp = make_ssa_name (new_type);
+  stmt = gimple_build_assign (vec_tmp, conv_expr);
+  gimple_seq_add_stmt (gs, stmt);
+  stmt = gimple_build_call (decl, 2, vec_tmp, vec_tmp);
+  gimple_call_set_lhs (stmt, name);
+  gimple_seq_add_stmt (gs, stmt);
+  return true;
+}
+
 /* Returns a code for a target-specific builtin that implements
    reciprocal of the function, or NULL_TREE if not available.  */
 
@@ -51765,6 +51808,10 @@
 #undef TARGET_VECTORIZE_BUILTIN_GATHER
 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
 
+#undef TARGET_VECTORIZE_BUILD_ZERO_VECTOR_TEST
+#define TARGET_VECTORIZE_BUILD_ZERO_VECTOR_TEST \
+  ix86_vectorize_build_zero_vector_test
+
 #undef TARGET_BUILTIN_RECIPROCAL
 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
 
Index: doc/tm.texi.in
===================================================================
--- doc/tm.texi.in	(revision 222288)
+++ doc/tm.texi.in	(working copy)
@@ -4241,6 +4241,8 @@
 
 @hook TARGET_VECTORIZE_BUILTIN_GATHER
 
+@hook TARGET_VECTORIZE_BUILD_ZERO_VECTOR_TEST
+
 @hook TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
 
 @hook TARGET_SIMD_CLONE_ADJUST
Index: target.def
===================================================================
--- target.def	(revision 222288)
+++ target.def	(working copy)
@@ -1801,6 +1801,19 @@
  (const_tree mem_vectype, const_tree index_type, int scale),
  NULL)
 
+/* Returns true if target support zero vector predicate for given vector
+   type.  */
+DEFHOOK
+(build_zero_vector_test,
+ "This hook should return boolean true if target supports zero vector\n\
+predicate.  @var{source} is the compared vector, @var{name} is ssa_name\n\
+containing boolean value true if all vector elements are zero and produced\n\
+statements are saved in @var{gs}.\n\
+The default is @code{false} which means that target does not support it.",
+ bool,
+ (tree source, tree name, gimple_seq *gs),
+ hook_bool_rtx_false)
+
 /* Target function to initialize the cost model for a loop or block.  */
 DEFHOOK
 (init_cost,
Index: testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c
===================================================================
--- testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c	(revision 0)
+++ testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c	(working copy)
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target avx2 } */
+/* { dg-options "-mavx2 -O3 -fopenmp-simd -fdump-tree-vect-details" } */
+
+extern int *p1, *p2, *p3;
+int c[256];
+
+void foo (int n)
+{
+  int i;
+  for (i=0; i<n; i++)
+    if (c[i])
+      {
+	p1[i] += 1;
+	p2[i] = p3[i] +2;
+      }
+}
+
+/* { dg-final { scan-tree-dump-times "Move MASK_STORE to new bb" 2 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
Index: tree-vect-stmts.c
===================================================================
--- tree-vect-stmts.c	(revision 222288)
+++ tree-vect-stmts.c	(working copy)
@@ -88,6 +88,7 @@
 #include "ipa-ref.h"
 #include "cgraph.h"
 #include "builtins.h"
+#include "tree-into-ssa.h"
 
 /* For lang_hooks.types.type_for_mode.  */
 #include "langhooks.h"
@@ -2073,6 +2074,7 @@
     {
       tree vec_rhs = NULL_TREE, vec_mask = NULL_TREE;
       prev_stmt_info = NULL;
+      LOOP_VINFO_HAS_MASK_STORE (loop_vinfo) = true;
       for (i = 0; i < ncopies; i++)
 	{
 	  unsigned align, misalign;
@@ -8205,3 +8207,191 @@
   interm_types->release ();
   return false;
 }
+
+/* Helper for optimize_mask_stores: returns true if STMT sinking to end
+   of BB is valid and false otherwise.  */
+
+static bool
+is_valid_sink (gimple stmt)
+{
+  gimple_stmt_iterator it = gsi_for_stmt (stmt);
+  gimple curr_stmt;
+
+  gsi_next (&it);
+  if (gimple_vdef (stmt))
+    {
+      /* Check that store does not cross its vuses.  */
+      for (; !gsi_end_p (it); gsi_next (&it))
+	{
+	  curr_stmt = gsi_stmt (it);
+	  if (gimple_has_mem_ops (curr_stmt)
+	      && (gimple_vdef (curr_stmt) != NULL_TREE
+		  || gimple_vuse (curr_stmt) != NULL_TREE))
+	    return false;
+	}
+    }
+  else
+    {
+      /* Check that load does not cross any statement with vdef.  */
+      for (it = gsi_for_stmt (stmt); !gsi_end_p (it); gsi_next (&it))
+	{
+	  curr_stmt = gsi_stmt (it);
+	  if (gimple_vdef (curr_stmt) != NULL_TREE)
+	    return false;
+	}
+    }
+  return true;
+}
+
+/* The code below is trying to perform simple optimization - do not execute
+   masked store statement if its mask is zero mask since loads that follow
+   a masked store can be blocked.
+   It put all masked stores statements with the same mask into the new bb
+   with a check on zero mask.  */
+
+void
+optimize_mask_stores (struct loop *loop)
+{
+  basic_block bb = loop->header;
+  gimple_stmt_iterator gsi;
+  gimple stmt;
+  auto_vec<gimple> worklist;
+
+  if (loop->dont_vectorize
+      || loop->num_nodes > 2)
+    return;
+
+  /* Pick up all masked stores in loop if any.  */
+  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      stmt = gsi_stmt (gsi);
+      if (is_gimple_call (stmt)
+	  && gimple_call_internal_p (stmt)
+	  && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
+	worklist.safe_push (stmt);
+    }
+
+  if (worklist.is_empty ())
+    return;
+
+  /* Loop has masked stores.  */
+  while (!worklist.is_empty ())
+    {
+      gimple last, def_stmt, last_store;
+      edge e, efalse;
+      tree mask, val;
+      basic_block store_bb, join_bb;
+      gimple_stmt_iterator gsi_to;
+      gimple_seq gs;
+      tree arg3;
+      tree vdef, new_vdef;
+      gphi *phi;
+      bool first_dump;
+
+      last = worklist.pop ();
+      mask = gimple_call_arg (last, 2);
+      /* Loop was not vectorized if mask does not have vector type.  */
+      if (!VECTOR_TYPE_P (TREE_TYPE (mask)))
+	return;
+      val = make_ssa_name (integer_type_node);
+      gs = NULL;
+      /* Escape if target does not support test on zero mask.  */
+
+      if (!targetm.vectorize.build_zero_vector_test (mask, val, &gs))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf (MSG_NOTE, "Target does not support ptest!\n");
+	  return;
+	}
+      gcc_assert (gs != NULL);
+
+      /* Create new bb.  */
+      e = split_block (bb, last);
+      join_bb = e->dest;
+      store_bb = create_empty_bb (bb);
+      add_bb_to_loop (store_bb, loop);
+      e->flags = EDGE_TRUE_VALUE;
+      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
+      store_bb->frequency = bb->frequency / 2;
+      efalse->probability = REG_BR_PROB_BASE / 2;
+      make_edge (store_bb, join_bb, EDGE_FALLTHRU);
+      if (dom_info_available_p (CDI_DOMINATORS))
+	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
+      if (dom_info_available_p (CDI_POST_DOMINATORS))
+	set_immediate_dominator (CDI_POST_DOMINATORS, bb, store_bb);
+
+      /* Create zero test condition.  */
+      stmt = gimple_build_cond (NE_EXPR, val, integer_zero_node,
+				NULL_TREE, NULL_TREE);
+      gimple_seq_add_stmt (&gs, stmt);
+      gsi = gsi_last_bb (bb);
+      gsi_insert_seq_after (&gsi, gs, GSI_SAME_STMT);
+      /* Create new PHI node for vdef of the last masked store:
+	 .MEM_2 = VDEF <.MEM_1>
+	 will be converted to
+	 .MEM.3 = VDEF <.MEM_1>
+	 and new PHI node will be created in join bb
+	 .MEM_2 = PHI <.MEM_1, .MEM_3>
+      */
+      vdef = gimple_vdef (last);
+      gcc_assert (vdef && TREE_CODE (vdef) == SSA_NAME);
+      new_vdef = make_ssa_name (gimple_vop (cfun), last);
+      phi = create_phi_node (vdef, join_bb);
+      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
+      gimple_set_vdef (last, new_vdef);
+      first_dump = true;
+
+      /* Put all masked stores with the same mask to STORE_BB if possible.  */
+      while (true)
+	{
+	  /* Move masked store to STORE_BB.  */
+	  last_store = last;
+	  gsi = gsi_for_stmt (last);
+	  gsi_to = gsi_start_bb (store_bb);
+	  gsi_move_before (&gsi, &gsi_to);
+	  update_stmt (last);
+	  if (dump_enabled_p ())
+	    {
+	      /* Issue different messages depending on FIRST_DUMP.  */
+	      if (first_dump)
+		{
+		  dump_printf (MSG_NOTE, "Move MASK_STORE to new bb#%d\n",
+			       store_bb->index);
+		  first_dump = false;
+		}
+	      else
+		dump_printf (MSG_NOTE, "Move MASK_STORE to created bb\n");
+	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
+	    }
+	  /* Put definition statement of stored value in STORE_BB
+	     if possible.  */
+	  arg3 = gimple_call_arg (last, 3);
+	  if (TREE_CODE (arg3) == SSA_NAME && has_single_use (arg3))
+	    {
+	      def_stmt = SSA_NAME_DEF_STMT (arg3);
+	      /* Move def_stmt to STORE_BB if it is in the same bb and
+		 it is legal.  */
+	      if (gimple_bb (def_stmt) == bb
+		  && is_valid_sink (def_stmt))
+		{
+		  if (dump_enabled_p ())
+		    {
+		      dump_printf (MSG_NOTE, "Move stmt to created bb\n");
+		      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
+		    }
+		  gsi = gsi_for_stmt (def_stmt);
+		  gsi_to = gsi_start_bb (store_bb);
+		  gsi_move_before (&gsi, &gsi_to);
+		  update_stmt (def_stmt);
+		}
+	    }
+	    /* Put other masked stores with the same mask to STORE_BB.  */
+	    if (worklist.is_empty ()
+		|| gimple_call_arg (worklist.last (), 2) != mask
+		|| !is_valid_sink (worklist.last ()))
+	      break;
+	    last = worklist.pop ();
+	}
+      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
+    }
+}
Index: tree-vectorizer.c
===================================================================
--- tree-vectorizer.c	(revision 222288)
+++ tree-vectorizer.c	(working copy)
@@ -549,13 +549,19 @@
   for (i = 1; i < vect_loops_num; i++)
     {
       loop_vec_info loop_vinfo;
+      bool has_mask_store;
 
       loop = get_loop (cfun, i);
       if (!loop)
 	continue;
       loop_vinfo = (loop_vec_info) loop->aux;
+      has_mask_store = false;
+      if (loop_vinfo)
+	has_mask_store = LOOP_VINFO_HAS_MASK_STORE (loop_vinfo);
       destroy_loop_vec_info (loop_vinfo, true);
       loop->aux = NULL;
+      if (has_mask_store)
+	optimize_mask_stores (loop);
     }
 
   free_stmt_vec_info_vec ();
Index: tree-vectorizer.h
===================================================================
--- tree-vectorizer.h	(revision 222288)
+++ tree-vectorizer.h	(working copy)
@@ -375,6 +375,9 @@
      loop version without if-conversion.  */
   struct loop *scalar_loop;
 
+  /* Mark loops having masked stores.  */
+  bool has_mask_store;
+
 } *loop_vec_info;
 
 /* Access Functions.  */
@@ -412,6 +415,7 @@
 #define LOOP_VINFO_PEELING_FOR_NITER(L)    (L)->peeling_for_niter
 #define LOOP_VINFO_NO_DATA_DEPENDENCIES(L) (L)->no_data_dependencies
 #define LOOP_VINFO_SCALAR_LOOP(L)	   (L)->scalar_loop
+#define LOOP_VINFO_HAS_MASK_STORE(L)	   (L)->has_mask_store
 
 #define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \
   ((L)->may_misalign_stmts.length () > 0)
@@ -1044,6 +1048,7 @@
 			       vec<tree> *, slp_tree, int);
 extern tree vect_gen_perm_mask_any (tree, const unsigned char *);
 extern tree vect_gen_perm_mask_checked (tree, const unsigned char *);
+extern void optimize_mask_stores (struct loop *);
 
 /* In tree-vect-data-refs.c.  */
 extern bool vect_can_force_dr_alignment_p (const_tree, unsigned int);

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-05-20 14:10   ` Yuri Rumyantsev
@ 2015-05-29 14:28     ` Yuri Rumyantsev
  2015-06-09 12:15     ` Richard Biener
  1 sibling, 0 replies; 33+ messages in thread
From: Yuri Rumyantsev @ 2015-05-29 14:28 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, Igor Zamyatin

Hi Richard,

Did you have a chance to look at my updated patch?

Any comments will be appreciated.
Yuri.

2015-05-20 17:00 GMT+03:00 Yuri Rumyantsev <ysrumyan@gmail.com>:
> Hi All,
>
> Here is updated patch to optimize mask stores. The main goal of it is
> to avoid execution of mask store if its mask is zero vector since
> loads that follow it can be blocked.
> The following changes were done:
>   1.  A test on sink legality was added - it simply prohibits to cross
> statements with non-null vdef or vuse.
>   2. New phi node is created in join block for moved MASK_STORE statements.
>   3. Test was changed to check that 2 MASK_STORE statements are not
> moved to the same block.
>   4. New field was added to loop_vec_info structure to mark loops
> having MASK_STORE's.
>
> Any comments will be appreciated.
> Yuri.
>
> 2015-05-20  Yuri Rumyantsev  <ysrumyan@gmail.com>
>
> * config/i386/i386.c: Include files stringpool.h and tree-ssanames.h.
> (ix86_vectorize_is_zero_vector): New function.
> (TARGET_VECTORIZE_IS_ZERO_VECTOR): New target macro
> * doc/tm.texi.in: Add @hook TARGET_VECTORIZE_IS_ZERO_VECTOR.
> * doc/tm.texi: Updated.
> * target.def (is_zero_vector): New DEFHOOK.
> * tree-vect-stmts.c : Include tree-into-ssa.h.
> (vectorizable_mask_load_store): Initialize has_mask_store field.
> (is_valid_sink): New function.
> (optimize_mask_stores): New function.
> * tree-vectorizer.c (vectorize_loops): Invoke optimaze_mask_stores for
> loops having masked stores.
> * tree-vectorizer.h (loop_vec_info): Add new has_mask_store field and
> correspondent macros.
> (optimize_mask_stores): Update prototype.
>
> gcc/testsuite/ChangeLog:
> * gcc.target/i386/avx2-vect-mask-store-move1.c: New test.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-05-20 14:10   ` Yuri Rumyantsev
  2015-05-29 14:28     ` Yuri Rumyantsev
@ 2015-06-09 12:15     ` Richard Biener
  2015-06-18 15:41       ` Yuri Rumyantsev
  1 sibling, 1 reply; 33+ messages in thread
From: Richard Biener @ 2015-06-09 12:15 UTC (permalink / raw)
  To: Yuri Rumyantsev; +Cc: gcc-patches, Igor Zamyatin

On Wed, May 20, 2015 at 4:00 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
> Hi All,
>
> Here is updated patch to optimize mask stores. The main goal of it is
> to avoid execution of mask store if its mask is zero vector since
> loads that follow it can be blocked.
> The following changes were done:
>   1.  A test on sink legality was added - it simply prohibits to cross
> statements with non-null vdef or vuse.
>   2. New phi node is created in join block for moved MASK_STORE statements.
>   3. Test was changed to check that 2 MASK_STORE statements are not
> moved to the same block.
>   4. New field was added to loop_vec_info structure to mark loops
> having MASK_STORE's.
>
> Any comments will be appreciated.
> Yuri.

I still don't understand why you need the new target hook.  If we have a masked
load/store then the mask is computed by an assignment with a VEC_COND_EXPR
(in your example) and thus a test for a zero mask is expressible as just

 if (vect__ifc__41.17_167 == { 0, 0, 0, 0... })

or am I missing something?  As we dont' want this transform on all targets
(or always) we can control it by either a --param or a new flag which default
targets can adjust.  [Is the hazard still present with Skylake or other
AVX512 implementations for example?]

+/* Helper for optimize_mask_stores: returns true if STMT sinking to end
+   of BB is valid and false otherwise.  */
+
+static bool
+is_valid_sink (gimple stmt)
+{

so STMT is either a masked store or a masked load?  You shouldn't
walk all stmts here but instead rely on the factored use-def def-use
chains of virtual operands.

+void
+optimize_mask_stores (struct loop *loop)
+{
+  basic_block bb = loop->header;
+  gimple_stmt_iterator gsi;
+  gimple stmt;
+  auto_vec<gimple> worklist;
+
+  if (loop->dont_vectorize
+      || loop->num_nodes > 2)
+    return;

looks like no longer needed given the place this is called from now
(or does it guard against outer loop vectorization as well?)
Also put it into tree-vect-loop.c and make it static.

+      /* Loop was not vectorized if mask does not have vector type.  */
+      if (!VECTOR_TYPE_P (TREE_TYPE (mask)))
+       return;

this should be always false

+      store_bb->frequency = bb->frequency / 2;
+      efalse->probability = REG_BR_PROB_BASE / 2;

I think the == 0 case should end up as unlikely.

+      if (dom_info_available_p (CDI_POST_DOMINATORS))
+       set_immediate_dominator (CDI_POST_DOMINATORS, bb, store_bb);

post dominators are not available in the vectorizer.

+         /* Put definition statement of stored value in STORE_BB
+            if possible.  */
+         arg3 = gimple_call_arg (last, 3);
+         if (TREE_CODE (arg3) == SSA_NAME && has_single_use (arg3))
+           {
+             def_stmt = SSA_NAME_DEF_STMT (arg3);
+             /* Move def_stmt to STORE_BB if it is in the same bb and
+                it is legal.  */
+             if (gimple_bb (def_stmt) == bb
+                 && is_valid_sink (def_stmt))

ok, so you move arbitrary statements.  You can always move non-memory
statements and if you keep track of the last store you moved you
can check if gimple_vuse () is the same as on that last store and be
done with that, or if you sink another store (under the same condition)
then just update the last store.

Otherwise this looks better now.

Thus - get rid of the target hook and of is_valid_sink.

Thanks,
Richard.

> 2015-05-20  Yuri Rumyantsev  <ysrumyan@gmail.com>
>
> * config/i386/i386.c: Include files stringpool.h and tree-ssanames.h.
> (ix86_vectorize_is_zero_vector): New function.
> (TARGET_VECTORIZE_IS_ZERO_VECTOR): New target macro
> * doc/tm.texi.in: Add @hook TARGET_VECTORIZE_IS_ZERO_VECTOR.
> * doc/tm.texi: Updated.
> * target.def (is_zero_vector): New DEFHOOK.
> * tree-vect-stmts.c : Include tree-into-ssa.h.
> (vectorizable_mask_load_store): Initialize has_mask_store field.
> (is_valid_sink): New function.
> (optimize_mask_stores): New function.
> * tree-vectorizer.c (vectorize_loops): Invoke optimaze_mask_stores for
> loops having masked stores.
> * tree-vectorizer.h (loop_vec_info): Add new has_mask_store field and
> correspondent macros.
> (optimize_mask_stores): Update prototype.
>
> gcc/testsuite/ChangeLog:
> * gcc.target/i386/avx2-vect-mask-store-move1.c: New test.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-06-09 12:15     ` Richard Biener
@ 2015-06-18 15:41       ` Yuri Rumyantsev
  2015-07-07 13:55         ` Yuri Rumyantsev
  2015-07-10  5:51         ` Jeff Law
  0 siblings, 2 replies; 33+ messages in thread
From: Yuri Rumyantsev @ 2015-06-18 15:41 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, Igor Zamyatin

[-- Attachment #1: Type: text/plain, Size: 6531 bytes --]

Richard,

Here is updated patch which does not include your proposal related to
the target hook deletion.
You wrote:
> I still don't understand why you need the new target hook.  If we have a masked
> load/store then the mask is computed by an assignment with a VEC_COND_EXPR
> (in your example) and thus a test for a zero mask is expressible as just
>
 > if (vect__ifc__41.17_167 == { 0, 0, 0, 0... })
>
> or am I missing something?
Such vector compare produces vector and does not set up cc flags
required for conditional branch (GIMPLE_COND).
If we use such comparison for GIMPLE_COND we got error message, so I
used target hook which does set up cc flags aka ptest instruction and
I left this part.
I moved new routines to loop-vectorizer.c file and both they are static.
I re-wrote is_valid_sink function to use def-use chain as you proposed.
I also add new parameter to control such transformation.
Few redundant tests have also been deleted.
Any comments will be appreciated.

Thanks.
Yuri.

2015-06-18  Yuri Rumyantsev  <ysrumyan@gmail.com>

* config/i386/i386.c: Include files stringpool.h and tree-ssanames.h.
(ix86_vectorize_build_zero_vector_test): New function.
(TARGET_VECTORIZE_BUILD_ZERO_VECTOR_TEST): New target macro
* doc/tm.texi.in: Add @hook TARGET_VECTORIZE_BUILD_ZERO_VECTOR_TEST.
* doc/tm.texi: Updated.
* params.def (PARAM_ZERO_TEST_FOR_STORE_MASK): New DEFPARAM.
* params.h (ENABLE_ZERO_TEST_FOR_STORE_MASK): new macros.
* target.def (build_zero_vector_test): New DEFHOOK.
* tree-vect-stmts.c (vectorizable_mask_load_store): Initialize
has_mask_store field of vect_info.
* tree-vectorizer.c: Include files stringpool.h and tree-ssanames.h.
(is_valid_sink): New function.
(optimize_mask_stores): New function.
(vectorize_loops): Invoke optimaze_mask_stores for loops having masked
stores.
* tree-vectorizer.h (loop_vec_info): Add new has_mask_store field and
correspondent macros.

gcc/testsuite/ChangeLog:
* gcc.target/i386/avx2-vect-mask-store-move1.c: New test.



2015-06-09 15:13 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
> On Wed, May 20, 2015 at 4:00 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>> Hi All,
>>
>> Here is updated patch to optimize mask stores. The main goal of it is
>> to avoid execution of mask store if its mask is zero vector since
>> loads that follow it can be blocked.
>> The following changes were done:
>>   1.  A test on sink legality was added - it simply prohibits to cross
>> statements with non-null vdef or vuse.
>>   2. New phi node is created in join block for moved MASK_STORE statements.
>>   3. Test was changed to check that 2 MASK_STORE statements are not
>> moved to the same block.
>>   4. New field was added to loop_vec_info structure to mark loops
>> having MASK_STORE's.
>>
>> Any comments will be appreciated.
>> Yuri.
>
> I still don't understand why you need the new target hook.  If we have a masked
> load/store then the mask is computed by an assignment with a VEC_COND_EXPR
> (in your example) and thus a test for a zero mask is expressible as just
>
>  if (vect__ifc__41.17_167 == { 0, 0, 0, 0... })
>
> or am I missing something?  As we dont' want this transform on all targets
> (or always) we can control it by either a --param or a new flag which default
> targets can adjust.  [Is the hazard still present with Skylake or other
> AVX512 implementations for example?]
>
> +/* Helper for optimize_mask_stores: returns true if STMT sinking to end
> +   of BB is valid and false otherwise.  */
> +
> +static bool
> +is_valid_sink (gimple stmt)
> +{
>
> so STMT is either a masked store or a masked load?  You shouldn't
> walk all stmts here but instead rely on the factored use-def def-use
> chains of virtual operands.
>
> +void
> +optimize_mask_stores (struct loop *loop)
> +{
> +  basic_block bb = loop->header;
> +  gimple_stmt_iterator gsi;
> +  gimple stmt;
> +  auto_vec<gimple> worklist;
> +
> +  if (loop->dont_vectorize
> +      || loop->num_nodes > 2)
> +    return;
>
> looks like no longer needed given the place this is called from now
> (or does it guard against outer loop vectorization as well?)
> Also put it into tree-vect-loop.c and make it static.
>
> +      /* Loop was not vectorized if mask does not have vector type.  */
> +      if (!VECTOR_TYPE_P (TREE_TYPE (mask)))
> +       return;
>
> this should be always false
>
> +      store_bb->frequency = bb->frequency / 2;
> +      efalse->probability = REG_BR_PROB_BASE / 2;
>
> I think the == 0 case should end up as unlikely.
>
> +      if (dom_info_available_p (CDI_POST_DOMINATORS))
> +       set_immediate_dominator (CDI_POST_DOMINATORS, bb, store_bb);
>
> post dominators are not available in the vectorizer.
>
> +         /* Put definition statement of stored value in STORE_BB
> +            if possible.  */
> +         arg3 = gimple_call_arg (last, 3);
> +         if (TREE_CODE (arg3) == SSA_NAME && has_single_use (arg3))
> +           {
> +             def_stmt = SSA_NAME_DEF_STMT (arg3);
> +             /* Move def_stmt to STORE_BB if it is in the same bb and
> +                it is legal.  */
> +             if (gimple_bb (def_stmt) == bb
> +                 && is_valid_sink (def_stmt))
>
> ok, so you move arbitrary statements.  You can always move non-memory
> statements and if you keep track of the last store you moved you
> can check if gimple_vuse () is the same as on that last store and be
> done with that, or if you sink another store (under the same condition)
> then just update the last store.
>
> Otherwise this looks better now.
>
> Thus - get rid of the target hook and of is_valid_sink.
>
> Thanks,
> Richard.
>
>> 2015-05-20  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>
>> * config/i386/i386.c: Include files stringpool.h and tree-ssanames.h.
>> (ix86_vectorize_is_zero_vector): New function.
>> (TARGET_VECTORIZE_IS_ZERO_VECTOR): New target macro
>> * doc/tm.texi.in: Add @hook TARGET_VECTORIZE_IS_ZERO_VECTOR.
>> * doc/tm.texi: Updated.
>> * target.def (is_zero_vector): New DEFHOOK.
>> * tree-vect-stmts.c : Include tree-into-ssa.h.
>> (vectorizable_mask_load_store): Initialize has_mask_store field.
>> (is_valid_sink): New function.
>> (optimize_mask_stores): New function.
>> * tree-vectorizer.c (vectorize_loops): Invoke optimaze_mask_stores for
>> loops having masked stores.
>> * tree-vectorizer.h (loop_vec_info): Add new has_mask_store field and
>> correspondent macros.
>> (optimize_mask_stores): Update prototype.
>>
>> gcc/testsuite/ChangeLog:
>> * gcc.target/i386/avx2-vect-mask-store-move1.c: New test.

[-- Attachment #2: patch.3 --]
[-- Type: application/octet-stream, Size: 13528 bytes --]

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 44a8624..e90de32 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -100,6 +100,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-iterator.h"
 #include "tree-chkp.h"
 #include "rtl-chkp.h"
+#include "stringpool.h"
+#include "tree-ssanames.h"
 
 static rtx legitimize_dllimport_symbol (rtx, bool);
 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
@@ -41100,6 +41102,47 @@ ix86_vectorize_builtin_gather (const_tree mem_vectype,
   return ix86_get_builtin (code);
 }
 
+/* Returns true if SOURCE type is supported by builtin ptest.
+   NAME is lhs of created ptest call.  All created statements are added
+   to GS.  */
+
+static bool
+ix86_vectorize_build_zero_vector_test (tree source, tree name, gimple_seq *gs)
+{
+  tree type = TREE_TYPE (source);
+  gimple stmt;
+  enum ix86_builtins code;
+  tree decl, new_type, conv_expr, vec_tmp;
+
+  gcc_assert (VECTOR_TYPE_P (type));
+  if (!TARGET_AVX)
+    return false;
+
+  switch (tree_to_uhwi (TYPE_SIZE (type)))
+    {
+    case 128:
+      code = IX86_BUILTIN_PTESTZ;
+      break;
+    case 256:
+      if (!TARGET_AVX2)
+	return false;
+      code = IX86_BUILTIN_PTESTZ256;
+      break;
+    default:
+      return false;
+    }
+  decl = ix86_builtin_decl (code, true);
+  new_type = get_same_sized_vectype (long_long_integer_type_node, type);
+  conv_expr = build1 (VIEW_CONVERT_EXPR, new_type, source);
+  vec_tmp = make_ssa_name (new_type);
+  stmt = gimple_build_assign (vec_tmp, conv_expr);
+  gimple_seq_add_stmt (gs, stmt);
+  stmt = gimple_build_call (decl, 2, vec_tmp, vec_tmp);
+  gimple_call_set_lhs (stmt, name);
+  gimple_seq_add_stmt (gs, stmt);
+  return true;
+}
+
 /* Returns a code for a target-specific builtin that implements
    reciprocal of the function, or NULL_TREE if not available.  */
 
@@ -51947,6 +51990,10 @@ ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
 #undef TARGET_VECTORIZE_BUILTIN_GATHER
 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
 
+#undef TARGET_VECTORIZE_BUILD_ZERO_VECTOR_TEST
+#define TARGET_VECTORIZE_BUILD_ZERO_VECTOR_TEST \
+  ix86_vectorize_build_zero_vector_test
+
 #undef TARGET_BUILTIN_RECIPROCAL
 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
 
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 93fb41c..70d04ee 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4239,6 +4239,8 @@ address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_VECTORIZE_BUILTIN_GATHER
 
+@hook TARGET_VECTORIZE_BUILD_ZERO_VECTOR_TEST
+
 @hook TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
 
 @hook TARGET_SIMD_CLONE_ADJUST
diff --git a/gcc/params.def b/gcc/params.def
index 3e4ba3a..9e8de11 100644
--- a/gcc/params.def
+++ b/gcc/params.def
@@ -1069,6 +1069,12 @@ DEFPARAM (PARAM_MAX_STORES_TO_SINK,
           "Maximum number of conditional store pairs that can be sunk",
           2, 0, 0)
 
+/* Enable inserion test on zero mask for masked stores if non-zero.  */
+DEFPARAM (PARAM_ZERO_TEST_FOR_STORE_MASK,
+	  "zero-test-for-store-mask",
+	  "Enable insertion of test on zero mask for masked stores",
+	  1, 0, 1)
+
 /* Override CASE_VALUES_THRESHOLD of when to switch from doing switch
    statements via if statements to using a table jump operation.  If the value
    is 0, the default CASE_VALUES_THRESHOLD will be used.  */
diff --git a/gcc/params.h b/gcc/params.h
index f53426d..a3ea081 100644
--- a/gcc/params.h
+++ b/gcc/params.h
@@ -218,6 +218,8 @@ extern void init_param_values (int *params);
   PARAM_VALUE (PARAM_MIN_NONDEBUG_INSN_UID)
 #define MAX_STORES_TO_SINK \
   PARAM_VALUE (PARAM_MAX_STORES_TO_SINK)
+#define ENABLE_ZERO_TEST_FOR_STORE_MASK \
+  PARAM_VALUE (PARAM_ZERO_TEST_FOR_STORE_MASK)
 #define ALLOW_LOAD_DATA_RACES \
   PARAM_VALUE (PARAM_ALLOW_LOAD_DATA_RACES)
 #define ALLOW_STORE_DATA_RACES \
diff --git a/gcc/target.def b/gcc/target.def
index b606b81..43b4586 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1801,6 +1801,19 @@ loads.",
  (const_tree mem_vectype, const_tree index_type, int scale),
  NULL)
 
+/* Returns true if target support zero vector predicate for given vector
+   type.  */
+DEFHOOK
+(build_zero_vector_test,
+ "This hook should return boolean true if target supports zero vector\n\
+predicate.  @var{source} is the compared vector, @var{name} is ssa_name\n\
+containing boolean value true if all vector elements are zero and produced\n\
+statements are saved in @var{gs}.\n\
+The default is @code{false} which means that target does not support it.",
+ bool,
+ (tree source, tree name, gimple_seq *gs),
+ hook_bool_rtx_false)
+
 /* Target function to initialize the cost model for a loop or block.  */
 DEFHOOK
 (init_cost,
diff --git a/gcc/testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c b/gcc/testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c
new file mode 100644
index 0000000..f0c3638
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target avx2 } */
+/* { dg-options "-mavx2 -O3 -fopenmp-simd -fdump-tree-vect-details" } */
+
+extern int *p1, *p2, *p3;
+int c[256];
+
+void foo (int n)
+{
+  int i;
+  for (i=0; i<n; i++)
+    if (c[i])
+      {
+	p1[i] += 1;
+	p2[i] = p3[i] +2;
+      }
+}
+
+/* { dg-final { scan-tree-dump-times "Move MASK_STORE to new bb" 2 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 7ba0d8f..e31479b 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -2072,6 +2072,7 @@ vectorizable_mask_load_store (gimple stmt, gimple_stmt_iterator *gsi,
     {
       tree vec_rhs = NULL_TREE, vec_mask = NULL_TREE;
       prev_stmt_info = NULL;
+      LOOP_VINFO_HAS_MASK_STORE (loop_vinfo) = true;
       for (i = 0; i < ncopies; i++)
 	{
 	  unsigned align, misalign;
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index ff46a52..debb351 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -92,7 +92,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "dbgcnt.h"
 #include "gimple-fold.h"
 #include "tree-scalar-evolution.h"
-
+#include "stringpool.h"
+#include "tree-ssanames.h"
 
 /* Loop or bb location.  */
 source_location vect_location;
@@ -422,6 +423,184 @@ set_uid_loop_bbs (loop_vec_info loop_vinfo, gimple loop_vectorized_call)
   free (bbs);
 }
 
+/* Helper for optimize_mask_stores: returns true if STMT sinking to end
+   of BB is valid and false otherwise.  */
+
+static bool
+is_valid_sink (gimple stmt, gimple last_store)
+{
+  tree vdef;
+  imm_use_iterator imm_it;
+  gimple use_stmt;
+  basic_block bb = gimple_bb (stmt);
+
+  if ((vdef = gimple_vdef (stmt)))
+    {
+      int cross = 0;
+      /* Check that ther are no store vuses in current bb.  */
+      FOR_EACH_IMM_USE_STMT (use_stmt, imm_it, vdef)
+	if (gimple_bb (use_stmt) == bb)
+	  cross++;
+
+      if (cross == 0)
+	return true;
+    }
+  else if (gimple_vuse (stmt) == NULL_TREE)
+    return true;
+  else if (gimple_vuse (stmt) == gimple_vuse (last_store))
+    return true;
+  return false;
+}
+
+/* The code below is trying to perform simple optimization - do not execute
+   masked store statement if its mask is zero mask since loads that follow
+   a masked store can be blocked.
+   It put all masked stores statements with the same mask into the new bb
+   with a check on zero mask.  */
+
+static void
+optimize_mask_stores (struct loop *loop)
+{
+  basic_block bb = loop->header;
+  gimple_stmt_iterator gsi;
+  gimple stmt;
+  auto_vec<gimple> worklist;
+
+  if (ENABLE_ZERO_TEST_FOR_STORE_MASK == 0)
+    return;
+
+  /* Pick up all masked stores in loop if any.  */
+  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      stmt = gsi_stmt (gsi);
+      if (is_gimple_call (stmt)
+	  && gimple_call_internal_p (stmt)
+	  && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
+	worklist.safe_push (stmt);
+    }
+
+  if (worklist.is_empty ())
+    return;
+
+  /* Loop has masked stores.  */
+  while (!worklist.is_empty ())
+    {
+      gimple last, def_stmt, last_store;
+      edge e, efalse;
+      tree mask, val;
+      basic_block store_bb, join_bb;
+      gimple_stmt_iterator gsi_to;
+      gimple_seq gs;
+      tree arg3;
+      tree vdef, new_vdef;
+      gphi *phi;
+      bool first_dump;
+
+      last = worklist.pop ();
+      mask = gimple_call_arg (last, 2);
+      val = make_ssa_name (integer_type_node);
+      gs = NULL;
+      /* Escape if target does not support test on zero mask.  */
+
+      if (!targetm.vectorize.build_zero_vector_test (mask, val, &gs))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf (MSG_NOTE, "Target does not support ptest!\n");
+	  return;
+	}
+      gcc_assert (gs != NULL);
+
+      /* Create new bb.  */
+      e = split_block (bb, last);
+      join_bb = e->dest;
+      store_bb = create_empty_bb (bb);
+      add_bb_to_loop (store_bb, loop);
+      e->flags = EDGE_TRUE_VALUE;
+      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
+      /* Don't want to put STORE_BB to unlikely part and use
+	 50% probability.  */
+      store_bb->frequency = bb->frequency / 2;
+      efalse->probability = REG_BR_PROB_BASE / 2;
+      make_edge (store_bb, join_bb, EDGE_FALLTHRU);
+      if (dom_info_available_p (CDI_DOMINATORS))
+	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
+
+      /* Create zero test condition.  */
+      stmt = gimple_build_cond (NE_EXPR, val, integer_zero_node,
+				NULL_TREE, NULL_TREE);
+      gimple_seq_add_stmt (&gs, stmt);
+      gsi = gsi_last_bb (bb);
+      gsi_insert_seq_after (&gsi, gs, GSI_SAME_STMT);
+      /* Create new PHI node for vdef of the last masked store:
+	 .MEM_2 = VDEF <.MEM_1>
+	 will be converted to
+	 .MEM.3 = VDEF <.MEM_1>
+	 and new PHI node will be created in join bb
+	 .MEM_2 = PHI <.MEM_1, .MEM_3>
+      */
+      vdef = gimple_vdef (last);
+      gcc_assert (vdef && TREE_CODE (vdef) == SSA_NAME);
+      new_vdef = make_ssa_name (gimple_vop (cfun), last);
+      phi = create_phi_node (vdef, join_bb);
+      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
+      gimple_set_vdef (last, new_vdef);
+      first_dump = true;
+
+      /* Put all masked stores with the same mask to STORE_BB if possible.  */
+      while (true)
+	{
+	  /* Move masked store to STORE_BB.  */
+	  last_store = last;
+	  gsi = gsi_for_stmt (last);
+	  gsi_to = gsi_start_bb (store_bb);
+	  gsi_move_before (&gsi, &gsi_to);
+	  update_stmt (last);
+	  if (dump_enabled_p ())
+	    {
+	      /* Issue different messages depending on FIRST_DUMP.  */
+	      if (first_dump)
+		{
+		  dump_printf (MSG_NOTE, "Move MASK_STORE to new bb#%d\n",
+			       store_bb->index);
+		  first_dump = false;
+		}
+	      else
+		dump_printf (MSG_NOTE, "Move MASK_STORE to created bb\n");
+	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
+	    }
+	  /* Put definition statement of stored value in STORE_BB
+	     if possible.  */
+	  arg3 = gimple_call_arg (last, 3);
+	  if (TREE_CODE (arg3) == SSA_NAME && has_single_use (arg3))
+	    {
+	      def_stmt = SSA_NAME_DEF_STMT (arg3);
+	      /* Move def_stmt to STORE_BB if it is in the same bb and
+		 it is legal.  */
+	      if (gimple_bb (def_stmt) == bb
+		  && is_valid_sink (def_stmt, last_store))
+		{
+		  if (dump_enabled_p ())
+		    {
+		      dump_printf (MSG_NOTE, "Move stmt to created bb\n");
+		      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
+		    }
+		  gsi = gsi_for_stmt (def_stmt);
+		  gsi_to = gsi_start_bb (store_bb);
+		  gsi_move_before (&gsi, &gsi_to);
+		  update_stmt (def_stmt);
+		}
+	    }
+	    /* Put other masked stores with the same mask to STORE_BB.  */
+	    if (worklist.is_empty ()
+		|| gimple_call_arg (worklist.last (), 2) != mask
+		|| !is_valid_sink (worklist.last (), last_store))
+	      break;
+	    last = worklist.pop ();
+	}
+      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
+    }
+}
+
 /* Function vectorize_loops.
 
    Entry point to loop vectorization phase.  */
@@ -542,13 +721,19 @@ vectorize_loops (void)
   for (i = 1; i < vect_loops_num; i++)
     {
       loop_vec_info loop_vinfo;
+      bool has_mask_store;
 
       loop = get_loop (cfun, i);
       if (!loop)
 	continue;
       loop_vinfo = (loop_vec_info) loop->aux;
+      has_mask_store = false;
+      if (loop_vinfo)
+	has_mask_store = LOOP_VINFO_HAS_MASK_STORE (loop_vinfo);
       destroy_loop_vec_info (loop_vinfo, true);
       loop->aux = NULL;
+      if (has_mask_store)
+	optimize_mask_stores (loop);
     }
 
   free_stmt_vec_info_vec ();
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index b7ce941..eb8e19c 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -369,6 +369,9 @@ typedef struct _loop_vec_info {
      loop version without if-conversion.  */
   struct loop *scalar_loop;
 
+  /* Mark loops having masked stores.  */
+  bool has_mask_store;
+
 } *loop_vec_info;
 
 /* Access Functions.  */
@@ -406,6 +409,7 @@ typedef struct _loop_vec_info {
 #define LOOP_VINFO_PEELING_FOR_NITER(L)    (L)->peeling_for_niter
 #define LOOP_VINFO_NO_DATA_DEPENDENCIES(L) (L)->no_data_dependencies
 #define LOOP_VINFO_SCALAR_LOOP(L)	   (L)->scalar_loop
+#define LOOP_VINFO_HAS_MASK_STORE(L)	   (L)->has_mask_store
 
 #define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \
   ((L)->may_misalign_stmts.length () > 0)

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-06-18 15:41       ` Yuri Rumyantsev
@ 2015-07-07 13:55         ` Yuri Rumyantsev
  2015-07-10  5:51         ` Jeff Law
  1 sibling, 0 replies; 33+ messages in thread
From: Yuri Rumyantsev @ 2015-07-07 13:55 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, Igor Zamyatin

Hi Richard,

Did you have a chance to look at this updated patch?

Thanks.
Yuri.

2015-06-18 17:32 GMT+03:00 Yuri Rumyantsev <ysrumyan@gmail.com>:
> Richard,
>
> Here is updated patch which does not include your proposal related to
> the target hook deletion.
> You wrote:
>> I still don't understand why you need the new target hook.  If we have a masked
>> load/store then the mask is computed by an assignment with a VEC_COND_EXPR
>> (in your example) and thus a test for a zero mask is expressible as just
>>
>  > if (vect__ifc__41.17_167 == { 0, 0, 0, 0... })
>>
>> or am I missing something?
> Such vector compare produces vector and does not set up cc flags
> required for conditional branch (GIMPLE_COND).
> If we use such comparison for GIMPLE_COND we got error message, so I
> used target hook which does set up cc flags aka ptest instruction and
> I left this part.
> I moved new routines to loop-vectorizer.c file and both they are static.
> I re-wrote is_valid_sink function to use def-use chain as you proposed.
> I also add new parameter to control such transformation.
> Few redundant tests have also been deleted.
> Any comments will be appreciated.
>
> Thanks.
> Yuri.
>
> 2015-06-18  Yuri Rumyantsev  <ysrumyan@gmail.com>
>
> * config/i386/i386.c: Include files stringpool.h and tree-ssanames.h.
> (ix86_vectorize_build_zero_vector_test): New function.
> (TARGET_VECTORIZE_BUILD_ZERO_VECTOR_TEST): New target macro
> * doc/tm.texi.in: Add @hook TARGET_VECTORIZE_BUILD_ZERO_VECTOR_TEST.
> * doc/tm.texi: Updated.
> * params.def (PARAM_ZERO_TEST_FOR_STORE_MASK): New DEFPARAM.
> * params.h (ENABLE_ZERO_TEST_FOR_STORE_MASK): new macros.
> * target.def (build_zero_vector_test): New DEFHOOK.
> * tree-vect-stmts.c (vectorizable_mask_load_store): Initialize
> has_mask_store field of vect_info.
> * tree-vectorizer.c: Include files stringpool.h and tree-ssanames.h.
> (is_valid_sink): New function.
> (optimize_mask_stores): New function.
> (vectorize_loops): Invoke optimaze_mask_stores for loops having masked
> stores.
> * tree-vectorizer.h (loop_vec_info): Add new has_mask_store field and
> correspondent macros.
>
> gcc/testsuite/ChangeLog:
> * gcc.target/i386/avx2-vect-mask-store-move1.c: New test.
>
>
>
> 2015-06-09 15:13 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>> On Wed, May 20, 2015 at 4:00 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>> Hi All,
>>>
>>> Here is updated patch to optimize mask stores. The main goal of it is
>>> to avoid execution of mask store if its mask is zero vector since
>>> loads that follow it can be blocked.
>>> The following changes were done:
>>>   1.  A test on sink legality was added - it simply prohibits to cross
>>> statements with non-null vdef or vuse.
>>>   2. New phi node is created in join block for moved MASK_STORE statements.
>>>   3. Test was changed to check that 2 MASK_STORE statements are not
>>> moved to the same block.
>>>   4. New field was added to loop_vec_info structure to mark loops
>>> having MASK_STORE's.
>>>
>>> Any comments will be appreciated.
>>> Yuri.
>>
>> I still don't understand why you need the new target hook.  If we have a masked
>> load/store then the mask is computed by an assignment with a VEC_COND_EXPR
>> (in your example) and thus a test for a zero mask is expressible as just
>>
>>  if (vect__ifc__41.17_167 == { 0, 0, 0, 0... })
>>
>> or am I missing something?  As we dont' want this transform on all targets
>> (or always) we can control it by either a --param or a new flag which default
>> targets can adjust.  [Is the hazard still present with Skylake or other
>> AVX512 implementations for example?]
>>
>> +/* Helper for optimize_mask_stores: returns true if STMT sinking to end
>> +   of BB is valid and false otherwise.  */
>> +
>> +static bool
>> +is_valid_sink (gimple stmt)
>> +{
>>
>> so STMT is either a masked store or a masked load?  You shouldn't
>> walk all stmts here but instead rely on the factored use-def def-use
>> chains of virtual operands.
>>
>> +void
>> +optimize_mask_stores (struct loop *loop)
>> +{
>> +  basic_block bb = loop->header;
>> +  gimple_stmt_iterator gsi;
>> +  gimple stmt;
>> +  auto_vec<gimple> worklist;
>> +
>> +  if (loop->dont_vectorize
>> +      || loop->num_nodes > 2)
>> +    return;
>>
>> looks like no longer needed given the place this is called from now
>> (or does it guard against outer loop vectorization as well?)
>> Also put it into tree-vect-loop.c and make it static.
>>
>> +      /* Loop was not vectorized if mask does not have vector type.  */
>> +      if (!VECTOR_TYPE_P (TREE_TYPE (mask)))
>> +       return;
>>
>> this should be always false
>>
>> +      store_bb->frequency = bb->frequency / 2;
>> +      efalse->probability = REG_BR_PROB_BASE / 2;
>>
>> I think the == 0 case should end up as unlikely.
>>
>> +      if (dom_info_available_p (CDI_POST_DOMINATORS))
>> +       set_immediate_dominator (CDI_POST_DOMINATORS, bb, store_bb);
>>
>> post dominators are not available in the vectorizer.
>>
>> +         /* Put definition statement of stored value in STORE_BB
>> +            if possible.  */
>> +         arg3 = gimple_call_arg (last, 3);
>> +         if (TREE_CODE (arg3) == SSA_NAME && has_single_use (arg3))
>> +           {
>> +             def_stmt = SSA_NAME_DEF_STMT (arg3);
>> +             /* Move def_stmt to STORE_BB if it is in the same bb and
>> +                it is legal.  */
>> +             if (gimple_bb (def_stmt) == bb
>> +                 && is_valid_sink (def_stmt))
>>
>> ok, so you move arbitrary statements.  You can always move non-memory
>> statements and if you keep track of the last store you moved you
>> can check if gimple_vuse () is the same as on that last store and be
>> done with that, or if you sink another store (under the same condition)
>> then just update the last store.
>>
>> Otherwise this looks better now.
>>
>> Thus - get rid of the target hook and of is_valid_sink.
>>
>> Thanks,
>> Richard.
>>
>>> 2015-05-20  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>>
>>> * config/i386/i386.c: Include files stringpool.h and tree-ssanames.h.
>>> (ix86_vectorize_is_zero_vector): New function.
>>> (TARGET_VECTORIZE_IS_ZERO_VECTOR): New target macro
>>> * doc/tm.texi.in: Add @hook TARGET_VECTORIZE_IS_ZERO_VECTOR.
>>> * doc/tm.texi: Updated.
>>> * target.def (is_zero_vector): New DEFHOOK.
>>> * tree-vect-stmts.c : Include tree-into-ssa.h.
>>> (vectorizable_mask_load_store): Initialize has_mask_store field.
>>> (is_valid_sink): New function.
>>> (optimize_mask_stores): New function.
>>> * tree-vectorizer.c (vectorize_loops): Invoke optimaze_mask_stores for
>>> loops having masked stores.
>>> * tree-vectorizer.h (loop_vec_info): Add new has_mask_store field and
>>> correspondent macros.
>>> (optimize_mask_stores): Update prototype.
>>>
>>> gcc/testsuite/ChangeLog:
>>> * gcc.target/i386/avx2-vect-mask-store-move1.c: New test.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-06-18 15:41       ` Yuri Rumyantsev
  2015-07-07 13:55         ` Yuri Rumyantsev
@ 2015-07-10  5:51         ` Jeff Law
  2015-07-20 15:26           ` Yuri Rumyantsev
  1 sibling, 1 reply; 33+ messages in thread
From: Jeff Law @ 2015-07-10  5:51 UTC (permalink / raw)
  To: Yuri Rumyantsev, Richard Biener; +Cc: gcc-patches, Igor Zamyatin

On 06/18/2015 08:32 AM, Yuri Rumyantsev wrote:
> Richard,
>
> Here is updated patch which does not include your proposal related to
> the target hook deletion.
> You wrote:
>> I still don't understand why you need the new target hook.  If we have a masked
>> load/store then the mask is computed by an assignment with a VEC_COND_EXPR
>> (in your example) and thus a test for a zero mask is expressible as just
>>
>   > if (vect__ifc__41.17_167 == { 0, 0, 0, 0... })
>>
>> or am I missing something?
> Such vector compare produces vector and does not set up cc flags
> required for conditional branch (GIMPLE_COND).
> If we use such comparison for GIMPLE_COND we got error message, so I
> used target hook which does set up cc flags aka ptest instruction and
> I left this part.
I think we need to look for something better.  I really don't like the 
idea of using a target hook like this within the gimple optimizers 
unless it's absolutely necessary.

How are conditionals on vectors usually handled?  You should try to 
mimick that and report, with detail, why that's not working.

I'm also not happy with the mechanisms to determine whether or not we 
should make this transformation.  I'm willing to hash through other 
details and come back to this issue once we've got some of the other 
stuff figured out.  I guess a new flag with the target adjusting is the 
fallback if we can't come up with some reasonable way to select this on 
or off.

The reason I don't like having the target files make this kind of 
decision is it makes more gimple transformations target dependent. 
Based on the history with RTL, that ultimately leads to an unwieldy mess.

And yes, I know gimple isn't 100% target independent -- but that doesn't 
mean we should keep adding more target dependencies.  Each one we add 
needs to be well vetted.


patch.3

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 44a8624..e90de32 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -100,6 +100,8 @@ along with GCC; see the file COPYING3.  If not see
  #include "tree-iterator.h"
  #include "tree-chkp.h"
  #include "rtl-chkp.h"
+#include "stringpool.h"
+#include "tree-ssanames.h"
So ideally we'll figure out why you're unable to generate a suitable 
conditional at the gimple level and the need for the x86 backend to 
generate the vector zero test will go away.  And if it does go away, we 
want those #includes to disappear.

Additionally, instead of including stringpool.h & tree-ssanames.h, 
include "ssa.h" -- as a general rule.


  static rtx legitimize_dllimport_symbol (rtx, bool);
  static rtx legitimize_pe_coff_extern_decl (rtx, bool);
@@ -41100,6 +41102,47 @@ ix86_vectorize_builtin_gather (const_tree 
mem_vectype,
    return ix86_get_builtin (code);
  }

+/* Returns true if SOURCE type is supported by builtin ptest.
+   NAME is lhs of created ptest call.  All created statements are added
+   to GS.  */
+
+static bool
+ix86_vectorize_build_zero_vector_test (tree source, tree name,

Given the stated goal of not doing this in the target files, I'm not 
going to actually look at this routine or any of the infrastructure for 
this target hook.


diff --git a/gcc/params.def b/gcc/params.def
index 3e4ba3a..9e8de11 100644
--- a/gcc/params.def
+++ b/gcc/params.def
@@ -1069,6 +1069,12 @@ DEFPARAM (PARAM_MAX_STORES_TO_SINK,
            "Maximum number of conditional store pairs that can be sunk",
            2, 0, 0)

+/* Enable inserion test on zero mask for masked stores if non-zero.  */
s/inserion/insertion/

+DEFPARAM (PARAM_ZERO_TEST_FOR_STORE_MASK,
+	  "zero-test-for-store-mask",
+	  "Enable insertion of test on zero mask for masked stores",
+	  1, 0, 1)
I'm resisting the temptation to bike-shed...  I don't like the name, but 
I don't have a better one yet.  Similarly for the short description.


+/* { dg-final { scan-tree-dump-times "Move MASK_STORE to new bb" 2 
"vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
cleanup-tree-dump is no longer needed.


diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 7ba0d8f..e31479b 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -2072,6 +2072,7 @@ vectorizable_mask_load_store (gimple stmt, 
gimple_stmt_iterator *gsi,
      {
        tree vec_rhs = NULL_TREE, vec_mask = NULL_TREE;
        prev_stmt_info = NULL;
+      LOOP_VINFO_HAS_MASK_STORE (loop_vinfo) = true;
        for (i = 0; i < ncopies; i++)
If we need to keep this field, can you initialize it just after the 
STMT_VINFO_GATHER_P test after the /** Transform **/ comment.  It seems 
kind-of buried and hard to find in this location.

I'm not really sure why Richi has objected to the field.  Yea we can 
re-analyze stuff in the vectorizer, but we'd be repeating a fair number 
of tests (presumably we'd refactor the start of 
vectorizable_mask_load_store to avoid duplication).  That seems more 
wasteful than another field.



  	{
  	  unsigned align, misalign;
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index ff46a52..debb351 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -92,7 +92,8 @@ along with GCC; see the file COPYING3.  If not see
  #include "dbgcnt.h"
  #include "gimple-fold.h"
  #include "tree-scalar-evolution.h"
-
+#include "stringpool.h"
+#include "tree-ssanames.h"
In general, rather than including stringpool.h and tree-ssanames.h, just 
include "ssa.h".



  /* Loop or bb location.  */
  source_location vect_location;
@@ -422,6 +423,184 @@ set_uid_loop_bbs (loop_vec_info loop_vinfo, gimple 
loop_vectorized_call)
    free (bbs);
  }

+/* Helper for optimize_mask_stores: returns true if STMT sinking to end
+   of BB is valid and false otherwise.  */
+
+static bool
+is_valid_sink (gimple stmt, gimple last_store)
+{
+  tree vdef;
+  imm_use_iterator imm_it;
+  gimple use_stmt;
+  basic_block bb = gimple_bb (stmt);
+
+  if ((vdef = gimple_vdef (stmt)))
+    {
+      int cross = 0;
+      /* Check that ther are no store vuses in current bb.  */
+      FOR_EACH_IMM_USE_STMT (use_stmt, imm_it, vdef)
+	if (gimple_bb (use_stmt) == bb)
+	  cross++;
Seems to me that you should get rid of "cross" and just "return false" 
here.  There's no need to keep looking at the immediate uses once you've 
found one in the same block.  That also allows you to simplify this 
following code ...

+
+      if (cross == 0)
+	return true;
Eliminate the conditional and return true here.


+    }
+  else if (gimple_vuse (stmt) == NULL_TREE)
+    return true;
+  else if (gimple_vuse (stmt) == gimple_vuse (last_store))
+    return true;
+  return false;
+}
+
+/* The code below is trying to perform simple optimization - do not execute
+   masked store statement if its mask is zero mask since loads that follow
+   a masked store can be blocked.
+   It put all masked stores statements with the same mask into the new bb
+   with a check on zero mask.  */
I suspect you need to re-wrap the comment to 80 columns.


+
+  /* Loop has masked stores.  */
+  while (!worklist.is_empty ())
+    {
+      gimple last, def_stmt, last_store;
+      edge e, efalse;
+      tree mask, val;
+      basic_block store_bb, join_bb;
+      gimple_stmt_iterator gsi_to;
+      gimple_seq gs;
+      tree arg3;
+      tree vdef, new_vdef;
+      gphi *phi;
+      bool first_dump;
+
+      last = worklist.pop ();
+      mask = gimple_call_arg (last, 2);
Are there ever cases where the mask is a compile-time constant?  I 
wouldn't think so, but I'm totally unfamiliar with all the tree-vector code.


+      val = make_ssa_name (integer_type_node);
+      gs = NULL;
+      /* Escape if target does not support test on zero mask.  */
+
+      if (!targetm.vectorize.build_zero_vector_test (mask, val, &gs))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf (MSG_NOTE, "Target does not support ptest!\n");
+	  return;
+	}
+      gcc_assert (gs != NULL);
As noted earlier, I think we need to revisit this and look for a way to 
do this test without calling into bits in the target files.  I'd like to 
see the gimple you tried to generate and any error messages that were 
spit out when you did so.




+
+      /* Create new bb.  */
+      e = split_block (bb, last);
+      join_bb = e->dest;
+      store_bb = create_empty_bb (bb);
+      add_bb_to_loop (store_bb, loop);
When we split BB via split_block, is the newly created block (JOIN_BB) 
automatically added to the loop?





+      e->flags = EDGE_TRUE_VALUE;
+      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
+      /* Don't want to put STORE_BB to unlikely part and use
+	 50% probability.  */
+      store_bb->frequency = bb->frequency / 2;
+      efalse->probability = REG_BR_PROB_BASE / 2;
+      make_edge (store_bb, join_bb, EDGE_FALLTHRU);
Like Richi, I'd expect that in practice STORE_BB will be the more likely 
taken, how much so, I don't know, but 50-50 is probably not right.  I'd 
think this would affect the ultimate block layout to some degree as 
well.  Do we want the straightline path to include STORE_BB (which 
implies a branch-around STORE_BB and inverting the test from what you're 
doing now).




+      /* Create new PHI node for vdef of the last masked store:
+	 .MEM_2 = VDEF <.MEM_1>
+	 will be converted to
+	 .MEM.3 = VDEF <.MEM_1>
+	 and new PHI node will be created in join bb
+	 .MEM_2 = PHI <.MEM_1, .MEM_3>
+      */
+      vdef = gimple_vdef (last);
+      gcc_assert (vdef && TREE_CODE (vdef) == SSA_NAME);
+      new_vdef = make_ssa_name (gimple_vop (cfun), last);
+      phi = create_phi_node (vdef, join_bb);
+      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), 
UNKNOWN_LOCATION);
+      gimple_set_vdef (last, new_vdef);
+      first_dump = true;
Presumably you don't add the phi arg associated with the edge 
BB->JOIN_BB because you don't know it until the end of the loop below, 
right?


Overall I think this is pretty close.  I'd like you to focus on getting 
the test for a zero store mask sorted out.

Jeff


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-07-10  5:51         ` Jeff Law
@ 2015-07-20 15:26           ` Yuri Rumyantsev
  2015-07-21 13:59             ` Richard Biener
  2015-07-23 20:32             ` Jeff Law
  0 siblings, 2 replies; 33+ messages in thread
From: Yuri Rumyantsev @ 2015-07-20 15:26 UTC (permalink / raw)
  To: Jeff Law; +Cc: Richard Biener, gcc-patches, Igor Zamyatin

Hi Jeff!

Thanks for your details comments.

You asked:
How are conditionals on vectors usually handled?  You should try to
mimick that and report, with detail, why that's not working.

In current implementation of vectorization pass all comparisons are
handled through COND_EXPR, i.e. vect-pattern pass transforms all
comparisons producing bool values to conditional expressions like a[i]
!= 0  --> a[i]!=0? 1: 0 which vectorizers transforms to
vect-cond-expr. So we don't have operations with vector operands and
scalar (bool) result.
To implement such operations I introduced target-hook. Could you
propose another solution implementing it?

Thanks.
Yuri.

2015-07-10 8:51 GMT+03:00 Jeff Law <law@redhat.com>:
> On 06/18/2015 08:32 AM, Yuri Rumyantsev wrote:
>>
>> Richard,
>>
>> Here is updated patch which does not include your proposal related to
>> the target hook deletion.
>> You wrote:
>>>
>>> I still don't understand why you need the new target hook.  If we have a
>>> masked
>>> load/store then the mask is computed by an assignment with a
>>> VEC_COND_EXPR
>>> (in your example) and thus a test for a zero mask is expressible as just
>>>
>>   > if (vect__ifc__41.17_167 == { 0, 0, 0, 0... })
>>>
>>>
>>> or am I missing something?
>>
>> Such vector compare produces vector and does not set up cc flags
>> required for conditional branch (GIMPLE_COND).
>> If we use such comparison for GIMPLE_COND we got error message, so I
>> used target hook which does set up cc flags aka ptest instruction and
>> I left this part.
>
> I think we need to look for something better.  I really don't like the idea
> of using a target hook like this within the gimple optimizers unless it's
> absolutely necessary.
>
> How are conditionals on vectors usually handled?  You should try to mimick
> that and report, with detail, why that's not working.
>
> I'm also not happy with the mechanisms to determine whether or not we should
> make this transformation.  I'm willing to hash through other details and
> come back to this issue once we've got some of the other stuff figured out.
> I guess a new flag with the target adjusting is the fallback if we can't
> come up with some reasonable way to select this on or off.
>
> The reason I don't like having the target files make this kind of decision
> is it makes more gimple transformations target dependent. Based on the
> history with RTL, that ultimately leads to an unwieldy mess.
>
> And yes, I know gimple isn't 100% target independent -- but that doesn't
> mean we should keep adding more target dependencies.  Each one we add needs
> to be well vetted.
>
>
> patch.3
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 44a8624..e90de32 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -100,6 +100,8 @@ along with GCC; see the file COPYING3.  If not see
>  #include "tree-iterator.h"
>  #include "tree-chkp.h"
>  #include "rtl-chkp.h"
> +#include "stringpool.h"
> +#include "tree-ssanames.h"
> So ideally we'll figure out why you're unable to generate a suitable
> conditional at the gimple level and the need for the x86 backend to generate
> the vector zero test will go away.  And if it does go away, we want those
> #includes to disappear.
>
> Additionally, instead of including stringpool.h & tree-ssanames.h, include
> "ssa.h" -- as a general rule.
>
>
>  static rtx legitimize_dllimport_symbol (rtx, bool);
>  static rtx legitimize_pe_coff_extern_decl (rtx, bool);
> @@ -41100,6 +41102,47 @@ ix86_vectorize_builtin_gather (const_tree
> mem_vectype,
>    return ix86_get_builtin (code);
>  }
>
> +/* Returns true if SOURCE type is supported by builtin ptest.
> +   NAME is lhs of created ptest call.  All created statements are added
> +   to GS.  */
> +
> +static bool
> +ix86_vectorize_build_zero_vector_test (tree source, tree name,
>
> Given the stated goal of not doing this in the target files, I'm not going
> to actually look at this routine or any of the infrastructure for this
> target hook.
>
>
> diff --git a/gcc/params.def b/gcc/params.def
> index 3e4ba3a..9e8de11 100644
> --- a/gcc/params.def
> +++ b/gcc/params.def
> @@ -1069,6 +1069,12 @@ DEFPARAM (PARAM_MAX_STORES_TO_SINK,
>            "Maximum number of conditional store pairs that can be sunk",
>            2, 0, 0)
>
> +/* Enable inserion test on zero mask for masked stores if non-zero.  */
> s/inserion/insertion/
>
> +DEFPARAM (PARAM_ZERO_TEST_FOR_STORE_MASK,
> +         "zero-test-for-store-mask",
> +         "Enable insertion of test on zero mask for masked stores",
> +         1, 0, 1)
> I'm resisting the temptation to bike-shed...  I don't like the name, but I
> don't have a better one yet.  Similarly for the short description.
>
>
> +/* { dg-final { scan-tree-dump-times "Move MASK_STORE to new bb" 2 "vect" }
> } */
> +/* { dg-final { cleanup-tree-dump "vect" } } */
> cleanup-tree-dump is no longer needed.
>
>
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index 7ba0d8f..e31479b 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -2072,6 +2072,7 @@ vectorizable_mask_load_store (gimple stmt,
> gimple_stmt_iterator *gsi,
>      {
>        tree vec_rhs = NULL_TREE, vec_mask = NULL_TREE;
>        prev_stmt_info = NULL;
> +      LOOP_VINFO_HAS_MASK_STORE (loop_vinfo) = true;
>        for (i = 0; i < ncopies; i++)
> If we need to keep this field, can you initialize it just after the
> STMT_VINFO_GATHER_P test after the /** Transform **/ comment.  It seems
> kind-of buried and hard to find in this location.
>
> I'm not really sure why Richi has objected to the field.  Yea we can
> re-analyze stuff in the vectorizer, but we'd be repeating a fair number of
> tests (presumably we'd refactor the start of vectorizable_mask_load_store to
> avoid duplication).  That seems more wasteful than another field.
>
>
>
>         {
>           unsigned align, misalign;
> diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
> index ff46a52..debb351 100644
> --- a/gcc/tree-vectorizer.c
> +++ b/gcc/tree-vectorizer.c
> @@ -92,7 +92,8 @@ along with GCC; see the file COPYING3.  If not see
>  #include "dbgcnt.h"
>  #include "gimple-fold.h"
>  #include "tree-scalar-evolution.h"
> -
> +#include "stringpool.h"
> +#include "tree-ssanames.h"
> In general, rather than including stringpool.h and tree-ssanames.h, just
> include "ssa.h".
>
>
>
>  /* Loop or bb location.  */
>  source_location vect_location;
> @@ -422,6 +423,184 @@ set_uid_loop_bbs (loop_vec_info loop_vinfo, gimple
> loop_vectorized_call)
>    free (bbs);
>  }
>
> +/* Helper for optimize_mask_stores: returns true if STMT sinking to end
> +   of BB is valid and false otherwise.  */
> +
> +static bool
> +is_valid_sink (gimple stmt, gimple last_store)
> +{
> +  tree vdef;
> +  imm_use_iterator imm_it;
> +  gimple use_stmt;
> +  basic_block bb = gimple_bb (stmt);
> +
> +  if ((vdef = gimple_vdef (stmt)))
> +    {
> +      int cross = 0;
> +      /* Check that ther are no store vuses in current bb.  */
> +      FOR_EACH_IMM_USE_STMT (use_stmt, imm_it, vdef)
> +       if (gimple_bb (use_stmt) == bb)
> +         cross++;
> Seems to me that you should get rid of "cross" and just "return false" here.
> There's no need to keep looking at the immediate uses once you've found one
> in the same block.  That also allows you to simplify this following code ...
>
> +
> +      if (cross == 0)
> +       return true;
> Eliminate the conditional and return true here.
>
>
> +    }
> +  else if (gimple_vuse (stmt) == NULL_TREE)
> +    return true;
> +  else if (gimple_vuse (stmt) == gimple_vuse (last_store))
> +    return true;
> +  return false;
> +}
> +
> +/* The code below is trying to perform simple optimization - do not execute
> +   masked store statement if its mask is zero mask since loads that follow
> +   a masked store can be blocked.
> +   It put all masked stores statements with the same mask into the new bb
> +   with a check on zero mask.  */
> I suspect you need to re-wrap the comment to 80 columns.
>
>
> +
> +  /* Loop has masked stores.  */
> +  while (!worklist.is_empty ())
> +    {
> +      gimple last, def_stmt, last_store;
> +      edge e, efalse;
> +      tree mask, val;
> +      basic_block store_bb, join_bb;
> +      gimple_stmt_iterator gsi_to;
> +      gimple_seq gs;
> +      tree arg3;
> +      tree vdef, new_vdef;
> +      gphi *phi;
> +      bool first_dump;
> +
> +      last = worklist.pop ();
> +      mask = gimple_call_arg (last, 2);
> Are there ever cases where the mask is a compile-time constant?  I wouldn't
> think so, but I'm totally unfamiliar with all the tree-vector code.
>
>
> +      val = make_ssa_name (integer_type_node);
> +      gs = NULL;
> +      /* Escape if target does not support test on zero mask.  */
> +
> +      if (!targetm.vectorize.build_zero_vector_test (mask, val, &gs))
> +       {
> +         if (dump_enabled_p ())
> +           dump_printf (MSG_NOTE, "Target does not support ptest!\n");
> +         return;
> +       }
> +      gcc_assert (gs != NULL);
> As noted earlier, I think we need to revisit this and look for a way to do
> this test without calling into bits in the target files.  I'd like to see
> the gimple you tried to generate and any error messages that were spit out
> when you did so.
>
>
>
>
> +
> +      /* Create new bb.  */
> +      e = split_block (bb, last);
> +      join_bb = e->dest;
> +      store_bb = create_empty_bb (bb);
> +      add_bb_to_loop (store_bb, loop);
> When we split BB via split_block, is the newly created block (JOIN_BB)
> automatically added to the loop?
>
>
>
>
>
> +      e->flags = EDGE_TRUE_VALUE;
> +      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
> +      /* Don't want to put STORE_BB to unlikely part and use
> +        50% probability.  */
> +      store_bb->frequency = bb->frequency / 2;
> +      efalse->probability = REG_BR_PROB_BASE / 2;
> +      make_edge (store_bb, join_bb, EDGE_FALLTHRU);
> Like Richi, I'd expect that in practice STORE_BB will be the more likely
> taken, how much so, I don't know, but 50-50 is probably not right.  I'd
> think this would affect the ultimate block layout to some degree as well.
> Do we want the straightline path to include STORE_BB (which implies a
> branch-around STORE_BB and inverting the test from what you're doing now).
>
>
>
>
> +      /* Create new PHI node for vdef of the last masked store:
> +        .MEM_2 = VDEF <.MEM_1>
> +        will be converted to
> +        .MEM.3 = VDEF <.MEM_1>
> +        and new PHI node will be created in join bb
> +        .MEM_2 = PHI <.MEM_1, .MEM_3>
> +      */
> +      vdef = gimple_vdef (last);
> +      gcc_assert (vdef && TREE_CODE (vdef) == SSA_NAME);
> +      new_vdef = make_ssa_name (gimple_vop (cfun), last);
> +      phi = create_phi_node (vdef, join_bb);
> +      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0),
> UNKNOWN_LOCATION);
> +      gimple_set_vdef (last, new_vdef);
> +      first_dump = true;
> Presumably you don't add the phi arg associated with the edge BB->JOIN_BB
> because you don't know it until the end of the loop below, right?
>
>
> Overall I think this is pretty close.  I'd like you to focus on getting the
> test for a zero store mask sorted out.
>
> Jeff
>
>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-07-20 15:26           ` Yuri Rumyantsev
@ 2015-07-21 13:59             ` Richard Biener
  2015-07-23 20:32             ` Jeff Law
  1 sibling, 0 replies; 33+ messages in thread
From: Richard Biener @ 2015-07-21 13:59 UTC (permalink / raw)
  To: Yuri Rumyantsev; +Cc: Jeff Law, gcc-patches, Igor Zamyatin

On Mon, Jul 20, 2015 at 5:05 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
> Hi Jeff!
>
> Thanks for your details comments.
>
> You asked:
> How are conditionals on vectors usually handled?  You should try to
> mimick that and report, with detail, why that's not working.
>
> In current implementation of vectorization pass all comparisons are
> handled through COND_EXPR, i.e. vect-pattern pass transforms all
> comparisons producing bool values to conditional expressions like a[i]
> != 0  --> a[i]!=0? 1: 0 which vectorizers transforms to
> vect-cond-expr. So we don't have operations with vector operands and
> scalar (bool) result.
> To implement such operations I introduced target-hook. Could you
> propose another solution implementing it?

see below...

> Thanks.
> Yuri.
>
> 2015-07-10 8:51 GMT+03:00 Jeff Law <law@redhat.com>:
>> On 06/18/2015 08:32 AM, Yuri Rumyantsev wrote:
>>>
>>> Richard,
>>>
>>> Here is updated patch which does not include your proposal related to
>>> the target hook deletion.
>>> You wrote:
>>>>
>>>> I still don't understand why you need the new target hook.  If we have a
>>>> masked
>>>> load/store then the mask is computed by an assignment with a
>>>> VEC_COND_EXPR
>>>> (in your example) and thus a test for a zero mask is expressible as just
>>>>
>>>   > if (vect__ifc__41.17_167 == { 0, 0, 0, 0... })
>>>>
>>>>
>>>> or am I missing something?
>>>
>>> Such vector compare produces vector and does not set up cc flags
>>> required for conditional branch (GIMPLE_COND).
>>> If we use such comparison for GIMPLE_COND we got error message, so I
>>> used target hook which does set up cc flags aka ptest instruction and
>>> I left this part.

... what error message did you get?

I suppose you ran into IL checking complaining about

error: vector comparison returning a boolean

?

It looks like we forbade equality compares for consistency thus you'd need
to use

  vec_ifc_tem_4 = VIEW_CONVERT_EXPR<TImode> (vect_ifc_41.17_167);
  if (vec_ifc_tem_4 == 0)
    ...

instead.   Thus view-convert to an integer mode of the same size as the vector.

Or we allow equality compares of vectors.  Not sure if expansion /
targets handle
them well though - you'd have to check.

Richard.

>> I think we need to look for something better.  I really don't like the idea
>> of using a target hook like this within the gimple optimizers unless it's
>> absolutely necessary.
>>
>> How are conditionals on vectors usually handled?  You should try to mimick
>> that and report, with detail, why that's not working.
>>
>> I'm also not happy with the mechanisms to determine whether or not we should
>> make this transformation.  I'm willing to hash through other details and
>> come back to this issue once we've got some of the other stuff figured out.
>> I guess a new flag with the target adjusting is the fallback if we can't
>> come up with some reasonable way to select this on or off.
>>
>> The reason I don't like having the target files make this kind of decision
>> is it makes more gimple transformations target dependent. Based on the
>> history with RTL, that ultimately leads to an unwieldy mess.
>>
>> And yes, I know gimple isn't 100% target independent -- but that doesn't
>> mean we should keep adding more target dependencies.  Each one we add needs
>> to be well vetted.
>>
>>
>> patch.3
>>
>> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
>> index 44a8624..e90de32 100644
>> --- a/gcc/config/i386/i386.c
>> +++ b/gcc/config/i386/i386.c
>> @@ -100,6 +100,8 @@ along with GCC; see the file COPYING3.  If not see
>>  #include "tree-iterator.h"
>>  #include "tree-chkp.h"
>>  #include "rtl-chkp.h"
>> +#include "stringpool.h"
>> +#include "tree-ssanames.h"
>> So ideally we'll figure out why you're unable to generate a suitable
>> conditional at the gimple level and the need for the x86 backend to generate
>> the vector zero test will go away.  And if it does go away, we want those
>> #includes to disappear.
>>
>> Additionally, instead of including stringpool.h & tree-ssanames.h, include
>> "ssa.h" -- as a general rule.
>>
>>
>>  static rtx legitimize_dllimport_symbol (rtx, bool);
>>  static rtx legitimize_pe_coff_extern_decl (rtx, bool);
>> @@ -41100,6 +41102,47 @@ ix86_vectorize_builtin_gather (const_tree
>> mem_vectype,
>>    return ix86_get_builtin (code);
>>  }
>>
>> +/* Returns true if SOURCE type is supported by builtin ptest.
>> +   NAME is lhs of created ptest call.  All created statements are added
>> +   to GS.  */
>> +
>> +static bool
>> +ix86_vectorize_build_zero_vector_test (tree source, tree name,
>>
>> Given the stated goal of not doing this in the target files, I'm not going
>> to actually look at this routine or any of the infrastructure for this
>> target hook.
>>
>>
>> diff --git a/gcc/params.def b/gcc/params.def
>> index 3e4ba3a..9e8de11 100644
>> --- a/gcc/params.def
>> +++ b/gcc/params.def
>> @@ -1069,6 +1069,12 @@ DEFPARAM (PARAM_MAX_STORES_TO_SINK,
>>            "Maximum number of conditional store pairs that can be sunk",
>>            2, 0, 0)
>>
>> +/* Enable inserion test on zero mask for masked stores if non-zero.  */
>> s/inserion/insertion/
>>
>> +DEFPARAM (PARAM_ZERO_TEST_FOR_STORE_MASK,
>> +         "zero-test-for-store-mask",
>> +         "Enable insertion of test on zero mask for masked stores",
>> +         1, 0, 1)
>> I'm resisting the temptation to bike-shed...  I don't like the name, but I
>> don't have a better one yet.  Similarly for the short description.
>>
>>
>> +/* { dg-final { scan-tree-dump-times "Move MASK_STORE to new bb" 2 "vect" }
>> } */
>> +/* { dg-final { cleanup-tree-dump "vect" } } */
>> cleanup-tree-dump is no longer needed.
>>
>>
>> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
>> index 7ba0d8f..e31479b 100644
>> --- a/gcc/tree-vect-stmts.c
>> +++ b/gcc/tree-vect-stmts.c
>> @@ -2072,6 +2072,7 @@ vectorizable_mask_load_store (gimple stmt,
>> gimple_stmt_iterator *gsi,
>>      {
>>        tree vec_rhs = NULL_TREE, vec_mask = NULL_TREE;
>>        prev_stmt_info = NULL;
>> +      LOOP_VINFO_HAS_MASK_STORE (loop_vinfo) = true;
>>        for (i = 0; i < ncopies; i++)
>> If we need to keep this field, can you initialize it just after the
>> STMT_VINFO_GATHER_P test after the /** Transform **/ comment.  It seems
>> kind-of buried and hard to find in this location.
>>
>> I'm not really sure why Richi has objected to the field.  Yea we can
>> re-analyze stuff in the vectorizer, but we'd be repeating a fair number of
>> tests (presumably we'd refactor the start of vectorizable_mask_load_store to
>> avoid duplication).  That seems more wasteful than another field.
>>
>>
>>
>>         {
>>           unsigned align, misalign;
>> diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
>> index ff46a52..debb351 100644
>> --- a/gcc/tree-vectorizer.c
>> +++ b/gcc/tree-vectorizer.c
>> @@ -92,7 +92,8 @@ along with GCC; see the file COPYING3.  If not see
>>  #include "dbgcnt.h"
>>  #include "gimple-fold.h"
>>  #include "tree-scalar-evolution.h"
>> -
>> +#include "stringpool.h"
>> +#include "tree-ssanames.h"
>> In general, rather than including stringpool.h and tree-ssanames.h, just
>> include "ssa.h".
>>
>>
>>
>>  /* Loop or bb location.  */
>>  source_location vect_location;
>> @@ -422,6 +423,184 @@ set_uid_loop_bbs (loop_vec_info loop_vinfo, gimple
>> loop_vectorized_call)
>>    free (bbs);
>>  }
>>
>> +/* Helper for optimize_mask_stores: returns true if STMT sinking to end
>> +   of BB is valid and false otherwise.  */
>> +
>> +static bool
>> +is_valid_sink (gimple stmt, gimple last_store)
>> +{
>> +  tree vdef;
>> +  imm_use_iterator imm_it;
>> +  gimple use_stmt;
>> +  basic_block bb = gimple_bb (stmt);
>> +
>> +  if ((vdef = gimple_vdef (stmt)))
>> +    {
>> +      int cross = 0;
>> +      /* Check that ther are no store vuses in current bb.  */
>> +      FOR_EACH_IMM_USE_STMT (use_stmt, imm_it, vdef)
>> +       if (gimple_bb (use_stmt) == bb)
>> +         cross++;
>> Seems to me that you should get rid of "cross" and just "return false" here.
>> There's no need to keep looking at the immediate uses once you've found one
>> in the same block.  That also allows you to simplify this following code ...
>>
>> +
>> +      if (cross == 0)
>> +       return true;
>> Eliminate the conditional and return true here.
>>
>>
>> +    }
>> +  else if (gimple_vuse (stmt) == NULL_TREE)
>> +    return true;
>> +  else if (gimple_vuse (stmt) == gimple_vuse (last_store))
>> +    return true;
>> +  return false;
>> +}
>> +
>> +/* The code below is trying to perform simple optimization - do not execute
>> +   masked store statement if its mask is zero mask since loads that follow
>> +   a masked store can be blocked.
>> +   It put all masked stores statements with the same mask into the new bb
>> +   with a check on zero mask.  */
>> I suspect you need to re-wrap the comment to 80 columns.
>>
>>
>> +
>> +  /* Loop has masked stores.  */
>> +  while (!worklist.is_empty ())
>> +    {
>> +      gimple last, def_stmt, last_store;
>> +      edge e, efalse;
>> +      tree mask, val;
>> +      basic_block store_bb, join_bb;
>> +      gimple_stmt_iterator gsi_to;
>> +      gimple_seq gs;
>> +      tree arg3;
>> +      tree vdef, new_vdef;
>> +      gphi *phi;
>> +      bool first_dump;
>> +
>> +      last = worklist.pop ();
>> +      mask = gimple_call_arg (last, 2);
>> Are there ever cases where the mask is a compile-time constant?  I wouldn't
>> think so, but I'm totally unfamiliar with all the tree-vector code.
>>
>>
>> +      val = make_ssa_name (integer_type_node);
>> +      gs = NULL;
>> +      /* Escape if target does not support test on zero mask.  */
>> +
>> +      if (!targetm.vectorize.build_zero_vector_test (mask, val, &gs))
>> +       {
>> +         if (dump_enabled_p ())
>> +           dump_printf (MSG_NOTE, "Target does not support ptest!\n");
>> +         return;
>> +       }
>> +      gcc_assert (gs != NULL);
>> As noted earlier, I think we need to revisit this and look for a way to do
>> this test without calling into bits in the target files.  I'd like to see
>> the gimple you tried to generate and any error messages that were spit out
>> when you did so.
>>
>>
>>
>>
>> +
>> +      /* Create new bb.  */
>> +      e = split_block (bb, last);
>> +      join_bb = e->dest;
>> +      store_bb = create_empty_bb (bb);
>> +      add_bb_to_loop (store_bb, loop);
>> When we split BB via split_block, is the newly created block (JOIN_BB)
>> automatically added to the loop?
>>
>>
>>
>>
>>
>> +      e->flags = EDGE_TRUE_VALUE;
>> +      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
>> +      /* Don't want to put STORE_BB to unlikely part and use
>> +        50% probability.  */
>> +      store_bb->frequency = bb->frequency / 2;
>> +      efalse->probability = REG_BR_PROB_BASE / 2;
>> +      make_edge (store_bb, join_bb, EDGE_FALLTHRU);
>> Like Richi, I'd expect that in practice STORE_BB will be the more likely
>> taken, how much so, I don't know, but 50-50 is probably not right.  I'd
>> think this would affect the ultimate block layout to some degree as well.
>> Do we want the straightline path to include STORE_BB (which implies a
>> branch-around STORE_BB and inverting the test from what you're doing now).
>>
>>
>>
>>
>> +      /* Create new PHI node for vdef of the last masked store:
>> +        .MEM_2 = VDEF <.MEM_1>
>> +        will be converted to
>> +        .MEM.3 = VDEF <.MEM_1>
>> +        and new PHI node will be created in join bb
>> +        .MEM_2 = PHI <.MEM_1, .MEM_3>
>> +      */
>> +      vdef = gimple_vdef (last);
>> +      gcc_assert (vdef && TREE_CODE (vdef) == SSA_NAME);
>> +      new_vdef = make_ssa_name (gimple_vop (cfun), last);
>> +      phi = create_phi_node (vdef, join_bb);
>> +      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0),
>> UNKNOWN_LOCATION);
>> +      gimple_set_vdef (last, new_vdef);
>> +      first_dump = true;
>> Presumably you don't add the phi arg associated with the edge BB->JOIN_BB
>> because you don't know it until the end of the loop below, right?
>>
>>
>> Overall I think this is pretty close.  I'd like you to focus on getting the
>> test for a zero store mask sorted out.
>>
>> Jeff
>>
>>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-07-20 15:26           ` Yuri Rumyantsev
  2015-07-21 13:59             ` Richard Biener
@ 2015-07-23 20:32             ` Jeff Law
  2015-07-24  9:04               ` Yuri Rumyantsev
  2015-07-24  9:24               ` Richard Biener
  1 sibling, 2 replies; 33+ messages in thread
From: Jeff Law @ 2015-07-23 20:32 UTC (permalink / raw)
  To: Yuri Rumyantsev; +Cc: Richard Biener, gcc-patches, Igor Zamyatin

On 07/20/2015 09:05 AM, Yuri Rumyantsev wrote:
> Hi Jeff!
>
> Thanks for your details comments.
>
> You asked:
> How are conditionals on vectors usually handled?  You should try to
> mimick that and report, with detail, why that's not working.
>
> In current implementation of vectorization pass all comparisons are
> handled through COND_EXPR, i.e. vect-pattern pass transforms all
> comparisons producing bool values to conditional expressions like a[i]
> != 0  --> a[i]!=0? 1: 0 which vectorizers transforms to
> vect-cond-expr. So we don't have operations with vector operands and
> scalar (bool) result.
> To implement such operations I introduced target-hook. Could you
> propose another solution implementing it?
Is there any rationale given anywhere for the transformation into 
conditional expressions?  ie, is there any reason why we can't have a 
GIMPLE_COND where the expression is a vector condition?

Thanks,

Jeff

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-07-23 20:32             ` Jeff Law
@ 2015-07-24  9:04               ` Yuri Rumyantsev
  2015-07-24  9:24               ` Richard Biener
  1 sibling, 0 replies; 33+ messages in thread
From: Yuri Rumyantsev @ 2015-07-24  9:04 UTC (permalink / raw)
  To: Jeff Law; +Cc: Richard Biener, gcc-patches, Igor Zamyatin

Jeff,

The goal of this transformation is to not execute masked store if
possible,i.e. when is is equal to zero-vector. and conditional
expression is not suitable for it - we must generate semi-hammock with
conditional branch.

Yuri.

2015-07-23 23:03 GMT+03:00 Jeff Law <law@redhat.com>:
> On 07/20/2015 09:05 AM, Yuri Rumyantsev wrote:
>>
>> Hi Jeff!
>>
>> Thanks for your details comments.
>>
>> You asked:
>> How are conditionals on vectors usually handled?  You should try to
>> mimick that and report, with detail, why that's not working.
>>
>> In current implementation of vectorization pass all comparisons are
>> handled through COND_EXPR, i.e. vect-pattern pass transforms all
>> comparisons producing bool values to conditional expressions like a[i]
>> != 0  --> a[i]!=0? 1: 0 which vectorizers transforms to
>> vect-cond-expr. So we don't have operations with vector operands and
>> scalar (bool) result.
>> To implement such operations I introduced target-hook. Could you
>> propose another solution implementing it?
>
> Is there any rationale given anywhere for the transformation into
> conditional expressions?  ie, is there any reason why we can't have a
> GIMPLE_COND where the expression is a vector condition?
>
> Thanks,
>
> Jeff
>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-07-23 20:32             ` Jeff Law
  2015-07-24  9:04               ` Yuri Rumyantsev
@ 2015-07-24  9:24               ` Richard Biener
  2015-07-24 19:26                 ` Jeff Law
  1 sibling, 1 reply; 33+ messages in thread
From: Richard Biener @ 2015-07-24  9:24 UTC (permalink / raw)
  To: Jeff Law; +Cc: Yuri Rumyantsev, gcc-patches, Igor Zamyatin

On Thu, Jul 23, 2015 at 10:03 PM, Jeff Law <law@redhat.com> wrote:
> On 07/20/2015 09:05 AM, Yuri Rumyantsev wrote:
>>
>> Hi Jeff!
>>
>> Thanks for your details comments.
>>
>> You asked:
>> How are conditionals on vectors usually handled?  You should try to
>> mimick that and report, with detail, why that's not working.
>>
>> In current implementation of vectorization pass all comparisons are
>> handled through COND_EXPR, i.e. vect-pattern pass transforms all
>> comparisons producing bool values to conditional expressions like a[i]
>> != 0  --> a[i]!=0? 1: 0 which vectorizers transforms to
>> vect-cond-expr. So we don't have operations with vector operands and
>> scalar (bool) result.
>> To implement such operations I introduced target-hook. Could you
>> propose another solution implementing it?
>
> Is there any rationale given anywhere for the transformation into
> conditional expressions?  ie, is there any reason why we can't have a
> GIMPLE_COND where the expression is a vector condition?

No rationale for equality compare which would have the semantic of
having all elements equal or not equal.  But you can't define a sensible
ordering (that HW implements) for other compare operators and you
obviously need a single boolean result, not a vector of element comparison
results.

I've already replied that I'm fine allowing ==/!= whole-vector compares.
But one needs to check whether expansion does anything sensible
with them (either expand to integer subreg compares or add optabs
for the compares).

Richard.

> Thanks,
>
> Jeff
>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-07-24  9:24               ` Richard Biener
@ 2015-07-24 19:26                 ` Jeff Law
  2015-07-27  9:04                   ` Richard Biener
  0 siblings, 1 reply; 33+ messages in thread
From: Jeff Law @ 2015-07-24 19:26 UTC (permalink / raw)
  To: Richard Biener; +Cc: Yuri Rumyantsev, gcc-patches, Igor Zamyatin

On 07/24/2015 03:16 AM, Richard Biener wrote:
>> Is there any rationale given anywhere for the transformation into
>> conditional expressions?  ie, is there any reason why we can't have a
>> GIMPLE_COND where the expression is a vector condition?
>
> No rationale for equality compare which would have the semantic of
> having all elements equal or not equal.  But you can't define a sensible
> ordering (that HW implements) for other compare operators and you
> obviously need a single boolean result, not a vector of element comparison
> results.
Right.  EQ/NE only as others just don't have any real meaning.


> I've already replied that I'm fine allowing ==/!= whole-vector compares.
> But one needs to check whether expansion does anything sensible
> with them (either expand to integer subreg compares or add optabs
> for the compares).
Agreed, EQ/NE for whole vector compares only would be fine for me too 
under the same conditions.

jeff

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-07-24 19:26                 ` Jeff Law
@ 2015-07-27  9:04                   ` Richard Biener
  2015-08-06 11:07                     ` Yuri Rumyantsev
  0 siblings, 1 reply; 33+ messages in thread
From: Richard Biener @ 2015-07-27  9:04 UTC (permalink / raw)
  To: Jeff Law; +Cc: Yuri Rumyantsev, gcc-patches, Igor Zamyatin

On Fri, Jul 24, 2015 at 9:11 PM, Jeff Law <law@redhat.com> wrote:
> On 07/24/2015 03:16 AM, Richard Biener wrote:
>>>
>>> Is there any rationale given anywhere for the transformation into
>>> conditional expressions?  ie, is there any reason why we can't have a
>>> GIMPLE_COND where the expression is a vector condition?
>>
>>
>> No rationale for equality compare which would have the semantic of
>> having all elements equal or not equal.  But you can't define a sensible
>> ordering (that HW implements) for other compare operators and you
>> obviously need a single boolean result, not a vector of element comparison
>> results.
>
> Right.  EQ/NE only as others just don't have any real meaning.
>
>
>> I've already replied that I'm fine allowing ==/!= whole-vector compares.
>> But one needs to check whether expansion does anything sensible
>> with them (either expand to integer subreg compares or add optabs
>> for the compares).
>
> Agreed, EQ/NE for whole vector compares only would be fine for me too under
> the same conditions.

Btw, you can already do this on GIMPLE by doing

  TImode vec_as_int = VIEW_CONVERT_EXPR <TImode> (vec_2);
  if (vec_as_int == 0)
...

which is what the RTL will look like in the end.  So not sure if making this
higher-level in GIMPLE is good or required.

Richard.

> jeff

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-07-27  9:04                   ` Richard Biener
@ 2015-08-06 11:07                     ` Yuri Rumyantsev
  2015-08-13 11:40                       ` Yuri Rumyantsev
  0 siblings, 1 reply; 33+ messages in thread
From: Yuri Rumyantsev @ 2015-08-06 11:07 UTC (permalink / raw)
  To: Richard Biener; +Cc: Jeff Law, gcc-patches, Igor Zamyatin

[-- Attachment #1: Type: text/plain, Size: 2870 bytes --]

HI All,

Here is updated patch which implements Richard proposal to use vector
comparison with boolean result instead of target hook. Support for it
was added to ix86_expand_branch.

Any comments will be appreciated.

Bootstrap and regression testing did not show any new failures.

ChangeLog:
2015-08-06  Yuri Rumyantsev  <ysrumyan@gmail.com>

* config/i386/i386.c (ix86_expand_branch): Implement vector
comparison with boolean result.
* config/i386/sse.md (define_expand "cbranch<mode>4): Add define
for vector comparion.
* fold-const.c (fold_relational_const): Add handling of vector
comparison with boolean result.
* params.def (PARAM_ZERO_TEST_FOR_STORE_MASK): New DEFPARAM.
* params.h (ENABLE_ZERO_TEST_FOR_STORE_MASK): new macros.
* tree-cfg.c (verify_gimple_comparison): Add test for vector
comparion with boolean result.
* tree-ssa-forwprop.c (forward_propagate_into_comparison_1): Do not
propagate vector comparion with boolean result.
* tree-vect-stmts.c (vectorizable_mask_load_store): Initialize
has_mask_store field of vect_info.
* tree-vectorizer.c: Include files ssa.h, cfghooks.h and params.h.
(is_valid_sink): New function.
(optimize_mask_stores): New function.
(vectorize_loops): Invoke optimaze_mask_stores for loops having masked
stores.
* tree-vectorizer.h (loop_vec_info): Add new has_mask_store field and
correspondent macros.

gcc/testsuite/ChangeLog:
* gcc.target/i386/avx2-vect-mask-store-move1.c: New test.


2015-07-27 11:48 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
> On Fri, Jul 24, 2015 at 9:11 PM, Jeff Law <law@redhat.com> wrote:
>> On 07/24/2015 03:16 AM, Richard Biener wrote:
>>>>
>>>> Is there any rationale given anywhere for the transformation into
>>>> conditional expressions?  ie, is there any reason why we can't have a
>>>> GIMPLE_COND where the expression is a vector condition?
>>>
>>>
>>> No rationale for equality compare which would have the semantic of
>>> having all elements equal or not equal.  But you can't define a sensible
>>> ordering (that HW implements) for other compare operators and you
>>> obviously need a single boolean result, not a vector of element comparison
>>> results.
>>
>> Right.  EQ/NE only as others just don't have any real meaning.
>>
>>
>>> I've already replied that I'm fine allowing ==/!= whole-vector compares.
>>> But one needs to check whether expansion does anything sensible
>>> with them (either expand to integer subreg compares or add optabs
>>> for the compares).
>>
>> Agreed, EQ/NE for whole vector compares only would be fine for me too under
>> the same conditions.
>
> Btw, you can already do this on GIMPLE by doing
>
>   TImode vec_as_int = VIEW_CONVERT_EXPR <TImode> (vec_2);
>   if (vec_as_int == 0)
> ...
>
> which is what the RTL will look like in the end.  So not sure if making this
> higher-level in GIMPLE is good or required.
>
> Richard.
>
>> jeff

[-- Attachment #2: patch.4 --]
[-- Type: application/octet-stream, Size: 15500 bytes --]

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 7901a4f..7c1916c 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -20522,6 +20522,41 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
   machine_mode mode = GET_MODE (op0);
   rtx tmp;
 
+  /* Handle special case - vector comparsion with boolean result, transform
+     it using ptest instruction.  */
+  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+    {
+      rtx lhs;
+      rtx flag;
+      machine_mode ptest_mode = GET_MODE_SIZE (mode) == 32? V4DImode: V2DImode;
+      gcc_assert (code == EQ || code == NE);
+      if (!REG_P (op0))
+	op0 = force_reg (mode, op0);
+      if (!REG_P (op1))
+	op1 = force_reg (mode, op1);
+      if (op0 != op1)
+	{
+	  lhs = gen_reg_rtx (mode);
+	  emit_insn (gen_rtx_SET (lhs,
+				  gen_rtx_MINUS (mode, op0, op1)));
+	}
+      else
+	lhs = op0;
+      lhs = gen_rtx_SUBREG (ptest_mode, lhs, 0);
+      tmp = gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
+			  gen_rtx_UNSPEC (CCmode,
+					  gen_rtvec (2, lhs, lhs),
+					  UNSPEC_PTEST));
+      emit_insn (tmp);
+      flag = gen_rtx_REG (CCZmode, FLAGS_REG);
+      tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
+      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
+				  gen_rtx_LABEL_REF (VOIDmode, label),
+				  pc_rtx);
+      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
+      return;
+    }
+
   switch (mode)
     {
     case SFmode:
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 4bee9fa..4bcf24e 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -17903,6 +17903,24 @@
 	  UNSPEC_MASKMOV))]
   "TARGET_AVX")
 
+(define_expand "cbranch<mode>4"
+  [(set (reg:CC FLAGS_REG)
+	(compare:CC (match_operand:V48_AVX2 1 "nonimmediate_operand")
+		    (match_operand:V48_AVX2 2 "register_operand")))
+   (set (pc) (if_then_else
+	       (match_operator 0 "ordered_comparison_operator"
+		[(reg:CC FLAGS_REG) (const_int 0)])
+	       (label_ref (match_operand 3))
+	       (pc)))]
+  "TARGET_AVX2"
+{
+  if (MEM_P (operands[1]) && MEM_P (operands[2]))
+    operands[1] = force_reg (<MODE>mode, operands[1]);
+  ix86_expand_branch (GET_CODE (operands[0]),
+		      operands[1], operands[2], operands[3]);
+  DONE;
+})
+
 (define_insn_and_split "avx_<castmode><avxsizesuffix>_<castmode>"
   [(set (match_operand:AVX256MODE2P 0 "nonimmediate_operand" "=x,m")
 	(unspec:AVX256MODE2P
diff --git a/gcc/fold-const.c b/gcc/fold-const.c
index fa321f4..2be9c98 100644
--- a/gcc/fold-const.c
+++ b/gcc/fold-const.c
@@ -14411,8 +14411,29 @@ fold_relational_const (enum tree_code code, tree type, tree op0, tree op1)
 
   if (TREE_CODE (op0) == VECTOR_CST && TREE_CODE (op1) == VECTOR_CST)
     {
+      if (INTEGRAL_TYPE_P (type)
+	  && (TREE_CODE (type) == BOOLEAN_TYPE
+	      || TYPE_PRECISION (type) == 1))
+	{
+	  /* Have vector comparison with boolean result.  */
+	  bool result = true;
+	  gcc_assert (code == EQ_EXPR || code == NE_EXPR);
+	  gcc_assert (VECTOR_CST_NELTS (op0) == VECTOR_CST_NELTS (op1));
+	  for (unsigned i = 0; i < VECTOR_CST_NELTS (op0); i++)
+	    {
+	      tree elem0 = VECTOR_CST_ELT (op0, i);
+	      tree elem1 = VECTOR_CST_ELT (op1, i);
+	      tree tmp = fold_relational_const (code, type, elem0, elem1);
+	      result &= integer_onep (tmp);
+	    }
+	  if (code == NE_EXPR)
+	    result = !result;
+	  return constant_boolean_node (result, type);
+	}
       unsigned count = VECTOR_CST_NELTS (op0);
       tree *elts =  XALLOCAVEC (tree, count);
+      if (!VECTOR_TYPE_P (type))
+	return NULL_TREE;
       gcc_assert (VECTOR_CST_NELTS (op1) == count
 		  && TYPE_VECTOR_SUBPARTS (type) == count);
 
diff --git a/gcc/params.def b/gcc/params.def
index 0ca3451..77fc950 100644
--- a/gcc/params.def
+++ b/gcc/params.def
@@ -1034,6 +1034,12 @@ DEFPARAM (PARAM_MAX_STORES_TO_SINK,
           "Maximum number of conditional store pairs that can be sunk",
           2, 0, 0)
 
+/* Enable inserion test on zero mask for masked stores if non-zero.  */
+DEFPARAM (PARAM_ZERO_TEST_FOR_STORE_MASK,
+	  "zero-test-for-store-mask",
+	  "Enable insertion of test on zero mask for masked stores",
+	  1, 0, 1)
+
 /* Override CASE_VALUES_THRESHOLD of when to switch from doing switch
    statements via if statements to using a table jump operation.  If the value
    is 0, the default CASE_VALUES_THRESHOLD will be used.  */
diff --git a/gcc/params.h b/gcc/params.h
index f53426d..a3ea081 100644
--- a/gcc/params.h
+++ b/gcc/params.h
@@ -218,6 +218,8 @@ extern void init_param_values (int *params);
   PARAM_VALUE (PARAM_MIN_NONDEBUG_INSN_UID)
 #define MAX_STORES_TO_SINK \
   PARAM_VALUE (PARAM_MAX_STORES_TO_SINK)
+#define ENABLE_ZERO_TEST_FOR_STORE_MASK \
+  PARAM_VALUE (PARAM_ZERO_TEST_FOR_STORE_MASK)
 #define ALLOW_LOAD_DATA_RACES \
   PARAM_VALUE (PARAM_ALLOW_LOAD_DATA_RACES)
 #define ALLOW_STORE_DATA_RACES \
diff --git a/gcc/testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c b/gcc/testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c
new file mode 100644
index 0000000..8f6bd1f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target avx2 } */
+/* { dg-options "-mavx2 -O3 -fdump-tree-vect-details" } */
+
+extern int *p1, *p2, *p3;
+int c[256];
+void foo (int n)
+{
+  int i;
+  for (i=0; i<n; i++)
+    if (c[i])
+      {
+	p1[i] += 1;
+	p2[i] = p3[i] +2;
+      }
+}
+
+/* { dg-final { scan-tree-dump-times "Move MASK_STORE to new bb" 2 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c
index d97b824..5e923ed 100644
--- a/gcc/tree-cfg.c
+++ b/gcc/tree-cfg.c
@@ -3495,12 +3495,21 @@ verify_gimple_comparison (tree type, tree op0, tree op1)
       if (TREE_CODE (op0_type) == VECTOR_TYPE
 	  || TREE_CODE (op1_type) == VECTOR_TYPE)
         {
-          error ("vector comparison returning a boolean");
-          debug_generic_expr (op0_type);
-          debug_generic_expr (op1_type);
-          return true;
+	  /* Allow vector comparison returning boolean if operand types
+	     are equal.  */
+	  if (TREE_CODE (op0_type) != TREE_CODE (op1_type)
+	      || TYPE_VECTOR_SUBPARTS (op0_type) != TYPE_VECTOR_SUBPARTS (op1_type)
+	      || GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op0_type))) !=
+		 GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op1_type))))
+	    {
+              error ("type mismatch for vector comparison returning a boolean");
+              debug_generic_expr (op0_type);
+              debug_generic_expr (op1_type);
+              return true;
+	    }
         }
     }
+
   /* Or an integer vector type with the same size and element count
      as the comparison operand types.  */
   else if (TREE_CODE (type) == VECTOR_TYPE
diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c
index ccfde5f..13d3948 100644
--- a/gcc/tree-ssa-forwprop.c
+++ b/gcc/tree-ssa-forwprop.c
@@ -428,6 +428,12 @@ forward_propagate_into_comparison_1 (gimple stmt,
   tree rhs0 = NULL_TREE, rhs1 = NULL_TREE;
   bool single_use0_p = false, single_use1_p = false;
 
+  if (TREE_CODE (TREE_TYPE (op0)) == VECTOR_TYPE
+      && INTEGRAL_TYPE_P (type)
+      && (TREE_CODE (type) == BOOLEAN_TYPE
+	  || TYPE_PRECISION (type) == 1))
+    return NULL_TREE;
+
   /* For comparisons use the first operand, that is likely to
      simplify comparisons against constants.  */
   if (TREE_CODE (op0) == SSA_NAME)
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index f06e57c..c5fcbfb 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -2057,6 +2057,7 @@ vectorizable_mask_load_store (gimple stmt, gimple_stmt_iterator *gsi,
     {
       tree vec_rhs = NULL_TREE, vec_mask = NULL_TREE;
       prev_stmt_info = NULL;
+      LOOP_VINFO_HAS_MASK_STORE (loop_vinfo) = true;
       for (i = 0; i < ncopies; i++)
 	{
 	  unsigned align, misalign;
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 85a0cf6..851cb56 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -81,7 +81,9 @@ along with GCC; see the file COPYING3.  If not see
 #include "dbgcnt.h"
 #include "gimple-fold.h"
 #include "tree-scalar-evolution.h"
-
+#include "ssa.h"
+#include "cfghooks.h"
+#include "params.h"
 
 /* Loop or bb location.  */
 source_location vect_location;
@@ -441,6 +443,176 @@ set_uid_loop_bbs (loop_vec_info loop_vinfo, gimple loop_vectorized_call)
   free (bbs);
 }
 
+/* Helper for optimize_mask_stores: returns true if STMT sinking to end
+   of BB is valid and false otherwise.  */
+
+static bool
+is_valid_sink (gimple stmt, gimple last_store)
+{
+  tree vdef;
+  imm_use_iterator imm_it;
+  use_operand_p use_p;
+  basic_block bb = gimple_bb (stmt);
+
+  if (is_gimple_call (stmt)
+      && !gimple_call_internal_p (stmt))
+    /* Do not consider non-internal call as valid to sink.  */
+    return false;
+
+  if ((vdef = gimple_vdef (stmt)))
+    {
+      /* Check that ther are no store vuses in current bb.  */
+      FOR_EACH_IMM_USE_FAST (use_p, imm_it, vdef)
+	if (gimple_bb (USE_STMT (use_p)) == bb)
+	  return false;
+      return true;
+    }
+  else if (gimple_vuse (stmt) == NULL_TREE)
+    return true;
+  else if (gimple_vuse (stmt) == gimple_vuse (last_store))
+    return true;
+  return false;
+}
+
+/* The code below is trying to perform simple optimization - do not execute
+   masked store statement if its mask is zero vector since loads that follow
+   a masked store can be blocked.  It puts all masked stores with the same
+   mask-vector into the new bb with a check on zero mask.  */
+
+static void
+optimize_mask_stores (struct loop *loop)
+{
+  basic_block bb = loop->header;
+  gimple_stmt_iterator gsi;
+  gimple stmt;
+  auto_vec<gimple> worklist;
+
+  if (ENABLE_ZERO_TEST_FOR_STORE_MASK == 0)
+    return;
+
+  /* Pick up all masked stores in loop if any.  */
+  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      stmt = gsi_stmt (gsi);
+      if (is_gimple_call (stmt)
+	  && gimple_call_internal_p (stmt)
+	  && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
+	worklist.safe_push (stmt);
+    }
+
+  if (worklist.is_empty ())
+    return;
+
+  /* Loop has masked stores.  */
+  while (!worklist.is_empty ())
+    {
+      gimple last, def_stmt, last_store;
+      edge e, efalse;
+      tree mask;
+      basic_block store_bb, join_bb;
+      gimple_stmt_iterator gsi_to;
+      tree arg3;
+      tree vdef, new_vdef;
+      gphi *phi;
+      bool first_dump;
+      tree el_type;
+      tree zero;
+      last = worklist.pop ();
+      mask = gimple_call_arg (last, 2);
+      /* Create new bb.  */
+      e = split_block (bb, last);
+      join_bb = e->dest;
+      store_bb = create_empty_bb (bb);
+      add_bb_to_loop (store_bb, loop);
+      e->flags = EDGE_TRUE_VALUE;
+      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
+      /* Put STORE_BB to likely part.  */
+      efalse->probability = PROB_UNLIKELY;
+      store_bb->frequency = PROB_ALWAYS - EDGE_FREQUENCY (efalse);
+      make_edge (store_bb, join_bb, EDGE_FALLTHRU);
+      if (dom_info_available_p (CDI_DOMINATORS))
+	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
+      /* Create vector comparison with boolean result.  */
+      el_type = TREE_TYPE (TREE_TYPE (mask));
+      zero = build_int_cst (el_type, 0);
+      zero = build_vector_from_val (TREE_TYPE (mask), zero);
+      if (CONSTANT_CLASS_P (mask))
+	fprintf(stderr,"Bad type of mask!\n");
+      if (!CONSTANT_CLASS_P (zero))
+	fprintf (stderr,"Bad type of zero!\n");
+      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
+      gsi = gsi_last_bb (bb);
+      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
+      /* Create new PHI node for vdef of the last masked store:
+	 .MEM_2 = VDEF <.MEM_1>
+	 will be converted to
+	 .MEM.3 = VDEF <.MEM_1>
+	 and new PHI node will be created in join bb
+	 .MEM_2 = PHI <.MEM_1, .MEM_3>
+      */
+      vdef = gimple_vdef (last);
+      gcc_assert (vdef && TREE_CODE (vdef) == SSA_NAME);
+      new_vdef = make_ssa_name (gimple_vop (cfun), last);
+      phi = create_phi_node (vdef, join_bb);
+      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
+      gimple_set_vdef (last, new_vdef);
+      first_dump = true;
+
+      /* Put all masked stores with the same mask to STORE_BB if possible.  */
+      while (true)
+	{
+	  /* Move masked store to STORE_BB.  */
+	  last_store = last;
+	  gsi = gsi_for_stmt (last);
+	  gsi_to = gsi_start_bb (store_bb);
+	  gsi_move_before (&gsi, &gsi_to);
+	  update_stmt (last);
+	  if (dump_enabled_p ())
+	    {
+	      /* Issue different messages depending on FIRST_DUMP.  */
+	      if (first_dump)
+		{
+		  dump_printf (MSG_NOTE, "Move MASK_STORE to new bb#%d\n",
+			       store_bb->index);
+		  first_dump = false;
+		}
+	      else
+		dump_printf (MSG_NOTE, "Move MASK_STORE to created bb\n");
+	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
+	    }
+	  /* Put definition statement of stored value in STORE_BB
+	     if possible.  */
+	  arg3 = gimple_call_arg (last, 3);
+	  if (TREE_CODE (arg3) == SSA_NAME && has_single_use (arg3))
+	    {
+	      def_stmt = SSA_NAME_DEF_STMT (arg3);
+	      /* Move def_stmt to STORE_BB if it is in the same bb and
+		 it is legal.  */
+	      if (gimple_bb (def_stmt) == bb
+		  && is_valid_sink (def_stmt, last_store))
+		{
+		  if (dump_enabled_p ())
+		    {
+		      dump_printf (MSG_NOTE, "Move stmt to created bb\n");
+		      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
+		    }
+		  gsi = gsi_for_stmt (def_stmt);
+		  gsi_to = gsi_start_bb (store_bb);
+		  gsi_move_before (&gsi, &gsi_to);
+		  update_stmt (def_stmt);
+		}
+	    }
+	    /* Put other masked stores with the same mask to STORE_BB.  */
+	    if (worklist.is_empty ()
+		|| gimple_call_arg (worklist.last (), 2) != mask
+		|| !is_valid_sink (worklist.last (), last_store))
+	      break;
+	    last = worklist.pop ();
+	}
+      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
+    }
+}
+
 /* Function vectorize_loops.
 
    Entry point to loop vectorization phase.  */
@@ -557,13 +729,19 @@ vectorize_loops (void)
   for (i = 1; i < vect_loops_num; i++)
     {
       loop_vec_info loop_vinfo;
+      bool has_mask_store;
 
       loop = get_loop (cfun, i);
       if (!loop)
 	continue;
       loop_vinfo = (loop_vec_info) loop->aux;
+      has_mask_store = false;
+      if (loop_vinfo)
+	has_mask_store = LOOP_VINFO_HAS_MASK_STORE (loop_vinfo);
       destroy_loop_vec_info (loop_vinfo, true);
       loop->aux = NULL;
+      if (has_mask_store)
+	optimize_mask_stores (loop);
     }
 
   free_stmt_vec_info_vec ();
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 48c1f8d..da8e1ea 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -373,6 +373,9 @@ typedef struct _loop_vec_info {
      loop version without if-conversion.  */
   struct loop *scalar_loop;
 
+  /* Mark loops having masked stores.  */
+  bool has_mask_store;
+
 } *loop_vec_info;
 
 /* Access Functions.  */
@@ -412,6 +415,7 @@ typedef struct _loop_vec_info {
 #define LOOP_VINFO_SCALAR_LOOP(L)	   (L)->scalar_loop
 #define LOOP_VINFO_SCALAR_ITERATION_COST(L) (L)->scalar_cost_vec
 #define LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST(L) (L)->single_scalar_iteration_cost
+#define LOOP_VINFO_HAS_MASK_STORE(L)      (L)->has_mask_store
 
 #define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \
   ((L)->may_misalign_stmts.length () > 0)

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-08-06 11:07                     ` Yuri Rumyantsev
@ 2015-08-13 11:40                       ` Yuri Rumyantsev
  2015-08-13 11:46                         ` Richard Biener
  0 siblings, 1 reply; 33+ messages in thread
From: Yuri Rumyantsev @ 2015-08-13 11:40 UTC (permalink / raw)
  To: Richard Biener; +Cc: Jeff Law, gcc-patches, Igor Zamyatin

Hi Richard,

Did you have a chance to look at updated patch?

Thanks.
Yuri.

2015-08-06 14:07 GMT+03:00 Yuri Rumyantsev <ysrumyan@gmail.com>:
> HI All,
>
> Here is updated patch which implements Richard proposal to use vector
> comparison with boolean result instead of target hook. Support for it
> was added to ix86_expand_branch.
>
> Any comments will be appreciated.
>
> Bootstrap and regression testing did not show any new failures.
>
> ChangeLog:
> 2015-08-06  Yuri Rumyantsev  <ysrumyan@gmail.com>
>
> * config/i386/i386.c (ix86_expand_branch): Implement vector
> comparison with boolean result.
> * config/i386/sse.md (define_expand "cbranch<mode>4): Add define
> for vector comparion.
> * fold-const.c (fold_relational_const): Add handling of vector
> comparison with boolean result.
> * params.def (PARAM_ZERO_TEST_FOR_STORE_MASK): New DEFPARAM.
> * params.h (ENABLE_ZERO_TEST_FOR_STORE_MASK): new macros.
> * tree-cfg.c (verify_gimple_comparison): Add test for vector
> comparion with boolean result.
> * tree-ssa-forwprop.c (forward_propagate_into_comparison_1): Do not
> propagate vector comparion with boolean result.
> * tree-vect-stmts.c (vectorizable_mask_load_store): Initialize
> has_mask_store field of vect_info.
> * tree-vectorizer.c: Include files ssa.h, cfghooks.h and params.h.
> (is_valid_sink): New function.
> (optimize_mask_stores): New function.
> (vectorize_loops): Invoke optimaze_mask_stores for loops having masked
> stores.
> * tree-vectorizer.h (loop_vec_info): Add new has_mask_store field and
> correspondent macros.
>
> gcc/testsuite/ChangeLog:
> * gcc.target/i386/avx2-vect-mask-store-move1.c: New test.
>
>
> 2015-07-27 11:48 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>> On Fri, Jul 24, 2015 at 9:11 PM, Jeff Law <law@redhat.com> wrote:
>>> On 07/24/2015 03:16 AM, Richard Biener wrote:
>>>>>
>>>>> Is there any rationale given anywhere for the transformation into
>>>>> conditional expressions?  ie, is there any reason why we can't have a
>>>>> GIMPLE_COND where the expression is a vector condition?
>>>>
>>>>
>>>> No rationale for equality compare which would have the semantic of
>>>> having all elements equal or not equal.  But you can't define a sensible
>>>> ordering (that HW implements) for other compare operators and you
>>>> obviously need a single boolean result, not a vector of element comparison
>>>> results.
>>>
>>> Right.  EQ/NE only as others just don't have any real meaning.
>>>
>>>
>>>> I've already replied that I'm fine allowing ==/!= whole-vector compares.
>>>> But one needs to check whether expansion does anything sensible
>>>> with them (either expand to integer subreg compares or add optabs
>>>> for the compares).
>>>
>>> Agreed, EQ/NE for whole vector compares only would be fine for me too under
>>> the same conditions.
>>
>> Btw, you can already do this on GIMPLE by doing
>>
>>   TImode vec_as_int = VIEW_CONVERT_EXPR <TImode> (vec_2);
>>   if (vec_as_int == 0)
>> ...
>>
>> which is what the RTL will look like in the end.  So not sure if making this
>> higher-level in GIMPLE is good or required.
>>
>> Richard.
>>
>>> jeff

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-08-13 11:40                       ` Yuri Rumyantsev
@ 2015-08-13 11:46                         ` Richard Biener
  2015-11-02 15:24                           ` Yuri Rumyantsev
  0 siblings, 1 reply; 33+ messages in thread
From: Richard Biener @ 2015-08-13 11:46 UTC (permalink / raw)
  To: Yuri Rumyantsev; +Cc: Jeff Law, gcc-patches, Igor Zamyatin

On Thu, Aug 13, 2015 at 1:32 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
> Hi Richard,
>
> Did you have a chance to look at updated patch?

Having a quick look now.  Btw, you didn't try the simpler alternative of

 tree type = type_for_mode (int_mode_for_mode (TYPE_MODE (vectype)));
 build2 (EQ_EXPR, boolean_type_node,
   build1 (VIEW_CONVERT, type, op0), build1 (VIEW_CONVERT, type, op1));

?  That is, use the GIMPLE level equivalent of

 (cmp (subreg:TI reg:V4SI) (subreg:TI reg:V4SI))

?  That should be supported by the expander already, though again not sure if
the target(s) have compares that match this.

Btw, the tree-cfg.c hook wasn't what was agreed on - the restriction
on EQ/NE_EXPR
is missing.  Operand type equality is tested anyway.

Why do you need to restrict forward_propagate_into_comparison_1?

Otherwise this looks better, but can you try with the VIEW_CONVERT as well?

Thanks,
Richard.


> Thanks.
> Yuri.
>
> 2015-08-06 14:07 GMT+03:00 Yuri Rumyantsev <ysrumyan@gmail.com>:
>> HI All,
>>
>> Here is updated patch which implements Richard proposal to use vector
>> comparison with boolean result instead of target hook. Support for it
>> was added to ix86_expand_branch.
>>
>> Any comments will be appreciated.
>>
>> Bootstrap and regression testing did not show any new failures.
>>
>> ChangeLog:
>> 2015-08-06  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>
>> * config/i386/i386.c (ix86_expand_branch): Implement vector
>> comparison with boolean result.
>> * config/i386/sse.md (define_expand "cbranch<mode>4): Add define
>> for vector comparion.
>> * fold-const.c (fold_relational_const): Add handling of vector
>> comparison with boolean result.
>> * params.def (PARAM_ZERO_TEST_FOR_STORE_MASK): New DEFPARAM.
>> * params.h (ENABLE_ZERO_TEST_FOR_STORE_MASK): new macros.
>> * tree-cfg.c (verify_gimple_comparison): Add test for vector
>> comparion with boolean result.
>> * tree-ssa-forwprop.c (forward_propagate_into_comparison_1): Do not
>> propagate vector comparion with boolean result.
>> * tree-vect-stmts.c (vectorizable_mask_load_store): Initialize
>> has_mask_store field of vect_info.
>> * tree-vectorizer.c: Include files ssa.h, cfghooks.h and params.h.
>> (is_valid_sink): New function.
>> (optimize_mask_stores): New function.
>> (vectorize_loops): Invoke optimaze_mask_stores for loops having masked
>> stores.
>> * tree-vectorizer.h (loop_vec_info): Add new has_mask_store field and
>> correspondent macros.
>>
>> gcc/testsuite/ChangeLog:
>> * gcc.target/i386/avx2-vect-mask-store-move1.c: New test.
>>
>>
>> 2015-07-27 11:48 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>> On Fri, Jul 24, 2015 at 9:11 PM, Jeff Law <law@redhat.com> wrote:
>>>> On 07/24/2015 03:16 AM, Richard Biener wrote:
>>>>>>
>>>>>> Is there any rationale given anywhere for the transformation into
>>>>>> conditional expressions?  ie, is there any reason why we can't have a
>>>>>> GIMPLE_COND where the expression is a vector condition?
>>>>>
>>>>>
>>>>> No rationale for equality compare which would have the semantic of
>>>>> having all elements equal or not equal.  But you can't define a sensible
>>>>> ordering (that HW implements) for other compare operators and you
>>>>> obviously need a single boolean result, not a vector of element comparison
>>>>> results.
>>>>
>>>> Right.  EQ/NE only as others just don't have any real meaning.
>>>>
>>>>
>>>>> I've already replied that I'm fine allowing ==/!= whole-vector compares.
>>>>> But one needs to check whether expansion does anything sensible
>>>>> with them (either expand to integer subreg compares or add optabs
>>>>> for the compares).
>>>>
>>>> Agreed, EQ/NE for whole vector compares only would be fine for me too under
>>>> the same conditions.
>>>
>>> Btw, you can already do this on GIMPLE by doing
>>>
>>>   TImode vec_as_int = VIEW_CONVERT_EXPR <TImode> (vec_2);
>>>   if (vec_as_int == 0)
>>> ...
>>>
>>> which is what the RTL will look like in the end.  So not sure if making this
>>> higher-level in GIMPLE is good or required.
>>>
>>> Richard.
>>>
>>>> jeff

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-08-13 11:46                         ` Richard Biener
@ 2015-11-02 15:24                           ` Yuri Rumyantsev
  2015-11-05 15:49                             ` Yuri Rumyantsev
  2015-11-06 12:56                             ` Richard Biener
  0 siblings, 2 replies; 33+ messages in thread
From: Yuri Rumyantsev @ 2015-11-02 15:24 UTC (permalink / raw)
  To: Richard Biener; +Cc: Jeff Law, gcc-patches, Igor Zamyatin

Hi Richard,

I've come back to this optimization and try to implement your proposal
for comparison:
> Btw, you didn't try the simpler alternative of
>
> tree type = type_for_mode (int_mode_for_mode (TYPE_MODE (vectype)));
> build2 (EQ_EXPR, boolean_type_node,
>  build1 (VIEW_CONVERT, type, op0), build1 (VIEW_CONVERT, type, op1));
>
> ?  That is, use the GIMPLE level equivalent of
>  (cmp (subreg:TI reg:V4SI) (subreg:TI reg:V4SI))

using the following code:

      vectype = TREE_TYPE (mask);
      ext_mode = mode_for_size (GET_MODE_BITSIZE (TYPE_MODE (vectype)),
MODE_INT, 0);
      ext_type = lang_hooks.types.type_for_mode (ext_mode , 1);

but I've got zero type for it. Should I miss something?

Any help will be appreciated.
Yuri.


2015-08-13 14:40 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
> On Thu, Aug 13, 2015 at 1:32 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>> Hi Richard,
>>
>> Did you have a chance to look at updated patch?
>
> Having a quick look now.  Btw, you didn't try the simpler alternative of
>
>  tree type = type_for_mode (int_mode_for_mode (TYPE_MODE (vectype)));
>  build2 (EQ_EXPR, boolean_type_node,
>    build1 (VIEW_CONVERT, type, op0), build1 (VIEW_CONVERT, type, op1));
>
> ?  That is, use the GIMPLE level equivalent of
>
>  (cmp (subreg:TI reg:V4SI) (subreg:TI reg:V4SI))
>
> ?  That should be supported by the expander already, though again not sure if
> the target(s) have compares that match this.
>
> Btw, the tree-cfg.c hook wasn't what was agreed on - the restriction
> on EQ/NE_EXPR
> is missing.  Operand type equality is tested anyway.
>
> Why do you need to restrict forward_propagate_into_comparison_1?
>
> Otherwise this looks better, but can you try with the VIEW_CONVERT as well?
>
> Thanks,
> Richard.
>
>
>> Thanks.
>> Yuri.
>>
>> 2015-08-06 14:07 GMT+03:00 Yuri Rumyantsev <ysrumyan@gmail.com>:
>>> HI All,
>>>
>>> Here is updated patch which implements Richard proposal to use vector
>>> comparison with boolean result instead of target hook. Support for it
>>> was added to ix86_expand_branch.
>>>
>>> Any comments will be appreciated.
>>>
>>> Bootstrap and regression testing did not show any new failures.
>>>
>>> ChangeLog:
>>> 2015-08-06  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>>
>>> * config/i386/i386.c (ix86_expand_branch): Implement vector
>>> comparison with boolean result.
>>> * config/i386/sse.md (define_expand "cbranch<mode>4): Add define
>>> for vector comparion.
>>> * fold-const.c (fold_relational_const): Add handling of vector
>>> comparison with boolean result.
>>> * params.def (PARAM_ZERO_TEST_FOR_STORE_MASK): New DEFPARAM.
>>> * params.h (ENABLE_ZERO_TEST_FOR_STORE_MASK): new macros.
>>> * tree-cfg.c (verify_gimple_comparison): Add test for vector
>>> comparion with boolean result.
>>> * tree-ssa-forwprop.c (forward_propagate_into_comparison_1): Do not
>>> propagate vector comparion with boolean result.
>>> * tree-vect-stmts.c (vectorizable_mask_load_store): Initialize
>>> has_mask_store field of vect_info.
>>> * tree-vectorizer.c: Include files ssa.h, cfghooks.h and params.h.
>>> (is_valid_sink): New function.
>>> (optimize_mask_stores): New function.
>>> (vectorize_loops): Invoke optimaze_mask_stores for loops having masked
>>> stores.
>>> * tree-vectorizer.h (loop_vec_info): Add new has_mask_store field and
>>> correspondent macros.
>>>
>>> gcc/testsuite/ChangeLog:
>>> * gcc.target/i386/avx2-vect-mask-store-move1.c: New test.
>>>
>>>
>>> 2015-07-27 11:48 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>> On Fri, Jul 24, 2015 at 9:11 PM, Jeff Law <law@redhat.com> wrote:
>>>>> On 07/24/2015 03:16 AM, Richard Biener wrote:
>>>>>>>
>>>>>>> Is there any rationale given anywhere for the transformation into
>>>>>>> conditional expressions?  ie, is there any reason why we can't have a
>>>>>>> GIMPLE_COND where the expression is a vector condition?
>>>>>>
>>>>>>
>>>>>> No rationale for equality compare which would have the semantic of
>>>>>> having all elements equal or not equal.  But you can't define a sensible
>>>>>> ordering (that HW implements) for other compare operators and you
>>>>>> obviously need a single boolean result, not a vector of element comparison
>>>>>> results.
>>>>>
>>>>> Right.  EQ/NE only as others just don't have any real meaning.
>>>>>
>>>>>
>>>>>> I've already replied that I'm fine allowing ==/!= whole-vector compares.
>>>>>> But one needs to check whether expansion does anything sensible
>>>>>> with them (either expand to integer subreg compares or add optabs
>>>>>> for the compares).
>>>>>
>>>>> Agreed, EQ/NE for whole vector compares only would be fine for me too under
>>>>> the same conditions.
>>>>
>>>> Btw, you can already do this on GIMPLE by doing
>>>>
>>>>   TImode vec_as_int = VIEW_CONVERT_EXPR <TImode> (vec_2);
>>>>   if (vec_as_int == 0)
>>>> ...
>>>>
>>>> which is what the RTL will look like in the end.  So not sure if making this
>>>> higher-level in GIMPLE is good or required.
>>>>
>>>> Richard.
>>>>
>>>>> jeff

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-11-02 15:24                           ` Yuri Rumyantsev
@ 2015-11-05 15:49                             ` Yuri Rumyantsev
  2015-11-06 12:56                             ` Richard Biener
  1 sibling, 0 replies; 33+ messages in thread
From: Yuri Rumyantsev @ 2015-11-05 15:49 UTC (permalink / raw)
  To: Richard Biener; +Cc: Jeff Law, gcc-patches, Igor Zamyatin

[-- Attachment #1: Type: text/plain, Size: 7230 bytes --]

Hi All!

I prepared another patch which performs insertion additional check on
zero mask for masked stores if only parameter
PARAM_ZERO_TEST_FOR_MASK_STORE has non-zero value. My attempt to use
approach proposed by Richard with simpler alternative for comparison -
use scalar type for 256-bit was not successful and I returned to
vectori comparison with scalar Boolean result.

ChangeLog:
2015-11-05  Yuri Rumyantsev  <ysrumyan@gmail.com>

* config/i386/i386.c: Add conditional initialization of
PARAM_ZERO_TEST_FOR_MASK_STORE.
(ix86_expand_branch): Implement vector comparison with boolean result.
* config/i386/i386.h: New macros TARGET_OPTIMIZE_MASK_STORE.
* config/i386/sse.md (define_expand "cbranch<mode>4): Add define-expand
for vector comparion with eq/ne only.
* config/i386/x86-tune.def: New macros X86_TUNE_OPTIMIZE_MASK_STORE.
* fold-const.c (fold_relational_const): Add handling of vector
comparison with boolean result.
* params.def (PARAM_ZERO_TEST_FOR_MASK_STORE): New DEFPARAM.
* params.h (ENABLE_ZERO_TEST_FOR_MASK_STORE): New macros.
* tree-cfg.c (verify_gimple_comparison): Add argument CODE, allow
comparison of vector operands with boolean result for EQ/NE only.
(verify_gimple_assign_binary): Adjust call for verify_gimple_comparison.
(verify_gimple_cond): Likewise.
* tree-ssa-forwprop.c (forward_propagate_into_comparison_1): Do not
combine vector comparison with boolean result and VEC_COND_EXPR that
has vector result.
* tree-vect-loop.c (is_valid_sink): New function.
(optimize_mask_stores): Likewise.
* tree-vect-stmts.c (vectorizable_mask_load_store): Initialize
has_mask_store field of vect_info.
* tree-vectorizer.c (vectorize_loops): Invoke optimaze_mask_stores for
vectorized loops having masked stores.
* tree-vectorizer.h (loop_vec_info): Add new has_mask_store field and
correspondent macros.
(optimize_mask_stores): Add prototype.

gcc/testsuite/ChangeLog:
* gcc.target/i386/avx2-vect-mask-store-move1.c: New test.
* gcc.target/i386/avx2-vect-mask-store-move2.c: Likewise.

2015-11-02 18:24 GMT+03:00 Yuri Rumyantsev <ysrumyan@gmail.com>:
> Hi Richard,
>
> I've come back to this optimization and try to implement your proposal
> for comparison:
>> Btw, you didn't try the simpler alternative of
>>
>> tree type = type_for_mode (int_mode_for_mode (TYPE_MODE (vectype)));
>> build2 (EQ_EXPR, boolean_type_node,
>>  build1 (VIEW_CONVERT, type, op0), build1 (VIEW_CONVERT, type, op1));
>>
>> ?  That is, use the GIMPLE level equivalent of
>>  (cmp (subreg:TI reg:V4SI) (subreg:TI reg:V4SI))
>
> using the following code:
>
>       vectype = TREE_TYPE (mask);
>       ext_mode = mode_for_size (GET_MODE_BITSIZE (TYPE_MODE (vectype)),
> MODE_INT, 0);
>       ext_type = lang_hooks.types.type_for_mode (ext_mode , 1);
>
> but I've got zero type for it. Should I miss something?
>
> Any help will be appreciated.
> Yuri.
>
>
> 2015-08-13 14:40 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>> On Thu, Aug 13, 2015 at 1:32 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>> Hi Richard,
>>>
>>> Did you have a chance to look at updated patch?
>>
>> Having a quick look now.  Btw, you didn't try the simpler alternative of
>>
>>  tree type = type_for_mode (int_mode_for_mode (TYPE_MODE (vectype)));
>>  build2 (EQ_EXPR, boolean_type_node,
>>    build1 (VIEW_CONVERT, type, op0), build1 (VIEW_CONVERT, type, op1));
>>
>> ?  That is, use the GIMPLE level equivalent of
>>
>>  (cmp (subreg:TI reg:V4SI) (subreg:TI reg:V4SI))
>>
>> ?  That should be supported by the expander already, though again not sure if
>> the target(s) have compares that match this.
>>
>> Btw, the tree-cfg.c hook wasn't what was agreed on - the restriction
>> on EQ/NE_EXPR
>> is missing.  Operand type equality is tested anyway.
>>
>> Why do you need to restrict forward_propagate_into_comparison_1?
>>
>> Otherwise this looks better, but can you try with the VIEW_CONVERT as well?
>>
>> Thanks,
>> Richard.
>>
>>
>>> Thanks.
>>> Yuri.
>>>
>>> 2015-08-06 14:07 GMT+03:00 Yuri Rumyantsev <ysrumyan@gmail.com>:
>>>> HI All,
>>>>
>>>> Here is updated patch which implements Richard proposal to use vector
>>>> comparison with boolean result instead of target hook. Support for it
>>>> was added to ix86_expand_branch.
>>>>
>>>> Any comments will be appreciated.
>>>>
>>>> Bootstrap and regression testing did not show any new failures.
>>>>
>>>> ChangeLog:
>>>> 2015-08-06  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>>>
>>>> * config/i386/i386.c (ix86_expand_branch): Implement vector
>>>> comparison with boolean result.
>>>> * config/i386/sse.md (define_expand "cbranch<mode>4): Add define
>>>> for vector comparion.
>>>> * fold-const.c (fold_relational_const): Add handling of vector
>>>> comparison with boolean result.
>>>> * params.def (PARAM_ZERO_TEST_FOR_STORE_MASK): New DEFPARAM.
>>>> * params.h (ENABLE_ZERO_TEST_FOR_STORE_MASK): new macros.
>>>> * tree-cfg.c (verify_gimple_comparison): Add test for vector
>>>> comparion with boolean result.
>>>> * tree-ssa-forwprop.c (forward_propagate_into_comparison_1): Do not
>>>> propagate vector comparion with boolean result.
>>>> * tree-vect-stmts.c (vectorizable_mask_load_store): Initialize
>>>> has_mask_store field of vect_info.
>>>> * tree-vectorizer.c: Include files ssa.h, cfghooks.h and params.h.
>>>> (is_valid_sink): New function.
>>>> (optimize_mask_stores): New function.
>>>> (vectorize_loops): Invoke optimaze_mask_stores for loops having masked
>>>> stores.
>>>> * tree-vectorizer.h (loop_vec_info): Add new has_mask_store field and
>>>> correspondent macros.
>>>>
>>>> gcc/testsuite/ChangeLog:
>>>> * gcc.target/i386/avx2-vect-mask-store-move1.c: New test.
>>>>
>>>>
>>>> 2015-07-27 11:48 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>> On Fri, Jul 24, 2015 at 9:11 PM, Jeff Law <law@redhat.com> wrote:
>>>>>> On 07/24/2015 03:16 AM, Richard Biener wrote:
>>>>>>>>
>>>>>>>> Is there any rationale given anywhere for the transformation into
>>>>>>>> conditional expressions?  ie, is there any reason why we can't have a
>>>>>>>> GIMPLE_COND where the expression is a vector condition?
>>>>>>>
>>>>>>>
>>>>>>> No rationale for equality compare which would have the semantic of
>>>>>>> having all elements equal or not equal.  But you can't define a sensible
>>>>>>> ordering (that HW implements) for other compare operators and you
>>>>>>> obviously need a single boolean result, not a vector of element comparison
>>>>>>> results.
>>>>>>
>>>>>> Right.  EQ/NE only as others just don't have any real meaning.
>>>>>>
>>>>>>
>>>>>>> I've already replied that I'm fine allowing ==/!= whole-vector compares.
>>>>>>> But one needs to check whether expansion does anything sensible
>>>>>>> with them (either expand to integer subreg compares or add optabs
>>>>>>> for the compares).
>>>>>>
>>>>>> Agreed, EQ/NE for whole vector compares only would be fine for me too under
>>>>>> the same conditions.
>>>>>
>>>>> Btw, you can already do this on GIMPLE by doing
>>>>>
>>>>>   TImode vec_as_int = VIEW_CONVERT_EXPR <TImode> (vec_2);
>>>>>   if (vec_as_int == 0)
>>>>> ...
>>>>>
>>>>> which is what the RTL will look like in the end.  So not sure if making this
>>>>> higher-level in GIMPLE is good or required.
>>>>>
>>>>> Richard.
>>>>>
>>>>>> jeff

[-- Attachment #2: patch.5 --]
[-- Type: application/octet-stream, Size: 19246 bytes --]

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 2a965f6..d41741d 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -5313,6 +5313,12 @@ ix86_option_override_internal (bool main_args_p,
 			 opts->x_param_values,
 			 opts_set->x_param_values);
 
+  if (TARGET_OPTIMIZE_MASK_STORE)
+    maybe_set_param_value (PARAM_ZERO_TEST_FOR_MASK_STORE,
+			   1,
+			   opts->x_param_values,
+			   opts_set->x_param_values);
+
   /* Enable sw prefetching at -O3 for CPUS that prefetching is helpful.  */
   if (opts->x_flag_prefetch_loop_arrays < 0
       && HAVE_prefetch
@@ -21590,6 +21596,38 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
   machine_mode mode = GET_MODE (op0);
   rtx tmp;
 
+  /* Handle special case - vector comparsion with boolean result, transform
+     it using ptest instruction.  */
+  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+    {
+      rtx lhs;
+      rtx flag;
+      machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
+      gcc_assert (code == EQ || code == NE);
+      if (!REG_P (op0))
+	op0 = force_reg (mode, op0);
+      if (!REG_P (op1))
+	op1 = force_reg (mode, op1);
+      /* Generate subtraction since we can't check that one operand is
+	 zero vector.  */
+	  lhs = gen_reg_rtx (mode);
+	  emit_insn (gen_rtx_SET (lhs,
+				  gen_rtx_MINUS (mode, op0, op1)));
+      lhs = gen_rtx_SUBREG (p_mode, lhs, 0);
+      tmp = gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
+			  gen_rtx_UNSPEC (CCmode,
+					  gen_rtvec (2, lhs, lhs),
+					  UNSPEC_PTEST));
+      emit_insn (tmp);
+      flag = gen_rtx_REG (CCZmode, FLAGS_REG);
+      tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
+      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
+				  gen_rtx_LABEL_REF (VOIDmode, label),
+				  pc_rtx);
+      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
+      return;
+    }
+
   switch (mode)
     {
     case SFmode:
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index be96c75..cafc58e 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -496,6 +496,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
     ix86_tune_features[X86_TUNE_ADJUST_UNROLL]
 #define TARGET_AVOID_FALSE_DEP_FOR_BMI \
 	ix86_tune_features[X86_TUNE_AVOID_FALSE_DEP_FOR_BMI]
+#define TARGET_OPTIMIZE_MASK_STORE \
+	ix86_tune_features[X86_TUNE_OPTIMIZE_MASK_STORE]
 
 /* Feature tests against the various architecture variations.  */
 enum ix86_arch_indices {
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 43dcc6a..0f5fd39 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -17999,6 +17999,25 @@
 	  UNSPEC_MASKMOV))]
   "TARGET_AVX")
 
+(define_expand "cbranch<mode>4"
+  [(set (reg:CC FLAGS_REG)
+	(compare:CC (match_operand:V48_AVX2 1 "nonimmediate_operand")
+		    (match_operand:V48_AVX2 2 "register_operand")))
+   (set (pc) (if_then_else
+	       (match_operator 0 "bt_comparison_operator"
+		[(reg:CC FLAGS_REG) (const_int 0)])
+	       (label_ref (match_operand 3))
+	       (pc)))]
+  "TARGET_AVX2"
+{
+  if (MEM_P (operands[1]) && MEM_P (operands[2]))
+    operands[1] = force_reg (<MODE>mode, operands[1]);
+  ix86_expand_branch (GET_CODE (operands[0]),
+		      operands[1], operands[2], operands[3]);
+  DONE;
+})
+
+
 (define_insn_and_split "avx_<castmode><avxsizesuffix>_<castmode>"
   [(set (match_operand:AVX256MODE2P 0 "nonimmediate_operand" "=x,m")
 	(unspec:AVX256MODE2P
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index b2d3921..c5e5b63 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -527,6 +527,10 @@ DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode",
 DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi",
 	  m_SANDYBRIDGE | m_HASWELL | m_GENERIC)
 
+/* X86_TUNE_OPTMIZE_MASK_STORE: Perform masked store if is its mask is not
+   equal to zero.  */
+DEF_TUNE (X86_TUNE_OPTIMIZE_MASK_STORE, "optimize_mask_store", m_HASWELL)
+
 /*****************************************************************************/
 /* This never worked well before.                                            */
 /*****************************************************************************/
diff --git a/gcc/fold-const.c b/gcc/fold-const.c
index ee9b349..0ff3307 100644
--- a/gcc/fold-const.c
+++ b/gcc/fold-const.c
@@ -13883,6 +13883,25 @@ fold_relational_const (enum tree_code code, tree type, tree op0, tree op1)
 
   if (TREE_CODE (op0) == VECTOR_CST && TREE_CODE (op1) == VECTOR_CST)
     {
+      if (INTEGRAL_TYPE_P (type)
+	  && (TREE_CODE (type) == BOOLEAN_TYPE
+	      || TYPE_PRECISION (type) == 1))
+	{
+	  /* Have vector comparison with scalar boolean result.  */
+	  bool result = true;
+	  gcc_assert (code == EQ_EXPR || code == NE_EXPR);
+	  gcc_assert (VECTOR_CST_NELTS (op0) == VECTOR_CST_NELTS (op1));
+	  for (unsigned i = 0; i < VECTOR_CST_NELTS (op0); i++)
+	    {
+	      tree elem0 = VECTOR_CST_ELT (op0, i);
+	      tree elem1 = VECTOR_CST_ELT (op1, i);
+	      tree tmp = fold_relational_const (code, type, elem0, elem1);
+	      result &= integer_onep (tmp);
+	    }
+	  if (code == NE_EXPR)
+	    result = !result;
+	  return constant_boolean_node (result, type);
+	}
       unsigned count = VECTOR_CST_NELTS (op0);
       tree *elts =  XALLOCAVEC (tree, count);
       gcc_assert (VECTOR_CST_NELTS (op1) == count
diff --git a/gcc/params.def b/gcc/params.def
index c5d96e7..58bfaed 100644
--- a/gcc/params.def
+++ b/gcc/params.def
@@ -1043,6 +1043,12 @@ DEFPARAM (PARAM_MAX_STORES_TO_SINK,
           "Maximum number of conditional store pairs that can be sunk.",
           2, 0, 0)
 
+/* Enable inserion test on zero mask for masked stores if non-zero.  */
+DEFPARAM (PARAM_ZERO_TEST_FOR_MASK_STORE,
+	  "zero-test-for-mask-store",
+	  "Enable insertion of test on zero mask for masked stores",
+	  0, 0, 1)
+
 /* Override CASE_VALUES_THRESHOLD of when to switch from doing switch
    statements via if statements to using a table jump operation.  If the value
    is 0, the default CASE_VALUES_THRESHOLD will be used.  */
diff --git a/gcc/params.h b/gcc/params.h
index 1090d00..2037c73 100644
--- a/gcc/params.h
+++ b/gcc/params.h
@@ -221,6 +221,8 @@ extern void init_param_values (int *params);
   PARAM_VALUE (PARAM_MIN_NONDEBUG_INSN_UID)
 #define MAX_STORES_TO_SINK \
   PARAM_VALUE (PARAM_MAX_STORES_TO_SINK)
+#define ENABLE_ZERO_TEST_FOR_MASK_STORE \
+  PARAM_VALUE (PARAM_ZERO_TEST_FOR_MASK_STORE)
 #define ALLOW_LOAD_DATA_RACES \
   PARAM_VALUE (PARAM_ALLOW_LOAD_DATA_RACES)
 #define ALLOW_STORE_DATA_RACES \
diff --git a/gcc/testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c b/gcc/testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c
new file mode 100644
index 0000000..d926823
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target avx2 } */
+/* { dg-options "-march=core-avx2 -O3 -fdump-tree-vect-details" } */
+
+extern int *p1, *p2, *p3;
+int c[256];
+void foo (int n)
+{
+  int i;
+  for (i = 0; i < n; i++)
+    if (c[i])
+      {
+	p1[i] += 1;
+	p2[i] = p3[i] +2;
+      }
+}
+
+/* { dg-final { scan-tree-dump-times "Move MASK_STORE to new bb" 2 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx2-vect-mask-store-move2.c b/gcc/testsuite/gcc.target/i386/avx2-vect-mask-store-move2.c
new file mode 100644
index 0000000..41d0596
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-vect-mask-store-move2.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target avx2 } */
+/* { dg-options "-march=core-avx2 -O3 -fdump-tree-vect-details" } */
+
+extern int *p1, *p2, *p3;
+int c[256];
+/* All masked load/stores must be put into one new bb.  */
+
+void foo (int n)
+{
+  int i;
+  for (i = 0; i < n; i++)
+    if (c[i])
+      {
+	p1[i] -= 1;
+	p2[i] = p3[i];
+      }
+}
+
+/* { dg-final { scan-tree-dump-times "Move MASK_STORE to new bb" 1 "vect" } } */
diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c
index cfed3c2..6a9dc27 100644
--- a/gcc/tree-cfg.c
+++ b/gcc/tree-cfg.c
@@ -3408,10 +3408,10 @@ verify_gimple_call (gcall *stmt)
 }
 
 /* Verifies the gimple comparison with the result type TYPE and
-   the operands OP0 and OP1.  */
+   the operands OP0 and OP1, comparison code is CODE.  */
 
 static bool
-verify_gimple_comparison (tree type, tree op0, tree op1)
+verify_gimple_comparison (tree type, tree op0, tree op1, enum tree_code code)
 {
   tree op0_type = TREE_TYPE (op0);
   tree op1_type = TREE_TYPE (op1);
@@ -3448,10 +3448,20 @@ verify_gimple_comparison (tree type, tree op0, tree op1)
       if (TREE_CODE (op0_type) == VECTOR_TYPE
 	  || TREE_CODE (op1_type) == VECTOR_TYPE)
         {
-          error ("vector comparison returning a boolean");
-          debug_generic_expr (op0_type);
-          debug_generic_expr (op1_type);
-          return true;
+	  /* Allow vector comparison returning boolean if operand types
+	     are equal and CODE is EQ/NE.  */
+	  if ((code != EQ_EXPR && code != NE_EXPR)
+	      || TREE_CODE (op0_type) != TREE_CODE (op1_type)
+	      || TYPE_VECTOR_SUBPARTS (op0_type)
+		 != TYPE_VECTOR_SUBPARTS (op1_type)
+	      || GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op0_type)))
+		 != GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op1_type))))
+	    {
+	      error ("type mismatch for vector comparison returning a boolean");
+	      debug_generic_expr (op0_type);
+	      debug_generic_expr (op1_type);
+	      return true;
+	    }
         }
     }
   /* Or a boolean vector type with the same element count
@@ -3832,7 +3842,7 @@ verify_gimple_assign_binary (gassign *stmt)
     case LTGT_EXPR:
       /* Comparisons are also binary, but the result type is not
 	 connected to the operand types.  */
-      return verify_gimple_comparison (lhs_type, rhs1, rhs2);
+      return verify_gimple_comparison (lhs_type, rhs1, rhs2, rhs_code);
 
     case WIDEN_MULT_EXPR:
       if (TREE_CODE (lhs_type) != INTEGER_TYPE)
@@ -4541,7 +4551,8 @@ verify_gimple_cond (gcond *stmt)
 
   return verify_gimple_comparison (boolean_type_node,
 				   gimple_cond_lhs (stmt),
-				   gimple_cond_rhs (stmt));
+				   gimple_cond_rhs (stmt),
+				   gimple_cond_code (stmt));
 }
 
 /* Verify the GIMPLE statement STMT.  Returns true if there is an
diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c
index b82ae3c..07c1fa7 100644
--- a/gcc/tree-ssa-forwprop.c
+++ b/gcc/tree-ssa-forwprop.c
@@ -422,6 +422,15 @@ forward_propagate_into_comparison_1 (gimple *stmt,
 	  enum tree_code def_code = gimple_assign_rhs_code (def_stmt);
 	  bool invariant_only_p = !single_use0_p;
 
+	  /* Can't combine vector comparison with scalar boolean type of
+	     the result and VEC_COND_EXPR having vector type of comparison.  */
+	  if (TREE_CODE (TREE_TYPE (op0)) == VECTOR_TYPE
+	      && INTEGRAL_TYPE_P (type)
+	      && (TREE_CODE (type) == BOOLEAN_TYPE
+		  || TYPE_PRECISION (type) == 1)
+	      && def_code == VEC_COND_EXPR)
+	    return NULL_TREE;
+
 	  rhs0 = rhs_to_tree (TREE_TYPE (op1), def_stmt);
 
 	  /* Always combine comparisons or conversions from booleans.  */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 43ada18..f202d98 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -6566,3 +6566,170 @@ vect_transform_loop (loop_vec_info loop_vinfo)
       dump_printf (MSG_NOTE, "\n");
     }
 }
+
+/* Helper for optimize_mask_stores: returns true if STMT sinking to end
+   of BB is valid and false otherwise.  */
+
+static bool
+is_valid_sink (gimple *stmt, gimple *last_store)
+{
+  tree vdef;
+  imm_use_iterator imm_it;
+  use_operand_p use_p;
+  basic_block bb = gimple_bb (stmt);
+
+  if (is_gimple_call (stmt)
+      && !gimple_call_internal_p (stmt))
+    /* Do not consider non-internal call as valid to sink.  */
+    return false;
+
+  if ((vdef = gimple_vdef (stmt)))
+    {
+      /* Check that ther are no store vuses in current bb.  */
+      FOR_EACH_IMM_USE_FAST (use_p, imm_it, vdef)
+	if (gimple_bb (USE_STMT (use_p)) == bb)
+	  return false;
+      return true;
+    }
+  else if (gimple_vuse (stmt) == NULL_TREE)
+    return true;
+  else if (gimple_vuse (stmt) == gimple_vuse (last_store))
+    return true;
+  return false;
+}
+
+/* The code below is trying to perform simple optimization - do not execute
+   masked store statement if its mask is zero vector since loads that follow
+   a masked store can be blocked.  It puts all masked stores with the same
+   mask-vector into the new bb with a check on zero mask.  */
+
+void
+optimize_mask_stores (struct loop *loop)
+{
+  basic_block bb = loop->header;
+  gimple_stmt_iterator gsi;
+  gimple *stmt;
+  auto_vec<gimple *> worklist;
+
+  if (ENABLE_ZERO_TEST_FOR_MASK_STORE == 0)
+    return;
+
+  /* Pick up all masked stores in loop if any.  */
+  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      stmt = gsi_stmt (gsi);
+      if (is_gimple_call (stmt)
+	  && gimple_call_internal_p (stmt)
+	  && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
+	worklist.safe_push (stmt);
+    }
+
+  if (worklist.is_empty ())
+    return;
+
+  /* Loop has masked stores.  */
+  while (!worklist.is_empty ())
+    {
+      gimple *last, *def_stmt, *last_store;
+      edge e, efalse;
+      tree mask;
+      basic_block store_bb, join_bb;
+      gimple_stmt_iterator gsi_to;
+      tree arg3;
+      tree vdef, new_vdef;
+      gphi *phi;
+      bool first_dump;
+      tree vectype;
+      tree zero;
+
+      last = worklist.pop ();
+      mask = gimple_call_arg (last, 2);
+      /* Create new bb.  */
+      e = split_block (bb, last);
+      join_bb = e->dest;
+      store_bb = create_empty_bb (bb);
+      add_bb_to_loop (store_bb, loop);
+      e->flags = EDGE_TRUE_VALUE;
+      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
+      /* Put STORE_BB to likely part.  */
+      efalse->probability = PROB_UNLIKELY;
+      store_bb->frequency = PROB_ALWAYS - EDGE_FREQUENCY (efalse);
+      make_edge (store_bb, join_bb, EDGE_FALLTHRU);
+      if (dom_info_available_p (CDI_DOMINATORS))
+	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
+      /* Create vector comparison with boolean result.  */
+      vectype = TREE_TYPE (mask);
+      zero = build_zero_cst (TREE_TYPE (vectype));
+      zero = build_vector_from_val (vectype, zero);
+      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
+      gsi = gsi_last_bb (bb);
+      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
+      /* Create new PHI node for vdef of the last masked store:
+	 .MEM_2 = VDEF <.MEM_1>
+	 will be converted to
+	 .MEM.3 = VDEF <.MEM_1>
+	 and new PHI node will be created in join bb
+	 .MEM_2 = PHI <.MEM_1, .MEM_3>
+      */
+      vdef = gimple_vdef (last);
+      gcc_assert (vdef && TREE_CODE (vdef) == SSA_NAME);
+      new_vdef = make_ssa_name (gimple_vop (cfun), last);
+      phi = create_phi_node (vdef, join_bb);
+      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
+      gimple_set_vdef (last, new_vdef);
+      first_dump = true;
+
+      /* Put all masked stores with the same mask to STORE_BB if possible.  */
+      while (true)
+	{
+	  /* Move masked store to STORE_BB.  */
+	  last_store = last;
+	  gsi = gsi_for_stmt (last);
+	  gsi_to = gsi_start_bb (store_bb);
+	  gsi_move_before (&gsi, &gsi_to);
+	  update_stmt (last);
+	  if (dump_enabled_p ())
+	    {
+	      /* Issue different messages depending on FIRST_DUMP.  */
+	      if (first_dump)
+		{
+		  dump_printf (MSG_NOTE, "Move MASK_STORE to new bb#%d\n",
+			       store_bb->index);
+		  first_dump = false;
+		}
+	      else
+		dump_printf (MSG_NOTE, "Move MASK_STORE to created bb\n");
+	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
+	    }
+	  /* Put definition statement of stored value in STORE_BB
+	     if possible.  */
+	  arg3 = gimple_call_arg (last, 3);
+	  if (TREE_CODE (arg3) == SSA_NAME && has_single_use (arg3))
+	    {
+	      def_stmt = SSA_NAME_DEF_STMT (arg3);
+	      /* Move def_stmt to STORE_BB if it is in the same bb and
+		 it is legal.  */
+	      if (gimple_bb (def_stmt) == bb
+		  && is_valid_sink (def_stmt, last_store))
+		{
+		  if (dump_enabled_p ())
+		    {
+		      dump_printf (MSG_NOTE, "Move stmt to created bb\n");
+		      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
+		    }
+		  gsi = gsi_for_stmt (def_stmt);
+		  gsi_to = gsi_start_bb (store_bb);
+		  gsi_move_before (&gsi, &gsi_to);
+		  update_stmt (def_stmt);
+		}
+	    }
+	    /* Put other masked stores with the same mask to STORE_BB.  */
+	    if (worklist.is_empty ()
+		|| gimple_call_arg (worklist.last (), 2) != mask
+		|| !is_valid_sink (worklist.last (), last_store))
+	      break;
+	    last = worklist.pop ();
+	}
+      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
+    }
+}
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index ae14075..f8c1e6d 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1968,6 +1968,7 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
     {
       tree vec_rhs = NULL_TREE, vec_mask = NULL_TREE;
       prev_stmt_info = NULL;
+      LOOP_VINFO_HAS_MASK_STORE (loop_vinfo) = true;
       for (i = 0; i < ncopies; i++)
 	{
 	  unsigned align, misalign;
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 7b3d9a3..383b01b 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -556,12 +556,18 @@ vectorize_loops (void)
   for (i = 1; i < vect_loops_num; i++)
     {
       loop_vec_info loop_vinfo;
+      bool has_mask_store;
 
       loop = get_loop (cfun, i);
       if (!loop)
 	continue;
       loop_vinfo = (loop_vec_info) loop->aux;
+      has_mask_store = false;
+      if (loop_vinfo)
+	has_mask_store = LOOP_VINFO_HAS_MASK_STORE (loop_vinfo);
       destroy_loop_vec_info (loop_vinfo, true);
+      if (has_mask_store)
+	optimize_mask_stores (loop);
       loop->aux = NULL;
     }
 
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index f77a4eb..9d06752 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -328,6 +328,9 @@ typedef struct _loop_vec_info : public vec_info {
      loop version without if-conversion.  */
   struct loop *scalar_loop;
 
+  /* Mark loops having masked stores.  */
+  bool has_mask_store;
+
 } *loop_vec_info;
 
 /* Access Functions.  */
@@ -365,6 +368,7 @@ typedef struct _loop_vec_info : public vec_info {
 #define LOOP_VINFO_SCALAR_LOOP(L)	   (L)->scalar_loop
 #define LOOP_VINFO_SCALAR_ITERATION_COST(L) (L)->scalar_cost_vec
 #define LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST(L) (L)->single_scalar_iteration_cost
+#define LOOP_VINFO_HAS_MASK_STORE(L)      (L)->has_mask_store
 
 #define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \
   ((L)->may_misalign_stmts.length () > 0)
@@ -994,6 +998,7 @@ extern void vect_get_vec_defs (tree, tree, gimple *, vec<tree> *,
 			       vec<tree> *, slp_tree, int);
 extern tree vect_gen_perm_mask_any (tree, const unsigned char *);
 extern tree vect_gen_perm_mask_checked (tree, const unsigned char *);
+extern void optimize_mask_stores (struct loop *);
 
 /* In tree-vect-data-refs.c.  */
 extern bool vect_can_force_dr_alignment_p (const_tree, unsigned int);

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-11-02 15:24                           ` Yuri Rumyantsev
  2015-11-05 15:49                             ` Yuri Rumyantsev
@ 2015-11-06 12:56                             ` Richard Biener
  2015-11-06 13:29                               ` Yuri Rumyantsev
  1 sibling, 1 reply; 33+ messages in thread
From: Richard Biener @ 2015-11-06 12:56 UTC (permalink / raw)
  To: Yuri Rumyantsev; +Cc: Jeff Law, gcc-patches, Igor Zamyatin

On Mon, Nov 2, 2015 at 4:24 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
> Hi Richard,
>
> I've come back to this optimization and try to implement your proposal
> for comparison:
>> Btw, you didn't try the simpler alternative of
>>
>> tree type = type_for_mode (int_mode_for_mode (TYPE_MODE (vectype)));
>> build2 (EQ_EXPR, boolean_type_node,
>>  build1 (VIEW_CONVERT, type, op0), build1 (VIEW_CONVERT, type, op1));
>>
>> ?  That is, use the GIMPLE level equivalent of
>>  (cmp (subreg:TI reg:V4SI) (subreg:TI reg:V4SI))
>
> using the following code:
>
>       vectype = TREE_TYPE (mask);
>       ext_mode = mode_for_size (GET_MODE_BITSIZE (TYPE_MODE (vectype)),
> MODE_INT, 0);
>       ext_type = lang_hooks.types.type_for_mode (ext_mode , 1);
>
> but I've got zero type for it. Should I miss something?

Use ext_type = build_nonstandard_integer_type (GET_MODE_PRECISION
(ext_mode), 1);

Richard.

> Any help will be appreciated.
> Yuri.
>
>
> 2015-08-13 14:40 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>> On Thu, Aug 13, 2015 at 1:32 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>> Hi Richard,
>>>
>>> Did you have a chance to look at updated patch?
>>
>> Having a quick look now.  Btw, you didn't try the simpler alternative of
>>
>>  tree type = type_for_mode (int_mode_for_mode (TYPE_MODE (vectype)));
>>  build2 (EQ_EXPR, boolean_type_node,
>>    build1 (VIEW_CONVERT, type, op0), build1 (VIEW_CONVERT, type, op1));
>>
>> ?  That is, use the GIMPLE level equivalent of
>>
>>  (cmp (subreg:TI reg:V4SI) (subreg:TI reg:V4SI))
>>
>> ?  That should be supported by the expander already, though again not sure if
>> the target(s) have compares that match this.
>>
>> Btw, the tree-cfg.c hook wasn't what was agreed on - the restriction
>> on EQ/NE_EXPR
>> is missing.  Operand type equality is tested anyway.
>>
>> Why do you need to restrict forward_propagate_into_comparison_1?
>>
>> Otherwise this looks better, but can you try with the VIEW_CONVERT as well?
>>
>> Thanks,
>> Richard.
>>
>>
>>> Thanks.
>>> Yuri.
>>>
>>> 2015-08-06 14:07 GMT+03:00 Yuri Rumyantsev <ysrumyan@gmail.com>:
>>>> HI All,
>>>>
>>>> Here is updated patch which implements Richard proposal to use vector
>>>> comparison with boolean result instead of target hook. Support for it
>>>> was added to ix86_expand_branch.
>>>>
>>>> Any comments will be appreciated.
>>>>
>>>> Bootstrap and regression testing did not show any new failures.
>>>>
>>>> ChangeLog:
>>>> 2015-08-06  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>>>
>>>> * config/i386/i386.c (ix86_expand_branch): Implement vector
>>>> comparison with boolean result.
>>>> * config/i386/sse.md (define_expand "cbranch<mode>4): Add define
>>>> for vector comparion.
>>>> * fold-const.c (fold_relational_const): Add handling of vector
>>>> comparison with boolean result.
>>>> * params.def (PARAM_ZERO_TEST_FOR_STORE_MASK): New DEFPARAM.
>>>> * params.h (ENABLE_ZERO_TEST_FOR_STORE_MASK): new macros.
>>>> * tree-cfg.c (verify_gimple_comparison): Add test for vector
>>>> comparion with boolean result.
>>>> * tree-ssa-forwprop.c (forward_propagate_into_comparison_1): Do not
>>>> propagate vector comparion with boolean result.
>>>> * tree-vect-stmts.c (vectorizable_mask_load_store): Initialize
>>>> has_mask_store field of vect_info.
>>>> * tree-vectorizer.c: Include files ssa.h, cfghooks.h and params.h.
>>>> (is_valid_sink): New function.
>>>> (optimize_mask_stores): New function.
>>>> (vectorize_loops): Invoke optimaze_mask_stores for loops having masked
>>>> stores.
>>>> * tree-vectorizer.h (loop_vec_info): Add new has_mask_store field and
>>>> correspondent macros.
>>>>
>>>> gcc/testsuite/ChangeLog:
>>>> * gcc.target/i386/avx2-vect-mask-store-move1.c: New test.
>>>>
>>>>
>>>> 2015-07-27 11:48 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>> On Fri, Jul 24, 2015 at 9:11 PM, Jeff Law <law@redhat.com> wrote:
>>>>>> On 07/24/2015 03:16 AM, Richard Biener wrote:
>>>>>>>>
>>>>>>>> Is there any rationale given anywhere for the transformation into
>>>>>>>> conditional expressions?  ie, is there any reason why we can't have a
>>>>>>>> GIMPLE_COND where the expression is a vector condition?
>>>>>>>
>>>>>>>
>>>>>>> No rationale for equality compare which would have the semantic of
>>>>>>> having all elements equal or not equal.  But you can't define a sensible
>>>>>>> ordering (that HW implements) for other compare operators and you
>>>>>>> obviously need a single boolean result, not a vector of element comparison
>>>>>>> results.
>>>>>>
>>>>>> Right.  EQ/NE only as others just don't have any real meaning.
>>>>>>
>>>>>>
>>>>>>> I've already replied that I'm fine allowing ==/!= whole-vector compares.
>>>>>>> But one needs to check whether expansion does anything sensible
>>>>>>> with them (either expand to integer subreg compares or add optabs
>>>>>>> for the compares).
>>>>>>
>>>>>> Agreed, EQ/NE for whole vector compares only would be fine for me too under
>>>>>> the same conditions.
>>>>>
>>>>> Btw, you can already do this on GIMPLE by doing
>>>>>
>>>>>   TImode vec_as_int = VIEW_CONVERT_EXPR <TImode> (vec_2);
>>>>>   if (vec_as_int == 0)
>>>>> ...
>>>>>
>>>>> which is what the RTL will look like in the end.  So not sure if making this
>>>>> higher-level in GIMPLE is good or required.
>>>>>
>>>>> Richard.
>>>>>
>>>>>> jeff

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-11-06 12:56                             ` Richard Biener
@ 2015-11-06 13:29                               ` Yuri Rumyantsev
  2015-11-10 12:33                                 ` Richard Biener
  0 siblings, 1 reply; 33+ messages in thread
From: Yuri Rumyantsev @ 2015-11-06 13:29 UTC (permalink / raw)
  To: Richard Biener; +Cc: Jeff Law, gcc-patches, Igor Zamyatin

Richard,

I tried it but 256-bit precision integer type is not yet supported.

Yuri.


2015-11-06 15:56 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
> On Mon, Nov 2, 2015 at 4:24 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>> Hi Richard,
>>
>> I've come back to this optimization and try to implement your proposal
>> for comparison:
>>> Btw, you didn't try the simpler alternative of
>>>
>>> tree type = type_for_mode (int_mode_for_mode (TYPE_MODE (vectype)));
>>> build2 (EQ_EXPR, boolean_type_node,
>>>  build1 (VIEW_CONVERT, type, op0), build1 (VIEW_CONVERT, type, op1));
>>>
>>> ?  That is, use the GIMPLE level equivalent of
>>>  (cmp (subreg:TI reg:V4SI) (subreg:TI reg:V4SI))
>>
>> using the following code:
>>
>>       vectype = TREE_TYPE (mask);
>>       ext_mode = mode_for_size (GET_MODE_BITSIZE (TYPE_MODE (vectype)),
>> MODE_INT, 0);
>>       ext_type = lang_hooks.types.type_for_mode (ext_mode , 1);
>>
>> but I've got zero type for it. Should I miss something?
>
> Use ext_type = build_nonstandard_integer_type (GET_MODE_PRECISION
> (ext_mode), 1);
>
> Richard.
>
>> Any help will be appreciated.
>> Yuri.
>>
>>
>> 2015-08-13 14:40 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>> On Thu, Aug 13, 2015 at 1:32 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>> Hi Richard,
>>>>
>>>> Did you have a chance to look at updated patch?
>>>
>>> Having a quick look now.  Btw, you didn't try the simpler alternative of
>>>
>>>  tree type = type_for_mode (int_mode_for_mode (TYPE_MODE (vectype)));
>>>  build2 (EQ_EXPR, boolean_type_node,
>>>    build1 (VIEW_CONVERT, type, op0), build1 (VIEW_CONVERT, type, op1));
>>>
>>> ?  That is, use the GIMPLE level equivalent of
>>>
>>>  (cmp (subreg:TI reg:V4SI) (subreg:TI reg:V4SI))
>>>
>>> ?  That should be supported by the expander already, though again not sure if
>>> the target(s) have compares that match this.
>>>
>>> Btw, the tree-cfg.c hook wasn't what was agreed on - the restriction
>>> on EQ/NE_EXPR
>>> is missing.  Operand type equality is tested anyway.
>>>
>>> Why do you need to restrict forward_propagate_into_comparison_1?
>>>
>>> Otherwise this looks better, but can you try with the VIEW_CONVERT as well?
>>>
>>> Thanks,
>>> Richard.
>>>
>>>
>>>> Thanks.
>>>> Yuri.
>>>>
>>>> 2015-08-06 14:07 GMT+03:00 Yuri Rumyantsev <ysrumyan@gmail.com>:
>>>>> HI All,
>>>>>
>>>>> Here is updated patch which implements Richard proposal to use vector
>>>>> comparison with boolean result instead of target hook. Support for it
>>>>> was added to ix86_expand_branch.
>>>>>
>>>>> Any comments will be appreciated.
>>>>>
>>>>> Bootstrap and regression testing did not show any new failures.
>>>>>
>>>>> ChangeLog:
>>>>> 2015-08-06  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>>>>
>>>>> * config/i386/i386.c (ix86_expand_branch): Implement vector
>>>>> comparison with boolean result.
>>>>> * config/i386/sse.md (define_expand "cbranch<mode>4): Add define
>>>>> for vector comparion.
>>>>> * fold-const.c (fold_relational_const): Add handling of vector
>>>>> comparison with boolean result.
>>>>> * params.def (PARAM_ZERO_TEST_FOR_STORE_MASK): New DEFPARAM.
>>>>> * params.h (ENABLE_ZERO_TEST_FOR_STORE_MASK): new macros.
>>>>> * tree-cfg.c (verify_gimple_comparison): Add test for vector
>>>>> comparion with boolean result.
>>>>> * tree-ssa-forwprop.c (forward_propagate_into_comparison_1): Do not
>>>>> propagate vector comparion with boolean result.
>>>>> * tree-vect-stmts.c (vectorizable_mask_load_store): Initialize
>>>>> has_mask_store field of vect_info.
>>>>> * tree-vectorizer.c: Include files ssa.h, cfghooks.h and params.h.
>>>>> (is_valid_sink): New function.
>>>>> (optimize_mask_stores): New function.
>>>>> (vectorize_loops): Invoke optimaze_mask_stores for loops having masked
>>>>> stores.
>>>>> * tree-vectorizer.h (loop_vec_info): Add new has_mask_store field and
>>>>> correspondent macros.
>>>>>
>>>>> gcc/testsuite/ChangeLog:
>>>>> * gcc.target/i386/avx2-vect-mask-store-move1.c: New test.
>>>>>
>>>>>
>>>>> 2015-07-27 11:48 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>> On Fri, Jul 24, 2015 at 9:11 PM, Jeff Law <law@redhat.com> wrote:
>>>>>>> On 07/24/2015 03:16 AM, Richard Biener wrote:
>>>>>>>>>
>>>>>>>>> Is there any rationale given anywhere for the transformation into
>>>>>>>>> conditional expressions?  ie, is there any reason why we can't have a
>>>>>>>>> GIMPLE_COND where the expression is a vector condition?
>>>>>>>>
>>>>>>>>
>>>>>>>> No rationale for equality compare which would have the semantic of
>>>>>>>> having all elements equal or not equal.  But you can't define a sensible
>>>>>>>> ordering (that HW implements) for other compare operators and you
>>>>>>>> obviously need a single boolean result, not a vector of element comparison
>>>>>>>> results.
>>>>>>>
>>>>>>> Right.  EQ/NE only as others just don't have any real meaning.
>>>>>>>
>>>>>>>
>>>>>>>> I've already replied that I'm fine allowing ==/!= whole-vector compares.
>>>>>>>> But one needs to check whether expansion does anything sensible
>>>>>>>> with them (either expand to integer subreg compares or add optabs
>>>>>>>> for the compares).
>>>>>>>
>>>>>>> Agreed, EQ/NE for whole vector compares only would be fine for me too under
>>>>>>> the same conditions.
>>>>>>
>>>>>> Btw, you can already do this on GIMPLE by doing
>>>>>>
>>>>>>   TImode vec_as_int = VIEW_CONVERT_EXPR <TImode> (vec_2);
>>>>>>   if (vec_as_int == 0)
>>>>>> ...
>>>>>>
>>>>>> which is what the RTL will look like in the end.  So not sure if making this
>>>>>> higher-level in GIMPLE is good or required.
>>>>>>
>>>>>> Richard.
>>>>>>
>>>>>>> jeff

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-11-06 13:29                               ` Yuri Rumyantsev
@ 2015-11-10 12:33                                 ` Richard Biener
  2015-11-10 12:48                                   ` Ilya Enkovich
  0 siblings, 1 reply; 33+ messages in thread
From: Richard Biener @ 2015-11-10 12:33 UTC (permalink / raw)
  To: Yuri Rumyantsev; +Cc: Jeff Law, gcc-patches, Igor Zamyatin

On Fri, Nov 6, 2015 at 2:28 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
> Richard,
>
> I tried it but 256-bit precision integer type is not yet supported.

What's the symptom?  The compare cannot be expanded?  Just add a pattern then.
After all we have modes up to XImode.

Richard.

> Yuri.
>
>
> 2015-11-06 15:56 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>> On Mon, Nov 2, 2015 at 4:24 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>> Hi Richard,
>>>
>>> I've come back to this optimization and try to implement your proposal
>>> for comparison:
>>>> Btw, you didn't try the simpler alternative of
>>>>
>>>> tree type = type_for_mode (int_mode_for_mode (TYPE_MODE (vectype)));
>>>> build2 (EQ_EXPR, boolean_type_node,
>>>>  build1 (VIEW_CONVERT, type, op0), build1 (VIEW_CONVERT, type, op1));
>>>>
>>>> ?  That is, use the GIMPLE level equivalent of
>>>>  (cmp (subreg:TI reg:V4SI) (subreg:TI reg:V4SI))
>>>
>>> using the following code:
>>>
>>>       vectype = TREE_TYPE (mask);
>>>       ext_mode = mode_for_size (GET_MODE_BITSIZE (TYPE_MODE (vectype)),
>>> MODE_INT, 0);
>>>       ext_type = lang_hooks.types.type_for_mode (ext_mode , 1);
>>>
>>> but I've got zero type for it. Should I miss something?
>>
>> Use ext_type = build_nonstandard_integer_type (GET_MODE_PRECISION
>> (ext_mode), 1);
>>
>> Richard.
>>
>>> Any help will be appreciated.
>>> Yuri.
>>>
>>>
>>> 2015-08-13 14:40 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>> On Thu, Aug 13, 2015 at 1:32 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>> Hi Richard,
>>>>>
>>>>> Did you have a chance to look at updated patch?
>>>>
>>>> Having a quick look now.  Btw, you didn't try the simpler alternative of
>>>>
>>>>  tree type = type_for_mode (int_mode_for_mode (TYPE_MODE (vectype)));
>>>>  build2 (EQ_EXPR, boolean_type_node,
>>>>    build1 (VIEW_CONVERT, type, op0), build1 (VIEW_CONVERT, type, op1));
>>>>
>>>> ?  That is, use the GIMPLE level equivalent of
>>>>
>>>>  (cmp (subreg:TI reg:V4SI) (subreg:TI reg:V4SI))
>>>>
>>>> ?  That should be supported by the expander already, though again not sure if
>>>> the target(s) have compares that match this.
>>>>
>>>> Btw, the tree-cfg.c hook wasn't what was agreed on - the restriction
>>>> on EQ/NE_EXPR
>>>> is missing.  Operand type equality is tested anyway.
>>>>
>>>> Why do you need to restrict forward_propagate_into_comparison_1?
>>>>
>>>> Otherwise this looks better, but can you try with the VIEW_CONVERT as well?
>>>>
>>>> Thanks,
>>>> Richard.
>>>>
>>>>
>>>>> Thanks.
>>>>> Yuri.
>>>>>
>>>>> 2015-08-06 14:07 GMT+03:00 Yuri Rumyantsev <ysrumyan@gmail.com>:
>>>>>> HI All,
>>>>>>
>>>>>> Here is updated patch which implements Richard proposal to use vector
>>>>>> comparison with boolean result instead of target hook. Support for it
>>>>>> was added to ix86_expand_branch.
>>>>>>
>>>>>> Any comments will be appreciated.
>>>>>>
>>>>>> Bootstrap and regression testing did not show any new failures.
>>>>>>
>>>>>> ChangeLog:
>>>>>> 2015-08-06  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>>>>>
>>>>>> * config/i386/i386.c (ix86_expand_branch): Implement vector
>>>>>> comparison with boolean result.
>>>>>> * config/i386/sse.md (define_expand "cbranch<mode>4): Add define
>>>>>> for vector comparion.
>>>>>> * fold-const.c (fold_relational_const): Add handling of vector
>>>>>> comparison with boolean result.
>>>>>> * params.def (PARAM_ZERO_TEST_FOR_STORE_MASK): New DEFPARAM.
>>>>>> * params.h (ENABLE_ZERO_TEST_FOR_STORE_MASK): new macros.
>>>>>> * tree-cfg.c (verify_gimple_comparison): Add test for vector
>>>>>> comparion with boolean result.
>>>>>> * tree-ssa-forwprop.c (forward_propagate_into_comparison_1): Do not
>>>>>> propagate vector comparion with boolean result.
>>>>>> * tree-vect-stmts.c (vectorizable_mask_load_store): Initialize
>>>>>> has_mask_store field of vect_info.
>>>>>> * tree-vectorizer.c: Include files ssa.h, cfghooks.h and params.h.
>>>>>> (is_valid_sink): New function.
>>>>>> (optimize_mask_stores): New function.
>>>>>> (vectorize_loops): Invoke optimaze_mask_stores for loops having masked
>>>>>> stores.
>>>>>> * tree-vectorizer.h (loop_vec_info): Add new has_mask_store field and
>>>>>> correspondent macros.
>>>>>>
>>>>>> gcc/testsuite/ChangeLog:
>>>>>> * gcc.target/i386/avx2-vect-mask-store-move1.c: New test.
>>>>>>
>>>>>>
>>>>>> 2015-07-27 11:48 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>>> On Fri, Jul 24, 2015 at 9:11 PM, Jeff Law <law@redhat.com> wrote:
>>>>>>>> On 07/24/2015 03:16 AM, Richard Biener wrote:
>>>>>>>>>>
>>>>>>>>>> Is there any rationale given anywhere for the transformation into
>>>>>>>>>> conditional expressions?  ie, is there any reason why we can't have a
>>>>>>>>>> GIMPLE_COND where the expression is a vector condition?
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> No rationale for equality compare which would have the semantic of
>>>>>>>>> having all elements equal or not equal.  But you can't define a sensible
>>>>>>>>> ordering (that HW implements) for other compare operators and you
>>>>>>>>> obviously need a single boolean result, not a vector of element comparison
>>>>>>>>> results.
>>>>>>>>
>>>>>>>> Right.  EQ/NE only as others just don't have any real meaning.
>>>>>>>>
>>>>>>>>
>>>>>>>>> I've already replied that I'm fine allowing ==/!= whole-vector compares.
>>>>>>>>> But one needs to check whether expansion does anything sensible
>>>>>>>>> with them (either expand to integer subreg compares or add optabs
>>>>>>>>> for the compares).
>>>>>>>>
>>>>>>>> Agreed, EQ/NE for whole vector compares only would be fine for me too under
>>>>>>>> the same conditions.
>>>>>>>
>>>>>>> Btw, you can already do this on GIMPLE by doing
>>>>>>>
>>>>>>>   TImode vec_as_int = VIEW_CONVERT_EXPR <TImode> (vec_2);
>>>>>>>   if (vec_as_int == 0)
>>>>>>> ...
>>>>>>>
>>>>>>> which is what the RTL will look like in the end.  So not sure if making this
>>>>>>> higher-level in GIMPLE is good or required.
>>>>>>>
>>>>>>> Richard.
>>>>>>>
>>>>>>>> jeff

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-11-10 12:33                                 ` Richard Biener
@ 2015-11-10 12:48                                   ` Ilya Enkovich
  2015-11-10 14:46                                     ` Richard Biener
  0 siblings, 1 reply; 33+ messages in thread
From: Ilya Enkovich @ 2015-11-10 12:48 UTC (permalink / raw)
  To: Richard Biener; +Cc: Yuri Rumyantsev, Jeff Law, gcc-patches, Igor Zamyatin

2015-11-10 15:33 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
> On Fri, Nov 6, 2015 at 2:28 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>> Richard,
>>
>> I tried it but 256-bit precision integer type is not yet supported.
>
> What's the symptom?  The compare cannot be expanded?  Just add a pattern then.
> After all we have modes up to XImode.

I suppose problem may be in:

gcc/config/i386/i386-modes.def:#define MAX_BITSIZE_MODE_ANY_INT (128)

which doesn't allow to create constants of bigger size.  Changing it
to maximum vector size (512) would mean we increase wide_int structure
size significantly. New patterns are probably also needed.

Ilya

>
> Richard.
>
>> Yuri.
>>
>>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-11-10 12:48                                   ` Ilya Enkovich
@ 2015-11-10 14:46                                     ` Richard Biener
  2015-11-10 14:56                                       ` Ilya Enkovich
  0 siblings, 1 reply; 33+ messages in thread
From: Richard Biener @ 2015-11-10 14:46 UTC (permalink / raw)
  To: Ilya Enkovich; +Cc: Yuri Rumyantsev, Jeff Law, gcc-patches, Igor Zamyatin

On Tue, Nov 10, 2015 at 1:48 PM, Ilya Enkovich <enkovich.gnu@gmail.com> wrote:
> 2015-11-10 15:33 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>> On Fri, Nov 6, 2015 at 2:28 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>> Richard,
>>>
>>> I tried it but 256-bit precision integer type is not yet supported.
>>
>> What's the symptom?  The compare cannot be expanded?  Just add a pattern then.
>> After all we have modes up to XImode.
>
> I suppose problem may be in:
>
> gcc/config/i386/i386-modes.def:#define MAX_BITSIZE_MODE_ANY_INT (128)
>
> which doesn't allow to create constants of bigger size.  Changing it
> to maximum vector size (512) would mean we increase wide_int structure
> size significantly. New patterns are probably also needed.

Yes, new patterns are needed but wide-int should be fine (we only need to create
a literal zero AFACS).  The "new pattern" would be equality/inequality
against zero
compares only.

Richard.

> Ilya
>
>>
>> Richard.
>>
>>> Yuri.
>>>
>>>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-11-10 14:46                                     ` Richard Biener
@ 2015-11-10 14:56                                       ` Ilya Enkovich
  2015-11-10 17:02                                         ` Mike Stump
  2015-11-11  9:18                                         ` Richard Biener
  0 siblings, 2 replies; 33+ messages in thread
From: Ilya Enkovich @ 2015-11-10 14:56 UTC (permalink / raw)
  To: Richard Biener; +Cc: Yuri Rumyantsev, Jeff Law, gcc-patches, Igor Zamyatin

2015-11-10 17:46 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
> On Tue, Nov 10, 2015 at 1:48 PM, Ilya Enkovich <enkovich.gnu@gmail.com> wrote:
>> 2015-11-10 15:33 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>> On Fri, Nov 6, 2015 at 2:28 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>> Richard,
>>>>
>>>> I tried it but 256-bit precision integer type is not yet supported.
>>>
>>> What's the symptom?  The compare cannot be expanded?  Just add a pattern then.
>>> After all we have modes up to XImode.
>>
>> I suppose problem may be in:
>>
>> gcc/config/i386/i386-modes.def:#define MAX_BITSIZE_MODE_ANY_INT (128)
>>
>> which doesn't allow to create constants of bigger size.  Changing it
>> to maximum vector size (512) would mean we increase wide_int structure
>> size significantly. New patterns are probably also needed.
>
> Yes, new patterns are needed but wide-int should be fine (we only need to create
> a literal zero AFACS).  The "new pattern" would be equality/inequality
> against zero
> compares only.

Currently 256bit integer creation fails because wide_int for max and
min values cannot be created.
It is fixed by increasing MAX_BITSIZE_MODE_ANY_INT, but it increases
WIDE_INT_MAX_ELTS
and thus increases wide_int structure. If we use 512 for
MAX_BITSIZE_MODE_ANY_INT then
wide_int structure would grow by 48 bytes (16 bytes if use 256 for
MAX_BITSIZE_MODE_ANY_INT).
Is it OK for such narrow usage?

Ilya

>
> Richard.
>
>> Ilya
>>
>>>
>>> Richard.
>>>
>>>> Yuri.
>>>>
>>>>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-11-10 14:56                                       ` Ilya Enkovich
@ 2015-11-10 17:02                                         ` Mike Stump
  2015-11-11  9:18                                         ` Richard Biener
  1 sibling, 0 replies; 33+ messages in thread
From: Mike Stump @ 2015-11-10 17:02 UTC (permalink / raw)
  To: Ilya Enkovich
  Cc: Richard Biener, Yuri Rumyantsev, Jeff Law, gcc-patches, Igor Zamyatin

On Nov 10, 2015, at 6:56 AM, Ilya Enkovich <enkovich.gnu@gmail.com> wrote:
> 2015-11-10 17:46 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>> On Tue, Nov 10, 2015 at 1:48 PM, Ilya Enkovich <enkovich.gnu@gmail.com> wrote:
>>> 2015-11-10 15:33 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>> On Fri, Nov 6, 2015 at 2:28 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>> Richard,
>>>>> 
>>>>> I tried it but 256-bit precision integer type is not yet supported.
>>>> 
>>>> What's the symptom?  The compare cannot be expanded?  Just add a pattern then.
>>>> After all we have modes up to XImode.
>>> 
>>> I suppose problem may be in:
>>> 
>>> gcc/config/i386/i386-modes.def:#define MAX_BITSIZE_MODE_ANY_INT (128)
>>> 
>>> which doesn't allow to create constants of bigger size.  Changing it
>>> to maximum vector size (512) would mean we increase wide_int structure
>>> size significantly. New patterns are probably also needed.
>> 
>> Yes, new patterns are needed but wide-int should be fine (we only need to create
>> a literal zero AFACS).  The "new pattern" would be equality/inequality
>> against zero
>> compares only.
> 
> Currently 256bit integer creation fails because wide_int for max and
> min values cannot be created.
> It is fixed by increasing MAX_BITSIZE_MODE_ANY_INT, but it increases
> WIDE_INT_MAX_ELTS
> and thus increases wide_int structure. If we use 512 for
> MAX_BITSIZE_MODE_ANY_INT then
> wide_int structure would grow by 48 bytes (16 bytes if use 256 for
> MAX_BITSIZE_MODE_ANY_INT).

Not answering for Richard, but the design of wide-int was that though the temporary space would grow, trees and rtl would not.  Most wide-int values are short lived.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-11-10 14:56                                       ` Ilya Enkovich
  2015-11-10 17:02                                         ` Mike Stump
@ 2015-11-11  9:18                                         ` Richard Biener
  2015-11-11 13:13                                           ` Yuri Rumyantsev
  1 sibling, 1 reply; 33+ messages in thread
From: Richard Biener @ 2015-11-11  9:18 UTC (permalink / raw)
  To: Ilya Enkovich; +Cc: Yuri Rumyantsev, Jeff Law, gcc-patches, Igor Zamyatin

On Tue, Nov 10, 2015 at 3:56 PM, Ilya Enkovich <enkovich.gnu@gmail.com> wrote:
> 2015-11-10 17:46 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>> On Tue, Nov 10, 2015 at 1:48 PM, Ilya Enkovich <enkovich.gnu@gmail.com> wrote:
>>> 2015-11-10 15:33 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>> On Fri, Nov 6, 2015 at 2:28 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>> Richard,
>>>>>
>>>>> I tried it but 256-bit precision integer type is not yet supported.
>>>>
>>>> What's the symptom?  The compare cannot be expanded?  Just add a pattern then.
>>>> After all we have modes up to XImode.
>>>
>>> I suppose problem may be in:
>>>
>>> gcc/config/i386/i386-modes.def:#define MAX_BITSIZE_MODE_ANY_INT (128)
>>>
>>> which doesn't allow to create constants of bigger size.  Changing it
>>> to maximum vector size (512) would mean we increase wide_int structure
>>> size significantly. New patterns are probably also needed.
>>
>> Yes, new patterns are needed but wide-int should be fine (we only need to create
>> a literal zero AFACS).  The "new pattern" would be equality/inequality
>> against zero
>> compares only.
>
> Currently 256bit integer creation fails because wide_int for max and
> min values cannot be created.

Hmm, indeed:

#1  0x000000000072dab5 in wi::extended_tree<192>::extended_tree (
    this=0x7fffffffd950, t=0x7ffff6a000b0)
    at /space/rguenther/src/svn/trunk/gcc/tree.h:5125
5125      gcc_checking_assert (TYPE_PRECISION (TREE_TYPE (t)) <= N);

but that's not that the constants fail to be created but

#5  0x00000000010d8828 in build_nonstandard_integer_type (precision=512,
    unsignedp=65) at /space/rguenther/src/svn/trunk/gcc/tree.c:8051
8051      if (tree_fits_uhwi_p (TYPE_MAX_VALUE (itype)))
(gdb) l
8046        fixup_unsigned_type (itype);
8047      else
8048        fixup_signed_type (itype);
8049
8050      ret = itype;
8051      if (tree_fits_uhwi_p (TYPE_MAX_VALUE (itype)))
8052        ret = type_hash_canon (tree_to_uhwi (TYPE_MAX_VALUE
(itype)), itype);

thus the integer type hashing being "interesting".  tree_fits_uhwi_p
fails because
it does

7289    bool
7290    tree_fits_uhwi_p (const_tree t)
7291    {
7292      return (t != NULL_TREE
7293              && TREE_CODE (t) == INTEGER_CST
7294              && wi::fits_uhwi_p (wi::to_widest (t)));
7295    }

and wi::to_widest () fails with doing

5121    template <int N>
5122    inline wi::extended_tree <N>::extended_tree (const_tree t)
5123      : m_t (t)
5124    {
5125      gcc_checking_assert (TYPE_PRECISION (TREE_TYPE (t)) <= N);
5126    }

fixing the hashing then runs into type_cache_hasher::equal doing
tree_int_cst_equal
which again uses to_widest (it should be easier and cheaper to do the compare on
the actual tree representation, but well, seems to be just the first
of various issues
we'd run into).

We eventually could fix the assert above (but then need to hope we assert
when a computation overflows the narrower precision of widest_int) or use
a special really_widest_int (ugh).

> It is fixed by increasing MAX_BITSIZE_MODE_ANY_INT, but it increases
> WIDE_INT_MAX_ELTS
> and thus increases wide_int structure. If we use 512 for
> MAX_BITSIZE_MODE_ANY_INT then
> wide_int structure would grow by 48 bytes (16 bytes if use 256 for
> MAX_BITSIZE_MODE_ANY_INT).
> Is it OK for such narrow usage?

widest_int is used in some long-living structures (which is the reason for
MAX_BITSIZE_MODE_ANY_INT in the first place).  So I don't think so.

Richard.

> Ilya
>
>>
>> Richard.
>>
>>> Ilya
>>>
>>>>
>>>> Richard.
>>>>
>>>>> Yuri.
>>>>>
>>>>>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-11-11  9:18                                         ` Richard Biener
@ 2015-11-11 13:13                                           ` Yuri Rumyantsev
  2015-11-12 13:59                                             ` Richard Biener
  0 siblings, 1 reply; 33+ messages in thread
From: Yuri Rumyantsev @ 2015-11-11 13:13 UTC (permalink / raw)
  To: Richard Biener; +Cc: Ilya Enkovich, Jeff Law, gcc-patches, Igor Zamyatin

Richard,

What we should do to cope with this problem (structure size increasing)?
Should we return to vector comparison version?

Thanks.
Yuri.

2015-11-11 12:18 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
> On Tue, Nov 10, 2015 at 3:56 PM, Ilya Enkovich <enkovich.gnu@gmail.com> wrote:
>> 2015-11-10 17:46 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>> On Tue, Nov 10, 2015 at 1:48 PM, Ilya Enkovich <enkovich.gnu@gmail.com> wrote:
>>>> 2015-11-10 15:33 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>> On Fri, Nov 6, 2015 at 2:28 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>> Richard,
>>>>>>
>>>>>> I tried it but 256-bit precision integer type is not yet supported.
>>>>>
>>>>> What's the symptom?  The compare cannot be expanded?  Just add a pattern then.
>>>>> After all we have modes up to XImode.
>>>>
>>>> I suppose problem may be in:
>>>>
>>>> gcc/config/i386/i386-modes.def:#define MAX_BITSIZE_MODE_ANY_INT (128)
>>>>
>>>> which doesn't allow to create constants of bigger size.  Changing it
>>>> to maximum vector size (512) would mean we increase wide_int structure
>>>> size significantly. New patterns are probably also needed.
>>>
>>> Yes, new patterns are needed but wide-int should be fine (we only need to create
>>> a literal zero AFACS).  The "new pattern" would be equality/inequality
>>> against zero
>>> compares only.
>>
>> Currently 256bit integer creation fails because wide_int for max and
>> min values cannot be created.
>
> Hmm, indeed:
>
> #1  0x000000000072dab5 in wi::extended_tree<192>::extended_tree (
>     this=0x7fffffffd950, t=0x7ffff6a000b0)
>     at /space/rguenther/src/svn/trunk/gcc/tree.h:5125
> 5125      gcc_checking_assert (TYPE_PRECISION (TREE_TYPE (t)) <= N);
>
> but that's not that the constants fail to be created but
>
> #5  0x00000000010d8828 in build_nonstandard_integer_type (precision=512,
>     unsignedp=65) at /space/rguenther/src/svn/trunk/gcc/tree.c:8051
> 8051      if (tree_fits_uhwi_p (TYPE_MAX_VALUE (itype)))
> (gdb) l
> 8046        fixup_unsigned_type (itype);
> 8047      else
> 8048        fixup_signed_type (itype);
> 8049
> 8050      ret = itype;
> 8051      if (tree_fits_uhwi_p (TYPE_MAX_VALUE (itype)))
> 8052        ret = type_hash_canon (tree_to_uhwi (TYPE_MAX_VALUE
> (itype)), itype);
>
> thus the integer type hashing being "interesting".  tree_fits_uhwi_p
> fails because
> it does
>
> 7289    bool
> 7290    tree_fits_uhwi_p (const_tree t)
> 7291    {
> 7292      return (t != NULL_TREE
> 7293              && TREE_CODE (t) == INTEGER_CST
> 7294              && wi::fits_uhwi_p (wi::to_widest (t)));
> 7295    }
>
> and wi::to_widest () fails with doing
>
> 5121    template <int N>
> 5122    inline wi::extended_tree <N>::extended_tree (const_tree t)
> 5123      : m_t (t)
> 5124    {
> 5125      gcc_checking_assert (TYPE_PRECISION (TREE_TYPE (t)) <= N);
> 5126    }
>
> fixing the hashing then runs into type_cache_hasher::equal doing
> tree_int_cst_equal
> which again uses to_widest (it should be easier and cheaper to do the compare on
> the actual tree representation, but well, seems to be just the first
> of various issues
> we'd run into).
>
> We eventually could fix the assert above (but then need to hope we assert
> when a computation overflows the narrower precision of widest_int) or use
> a special really_widest_int (ugh).
>
>> It is fixed by increasing MAX_BITSIZE_MODE_ANY_INT, but it increases
>> WIDE_INT_MAX_ELTS
>> and thus increases wide_int structure. If we use 512 for
>> MAX_BITSIZE_MODE_ANY_INT then
>> wide_int structure would grow by 48 bytes (16 bytes if use 256 for
>> MAX_BITSIZE_MODE_ANY_INT).
>> Is it OK for such narrow usage?
>
> widest_int is used in some long-living structures (which is the reason for
> MAX_BITSIZE_MODE_ANY_INT in the first place).  So I don't think so.
>
> Richard.
>
>> Ilya
>>
>>>
>>> Richard.
>>>
>>>> Ilya
>>>>
>>>>>
>>>>> Richard.
>>>>>
>>>>>> Yuri.
>>>>>>
>>>>>>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-11-11 13:13                                           ` Yuri Rumyantsev
@ 2015-11-12 13:59                                             ` Richard Biener
  2015-11-19 15:20                                               ` Yuri Rumyantsev
  0 siblings, 1 reply; 33+ messages in thread
From: Richard Biener @ 2015-11-12 13:59 UTC (permalink / raw)
  To: Yuri Rumyantsev; +Cc: Ilya Enkovich, Jeff Law, gcc-patches, Igor Zamyatin

On Wed, Nov 11, 2015 at 2:13 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
> Richard,
>
> What we should do to cope with this problem (structure size increasing)?
> Should we return to vector comparison version?

Ok, given this constraint I think the cleanest approach is to allow
integer(!) vector equality(!) compares with scalar result.  This should then
expand via cmp_optab and not via vec_cmp_optab.

On gimple you can then have

 if (mask_vec_1 != {0, 0, .... })
...

Note that a fallback expansion (for optabs.c to try) would be
the suggested view-conversion (aka, subreg) variant using
a same-sized integer mode.

Target maintainers can then choose what is a better fit for
their target (and instruction set as register set constraints may apply).

The patch you posted seems to do this but not restrict the compares
to integer ones (please do that).

       if (TREE_CODE (op0_type) == VECTOR_TYPE
          || TREE_CODE (op1_type) == VECTOR_TYPE)
         {
-          error ("vector comparison returning a boolean");
-          debug_generic_expr (op0_type);
-          debug_generic_expr (op1_type);
-          return true;
+         /* Allow vector comparison returning boolean if operand types
+            are equal and CODE is EQ/NE.  */
+         if ((code != EQ_EXPR && code != NE_EXPR)
+             || TREE_CODE (op0_type) != TREE_CODE (op1_type)
+             || TYPE_VECTOR_SUBPARTS (op0_type)
+                != TYPE_VECTOR_SUBPARTS (op1_type)
+             || GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op0_type)))
+                != GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op1_type))))

These are all checked with the useless_type_conversion_p checks done earlier.

As said I'd like to see a

                || ! VECTOR_INTEGER_TYPE_P (op0_type)

check added so we and targets do not need to worry about using EQ/NE vs. CMP
and worry about signed zeros and friends.

+           {
+             error ("type mismatch for vector comparison returning a boolean");
+             debug_generic_expr (op0_type);
+             debug_generic_expr (op1_type);
+             return true;
+           }



--- a/gcc/tree-ssa-forwprop.c
+++ b/gcc/tree-ssa-forwprop.c
@@ -422,6 +422,15 @@ forward_propagate_into_comparison_1 (gimple *stmt,
          enum tree_code def_code = gimple_assign_rhs_code (def_stmt);
          bool invariant_only_p = !single_use0_p;

+         /* Can't combine vector comparison with scalar boolean type of
+            the result and VEC_COND_EXPR having vector type of comparison.  */
+         if (TREE_CODE (TREE_TYPE (op0)) == VECTOR_TYPE
+             && INTEGRAL_TYPE_P (type)
+             && (TREE_CODE (type) == BOOLEAN_TYPE
+                 || TYPE_PRECISION (type) == 1)
+             && def_code == VEC_COND_EXPR)
+           return NULL_TREE;

this hints at larger fallout you paper over here.  So this effectively
means we're trying combining (vec1 != vec2) != 0 for example
and that fails miserably?  If so then the solution is to fix whatever
does not expect this (valid) GENERIC tree.

+  if (ENABLE_ZERO_TEST_FOR_MASK_STORE == 0)
+    return;

not sure if I like a param more than a target hook ... :/

+      /* Create vector comparison with boolean result.  */
+      vectype = TREE_TYPE (mask);
+      zero = build_zero_cst (TREE_TYPE (vectype));
+      zero = build_vector_from_val (vectype, zero);

build_zero_cst (vectype);

+      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);

you can omit the NULL_TREE operands.

+      gcc_assert (vdef && TREE_CODE (vdef) == SSA_NAME);

please omit the assert.

+      gimple_set_vdef (last, new_vdef);

do this before you create the PHI.

+         /* Put definition statement of stored value in STORE_BB
+            if possible.  */
+         arg3 = gimple_call_arg (last, 3);
+         if (TREE_CODE (arg3) == SSA_NAME && has_single_use (arg3))
+           {
...

is this really necessary?  It looks incomplete to me anyway.  I'd rather have
a late sink pass if this shows necessary.  Btw,...

+                it is legal.  */
+             if (gimple_bb (def_stmt) == bb
+                 && is_valid_sink (def_stmt, last_store))

with the implementation of is_valid_sink this is effectively

   && (!gimple_vuse (def_stmt)
          || gimple_vuse (def_stmt) == gimple_vdef (last_store))


I still think this "pass" is quite a hack, esp. as it appears as generic
code in a GIMPLE pass.  And esp. as this hack seems to be needed
for Haswell only, not Boradwell or Skylake.

Thanks,
Richard.

> Thanks.
> Yuri.
>
> 2015-11-11 12:18 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>> On Tue, Nov 10, 2015 at 3:56 PM, Ilya Enkovich <enkovich.gnu@gmail.com> wrote:
>>> 2015-11-10 17:46 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>> On Tue, Nov 10, 2015 at 1:48 PM, Ilya Enkovich <enkovich.gnu@gmail.com> wrote:
>>>>> 2015-11-10 15:33 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>>>>>> On Fri, Nov 6, 2015 at 2:28 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>>>>>> Richard,
>>>>>>>
>>>>>>> I tried it but 256-bit precision integer type is not yet supported.
>>>>>>
>>>>>> What's the symptom?  The compare cannot be expanded?  Just add a pattern then.
>>>>>> After all we have modes up to XImode.
>>>>>
>>>>> I suppose problem may be in:
>>>>>
>>>>> gcc/config/i386/i386-modes.def:#define MAX_BITSIZE_MODE_ANY_INT (128)
>>>>>
>>>>> which doesn't allow to create constants of bigger size.  Changing it
>>>>> to maximum vector size (512) would mean we increase wide_int structure
>>>>> size significantly. New patterns are probably also needed.
>>>>
>>>> Yes, new patterns are needed but wide-int should be fine (we only need to create
>>>> a literal zero AFACS).  The "new pattern" would be equality/inequality
>>>> against zero
>>>> compares only.
>>>
>>> Currently 256bit integer creation fails because wide_int for max and
>>> min values cannot be created.
>>
>> Hmm, indeed:
>>
>> #1  0x000000000072dab5 in wi::extended_tree<192>::extended_tree (
>>     this=0x7fffffffd950, t=0x7ffff6a000b0)
>>     at /space/rguenther/src/svn/trunk/gcc/tree.h:5125
>> 5125      gcc_checking_assert (TYPE_PRECISION (TREE_TYPE (t)) <= N);
>>
>> but that's not that the constants fail to be created but
>>
>> #5  0x00000000010d8828 in build_nonstandard_integer_type (precision=512,
>>     unsignedp=65) at /space/rguenther/src/svn/trunk/gcc/tree.c:8051
>> 8051      if (tree_fits_uhwi_p (TYPE_MAX_VALUE (itype)))
>> (gdb) l
>> 8046        fixup_unsigned_type (itype);
>> 8047      else
>> 8048        fixup_signed_type (itype);
>> 8049
>> 8050      ret = itype;
>> 8051      if (tree_fits_uhwi_p (TYPE_MAX_VALUE (itype)))
>> 8052        ret = type_hash_canon (tree_to_uhwi (TYPE_MAX_VALUE
>> (itype)), itype);
>>
>> thus the integer type hashing being "interesting".  tree_fits_uhwi_p
>> fails because
>> it does
>>
>> 7289    bool
>> 7290    tree_fits_uhwi_p (const_tree t)
>> 7291    {
>> 7292      return (t != NULL_TREE
>> 7293              && TREE_CODE (t) == INTEGER_CST
>> 7294              && wi::fits_uhwi_p (wi::to_widest (t)));
>> 7295    }
>>
>> and wi::to_widest () fails with doing
>>
>> 5121    template <int N>
>> 5122    inline wi::extended_tree <N>::extended_tree (const_tree t)
>> 5123      : m_t (t)
>> 5124    {
>> 5125      gcc_checking_assert (TYPE_PRECISION (TREE_TYPE (t)) <= N);
>> 5126    }
>>
>> fixing the hashing then runs into type_cache_hasher::equal doing
>> tree_int_cst_equal
>> which again uses to_widest (it should be easier and cheaper to do the compare on
>> the actual tree representation, but well, seems to be just the first
>> of various issues
>> we'd run into).
>>
>> We eventually could fix the assert above (but then need to hope we assert
>> when a computation overflows the narrower precision of widest_int) or use
>> a special really_widest_int (ugh).
>>
>>> It is fixed by increasing MAX_BITSIZE_MODE_ANY_INT, but it increases
>>> WIDE_INT_MAX_ELTS
>>> and thus increases wide_int structure. If we use 512 for
>>> MAX_BITSIZE_MODE_ANY_INT then
>>> wide_int structure would grow by 48 bytes (16 bytes if use 256 for
>>> MAX_BITSIZE_MODE_ANY_INT).
>>> Is it OK for such narrow usage?
>>
>> widest_int is used in some long-living structures (which is the reason for
>> MAX_BITSIZE_MODE_ANY_INT in the first place).  So I don't think so.
>>
>> Richard.
>>
>>> Ilya
>>>
>>>>
>>>> Richard.
>>>>
>>>>> Ilya
>>>>>
>>>>>>
>>>>>> Richard.
>>>>>>
>>>>>>> Yuri.
>>>>>>>
>>>>>>>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] Simple optimization for MASK_STORE.
  2015-11-12 13:59                                             ` Richard Biener
@ 2015-11-19 15:20                                               ` Yuri Rumyantsev
  0 siblings, 0 replies; 33+ messages in thread
From: Yuri Rumyantsev @ 2015-11-19 15:20 UTC (permalink / raw)
  To: Richard Biener; +Cc: Ilya Enkovich, Jeff Law, gcc-patches, Igor Zamyatin

[-- Attachment #1: Type: text/plain, Size: 8000 bytes --]

Hi Richard,

I send you updated version of patch which contains fixes you mentioned
and additional early exit in
register_edge_assert_for() for gcond with vector comparison - it tries
to produce assert for
  if (vect != {0,0,0,0}) but can't create such constant. This is not
essential since this is applied to very specialized context.

My answers are below.

2015-11-12 16:58 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
> On Wed, Nov 11, 2015 at 2:13 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>> Richard,
>>
>> What we should do to cope with this problem (structure size increasing)?
>> Should we return to vector comparison version?
>
> Ok, given this constraint I think the cleanest approach is to allow
> integer(!) vector equality(!) compares with scalar result.  This should then
> expand via cmp_optab and not via vec_cmp_optab.

  In fact it is expanded through cbranch_optab since the only use of
such comparison is for masked
store motion
>
> On gimple you can then have
>
>  if (mask_vec_1 != {0, 0, .... })
> ...
>
> Note that a fallback expansion (for optabs.c to try) would be
> the suggested view-conversion (aka, subreg) variant using
> a same-sized integer mode.
>
> Target maintainers can then choose what is a better fit for
> their target (and instruction set as register set constraints may apply).
>
> The patch you posted seems to do this but not restrict the compares
> to integer ones (please do that).
>
>        if (TREE_CODE (op0_type) == VECTOR_TYPE
>           || TREE_CODE (op1_type) == VECTOR_TYPE)
>          {
> -          error ("vector comparison returning a boolean");
> -          debug_generic_expr (op0_type);
> -          debug_generic_expr (op1_type);
> -          return true;
> +         /* Allow vector comparison returning boolean if operand types
> +            are equal and CODE is EQ/NE.  */
> +         if ((code != EQ_EXPR && code != NE_EXPR)
> +             || TREE_CODE (op0_type) != TREE_CODE (op1_type)
> +             || TYPE_VECTOR_SUBPARTS (op0_type)
> +                != TYPE_VECTOR_SUBPARTS (op1_type)
> +             || GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op0_type)))
> +                != GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op1_type))))
>
> These are all checked with the useless_type_conversion_p checks done earlier.
>
> As said I'd like to see a
>
>                 || ! VECTOR_INTEGER_TYPE_P (op0_type)

  I added check on VECTOR_BOOLEAN_TYPE_P (op0_type) instead since type
of mask was changed.
>
> check added so we and targets do not need to worry about using EQ/NE vs. CMP
> and worry about signed zeros and friends.
>
> +           {
> +             error ("type mismatch for vector comparison returning a boolean");
> +             debug_generic_expr (op0_type);
> +             debug_generic_expr (op1_type);
> +             return true;
> +           }
>
>
>
> --- a/gcc/tree-ssa-forwprop.c
> +++ b/gcc/tree-ssa-forwprop.c
> @@ -422,6 +422,15 @@ forward_propagate_into_comparison_1 (gimple *stmt,
>           enum tree_code def_code = gimple_assign_rhs_code (def_stmt);
>           bool invariant_only_p = !single_use0_p;
>
> +         /* Can't combine vector comparison with scalar boolean type of
> +            the result and VEC_COND_EXPR having vector type of comparison.  */
> +         if (TREE_CODE (TREE_TYPE (op0)) == VECTOR_TYPE
> +             && INTEGRAL_TYPE_P (type)
> +             && (TREE_CODE (type) == BOOLEAN_TYPE
> +                 || TYPE_PRECISION (type) == 1)
> +             && def_code == VEC_COND_EXPR)
> +           return NULL_TREE;
>
> this hints at larger fallout you paper over here.  So this effectively
> means we're trying combining (vec1 != vec2) != 0 for example
> and that fails miserably?  If so then the solution is to fix whatever
> does not expect this (valid) GENERIC tree.

  I changed it to the following check in combine_cond_expr_cond:
  /* Do not perform combining it types are not compatible.  */
  if (TREE_CODE (TREE_TYPE (op0)) == VECTOR_TYPE
      && !tree_int_cst_equal (TYPE_SIZE (type), TYPE_SIZE (TREE_TYPE (op0))))
    return NULL_TREE;
>
> +  if (ENABLE_ZERO_TEST_FOR_MASK_STORE == 0)
> +    return;
>
> not sure if I like a param more than a target hook ... :/
  I introduced the param instead of a target hook to make this
transformation user visible, i.e. to switch it on/off
for different targets.
>
> +      /* Create vector comparison with boolean result.  */
> +      vectype = TREE_TYPE (mask);
> +      zero = build_zero_cst (TREE_TYPE (vectype));
> +      zero = build_vector_from_val (vectype, zero);
>
> build_zero_cst (vectype);

Done.
>
> +      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
>
> you can omit the NULL_TREE operands.

  I did not find such definition for it.
>
> +      gcc_assert (vdef && TREE_CODE (vdef) == SSA_NAME);
>
> please omit the assert.

  Done.
>
> +      gimple_set_vdef (last, new_vdef);
>
> do this before you create the PHI.
>
  Done.
> +         /* Put definition statement of stored value in STORE_BB
> +            if possible.  */
> +         arg3 = gimple_call_arg (last, 3);
> +         if (TREE_CODE (arg3) == SSA_NAME && has_single_use (arg3))
> +           {
> ...
>
> is this really necessary?  It looks incomplete to me anyway.  I'd rather have
> a late sink pass if this shows necessary.  Btw,..

 I tried to avoid creation of multiple ne basic blocks for the same
mask and also I don't want to put all semi-hammock guarded by this
mask to separate block to keep it small enough since x86 chips prefer
short branches. Note also that icc does almost the same.
>
> +                it is legal.  */
> +             if (gimple_bb (def_stmt) == bb
> +                 && is_valid_sink (def_stmt, last_store))
>
> with the implementation of is_valid_sink this is effectively
>
>    && (!gimple_vuse (def_stmt)
>           || gimple_vuse (def_stmt) == gimple_vdef (last_store))
I did inlining of correspondent part of "is_valif_sink" to this place.
>
> I still think this "pass" is quite a hack, esp. as it appears as generic
> code in a GIMPLE pass.  And esp. as this hack seems to be needed
> for Haswell only, not Boradwell or Skylake.

This is not truth since for all them this transformation is performed
for skylake and broadwell since both them
belong to HASWELL family.

>
> Thanks,
> Richard.
>

ChangeLog:
2015-11-19  Yuri Rumyantsev  <ysrumyan@gmail.com>

* config/i386/i386.c: Add conditional initialization of
PARAM_ZERO_TEST_FOR_MASK_STORE.
(ix86_expand_branch): Implement vector comparison with boolean result.
* config/i386/i386.h: New macros TARGET_OPTIMIZE_MASK_STORE.
* config/i386/sse.md (define_expand "cbranch<mode>4): Add define-expand
for vector comparion with eq/ne only.
* config/i386/x86-tune.def: New macros X86_TUNE_OPTIMIZE_MASK_STORE.
* fold-const.c (fold_relational_const): Add handling of vector
comparison with boolean result.
* params.def (PARAM_ZERO_TEST_FOR_MASK_STORE): New DEFPARAM.
* params.h (ENABLE_ZERO_TEST_FOR_MASK_STORE): New macros.
* tree-cfg.c (verify_gimple_comparison): Add argument CODE, allow
comparison of vector operands with boolean result for EQ/NE only.
(verify_gimple_assign_binary): Adjust call for verify_gimple_comparison.
(verify_gimple_cond): Likewise.
* tree-ssa-forwprop.c (combine_cond_expr_cond): Do not perform
combining for non-compatible vector types.
* tree-vect-loop.c (is_valid_sink): New function.
(optimize_mask_stores): Likewise.
* tree-vect-stmts.c (vectorizable_mask_load_store): Initialize
has_mask_store field of vect_info.
* tree-vectorizer.c (vectorize_loops): Invoke optimaze_mask_stores for
vectorized loops having masked stores.
* tree-vectorizer.h (loop_vec_info): Add new has_mask_store field and
correspondent macros.
(optimize_mask_stores): Add prototype.
* tree-vrp.c (register_edge_assert_for): Do not handle NAME with vector
type.

gcc/testsuite/ChangeLog:
* gcc.target/i386/avx2-vect-mask-store-move1.c: New test.
* gcc.target/i386/avx2-vect-mask-store-move2.c: Likewise.

[-- Attachment #2: patch.6 --]
[-- Type: application/octet-stream, Size: 19415 bytes --]

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 83749d5..5a515f8 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -5326,6 +5326,12 @@ ix86_option_override_internal (bool main_args_p,
 			 opts->x_param_values,
 			 opts_set->x_param_values);
 
+  if (TARGET_OPTIMIZE_MASK_STORE)
+    maybe_set_param_value (PARAM_ZERO_TEST_FOR_MASK_STORE,
+			   1,
+			   opts->x_param_values,
+			   opts_set->x_param_values);
+
   /* Enable sw prefetching at -O3 for CPUS that prefetching is helpful.  */
   if (opts->x_flag_prefetch_loop_arrays < 0
       && HAVE_prefetch
@@ -21641,6 +21647,38 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
   machine_mode mode = GET_MODE (op0);
   rtx tmp;
 
+  /* Handle special case - vector comparsion with boolean result, transform
+     it using ptest instruction.  */
+  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+    {
+      rtx lhs;
+      rtx flag;
+      machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
+      gcc_assert (code == EQ || code == NE);
+      if (!REG_P (op0))
+	op0 = force_reg (mode, op0);
+      if (!REG_P (op1))
+	op1 = force_reg (mode, op1);
+      /* Generate subtraction since we can't check that one operand is
+	 zero vector.  */
+	  lhs = gen_reg_rtx (mode);
+	  emit_insn (gen_rtx_SET (lhs,
+				  gen_rtx_MINUS (mode, op0, op1)));
+      lhs = gen_rtx_SUBREG (p_mode, lhs, 0);
+      tmp = gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
+			  gen_rtx_UNSPEC (CCmode,
+					  gen_rtvec (2, lhs, lhs),
+					  UNSPEC_PTEST));
+      emit_insn (tmp);
+      flag = gen_rtx_REG (CCZmode, FLAGS_REG);
+      tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
+      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
+				  gen_rtx_LABEL_REF (VOIDmode, label),
+				  pc_rtx);
+      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
+      return;
+    }
+
   switch (mode)
     {
     case SFmode:
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index ceda472..5133216 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -496,6 +496,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
     ix86_tune_features[X86_TUNE_ADJUST_UNROLL]
 #define TARGET_AVOID_FALSE_DEP_FOR_BMI \
 	ix86_tune_features[X86_TUNE_AVOID_FALSE_DEP_FOR_BMI]
+#define TARGET_OPTIMIZE_MASK_STORE \
+	ix86_tune_features[X86_TUNE_OPTIMIZE_MASK_STORE]
 
 /* Feature tests against the various architecture variations.  */
 enum ix86_arch_indices {
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index e7b517a..e149cb3 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -18340,6 +18340,25 @@
 	  (match_operand:<avx512fmaskmode> 2 "register_operand")))]
   "TARGET_AVX512BW")
 
+(define_expand "cbranch<mode>4"
+  [(set (reg:CC FLAGS_REG)
+	(compare:CC (match_operand:V48_AVX2 1 "nonimmediate_operand")
+		    (match_operand:V48_AVX2 2 "register_operand")))
+   (set (pc) (if_then_else
+	       (match_operator 0 "bt_comparison_operator"
+		[(reg:CC FLAGS_REG) (const_int 0)])
+	       (label_ref (match_operand 3))
+	       (pc)))]
+  "TARGET_AVX2"
+{
+  if (MEM_P (operands[1]) && MEM_P (operands[2]))
+    operands[1] = force_reg (<MODE>mode, operands[1]);
+  ix86_expand_branch (GET_CODE (operands[0]),
+		      operands[1], operands[2], operands[3]);
+  DONE;
+})
+
+
 (define_insn_and_split "avx_<castmode><avxsizesuffix>_<castmode>"
   [(set (match_operand:AVX256MODE2P 0 "nonimmediate_operand" "=x,m")
 	(unspec:AVX256MODE2P
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index b2d3921..c5e5b63 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -527,6 +527,10 @@ DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode",
 DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi",
 	  m_SANDYBRIDGE | m_HASWELL | m_GENERIC)
 
+/* X86_TUNE_OPTMIZE_MASK_STORE: Perform masked store if is its mask is not
+   equal to zero.  */
+DEF_TUNE (X86_TUNE_OPTIMIZE_MASK_STORE, "optimize_mask_store", m_HASWELL)
+
 /*****************************************************************************/
 /* This never worked well before.                                            */
 /*****************************************************************************/
diff --git a/gcc/fold-const.c b/gcc/fold-const.c
index 698062e..9988be1 100644
--- a/gcc/fold-const.c
+++ b/gcc/fold-const.c
@@ -13888,6 +13888,25 @@ fold_relational_const (enum tree_code code, tree type, tree op0, tree op1)
 
   if (TREE_CODE (op0) == VECTOR_CST && TREE_CODE (op1) == VECTOR_CST)
     {
+      if (INTEGRAL_TYPE_P (type)
+	  && (TREE_CODE (type) == BOOLEAN_TYPE
+	      || TYPE_PRECISION (type) == 1))
+	{
+	  /* Have vector comparison with scalar boolean result.  */
+	  bool result = true;
+	  gcc_assert (code == EQ_EXPR || code == NE_EXPR);
+	  gcc_assert (VECTOR_CST_NELTS (op0) == VECTOR_CST_NELTS (op1));
+	  for (unsigned i = 0; i < VECTOR_CST_NELTS (op0); i++)
+	    {
+	      tree elem0 = VECTOR_CST_ELT (op0, i);
+	      tree elem1 = VECTOR_CST_ELT (op1, i);
+	      tree tmp = fold_relational_const (code, type, elem0, elem1);
+	      result &= integer_onep (tmp);
+	    }
+	  if (code == NE_EXPR)
+	    result = !result;
+	  return constant_boolean_node (result, type);
+	}
       unsigned count = VECTOR_CST_NELTS (op0);
       tree *elts =  XALLOCAVEC (tree, count);
       gcc_assert (VECTOR_CST_NELTS (op1) == count
diff --git a/gcc/params.def b/gcc/params.def
index 41fd8a8..d228477 100644
--- a/gcc/params.def
+++ b/gcc/params.def
@@ -1043,6 +1043,12 @@ DEFPARAM (PARAM_MAX_STORES_TO_SINK,
           "Maximum number of conditional store pairs that can be sunk.",
           2, 0, 0)
 
+/* Enable inserion test on zero mask for masked stores if non-zero.  */
+DEFPARAM (PARAM_ZERO_TEST_FOR_MASK_STORE,
+	  "zero-test-for-mask-store",
+	  "Enable insertion of test on zero mask for masked stores",
+	  0, 0, 1)
+
 /* Override CASE_VALUES_THRESHOLD of when to switch from doing switch
    statements via if statements to using a table jump operation.  If the value
    is 0, the default CASE_VALUES_THRESHOLD will be used.  */
diff --git a/gcc/params.h b/gcc/params.h
index 1090d00..2037c73 100644
--- a/gcc/params.h
+++ b/gcc/params.h
@@ -221,6 +221,8 @@ extern void init_param_values (int *params);
   PARAM_VALUE (PARAM_MIN_NONDEBUG_INSN_UID)
 #define MAX_STORES_TO_SINK \
   PARAM_VALUE (PARAM_MAX_STORES_TO_SINK)
+#define ENABLE_ZERO_TEST_FOR_MASK_STORE \
+  PARAM_VALUE (PARAM_ZERO_TEST_FOR_MASK_STORE)
 #define ALLOW_LOAD_DATA_RACES \
   PARAM_VALUE (PARAM_ALLOW_LOAD_DATA_RACES)
 #define ALLOW_STORE_DATA_RACES \
diff --git a/gcc/testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c b/gcc/testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c
new file mode 100755
index 0000000..60bb841
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-march=core-avx2 -O3 -fdump-tree-vect-details" } */
+
+extern int *p1, *p2, *p3;
+int c[256];
+void foo (int n)
+{
+  int i;
+  for (i = 0; i < n; i++)
+    if (c[i])
+      {
+	p1[i] += 1;
+	p2[i] = p3[i] +2;
+      }
+}
+
+/* { dg-final { scan-tree-dump-times "Move MASK_STORE to new bb" 2 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx2-vect-mask-store-move2.c b/gcc/testsuite/gcc.target/i386/avx2-vect-mask-store-move2.c
new file mode 100755
index 0000000..a383b99
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-vect-mask-store-move2.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-march=core-avx2 -O3 -fdump-tree-vect-details" } */
+
+extern int *p1, *p2, *p3;
+int c[256];
+/* All masked load/stores must be put into one new bb.  */
+
+void foo (int n)
+{
+  int i;
+  for (i = 0; i < n; i++)
+    if (c[i])
+      {
+	p1[i] -= 1;
+	p2[i] = p3[i];
+      }
+}
+
+/* { dg-final { scan-tree-dump-times "Move MASK_STORE to new bb" 1 "vect" } } */
diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c
old mode 100644
new mode 100755
index 0c624aa..cfde379
--- a/gcc/tree-cfg.c
+++ b/gcc/tree-cfg.c
@@ -3408,10 +3408,10 @@ verify_gimple_call (gcall *stmt)
 }
 
 /* Verifies the gimple comparison with the result type TYPE and
-   the operands OP0 and OP1.  */
+   the operands OP0 and OP1, comparison code is CODE.  */
 
 static bool
-verify_gimple_comparison (tree type, tree op0, tree op1)
+verify_gimple_comparison (tree type, tree op0, tree op1, enum tree_code code)
 {
   tree op0_type = TREE_TYPE (op0);
   tree op1_type = TREE_TYPE (op1);
@@ -3448,10 +3448,16 @@ verify_gimple_comparison (tree type, tree op0, tree op1)
       if (TREE_CODE (op0_type) == VECTOR_TYPE
 	  || TREE_CODE (op1_type) == VECTOR_TYPE)
         {
-          error ("vector comparison returning a boolean");
-          debug_generic_expr (op0_type);
-          debug_generic_expr (op1_type);
-          return true;
+	  /* Allow vector comparison returning boolean if operand types
+	     are equal and CODE is EQ/NE.  */
+	  if ((code != EQ_EXPR && code != NE_EXPR)
+	      || !VECTOR_BOOLEAN_TYPE_P (op0_type))
+	    {
+	      error ("type mismatch for vector comparison returning a boolean");
+	      debug_generic_expr (op0_type);
+	      debug_generic_expr (op1_type);
+	      return true;
+	    }
         }
     }
   /* Or a boolean vector type with the same element count
@@ -3832,7 +3838,7 @@ verify_gimple_assign_binary (gassign *stmt)
     case LTGT_EXPR:
       /* Comparisons are also binary, but the result type is not
 	 connected to the operand types.  */
-      return verify_gimple_comparison (lhs_type, rhs1, rhs2);
+      return verify_gimple_comparison (lhs_type, rhs1, rhs2, rhs_code);
 
     case WIDEN_MULT_EXPR:
       if (TREE_CODE (lhs_type) != INTEGER_TYPE)
@@ -4541,7 +4547,8 @@ verify_gimple_cond (gcond *stmt)
 
   return verify_gimple_comparison (boolean_type_node,
 				   gimple_cond_lhs (stmt),
-				   gimple_cond_rhs (stmt));
+				   gimple_cond_rhs (stmt),
+				   gimple_cond_code (stmt));
 }
 
 /* Verify the GIMPLE statement STMT.  Returns true if there is an
diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c
index b82ae3c..73ee3be 100644
--- a/gcc/tree-ssa-forwprop.c
+++ b/gcc/tree-ssa-forwprop.c
@@ -373,6 +373,11 @@ combine_cond_expr_cond (gimple *stmt, enum tree_code code, tree type,
 
   gcc_assert (TREE_CODE_CLASS (code) == tcc_comparison);
 
+  /* Do not perform combining it types are not compatible.  */
+  if (TREE_CODE (TREE_TYPE (op0)) == VECTOR_TYPE
+      && !tree_int_cst_equal (TYPE_SIZE (type), TYPE_SIZE (TREE_TYPE (op0))))
+    return NULL_TREE;
+
   fold_defer_overflow_warnings ();
   t = fold_binary_loc (gimple_location (stmt), code, type, op0, op1);
   if (!t)
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index c3dbfd3..4a247c8 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -6826,3 +6826,169 @@ vect_transform_loop (loop_vec_info loop_vinfo)
       dump_printf (MSG_NOTE, "\n");
     }
 }
+
+/* Helper for optimize_mask_stores: returns true if STMT sinking to end
+   of BB is valid and false otherwise.  */
+
+static bool
+is_valid_sink (gimple *stmt, gimple *last_store)
+{
+  tree vdef;
+  imm_use_iterator imm_it;
+  use_operand_p use_p;
+  basic_block bb = gimple_bb (stmt);
+
+  if (is_gimple_call (stmt)
+      && !gimple_call_internal_p (stmt))
+    /* Do not consider non-internal call as valid to sink.  */
+    return false;
+
+  if ((vdef = gimple_vdef (stmt)))
+    {
+      /* Check that ther are no store vuses in current bb.  */
+      FOR_EACH_IMM_USE_FAST (use_p, imm_it, vdef)
+	if (gimple_bb (USE_STMT (use_p)) == bb)
+	  return false;
+      return true;
+    }
+  else if (gimple_vuse (stmt) == NULL_TREE)
+    return true;
+  else if (gimple_vuse (stmt) == gimple_vuse (last_store))
+    return true;
+  return false;
+}
+
+/* The code below is trying to perform simple optimization - do not execute
+   masked store statement if its mask is zero vector since loads that follow
+   a masked store can be blocked.  It puts all masked stores with the same
+   mask-vector into the new bb with a check on zero mask.  */
+
+void
+optimize_mask_stores (struct loop *loop)
+{
+  basic_block bb = loop->header;
+  gimple_stmt_iterator gsi;
+  gimple *stmt;
+  auto_vec<gimple *> worklist;
+
+  if (ENABLE_ZERO_TEST_FOR_MASK_STORE == 0)
+    return;
+
+  /* Pick up all masked stores in loop if any.  */
+  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      stmt = gsi_stmt (gsi);
+      if (is_gimple_call (stmt)
+	  && gimple_call_internal_p (stmt)
+	  && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
+	worklist.safe_push (stmt);
+    }
+
+  if (worklist.is_empty ())
+    return;
+
+  /* Loop has masked stores.  */
+  while (!worklist.is_empty ())
+    {
+      gimple *last, *def_stmt, *last_store;
+      edge e, efalse;
+      tree mask;
+      basic_block store_bb, join_bb;
+      gimple_stmt_iterator gsi_to;
+      tree arg3;
+      tree vdef, new_vdef;
+      gphi *phi;
+      bool first_dump;
+      tree vectype;
+      tree zero;
+
+      last = worklist.pop ();
+      mask = gimple_call_arg (last, 2);
+      /* Create new bb.  */
+      e = split_block (bb, last);
+      join_bb = e->dest;
+      store_bb = create_empty_bb (bb);
+      add_bb_to_loop (store_bb, loop);
+      e->flags = EDGE_TRUE_VALUE;
+      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
+      /* Put STORE_BB to likely part.  */
+      efalse->probability = PROB_UNLIKELY;
+      store_bb->frequency = PROB_ALWAYS - EDGE_FREQUENCY (efalse);
+      make_edge (store_bb, join_bb, EDGE_FALLTHRU);
+      if (dom_info_available_p (CDI_DOMINATORS))
+	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
+      /* Create vector comparison with boolean result.  */
+      vectype = TREE_TYPE (mask);
+      zero = build_zero_cst (vectype);
+      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
+      gsi = gsi_last_bb (bb);
+      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
+      /* Create new PHI node for vdef of the last masked store:
+	 .MEM_2 = VDEF <.MEM_1>
+	 will be converted to
+	 .MEM.3 = VDEF <.MEM_1>
+	 and new PHI node will be created in join bb
+	 .MEM_2 = PHI <.MEM_1, .MEM_3>
+      */
+      vdef = gimple_vdef (last);
+      new_vdef = make_ssa_name (gimple_vop (cfun), last);
+      gimple_set_vdef (last, new_vdef);
+      phi = create_phi_node (vdef, join_bb);
+      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
+      first_dump = true;
+
+      /* Put all masked stores with the same mask to STORE_BB if possible.  */
+      while (true)
+	{
+	  /* Move masked store to STORE_BB.  */
+	  last_store = last;
+	  gsi = gsi_for_stmt (last);
+	  gsi_to = gsi_start_bb (store_bb);
+	  gsi_move_before (&gsi, &gsi_to);
+	  update_stmt (last);
+	  if (dump_enabled_p ())
+	    {
+	      /* Issue different messages depending on FIRST_DUMP.  */
+	      if (first_dump)
+		{
+		  dump_printf (MSG_NOTE, "Move MASK_STORE to new bb#%d\n",
+			       store_bb->index);
+		  first_dump = false;
+		}
+	      else
+		dump_printf (MSG_NOTE, "Move MASK_STORE to created bb\n");
+	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
+	    }
+	  /* Put definition statement of stored value in STORE_BB
+	     if possible.  */
+	  arg3 = gimple_call_arg (last, 3);
+	  if (TREE_CODE (arg3) == SSA_NAME && has_single_use (arg3))
+	    {
+	      def_stmt = SSA_NAME_DEF_STMT (arg3);
+	      /* Move def_stmt to STORE_BB if it is in the same BB and
+		 it is valid sink.  */
+	      if (gimple_bb (def_stmt) == bb
+		  && (!gimple_vuse (def_stmt)
+		      || gimple_vuse (def_stmt) == gimple_vuse (last_store)))
+		{
+		  if (dump_enabled_p ())
+		    {
+		      dump_printf (MSG_NOTE, "Move stmt to created bb\n");
+		      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
+		    }
+		  gsi = gsi_for_stmt (def_stmt);
+		  gsi_to = gsi_start_bb (store_bb);
+		  gsi_move_before (&gsi, &gsi_to);
+		  update_stmt (def_stmt);
+		}
+	    }
+	    /* Put other masked stores with the same mask to STORE_BB.  */
+	    if (worklist.is_empty ()
+		|| gimple_call_arg (worklist.last (), 2) != mask
+		|| !is_valid_sink (worklist.last (), last_store))
+	      break;
+	    last = worklist.pop ();
+	}
+      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
+    }
+}
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 4bb58b9..55b1956 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -2000,6 +2000,7 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
     {
       tree vec_rhs = NULL_TREE, vec_mask = NULL_TREE;
       prev_stmt_info = NULL;
+      LOOP_VINFO_HAS_MASK_STORE (loop_vinfo) = true;
       for (i = 0; i < ncopies; i++)
 	{
 	  unsigned align, misalign;
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index b721c56..6732616 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -598,12 +598,18 @@ vectorize_loops (void)
   for (i = 1; i < vect_loops_num; i++)
     {
       loop_vec_info loop_vinfo;
+      bool has_mask_store;
 
       loop = get_loop (cfun, i);
       if (!loop)
 	continue;
       loop_vinfo = (loop_vec_info) loop->aux;
+      has_mask_store = false;
+      if (loop_vinfo)
+	has_mask_store = LOOP_VINFO_HAS_MASK_STORE (loop_vinfo);
       destroy_loop_vec_info (loop_vinfo, true);
+      if (has_mask_store)
+	optimize_mask_stores (loop);
       loop->aux = NULL;
     }
 
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 7867c26..040051c 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -330,6 +330,9 @@ typedef struct _loop_vec_info : public vec_info {
      loop version without if-conversion.  */
   struct loop *scalar_loop;
 
+  /* Mark loops having masked stores.  */
+  bool has_mask_store;
+
 } *loop_vec_info;
 
 /* Access Functions.  */
@@ -367,6 +370,7 @@ typedef struct _loop_vec_info : public vec_info {
 #define LOOP_VINFO_SCALAR_LOOP(L)	   (L)->scalar_loop
 #define LOOP_VINFO_SCALAR_ITERATION_COST(L) (L)->scalar_cost_vec
 #define LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST(L) (L)->single_scalar_iteration_cost
+#define LOOP_VINFO_HAS_MASK_STORE(L)      (L)->has_mask_store
 
 #define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \
   ((L)->may_misalign_stmts.length () > 0)
@@ -1001,6 +1005,7 @@ extern void vect_get_vec_defs (tree, tree, gimple *, vec<tree> *,
 			       vec<tree> *, slp_tree, int);
 extern tree vect_gen_perm_mask_any (tree, const unsigned char *);
 extern tree vect_gen_perm_mask_checked (tree, const unsigned char *);
+extern void optimize_mask_stores (struct loop *);
 
 /* In tree-vect-data-refs.c.  */
 extern bool vect_can_force_dr_alignment_p (const_tree, unsigned int);
diff --git a/gcc/tree-vrp.c b/gcc/tree-vrp.c
index e67048e..1605520c 100644
--- a/gcc/tree-vrp.c
+++ b/gcc/tree-vrp.c
@@ -5760,6 +5760,12 @@ register_edge_assert_for (tree name, edge e, gimple_stmt_iterator si,
 						&comp_code, &val))
     return;
 
+  /* Use of vector comparison in gcond is very restricted and used to check
+     that the mask in masked store is zero, so assert for such comparison
+     is not implemented yet.  */
+  if (TREE_CODE (TREE_TYPE (name)) == VECTOR_TYPE)
+    return;
+
   /* Register ASSERT_EXPRs for name.  */
   register_edge_assert_for_2 (name, e, si, cond_code, cond_op0,
 			      cond_op1, is_else_edge);

^ permalink raw reply	[flat|nested] 33+ messages in thread

end of thread, other threads:[~2015-11-19 15:20 UTC | newest]

Thread overview: 33+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-05-06 14:04 [PATCH] Simple optimization for MASK_STORE Yuri Rumyantsev
2015-05-08  9:27 ` Richard Biener
2015-05-08 18:43   ` Jeff Law
2015-05-08 19:16     ` Richard Biener
2015-05-20 14:10   ` Yuri Rumyantsev
2015-05-29 14:28     ` Yuri Rumyantsev
2015-06-09 12:15     ` Richard Biener
2015-06-18 15:41       ` Yuri Rumyantsev
2015-07-07 13:55         ` Yuri Rumyantsev
2015-07-10  5:51         ` Jeff Law
2015-07-20 15:26           ` Yuri Rumyantsev
2015-07-21 13:59             ` Richard Biener
2015-07-23 20:32             ` Jeff Law
2015-07-24  9:04               ` Yuri Rumyantsev
2015-07-24  9:24               ` Richard Biener
2015-07-24 19:26                 ` Jeff Law
2015-07-27  9:04                   ` Richard Biener
2015-08-06 11:07                     ` Yuri Rumyantsev
2015-08-13 11:40                       ` Yuri Rumyantsev
2015-08-13 11:46                         ` Richard Biener
2015-11-02 15:24                           ` Yuri Rumyantsev
2015-11-05 15:49                             ` Yuri Rumyantsev
2015-11-06 12:56                             ` Richard Biener
2015-11-06 13:29                               ` Yuri Rumyantsev
2015-11-10 12:33                                 ` Richard Biener
2015-11-10 12:48                                   ` Ilya Enkovich
2015-11-10 14:46                                     ` Richard Biener
2015-11-10 14:56                                       ` Ilya Enkovich
2015-11-10 17:02                                         ` Mike Stump
2015-11-11  9:18                                         ` Richard Biener
2015-11-11 13:13                                           ` Yuri Rumyantsev
2015-11-12 13:59                                             ` Richard Biener
2015-11-19 15:20                                               ` Yuri Rumyantsev

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).