public inbox for gcc-patches@gcc.gnu.org
* [RFC] Combine vectorized loops with its scalar remainder.
@ 2015-10-28 10:57 Yuri Rumyantsev
  2015-11-03 10:08 ` Richard Henderson
  2015-11-03 11:47 ` Richard Biener
  0 siblings, 2 replies; 17+ messages in thread
From: Yuri Rumyantsev @ 2015-10-28 10:57 UTC (permalink / raw)
  To: gcc-patches, Jeff Law, Richard Biener, Igor Zamyatin,
	Илья
	Энкович

[-- Attachment #1: Type: text/plain, Size: 1663 bytes --]

Hi All,

Here is a preliminary patch to combine a vectorized loop with its scalar
remainder, a draft of which was proposed by Kirill Yukhin a month ago:
https://gcc.gnu.org/ml/gcc-patches/2015-09/msg01435.html
It was tested with the '-mavx2' option on a Haswell processor.
The main goal is to improve the performance of vectorized loops for AVX512.
Note that only loads/stores and simple reductions with binary operations are
converted to masked form, e.g. load --> masked load, and a reduction like
r1 = f <op> r2 --> t = f <op> r2; r1 = m ? t : r2. Masking is performed by
creating a new vector induction variable initialized with the consecutive
values 0 .. VF-1, a new constant vector containing the number of iterations,
and a comparison of the two whose result is used as the mask vector.
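
To make the intended semantics concrete, here is a minimal scalar C sketch
(illustrative only, not taken from the patch; the function name masked_sum,
the arguments a and n, and the choice VF == 4 are assumptions) of what the
combined loop computes for a simple sum reduction:

    #include <stddef.h>

    /* Scalar model of the masked vector loop: it now runs ceil (n / VF)
       iterations and lanes with index i + j >= n are masked out instead of
       being handled by a separate scalar remainder loop.  */
    float
    masked_sum (const float *a, size_t n)
    {
      enum { VF = 4 };
      float acc[VF] = { 0.0f, 0.0f, 0.0f, 0.0f };
      for (size_t i = 0; i < n; i += VF)
        for (size_t j = 0; j < VF; j++)
          {
            int mask = (i + j) < n;                      /* lane mask from the vector IV */
            float t = acc[j] + (mask ? a[i + j] : 0.0f); /* models the masked load */
            acc[j] = mask ? t : acc[j];                  /* models the VEC_COND on the reduction */
          }
      return acc[0] + acc[1] + acc[2] + acc[3];
    }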
This implementation has several restrictions:

1. Multiple types are not supported.
2. SLP is not supported.
3. Gathers/scatters are also not supported.
4. Vectorization of loops with a low trip count is not implemented yet since
   it requires additional design and tuning.

We are planning to eliminate all these restrictions in GCC 7.

This patch will be extended with a cost model to reject unprofitable
transformations: the cost of the new vector body will be evaluated through a
new target hook which estimates the cost of masking different vector
statements, and a new threshold parameter will be introduced which determines
the permissible cost increase; the threshold will be tuned on an AVX512
machine.
This patch is not yet in sync with Ilya Enkovich's changes for AVX512 masked
load/store support, since only part of them is in the trunk compiler.
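
Purely as an illustration of the decision such a cost model is expected to
make (the function and parameter names masking_is_profitable,
vector_body_cost, masked_body_cost and cost_increase_threshold are invented
here and do not correspond to any existing GCC hook or --param):

    #include <stdbool.h>

    /* Accept masking the remainder into the vector body only if it does not
       grow the body cost by more than the allowed percentage.  */
    static bool
    masking_is_profitable (unsigned vector_body_cost, unsigned masked_body_cost,
                           unsigned cost_increase_threshold /* percent */)
    {
      return masked_body_cost * 100
             <= vector_body_cost * (100 + cost_increase_threshold);
    }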

Any comments will be appreciated.

[-- Attachment #2: remainder.patch.1 --]
[-- Type: application/octet-stream, Size: 27229 bytes --]

diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index cc51597..b85bfc5 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -938,6 +938,7 @@ new_loop_vec_info (struct loop *loop)
   LOOP_VINFO_NITERSM1 (res) = NULL;
   LOOP_VINFO_NITERS (res) = NULL;
   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
+  LOOP_VINFO_NITERS_VECT_LOOP (res) = NULL;
   LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
   LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
@@ -6232,9 +6233,13 @@ vect_transform_loop (loop_vec_info loop_vinfo)
     {
       tree ratio_mult_vf;
       if (!ni_name)
-	ni_name = vect_build_loop_niters (loop_vinfo);
+	{
+	  ni_name = vect_build_loop_niters (loop_vinfo);
+	  LOOP_VINFO_NITERS (loop_vinfo) = ni_name;
+	}
       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
 				       &ratio);
+      LOOP_VINFO_NITERS_VECT_LOOP (loop_vinfo) = ratio_mult_vf;
       vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
 				      th, check_profitability);
     }
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 82fca0c..02e1359 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -52,6 +52,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-vectorizer.h"
 #include "cgraph.h"
 #include "builtins.h"
+#include "tree-ssa-address.h"
+#include "tree-ssa-loop-ivopts.h"
 
 /* For lang_hooks.types.type_for_mode.  */
 #include "langhooks.h"
@@ -8627,3 +8629,714 @@ supportable_narrowing_operation (enum tree_code code,
   interm_types->release ();
   return false;
 }
+
+/* Fix the trip count of the vectorized loop so that it also covers
+   the loop remainder.  */
+
+static void
+fix_vec_loop_trip_count (loop_vec_info loop_vinfo)
+{
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  tree niters;
+  tree ratio_mult_vf = LOOP_VINFO_NITERS_VECT_LOOP (loop_vinfo);
+  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+  gimple *stmt;
+  gimple_stmt_iterator gsi;
+
+  niters = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+	   ? LOOP_VINFO_NITERS (loop_vinfo)
+	   : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo);
+
+  if (TREE_CODE (ratio_mult_vf) == SSA_NAME)
+    {
+      gimple *def = SSA_NAME_DEF_STMT (ratio_mult_vf);
+      tree bnd, lhs, tmp, log_vf;
+      gimple *def_bnd;
+      gimple *new_def_bnd;
+      gcc_assert (gimple_code (def) == GIMPLE_ASSIGN);
+      gcc_assert (gimple_assign_rhs_code (def) == LSHIFT_EXPR);
+      bnd = gimple_assign_rhs1 (def);
+      gcc_assert (TREE_CODE (bnd) == SSA_NAME);
+      gcc_assert (TREE_CODE (gimple_assign_rhs2 (def)) == INTEGER_CST);
+      def_bnd = SSA_NAME_DEF_STMT (bnd);
+      gsi = gsi_for_stmt (def_bnd);
+      /* Create the statement t = niters + (vf - 1).  */
+      lhs = create_tmp_var (TREE_TYPE (bnd));
+      stmt = gimple_build_assign (lhs, PLUS_EXPR, niters,
+				  build_int_cst (TREE_TYPE (bnd), vf - 1));
+      tmp = make_ssa_name (lhs, stmt);
+      gimple_assign_set_lhs (stmt, tmp);
+      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
+      /* Replace BND definition with bnd = t >> log2 (vf).  */
+      log_vf = build_int_cst (TREE_TYPE (tmp), exact_log2 (vf));
+      new_def_bnd = gimple_build_assign (bnd, RSHIFT_EXPR, tmp, log_vf);
+      gsi_replace (&gsi, new_def_bnd, false);
+    }
+  else
+    {
+      tree op_const;
+      unsigned n;
+      unsigned logvf = exact_log2 (vf);
+      gcond *cond;
+      gcc_assert (TREE_CODE (ratio_mult_vf) == INTEGER_CST);
+      gcc_assert (TREE_CODE (niters) == INTEGER_CST);
+      /* Change value of bnd in GIMPLE_COND.  */
+      gcc_assert (loop->num_nodes == 2);
+      stmt = last_stmt (loop->header);
+      gcc_assert (gimple_code (stmt) == GIMPLE_COND);
+      n = tree_to_uhwi (niters);
+      n = ((n + (vf - 1)) >> logvf) << logvf;
+      op_const = build_int_cst (TREE_TYPE (gimple_cond_lhs (stmt)), n);
+      gcc_assert (TREE_CODE (gimple_cond_rhs (stmt)) == INTEGER_CST);
+      cond = dyn_cast <gcond *> (stmt);
+      gimple_cond_set_rhs (cond, op_const);
+    }
+}
+
+/* Make the scalar remainder unreachable from the vectorized loop.  */
+
+static void
+isolate_remainder (loop_vec_info loop_vinfo)
+{
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  edge e;
+  basic_block bb = loop->header;
+  gimple *last;
+  gcond *cond;
+
+  e = EDGE_SUCC ((bb), 0);
+  if (flow_bb_inside_loop_p (loop, e->dest))
+    e = EDGE_SUCC ((bb), 1);
+  bb = e->dest;
+  gcc_assert (!flow_bb_inside_loop_p (loop, bb));
+  last = last_stmt (bb);
+  gcc_assert (gimple_code (last) == GIMPLE_COND);
+  cond = as_a <gcond *> (last);
+  /* Assume that target of false edge is scalar loop preheader.  */
+  gimple_cond_make_true (cond);
+}
+
+/* Generate the induction vector which will be used for mask evaluation.  */
+
+static tree
+gen_vec_induction (loop_vec_info loop_vinfo, unsigned elem_size, unsigned size)
+{
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  edge pe = loop_preheader_edge (loop);
+  vec<constructor_elt, va_gc> *v;
+  gimple *stmt;
+  gimple_stmt_iterator gsi;
+  gphi *induction_phi;
+  tree iv_type, vectype;
+  tree lhs, rhs, iv;
+  unsigned n;
+  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+  int i;
+  tree new_vec, new_var;
+  tree vec_init, vec_step, vec_dest, vec_def;
+  tree val;
+  tree induc_def;
+  basic_block new_bb;
+  machine_mode mode;
+
+  /* Find control iv.  */
+  stmt = last_stmt (loop->header);
+  gcc_assert (gimple_code (stmt) == GIMPLE_COND);
+  lhs = gimple_cond_lhs (stmt);
+  rhs = gimple_cond_rhs (stmt);
+  /* Assume any operand order.  */
+  if (TREE_CODE (lhs) != SSA_NAME)
+    iv = rhs;
+  else
+    {
+      gimple *def_stmt = SSA_NAME_DEF_STMT (lhs);
+      if (gimple_bb (def_stmt) != loop->header)
+	iv = rhs;
+      else
+	iv = lhs;
+    }
+  gcc_assert (TREE_CODE (iv) == SSA_NAME);
+  /* Determine type to build vector index aka induction vector.  */
+  n = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (iv)));
+  if (n > elem_size)
+    /* Multiple types are not yet supported.  */
+    return NULL_TREE;
+  if (n == elem_size && !TYPE_UNSIGNED (TREE_TYPE (iv)))
+    iv_type = TREE_TYPE (iv);
+  else
+    iv_type = build_nonstandard_integer_type (elem_size, 0);
+  vectype = get_vectype_for_scalar_type_and_size (iv_type, size);
+  mode = TYPE_MODE (vectype);
+  /* Check that vector comparison for IV_TYPE is supported.  */
+  if (get_vcond_icode (mode, mode, 0) == CODE_FOR_nothing)
+    {
+      if (dump_enabled_p ())
+	{
+	  dump_printf_loc (MSG_NOTE, vect_location,
+			   "type is not supported for vector compare!\n");
+	  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
+	}
+      return NULL_TREE;
+    }
+
+  /* Build induction initialization and insert it to loop preheader.  */
+  vec_alloc (v, vf);
+  for (i = 0; i < vf; i++)
+    {
+      tree elem;
+      elem = build_int_cst (iv_type, i);
+      CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elem);
+    }
+  new_vec = build_vector_from_ctor (vectype, v);
+  new_var = vect_get_new_vect_var (vectype, vect_simple_var, "cst_");
+  stmt = gimple_build_assign (new_var, new_vec);
+  vec_init = make_ssa_name (new_var, stmt);
+  gimple_assign_set_lhs (stmt, vec_init);
+  new_bb = gsi_insert_on_edge_immediate (pe, stmt);
+  gcc_assert (!new_bb);
+
+  /* Create the vector step consisting of VF.  */
+  val = build_int_cst (iv_type, vf);
+  new_vec = build_vector_from_val (vectype, val);
+  new_var = vect_get_new_vect_var (vectype, vect_simple_var, "cst_");
+  stmt = gimple_build_assign (new_var, new_vec);
+  vec_step = make_ssa_name (new_var, stmt);
+  gimple_assign_set_lhs (stmt, vec_step);
+  new_bb = gsi_insert_on_edge_immediate (pe, stmt);
+  gcc_assert (!new_bb);
+
+  /* Create the induction-phi.  */
+  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
+  induction_phi = create_phi_node (vec_dest, loop->header);
+  induc_def = PHI_RESULT (induction_phi);
+
+  /* Create vector iv increment inside loop.  */
+  gsi = gsi_after_labels (loop->header);
+  stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
+  vec_def = make_ssa_name (vec_dest, stmt);
+  gimple_assign_set_lhs (stmt, vec_def);
+  gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
+
+  /* Set the arguments of phi node.  */
+  add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
+  add_phi_arg (induction_phi, vec_def, loop_latch_edge (loop),
+	       UNKNOWN_LOCATION);
+  return induc_def;
+}
+
+/* Produce the mask vector which will be used for masking.  */
+
+static tree
+gen_mask_for_remainder (loop_vec_info loop_vinfo, tree vec_index, unsigned size)
+{
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  tree new_vec, new_var;
+  tree niters, vec_niters, new_niters, vec_res, vec_mask;
+  gimple *stmt;
+  basic_block new_bb;
+  edge pe = loop_preheader_edge (loop);
+  gimple_stmt_iterator gsi;
+  tree vectype = TREE_TYPE (vec_index);
+  tree s_vectype;
+
+  gsi = gsi_after_labels (loop->header);
+  niters = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+	   ? LOOP_VINFO_NITERS (loop_vinfo)
+	   : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo);
+
+  /* Create vector for comparison consisting of niters.  */
+  if (!types_compatible_p (TREE_TYPE (niters), TREE_TYPE (vectype)))
+    {
+      tree new_type = TREE_TYPE (vectype);
+      enum tree_code cop;
+      cop = tree_to_uhwi (TYPE_SIZE (new_type)) ==
+	    tree_to_uhwi (TYPE_SIZE (TREE_TYPE (niters)))
+	    ? NOP_EXPR : CONVERT_EXPR;
+      new_niters = make_ssa_name (new_type);
+      stmt = gimple_build_assign (new_niters, cop, niters);
+      new_bb = gsi_insert_on_edge_immediate (pe, stmt);
+      gcc_assert (!new_bb);
+    }
+  else
+    new_niters = niters;
+  new_vec = build_vector_from_val (vectype, new_niters);
+  new_var = vect_get_new_vect_var (vectype, vect_simple_var, "cst_");
+  stmt = gimple_build_assign (new_var, new_vec);
+  vec_niters = make_ssa_name (new_var, stmt);
+  gimple_assign_set_lhs (stmt, vec_niters);
+  new_bb = gsi_insert_on_edge_immediate (pe, stmt);
+  gcc_assert (!new_bb);
+  /* Create the vector comparison whose result will be used as the mask
+     for loads/stores.  */
+  if (TYPE_UNSIGNED (vectype))
+    {
+      /* Create signed vectype.  */
+      tree stype = TREE_TYPE (vectype);
+      unsigned sz = tree_to_uhwi (TYPE_SIZE (stype));
+      tree new_type = build_nonstandard_integer_type (sz, 0);
+      s_vectype = get_vectype_for_scalar_type_and_size (new_type, size);
+      gcc_assert (s_vectype);
+    }
+  else
+    s_vectype = vectype;
+  vec_mask = vect_get_new_vect_var (s_vectype, vect_simple_var, "vec_mask_");
+  stmt = gimple_build_assign (vec_mask, LT_EXPR, vec_index, vec_niters);
+  vec_res = make_ssa_name (vec_mask, stmt);
+  gimple_assign_set_lhs (stmt, vec_res);
+  gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
+  return vec_res;
+}
+
+/* Convert each load to a masked load.  */
+
+static void
+convert_loads_to_masked (vec<gimple *> *loads, tree mask)
+{
+  gimple *stmt, *new_stmt;
+  tree addr, ref;
+  gimple_stmt_iterator gsi;
+
+  while (loads->length () > 0)
+    {
+      tree lhs, ptr;
+      stmt = loads->pop ();
+      gsi = gsi_for_stmt (stmt);
+      lhs = gimple_assign_lhs (stmt);
+      ref = gimple_assign_rhs1 (stmt);
+      addr = force_gimple_operand_gsi (&gsi, build_fold_addr_expr (ref),
+				       true, NULL_TREE, true,
+				       GSI_SAME_STMT);
+      ptr = build_int_cst (reference_alias_ptr_type (ref), 0);
+      if (!SSA_NAME_PTR_INFO (addr))
+	copy_ref_info (build2 (MEM_REF, TREE_TYPE (ref), addr, ptr), ref);
+      new_stmt = gimple_build_call_internal (IFN_MASK_LOAD, 3,
+					     addr, ptr, mask);
+      gimple_call_set_lhs (new_stmt, lhs);
+      gsi_replace (&gsi, new_stmt, false);
+    }
+}
+
+/* Convert each store to a masked one.  */
+
+static void
+convert_stores_to_masked (vec<gimple *> *stores, tree mask)
+{
+  gimple *stmt, *new_stmt;
+  tree addr, ref;
+  gimple_stmt_iterator gsi;
+
+  while (stores->length () > 0)
+    {
+      tree rhs, ptr;
+      stmt = stores->pop ();
+      gsi = gsi_for_stmt (stmt);
+      ref = gimple_assign_lhs (stmt);
+      rhs = gimple_assign_rhs1 (stmt);
+      addr = force_gimple_operand_gsi (&gsi, build_fold_addr_expr (ref),
+				       true, NULL_TREE, true,
+				       GSI_SAME_STMT);
+      ptr = build_int_cst (reference_alias_ptr_type (ref), 0);
+      if (!SSA_NAME_PTR_INFO (addr))
+	copy_ref_info (build2 (MEM_REF, TREE_TYPE (ref), addr, ptr), ref);
+      new_stmt = gimple_build_call_internal (IFN_MASK_STORE, 4, addr, ptr,
+					      mask, rhs);
+      gsi_replace (&gsi, new_stmt, false);
+    }
+}
+
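+/* Combine the remainder mask MASK with the original mask argument of each
+   already masked load/store in MASKED_STMT by a BIT_AND_EXPR, view-converting
+   the original mask first if its type differs from the type of MASK.  */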
+static void
+fix_mask_for_masked_ld_st (vec<gimple *> *masked_stmt, tree mask)
+{
+  gimple *stmt, *new_stmt;
+  tree old, lhs, vectype, var, n_lhs;
+  gimple_stmt_iterator gsi;
+
+  while (masked_stmt->length () > 0)
+    {
+      stmt = masked_stmt->pop ();
+      gsi = gsi_for_stmt (stmt);
+      old = gimple_call_arg (stmt, 2);
+      vectype = TREE_TYPE (old);
+      if (TREE_TYPE (mask) != vectype)
+	{
+	  tree new_vtype = TREE_TYPE (mask);
+	  tree n_var;
+	  tree conv_expr;
+	  n_var = vect_get_new_vect_var (new_vtype, vect_simple_var, NULL);
+	  conv_expr = build1 (VIEW_CONVERT_EXPR, new_vtype, old);
+	  new_stmt = gimple_build_assign (n_var, conv_expr);
+	  n_lhs = make_ssa_name (n_var);
+	  gimple_assign_set_lhs (new_stmt, n_lhs);
+	  vectype = new_vtype;
+	  gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
+	}
+      else
+	n_lhs = old;
+      var = vect_get_new_vect_var (vectype, vect_simple_var, NULL);
+      new_stmt = gimple_build_assign (var, BIT_AND_EXPR, mask, n_lhs);
+      lhs = make_ssa_name (var, new_stmt);
+      gimple_assign_set_lhs (new_stmt, lhs);
+      gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
+      gimple_call_set_arg (stmt, 2, lhs);
+      update_stmt (stmt);
+    }
+}
+
+/* Convert vectorized reductions to VEC_COND statements to preserve
+   the reduction semantics:
+	s1 = x + s2 --> t = x + s2; s1 = (mask) ? t : s2.  */
+
+static void
+convert_reductions (loop_vec_info loop_vinfo, tree mask)
+{
+  unsigned i;
+  for (i = 0; i < LOOP_VINFO_REDUCTIONS (loop_vinfo).length (); i++)
+    {
+      gimple *stmt = LOOP_VINFO_REDUCTIONS (loop_vinfo)[i];
+      gimple_stmt_iterator gsi;
+      tree vectype;
+      tree lhs, rhs;
+      tree var, new_lhs, vec_cond_expr;
+      gimple *new_stmt, *def;
+      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+      stmt = STMT_VINFO_VEC_STMT (stmt_info);
+      lhs = gimple_assign_lhs (stmt);
+      vectype = TREE_TYPE (lhs);
+      gsi = gsi_for_stmt (stmt);
+      rhs = gimple_assign_rhs1 (stmt);
+      gcc_assert (TREE_CODE (rhs) == SSA_NAME);
+      def = SSA_NAME_DEF_STMT (rhs);
+      if (gimple_code (def) != GIMPLE_PHI)
+	{
+	  rhs = gimple_assign_rhs2 (stmt);
+	  gcc_assert (TREE_CODE (rhs) == SSA_NAME);
+	  def = SSA_NAME_DEF_STMT (rhs);
+	  gcc_assert (gimple_code (def) == GIMPLE_PHI);
+	}
+      /* Change lhs of STMT.  */
+      var = vect_get_new_vect_var (vectype, vect_simple_var, NULL);
+      new_lhs = make_ssa_name (var, stmt);
+      gimple_assign_set_lhs (stmt, new_lhs);
+      /* Generate new VEC_COND expr.  */
+      vec_cond_expr = build3 (VEC_COND_EXPR, vectype, mask, new_lhs, rhs);
+      new_stmt = gimple_build_assign (lhs, vec_cond_expr);
+      gsi_insert_after (&gsi, new_stmt, GSI_SAME_STMT);
+    }
+}
+
+/* Return true if the base pointer of the memory reference LHS is
+   incremented by the vector size, and false otherwise.  */
+
+static bool
+mem_ref_is_vec_size_incremented (loop_vec_info loop_vinfo, tree lhs)
+{
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  tree vectype = TREE_TYPE (lhs);
+  unsigned n = GET_MODE_SIZE (TYPE_MODE (vectype));
+  gphi *phi;
+  edge e = loop_latch_edge (loop);
+  tree arg;
+  gimple *def;
+  tree name;
+  if (TREE_CODE (lhs) != MEM_REF)
+    return false;
+  name = TREE_OPERAND (lhs, 0);
+  if (TREE_CODE (name) != SSA_NAME)
+    return false;
+  def = SSA_NAME_DEF_STMT (name);
+  if (!def || gimple_code (def) != GIMPLE_PHI)
+    return false;
+  phi = as_a <gphi *> (def);
+  arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
+  gcc_assert (TREE_CODE (arg) == SSA_NAME);
+  def = SSA_NAME_DEF_STMT (arg);
+  if (gimple_code (def) != GIMPLE_ASSIGN
+      || gimple_assign_rhs_code (def) != POINTER_PLUS_EXPR)
+    return false;
+  arg = gimple_assign_rhs2 (def);
+  if (TREE_CODE (arg) != INTEGER_CST)
+    arg = gimple_assign_rhs1 (def);
+  if (TREE_CODE (arg) != INTEGER_CST)
+    return false;
+  if (compare_tree_int (arg, n) != 0)
+    return false;
+  return true;
+}
+
+/* Combine the vectorized loop with its scalar remainder by masking
+   statements such as memory reads/writes and reductions so that they
+   produce a legal result.  A new vector induction variable is created to
+   generate the mask, which is simply the result of comparing that variable
+   with a vector containing the number of iterations.  The loop trip count
+   is adjusted and the scalar loop corresponding to the remainder is made
+   unreachable from the vectorized loop.  */
+
+void
+combine_vect_loop_remainder (loop_vec_info loop_vinfo)
+{
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  auto_vec<gimple *, 10> loads;
+  auto_vec<gimple *, 5> stores;
+  auto_vec<gimple *, 5> masked_ld_st;
+  int elem_size = 0;
+  int n;
+  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+  basic_block bb;
+  gimple_stmt_iterator gsi;
+  gimple *stmt;
+  stmt_vec_info stmt_info;
+  tree lhs, rhs, vectype;
+  tree vec_index, vec_mask;
+  bool has_reductions = false;
+  unsigned size = 0;
+
+  if (!loop)
+    return;
+  if (loop->inner)
+    return;  /* Outer-loop vectorization is not supported.  */
+  gcc_assert (LOOP_VINFO_VECTORIZABLE_P (loop_vinfo));
+  vect_location = find_loop_location (loop);
+  if (!LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
+      || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+    return;
+  if (!LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ()
+      || !LOOP_VINFO_GROUPED_STORES (loop_vinfo).is_empty ())
+    return;
+  bb = loop->header;
+  /* Collect all loads and stores.  */
+  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      stmt = gsi_stmt (gsi);
+      stmt_info = vinfo_for_stmt (stmt);
+      if (stmt_info && STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+	/* Not supported yet!  */
+	return;
+      /* Check that we support the given def type.  */
+      if (stmt_info)
+	switch (STMT_VINFO_DEF_TYPE (stmt_info))
+	  {
+	    case vect_induction_def:
+	      if (STMT_VINFO_LIVE_P (stmt_info))
+		return;
+	      break;
+	    case vect_nested_cycle:
+	    case vect_double_reduction_def:
+	    case vect_external_def:
+	      return;
+	    default:
+	      break;
+	  }
+
+      if (gimple_assign_load_p (stmt))
+	{
+	  lhs = gimple_assign_lhs (stmt);
+	  rhs = gimple_assign_rhs1 (stmt);
+	  vectype = TREE_TYPE (lhs);
+	  if (may_be_nonaddressable_p (rhs))
+	    return;
+	  if (!VECTOR_TYPE_P (vectype))
+	    {
+	      struct data_reference *dr;
+	      if (!stmt_info)
+		continue;
+	      dr = STMT_VINFO_DATA_REF (stmt_info);
+	      if (!dr)
+		continue;
+	      if (TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
+		return;
+	      if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) <= 0)
+		{
+		  if (dump_enabled_p ())
+		    dump_printf_loc (MSG_NOTE, vect_location,
+				 "Load with decrement is not masked.\n");
+		  return;
+		}
+	      continue;
+	    }
+	  if (vf / TYPE_VECTOR_SUBPARTS (vectype) > 1)
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_NOTE, vect_location,
+				 "multiple-types are not supported yet.\n");
+	      return;
+	    }
+	  n = GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (vectype)));
+	  if (elem_size == 0)
+	    elem_size = n;
+	  else if (n != elem_size)
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_NOTE, vect_location,
+				 "multiple-types are not supported yet.\n");
+	      return;
+	    }
+	  if (size == 0)
+	    size = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
+	  if (!can_vec_mask_load_store_p (TYPE_MODE (vectype), true))
+	    {
+	      if (dump_enabled_p ())
+		{
+		  dump_printf_loc (MSG_NOTE, vect_location,
+				   "type is not supported for masking!\n");
+		  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
+		}
+	      return;
+	    }
+	  loads.safe_push (stmt);
+	}
+      else if (gimple_store_p (stmt))
+	{
+	  gcc_assert (gimple_assign_single_p (stmt));
+	  lhs = gimple_assign_lhs (stmt);
+	  if (may_be_nonaddressable_p (lhs))
+	    return;
+	  vectype = TREE_TYPE (lhs);
+	  if (!VECTOR_TYPE_P (vectype))
+	    continue;
+	  if (vf / TYPE_VECTOR_SUBPARTS (vectype) > 1)
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_NOTE, vect_location,
+				 "multiple-types are not supported yet.\n");
+	      return;
+	    }
+	  n = GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (vectype)));
+	  if (elem_size == 0)
+	    elem_size = n;
+	  else if (n != elem_size)
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_NOTE, vect_location,
+				 "multiple-types are not supported yet.\n");
+	      return;
+	    }
+	  if (!mem_ref_is_vec_size_incremented (loop_vinfo, lhs))
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_NOTE, vect_location,
+				 "Store with decrement is not masked.\n");
+	      return;
+	    }
+	  if (size == 0)
+	    size = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
+	  if (!can_vec_mask_load_store_p (TYPE_MODE (vectype), false))
+	    {
+	      if (dump_enabled_p ())
+		{
+		  dump_printf_loc (MSG_NOTE, vect_location,
+				   "type is not supported for masking!\n");
+		  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
+		}
+	      return;
+	    }
+	  stores.safe_push (stmt);
+	}
+      else if (is_gimple_call (stmt)
+	       && gimple_call_internal_p (stmt)
+	       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
+		   || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
+	/* Need to figure out what the vectype for the new mask is.  */
+	masked_ld_st.safe_push (stmt);
+      else if (is_gimple_call (stmt))
+	return;
+    }
+
+  /* Check that all vectorizable reductions can be converted to VCOND.  */
+  if (!LOOP_VINFO_REDUCTIONS (loop_vinfo).is_empty ())
+    {
+      unsigned i;
+      has_reductions = true;
+      for (i = 0; i < LOOP_VINFO_REDUCTIONS (loop_vinfo).length (); i++)
+	{
+	  machine_mode mode;
+
+	  stmt = LOOP_VINFO_REDUCTIONS (loop_vinfo)[i];
+	  stmt_info = vinfo_for_stmt (stmt);
+	  gcc_assert (stmt_info);
+	  if (PURE_SLP_STMT (stmt_info))
+	    return;
+	  gcc_assert (STMT_VINFO_VEC_STMT (stmt_info));
+	  stmt = STMT_VINFO_VEC_STMT (stmt_info);
+	  if (gimple_code (stmt) != GIMPLE_ASSIGN)
+	    return;
+	  /* Only reductions with a binary operation are supported.  */
+	  if (get_gimple_rhs_class (gimple_assign_rhs_code (stmt))
+	      != GIMPLE_BINARY_RHS)
+	    return;
+	  lhs = gimple_assign_lhs (stmt);
+	  vectype = TREE_TYPE (lhs);
+	  if (vf / TYPE_VECTOR_SUBPARTS (vectype) > 1)
+	    /* Not yet supported!  */
+	    return;
+	  n = GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (vectype)));
+	  if (elem_size == 0)
+	    elem_size = n;
+	  else if (n != elem_size)
+	    /* Not yet supported!  */
+	    return;
+	  if (size == 0)
+	    size = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
+	  mode = TYPE_MODE (vectype);
+	  if (get_vcond_icode (mode, mode, TYPE_UNSIGNED (vectype))
+	      == CODE_FOR_nothing)
+	    return;
+	}
+    }
+  /* Check masked loads/stores, if any.  */
+  if (!masked_ld_st.is_empty ())
+    {
+      unsigned i;
+      for (i = 0; i < masked_ld_st.length (); i++)
+	{
+	  tree mask;
+	  tree vectype;
+	  optab tab;
+	  stmt = masked_ld_st[i];
+	  mask = gimple_call_arg (stmt, 2);
+	  vectype = TREE_TYPE (mask);
+	  n = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
+	  if (elem_size == 0)
+	    elem_size = n;
+	  else if (n != elem_size)
+	    /* Mask conversion is not supported yet!  */
+	    return;
+	  if (size == 0)
+	    size = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
+	  /* Check that BIT_AND is supported on target.  */
+	  tab = optab_for_tree_code (BIT_AND_EXPR, vectype, optab_default);
+	  if (!tab)
+	    return;
+	  if (optab_handler (tab, TYPE_MODE (vectype)) == CODE_FOR_nothing)
+	    return;
+	}
+    }
+
+  /* Generate the induction vector which will be used to evaluate the mask.  */
+  vec_index = gen_vec_induction (loop_vinfo, elem_size, size);
+  if (!vec_index)
+    return;
+
+  /* Generate the mask vector which will be used to mask the saved
+     statements.  */
+  vec_mask = gen_mask_for_remainder (loop_vinfo, vec_index, size);
+  gcc_assert (vec_mask);
+
+  /* Convert vectorized loads to masked ones.  */
+  if (!loads.is_empty ())
+    convert_loads_to_masked (&loads, vec_mask);
+
+  /* Convert vectorized stores to masked ones.  */
+  if (!stores.is_empty ())
+    convert_stores_to_masked (&stores, vec_mask);
+
+  if (has_reductions)
+    convert_reductions (loop_vinfo, vec_mask);
+
+  if (!masked_ld_st.is_empty ())
+    fix_mask_for_masked_ld_st (&masked_ld_st, vec_mask);
+
+  /* Fix loop trip count.  */
+  fix_vec_loop_trip_count (loop_vinfo);
+
+  /* Fix up cfg to make scalar loop remainder unreachable.  */
+  isolate_remainder (loop_vinfo);
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+		     "=== scalar remainder has been deleted ===\n");
+}
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 3e6fd35..f7366c1 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -559,6 +559,18 @@ vectorize_loops (void)
 	  }
       }
 
+  /* Try to combine vectorized loop and scalar remainder.  */
+  for (i = 1; i < vect_loops_num; i++)
+    {
+      loop_vec_info loop_vinfo;
+      loop = get_loop (cfun, i);
+      if (!loop || loop->inner)
+	continue;
+      loop_vinfo = (loop_vec_info) loop->aux;
+      if (loop_vinfo && LOOP_VINFO_VECTORIZABLE_P (loop_vinfo))
+	combine_vect_loop_remainder (loop_vinfo);
+    }
+
   for (i = 1; i < vect_loops_num; i++)
     {
       loop_vec_info loop_vinfo;
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index bf01ded..e8865bc 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -230,6 +230,8 @@ typedef struct _loop_vec_info : public vec_info {
   tree num_iters;
   /* Number of iterations of the original loop.  */
   tree num_iters_unchanged;
+  /* Number of iterations of the vectorized loop.  */
+  tree num_iters_vect_loop;
 
   /* Threshold of number of iterations below which vectorzation will not be
      performed. It is calculated from MIN_PROFITABLE_ITERS and
@@ -335,6 +337,7 @@ typedef struct _loop_vec_info : public vec_info {
 #define LOOP_VINFO_BBS(L)                  (L)->bbs
 #define LOOP_VINFO_NITERSM1(L)             (L)->num_itersm1
 #define LOOP_VINFO_NITERS(L)               (L)->num_iters
+#define LOOP_VINFO_NITERS_VECT_LOOP(L)    (L)->num_iters_vect_loop
 /* Since LOOP_VINFO_NITERS and LOOP_VINFO_NITERSM1 can change after
    prologue peeling retain total unchanged scalar loop iterations for
    cost model.  */
@@ -994,6 +997,7 @@ extern void vect_get_vec_defs (tree, tree, gimple *, vec<tree> *,
 			       vec<tree> *, slp_tree, int);
 extern tree vect_gen_perm_mask_any (tree, const unsigned char *);
 extern tree vect_gen_perm_mask_checked (tree, const unsigned char *);
+extern void combine_vect_loop_remainder (loop_vec_info);
 
 /* In tree-vect-data-refs.c.  */
 extern bool vect_can_force_dr_alignment_p (const_tree, unsigned int);


Thread overview: 17+ messages
2015-10-28 10:57 [RFC] Combine vectorized loops with its scalar remainder Yuri Rumyantsev
2015-11-03 10:08 ` Richard Henderson
2015-11-03 10:35   ` Yuri Rumyantsev
2015-11-03 11:47 ` Richard Biener
2015-11-03 12:08   ` Yuri Rumyantsev
2015-11-10 12:30     ` Richard Biener
2015-11-10 13:02       ` Ilya Enkovich
2015-11-10 14:52         ` Richard Biener
2015-11-13 10:36           ` Yuri Rumyantsev
2015-11-23 15:54             ` Yuri Rumyantsev
2015-11-24  9:21               ` Richard Biener
2015-11-27 13:49             ` Richard Biener
2015-11-30 15:04               ` Yuri Rumyantsev
2015-12-15 16:41                 ` Yuri Rumyantsev
2016-01-11 10:07                   ` Yuri Rumyantsev
2016-02-09 16:10                   ` Ilya Enkovich
2016-02-09 16:18                     ` Jeff Law
