* [PATCH] vect: Move VMAT_LOAD_STORE_LANES handlings from final loop nest
@ 2023-08-14  8:54 Kewen.Lin
From: Kewen.Lin @ 2023-08-14  8:54 UTC (permalink / raw)
  To: GCC Patches; +Cc: Richard Biener, Richard Sandiford

Hi,

Following Richi's suggestion [1], this patch moves the handling
of VMAT_LOAD_STORE_LANES out of the final loop nest of function
vectorizable_load and into its own loop.  Basically it duplicates
the final loop nest, cleans up some useless setup code for the
VMAT_LOAD_STORE_LANES case, removes some unreachable code, and
also removes the corresponding handling from the final loop nest.
A simplified sketch of the restructuring is included after the
reference below.

Bootstrapped and regtested on x86_64-redhat-linux,
aarch64-linux-gnu and powerpc64{,le}-linux-gnu.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2023-June/623329.html
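
For illustration only, here is a minimal, self-contained sketch of the
restructuring.  It is not GCC code: the enum and the two functions are
hypothetical stand-ins for the real vectorizable_load internals, kept
just to show how the VMAT_LOAD_STORE_LANES branch moves out of the
shared final loop nest into an early-returning loop of its own.

/* Standalone illustration only -- not GCC code.  All names below are
   hypothetical stand-ins for vectorizable_load internals.  */
#include <cstdio>

enum vect_memory_access_type { VMAT_CONTIGUOUS, VMAT_LOAD_STORE_LANES };

/* Before: the final loop nest dispatches on the access type in every
   copy iteration, although the branch is invariant across the loop.  */
static bool
load_before (vect_memory_access_type type, int ncopies)
{
  for (int j = 0; j < ncopies; j++)
    {
      if (type == VMAT_LOAD_STORE_LANES)
        std::printf ("copy %d: LOAD_LANES handling\n", j);
      else
        std::printf ("copy %d: generic load handling\n", j);
    }
  return true;
}

/* After: VMAT_LOAD_STORE_LANES gets its own loop and returns early,
   so the remaining final loop nest no longer needs that branch.  */
static bool
load_after (vect_memory_access_type type, int ncopies)
{
  if (type == VMAT_LOAD_STORE_LANES)
    {
      for (int j = 0; j < ncopies; j++)
        std::printf ("copy %d: LOAD_LANES handling\n", j);
      return true;
    }
  for (int j = 0; j < ncopies; j++)
    std::printf ("copy %d: generic load handling\n", j);
  return true;
}

int
main ()
{
  load_before (VMAT_LOAD_STORE_LANES, 2);
  load_after (VMAT_LOAD_STORE_LANES, 2);
  return 0;
}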

gcc/ChangeLog:

	* tree-vect-stmts.cc (vectorizable_load): Move the handling of
	VMAT_LOAD_STORE_LANES in the final loop nest to its own loop,
	and update the final nest accordingly.
---
 gcc/tree-vect-stmts.cc | 1275 ++++++++++++++++++++--------------------
 1 file changed, 634 insertions(+), 641 deletions(-)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 4f2d088484c..c361e16cb7b 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -10332,7 +10332,129 @@ vectorizable_load (vec_info *vinfo,
 	vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies, mask,
 				       &vec_masks, mask_vectype);
     }
+
   tree vec_mask = NULL_TREE;
+  if (memory_access_type == VMAT_LOAD_STORE_LANES)
+    {
+      gcc_assert (alignment_support_scheme == dr_aligned
+		  || alignment_support_scheme == dr_unaligned_supported);
+      gcc_assert (grouped_load && !slp);
+
+      unsigned int inside_cost = 0, prologue_cost = 0;
+      for (j = 0; j < ncopies; j++)
+	{
+	  if (costing_p)
+	    {
+	      /* An IFN_LOAD_LANES will load all its vector results,
+		 regardless of which ones we actually need.  Account
+		 for the cost of unused results.  */
+	      if (first_stmt_info == stmt_info)
+		{
+		  unsigned int gaps = DR_GROUP_SIZE (first_stmt_info);
+		  stmt_vec_info next_stmt_info = first_stmt_info;
+		  do
+		    {
+		      gaps -= 1;
+		      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
+		    }
+		  while (next_stmt_info);
+		  if (gaps)
+		    {
+		      if (dump_enabled_p ())
+			dump_printf_loc (MSG_NOTE, vect_location,
+					 "vect_model_load_cost: %d "
+					 "unused vectors.\n",
+					 gaps);
+		      vect_get_load_cost (vinfo, stmt_info, gaps,
+					  alignment_support_scheme,
+					  misalignment, false, &inside_cost,
+					  &prologue_cost, cost_vec, cost_vec,
+					  true);
+		    }
+		}
+	      vect_get_load_cost (vinfo, stmt_info, 1, alignment_support_scheme,
+				  misalignment, false, &inside_cost,
+				  &prologue_cost, cost_vec, cost_vec, true);
+	      continue;
+	    }
+
+	  /* 1. Create the vector or array pointer update chain.  */
+	  if (j == 0)
+	    dataref_ptr
+	      = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
+					  at_loop, offset, &dummy, gsi,
+					  &ptr_incr, false, bump);
+	  else
+	    {
+	      gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
+	      dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
+					     stmt_info, bump);
+	    }
+	  if (mask)
+	    vec_mask = vec_masks[j];
+
+	  tree vec_array = create_vector_array (vectype, vec_num);
+
+	  tree final_mask = NULL_TREE;
+	  if (loop_masks)
+	    final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
+					     ncopies, vectype, j);
+	  if (vec_mask)
+	    final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, final_mask,
+					   vec_mask, gsi);
+
+	  gcall *call;
+	  if (final_mask)
+	    {
+	      /* Emit:
+		   VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
+						VEC_MASK).  */
+	      unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
+	      tree alias_ptr = build_int_cst (ref_type, align);
+	      call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 3,
+						 dataref_ptr, alias_ptr,
+						 final_mask);
+	    }
+	  else
+	    {
+	      /* Emit:
+		   VEC_ARRAY = LOAD_LANES (MEM_REF[...all elements...]).  */
+	      data_ref = create_array_ref (aggr_type, dataref_ptr, ref_type);
+	      call = gimple_build_call_internal (IFN_LOAD_LANES, 1, data_ref);
+	    }
+	  gimple_call_set_lhs (call, vec_array);
+	  gimple_call_set_nothrow (call, true);
+	  vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
+
+	  dr_chain.create (vec_num);
+	  /* Extract each vector into an SSA_NAME.  */
+	  for (i = 0; i < vec_num; i++)
+	    {
+	      new_temp = read_vector_array (vinfo, stmt_info, gsi, scalar_dest,
+					    vec_array, i);
+	      dr_chain.quick_push (new_temp);
+	    }
+
+	  /* Record the mapping between SSA_NAMEs and statements.  */
+	  vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain);
+
+	  /* Record that VEC_ARRAY is now dead.  */
+	  vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
+
+	  dr_chain.release ();
+
+	  *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
+	}
+
+      if (costing_p && dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "vect_model_load_cost: inside_cost = %u, "
+			 "prologue_cost = %u .\n",
+			 inside_cost, prologue_cost);
+
+      return true;
+    }
+
   poly_uint64 group_elt = 0;
   unsigned int inside_cost = 0, prologue_cost = 0;
   for (j = 0; j < ncopies; j++)
@@ -10414,685 +10538,558 @@ vectorizable_load (vec_info *vinfo,
 	dr_chain.create (vec_num);

       gimple *new_stmt = NULL;
-      if (memory_access_type == VMAT_LOAD_STORE_LANES)
+      for (i = 0; i < vec_num; i++)
 	{
-	  if (costing_p)
-	    {
-	      /* An IFN_LOAD_LANES will load all its vector results,
-		 regardless of which ones we actually need.  Account
-		 for the cost of unused results.  */
-	      if (grouped_load && first_stmt_info == stmt_info)
-		{
-		  unsigned int gaps = DR_GROUP_SIZE (first_stmt_info);
-		  stmt_vec_info next_stmt_info = first_stmt_info;
-		  do
-		    {
-		      gaps -= 1;
-		      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
-		    }
-		  while (next_stmt_info);
-		  if (gaps)
-		    {
-		      if (dump_enabled_p ())
-			dump_printf_loc (MSG_NOTE, vect_location,
-					 "vect_model_load_cost: %d "
-					 "unused vectors.\n",
-					 gaps);
-		      vect_get_load_cost (vinfo, stmt_info, gaps,
-					  alignment_support_scheme,
-					  misalignment, false, &inside_cost,
-					  &prologue_cost, cost_vec, cost_vec,
-					  true);
-		    }
-		}
-	      vect_get_load_cost (vinfo, stmt_info, 1, alignment_support_scheme,
-				  misalignment, false, &inside_cost,
-				  &prologue_cost, cost_vec, cost_vec, true);
-	      continue;
-	    }
-	  tree vec_array;
-
-	  vec_array = create_vector_array (vectype, vec_num);
-
 	  tree final_mask = NULL_TREE;
-	  if (loop_masks)
-	    final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
-					     ncopies, vectype, j);
-	  if (vec_mask)
-	    final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
-					   final_mask, vec_mask, gsi);
-
-	  gcall *call;
-	  if (final_mask)
-	    {
-	      /* Emit:
-		   VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
-		                                VEC_MASK).  */
-	      unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
-	      tree alias_ptr = build_int_cst (ref_type, align);
-	      call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 3,
-						 dataref_ptr, alias_ptr,
-						 final_mask);
-	    }
-	  else
+	  tree final_len = NULL_TREE;
+	  tree bias = NULL_TREE;
+	  if (!costing_p)
 	    {
-	      /* Emit:
-		   VEC_ARRAY = LOAD_LANES (MEM_REF[...all elements...]).  */
-	      data_ref = create_array_ref (aggr_type, dataref_ptr, ref_type);
-	      call = gimple_build_call_internal (IFN_LOAD_LANES, 1, data_ref);
-	    }
-	  gimple_call_set_lhs (call, vec_array);
-	  gimple_call_set_nothrow (call, true);
-	  vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
-	  new_stmt = call;
+	      if (loop_masks)
+		final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
+						 vec_num * ncopies, vectype,
+						 vec_num * j + i);
+	      if (vec_mask)
+		final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
+					       final_mask, vec_mask, gsi);

-	  /* Extract each vector into an SSA_NAME.  */
-	  for (i = 0; i < vec_num; i++)
-	    {
-	      new_temp = read_vector_array (vinfo, stmt_info, gsi, scalar_dest,
-					    vec_array, i);
-	      dr_chain.quick_push (new_temp);
+	      if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+		dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
+					       gsi, stmt_info, bump);
 	    }

-	  /* Record the mapping between SSA_NAMEs and statements.  */
-	  vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain);
-
-	  /* Record that VEC_ARRAY is now dead.  */
-	  vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
-	}
-      else
-	{
-	  for (i = 0; i < vec_num; i++)
+	  /* 2. Create the vector-load in the loop.  */
+	  switch (alignment_support_scheme)
 	    {
-	      tree final_mask = NULL_TREE;
-	      tree final_len = NULL_TREE;
-	      tree bias = NULL_TREE;
-	      if (!costing_p)
-		{
-		  if (loop_masks)
-		    final_mask
-		      = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
-					    vec_num * ncopies, vectype,
-					    vec_num * j + i);
-		  if (vec_mask)
-		    final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
-						   final_mask, vec_mask, gsi);
-
-		  if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
-		    dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
-						   gsi, stmt_info, bump);
-		}
+	    case dr_aligned:
+	    case dr_unaligned_supported:
+	      {
+		unsigned int misalign;
+		unsigned HOST_WIDE_INT align;

-	      /* 2. Create the vector-load in the loop.  */
-	      switch (alignment_support_scheme)
-		{
-		case dr_aligned:
-		case dr_unaligned_supported:
+		if (memory_access_type == VMAT_GATHER_SCATTER
+		    && gs_info.ifn != IFN_LAST)
 		  {
-		    unsigned int misalign;
-		    unsigned HOST_WIDE_INT align;
-
-		    if (memory_access_type == VMAT_GATHER_SCATTER
-			&& gs_info.ifn != IFN_LAST)
+		    if (costing_p)
 		      {
-			if (costing_p)
-			  {
-			    unsigned int cnunits
-			      = vect_nunits_for_cost (vectype);
-			    inside_cost
-			      = record_stmt_cost (cost_vec, cnunits,
-						  scalar_load, stmt_info, 0,
-						  vect_body);
-			    break;
-			  }
-			if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
-			  vec_offset = vec_offsets[vec_num * j + i];
-			tree zero = build_zero_cst (vectype);
-			tree scale = size_int (gs_info.scale);
-
-			if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
-			  {
-			    if (loop_lens)
-			      final_len
-				= vect_get_loop_len (loop_vinfo, gsi, loop_lens,
-						     vec_num * ncopies, vectype,
-						     vec_num * j + i, 1);
-			    else
-			      final_len = build_int_cst (sizetype,
-							 TYPE_VECTOR_SUBPARTS (
-							   vectype));
-			    signed char biasval
-			      = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
-			    bias = build_int_cst (intQI_type_node, biasval);
-			    if (!final_mask)
-			      {
-				mask_vectype = truth_type_for (vectype);
-				final_mask = build_minus_one_cst (mask_vectype);
-			      }
-			  }
-
-			gcall *call;
-			if (final_len && final_mask)
-			  call = gimple_build_call_internal (
-			    IFN_MASK_LEN_GATHER_LOAD, 7, dataref_ptr,
-			    vec_offset, scale, zero, final_mask, final_len,
-			    bias);
-			else if (final_mask)
-			  call = gimple_build_call_internal
-			    (IFN_MASK_GATHER_LOAD, 5, dataref_ptr,
-			     vec_offset, scale, zero, final_mask);
-			else
-			  call = gimple_build_call_internal
-			    (IFN_GATHER_LOAD, 4, dataref_ptr,
-			     vec_offset, scale, zero);
-			gimple_call_set_nothrow (call, true);
-			new_stmt = call;
-			data_ref = NULL_TREE;
+			unsigned int cnunits = vect_nunits_for_cost (vectype);
+			inside_cost
+			  = record_stmt_cost (cost_vec, cnunits, scalar_load,
+					      stmt_info, 0, vect_body);
 			break;
 		      }
-		    else if (memory_access_type == VMAT_GATHER_SCATTER)
+		    if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+		      vec_offset = vec_offsets[vec_num * j + i];
+		    tree zero = build_zero_cst (vectype);
+		    tree scale = size_int (gs_info.scale);
+
+		    if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
 		      {
-			/* Emulated gather-scatter.  */
-			gcc_assert (!final_mask);
-			unsigned HOST_WIDE_INT const_nunits
-			  = nunits.to_constant ();
-			if (costing_p)
-			  {
-			    /* For emulated gathers N offset vector element
-			       offset add is consumed by the load).  */
-			    inside_cost
-			      = record_stmt_cost (cost_vec, const_nunits,
-						  vec_to_scalar, stmt_info, 0,
-						  vect_body);
-			    /* N scalar loads plus gathering them into a
-			       vector.  */
-			    inside_cost
-			      = record_stmt_cost (cost_vec, const_nunits,
-						  scalar_load, stmt_info, 0,
-						  vect_body);
-			    inside_cost
-			      = record_stmt_cost (cost_vec, 1, vec_construct,
-						  stmt_info, 0, vect_body);
-			    break;
-			  }
-			unsigned HOST_WIDE_INT const_offset_nunits
-			  = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
-			      .to_constant ();
-			vec<constructor_elt, va_gc> *ctor_elts;
-			vec_alloc (ctor_elts, const_nunits);
-			gimple_seq stmts = NULL;
-			/* We support offset vectors with more elements
-			   than the data vector for now.  */
-			unsigned HOST_WIDE_INT factor
-			  = const_offset_nunits / const_nunits;
-			vec_offset = vec_offsets[j / factor];
-			unsigned elt_offset = (j % factor) * const_nunits;
-			tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
-			tree scale = size_int (gs_info.scale);
-			align
-			  = get_object_alignment (DR_REF (first_dr_info->dr));
-			tree ltype = build_aligned_type (TREE_TYPE (vectype),
-							 align);
-			for (unsigned k = 0; k < const_nunits; ++k)
+			if (loop_lens)
+			  final_len
+			    = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+						 vec_num * ncopies, vectype,
+						 vec_num * j + i, 1);
+			else
+			  final_len
+			    = build_int_cst (sizetype,
+					     TYPE_VECTOR_SUBPARTS (vectype));
+			signed char biasval
+			  = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+			bias = build_int_cst (intQI_type_node, biasval);
+			if (!final_mask)
 			  {
-			    tree boff = size_binop (MULT_EXPR,
-						    TYPE_SIZE (idx_type),
-						    bitsize_int
-						      (k + elt_offset));
-			    tree idx = gimple_build (&stmts, BIT_FIELD_REF,
-						     idx_type, vec_offset,
-						     TYPE_SIZE (idx_type),
-						     boff);
-			    idx = gimple_convert (&stmts, sizetype, idx);
-			    idx = gimple_build (&stmts, MULT_EXPR,
-						sizetype, idx, scale);
-			    tree ptr = gimple_build (&stmts, PLUS_EXPR,
-						     TREE_TYPE (dataref_ptr),
-						     dataref_ptr, idx);
-			    ptr = gimple_convert (&stmts, ptr_type_node, ptr);
-			    tree elt = make_ssa_name (TREE_TYPE (vectype));
-			    tree ref = build2 (MEM_REF, ltype, ptr,
-					       build_int_cst (ref_type, 0));
-			    new_stmt = gimple_build_assign (elt, ref);
-			    gimple_set_vuse (new_stmt,
-					     gimple_vuse (gsi_stmt (*gsi)));
-			    gimple_seq_add_stmt (&stmts, new_stmt);
-			    CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt);
+			    mask_vectype = truth_type_for (vectype);
+			    final_mask = build_minus_one_cst (mask_vectype);
 			  }
-			gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
-			new_stmt = gimple_build_assign (NULL_TREE,
-							build_constructor
-							  (vectype, ctor_elts));
-			data_ref = NULL_TREE;
-			break;
 		      }

-		    if (costing_p)
-		      break;
-
-		    align =
-		      known_alignment (DR_TARGET_ALIGNMENT (first_dr_info));
-		    if (alignment_support_scheme == dr_aligned)
-		      misalign = 0;
-		    else if (misalignment == DR_MISALIGNMENT_UNKNOWN)
-		      {
-			align = dr_alignment
-			  (vect_dr_behavior (vinfo, first_dr_info));
-			misalign = 0;
-		      }
+		    gcall *call;
+		    if (final_len && final_mask)
+		      call = gimple_build_call_internal (
+			IFN_MASK_LEN_GATHER_LOAD, 7, dataref_ptr, vec_offset,
+			scale, zero, final_mask, final_len, bias);
+		    else if (final_mask)
+		      call
+			= gimple_build_call_internal (IFN_MASK_GATHER_LOAD, 5,
+						      dataref_ptr, vec_offset,
+						      scale, zero, final_mask);
 		    else
-		      misalign = misalignment;
-		    if (dataref_offset == NULL_TREE
-			&& TREE_CODE (dataref_ptr) == SSA_NAME)
-		      set_ptr_info_alignment (get_ptr_info (dataref_ptr),
-					      align, misalign);
-		    align = least_bit_hwi (misalign | align);
-
-		    /* Compute IFN when LOOP_LENS or final_mask valid.  */
-		    machine_mode vmode = TYPE_MODE (vectype);
-		    machine_mode new_vmode = vmode;
-		    internal_fn partial_ifn = IFN_LAST;
-		    if (loop_lens)
+		      call
+			= gimple_build_call_internal (IFN_GATHER_LOAD, 4,
+						      dataref_ptr, vec_offset,
+						      scale, zero);
+		    gimple_call_set_nothrow (call, true);
+		    new_stmt = call;
+		    data_ref = NULL_TREE;
+		    break;
+		  }
+		else if (memory_access_type == VMAT_GATHER_SCATTER)
+		  {
+		    /* Emulated gather-scatter.  */
+		    gcc_assert (!final_mask);
+		    unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
+		    if (costing_p)
 		      {
-			opt_machine_mode new_ovmode
-			  = get_len_load_store_mode (vmode, true,
-						     &partial_ifn);
-			new_vmode = new_ovmode.require ();
-			unsigned factor = (new_ovmode == vmode)
-					    ? 1
-					    : GET_MODE_UNIT_SIZE (vmode);
-			final_len
-			  = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
-					       vec_num * ncopies, vectype,
-					       vec_num * j + i, factor);
+			/* For emulated gathers N offset vector element
+			   offset add is consumed by the load).  */
+			inside_cost
+			  = record_stmt_cost (cost_vec, const_nunits,
+					      vec_to_scalar, stmt_info, 0,
+					      vect_body);
+			/* N scalar loads plus gathering them into a
+			   vector.  */
+			inside_cost = record_stmt_cost (cost_vec, const_nunits,
+							scalar_load, stmt_info,
+							0, vect_body);
+			inside_cost
+			  = record_stmt_cost (cost_vec, 1, vec_construct,
+					      stmt_info, 0, vect_body);
+			break;
 		      }
-		    else if (final_mask)
+		    unsigned HOST_WIDE_INT const_offset_nunits
+		      = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
+			  .to_constant ();
+		    vec<constructor_elt, va_gc> *ctor_elts;
+		    vec_alloc (ctor_elts, const_nunits);
+		    gimple_seq stmts = NULL;
+		    /* We support offset vectors with more elements
+		       than the data vector for now.  */
+		    unsigned HOST_WIDE_INT factor
+		      = const_offset_nunits / const_nunits;
+		    vec_offset = vec_offsets[j / factor];
+		    unsigned elt_offset = (j % factor) * const_nunits;
+		    tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
+		    tree scale = size_int (gs_info.scale);
+		    align = get_object_alignment (DR_REF (first_dr_info->dr));
+		    tree ltype
+		      = build_aligned_type (TREE_TYPE (vectype), align);
+		    for (unsigned k = 0; k < const_nunits; ++k)
 		      {
-			if (!can_vec_mask_load_store_p (
-			      vmode, TYPE_MODE (TREE_TYPE (final_mask)), true,
-			      &partial_ifn))
-			  gcc_unreachable ();
+			tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
+						bitsize_int (k + elt_offset));
+			tree idx = gimple_build (&stmts, BIT_FIELD_REF,
+						 idx_type, vec_offset,
+						 TYPE_SIZE (idx_type), boff);
+			idx = gimple_convert (&stmts, sizetype, idx);
+			idx = gimple_build (&stmts, MULT_EXPR, sizetype, idx,
+					    scale);
+			tree ptr = gimple_build (&stmts, PLUS_EXPR,
+						 TREE_TYPE (dataref_ptr),
+						 dataref_ptr, idx);
+			ptr = gimple_convert (&stmts, ptr_type_node, ptr);
+			tree elt = make_ssa_name (TREE_TYPE (vectype));
+			tree ref = build2 (MEM_REF, ltype, ptr,
+					   build_int_cst (ref_type, 0));
+			new_stmt = gimple_build_assign (elt, ref);
+			gimple_set_vuse (new_stmt,
+					 gimple_vuse (gsi_stmt (*gsi)));
+			gimple_seq_add_stmt (&stmts, new_stmt);
+			CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt);
 		      }
+		    gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+		    new_stmt = gimple_build_assign (
+		      NULL_TREE, build_constructor (vectype, ctor_elts));
+		    data_ref = NULL_TREE;
+		    break;
+		  }

-		    if (partial_ifn == IFN_MASK_LEN_LOAD)
+		if (costing_p)
+		  break;
+
+		align = known_alignment (DR_TARGET_ALIGNMENT (first_dr_info));
+		if (alignment_support_scheme == dr_aligned)
+		  misalign = 0;
+		else if (misalignment == DR_MISALIGNMENT_UNKNOWN)
+		  {
+		    align
+		      = dr_alignment (vect_dr_behavior (vinfo, first_dr_info));
+		    misalign = 0;
+		  }
+		else
+		  misalign = misalignment;
+		if (dataref_offset == NULL_TREE
+		    && TREE_CODE (dataref_ptr) == SSA_NAME)
+		  set_ptr_info_alignment (get_ptr_info (dataref_ptr), align,
+					  misalign);
+		align = least_bit_hwi (misalign | align);
+
+		/* Compute IFN when LOOP_LENS or final_mask valid.  */
+		machine_mode vmode = TYPE_MODE (vectype);
+		machine_mode new_vmode = vmode;
+		internal_fn partial_ifn = IFN_LAST;
+		if (loop_lens)
+		  {
+		    opt_machine_mode new_ovmode
+		      = get_len_load_store_mode (vmode, true, &partial_ifn);
+		    new_vmode = new_ovmode.require ();
+		    unsigned factor
+		      = (new_ovmode == vmode) ? 1 : GET_MODE_UNIT_SIZE (vmode);
+		    final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+						   vec_num * ncopies, vectype,
+						   vec_num * j + i, factor);
+		  }
+		else if (final_mask)
+		  {
+		    if (!can_vec_mask_load_store_p (
+			  vmode, TYPE_MODE (TREE_TYPE (final_mask)), true,
+			  &partial_ifn))
+		      gcc_unreachable ();
+		  }
+
+		if (partial_ifn == IFN_MASK_LEN_LOAD)
+		  {
+		    if (!final_len)
 		      {
-			if (!final_len)
-			  {
-			    /* Pass VF value to 'len' argument of
-			       MASK_LEN_LOAD if LOOP_LENS is invalid.  */
-			    final_len
-			      = size_int (TYPE_VECTOR_SUBPARTS (vectype));
-			  }
-			if (!final_mask)
-			  {
-			    /* Pass all ones value to 'mask' argument of
-			       MASK_LEN_LOAD if final_mask is invalid.  */
-			    mask_vectype = truth_type_for (vectype);
-			    final_mask = build_minus_one_cst (mask_vectype);
-			  }
+			/* Pass VF value to 'len' argument of
+			   MASK_LEN_LOAD if LOOP_LENS is invalid.  */
+			final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
 		      }
-		    if (final_len)
+		    if (!final_mask)
 		      {
-			signed char biasval
-			  = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
-
-			bias = build_int_cst (intQI_type_node, biasval);
+			/* Pass all ones value to 'mask' argument of
+			   MASK_LEN_LOAD if final_mask is invalid.  */
+			mask_vectype = truth_type_for (vectype);
+			final_mask = build_minus_one_cst (mask_vectype);
 		      }
+		  }
+		if (final_len)
+		  {
+		    signed char biasval
+		      = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);

-		    if (final_len)
+		    bias = build_int_cst (intQI_type_node, biasval);
+		  }
+
+		if (final_len)
+		  {
+		    tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
+		    gcall *call;
+		    if (partial_ifn == IFN_MASK_LEN_LOAD)
+		      call = gimple_build_call_internal (IFN_MASK_LEN_LOAD, 5,
+							 dataref_ptr, ptr,
+							 final_mask, final_len,
+							 bias);
+		    else
+		      call = gimple_build_call_internal (IFN_LEN_LOAD, 4,
+							 dataref_ptr, ptr,
+							 final_len, bias);
+		    gimple_call_set_nothrow (call, true);
+		    new_stmt = call;
+		    data_ref = NULL_TREE;
+
+		    /* Need conversion if it's wrapped with VnQI.  */
+		    if (vmode != new_vmode)
 		      {
-			tree ptr
-			  = build_int_cst (ref_type, align * BITS_PER_UNIT);
-			gcall *call;
-			if (partial_ifn == IFN_MASK_LEN_LOAD)
-			  call = gimple_build_call_internal (IFN_MASK_LEN_LOAD,
-							     5, dataref_ptr,
-							     ptr, final_mask,
-							     final_len, bias);
-			else
-			  call = gimple_build_call_internal (IFN_LEN_LOAD, 4,
-							     dataref_ptr, ptr,
-							     final_len, bias);
-			gimple_call_set_nothrow (call, true);
-			new_stmt = call;
-			data_ref = NULL_TREE;
-
-			/* Need conversion if it's wrapped with VnQI.  */
-			if (vmode != new_vmode)
-			  {
-			    tree new_vtype = build_vector_type_for_mode (
-			      unsigned_intQI_type_node, new_vmode);
-			    tree var = vect_get_new_ssa_name (new_vtype,
-							      vect_simple_var);
-			    gimple_set_lhs (call, var);
-			    vect_finish_stmt_generation (vinfo, stmt_info, call,
-							 gsi);
-			    tree op = build1 (VIEW_CONVERT_EXPR, vectype, var);
-			    new_stmt
-			      = gimple_build_assign (vec_dest,
-						     VIEW_CONVERT_EXPR, op);
-			  }
+			tree new_vtype = build_vector_type_for_mode (
+			  unsigned_intQI_type_node, new_vmode);
+			tree var
+			  = vect_get_new_ssa_name (new_vtype, vect_simple_var);
+			gimple_set_lhs (call, var);
+			vect_finish_stmt_generation (vinfo, stmt_info, call,
+						     gsi);
+			tree op = build1 (VIEW_CONVERT_EXPR, vectype, var);
+			new_stmt = gimple_build_assign (vec_dest,
+							VIEW_CONVERT_EXPR, op);
 		      }
-		    else if (final_mask)
+		  }
+		else if (final_mask)
+		  {
+		    tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
+		    gcall *call = gimple_build_call_internal (IFN_MASK_LOAD, 3,
+							      dataref_ptr, ptr,
+							      final_mask);
+		    gimple_call_set_nothrow (call, true);
+		    new_stmt = call;
+		    data_ref = NULL_TREE;
+		  }
+		else
+		  {
+		    tree ltype = vectype;
+		    tree new_vtype = NULL_TREE;
+		    unsigned HOST_WIDE_INT gap = DR_GROUP_GAP (first_stmt_info);
+		    unsigned int vect_align
+		      = vect_known_alignment_in_bytes (first_dr_info, vectype);
+		    unsigned int scalar_dr_size
+		      = vect_get_scalar_dr_size (first_dr_info);
+		    /* If there's no peeling for gaps but we have a gap
+		       with slp loads then load the lower half of the
+		       vector only.  See get_group_load_store_type for
+		       when we apply this optimization.  */
+		    if (slp
+			&& loop_vinfo
+			&& !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && gap != 0
+			&& known_eq (nunits, (group_size - gap) * 2)
+			&& known_eq (nunits, group_size)
+			&& gap >= (vect_align / scalar_dr_size))
 		      {
-			tree ptr = build_int_cst (ref_type,
-						  align * BITS_PER_UNIT);
-			gcall *call
-			  = gimple_build_call_internal (IFN_MASK_LOAD, 3,
-							dataref_ptr, ptr,
-							final_mask);
-			gimple_call_set_nothrow (call, true);
-			new_stmt = call;
-			data_ref = NULL_TREE;
+			tree half_vtype;
+			new_vtype
+			  = vector_vector_composition_type (vectype, 2,
+							    &half_vtype);
+			if (new_vtype != NULL_TREE)
+			  ltype = half_vtype;
 		      }
+		    tree offset
+		      = (dataref_offset ? dataref_offset
+					: build_int_cst (ref_type, 0));
+		    if (ltype != vectype
+			&& memory_access_type == VMAT_CONTIGUOUS_REVERSE)
+		      {
+			unsigned HOST_WIDE_INT gap_offset
+			  = gap * tree_to_uhwi (TYPE_SIZE_UNIT (elem_type));
+			tree gapcst = build_int_cst (ref_type, gap_offset);
+			offset = size_binop (PLUS_EXPR, offset, gapcst);
+		      }
+		    data_ref
+		      = fold_build2 (MEM_REF, ltype, dataref_ptr, offset);
+		    if (alignment_support_scheme == dr_aligned)
+		      ;
 		    else
+		      TREE_TYPE (data_ref)
+			= build_aligned_type (TREE_TYPE (data_ref),
+					      align * BITS_PER_UNIT);
+		    if (ltype != vectype)
 		      {
-			tree ltype = vectype;
-			tree new_vtype = NULL_TREE;
-			unsigned HOST_WIDE_INT gap
-			  = DR_GROUP_GAP (first_stmt_info);
-			unsigned int vect_align
-			  = vect_known_alignment_in_bytes (first_dr_info,
-							   vectype);
-			unsigned int scalar_dr_size
-			  = vect_get_scalar_dr_size (first_dr_info);
-			/* If there's no peeling for gaps but we have a gap
-			   with slp loads then load the lower half of the
-			   vector only.  See get_group_load_store_type for
-			   when we apply this optimization.  */
-			if (slp
-			    && loop_vinfo
-			    && !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
-			    && gap != 0
-			    && known_eq (nunits, (group_size - gap) * 2)
-			    && known_eq (nunits, group_size)
-			    && gap >= (vect_align / scalar_dr_size))
+			vect_copy_ref_info (data_ref,
+					    DR_REF (first_dr_info->dr));
+			tree tem = make_ssa_name (ltype);
+			new_stmt = gimple_build_assign (tem, data_ref);
+			vect_finish_stmt_generation (vinfo, stmt_info, new_stmt,
+						     gsi);
+			data_ref = NULL;
+			vec<constructor_elt, va_gc> *v;
+			vec_alloc (v, 2);
+			if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
 			  {
-			    tree half_vtype;
-			    new_vtype
-			      = vector_vector_composition_type (vectype, 2,
-								&half_vtype);
-			    if (new_vtype != NULL_TREE)
-			      ltype = half_vtype;
+			    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
+						    build_zero_cst (ltype));
+			    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
 			  }
-			tree offset
-			  = (dataref_offset ? dataref_offset
-					    : build_int_cst (ref_type, 0));
-			if (ltype != vectype
-			    && memory_access_type == VMAT_CONTIGUOUS_REVERSE)
+			else
 			  {
-			    unsigned HOST_WIDE_INT gap_offset
-			      = gap * tree_to_uhwi (TYPE_SIZE_UNIT (elem_type));
-			    tree gapcst = build_int_cst (ref_type, gap_offset);
-			    offset = size_binop (PLUS_EXPR, offset, gapcst);
+			    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
+			    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
+						    build_zero_cst (ltype));
 			  }
-			data_ref
-			  = fold_build2 (MEM_REF, ltype, dataref_ptr, offset);
-			if (alignment_support_scheme == dr_aligned)
-			  ;
+			gcc_assert (new_vtype != NULL_TREE);
+			if (new_vtype == vectype)
+			  new_stmt = gimple_build_assign (
+			    vec_dest, build_constructor (vectype, v));
 			else
-			  TREE_TYPE (data_ref)
-			    = build_aligned_type (TREE_TYPE (data_ref),
-						  align * BITS_PER_UNIT);
-			if (ltype != vectype)
 			  {
-			    vect_copy_ref_info (data_ref,
-						DR_REF (first_dr_info->dr));
-			    tree tem = make_ssa_name (ltype);
-			    new_stmt = gimple_build_assign (tem, data_ref);
+			    tree new_vname = make_ssa_name (new_vtype);
+			    new_stmt = gimple_build_assign (
+			      new_vname, build_constructor (new_vtype, v));
 			    vect_finish_stmt_generation (vinfo, stmt_info,
 							 new_stmt, gsi);
-			    data_ref = NULL;
-			    vec<constructor_elt, va_gc> *v;
-			    vec_alloc (v, 2);
-			    if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
-			      {
-				CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
-							build_zero_cst (ltype));
-				CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
-			      }
-			    else
-			      {
-				CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
-				CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
-							build_zero_cst (ltype));
-			      }
-			    gcc_assert (new_vtype != NULL_TREE);
-			    if (new_vtype == vectype)
-			      new_stmt = gimple_build_assign (
-				vec_dest, build_constructor (vectype, v));
-			    else
-			      {
-				tree new_vname = make_ssa_name (new_vtype);
-				new_stmt = gimple_build_assign (
-				  new_vname, build_constructor (new_vtype, v));
-				vect_finish_stmt_generation (vinfo, stmt_info,
-							     new_stmt, gsi);
-				new_stmt = gimple_build_assign (
-				  vec_dest, build1 (VIEW_CONVERT_EXPR, vectype,
-						    new_vname));
-			      }
+			    new_stmt = gimple_build_assign (
+			      vec_dest,
+			      build1 (VIEW_CONVERT_EXPR, vectype, new_vname));
 			  }
 		      }
-		    break;
 		  }
-		case dr_explicit_realign:
-		  {
-		    if (costing_p)
-		      break;
-		    tree ptr, bump;
-
-		    tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype));
+		break;
+	      }
+	    case dr_explicit_realign:
+	      {
+		if (costing_p)
+		  break;
+		tree ptr, bump;

-		    if (compute_in_loop)
-		      msq = vect_setup_realignment (vinfo, first_stmt_info, gsi,
-						    &realignment_token,
-						    dr_explicit_realign,
-						    dataref_ptr, NULL);
+		tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype));

-		    if (TREE_CODE (dataref_ptr) == SSA_NAME)
-		      ptr = copy_ssa_name (dataref_ptr);
-		    else
-		      ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
-		    // For explicit realign the target alignment should be
-		    // known at compile time.
-		    unsigned HOST_WIDE_INT align =
-		      DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
-		    new_stmt = gimple_build_assign
-				 (ptr, BIT_AND_EXPR, dataref_ptr,
-				  build_int_cst
-				  (TREE_TYPE (dataref_ptr),
-				   -(HOST_WIDE_INT) align));
-		    vect_finish_stmt_generation (vinfo, stmt_info,
-						 new_stmt, gsi);
-		    data_ref
-		      = build2 (MEM_REF, vectype, ptr,
-				build_int_cst (ref_type, 0));
-		    vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
-		    vec_dest = vect_create_destination_var (scalar_dest,
-							    vectype);
-		    new_stmt = gimple_build_assign (vec_dest, data_ref);
-		    new_temp = make_ssa_name (vec_dest, new_stmt);
-		    gimple_assign_set_lhs (new_stmt, new_temp);
-		    gimple_move_vops (new_stmt, stmt_info->stmt);
-		    vect_finish_stmt_generation (vinfo, stmt_info,
-						 new_stmt, gsi);
-		    msq = new_temp;
-
-		    bump = size_binop (MULT_EXPR, vs,
-				       TYPE_SIZE_UNIT (elem_type));
-		    bump = size_binop (MINUS_EXPR, bump, size_one_node);
-		    ptr = bump_vector_ptr (vinfo, dataref_ptr, NULL, gsi,
-					   stmt_info, bump);
-		    new_stmt = gimple_build_assign
-				 (NULL_TREE, BIT_AND_EXPR, ptr,
-				  build_int_cst
-				  (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
-		    if (TREE_CODE (ptr) == SSA_NAME)
-		      ptr = copy_ssa_name (ptr, new_stmt);
-		    else
-		      ptr = make_ssa_name (TREE_TYPE (ptr), new_stmt);
-		    gimple_assign_set_lhs (new_stmt, ptr);
-		    vect_finish_stmt_generation (vinfo, stmt_info,
-						 new_stmt, gsi);
-		    data_ref
-		      = build2 (MEM_REF, vectype, ptr,
-				build_int_cst (ref_type, 0));
-		    break;
-		  }
-		case dr_explicit_realign_optimized:
-		  {
-		    if (costing_p)
-		      break;
-		    if (TREE_CODE (dataref_ptr) == SSA_NAME)
-		      new_temp = copy_ssa_name (dataref_ptr);
-		    else
-		      new_temp = make_ssa_name (TREE_TYPE (dataref_ptr));
-		    // We should only be doing this if we know the target
-		    // alignment at compile time.
-		    unsigned HOST_WIDE_INT align =
-		      DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
-		    new_stmt = gimple_build_assign
-		      (new_temp, BIT_AND_EXPR, dataref_ptr,
-		       build_int_cst (TREE_TYPE (dataref_ptr),
-				     -(HOST_WIDE_INT) align));
-		    vect_finish_stmt_generation (vinfo, stmt_info,
-						 new_stmt, gsi);
-		    data_ref
-		      = build2 (MEM_REF, vectype, new_temp,
-				build_int_cst (ref_type, 0));
-		    break;
-		  }
-		default:
-		  gcc_unreachable ();
-		}
+		if (compute_in_loop)
+		  msq = vect_setup_realignment (vinfo, first_stmt_info, gsi,
+						&realignment_token,
+						dr_explicit_realign,
+						dataref_ptr, NULL);
+
+		if (TREE_CODE (dataref_ptr) == SSA_NAME)
+		  ptr = copy_ssa_name (dataref_ptr);
+		else
+		  ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
+		// For explicit realign the target alignment should be
+		// known at compile time.
+		unsigned HOST_WIDE_INT align
+		  = DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
+		new_stmt = gimple_build_assign (
+		  ptr, BIT_AND_EXPR, dataref_ptr,
+		  build_int_cst (TREE_TYPE (dataref_ptr),
+				 -(HOST_WIDE_INT) align));
+		vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+		data_ref
+		  = build2 (MEM_REF, vectype, ptr, build_int_cst (ref_type, 0));
+		vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
+		vec_dest = vect_create_destination_var (scalar_dest, vectype);
+		new_stmt = gimple_build_assign (vec_dest, data_ref);
+		new_temp = make_ssa_name (vec_dest, new_stmt);
+		gimple_assign_set_lhs (new_stmt, new_temp);
+		gimple_move_vops (new_stmt, stmt_info->stmt);
+		vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+		msq = new_temp;
+
+		bump = size_binop (MULT_EXPR, vs, TYPE_SIZE_UNIT (elem_type));
+		bump = size_binop (MINUS_EXPR, bump, size_one_node);
+		ptr = bump_vector_ptr (vinfo, dataref_ptr, NULL, gsi, stmt_info,
+				       bump);
+		new_stmt = gimple_build_assign (
+		  NULL_TREE, BIT_AND_EXPR, ptr,
+		  build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
+		if (TREE_CODE (ptr) == SSA_NAME)
+		  ptr = copy_ssa_name (ptr, new_stmt);
+		else
+		  ptr = make_ssa_name (TREE_TYPE (ptr), new_stmt);
+		gimple_assign_set_lhs (new_stmt, ptr);
+		vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+		data_ref
+		  = build2 (MEM_REF, vectype, ptr, build_int_cst (ref_type, 0));
+		break;
+	      }
+	    case dr_explicit_realign_optimized:
+	      {
+		if (costing_p)
+		  break;
+		if (TREE_CODE (dataref_ptr) == SSA_NAME)
+		  new_temp = copy_ssa_name (dataref_ptr);
+		else
+		  new_temp = make_ssa_name (TREE_TYPE (dataref_ptr));
+		// We should only be doing this if we know the target
+		// alignment at compile time.
+		unsigned HOST_WIDE_INT align
+		  = DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
+		new_stmt = gimple_build_assign (
+		  new_temp, BIT_AND_EXPR, dataref_ptr,
+		  build_int_cst (TREE_TYPE (dataref_ptr),
+				 -(HOST_WIDE_INT) align));
+		vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+		data_ref = build2 (MEM_REF, vectype, new_temp,
+				   build_int_cst (ref_type, 0));
+		break;
+	      }
+	    default:
+	      gcc_unreachable ();
+	    }

-	      /* One common place to cost the above vect load for different
-		 alignment support schemes.  */
-	      if (costing_p)
-		{
-		  /* For VMAT_CONTIGUOUS_PERMUTE if it's grouped load, we
-		     only need to take care of the first stmt, whose
-		     stmt_info is first_stmt_info, vec_num iterating on it
-		     will cover the cost for the remaining, it's consistent
-		     with transforming.  For the prologue cost for realign,
-		     we only need to count it once for the whole group.  */
-		  bool first_stmt_info_p = first_stmt_info == stmt_info;
-		  bool add_realign_cost = first_stmt_info_p && i == 0;
-		  if (memory_access_type == VMAT_CONTIGUOUS
-		      || memory_access_type == VMAT_CONTIGUOUS_REVERSE
-		      || (memory_access_type == VMAT_CONTIGUOUS_PERMUTE
-			  && (!grouped_load || first_stmt_info_p)))
-		    vect_get_load_cost (vinfo, stmt_info, 1,
-					alignment_support_scheme, misalignment,
-					add_realign_cost, &inside_cost,
-					&prologue_cost, cost_vec, cost_vec,
-					true);
-		}
-	      else
+	  /* One common place to cost the above vect load for different
+	     alignment support schemes.  */
+	  if (costing_p)
+	    {
+	      /* For VMAT_CONTIGUOUS_PERMUTE if it's grouped load, we
+		 only need to take care of the first stmt, whose
+		 stmt_info is first_stmt_info, vec_num iterating on it
+		 will cover the cost for the remaining, it's consistent
+		 with transforming.  For the prologue cost for realign,
+		 we only need to count it once for the whole group.  */
+	      bool first_stmt_info_p = first_stmt_info == stmt_info;
+	      bool add_realign_cost = first_stmt_info_p && i == 0;
+	      if (memory_access_type == VMAT_CONTIGUOUS
+		  || memory_access_type == VMAT_CONTIGUOUS_REVERSE
+		  || (memory_access_type == VMAT_CONTIGUOUS_PERMUTE
+		      && (!grouped_load || first_stmt_info_p)))
+		vect_get_load_cost (vinfo, stmt_info, 1,
+				    alignment_support_scheme, misalignment,
+				    add_realign_cost, &inside_cost,
+				    &prologue_cost, cost_vec, cost_vec, true);
+	    }
+	  else
+	    {
+	      vec_dest = vect_create_destination_var (scalar_dest, vectype);
+	      /* DATA_REF is null if we've already built the statement.  */
+	      if (data_ref)
 		{
-		  vec_dest = vect_create_destination_var (scalar_dest, vectype);
-		  /* DATA_REF is null if we've already built the statement.  */
-		  if (data_ref)
-		    {
-		      vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
-		      new_stmt = gimple_build_assign (vec_dest, data_ref);
-		    }
-		  new_temp = make_ssa_name (vec_dest, new_stmt);
-		  gimple_set_lhs (new_stmt, new_temp);
-		  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+		  vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
+		  new_stmt = gimple_build_assign (vec_dest, data_ref);
 		}
+	      new_temp = make_ssa_name (vec_dest, new_stmt);
+	      gimple_set_lhs (new_stmt, new_temp);
+	      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+	    }

-	      /* 3. Handle explicit realignment if necessary/supported.
-		 Create in loop:
-		   vec_dest = realign_load (msq, lsq, realignment_token)  */
-	      if (!costing_p
-		  && (alignment_support_scheme == dr_explicit_realign_optimized
-		      || alignment_support_scheme == dr_explicit_realign))
-		{
-		  lsq = gimple_assign_lhs (new_stmt);
-		  if (!realignment_token)
-		    realignment_token = dataref_ptr;
-		  vec_dest = vect_create_destination_var (scalar_dest, vectype);
-		  new_stmt = gimple_build_assign (vec_dest, REALIGN_LOAD_EXPR,
-						  msq, lsq, realignment_token);
-		  new_temp = make_ssa_name (vec_dest, new_stmt);
-		  gimple_assign_set_lhs (new_stmt, new_temp);
-		  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+	  /* 3. Handle explicit realignment if necessary/supported.
+	     Create in loop:
+	       vec_dest = realign_load (msq, lsq, realignment_token)  */
+	  if (!costing_p
+	      && (alignment_support_scheme == dr_explicit_realign_optimized
+		  || alignment_support_scheme == dr_explicit_realign))
+	    {
+	      lsq = gimple_assign_lhs (new_stmt);
+	      if (!realignment_token)
+		realignment_token = dataref_ptr;
+	      vec_dest = vect_create_destination_var (scalar_dest, vectype);
+	      new_stmt = gimple_build_assign (vec_dest, REALIGN_LOAD_EXPR, msq,
+					      lsq, realignment_token);
+	      new_temp = make_ssa_name (vec_dest, new_stmt);
+	      gimple_assign_set_lhs (new_stmt, new_temp);
+	      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);

-		  if (alignment_support_scheme == dr_explicit_realign_optimized)
-		    {
-		      gcc_assert (phi);
-		      if (i == vec_num - 1 && j == ncopies - 1)
-			add_phi_arg (phi, lsq,
-				     loop_latch_edge (containing_loop),
-				     UNKNOWN_LOCATION);
-		      msq = lsq;
-		    }
+	      if (alignment_support_scheme == dr_explicit_realign_optimized)
+		{
+		  gcc_assert (phi);
+		  if (i == vec_num - 1 && j == ncopies - 1)
+		    add_phi_arg (phi, lsq, loop_latch_edge (containing_loop),
+				 UNKNOWN_LOCATION);
+		  msq = lsq;
 		}
+	    }

-	      if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
+	  if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
+	    {
+	      if (costing_p)
+		inside_cost = record_stmt_cost (cost_vec, 1, vec_perm,
+						stmt_info, 0, vect_body);
+	      else
 		{
-		  if (costing_p)
-		    inside_cost = record_stmt_cost (cost_vec, 1, vec_perm,
-						    stmt_info, 0, vect_body);
-		  else
-		    {
-		      tree perm_mask = perm_mask_for_reverse (vectype);
-		      new_temp
-			= permute_vec_elements (vinfo, new_temp, new_temp,
-						perm_mask, stmt_info, gsi);
-		      new_stmt = SSA_NAME_DEF_STMT (new_temp);
-		    }
+		  tree perm_mask = perm_mask_for_reverse (vectype);
+		  new_temp = permute_vec_elements (vinfo, new_temp, new_temp,
+						   perm_mask, stmt_info, gsi);
+		  new_stmt = SSA_NAME_DEF_STMT (new_temp);
 		}
+	    }

-	      /* Collect vector loads and later create their permutation in
-		 vect_transform_grouped_load ().  */
-	      if (!costing_p && (grouped_load || slp_perm))
-		dr_chain.quick_push (new_temp);
+	  /* Collect vector loads and later create their permutation in
+	     vect_transform_grouped_load ().  */
+	  if (!costing_p && (grouped_load || slp_perm))
+	    dr_chain.quick_push (new_temp);

-	      /* Store vector loads in the corresponding SLP_NODE.  */
-	      if (!costing_p && slp && !slp_perm)
-		slp_node->push_vec_def (new_stmt);
+	  /* Store vector loads in the corresponding SLP_NODE.  */
+	  if (!costing_p && slp && !slp_perm)
+	    slp_node->push_vec_def (new_stmt);

-	      /* With SLP permutation we load the gaps as well, without
-	         we need to skip the gaps after we manage to fully load
-		 all elements.  group_gap_adj is DR_GROUP_SIZE here.  */
-	      group_elt += nunits;
-	      if (!costing_p
-		  && maybe_ne (group_gap_adj, 0U)
-		  && !slp_perm
-		  && known_eq (group_elt, group_size - group_gap_adj))
-		{
-		  poly_wide_int bump_val
-		    = (wi::to_wide (TYPE_SIZE_UNIT (elem_type))
-		       * group_gap_adj);
-		  if (tree_int_cst_sgn
-			(vect_dr_behavior (vinfo, dr_info)->step) == -1)
-		    bump_val = -bump_val;
-		  tree bump = wide_int_to_tree (sizetype, bump_val);
-		  dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
-						 gsi, stmt_info, bump);
-		  group_elt = 0;
-		}
-	    }
-	  /* Bump the vector pointer to account for a gap or for excess
-	     elements loaded for a permuted SLP load.  */
+	  /* With SLP permutation we load the gaps as well, without
+	     we need to skip the gaps after we manage to fully load
+	     all elements.  group_gap_adj is DR_GROUP_SIZE here.  */
+	  group_elt += nunits;
 	  if (!costing_p
 	      && maybe_ne (group_gap_adj, 0U)
-	      && slp_perm)
+	      && !slp_perm
+	      && known_eq (group_elt, group_size - group_gap_adj))
 	    {
 	      poly_wide_int bump_val
-		= (wi::to_wide (TYPE_SIZE_UNIT (elem_type))
-		   * group_gap_adj);
-	      if (tree_int_cst_sgn
-		    (vect_dr_behavior (vinfo, dr_info)->step) == -1)
+		= (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj);
+	      if (tree_int_cst_sgn (vect_dr_behavior (vinfo, dr_info)->step)
+		  == -1)
 		bump_val = -bump_val;
 	      tree bump = wide_int_to_tree (sizetype, bump_val);
 	      dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
 					     stmt_info, bump);
+	      group_elt = 0;
 	    }
 	}
+      /* Bump the vector pointer to account for a gap or for excess
+	 elements loaded for a permuted SLP load.  */
+      if (!costing_p
+	  && maybe_ne (group_gap_adj, 0U)
+	  && slp_perm)
+	{
+	  poly_wide_int bump_val
+	    = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj);
+	  if (tree_int_cst_sgn (vect_dr_behavior (vinfo, dr_info)->step) == -1)
+	    bump_val = -bump_val;
+	  tree bump = wide_int_to_tree (sizetype, bump_val);
+	  dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
+					 stmt_info, bump);
+	}

       if (slp && !slp_perm)
 	continue;
@@ -11120,39 +11117,36 @@ vectorizable_load (vec_info *vinfo,
 	    }
 	}
       else
-        {
-          if (grouped_load)
-  	    {
-	      if (memory_access_type != VMAT_LOAD_STORE_LANES)
+	{
+	  if (grouped_load)
+	    {
+	      gcc_assert (memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
+	      /* We assume that the cost of a single load-lanes instruction
+		 is equivalent to the cost of DR_GROUP_SIZE separate loads.
+		 If a grouped access is instead being provided by a
+		 load-and-permute operation, include the cost of the
+		 permutes.  */
+	      if (costing_p && first_stmt_info == stmt_info)
 		{
-		  gcc_assert (memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
-		  /* We assume that the cost of a single load-lanes instruction
-		     is equivalent to the cost of DR_GROUP_SIZE separate loads.
-		     If a grouped access is instead being provided by a
-		     load-and-permute operation, include the cost of the
-		     permutes.  */
-		  if (costing_p && first_stmt_info == stmt_info)
-		    {
-		      /* Uses an even and odd extract operations or shuffle
-			 operations for each needed permute.  */
-		      int group_size = DR_GROUP_SIZE (first_stmt_info);
-		      int nstmts = ceil_log2 (group_size) * group_size;
-		      inside_cost
-			+= record_stmt_cost (cost_vec, nstmts, vec_perm,
-					     stmt_info, 0, vect_body);
+		  /* Uses an even and odd extract operations or shuffle
+		     operations for each needed permute.  */
+		  int group_size = DR_GROUP_SIZE (first_stmt_info);
+		  int nstmts = ceil_log2 (group_size) * group_size;
+		  inside_cost += record_stmt_cost (cost_vec, nstmts, vec_perm,
+						   stmt_info, 0, vect_body);

-		      if (dump_enabled_p ())
-			dump_printf_loc (
-			  MSG_NOTE, vect_location,
-			  "vect_model_load_cost: strided group_size = %d .\n",
-			  group_size);
-		    }
-		  else if (!costing_p)
-		    vect_transform_grouped_load (vinfo, stmt_info, dr_chain,
-						 group_size, gsi);
+		  if (dump_enabled_p ())
+		    dump_printf_loc (MSG_NOTE, vect_location,
+				     "vect_model_load_cost:"
+				     "strided group_size = %d .\n",
+				     group_size);
+		}
+	      else if (!costing_p)
+		{
+		  vect_transform_grouped_load (vinfo, stmt_info, dr_chain,
+					       group_size, gsi);
+		  *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
 		}
-	      if (!costing_p)
-		*vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
 	    }
 	  else if (!costing_p)
 	    STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
@@ -11166,7 +11160,8 @@ vectorizable_load (vec_info *vinfo,
     {
       gcc_assert (memory_access_type != VMAT_INVARIANT
 		  && memory_access_type != VMAT_ELEMENTWISE
-		  && memory_access_type != VMAT_STRIDED_SLP);
+		  && memory_access_type != VMAT_STRIDED_SLP
+		  && memory_access_type != VMAT_LOAD_STORE_LANES);
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_NOTE, vect_location,
 			 "vect_model_load_cost: inside_cost = %u, "
--
2.31.1
