From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <linkw@sourceware.org>
Received: by sourceware.org (Postfix, from userid 2063)
	id 343293858C2A; Mon, 23 Oct 2023 02:24:31 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 343293858C2A
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1698027871;
	bh=gM+wcvRaxzJhZFlr3YhZUa3REB/yGWt+/0157coNOyw=;
	h=From:To:Subject:Date:From;
	b=ehdOAgF3SdLiJt5xQIKQd9Om+TbQnQmGm/KUtc2ApRnpali/fDWh0Ck7VzzrQkio3
	 YBk+C9SFqLwxlJraTd2Lb85Sdmm+2CVTO/ENyGCtVnMk7JWFQM0KPEUvg6n5Ka64MH
	 O60Hjg10fICshCPzuR8LVpYEe9QDNsjbtmxAuxao=
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Content-Type: text/plain; charset="utf-8"
From: Kewen Lin <linkw@gcc.gnu.org>
To: gcc-cvs@gcc.gnu.org
Subject: [gcc r14-4842] vect: Cost adjacent vector loads/stores together
 [PR111784]
X-Act-Checkin: gcc
X-Git-Author: Kewen Lin <linkw@linux.ibm.com>
X-Git-Refname: refs/heads/master
X-Git-Oldrev: 1df490edd48042b07aa780b088148a9118cbcb46
X-Git-Newrev: 1908775f7982bd2de36df5d94396eca0865bad9a
Message-Id: <20231023022431.343293858C2A@sourceware.org>
Date: Mon, 23 Oct 2023 02:24:31 +0000 (GMT)
List-Id: <gcc-cvs.sourceware.org>

https://gcc.gnu.org/g:1908775f7982bd2de36df5d94396eca0865bad9a

commit r14-4842-g1908775f7982bd2de36df5d94396eca0865bad9a
Author: Kewen Lin <linkw@linux.ibm.com>
Date:   Sun Oct 22 21:18:40 2023 -0500

    vect: Cost adjacent vector loads/stores together [PR111784]
    
    Following the comments in [1][2], this patch changes how some
    adjacent vector loads/stores are costed: instead of costing
    them one by one, they are costed together in a single call
    with their total number.
    
    It helps to fix the exposed regression PR111784 on aarch64,
    as the aarch64-specific costing can make different decisions
    depending on how the costing is done (counting with the total
    number vs. counting one by one).  Based on a reduced test
    case from PR111784, considering only vec_num is already
    enough to fix the regression, but the vector loads/stores
    across ncopies are also adjacent accesses, so they are
    handled as well.
    
    Note that this patch leaves the costing for
    dr_explicit_realign and dr_explicit_realign_optimized alone
    to keep it simple.  Changing the costing way would make a
    difference for them, since one of their costs depends on
    targetm.vectorize.builtin_mask_for_load and is counted
    according to the number of calls.  IIUC, these two
    dr_alignment_support schemes are mainly used for old Power
    (which only has 16-byte aligned vector load/store but no
    unaligned vector load/store)?
    
    [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-September/630742.html
    [2] https://gcc.gnu.org/pipermail/gcc-patches/2023-September/630744.html
    
            PR tree-optimization/111784
    
    gcc/ChangeLog:
    
            * tree-vect-stmts.cc (vectorizable_store): Adjust costing way for
            adjacent vector stores, by costing them with the total number
            rather than costing them one by one.
            (vectorizable_load): Adjust costing way for adjacent vector
            loads, by costing them with the total number rather than costing
            them one by one.

Diff:
---
 gcc/tree-vect-stmts.cc | 137 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 95 insertions(+), 42 deletions(-)
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 337b24c51f95..99ba75e98c0d 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -8626,6 +8626,9 @@ vectorizable_store (vec_info *vinfo,
       alias_off = build_int_cst (ref_type, 0);
       stmt_vec_info next_stmt_info = first_stmt_info;
       auto_vec<tree> vec_oprnds (ncopies);
+      /* For costing some adjacent vector stores, we'd like to cost with
+	 the total number of them once instead of cost each one by one. */
+      unsigned int n_adjacent_stores = 0;
       for (g = 0; g < group_size; g++)
 	{
 	  running_off = offvar;
@@ -8683,10 +8686,7 @@ vectorizable_store (vec_info *vinfo,
 			 store to avoid ICE like 110776.  */
 		      if (VECTOR_TYPE_P (ltype)
 			  && known_ne (TYPE_VECTOR_SUBPARTS (ltype), 1U))
-			vect_get_store_cost (vinfo, stmt_info, 1,
-					     alignment_support_scheme,
-					     misalignment, &inside_cost,
-					     cost_vec);
+			n_adjacent_stores++;
 		      else
 			inside_cost
 			  += record_stmt_cost (cost_vec, 1, scalar_store,
@@ -8743,11 +8743,18 @@ vectorizable_store (vec_info *vinfo,
 	    break;
 	}
 
-      if (costing_p && dump_enabled_p ())
-	dump_printf_loc (MSG_NOTE, vect_location,
-			 "vect_model_store_cost: inside_cost = %d, "
-			 "prologue_cost = %d .\n",
-			 inside_cost, prologue_cost);
+      if (costing_p)
+	{
+	  if (n_adjacent_stores > 0)
+	    vect_get_store_cost (vinfo, stmt_info, n_adjacent_stores,
+				 alignment_support_scheme, misalignment,
+				 &inside_cost, cost_vec);
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "vect_model_store_cost: inside_cost = %d, "
+			     "prologue_cost = %d .\n",
+			     inside_cost, prologue_cost);
+	}
 
       return true;
     }
@@ -8854,6 +8861,9 @@ vectorizable_store (vec_info *vinfo,
     {
       gcc_assert (!slp && grouped_store);
       unsigned inside_cost = 0, prologue_cost = 0;
+      /* For costing some adjacent vector stores, we'd like to cost with
+	 the total number of them once instead of cost each one by one. */
+      unsigned int n_adjacent_stores = 0;
       for (j = 0; j < ncopies; j++)
 	{
 	  gimple *new_stmt;
@@ -8919,10 +8929,7 @@ vectorizable_store (vec_info *vinfo,
 
 	  if (costing_p)
 	    {
-	      for (i = 0; i < vec_num; i++)
-		vect_get_store_cost (vinfo, stmt_info, 1,
-				     alignment_support_scheme, misalignment,
-				     &inside_cost, cost_vec);
+	      n_adjacent_stores += vec_num;
 	      continue;
 	    }
 
@@ -9012,11 +9019,18 @@ vectorizable_store (vec_info *vinfo,
 	  STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
 	}
 
-      if (costing_p && dump_enabled_p ())
-	dump_printf_loc (MSG_NOTE, vect_location,
-			 "vect_model_store_cost: inside_cost = %d, "
-			 "prologue_cost = %d .\n",
-			 inside_cost, prologue_cost);
+      if (costing_p)
+	{
+	  if (n_adjacent_stores > 0)
+	    vect_get_store_cost (vinfo, stmt_info, n_adjacent_stores,
+				 alignment_support_scheme, misalignment,
+				 &inside_cost, cost_vec);
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "vect_model_store_cost: inside_cost = %d, "
+			     "prologue_cost = %d .\n",
+			     inside_cost, prologue_cost);
+	}
 
       return true;
     }
@@ -9235,6 +9249,9 @@ vectorizable_store (vec_info *vinfo,
 	      || memory_access_type == VMAT_CONTIGUOUS_REVERSE);
 
   unsigned inside_cost = 0, prologue_cost = 0;
+  /* For costing some adjacent vector stores, we'd like to cost with
+     the total number of them once instead of cost each one by one. */
+  unsigned int n_adjacent_stores = 0;
   auto_vec<tree> result_chain (group_size);
   auto_vec<tree, 1> vec_oprnds;
   for (j = 0; j < ncopies; j++)
@@ -9396,9 +9413,7 @@ vectorizable_store (vec_info *vinfo,
 
 	  if (costing_p)
 	    {
-	      vect_get_store_cost (vinfo, stmt_info, 1,
-				   alignment_support_scheme, misalignment,
-				   &inside_cost, cost_vec);
+	      n_adjacent_stores++;
 
 	      if (!slp)
 		{
@@ -9568,6 +9583,11 @@ vectorizable_store (vec_info *vinfo,
 
   if (costing_p)
     {
+      if (n_adjacent_stores > 0)
+	vect_get_store_cost (vinfo, stmt_info, n_adjacent_stores,
+			     alignment_support_scheme, misalignment,
+			     &inside_cost, cost_vec);
+
       /* When vectorizing a store into the function result assign
 	 a penalty if the function returns in a multi-register location.
 	 In this case we assume we'll end up with having to spill the
@@ -10275,6 +10295,9 @@ vectorizable_load (vec_info *vinfo,
       unsigned HOST_WIDE_INT
 	elsz = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
       unsigned int n_groups = 0;
+      /* For costing some adjacent vector loads, we'd like to cost with
+	 the total number of them once instead of cost each one by one. */
+      unsigned int n_adjacent_loads = 0;
       for (j = 0; j < ncopies; j++)
 	{
 	  if (nloads > 1 && !costing_p)
@@ -10288,10 +10311,7 @@ vectorizable_load (vec_info *vinfo,
 		     avoid ICE, see PR110776.  */
 		  if (VECTOR_TYPE_P (ltype)
 		      && memory_access_type != VMAT_ELEMENTWISE)
-		    vect_get_load_cost (vinfo, stmt_info, 1,
-					alignment_support_scheme, misalignment,
-					false, &inside_cost, nullptr, cost_vec,
-					cost_vec, true);
+		    n_adjacent_loads++;
 		  else
 		    inside_cost += record_stmt_cost (cost_vec, 1, scalar_load,
 						     stmt_info, 0, vect_body);
@@ -10385,11 +10405,19 @@ vectorizable_load (vec_info *vinfo,
 					  false, &n_perms);
 	}
 
-      if (costing_p && dump_enabled_p ())
-	dump_printf_loc (MSG_NOTE, vect_location,
-			 "vect_model_load_cost: inside_cost = %u, "
-			 "prologue_cost = 0 .\n",
-			 inside_cost);
+      if (costing_p)
+	{
+	  if (n_adjacent_loads > 0)
+	    vect_get_load_cost (vinfo, stmt_info, n_adjacent_loads,
+				alignment_support_scheme, misalignment, false,
+				&inside_cost, nullptr, cost_vec, cost_vec,
+				true);
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "vect_model_load_cost: inside_cost = %u, "
+			     "prologue_cost = 0 .\n",
+			     inside_cost);
+	}
 
       return true;
     }
@@ -10694,6 +10722,9 @@ vectorizable_load (vec_info *vinfo,
       gcc_assert (grouped_load && !slp);
 
       unsigned int inside_cost = 0, prologue_cost = 0;
+      /* For costing some adjacent vector loads, we'd like to cost with
+	 the total number of them once instead of cost each one by one. */
+      unsigned int n_adjacent_loads = 0;
       for (j = 0; j < ncopies; j++)
 	{
 	  if (costing_p)
@@ -10725,9 +10756,7 @@ vectorizable_load (vec_info *vinfo,
 					  true);
 		    }
 		}
-	      vect_get_load_cost (vinfo, stmt_info, 1, alignment_support_scheme,
-				  misalignment, false, &inside_cost,
-				  &prologue_cost, cost_vec, cost_vec, true);
+	      n_adjacent_loads++;
 	      continue;
 	    }
 
@@ -10829,11 +10858,19 @@ vectorizable_load (vec_info *vinfo,
 	  *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
 	}
 
-      if (costing_p && dump_enabled_p ())
-	dump_printf_loc (MSG_NOTE, vect_location,
-			 "vect_model_load_cost: inside_cost = %u, "
-			 "prologue_cost = %u .\n",
-			 inside_cost, prologue_cost);
+      if (costing_p)
+	{
+	  if (n_adjacent_loads > 0)
+	    vect_get_load_cost (vinfo, stmt_info, n_adjacent_loads,
+				alignment_support_scheme, misalignment, false,
+				&inside_cost, &prologue_cost, cost_vec,
+				cost_vec, true);
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "vect_model_load_cost: inside_cost = %u, "
+			     "prologue_cost = %u .\n",
+			     inside_cost, prologue_cost);
+	}
 
       return true;
     }
@@ -11177,6 +11214,9 @@ vectorizable_load (vec_info *vinfo,
 
   poly_uint64 group_elt = 0;
   unsigned int inside_cost = 0, prologue_cost = 0;
+  /* For costing some adjacent vector loads, we'd like to cost with
+     the total number of them once instead of cost each one by one. */
+  unsigned int n_adjacent_loads = 0;
   for (j = 0; j < ncopies; j++)
     {
       /* 1. Create the vector or array pointer update chain.  */
@@ -11571,10 +11611,18 @@ vectorizable_load (vec_info *vinfo,
 		  || memory_access_type == VMAT_CONTIGUOUS_REVERSE
 		  || (memory_access_type == VMAT_CONTIGUOUS_PERMUTE
 		      && (!grouped_load || first_stmt_info_p)))
-		vect_get_load_cost (vinfo, stmt_info, 1,
-				    alignment_support_scheme, misalignment,
-				    add_realign_cost, &inside_cost,
-				    &prologue_cost, cost_vec, cost_vec, true);
+		{
+		  /* Leave realign cases alone to keep them simple.  */
+		  if (alignment_support_scheme == dr_explicit_realign_optimized
+		      || alignment_support_scheme == dr_explicit_realign)
+		    vect_get_load_cost (vinfo, stmt_info, 1,
+					alignment_support_scheme, misalignment,
+					add_realign_cost, &inside_cost,
+					&prologue_cost, cost_vec, cost_vec,
+					true);
+		  else
+		    n_adjacent_loads++;
+		}
 	    }
 	  else
 	    {
@@ -11745,6 +11793,11 @@ vectorizable_load (vec_info *vinfo,
       gcc_assert (memory_access_type == VMAT_CONTIGUOUS
 		  || memory_access_type == VMAT_CONTIGUOUS_REVERSE
 		  || memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
+      if (n_adjacent_loads > 0)
+	vect_get_load_cost (vinfo, stmt_info, n_adjacent_loads,
+			    alignment_support_scheme, misalignment, false,
+			    &inside_cost, &prologue_cost, cost_vec, cost_vec,
+			    true);
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_NOTE, vect_location,
 			 "vect_model_load_cost: inside_cost = %u, "