From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <panli@sourceware.org>
Received: by sourceware.org (Postfix, from userid 7924)
	id 699813858D38; Tue, 17 Oct 2023 01:06:26 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 699813858D38
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1697504786;
	bh=plNtK8YX9GJpjg6GBH0Y+DErL0ezje8HnJLisj4muzM=;
	h=From:To:Subject:Date:From;
	b=XMElRZeN1qb70bghnOAy+r8LwcXiPvtcttxPPOQ/ky6wAAMj1qVbyMuLdF7levL1a
	 epySwKVUbCD6BFX6E7havf0j1LutNWKz/55KkEYPFv7vXd/hp/bDZ5Xa/QCsaI5ycB
	 Y5Jw8OQlp14eA7xDlJyNnYXSXS1bcVZyY5xScIy0=
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Content-Type: text/plain; charset="utf-8"
From: Pan Li <panli@gcc.gnu.org>
To: gcc-cvs@gcc.gnu.org
Subject: [gcc r14-4673] RISC-V: Fix unexpected big LMUL choosing in dynamic
 LMUL model for non-adjacent load/store
X-Act-Checkin: gcc
X-Git-Author: Juzhe-Zhong <juzhe.zhong@rivai.ai>
X-Git-Refname: refs/heads/master
X-Git-Oldrev: ef8f7e3f973d90fdffaf4b8e720db484ce63df6f
X-Git-Newrev: b25b43caf2503a160a928d61318dff837efabd97
Message-Id: <20231017010626.699813858D38@sourceware.org>
Date: Tue, 17 Oct 2023 01:06:26 +0000 (GMT)
List-Id: <gcc-cvs.sourceware.org>

https://gcc.gnu.org/g:b25b43caf2503a160a928d61318dff837efabd97

commit r14-4673-gb25b43caf2503a160a928d61318dff837efabd97
Author: Juzhe-Zhong <juzhe.zhong@rivai.ai>
Date:   Tue Oct 17 07:20:38 2023 +0800

    RISC-V: Fix unexpected big LMUL choosing in dynamic LMUL model for non-adjacent load/store
    
    Consider this following case:
    int
    bar (int *x, int a, int b, int n)
    {
      x = __builtin_assume_aligned (x, __BIGGEST_ALIGNMENT__);
      int sum1 = 0;
      int sum2 = 0;
      for (int i = 0; i < n; ++i)
        {
          sum1 += x[2*i] - a;
          sum1 += x[2*i+1] * b;
          sum2 += x[2*i] - b;
          sum2 += x[2*i+1] * a;
        }
      return sum1 + sum2;
    }
    
    Before this patch:
    
    bar:
            ble     a3,zero,.L5
            csrr    t0,vlenb
            csrr    a6,vlenb
            slli    t1,t0,3
            vsetvli a5,zero,e32,m4,ta,ma
            sub     sp,sp,t1
            vid.v   v20
            vmv.v.x v12,a1
            vand.vi v4,v20,1
            vmv.v.x v16,a2
            vmseq.vi        v4,v4,1
            slli    t3,a6,2
            vsetvli zero,a5,e32,m4,ta,ma
            vmv1r.v v0,v4
            viota.m v8,v4
            add     a7,t3,sp
            vsetvli a5,zero,e32,m4,ta,mu
            vand.vi v28,v20,-2
            vadd.vi v4,v28,1
            vs4r.v  v20,0(a7)                        -----  spill
            vrgather.vv     v24,v12,v8
            vrgather.vv     v20,v16,v8
            vrgather.vv     v24,v16,v8,v0.t
            vrgather.vv     v20,v12,v8,v0.t
            vs4r.v  v4,0(sp)                          ----- spill
            slli    a3,a3,1
            addi    t4,a6,-1
            neg     t1,a6
            vmv4r.v v0,v20
            vmv.v.i v4,0
            j       .L4
    .L13:
            vsetvli a5,zero,e32,m4,ta,ma
    .L4:
            mv      a7,a3
            mv      a4,a3
            bleu    a3,a6,.L3
            csrr    a4,vlenb
    .L3:
            vmv.v.x v8,t4
            vl4re32.v       v12,0(sp)                ---- spill
            vand.vv v20,v28,v8
            vand.vv v8,v12,v8
            vsetvli zero,a4,e32,m4,ta,ma
            vle32.v v16,0(a0)
            vsetvli a5,zero,e32,m4,ta,ma
            add     a3,a3,t1
            vrgather.vv     v12,v16,v20
            add     a0,a0,t3
            vrgather.vv     v20,v16,v8
            vsub.vv v12,v12,v0
            vsetvli zero,a4,e32,m4,tu,ma
            vadd.vv v4,v4,v12
            vmacc.vv        v4,v24,v20
            bgtu    a7,a6,.L13
            csrr    a1,vlenb
            slli    a1,a1,2
            add     a1,a1,sp
            li      a4,-1
            csrr    t0,vlenb
            vsetvli a5,zero,e32,m4,ta,ma
            vl4re32.v       v12,0(a1)               ---- spill
            vmv.v.i v8,0
            vmul.vx v0,v12,a4
            li      a2,0
            slli    t1,t0,3
            vadd.vi v0,v0,-1
            vand.vi v0,v0,1
            vmseq.vv        v0,v0,v8
            vand.vi v12,v12,1
            vmerge.vvm      v16,v8,v4,v0
            vmseq.vv        v12,v12,v8
            vmv.s.x v1,a2
            vmv1r.v v0,v12
            vredsum.vs      v16,v16,v1
            vmerge.vvm      v8,v8,v4,v0
            vmv.x.s a0,v16
            vredsum.vs      v8,v8,v1
            vmv.x.s a5,v8
            add     sp,sp,t1
            addw    a0,a0,a5
            jr      ra
    .L5:
            li      a0,0
            ret
    
    We can there are multiple horrible register spillings.
    The root cause of this issue is for a scalar IR load:
    
    _5 = *_4;
    
    We didn't check whether it is a continguous load/store or gather/scatter load/store
    
    Since it will be translate into:
    
                       1. MASK_LEN_GATHER_LOAD (..., perm indice).
                       2. Continguous load/store + VEC_PERM (..., perm indice)
    
    It's obvious that no matter which situation, we will end up with consuming one vector register group (perm indice)
    that we didn't count it before.
    
    So this case we pick LMUL = 4 which is incorrect choice for dynamic LMUL cost model.
    
    The key of this patch is:
    
              if ((type == load_vec_info_type || type == store_vec_info_type)
                  && !adjacent_dr_p (STMT_VINFO_DATA_REF (stmt_info)))
                {
                   ...
                }
    
    Add one more register consumption if it is not an adjacent load/store.
    
    After this patch, it pick LMUL = 2 which is optimal:
    
    bar:
            ble     a3,zero,.L4
            csrr    a6,vlenb
            vsetvli a5,zero,e32,m2,ta,ma
            vmv.v.x v6,a2
            srli    a2,a6,1
            vmv.v.x v4,a1
            vid.v   v12
            slli    a3,a3,1
            vand.vi v0,v12,1
            addi    t1,a2,-1
            vmseq.vi        v0,v0,1
            slli    a6,a6,1
            vsetvli zero,a5,e32,m2,ta,ma
            neg     a7,a2
            viota.m v2,v0
            vsetvli a5,zero,e32,m2,ta,mu
            vrgather.vv     v16,v4,v2
            vrgather.vv     v14,v6,v2
            vrgather.vv     v16,v6,v2,v0.t
            vrgather.vv     v14,v4,v2,v0.t
            vand.vi v18,v12,-2
            vmv.v.i v2,0
            vadd.vi v20,v18,1
    .L3:
            minu    a4,a3,a2
            vsetvli zero,a4,e32,m2,ta,ma
            vle32.v v8,0(a0)
            vsetvli a5,zero,e32,m2,ta,ma
            vmv.v.x v4,t1
            vand.vv v10,v18,v4
            vrgather.vv     v6,v8,v10
            vsub.vv v6,v6,v14
            vsetvli zero,a4,e32,m2,tu,ma
            vadd.vv v2,v2,v6
            vsetvli a1,zero,e32,m2,ta,ma
            vand.vv v4,v20,v4
            vrgather.vv     v6,v8,v4
            vsetvli zero,a4,e32,m2,tu,ma
            mv      a4,a3
            add     a0,a0,a6
            add     a3,a3,a7
            vmacc.vv        v2,v16,v6
            bgtu    a4,a2,.L3
            vsetvli a1,zero,e32,m2,ta,ma
            vand.vi v0,v12,1
            vmv.v.i v4,0
            li      a3,-1
            vmseq.vv        v0,v0,v4
            vmv.s.x v1,zero
            vmerge.vvm      v6,v4,v2,v0
            vredsum.vs      v6,v6,v1
            vmul.vx v0,v12,a3
            vadd.vi v0,v0,-1
            vand.vi v0,v0,1
            vmv.x.s a4,v6
            vmseq.vv        v0,v0,v4
            vmv.s.x v1,zero
            vmerge.vvm      v4,v4,v2,v0
            vredsum.vs      v4,v4,v1
            vmv.x.s a0,v4
            addw    a0,a0,a4
            ret
    .L4:
            li      a0,0
            ret
    
    No spillings.
    
    gcc/ChangeLog:
    
            * config/riscv/riscv-vector-costs.cc (max_number_of_live_regs): Fix big LMUL issue.
            (get_store_value): New function.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c: New test.

Diff:
---
 gcc/config/riscv/riscv-vector-costs.cc             | 93 +++++++++++++++++++---
 .../vect/costmodel/riscv/rvv/dynamic-lmul2-7.c     | 25 ++++++
 2 files changed, 109 insertions(+), 9 deletions(-)

diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index 0b8903965354..33061efb1d0e 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -40,6 +40,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "bitmap.h"
 #include "ssa.h"
 #include "backend.h"
+#include "tree-data-ref.h"
 
 /* This file should be included last.  */
 #include "riscv-vector-costs.h"
@@ -135,8 +136,9 @@ compute_local_program_points (
 		    || is_gimple_call (gsi_stmt (si))))
 		continue;
 	      stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
-	      if (STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info))
-		  != undef_vec_info_type)
+	      enum stmt_vec_info_type type
+		= STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
+	      if (type != undef_vec_info_type)
 		{
 		  stmt_point info = {point, gsi_stmt (si)};
 		  program_points.safe_push (info);
@@ -289,9 +291,7 @@ max_number_of_live_regs (const basic_block bb,
   unsigned int i;
   unsigned int live_point = 0;
   auto_vec<unsigned int> live_vars_vec;
-  live_vars_vec.safe_grow (max_point + 1, true);
-  for (i = 0; i < live_vars_vec.length (); ++i)
-    live_vars_vec[i] = 0;
+  live_vars_vec.safe_grow_cleared (max_point + 1, true);
   for (hash_map<tree, pair>::iterator iter = live_ranges.begin ();
        iter != live_ranges.end (); ++iter)
     {
@@ -360,6 +360,31 @@ get_current_lmul (class loop *loop)
   return loop_autovec_infos.get (loop)->current_lmul;
 }
 
+/* Get STORE value.  */
+static tree
+get_store_value (gimple *stmt)
+{
+  if (is_gimple_call (stmt) && gimple_call_internal_p (stmt))
+    {
+      if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
+	return gimple_call_arg (stmt, 3);
+      else
+	gcc_unreachable ();
+    }
+  else
+    return gimple_assign_rhs1 (stmt);
+}
+
+/* Return true if it is non-contiguous load/store.  */
+static bool
+non_contiguous_memory_access_p (stmt_vec_info stmt_info)
+{
+  enum stmt_vec_info_type type
+    = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
+  return ((type == load_vec_info_type || type == store_vec_info_type)
+	  && !adjacent_dr_p (STMT_VINFO_DATA_REF (stmt_info)));
+}
+
 /* Update the live ranges according PHI.
 
    Loop:
@@ -395,13 +420,15 @@ update_local_live_ranges (
   unsigned int nbbs = loop->num_nodes;
   unsigned int i, j;
   gphi_iterator psi;
+  gimple_stmt_iterator si;
   for (i = 0; i < nbbs; i++)
     {
       basic_block bb = bbs[i];
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_NOTE, vect_location,
-			 "Update local program points for bb %d:\n", bb->index);
-      for (psi = gsi_start_phis (bbs[i]); !gsi_end_p (psi); gsi_next (&psi))
+			 "Update local program points for bb %d:\n",
+			 bbs[i]->index);
+      for (psi = gsi_start_phis (bb); !gsi_end_p (psi); gsi_next (&psi))
 	{
 	  gphi *phi = psi.phi ();
 	  stmt_vec_info stmt_info = vinfo->lookup_stmt (phi);
@@ -413,12 +440,23 @@ update_local_live_ranges (
 	    {
 	      edge e = gimple_phi_arg_edge (phi, j);
 	      tree def = gimple_phi_arg_def (phi, j);
-	      auto *live_ranges = live_ranges_per_bb.get (e->src);
+	      auto *live_ranges = live_ranges_per_bb.get (bb);
+	      auto *live_range = live_ranges->get (def);
+	      if (live_range && flow_bb_inside_loop_p (loop, e->src))
+		{
+		  unsigned int start = (*live_range).first;
+		  (*live_range).first = 0;
+		  if (dump_enabled_p ())
+		    dump_printf_loc (MSG_NOTE, vect_location,
+				     "Update %T start point from %d to %d:\n",
+				     def, start, (*live_range).first);
+		}
+	      live_ranges = live_ranges_per_bb.get (e->src);
 	      if (!program_points_per_bb.get (e->src))
 		continue;
 	      unsigned int max_point
 		= (*program_points_per_bb.get (e->src)).length () - 1;
-	      auto *live_range = live_ranges->get (def);
+	      live_range = live_ranges->get (def);
 	      if (!live_range)
 		continue;
 
@@ -430,6 +468,43 @@ update_local_live_ranges (
 				 end, (*live_range).second);
 	    }
 	}
+      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
+	{
+	  if (!(is_gimple_assign (gsi_stmt (si))
+		|| is_gimple_call (gsi_stmt (si))))
+	    continue;
+	  stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
+	  enum stmt_vec_info_type type
+	    = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
+	  if (non_contiguous_memory_access_p (stmt_info))
+	    {
+	      /* For non-adjacent load/store STMT, we will potentially
+		 convert it into:
+
+		   1. MASK_LEN_GATHER_LOAD (..., perm indice).
+		   2. Continguous load/store + VEC_PERM (..., perm indice)
+
+		We will be likely using one more vector variable.  */
+	      unsigned int max_point
+		= (*program_points_per_bb.get (bb)).length () - 1;
+	      auto *live_ranges = live_ranges_per_bb.get (bb);
+	      bool existed_p = false;
+	      tree var = type == load_vec_info_type
+			   ? gimple_get_lhs (gsi_stmt (si))
+			   : get_store_value (gsi_stmt (si));
+	      tree sel_type = build_nonstandard_integer_type (
+		TYPE_PRECISION (TREE_TYPE (var)), 1);
+	      tree sel = build_decl (UNKNOWN_LOCATION, VAR_DECL,
+				     get_identifier ("vect_perm"), sel_type);
+	      pair &live_range = live_ranges->get_or_insert (sel, &existed_p);
+	      gcc_assert (!existed_p);
+	      live_range = pair (0, max_point);
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_NOTE, vect_location,
+				 "Add perm indice %T, start = 0, end = %d\n",
+				 sel, max_point);
+	    }
+	}
     }
 }
 
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c
new file mode 100644
index 000000000000..3dfc6f16a250
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -fdump-tree-vect-details" } */
+
+int
+bar (int *x, int a, int b, int n)
+{
+  x = __builtin_assume_aligned (x, __BIGGEST_ALIGNMENT__);
+  int sum1 = 0;
+  int sum2 = 0;
+  for (int i = 0; i < n; ++i)
+    {
+      sum1 += x[2*i] - a;
+      sum1 += x[2*i+1] * b;
+      sum2 += x[2*i] - b;
+      sum2 += x[2*i+1] * a;
+    }
+  return sum1 + sum2;
+}
+
+/* { dg-final { scan-assembler {e32,m2} } } */
+/* { dg-final { scan-assembler-times {csrr} 1 } } */
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 8" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 4" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 2" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-not "Maximum lmul = 1" "vect" } } */