public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc(refs/vendors/riscv/heads/gcc-13-with-riscv-opts)] RISC-V: Optimize vsetvli of LCM INSERTED edge for user vsetvli [PR 109743]
@ 2023-07-14  2:35 Jeff Law
  0 siblings, 0 replies; 2+ messages in thread
From: Jeff Law @ 2023-07-14  2:35 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:0a0c0e58ec030102a2679ab5115ef5e936c7701c

commit 0a0c0e58ec030102a2679ab5115ef5e936c7701c
Author: Kito Cheng <kito.cheng@sifive.com>
Date:   Fri May 12 10:26:06 2023 +0800

    RISC-V: Optimize vsetvli of LCM INSERTED edge for user vsetvli [PR 109743]
    
    Rebase to trunk and send V3 patch for:
    https://gcc.gnu.org/pipermail/gcc-patches/2023-May/617821.html
    
    This patch is fixing: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109743.
    
    This issue happens is because we are currently very conservative in optimization of user vsetvli.
    
    Consider this following case:
    
    bb 1:
      vsetvli a5,a4... (demand AVL = a4).
    bb 2:
      RVV insn use a5 (demand AVL = a5).
    
    LCM will hoist vsetvl of bb 2 into bb 1.
    We don't do AVL propagation for this situation since it's complicated that
    we should analyze the code sequence between vsetvli in bb 1 and RVV insn in bb 2.
    They are not necessary the consecutive blocks.
    
    This patch is doing the optimizations after LCM, we will check and eliminate the vsetvli
    in LCM inserted edge if such vsetvli is redundant. Such approach is much simplier and safe.
    
    code:
    void
    foo2 (int32_t *a, int32_t *b, int n)
    {
      if (n <= 0)
          return;
      int i = n;
      size_t vl = __riscv_vsetvl_e32m1 (i);
    
      for (; i >= 0; i--)
      {
        vint32m1_t v = __riscv_vle32_v_i32m1 (a, vl);
        __riscv_vse32_v_i32m1 (b, v, vl);
    
        if (i >= vl)
          continue;
    
        if (i == 0)
          return;
    
        vl = __riscv_vsetvl_e32m1 (i);
      }
    }
    
    Before this patch:
    foo2:
    .LFB2:
            .cfi_startproc
            ble     a2,zero,.L1
            mv      a4,a2
            li      a3,-1
            vsetvli a5,a2,e32,m1,ta,mu
            vsetvli zero,a5,e32,m1,ta,ma  <- can be eliminated.
    .L5:
            vle32.v v1,0(a0)
            vse32.v v1,0(a1)
            bgeu    a4,a5,.L3
    .L10:
            beq     a2,zero,.L1
            vsetvli a5,a4,e32,m1,ta,mu
            addi    a4,a4,-1
            vsetvli zero,a5,e32,m1,ta,ma  <- can be eliminated.
            vle32.v v1,0(a0)
            vse32.v v1,0(a1)
            addiw   a2,a2,-1
            bltu    a4,a5,.L10
    .L3:
            addiw   a2,a2,-1
            addi    a4,a4,-1
            bne     a2,a3,.L5
    .L1:
            ret
    
    After this patch:
    f:
            ble     a2,zero,.L1
            mv      a4,a2
            li      a3,-1
            vsetvli a5,a2,e32,m1,ta,ma
    .L5:
            vle32.v v1,0(a0)
            vse32.v v1,0(a1)
            bgeu    a4,a5,.L3
    .L10:
            beq     a2,zero,.L1
            vsetvli a5,a4,e32,m1,ta,ma
            addi    a4,a4,-1
            vle32.v v1,0(a0)
            vse32.v v1,0(a1)
            addiw   a2,a2,-1
            bltu    a4,a5,.L10
    .L3:
            addiw   a2,a2,-1
            addi    a4,a4,-1
            bne     a2,a3,.L5
    .L1:
            ret
    
            PR target/109743
    
    gcc/ChangeLog:
    
            * config/riscv/riscv-vsetvl.cc (pass_vsetvl::get_vsetvl_at_end): New.
            (local_avl_compatible_p): New.
            (pass_vsetvl::local_eliminate_vsetvl_insn): Enhance local optimizations
            for LCM, rewrite as a backward algorithm.
            (pass_vsetvl::cleanup_insns): Use new local_eliminate_vsetvl_insn
            interface, handle a BB at once.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/riscv/rvv/vsetvl/pr109743-1.c: New test.
            * gcc.target/riscv/rvv/vsetvl/pr109743-2.c: New test.
            * gcc.target/riscv/rvv/vsetvl/pr109743-3.c: New test.
            * gcc.target/riscv/rvv/vsetvl/pr109743-4.c: New test.
    
    Co-authored-by: Juzhe-Zhong <juzhe.zhong@rivai.ai>

Diff:
---
 gcc/config/riscv/riscv-vsetvl.cc                   | 213 ++++++++++++++++-----
 .../gcc.target/riscv/rvv/vsetvl/pr109743-1.c       |  26 +++
 .../gcc.target/riscv/rvv/vsetvl/pr109743-2.c       |  27 +++
 .../gcc.target/riscv/rvv/vsetvl/pr109743-3.c       |  28 +++
 .../gcc.target/riscv/rvv/vsetvl/pr109743-4.c       |  28 +++
 5 files changed, 277 insertions(+), 45 deletions(-)

diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc
index 5530040bd73..55b5aac3c27 100644
--- a/gcc/config/riscv/riscv-vsetvl.cc
+++ b/gcc/config/riscv/riscv-vsetvl.cc
@@ -2670,7 +2670,8 @@ private:
   void pre_vsetvl (void);
 
   /* Phase 5.  */
-  void local_eliminate_vsetvl_insn (const vector_insn_info &) const;
+  rtx_insn *get_vsetvl_at_end (const bb_info *, vector_insn_info *) const;
+  void local_eliminate_vsetvl_insn (const bb_info *) const;
   void cleanup_insns (void) const;
 
   /* Phase 6.  */
@@ -4032,6 +4033,60 @@ pass_vsetvl::pre_vsetvl (void)
     commit_edge_insertions ();
 }
 
+/* Some instruction can not be accessed in RTL_SSA when we don't re-init
+   the new RTL_SSA framework but it is definetely at the END of the block.
+
+  Here we optimize the VSETVL is hoisted by LCM:
+
+   Before LCM:
+     bb 1:
+       vsetvli a5,a2,e32,m1,ta,mu
+     bb 2:
+       vsetvli zero,a5,e32,m1,ta,mu
+       ...
+
+   After LCM:
+     bb 1:
+       vsetvli a5,a2,e32,m1,ta,mu
+       LCM INSERTED: vsetvli zero,a5,e32,m1,ta,mu --> eliminate
+     bb 2:
+       ...
+   */
+rtx_insn *
+pass_vsetvl::get_vsetvl_at_end (const bb_info *bb, vector_insn_info *dem) const
+{
+  rtx_insn *end_vsetvl = BB_END (bb->cfg_bb ());
+  if (end_vsetvl && NONDEBUG_INSN_P (end_vsetvl))
+    {
+      if (JUMP_P (end_vsetvl))
+	end_vsetvl = PREV_INSN (end_vsetvl);
+
+      if (NONDEBUG_INSN_P (end_vsetvl)
+	  && vsetvl_discard_result_insn_p (end_vsetvl))
+	{
+	  /* Only handle single succ. here, multiple succ. is much
+	     more complicated.  */
+	  if (single_succ_p (bb->cfg_bb ()))
+	    {
+	      edge e = single_succ_edge (bb->cfg_bb ());
+	      *dem = get_block_info (e->dest).local_dem;
+	      return end_vsetvl;
+	    }
+	}
+    }
+  return nullptr;
+}
+
+/* This predicator should only used within same basic block.  */
+static bool
+local_avl_compatible_p (rtx avl1, rtx avl2)
+{
+  if (!REG_P (avl1) || !REG_P (avl2))
+    return false;
+
+  return REGNO (avl1) == REGNO (avl2);
+}
+
 /* Local user vsetvl optimizaiton:
 
      Case 1:
@@ -4044,45 +4099,122 @@ pass_vsetvl::pre_vsetvl (void)
        ...
        vsetvl zero,a5,e32,mf2 --> Eliminate directly.  */
 void
-pass_vsetvl::local_eliminate_vsetvl_insn (const vector_insn_info &dem) const
-{
-  const insn_info *insn = dem.get_insn ();
-  if (!insn || insn->is_artificial ())
-    return;
-  rtx_insn *rinsn = insn->rtl ();
-  const bb_info *bb = insn->bb ();
-  if (vsetvl_insn_p (rinsn))
+pass_vsetvl::local_eliminate_vsetvl_insn (const bb_info *bb) const
+{
+  rtx_insn *prev_vsetvl = nullptr;
+  rtx_insn *curr_vsetvl = nullptr;
+  rtx vl_placeholder = RVV_VLMAX;
+  rtx prev_avl = vl_placeholder;
+  rtx curr_avl = vl_placeholder;
+  vector_insn_info prev_dem;
+
+  /* Instruction inserted by LCM is not appeared in RTL-SSA yet, try to
+     found those instruciton.   */
+  if (rtx_insn *end_vsetvl = get_vsetvl_at_end (bb, &prev_dem))
     {
-      rtx vl = get_vl (rinsn);
-      for (insn_info *i = insn->next_nondebug_insn ();
-	   real_insn_and_same_bb_p (i, bb); i = i->next_nondebug_insn ())
+      prev_avl = get_avl (end_vsetvl);
+      prev_vsetvl = end_vsetvl;
+    }
+
+  bool skip_one = false;
+  /* Backward propgate vsetvl info, drop the later one (prev_vsetvl) if it's
+     compatible with current vsetvl (curr_avl), and merge the vtype and avl
+     info. into current vsetvl.  */
+  for (insn_info *insn : bb->reverse_real_nondebug_insns ())
+    {
+      rtx_insn *rinsn = insn->rtl ();
+      const auto &curr_dem = get_vector_info (insn);
+      bool need_invalidate = false;
+
+      /* Skip if this insn already handled in last iteration.  */
+      if (skip_one)
+	{
+	  skip_one = false;
+	  continue;
+	}
+
+      if (vsetvl_insn_p (rinsn))
+	{
+	  curr_vsetvl = rinsn;
+	  /* vsetvl are using vl rather than avl since it will try to merge
+	     with other vsetvl_discard_result.
+
+			v--- avl
+	     vsetvl a5,a4,e8,mf8   # vsetvl
+	     ...    ^--- vl
+	     vsetvl zero,a5,e8,mf8 # vsetvl_discard_result
+			 ^--- avl
+	     */
+	  curr_avl = get_vl (rinsn);
+	  /* vsetvl is a cut point of local backward vsetvl elimination.  */
+	  need_invalidate = true;
+	}
+      else if (has_vtype_op (rinsn) && NONDEBUG_INSN_P (PREV_INSN (rinsn))
+	       && (vsetvl_discard_result_insn_p (PREV_INSN (rinsn))
+		   || vsetvl_insn_p (PREV_INSN (rinsn))))
 	{
-	  if (i->is_call () || i->is_asm ()
-	      || find_access (i->defs (), VL_REGNUM)
-	      || find_access (i->defs (), VTYPE_REGNUM))
-	    return;
+	  curr_vsetvl = PREV_INSN (rinsn);
 
-	  if (has_vtype_op (i->rtl ()))
+	  if (vsetvl_insn_p (PREV_INSN (rinsn)))
 	    {
-	      if (!vsetvl_discard_result_insn_p (PREV_INSN (i->rtl ())))
-		return;
-	      rtx avl = get_avl (i->rtl ());
-	      if (avl != vl)
-		return;
-	      set_info *def = find_access (i->uses (), REGNO (avl))->def ();
-	      if (def->insn () != insn)
-		return;
-
-	      vector_insn_info new_info = get_vector_info (i);
-	      if (!new_info.skip_avl_compatible_p (dem))
-		return;
-
-	      new_info.set_avl_info (dem.get_avl_info ());
-	      new_info = dem.merge (new_info, LOCAL_MERGE);
-	      change_vsetvl_insn (insn, new_info);
-	      eliminate_insn (PREV_INSN (i->rtl ()));
-	      return;
+	      /* Need invalidate and skip if it's vsetvl.  */
+	      need_invalidate = true;
+	      /* vsetvl_discard_result_insn_p won't appeared in RTL-SSA,
+	       * so only need to skip for vsetvl.  */
+	      skip_one = true;
 	    }
+
+	  curr_avl = get_avl (rinsn);
+
+	  /* Some instrucion like pred_extract_first<mode> don't reqruie avl, so
+	     the avl is null, use vl_placeholder for unify the handling
+	     logic. */
+	  if (!curr_avl)
+	    curr_avl = vl_placeholder;
+	}
+      else if (insn->is_call () || insn->is_asm ()
+	       || find_access (insn->defs (), VL_REGNUM)
+	       || find_access (insn->defs (), VTYPE_REGNUM)
+	       || (REG_P (prev_avl)
+		   && find_access (insn->defs (), REGNO (prev_avl))))
+	{
+	  /* Invalidate if this insn can't propagate vl, vtype or avl.  */
+	  need_invalidate = true;
+	  prev_dem = vector_insn_info ();
+	}
+      else
+	/* Not interested instruction.  */
+	continue;
+
+      /* Local AVL compatibility checking is simpler than global, we only
+	 need to check the REGNO is same.  */
+      if (prev_dem.valid_p () && prev_dem.skip_avl_compatible_p (curr_dem)
+	  && local_avl_compatible_p (prev_avl, curr_avl))
+	{
+	  /* curr_dem and prev_dem is compatible!  */
+	  /* Update avl info since we need to make sure they are fully
+	     compatible before merge.  */
+	  prev_dem.set_avl_info (curr_dem.get_avl_info ());
+	  /* Merge both and update into curr_vsetvl.  */
+	  prev_dem = curr_dem.merge (prev_dem, LOCAL_MERGE);
+	  change_vsetvl_insn (curr_dem.get_insn (), prev_dem);
+	  /* Then we can drop prev_vsetvl.  */
+	  eliminate_insn (prev_vsetvl);
+	}
+
+      if (need_invalidate)
+	{
+	  prev_vsetvl = nullptr;
+	  curr_vsetvl = nullptr;
+	  prev_avl = vl_placeholder;
+	  curr_avl = vl_placeholder;
+	  prev_dem = vector_insn_info ();
+	}
+      else
+	{
+	  prev_vsetvl = curr_vsetvl;
+	  prev_avl = curr_avl;
+	  prev_dem = curr_dem;
 	}
     }
 }
@@ -4107,19 +4239,10 @@ pass_vsetvl::cleanup_insns (void) const
 {
   for (const bb_info *bb : crtl->ssa->bbs ())
     {
+      local_eliminate_vsetvl_insn (bb);
       for (insn_info *insn : bb->real_nondebug_insns ())
 	{
 	  rtx_insn *rinsn = insn->rtl ();
-	  const auto &dem = get_vector_info (insn);
-	  /* Eliminate local vsetvl:
-	       bb 0:
-	       vsetvl a5,a6,...
-	       vsetvl zero,a5.
-
-	     Eliminate vsetvl in bb2 when a5 is only coming from
-	     bb 0.  */
-	  local_eliminate_vsetvl_insn (dem);
-
 	  if (vlmax_avl_insn_p (rinsn))
 	    {
 	      eliminate_insn (rinsn);
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-1.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-1.c
new file mode 100644
index 00000000000..f30275c8280
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-1.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32 -fno-tree-vectorize -fno-schedule-insns -fno-schedule-insns2" } */
+
+#include "riscv_vector.h"
+
+void f (int32_t * a, int32_t * b, int n)
+{
+    if (n <= 0)
+      return;
+    int i = n;
+    size_t vl = __riscv_vsetvl_e32m1 (i);
+    for (; i >= 0; i--)
+      {
+        vint32m1_t v = __riscv_vle32_v_i32m1 (a, vl);
+        __riscv_vse32_v_i32m1 (b, v, vl);
+
+        if (i >= vl)
+          continue;
+        if (i == 0)
+          return;
+        vl = __riscv_vsetvl_e32m1 (i);
+      }
+}
+
+/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*[a-x0-9]+,\s*e32,\s*m1,\s*t[au],\s*m[au]} 2 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli} 2 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-2.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-2.c
new file mode 100644
index 00000000000..5f6647bb916
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-2.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32 -fno-tree-vectorize -fno-schedule-insns -fno-schedule-insns2" } */
+
+#include "riscv_vector.h"
+
+void f (int32_t * a, int32_t * b, int n)
+{
+    if (n <= 0)
+      return;
+    int i = n;
+    size_t vl = __riscv_vsetvl_e8mf4 (i);
+    for (; i >= 0; i--)
+      {
+        vint32m1_t v = __riscv_vle32_v_i32m1 (a, vl);
+        __riscv_vse32_v_i32m1 (b, v, vl);
+
+        if (i >= vl)
+          continue;
+        if (i == 0)
+          return;
+        vl = __riscv_vsetvl_e32m1 (i);
+      }
+}
+
+
+/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*[a-x0-9]+,\s*e32,\s*m1,\s*t[au],\s*m[au]} 2 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli} 2 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-3.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-3.c
new file mode 100644
index 00000000000..5dbc871ed12
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-3.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32 -fno-tree-vectorize -fno-schedule-insns -fno-schedule-insns2" } */
+
+#include "riscv_vector.h"
+
+void f (int32_t * a, int32_t * b, int n)
+{
+    if (n <= 0)
+      return;
+    int i = n;
+    size_t vl = __riscv_vsetvl_e8mf2 (i);
+    for (; i >= 0; i--)
+      {
+        vint32m1_t v = __riscv_vle32_v_i32m1 (a, vl);
+        __riscv_vse32_v_i32m1 (b, v, vl);
+
+        if (i >= vl)
+          continue;
+        if (i == 0)
+          return;
+        vl = __riscv_vsetvl_e32m1 (i);
+      }
+}
+
+/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*[a-x0-9]+,\s*e8,\s*mf2,\s*t[au],\s*m[au]} 1 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*[a-x0-9]+,\s*e32,\s*m1,\s*t[au],\s*m[au]} 1 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e32,\s*m1,\s*t[au],\s*m[au]} 1 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli} 3 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-4.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-4.c
new file mode 100644
index 00000000000..edd12855f58
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-4.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32 -fno-tree-vectorize -fno-schedule-insns -fno-schedule-insns2" } */
+
+#include "riscv_vector.h"
+
+void
+f (int32_t *a, int32_t *b, int n)
+{
+  if (n <= 0)
+    return;
+  int i = n;
+  size_t vl = __riscv_vsetvl_e8mf4 (i);
+  for (; i >= 0; i--)
+    {
+      vint32m1_t v = __riscv_vle32_v_i32m1 (a + i, vl);
+      v = __riscv_vle32_v_i32m1_tu (v, a + i + 100, vl);
+      __riscv_vse32_v_i32m1 (b + i, v, vl);
+
+      if (i >= vl)
+	continue;
+      if (i == 0)
+	return;
+      vl = __riscv_vsetvl_e8mf4 (i);
+    }
+}
+
+/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*[a-x0-9]+,\s*e32,\s*m1,\s*tu,\s*m[au]} 2 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli} 2 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */

^ permalink raw reply	[flat|nested] 2+ messages in thread

* [gcc(refs/vendors/riscv/heads/gcc-13-with-riscv-opts)] RISC-V: Optimize vsetvli of LCM INSERTED edge for user vsetvli [PR 109743]
@ 2023-05-25 23:19 Jeff Law
  0 siblings, 0 replies; 2+ messages in thread
From: Jeff Law @ 2023-05-25 23:19 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:8e8e6f4a0b9519571cb72200b73bc7fb84bf08cf

commit 8e8e6f4a0b9519571cb72200b73bc7fb84bf08cf
Author: Kito Cheng <kito.cheng@sifive.com>
Date:   Fri May 12 10:26:06 2023 +0800

    RISC-V: Optimize vsetvli of LCM INSERTED edge for user vsetvli [PR 109743]
    
    Rebase to trunk and send V3 patch for:
    https://gcc.gnu.org/pipermail/gcc-patches/2023-May/617821.html
    
    This patch is fixing: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109743.
    
    This issue happens is because we are currently very conservative in optimization of user vsetvli.
    
    Consider this following case:
    
    bb 1:
      vsetvli a5,a4... (demand AVL = a4).
    bb 2:
      RVV insn use a5 (demand AVL = a5).
    
    LCM will hoist vsetvl of bb 2 into bb 1.
    We don't do AVL propagation for this situation since it's complicated that
    we should analyze the code sequence between vsetvli in bb 1 and RVV insn in bb 2.
    They are not necessary the consecutive blocks.
    
    This patch is doing the optimizations after LCM, we will check and eliminate the vsetvli
    in LCM inserted edge if such vsetvli is redundant. Such approach is much simplier and safe.
    
    code:
    void
    foo2 (int32_t *a, int32_t *b, int n)
    {
      if (n <= 0)
          return;
      int i = n;
      size_t vl = __riscv_vsetvl_e32m1 (i);
    
      for (; i >= 0; i--)
      {
        vint32m1_t v = __riscv_vle32_v_i32m1 (a, vl);
        __riscv_vse32_v_i32m1 (b, v, vl);
    
        if (i >= vl)
          continue;
    
        if (i == 0)
          return;
    
        vl = __riscv_vsetvl_e32m1 (i);
      }
    }
    
    Before this patch:
    foo2:
    .LFB2:
            .cfi_startproc
            ble     a2,zero,.L1
            mv      a4,a2
            li      a3,-1
            vsetvli a5,a2,e32,m1,ta,mu
            vsetvli zero,a5,e32,m1,ta,ma  <- can be eliminated.
    .L5:
            vle32.v v1,0(a0)
            vse32.v v1,0(a1)
            bgeu    a4,a5,.L3
    .L10:
            beq     a2,zero,.L1
            vsetvli a5,a4,e32,m1,ta,mu
            addi    a4,a4,-1
            vsetvli zero,a5,e32,m1,ta,ma  <- can be eliminated.
            vle32.v v1,0(a0)
            vse32.v v1,0(a1)
            addiw   a2,a2,-1
            bltu    a4,a5,.L10
    .L3:
            addiw   a2,a2,-1
            addi    a4,a4,-1
            bne     a2,a3,.L5
    .L1:
            ret
    
    After this patch:
    f:
            ble     a2,zero,.L1
            mv      a4,a2
            li      a3,-1
            vsetvli a5,a2,e32,m1,ta,ma
    .L5:
            vle32.v v1,0(a0)
            vse32.v v1,0(a1)
            bgeu    a4,a5,.L3
    .L10:
            beq     a2,zero,.L1
            vsetvli a5,a4,e32,m1,ta,ma
            addi    a4,a4,-1
            vle32.v v1,0(a0)
            vse32.v v1,0(a1)
            addiw   a2,a2,-1
            bltu    a4,a5,.L10
    .L3:
            addiw   a2,a2,-1
            addi    a4,a4,-1
            bne     a2,a3,.L5
    .L1:
            ret
    
            PR target/109743
    
    gcc/ChangeLog:
    
            * config/riscv/riscv-vsetvl.cc (pass_vsetvl::get_vsetvl_at_end): New.
            (local_avl_compatible_p): New.
            (pass_vsetvl::local_eliminate_vsetvl_insn): Enhance local optimizations
            for LCM, rewrite as a backward algorithm.
            (pass_vsetvl::cleanup_insns): Use new local_eliminate_vsetvl_insn
            interface, handle a BB at once.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/riscv/rvv/vsetvl/pr109743-1.c: New test.
            * gcc.target/riscv/rvv/vsetvl/pr109743-2.c: New test.
            * gcc.target/riscv/rvv/vsetvl/pr109743-3.c: New test.
            * gcc.target/riscv/rvv/vsetvl/pr109743-4.c: New test.
    
    Co-authored-by: Juzhe-Zhong <juzhe.zhong@rivai.ai>

Diff:
---
 gcc/config/riscv/riscv-vsetvl.cc                   | 213 ++++++++++++++++-----
 .../gcc.target/riscv/rvv/vsetvl/pr109743-1.c       |  26 +++
 .../gcc.target/riscv/rvv/vsetvl/pr109743-2.c       |  27 +++
 .../gcc.target/riscv/rvv/vsetvl/pr109743-3.c       |  28 +++
 .../gcc.target/riscv/rvv/vsetvl/pr109743-4.c       |  28 +++
 5 files changed, 277 insertions(+), 45 deletions(-)

diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc
index 20d43372d33..f1c47e8f9be 100644
--- a/gcc/config/riscv/riscv-vsetvl.cc
+++ b/gcc/config/riscv/riscv-vsetvl.cc
@@ -2667,7 +2667,8 @@ private:
   void pre_vsetvl (void);
 
   /* Phase 5.  */
-  void local_eliminate_vsetvl_insn (const vector_insn_info &) const;
+  rtx_insn *get_vsetvl_at_end (const bb_info *, vector_insn_info *) const;
+  void local_eliminate_vsetvl_insn (const bb_info *) const;
   void cleanup_insns (void) const;
 
   /* Phase 6.  */
@@ -4029,6 +4030,60 @@ pass_vsetvl::pre_vsetvl (void)
     commit_edge_insertions ();
 }
 
+/* Some instruction can not be accessed in RTL_SSA when we don't re-init
+   the new RTL_SSA framework but it is definetely at the END of the block.
+
+  Here we optimize the VSETVL is hoisted by LCM:
+
+   Before LCM:
+     bb 1:
+       vsetvli a5,a2,e32,m1,ta,mu
+     bb 2:
+       vsetvli zero,a5,e32,m1,ta,mu
+       ...
+
+   After LCM:
+     bb 1:
+       vsetvli a5,a2,e32,m1,ta,mu
+       LCM INSERTED: vsetvli zero,a5,e32,m1,ta,mu --> eliminate
+     bb 2:
+       ...
+   */
+rtx_insn *
+pass_vsetvl::get_vsetvl_at_end (const bb_info *bb, vector_insn_info *dem) const
+{
+  rtx_insn *end_vsetvl = BB_END (bb->cfg_bb ());
+  if (end_vsetvl && NONDEBUG_INSN_P (end_vsetvl))
+    {
+      if (JUMP_P (end_vsetvl))
+	end_vsetvl = PREV_INSN (end_vsetvl);
+
+      if (NONDEBUG_INSN_P (end_vsetvl)
+	  && vsetvl_discard_result_insn_p (end_vsetvl))
+	{
+	  /* Only handle single succ. here, multiple succ. is much
+	     more complicated.  */
+	  if (single_succ_p (bb->cfg_bb ()))
+	    {
+	      edge e = single_succ_edge (bb->cfg_bb ());
+	      *dem = get_block_info (e->dest).local_dem;
+	      return end_vsetvl;
+	    }
+	}
+    }
+  return nullptr;
+}
+
+/* This predicator should only used within same basic block.  */
+static bool
+local_avl_compatible_p (rtx avl1, rtx avl2)
+{
+  if (!REG_P (avl1) || !REG_P (avl2))
+    return false;
+
+  return REGNO (avl1) == REGNO (avl2);
+}
+
 /* Local user vsetvl optimizaiton:
 
      Case 1:
@@ -4041,45 +4096,122 @@ pass_vsetvl::pre_vsetvl (void)
        ...
        vsetvl zero,a5,e32,mf2 --> Eliminate directly.  */
 void
-pass_vsetvl::local_eliminate_vsetvl_insn (const vector_insn_info &dem) const
-{
-  const insn_info *insn = dem.get_insn ();
-  if (!insn || insn->is_artificial ())
-    return;
-  rtx_insn *rinsn = insn->rtl ();
-  const bb_info *bb = insn->bb ();
-  if (vsetvl_insn_p (rinsn))
+pass_vsetvl::local_eliminate_vsetvl_insn (const bb_info *bb) const
+{
+  rtx_insn *prev_vsetvl = nullptr;
+  rtx_insn *curr_vsetvl = nullptr;
+  rtx vl_placeholder = RVV_VLMAX;
+  rtx prev_avl = vl_placeholder;
+  rtx curr_avl = vl_placeholder;
+  vector_insn_info prev_dem;
+
+  /* Instruction inserted by LCM is not appeared in RTL-SSA yet, try to
+     found those instruciton.   */
+  if (rtx_insn *end_vsetvl = get_vsetvl_at_end (bb, &prev_dem))
     {
-      rtx vl = get_vl (rinsn);
-      for (insn_info *i = insn->next_nondebug_insn ();
-	   real_insn_and_same_bb_p (i, bb); i = i->next_nondebug_insn ())
+      prev_avl = get_avl (end_vsetvl);
+      prev_vsetvl = end_vsetvl;
+    }
+
+  bool skip_one = false;
+  /* Backward propgate vsetvl info, drop the later one (prev_vsetvl) if it's
+     compatible with current vsetvl (curr_avl), and merge the vtype and avl
+     info. into current vsetvl.  */
+  for (insn_info *insn : bb->reverse_real_nondebug_insns ())
+    {
+      rtx_insn *rinsn = insn->rtl ();
+      const auto &curr_dem = get_vector_info (insn);
+      bool need_invalidate = false;
+
+      /* Skip if this insn already handled in last iteration.  */
+      if (skip_one)
+	{
+	  skip_one = false;
+	  continue;
+	}
+
+      if (vsetvl_insn_p (rinsn))
+	{
+	  curr_vsetvl = rinsn;
+	  /* vsetvl are using vl rather than avl since it will try to merge
+	     with other vsetvl_discard_result.
+
+			v--- avl
+	     vsetvl a5,a4,e8,mf8   # vsetvl
+	     ...    ^--- vl
+	     vsetvl zero,a5,e8,mf8 # vsetvl_discard_result
+			 ^--- avl
+	     */
+	  curr_avl = get_vl (rinsn);
+	  /* vsetvl is a cut point of local backward vsetvl elimination.  */
+	  need_invalidate = true;
+	}
+      else if (has_vtype_op (rinsn) && NONDEBUG_INSN_P (PREV_INSN (rinsn))
+	       && (vsetvl_discard_result_insn_p (PREV_INSN (rinsn))
+		   || vsetvl_insn_p (PREV_INSN (rinsn))))
 	{
-	  if (i->is_call () || i->is_asm ()
-	      || find_access (i->defs (), VL_REGNUM)
-	      || find_access (i->defs (), VTYPE_REGNUM))
-	    return;
+	  curr_vsetvl = PREV_INSN (rinsn);
 
-	  if (has_vtype_op (i->rtl ()))
+	  if (vsetvl_insn_p (PREV_INSN (rinsn)))
 	    {
-	      if (!vsetvl_discard_result_insn_p (PREV_INSN (i->rtl ())))
-		return;
-	      rtx avl = get_avl (i->rtl ());
-	      if (avl != vl)
-		return;
-	      set_info *def = find_access (i->uses (), REGNO (avl))->def ();
-	      if (def->insn () != insn)
-		return;
-
-	      vector_insn_info new_info = get_vector_info (i);
-	      if (!new_info.skip_avl_compatible_p (dem))
-		return;
-
-	      new_info.set_avl_info (dem.get_avl_info ());
-	      new_info = dem.merge (new_info, LOCAL_MERGE);
-	      change_vsetvl_insn (insn, new_info);
-	      eliminate_insn (PREV_INSN (i->rtl ()));
-	      return;
+	      /* Need invalidate and skip if it's vsetvl.  */
+	      need_invalidate = true;
+	      /* vsetvl_discard_result_insn_p won't appeared in RTL-SSA,
+	       * so only need to skip for vsetvl.  */
+	      skip_one = true;
 	    }
+
+	  curr_avl = get_avl (rinsn);
+
+	  /* Some instrucion like pred_extract_first<mode> don't reqruie avl, so
+	     the avl is null, use vl_placeholder for unify the handling
+	     logic. */
+	  if (!curr_avl)
+	    curr_avl = vl_placeholder;
+	}
+      else if (insn->is_call () || insn->is_asm ()
+	       || find_access (insn->defs (), VL_REGNUM)
+	       || find_access (insn->defs (), VTYPE_REGNUM)
+	       || (REG_P (prev_avl)
+		   && find_access (insn->defs (), REGNO (prev_avl))))
+	{
+	  /* Invalidate if this insn can't propagate vl, vtype or avl.  */
+	  need_invalidate = true;
+	  prev_dem = vector_insn_info ();
+	}
+      else
+	/* Not interested instruction.  */
+	continue;
+
+      /* Local AVL compatibility checking is simpler than global, we only
+	 need to check the REGNO is same.  */
+      if (prev_dem.valid_p () && prev_dem.skip_avl_compatible_p (curr_dem)
+	  && local_avl_compatible_p (prev_avl, curr_avl))
+	{
+	  /* curr_dem and prev_dem is compatible!  */
+	  /* Update avl info since we need to make sure they are fully
+	     compatible before merge.  */
+	  prev_dem.set_avl_info (curr_dem.get_avl_info ());
+	  /* Merge both and update into curr_vsetvl.  */
+	  prev_dem = curr_dem.merge (prev_dem, LOCAL_MERGE);
+	  change_vsetvl_insn (curr_dem.get_insn (), prev_dem);
+	  /* Then we can drop prev_vsetvl.  */
+	  eliminate_insn (prev_vsetvl);
+	}
+
+      if (need_invalidate)
+	{
+	  prev_vsetvl = nullptr;
+	  curr_vsetvl = nullptr;
+	  prev_avl = vl_placeholder;
+	  curr_avl = vl_placeholder;
+	  prev_dem = vector_insn_info ();
+	}
+      else
+	{
+	  prev_vsetvl = curr_vsetvl;
+	  prev_avl = curr_avl;
+	  prev_dem = curr_dem;
 	}
     }
 }
@@ -4104,19 +4236,10 @@ pass_vsetvl::cleanup_insns (void) const
 {
   for (const bb_info *bb : crtl->ssa->bbs ())
     {
+      local_eliminate_vsetvl_insn (bb);
       for (insn_info *insn : bb->real_nondebug_insns ())
 	{
 	  rtx_insn *rinsn = insn->rtl ();
-	  const auto &dem = get_vector_info (insn);
-	  /* Eliminate local vsetvl:
-	       bb 0:
-	       vsetvl a5,a6,...
-	       vsetvl zero,a5.
-
-	     Eliminate vsetvl in bb2 when a5 is only coming from
-	     bb 0.  */
-	  local_eliminate_vsetvl_insn (dem);
-
 	  if (vlmax_avl_insn_p (rinsn))
 	    {
 	      eliminate_insn (rinsn);
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-1.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-1.c
new file mode 100644
index 00000000000..f30275c8280
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-1.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32 -fno-tree-vectorize -fno-schedule-insns -fno-schedule-insns2" } */
+
+#include "riscv_vector.h"
+
+void f (int32_t * a, int32_t * b, int n)
+{
+    if (n <= 0)
+      return;
+    int i = n;
+    size_t vl = __riscv_vsetvl_e32m1 (i);
+    for (; i >= 0; i--)
+      {
+        vint32m1_t v = __riscv_vle32_v_i32m1 (a, vl);
+        __riscv_vse32_v_i32m1 (b, v, vl);
+
+        if (i >= vl)
+          continue;
+        if (i == 0)
+          return;
+        vl = __riscv_vsetvl_e32m1 (i);
+      }
+}
+
+/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*[a-x0-9]+,\s*e32,\s*m1,\s*t[au],\s*m[au]} 2 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli} 2 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-2.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-2.c
new file mode 100644
index 00000000000..5f6647bb916
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-2.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32 -fno-tree-vectorize -fno-schedule-insns -fno-schedule-insns2" } */
+
+#include "riscv_vector.h"
+
+void f (int32_t * a, int32_t * b, int n)
+{
+    if (n <= 0)
+      return;
+    int i = n;
+    size_t vl = __riscv_vsetvl_e8mf4 (i);
+    for (; i >= 0; i--)
+      {
+        vint32m1_t v = __riscv_vle32_v_i32m1 (a, vl);
+        __riscv_vse32_v_i32m1 (b, v, vl);
+
+        if (i >= vl)
+          continue;
+        if (i == 0)
+          return;
+        vl = __riscv_vsetvl_e32m1 (i);
+      }
+}
+
+
+/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*[a-x0-9]+,\s*e32,\s*m1,\s*t[au],\s*m[au]} 2 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli} 2 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-3.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-3.c
new file mode 100644
index 00000000000..5dbc871ed12
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-3.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32 -fno-tree-vectorize -fno-schedule-insns -fno-schedule-insns2" } */
+
+#include "riscv_vector.h"
+
+void f (int32_t * a, int32_t * b, int n)
+{
+    if (n <= 0)
+      return;
+    int i = n;
+    size_t vl = __riscv_vsetvl_e8mf2 (i);
+    for (; i >= 0; i--)
+      {
+        vint32m1_t v = __riscv_vle32_v_i32m1 (a, vl);
+        __riscv_vse32_v_i32m1 (b, v, vl);
+
+        if (i >= vl)
+          continue;
+        if (i == 0)
+          return;
+        vl = __riscv_vsetvl_e32m1 (i);
+      }
+}
+
+/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*[a-x0-9]+,\s*e8,\s*mf2,\s*t[au],\s*m[au]} 1 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*[a-x0-9]+,\s*e32,\s*m1,\s*t[au],\s*m[au]} 1 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e32,\s*m1,\s*t[au],\s*m[au]} 1 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli} 3 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-4.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-4.c
new file mode 100644
index 00000000000..edd12855f58
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-4.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32 -fno-tree-vectorize -fno-schedule-insns -fno-schedule-insns2" } */
+
+#include "riscv_vector.h"
+
+void
+f (int32_t *a, int32_t *b, int n)
+{
+  if (n <= 0)
+    return;
+  int i = n;
+  size_t vl = __riscv_vsetvl_e8mf4 (i);
+  for (; i >= 0; i--)
+    {
+      vint32m1_t v = __riscv_vle32_v_i32m1 (a + i, vl);
+      v = __riscv_vle32_v_i32m1_tu (v, a + i + 100, vl);
+      __riscv_vse32_v_i32m1 (b + i, v, vl);
+
+      if (i >= vl)
+	continue;
+      if (i == 0)
+	return;
+      vl = __riscv_vsetvl_e8mf4 (i);
+    }
+}
+
+/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*[a-x0-9]+,\s*e32,\s*m1,\s*tu,\s*m[au]} 2 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli} 2 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2023-07-14  2:35 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-07-14  2:35 [gcc(refs/vendors/riscv/heads/gcc-13-with-riscv-opts)] RISC-V: Optimize vsetvli of LCM INSERTED edge for user vsetvli [PR 109743] Jeff Law
  -- strict thread matches above, loose matches on Subject: below --
2023-05-25 23:19 Jeff Law

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).