From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <kito@sourceware.org>
Received: by sourceware.org (Postfix, from userid 2093)
	id BAE0A3858C50; Fri, 21 Apr 2023 09:49:39 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org BAE0A3858C50
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1682070579;
	bh=hsvZ4GwHodX54dm+IdfMyBxMhe2qqNVWun4dQhwsh9s=;
	h=From:To:Subject:Date:From;
	b=rTjMRNkyD5UWjQypfFZfB6bKPBA6KLtqcpsX7Oz7ZHveLb0O+cYn9MlJANkNI0d1L
	 ANEKRHb7UWflvFoasJZ9VWb2L5Z4RaMCI2meZJ39os6diyTPBKXzQzcb07+ABR1pDR
	 jMfLWoEaZkTacYfE7P6TAH0B8Nj9PC0AUTlIPBSg=
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Content-Type: text/plain; charset="utf-8"
From: Kito Cheng <kito@gcc.gnu.org>
To: gcc-cvs@gcc.gnu.org
Subject: [gcc r14-135] RISC-V: Defer vsetvli insertion to later if possible
 [PR108270]
X-Act-Checkin: gcc
X-Git-Author: Juzhe-Zhong <juzhe.zhong@rivai.ai>
X-Git-Refname: refs/heads/master
X-Git-Oldrev: 98d66b204932e343bbf940990914b949e8fccbd5
X-Git-Newrev: d06e9264b0192c2c77e07d7fb0fe090efcb510c0
Message-Id: <20230421094939.BAE0A3858C50@sourceware.org>
Date: Fri, 21 Apr 2023 09:49:39 +0000 (GMT)
List-Id: <gcc-cvs.sourceware.org>

https://gcc.gnu.org/g:d06e9264b0192c2c77e07d7fb0fe090efcb510c0

commit r14-135-gd06e9264b0192c2c77e07d7fb0fe090efcb510c0
Author: Juzhe-Zhong <juzhe.zhong@rivai.ai>
Date:   Fri Apr 21 17:19:12 2023 +0800

    RISC-V: Defer vsetvli insertion to later if possible [PR108270]
    
    Fix issue: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108270.
    
    Consider the following testcase:
    void f (void * restrict in, void * restrict out, int l, int n, int m)
    {
      for (int i = 0; i < l; i++){
        for (int j = 0; j < m; j++){
          for (int k = 0; k < n; k++)
            {
              vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i + j, 17);
              __riscv_vse8_v_i8mf8 (out + i + j, v, 17);
            }
        }
      }
    }
    
    Compile option: -O3
    
    Before this patch:
            mv      a7,a2
            mv      a6,a0
            mv      t1,a1
            mv      a2,a3
            vsetivli        zero,17,e8,mf8,ta,ma
            ble     a7,zero,.L1
            ble     a4,zero,.L1
            ble     a3,zero,.L1
    ...
    
    After this patch:
            mv      a7,a2
            mv      a6,a0
            mv      t1,a1
            mv      a2,a3
            ble     a7,zero,.L1
            ble     a4,zero,.L1
            ble     a3,zero,.L1
            add     a1,a0,a4
            li      a0,0
            vsetivli        zero,17,e8,mf8,ta,ma
    ...
    
    This issue is a missed optimization caused by Phase 3 (global backward demand
    fusion), not by LCM.
    
    This patch fixes the poor placement of the vsetvl.
    
    This insertion point is selected not by LCM but by Phase 3 (VL/VTYPE demand
    info backward fusion and propagation), which I introduced into VSETVL PASS
    to enhance LCM and improve vsetvl instruction performance.
    
    This patch suppresses Phase 3's overly aggressive backward fusion and
    propagation to the top of the function when there is no defining
    instruction of the AVL (the AVL is an immediate in the range 0 ~ 31, since
    the vsetivli instruction takes an immediate value instead of a register).
    
    You may want to ask why we need Phase 3 to do the job.
    Well, there are many situations that pure LCM fails to optimize; here is a
    simple case to demonstrate it:
    
    void f (void * restrict in, void * restrict out, int n, int m, int cond)
    {
      size_t vl = 101;
      for (size_t j = 0; j < m; j++){
        if (cond) {
          for (size_t i = 0; i < n; i++)
            {
              vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i + j, vl);
              __riscv_vse8_v_i8mf8 (out + i, v, vl);
            }
        } else {
          for (size_t i = 0; i < n; i++)
            {
              vint32mf2_t v = __riscv_vle32_v_i32mf2 (in + i + j, vl);
              v = __riscv_vadd_vv_i32mf2 (v,v,vl);
              __riscv_vse32_v_i32mf2 (out + i, v, vl);
            }
        }
      }
    }
    
    You can see:
    The first inner loop needs vsetvli e8 mf8 for vle+vse.
    The second inner loop needs vsetvli e32 mf2 for vle+vadd+vse.
    
    If we don't have Phase 3 (i.e. only LCM, Phase 4, is applied), we will end up with:
    
    outerloop:
    ...
    vsetvli e8mf8
    inner loop 1:
    ....
    
    vsetvli e32mf2
    inner loop 2:
    ....
    
    However, if we have Phase 3, Phase 3 is going to fuse the vsetvli e32 mf2 of
    inner loop 2 into vsetvli e8 mf8, then we will end up with this result after
    phase 3:
    
    outerloop:
    ...
    inner loop 1:
    vsetvli e32mf2
    ....
    
    inner loop 2:
    vsetvli e32mf2
    ....
    
    Then, the demand information produced by Phase 3 is well optimized by Phase 4
    (LCM); the result after Phase 4 is:
    
    vsetvli e32mf2
    outerloop:
    ...
    inner loop 1:
    ....
    
    inner loop 2:
    ....
    
    You can see this is the optimal codegen after the current VSETVL PASS (Phase 3:
    demand backward fusion and propagation + Phase 4: LCM). This was a known issue
    when I started to implement VSETVL PASS.
    
    gcc/ChangeLog:
    
            PR target/108270
            * config/riscv/riscv-vsetvl.cc
            (vector_infos_manager::all_empty_predecessor_p): New function.
            (pass_vsetvl::backward_demand_fusion): Use it to skip fusion.
            * config/riscv/riscv-vsetvl.h
            (vector_infos_manager::all_empty_predecessor_p): New declaration.
    
    gcc/testsuite/ChangeLog:
    
            PR target/108270
            * gcc.target/riscv/rvv/vsetvl/imm_bb_prop-1.c: Adapt testcase.
            * gcc.target/riscv/rvv/vsetvl/imm_conflict-3.c: Ditto.
            * gcc.target/riscv/rvv/vsetvl/pr108270.c: New test.

Diff:
---
 gcc/config/riscv/riscv-vsetvl.cc                   | 23 ++++++++++++++++++++++
 gcc/config/riscv/riscv-vsetvl.h                    |  2 ++
 .../gcc.target/riscv/rvv/vsetvl/imm_bb_prop-1.c    |  2 +-
 .../gcc.target/riscv/rvv/vsetvl/imm_conflict-3.c   |  4 ++--
 .../gcc.target/riscv/rvv/vsetvl/pr108270.c         | 19 ++++++++++++++++++
 5 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc
index 2406931dac0..ac99028df43 100644
--- a/gcc/config/riscv/riscv-vsetvl.cc
+++ b/gcc/config/riscv/riscv-vsetvl.cc
@@ -2411,6 +2411,21 @@ vector_infos_manager::get_all_available_exprs (
   return available_list;
 }
 
+bool
+vector_infos_manager::all_empty_predecessor_p (const basic_block cfg_bb) const
+{
+  hash_set<basic_block> pred_cfg_bbs = get_all_predecessors (cfg_bb);
+  for (const basic_block pred_cfg_bb : pred_cfg_bbs)
+    {
+      const auto &pred_block_info = vector_block_infos[pred_cfg_bb->index];
+      if (!pred_block_info.local_dem.valid_or_dirty_p ()
+	  && !pred_block_info.reaching_out.valid_or_dirty_p ())
+	continue;
+      return false;
+    }
+  return true;
+}
+
 bool
 vector_infos_manager::all_same_ratio_p (sbitmap bitdata) const
 {
@@ -3194,6 +3209,14 @@ pass_vsetvl::backward_demand_fusion (void)
       if (!backward_propagate_worthwhile_p (cfg_bb, curr_block_info))
 	continue;
 
+      /* Fix PR108270:
+
+		bb 0 -> bb 1
+	 We don't need to backward fuse VL/VTYPE info from bb 1 to bb 0
+	 if bb 1 is not inside a loop and all predecessors of bb 0 are empty. */
+      if (m_vector_manager->all_empty_predecessor_p (cfg_bb))
+	continue;
+
       edge e;
       edge_iterator ei;
       /* Backward propagate to each predecessor.  */
diff --git a/gcc/config/riscv/riscv-vsetvl.h b/gcc/config/riscv/riscv-vsetvl.h
index 4fe08cfc789..9041eee1281 100644
--- a/gcc/config/riscv/riscv-vsetvl.h
+++ b/gcc/config/riscv/riscv-vsetvl.h
@@ -451,6 +451,8 @@ public:
   /* Return true if all expression set in bitmap are same ratio.  */
   bool all_same_ratio_p (sbitmap) const;
 
+  bool all_empty_predecessor_p (const basic_block) const;
+
   void release (void);
   void create_bitmap_vectors (void);
   void free_bitmap_vectors (void);
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/imm_bb_prop-1.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/imm_bb_prop-1.c
index cd4ee7dd0d3..ed32a40f5e7 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/imm_bb_prop-1.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/imm_bb_prop-1.c
@@ -29,4 +29,4 @@ void f (int8_t * restrict in, int8_t * restrict out, int n, int cond)
   }
 }
 
-/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*5,\s*e8,\s*mf8,\s*tu,\s*m[au]} 1 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*5,\s*e8,\s*mf8,\s*tu,\s*m[au]} 2 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/imm_conflict-3.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/imm_conflict-3.c
index 1f7c0f036a2..2fa29c01dbc 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/imm_conflict-3.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/imm_conflict-3.c
@@ -20,7 +20,7 @@ void f (int8_t * restrict in, int8_t * restrict out, int n, int cond)
   }
 }
 
-/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*5,\s*e8,\s*mf8,\s*tu,\s*m[au]} 1 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*5,\s*e8,\s*mf8,\s*t[au],\s*m[au]} 2 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
 /* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*zero,\s*e8,\s*mf8,\s*t[au],\s*m[au]} 1 { target { no-opts "-O0"  no-opts "-funroll-loops" no-opts "-g" } } } } */
-/* { dg-final { scan-assembler-times {vsetivli} 1 { target { no-opts "-O0"  no-opts "-funroll-loops" no-opts "-g" } } } } */
+/* { dg-final { scan-assembler-times {vsetivli} 2 { target { no-opts "-O0"  no-opts "-funroll-loops" no-opts "-g" } } } } */
 /* { dg-final { scan-assembler-times {vsetvli} 1 { target { no-opts "-O0"  no-opts "-funroll-loops" no-opts "-g" } } } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr108270.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr108270.c
new file mode 100644
index 00000000000..d2ae43bf263
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr108270.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32 -fno-tree-vectorize -fno-schedule-insns -fno-schedule-insns2" } */
+
+#include "riscv_vector.h"
+
+void f (void * restrict in, void * restrict out, int l, int n, int m)
+{
+  for (int i = 0; i < l; i++){
+    for (int j = 0; j < m; j++){
+      for (int k = 0; k < n; k++)
+        {
+          vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i + j, 17);
+          __riscv_vse8_v_i8mf8 (out + i + j, v, 17);
+        }
+    }
+  }
+}
+
+/* { dg-final { scan-assembler-not {mv\s+[a-x0-9]+,[a-x0-9]+\s+mv\s+[a-x0-9]+,[a-x0-9]+\s+mv\s+[a-x0-9]+,[a-x0-9]+\s+mv\s+[a-x0-9]+,[a-x0-9]+\s+mv\s+[a-x0-9]+,[a-x0-9]+\s+vsetivli} } } */