public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
From: Kito Cheng <kito.cheng@gmail.com>
To: juzhe.zhong@rivai.ai
Cc: gcc-patches@gcc.gnu.org, palmer@dabbelt.com
Subject: Re: [PATCH] RISC-V: Finalize VSETVL PASS implementation
Date: Fri, 27 Jan 2023 20:31:01 +0800	[thread overview]
Message-ID: <CA+yXCZCs4p7uN2O=Hk+=Am0Ra3dvEDk7vN0zb6VxKNe5uEPx8Q@mail.gmail.com> (raw)
In-Reply-To: <20230118032434.71273-1-juzhe.zhong@rivai.ai>

[-- Attachment #1: Type: text/plain, Size: 55577 bytes --]

committed, thanks!

On Wed, Jan 18, 2023 at 11:25 AM <juzhe.zhong@rivai.ai> wrote:

> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
>
> gcc/ChangeLog:
>
>         * config/riscv/riscv-vsetvl.cc (vsetvl_insn_p): Add condition to
> avoid ICE.
>         (vsetvl_discard_result_insn_p): New function.
>         (reg_killed_by_bb_p): Rename to find_reg_killed_by.
>         (find_reg_killed_by): New name.
>         (get_vl): Allow it to be called by more functions.
>         (has_vsetvl_killed_avl_p): Add condition.
>         (get_avl): Allow it to be called by more functions.
>         (insn_should_be_added_p): New function.
>         (get_all_nonphi_defs): Refine function.
>         (get_all_sets): Ditto.
>         (get_same_bb_set): New function.
>         (any_insn_in_bb_p): Ditto.
>         (any_set_in_bb_p): Ditto.
>         (get_vl_vtype_info): Add VLMAX forward optimization.
>         (source_equal_p): Fix issues.
>         (extract_single_source): Refine.
>         (avl_info::multiple_source_equal_p): New function.
>         (avl_info::operator==): Adjust for final version.
>         (vl_vtype_info::operator==): Ditto.
>         (vl_vtype_info::same_avl_p): Ditto.
>         (vector_insn_info::parse_insn): Ditto.
>         (vector_insn_info::available_p): New function.
>         (vector_insn_info::merge): Adjust for final version.
>         (vector_insn_info::dump): Add hard_empty.
>         (pass_vsetvl::hard_empty_block_p): New function.
>         (pass_vsetvl::backward_demand_fusion): Adjust for final version.
>         (pass_vsetvl::forward_demand_fusion): Ditto.
>         (pass_vsetvl::demand_fusion): Ditto.
>         (pass_vsetvl::cleanup_illegal_dirty_blocks): New function.
>         (pass_vsetvl::compute_local_properties): Adjust for final version.
>         (pass_vsetvl::can_refine_vsetvl_p): Ditto.
>         (pass_vsetvl::refine_vsetvls): Ditto.
>         (pass_vsetvl::commit_vsetvls): Ditto.
>         (pass_vsetvl::propagate_avl): New function.
>         (pass_vsetvl::lazy_vsetvl): Adjust for new version.
>         * config/riscv/riscv-vsetvl.h (enum def_type): New enum.
>
> ---
>  gcc/config/riscv/riscv-vsetvl.cc | 930 +++++++++++++++++++++++--------
>  gcc/config/riscv/riscv-vsetvl.h  |  30 +-
>  2 files changed, 737 insertions(+), 223 deletions(-)
>
> diff --git a/gcc/config/riscv/riscv-vsetvl.cc
> b/gcc/config/riscv/riscv-vsetvl.cc
> index b33c198bbd6..253bfc7b210 100644
> --- a/gcc/config/riscv/riscv-vsetvl.cc
> +++ b/gcc/config/riscv/riscv-vsetvl.cc
> @@ -54,6 +54,8 @@ along with GCC; see the file COPYING3.  If not see
>         used any more and VL operand of VSETVL instruction if it is not
> used by
>         any non-debug instructions.
>
> +    -  Phase 6 - Propagate AVL between vsetvl instructions.
> +
>      Implementation:
>
>      -  The subroutine of optimize == 0 is simple_vsetvl.
> @@ -175,8 +177,20 @@ vector_config_insn_p (rtx_insn *rinsn)
>  static bool
>  vsetvl_insn_p (rtx_insn *rinsn)
>  {
> +  if (!vector_config_insn_p (rinsn))
> +    return false;
>    return (INSN_CODE (rinsn) == CODE_FOR_vsetvldi
> -        || INSN_CODE (rinsn) == CODE_FOR_vsetvlsi);
> +         || INSN_CODE (rinsn) == CODE_FOR_vsetvlsi);
> +}
> +
> +/* Return true if it is vsetvl zero, rs1.  */
> +static bool
> +vsetvl_discard_result_insn_p (rtx_insn *rinsn)
> +{
> +  if (!vector_config_insn_p (rinsn))
> +    return false;
> +  return (INSN_CODE (rinsn) == CODE_FOR_vsetvl_discard_resultdi
> +         || INSN_CODE (rinsn) == CODE_FOR_vsetvl_discard_resultsi);
>  }
>
>  static bool
> @@ -191,15 +205,27 @@ before_p (const insn_info *insn1, const insn_info
> *insn2)
>    return insn1->compare_with (insn2) < 0;
>  }
>
> -static bool
> -reg_killed_by_bb_p (const bb_info *bb, rtx x)
> +static insn_info *
> +find_reg_killed_by (const bb_info *bb, rtx x)
>  {
> -  if (!x || vlmax_avl_p (x))
> -    return false;
> -  for (const insn_info *insn : bb->real_nondebug_insns ())
> +  if (!x || vlmax_avl_p (x) || !REG_P (x))
> +    return nullptr;
> +  for (insn_info *insn : bb->reverse_real_nondebug_insns ())
>      if (find_access (insn->defs (), REGNO (x)))
> -      return true;
> -  return false;
> +      return insn;
> +  return nullptr;
> +}
> +
> +/* Helper function to get VL operand.  */
> +static rtx
> +get_vl (rtx_insn *rinsn)
> +{
> +  if (has_vl_op (rinsn))
> +    {
> +      extract_insn_cached (rinsn);
> +      return recog_data.operand[get_attr_vl_op_idx (rinsn)];
> +    }
> +  return SET_DEST (XVECEXP (PATTERN (rinsn), 0, 0));
>  }
>
>  static bool
> @@ -208,6 +234,9 @@ has_vsetvl_killed_avl_p (const bb_info *bb, const
> vector_insn_info &info)
>    if (info.dirty_with_killed_avl_p ())
>      {
>        rtx avl = info.get_avl ();
> +      if (vlmax_avl_p (avl))
> +       return find_reg_killed_by (bb, get_vl (info.get_insn ()->rtl ()))
> +              != nullptr;
>        for (const insn_info *insn : bb->reverse_real_nondebug_insns ())
>         {
>           def_info *def = find_access (insn->defs (), REGNO (avl));
> @@ -229,18 +258,6 @@ has_vsetvl_killed_avl_p (const bb_info *bb, const
> vector_insn_info &info)
>    return false;
>  }
>
> -/* Helper function to get VL operand.  */
> -static rtx
> -get_vl (rtx_insn *rinsn)
> -{
> -  if (has_vl_op (rinsn))
> -    {
> -      extract_insn_cached (rinsn);
> -      return recog_data.operand[get_attr_vl_op_idx (rinsn)];
> -    }
> -  return SET_DEST (XVECEXP (PATTERN (rinsn), 0, 0));
> -}
> -
>  /* An "anticipatable occurrence" is one that is the first occurrence in
> the
>     basic block, the operands are not modified in the basic block prior
>     to the occurrence and the output is not used between the start of
> @@ -419,30 +436,30 @@ backward_propagate_worthwhile_p (const basic_block
> cfg_bb,
>    return true;
>  }
>
> -/* Helper function to get AVL operand.  */
> -static rtx
> -get_avl (rtx_insn *rinsn)
> +static bool
> +insn_should_be_added_p (const insn_info *insn, unsigned int types)
>  {
> -  if (vsetvl_insn_p (rinsn))
> -    return XVECEXP (SET_SRC (XVECEXP (PATTERN (rinsn), 0, 0)), 0, 0);
> -
> -  if (!has_vl_op (rinsn))
> -    return NULL_RTX;
> -  if (get_attr_avl_type (rinsn) == VLMAX)
> -    return RVV_VLMAX;
> -  extract_insn_cached (rinsn);
> -  return recog_data.operand[get_attr_vl_op_idx (rinsn)];
> +  if (insn->is_real () && (types & REAL_SET))
> +    return true;
> +  if (insn->is_phi () && (types & PHI_SET))
> +    return true;
> +  if (insn->is_bb_head () && (types & BB_HEAD_SET))
> +    return true;
> +  if (insn->is_bb_end () && (types & BB_END_SET))
> +    return true;
> +  return false;
>  }
>
> -/* Recursively find all real define instructions if it is a real
> instruction. */
> -static hash_set<insn_info *>
> -get_all_nonphi_defs (phi_info *phi)
> +/* Recursively find all define instructions. The kind of instruction is
> +   specified by the DEF_TYPE.  */
> +static hash_set<set_info *>
> +get_all_sets (phi_info *phi, unsigned int types)
>  {
> -  hash_set<insn_info *> insns;
> +  hash_set<set_info *> insns;
>    auto_vec<phi_info *> work_list;
>    hash_set<phi_info *> visited_list;
>    if (!phi)
> -    return insns;
> +    return hash_set<set_info *> ();
>    work_list.safe_push (phi);
>
>    while (!work_list.is_empty ())
> @@ -452,20 +469,17 @@ get_all_nonphi_defs (phi_info *phi)
>        for (use_info *use : phi->inputs ())
>         {
>           def_info *def = use->def ();
> -         if (!def)
> -           {
> -             /* if def is null, treat undefined */
> -             insns.empty ();
> -             return insns;
> -           }
> +         set_info *set = safe_dyn_cast<set_info *> (def);
> +         if (!set)
> +           return hash_set<set_info *> ();
>
> -         gcc_assert (!def->insn ()->is_debug_insn ());
> +         gcc_assert (!set->insn ()->is_debug_insn ());
>
> -         if (!def->insn ()->is_phi ())
> -           insns.add (def->insn ());
> -         if (def->insn ()->is_phi ())
> +         if (insn_should_be_added_p (set->insn (), types))
> +           insns.add (set);
> +         if (set->insn ()->is_phi ())
>             {
> -             phi_info *new_phi = as_a<phi_info *> (def);
> +             phi_info *new_phi = as_a<phi_info *> (set);
>               if (!visited_list.contains (new_phi))
>                 work_list.safe_push (new_phi);
>             }
> @@ -474,6 +488,47 @@ get_all_nonphi_defs (phi_info *phi)
>    return insns;
>  }
>
> +static hash_set<set_info *>
> +get_all_sets (set_info *set, bool /* get_real_inst */ real_p,
> +             bool /*get_phi*/ phi_p, bool /* get_function_parameter*/
> param_p)
> +{
> +  if (real_p && phi_p && param_p)
> +    return get_all_sets (safe_dyn_cast<phi_info *> (set),
> +                        REAL_SET | PHI_SET | BB_HEAD_SET | BB_END_SET);
> +
> +  else if (real_p && param_p)
> +    return get_all_sets (safe_dyn_cast<phi_info *> (set),
> +                        REAL_SET | BB_HEAD_SET | BB_END_SET);
> +
> +  else if (real_p)
> +    return get_all_sets (safe_dyn_cast<phi_info *> (set), REAL_SET);
> +  return hash_set<set_info *> ();
> +}
> +
> +/* Helper function to get AVL operand.  */
> +static rtx
> +get_avl (rtx_insn *rinsn)
> +{
> +  if (vsetvl_insn_p (rinsn) || vsetvl_discard_result_insn_p (rinsn))
> +    return XVECEXP (SET_SRC (XVECEXP (PATTERN (rinsn), 0, 0)), 0, 0);
> +
> +  if (!has_vl_op (rinsn))
> +    return NULL_RTX;
> +  if (get_attr_avl_type (rinsn) == VLMAX)
> +    return RVV_VLMAX;
> +  extract_insn_cached (rinsn);
> +  return recog_data.operand[get_attr_vl_op_idx (rinsn)];
> +}
> +
> +static set_info *
> +get_same_bb_set (hash_set<set_info *> &sets, const basic_block cfg_bb)
> +{
> +  for (set_info *set : sets)
> +    if (set->bb ()->cfg_bb () == cfg_bb)
> +      return set;
> +  return nullptr;
> +}
> +
>  /* Recursively find all predecessor blocks for cfg_bb. */
>  static hash_set<basic_block>
>  get_all_predecessors (basic_block cfg_bb)
> @@ -501,10 +556,10 @@ get_all_predecessors (basic_block cfg_bb)
>
>  /* Return true if there is an INSN in insns staying in the block BB.  */
>  static bool
> -any_insn_in_bb_p (hash_set<insn_info *> insns, const bb_info *bb)
> +any_set_in_bb_p (hash_set<set_info *> sets, const bb_info *bb)
>  {
> -  for (const insn_info *insn : insns)
> -    if (insn->bb ()->index () == bb->index ())
> +  for (const set_info *set : sets)
> +    if (set->bb ()->index () == bb->index ())
>        return true;
>    return false;
>  }
> @@ -834,10 +889,6 @@ insert_insn_end_basic_block (rtx_insn *rinsn,
> basic_block cfg_bb)
>  static vl_vtype_info
>  get_vl_vtype_info (const insn_info *insn)
>  {
> -  if (vector_config_insn_p (insn->rtl ()))
> -    gcc_assert (vsetvl_insn_p (insn->rtl ())
> -               && "Can't handle X0, rs1 vsetvli yet");
> -
>    set_info *set = nullptr;
>    rtx avl = ::get_avl (insn->rtl ());
>    if (avl && REG_P (avl) && !vlmax_avl_p (avl))
> @@ -942,8 +993,12 @@ change_vsetvl_insn (const insn_info *insn, const
> vector_insn_info &info)
>  }
>
>  static bool
> -source_equal_p (rtx_insn *rinsn1, rtx_insn *rinsn2)
> +source_equal_p (insn_info *insn1, insn_info *insn2)
>  {
> +  if (!insn1 || !insn2)
> +    return false;
> +  rtx_insn *rinsn1 = insn1->rtl ();
> +  rtx_insn *rinsn2 = insn2->rtl ();
>    if (!rinsn1 || !rinsn2)
>      return false;
>    rtx note1 = find_reg_equal_equiv_note (rinsn1);
> @@ -953,40 +1008,70 @@ source_equal_p (rtx_insn *rinsn1, rtx_insn *rinsn2)
>
>    if (note1 && note2 && rtx_equal_p (note1, note2))
>      return true;
> -  if (single_set1 && single_set2
> -      && rtx_equal_p (SET_SRC (single_set1), SET_SRC (single_set2)))
> -    return true;
> -  return false;
> +
> +  /* Since vsetvl instruction is not single SET.
> +     We handle this case specially here.  */
> +  if (vsetvl_insn_p (insn1->rtl ()) && vsetvl_insn_p (insn2->rtl ()))
> +    {
> +      /* For example:
> +          vsetvl1 a6,a5,e32m1
> +          RVV 1 (use a6 as AVL)
> +          vsetvl2 a5,a5,e8mf4
> +          RVV 2 (use a5 as AVL)
> +        We consider AVL of RVV 1 and RVV 2 are same so that we can
> +        gain more optimization opportunities.
> +
> +        Note: insn1_info.compatible_avl_p (insn2_info)
> +        will make sure there is no instruction between vsetvl1 and vsetvl2
> +        modify a5 since their def will be different if there is
> instruction
> +        modify a5 and compatible_avl_p will return false.  */
> +      vector_insn_info insn1_info, insn2_info;
> +      insn1_info.parse_insn (insn1);
> +      insn2_info.parse_insn (insn2);
> +      if (insn1_info.same_vlmax_p (insn2_info)
> +         && insn1_info.compatible_avl_p (insn2_info))
> +       return true;
> +    }
> +
> +  /* We only handle AVL set by instructions with no side effects.  */
> +  if (!single_set1 || !single_set2)
> +    return false;
> +  if (!rtx_equal_p (SET_SRC (single_set1), SET_SRC (single_set2)))
> +    return false;
> +  gcc_assert (insn1->uses ().size () == insn2->uses ().size ());
> +  for (size_t i = 0; i < insn1->uses ().size (); i++)
> +    if (insn1->uses ()[i] != insn2->uses ()[i])
> +      return false;
> +  return true;
>  }
>
>  /* Helper function to get single same real RTL source.
>     return NULL if it is not a single real RTL source.  */
> -static rtx_insn *
> +static insn_info *
>  extract_single_source (set_info *set)
>  {
>    if (!set)
>      return nullptr;
>    if (set->insn ()->is_real ())
> -    return set->insn ()->rtl ();
> +    return set->insn ();
>    if (!set->insn ()->is_phi ())
>      return nullptr;
> -  phi_info *phi = safe_dyn_cast<phi_info *> (set);
> -  hash_set<insn_info *> insns = get_all_nonphi_defs (phi);
> +  hash_set<set_info *> sets = get_all_sets (set, true, false, true);
>
> -  insn_info *first_insn = (*insns.begin ());
> +  insn_info *first_insn = (*sets.begin ())->insn ();
>    if (first_insn->is_artificial ())
>      return nullptr;
> -  for (const insn_info *insn : insns)
> +  for (const set_info *set : sets)
>      {
>        /* If there is a head or end insn, we conservatively return
>          NULL so that VSETVL PASS will insert vsetvl directly.  */
> -      if (insn->is_artificial ())
> +      if (set->insn ()->is_artificial ())
>         return nullptr;
> -      if (!source_equal_p (insn->rtl (), first_insn->rtl ()))
> +      if (!source_equal_p (set->insn (), first_insn))
>         return nullptr;
>      }
>
> -  return (*insns.begin ())->rtl ();
> +  return first_insn;
>  }
>
>  avl_info::avl_info (const avl_info &other)
> @@ -1004,9 +1089,82 @@ avl_info::single_source_equal_p (const avl_info
> &other) const
>  {
>    set_info *set1 = m_source;
>    set_info *set2 = other.get_source ();
> -  rtx_insn *rinsn1 = extract_single_source (set1);
> -  rtx_insn *rinsn2 = extract_single_source (set2);
> -  return source_equal_p (rinsn1, rinsn2);
> +  insn_info *insn1 = extract_single_source (set1);
> +  insn_info *insn2 = extract_single_source (set2);
> +  if (!insn1 || !insn2)
> +    return false;
> +  return source_equal_p (insn1, insn2);
> +}
> +
> +bool
> +avl_info::multiple_source_equal_p (const avl_info &other) const
> +{
> +  /* TODO: We don't do too much optimization here since it's
> +     too complicated in case of analyzing the PHI node.
> +
> +     For example:
> +       void f (void * restrict in, void * restrict out, int n, int m, int
> cond)
> +       {
> +         size_t vl;
> +         switch (cond)
> +         {
> +         case 1:
> +           vl = 100;
> +           break;
> +         case 2:
> +           vl = *(size_t*)(in + 100);
> +           break;
> +         case 3:
> +           {
> +             size_t new_vl = *(size_t*)(in + 500);
> +             size_t new_vl2 = *(size_t*)(in + 600);
> +             vl = new_vl + new_vl2 + 777;
> +             break;
> +           }
> +         default:
> +           vl = 4000;
> +           break;
> +         }
> +         for (size_t i = 0; i < n; i++)
> +           {
> +             vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i, vl);
> +             __riscv_vse8_v_i8mf8 (out + i, v, vl);
> +
> +             vint8mf8_t v2 = __riscv_vle8_v_i8mf8_tu (v, in + i + 100,
> vl);
> +             __riscv_vse8_v_i8mf8 (out + i + 100, v2, vl);
> +           }
> +
> +         size_t vl2;
> +         switch (cond)
> +         {
> +         case 1:
> +           vl2 = 100;
> +           break;
> +         case 2:
> +           vl2 = *(size_t*)(in + 100);
> +           break;
> +         case 3:
> +           {
> +             size_t new_vl = *(size_t*)(in + 500);
> +             size_t new_vl2 = *(size_t*)(in + 600);
> +             vl2 = new_vl + new_vl2 + 777;
> +             break;
> +           }
> +         default:
> +           vl2 = 4000;
> +           break;
> +         }
> +         for (size_t i = 0; i < m; i++)
> +           {
> +             vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i + 300, vl2);
> +             __riscv_vse8_v_i8mf8 (out + i + 300, v, vl2);
> +             vint8mf8_t v2 = __riscv_vle8_v_i8mf8_tu (v, in + i + 200,
> vl2);
> +             __riscv_vse8_v_i8mf8 (out + i + 200, v2, vl2);
> +           }
> +       }
> +     Such a case may not be necessary to optimize since the code
> defining
> +     vl and vl2 is redundant.  */
> +  return m_source == other.get_source ();
>  }
>
>  avl_info &
> @@ -1025,11 +1183,6 @@ avl_info::operator== (const avl_info &other) const
>    if (!other.get_value ())
>      return false;
>
> -  /* It's safe to consider they are equal if their RTX value are
> -     strictly the same.  */
> -  if (m_value == other.get_value ())
> -    return true;
> -
>    if (GET_CODE (m_value) != GET_CODE (other.get_value ()))
>      return false;
>
> @@ -1041,10 +1194,6 @@ avl_info::operator== (const avl_info &other) const
>    if (vlmax_avl_p (m_value))
>      return vlmax_avl_p (other.get_value ());
>
> -  /* If Pseudo REGNO are same, it's safe to consider they are same.  */
> -  if (ORIGINAL_REGNO (m_value) == ORIGINAL_REGNO (other.get_value ()))
> -    return true;
> -
>    /* If any source is undef value, we think they are not equal.  */
>    if (!m_source || !other.get_source ())
>      return false;
> @@ -1054,9 +1203,7 @@ avl_info::operator== (const avl_info &other) const
>    if (single_source_equal_p (other))
>      return true;
>
> -  /* TODO: Support avl defined by PHI which includes multiple different
> insn
> -   * later.  */
> -  return false;
> +  return multiple_source_equal_p (other);
>  }
>
>  bool
> @@ -1078,7 +1225,7 @@ vl_vtype_info::vl_vtype_info (avl_info avl_in,
> uint8_t sew_in,
>  bool
>  vl_vtype_info::operator== (const vl_vtype_info &other) const
>  {
> -  return m_avl == other.get_avl_info () && m_sew == other.get_sew ()
> +  return same_avl_p (other) && m_sew == other.get_sew ()
>          && m_vlmul == other.get_vlmul () && m_ta == other.get_ta ()
>          && m_ma == other.get_ma () && m_ratio == other.get_ratio ();
>  }
> @@ -1102,7 +1249,12 @@ vl_vtype_info::has_non_zero_avl () const
>  bool
>  vl_vtype_info::same_avl_p (const vl_vtype_info &other) const
>  {
> -  return get_avl () == other.get_avl ();
> +  /* We need to compare both RTL and SET. If both AVL are CONST_INT.
> +     For example, const_int 3 and const_int 4, we need to compare
> +     RTL. If both AVL are REG and their REGNO are same, we need to
> +     compare SET.  */
> +  return get_avl () == other.get_avl ()
> +        && get_avl_source () == other.get_avl_source ();
>  }
>
>  bool
> @@ -1283,6 +1435,25 @@ vector_insn_info::parse_insn (insn_info *insn)
>      m_demands[DEMAND_TAIL_POLICY] = true;
>    if (get_attr_ma (insn->rtl ()) != INVALID_ATTRIBUTE)
>      m_demands[DEMAND_MASK_POLICY] = true;
> +
> +  if (vector_config_insn_p (insn->rtl ()))
> +    return;
> +
> +  if (!has_avl_reg () || !m_avl.get_source ()
> +      || !m_avl.get_source ()->insn ()->is_phi ())
> +    return;
> +
> +  insn_info *def_insn = extract_single_source (m_avl.get_source ());
> +  if (def_insn)
> +    {
> +      vector_insn_info new_info;
> +      new_info.parse_insn (def_insn);
> +      if (!same_vlmax_p (new_info))
> +       return;
> +      /* TODO: Currently, we don't forward AVL for non-VLMAX vsetvl.  */
> +      if (vlmax_avl_p (new_info.get_avl ()))
> +       set_avl_info (new_info.get_avl_info ());
> +    }
>  }
>
>  void
> @@ -1396,12 +1567,21 @@ vector_insn_info::compatible_p (const
> vl_vtype_info &curr_info) const
>    return compatible_avl_p (curr_info) && compatible_vtype_p (curr_info);
>  }
>
> +bool
> +vector_insn_info::available_p (const vector_insn_info &other) const
> +{
> +  if (*this >= other)
> +    return true;
> +  return false;
> +}
> +
>  vector_insn_info
>  vector_insn_info::merge (const vector_insn_info &merge_info,
>                          enum merge_type type = LOCAL_MERGE) const
>  {
> -  gcc_assert (this->compatible_p (merge_info)
> -             && "Can't merge incompatible demanded infos");
> +  if (!vsetvl_insn_p (get_insn ()->rtl ()))
> +    gcc_assert (this->compatible_p (merge_info)
> +               && "Can't merge incompatible demanded infos");
>
>    vector_insn_info new_info;
>    new_info.demand_vl_vtype ();
> @@ -1513,6 +1693,8 @@ vector_insn_info::dump (FILE *file) const
>      fprintf (file, "UNKNOWN,");
>    else if (empty_p ())
>      fprintf (file, "EMPTY,");
> +  else if (hard_empty_p ())
> +    fprintf (file, "HARD_EMPTY,");
>    else if (dirty_with_killed_avl_p ())
>      fprintf (file, "DIRTY_WITH_KILLED_AVL,");
>    else
> @@ -1606,7 +1788,7 @@ vector_infos_manager::get_all_available_exprs (
>  {
>    auto_vec<size_t> available_list;
>    for (size_t i = 0; i < vector_exprs.length (); i++)
> -    if (info >= *vector_exprs[i])
> +    if (info.available_p (*vector_exprs[i]))
>        available_list.safe_push (i);
>    return available_list;
>  }
> @@ -1862,14 +2044,16 @@ private:
>    /* Phase 3.  */
>    enum fusion_type get_backward_fusion_type (const bb_info *,
>                                              const vector_insn_info &);
> +  bool hard_empty_block_p (const bb_info *, const vector_insn_info &)
> const;
>    bool backward_demand_fusion (void);
>    bool forward_demand_fusion (void);
> +  bool cleanup_illegal_dirty_blocks (void);
>    void demand_fusion (void);
>
>    /* Phase 4.  */
>    void prune_expressions (void);
>    void compute_local_properties (void);
> -  bool can_refine_vsetvl_p (const basic_block, uint8_t) const;
> +  bool can_refine_vsetvl_p (const basic_block, const vector_insn_info &)
> const;
>    void refine_vsetvls (void) const;
>    void cleanup_vsetvls (void);
>    bool commit_vsetvls (void);
> @@ -1878,6 +2062,9 @@ private:
>    /* Phase 5.  */
>    void cleanup_insns (void) const;
>
> +  /* Phase 6.  */
> +  void propagate_avl (void) const;
> +
>    void init (void);
>    void done (void);
>    void compute_probabilities (void);
> @@ -2079,134 +2266,241 @@ pass_vsetvl::get_backward_fusion_type (const
> bb_info *bb,
>
>    gcc_assert (reg);
>    def_info *def = find_access (insn->uses (), REGNO (reg))->def ();
> -  if (def->insn ()->is_phi ())
> +  if (!def->insn ()->is_phi () && def->insn ()->bb () == insn->bb ())
> +    return INVALID_FUSION;
> +  hash_set<set_info *> sets
> +    = get_all_sets (prop.get_avl_source (), true, true, true);
> +  if (any_set_in_bb_p (sets, insn->bb ()))
> +    return INVALID_FUSION;
> +
> +  if (vlmax_avl_p (prop.get_avl ()))
>      {
> -      hash_set<insn_info *> insns
> -       = get_all_nonphi_defs (as_a<phi_info *> (def));
> -      if (any_insn_in_bb_p (insns, insn->bb ()))
> +      if (find_reg_killed_by (bb, reg))
>         return INVALID_FUSION;
> +      else
> +       return VALID_AVL_FUSION;
>      }
> -  else
> -    {
> -      if (def->insn ()->bb () == insn->bb ())
> -       return INVALID_FUSION;
> +
> +  /* By default, we always enable backward fusion so that we can
> +     gain more optimizations.  */
> +  if (!find_reg_killed_by (bb, reg))
> +    return VALID_AVL_FUSION;
> +  return KILLED_AVL_FUSION;
> +}
> +
> +/* We almost enable all cases in get_backward_fusion_type, this function
> +   disable the backward fusion by changing dirty blocks into hard empty
> +   blocks in forward dataflow. We can have more accurate optimization by
> +   this method.  */
> +bool
> +pass_vsetvl::hard_empty_block_p (const bb_info *bb,
> +                                const vector_insn_info &info) const
> +{
> +  if (!info.dirty_p () || !info.has_avl_reg ())
> +    return false;
> +
> +  basic_block cfg_bb = bb->cfg_bb ();
> +  sbitmap avin = m_vector_manager->vector_avin[cfg_bb->index];
> +  rtx avl = vlmax_avl_p (info.get_avl ()) ? get_vl (info.get_insn ()->rtl
> ())
> +                                         : get_avl (info.get_insn ()->rtl
> ());
> +  insn_info *insn = info.get_insn ();
> +  set_info *set = find_access (insn->uses (), REGNO (avl))->def ();
> +  hash_set<set_info *> sets = get_all_sets (set, true, false, false);
> +  hash_set<basic_block> pred_cfg_bbs = get_all_predecessors (cfg_bb);
> +
> +  if (find_reg_killed_by (bb, avl))
> +    {
> +      /* Condition 1:
> +        Dirty block with killed AVL means that the empty block (no RVV
> +        instructions) are polluted as Dirty blocks with the value of
> current
> +        AVL is killed. For example:
> +             bb 0:
> +               ...
> +             bb 1:
> +               def a5
> +             bb 2:
> +               RVV (use a5)
> +        In backward dataflow, we will pollute BB0 and BB1 as Dirty with
> AVL
> +        killed, since a5 is killed in BB1.
> +        In this case, let's take a look at this example:
> +
> +             bb 3:        bb 4:
> +               def3 a5       def4 a5
> +             bb 5:        bb 6:
> +               def1 a5       def2 a5
> +                   \         /
> +                    \       /
> +                     \     /
> +                      \   /
> +                       bb 7:
> +                   RVV (use a5)
> +        In this case, we can pollute BB5 and BB6 as dirty if get-def
> +        of a5 from RVV instruction in BB7 is the def1 in BB5 and
> +        def2 BB6 so we can return false early here for HARD_EMPTY_BLOCK_P.
> +        However, we are not sure whether BB3 and BB4 can be
> +        polluted as Dirty with AVL killed so we can't return false
> +        for HARD_EMPTY_BLOCK_P here since it's too early which will
> +        potentially produce issues.  */
> +      gcc_assert (info.dirty_with_killed_avl_p ());
> +      if (info.get_avl_source ()
> +         && get_same_bb_set (sets, bb->cfg_bb ()) == info.get_avl_source
> ())
> +       return false;
>      }
>
> -  rtx new_reg = gen_rtx_REG (GET_MODE (reg), REGNO (reg));
> -  gcc_assert (new_reg != reg);
> -  const avl_info info = avl_info (new_reg, safe_dyn_cast<set_info *>
> (def));
> -  if (prop.dirty_with_killed_avl_p ())
> +  /* Condition 2:
> +     Suppress the VL/VTYPE info backward propagation too early:
> +                        ________
> +                       |   BB0  |
> +                       |________|
> +                           |
> +                       ____|____
> +                       |   BB1  |
> +                       |________|
> +     In this case, suppose BB 1 has multiple predecessors, BB 0 is one
> +     of them. BB1 has VL/VTYPE info (may be VALID or DIRTY) to backward
> +     propagate.
> +     The AVIN (available in) which is calculated by LCM is empty only
> +     in these 2 circumstances:
> +       1. all predecessors of BB1 are empty (not VALID
> +         and can not be polluted in backward fusion flow)
> +       2. VL/VTYPE info of BB1 predecessors are conflict.
> +
> +     We keep it as dirty in 2nd circumstance and set it as HARD_EMPTY
> +     (can not be polluted as DIRTY any more) in 1st circumstance.
> +     We don't backward propagate in 1st circumstance since there is
> +     no VALID RVV instruction and no polluted blocks (dirty blocks)
> +     by backward propagation from other following blocks.
> +     It's meaningless to keep it as Dirty anymore.
> +
> +     However, since we keep it as dirty in 2nd since there are VALID or
> +     Dirty blocks in predecessors, we can still gain the benefits and
> +     optimization opportunities. For example, in this case:
> +       for (size_t i = 0; i < n; i++)
> +        {
> +          if (i != cond) {
> +            vint8mf8_t v = *(vint8mf8_t*)(in + i + 100);
> +            *(vint8mf8_t*)(out + i + 100) = v;
> +          } else {
> +            vbool1_t v = *(vbool1_t*)(in + i + 400);
> +            *(vbool1_t*)(out + i + 400) = v;
> +          }
> +        }
> +     VL/VTYPE in if-else are conflict which will produce empty AVIN LCM
> result
> +     but we can still keep dirty blocks if *(i != cond)* is very unlikely
> then
> +     we can preset vsetvl (VL/VTYPE) info from else (static probability
> model).
> +
> +     We don't want to backward propagate VL/VTYPE information too early
> +     which is not the optimal and may potentially produce issues.  */
> +  if (bitmap_empty_p (avin))
>      {
> -      unsigned int bb_index;
> -      sbitmap_iterator sbi;
> -      sbitmap bitdata = m_vector_manager->vector_avout[bb->index ()];
> -      bool has_valid_avl = false;
> -      EXECUTE_IF_SET_IN_BITMAP (bitdata, 0, bb_index, sbi)
> -      {
> -       const vector_insn_info *expr =
> m_vector_manager->vector_exprs[bb_index];
> -       if (expr->compatible_avl_p (info))
> -         {
> -           has_valid_avl = true;
> -           break;
> -         }
> -      }
> -      if (!has_valid_avl)
> -       return INVALID_FUSION;
> +      bool hard_empty_p = true;
> +      for (const basic_block pred_cfg_bb : pred_cfg_bbs)
> +       {
> +         if (pred_cfg_bb == ENTRY_BLOCK_PTR_FOR_FN (cfun))
> +           continue;
> +         sbitmap avout =
> m_vector_manager->vector_avout[pred_cfg_bb->index];
> +         if (!bitmap_empty_p (avout))
> +           {
> +             hard_empty_p = false;
> +             break;
> +           }
> +       }
> +      if (hard_empty_p)
> +       return true;
>      }
>
> -  if (reg_killed_by_bb_p (bb, reg))
> +  edge e;
> +  edge_iterator ei;
> +  bool has_avl_killed_insn_p = false;
> +  FOR_EACH_EDGE (e, ei, cfg_bb->succs)
>      {
> -      unsigned int bb_index;
> -      sbitmap_iterator sbi;
> -      sbitmap bitdata = m_vector_manager->vector_avin[bb->index ()];
> -      hash_set<basic_block> blocks = get_all_predecessors (bb->cfg_bb ());
> -      for (const auto block : blocks)
> -       if (block == insn->bb ()->cfg_bb ())
> -         return INVALID_FUSION;
> -      if (bitmap_empty_p (bitdata))
> +      const auto block_info
> +       = m_vector_manager->vector_block_infos[e->dest->index];
> +      if (block_info.local_dem.dirty_with_killed_avl_p ())
>         {
> -         /* void f (int8_t *restrict in, int8_t *restrict out, int n, int
> m,
> -                   unsigned cond, size_t vl)
> -         {
> -           vbool64_t mask = *(vbool64_t *) (in + 1000000);
> -
> -           vl = 101;
> -           if (cond > 0)
> -             {
> -             vint8mf8_t v = __riscv_vle8_v_i8mf8 (in, vl);
> -             __riscv_vse8_v_i8mf8 (out, v, vl);
> -             }
> -           else
> -             {
> -             out[100] = out[100] + 300;
> -             }
> -
> -           for (size_t i = 0; i < n; i++)
> -             {
> -             vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((in + i + 200),
> vl);
> -             __riscv_vse32_v_f32mf2 ((out + i + 200), v, vl);
> +         has_avl_killed_insn_p = true;
> +         break;
> +       }
> +    }
> +  if (!has_avl_killed_insn_p)
> +    return false;
>
> -             vfloat32mf2_t v2
> -               = __riscv_vle32_v_f32mf2_tumu (mask, v, (in + i + 300),
> vl);
> -             __riscv_vse32_v_f32mf2_m (mask, (out + i + 300), v2, vl);
> -             }
> -         }  */
> -         for (const auto block : blocks)
> +  bool any_set_in_bbs_p = false;
> +  for (const basic_block pred_cfg_bb : pred_cfg_bbs)
> +    {
> +      insn_info *def_insn = extract_single_source (set);
> +      if (def_insn)
> +       {
> +         /* Condition 3:
> +
> +           Case 1:                               Case 2:
> +               bb 0:                                 bb 0:
> +                 def a5 101                             ...
> +               bb 1:                                 bb 1:
> +                 ...                                    ...
> +               bb 2:                                 bb 2:
> +                 RVV 1 (use a5 with TAIL ANY)           ...
> +               bb 3:                                 bb 3:
> +                 def a5 101                             def a5 101
> +               bb 4:                                 bb 4:
> +                 ...                                    ...
> +               bb 5:                                 bb 5:
> +                 RVV 2 (use a5 with TU)                 RVV 1 (use a5)
> +
> +           Case 1: We can pollute BB3, BB2, BB1 and BB0 as dirty blocks
> +           with killed AVL so that we can merge TU demand info from RVV 2
> +           into RVV 1 and elide the vsetvl instruction in BB5.
> +
> +           TODO: We only optimize for single source def since multiple
> source
> +           def is quite complicated.
> +
> +           Case 2: We can only pollute bb 3 as dirty; it has been
> accepted
> +           in Condition 2 and we can't pollute BB3, BB2, BB1 and BB0 like
> case
> +         insn_info *last_killed_insn
> +           = find_reg_killed_by (crtl->ssa->bb (pred_cfg_bb), avl);
> +         if (!last_killed_insn || pred_cfg_bb == def_insn->bb ()->cfg_bb
> ())
> +           continue;
> +         if (source_equal_p (last_killed_insn, def_insn))
>             {
> -             if (block == ENTRY_BLOCK_PTR_FOR_FN (cfun))
> -               continue;
> -             sbitmap avout = m_vector_manager->vector_avout[block->index];
> -             EXECUTE_IF_SET_IN_BITMAP (avout, 0, bb_index, sbi)
> -             {
> -               const vector_insn_info *expr
> -                 = m_vector_manager->vector_exprs[bb_index];
> -               if (expr->compatible_avl_p (info))
> -                 return KILLED_AVL_FUSION;
> -             }
> +             any_set_in_bbs_p = true;
> +             break;
>             }
> -         return INVALID_FUSION;
>         }
>        else
>         {
> -         /* void f (int8_t * restrict in, int8_t * restrict out, int n,
> int
> -             m, unsigned cond, size_t vl)
> -             {
> -               vbool64_t mask = *(vbool64_t *) (in + 1000000);
> -
> -               vl = 101;
> -               if (cond > 0)
> -                 {
> -                 vint8mf8_t v = __riscv_vle8_v_i8mf8 (in, vl);
> -                 __riscv_vse8_v_i8mf8 (out, v, vl);
> -                 }
> -               else
> -                 {
> -                 vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + 1000, vl);
> -                 __riscv_vse8_v_i8mf8 (out + 1000, v, vl);
> -                 }
> -
> -               for (size_t i = 0; i < n; i++)
> -                 {
> -                 vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((in + i +
> 200), vl);
> -                 __riscv_vse32_v_f32mf2 ((out + i + 200), v, vl);
> -
> -                 vfloat32mf2_t v2
> -                   = __riscv_vle32_v_f32mf2_tumu (mask, v, (in + i +
> 300), vl);
> -                 __riscv_vse32_v_f32mf2_m (mask, (out + i + 300), v2, vl);
> -                 }
> -         }  */
> -         EXECUTE_IF_SET_IN_BITMAP (bitdata, 0, bb_index, sbi)
> -         {
> -           const vector_insn_info *expr
> -             = m_vector_manager->vector_exprs[bb_index];
> -           if (expr->compatible_avl_p (info))
> -             return KILLED_AVL_FUSION;
> -         }
> +         /* Condition 4:
> +
> +             bb 0:        bb 1:         bb 3:
> +               def1 a5       def2 a5     ...
> +                   \         /            /
> +                    \       /            /
> +                     \     /            /
> +                      \   /            /
> +                       bb 4:          /
> +                        |            /
> +                        |           /
> +                       bb 5:       /
> +                        |         /
> +                        |        /
> +                       bb 6:    /
> +                        |      /
> +                        |     /
> +                         bb 8:
> +                       RVV 1 (use a5)
> +         If we get-def (REAL) of a5 from RVV 1 instruction, we will get
> +         def1 from BB0 and def2 from BB1. So we will pollute BB6,BB5,BB4,
> +         BB0,BB1 with DIRTY and set BB3 as HARD_EMPTY so that we won't
> +         propagate AVL to BB3.  */
> +         if (any_set_in_bb_p (sets, crtl->ssa->bb (pred_cfg_bb)))
> +           {
> +             any_set_in_bbs_p = true;
> +             break;
> +           }
>         }
> -      return INVALID_FUSION;
>      }
> -
> -  return prop.dirty_with_killed_avl_p () ? KILLED_AVL_FUSION :
> VALID_AVL_FUSION;
> +  if (!any_set_in_bbs_p)
> +    return true;
> +  return false;
>  }
>
>  /* Compute global backward demanded info.  */
> @@ -2272,6 +2566,8 @@ pass_vsetvl::backward_demand_fusion (void)
>
>           if (block_info.reaching_out.unknown_p ())
>             continue;
> +         else if (block_info.reaching_out.hard_empty_p ())
> +           continue;
>           else if (block_info.reaching_out.empty_p ())
>             {
>               enum fusion_type type
> @@ -2281,6 +2577,17 @@ pass_vsetvl::backward_demand_fusion (void)
>
>               block_info.reaching_out = prop;
>               block_info.reaching_out.set_dirty (type);
> +
> +             if (prop.has_avl_reg () && !vlmax_avl_p (prop.get_avl ()))
> +               {
> +                 hash_set<set_info *> sets
> +                   = get_all_sets (prop.get_avl_source (), true, true,
> true);
> +                 set_info *set = get_same_bb_set (sets, e->src);
> +                 if (set)
> +                   block_info.reaching_out.set_avl_info (
> +                     avl_info (prop.get_avl (), set));
> +               }
> +
>               block_info.local_dem = block_info.reaching_out;
>               block_info.probability = curr_block_info.probability;
>               changed_p = true;
> @@ -2294,22 +2601,28 @@ pass_vsetvl::backward_demand_fusion (void)
>                 {
>                   if (block_info.reaching_out >= prop)
>                     continue;
> -                 block_info.probability += curr_block_info.probability;
>                   new_info = block_info.reaching_out.merge (prop,
> GLOBAL_MERGE);
> +                 new_info.set_dirty (
> +                   block_info.reaching_out.dirty_with_killed_avl_p ());
> +                 block_info.probability += curr_block_info.probability;
>                 }
>               else
>                 {
>                   if (curr_block_info.probability > block_info.probability)
>                     {
> +                     enum fusion_type type
> +                       = get_backward_fusion_type (crtl->ssa->bb (e->src),
> +                                                   prop);
> +                     if (type == INVALID_FUSION)
> +                       continue;
>                       new_info = prop;
> +                     new_info.set_dirty (type);
>                       block_info.probability = curr_block_info.probability;
>                     }
>                   else
>                     continue;
>                 }
>
> -             new_info.set_dirty (
> -               block_info.reaching_out.dirty_with_killed_avl_p ());
>               block_info.local_dem = new_info;
>               block_info.reaching_out = new_info;
>               changed_p = true;
> @@ -2319,10 +2632,28 @@ pass_vsetvl::backward_demand_fusion (void)
>               /* We not only change the info during backward propagation,
>                  but also change the VSETVL instruction.  */
>               gcc_assert (block_info.reaching_out.valid_p ());
> -             if (!block_info.reaching_out.compatible_p (prop))
> -               continue;
> -             if (block_info.reaching_out >= prop)
> -               continue;
> +             hash_set<set_info *> sets
> +               = get_all_sets (prop.get_avl_source (), true, false,
> false);
> +             set_info *set = get_same_bb_set (sets, e->src);
> +             if (vsetvl_insn_p (block_info.reaching_out.get_insn ()->rtl
> ())
> +                 && prop.has_avl_reg () && !vlmax_avl_p (prop.get_avl ()))
> +               {
> +                 if (!block_info.reaching_out.same_vlmax_p (prop))
> +                   continue;
> +                 if (block_info.reaching_out.same_vtype_p (prop))
> +                   continue;
> +                 if (!set)
> +                   continue;
> +                 if (set->insn () != block_info.reaching_out.get_insn ())
> +                   continue;
> +               }
> +             else
> +               {
> +                 if (!block_info.reaching_out.compatible_p (prop))
> +                   continue;
> +                 if (block_info.reaching_out >= prop)
> +                   continue;
> +               }
>
>               vector_insn_info be_merged = block_info.reaching_out;
>               if (block_info.local_dem == block_info.reaching_out)
> @@ -2410,8 +2741,8 @@ pass_vsetvl::forward_demand_fusion (void)
>           if (local_dem.dirty_p ())
>             {
>               gcc_assert (local_dem == reaching_out);
> +             new_info.set_dirty (local_dem.dirty_with_killed_avl_p ());
>               local_dem = new_info;
> -             local_dem.set_dirty (local_dem.dirty_with_killed_avl_p ());
>               reaching_out = local_dem;
>             }
>           else
> @@ -2439,9 +2770,6 @@ pass_vsetvl::demand_fusion (void)
>    while (changed_p)
>      {
>        changed_p = false;
> -      prune_expressions ();
> -      m_vector_manager->create_bitmap_vectors ();
> -      compute_local_properties ();
>        /* To optimize the case like this:
>          void f2 (int8_t * restrict in, int8_t * restrict out, int n, int
> cond)
>            {
> @@ -2475,12 +2803,22 @@ pass_vsetvl::demand_fusion (void)
>         an AVL kill instruction in bb 2 that we can't backward fuse bb 3 or
>         forward bb 1 arbitrarily. We need available information of each
> block to
>         help for such cases.  */
> +      changed_p |= backward_demand_fusion ();
> +      changed_p |= forward_demand_fusion ();
> +    }
> +
> +  changed_p = true;
> +  while (changed_p)
> +    {
> +      changed_p = false;
> +      prune_expressions ();
> +      m_vector_manager->create_bitmap_vectors ();
> +      compute_local_properties ();
>        compute_available (m_vector_manager->vector_comp,
>                          m_vector_manager->vector_kill,
>                          m_vector_manager->vector_avout,
>                          m_vector_manager->vector_avin);
> -      changed_p |= backward_demand_fusion ();
> -      changed_p |= forward_demand_fusion ();
> +      changed_p |= cleanup_illegal_dirty_blocks ();
>        m_vector_manager->free_bitmap_vectors ();
>        if (!m_vector_manager->vector_exprs.is_empty ())
>         m_vector_manager->vector_exprs.release ();
> @@ -2498,6 +2836,34 @@ pass_vsetvl::demand_fusion (void)
>      }
>  }
>
> +/* Cleanup illegal dirty blocks.  */
> +bool
> +pass_vsetvl::cleanup_illegal_dirty_blocks (void)
> +{
> +  bool changed_p = false;
> +  for (const bb_info *bb : crtl->ssa->bbs ())
> +    {
> +      basic_block cfg_bb = bb->cfg_bb ();
> +      const auto &prop
> +       = m_vector_manager->vector_block_infos[cfg_bb->index].reaching_out;
> +
> +      /* If there is nothing to cleanup, just skip it.  */
> +      if (!prop.valid_or_dirty_p ())
> +       continue;
> +
> +      if (hard_empty_block_p (bb, prop))
> +       {
> +         m_vector_manager->vector_block_infos[cfg_bb->index].local_dem
> +           = vector_insn_info::get_hard_empty ();
> +         m_vector_manager->vector_block_infos[cfg_bb->index].reaching_out
> +           = vector_insn_info::get_hard_empty ();
> +         changed_p = true;
> +         continue;
> +       }
> +    }
> +  return changed_p;
> +}
> +
>  /* Assemble the candidates expressions for LCM.  */
>  void
>  pass_vsetvl::prune_expressions (void)
> @@ -2614,18 +2980,20 @@ pass_vsetvl::compute_local_properties (void)
>              as long as the all real def insn of avl do not come from this
>              block. This configuration may be still missing some
> optimization
>              opportunities.  */
> -         if (reg_killed_by_bb_p (bb, expr->get_avl ()))
> +         if (find_reg_killed_by (bb, expr->get_avl ()))
>             {
> -             hash_set<insn_info *> insns = get_all_nonphi_defs (
> -               safe_dyn_cast<phi_info *> (expr->get_avl_source ()));
> -             if (any_insn_in_bb_p (insns, bb))
> +             hash_set<set_info *> sets
> +               = get_all_sets (expr->get_avl_source (), true, false,
> false);
> +             if (any_set_in_bb_p (sets, bb))
>                 bitmap_clear_bit
> (m_vector_manager->vector_transp[curr_bb_idx],
>                                   i);
>             }
>         }
>
>        /* Compute anticipatable occurrences.  */
> -      if (local_dem.valid_p () || local_dem.real_dirty_p ())
> +      if (local_dem.valid_p () || local_dem.real_dirty_p ()
> +         || (has_vsetvl_killed_avl_p (bb, local_dem)
> +             && vlmax_avl_p (local_dem.get_avl ())))
>         if (anticipatable_occurrence_p (bb, local_dem))
>           bitmap_set_bit (m_vector_manager->vector_antic[curr_bb_idx],
>                           m_vector_manager->get_expr_id (local_dem));
> @@ -2693,7 +3061,8 @@ pass_vsetvl::compute_local_properties (void)
>
>  /* Return true if VSETVL in the block can be refined as vsetvl
> zero,zero.  */
>  bool
> -pass_vsetvl::can_refine_vsetvl_p (const basic_block cfg_bb, uint8_t
> ratio) const
> +pass_vsetvl::can_refine_vsetvl_p (const basic_block cfg_bb,
> +                                 const vector_insn_info &info) const
>  {
>    if (!m_vector_manager->all_same_ratio_p (
>         m_vector_manager->vector_avin[cfg_bb->index]))
> @@ -2705,7 +3074,9 @@ pass_vsetvl::can_refine_vsetvl_p (const basic_block
> cfg_bb, uint8_t ratio) const
>
>    size_t expr_id
>      = bitmap_first_set_bit (m_vector_manager->vector_avin[cfg_bb->index]);
> -  if (m_vector_manager->vector_exprs[expr_id]->get_ratio () != ratio)
> +  if (!m_vector_manager->vector_exprs[expr_id]->same_vlmax_p (info))
> +    return false;
> +  if (!m_vector_manager->vector_exprs[expr_id]->compatible_avl_p (info))
>      return false;
>
>    edge e;
> @@ -2748,7 +3119,7 @@ pass_vsetvl::refine_vsetvls (void) const
>         continue;
>
>        rtx_insn *rinsn = insn->rtl ();
> -      if (!can_refine_vsetvl_p (cfg_bb, info.get_ratio ()))
> +      if (!can_refine_vsetvl_p (cfg_bb, info))
>         continue;
>
>        if (!vector_config_insn_p (rinsn))
> @@ -2871,7 +3242,7 @@ pass_vsetvl::commit_vsetvls (void)
>         }
>
>        rtx new_pat;
> -      if (can_refine_vsetvl_p (cfg_bb, reaching_out.get_ratio ()))
> +      if (can_refine_vsetvl_p (cfg_bb, reaching_out))
>         new_pat
>           = gen_vsetvl_pat (VSETVL_VTYPE_CHANGE_ONLY, reaching_out,
> NULL_RTX);
>        else if (vlmax_avl_p (reaching_out.get_avl ()))
> @@ -2965,6 +3336,115 @@ pass_vsetvl::cleanup_insns (void) const
>      }
>  }
>
> +void
> +pass_vsetvl::propagate_avl (void) const
> +{
> +  /* Rebuild the RTL_SSA according to the new CFG generated by LCM.  */
> +  /* Finalization of RTL_SSA.  */
> +  free_dominance_info (CDI_DOMINATORS);
> +  if (crtl->ssa->perform_pending_updates ())
> +    cleanup_cfg (0);
> +  delete crtl->ssa;
> +  crtl->ssa = nullptr;
> +  /* Initialization of RTL_SSA.  */
> +  calculate_dominance_info (CDI_DOMINATORS);
> +  df_analyze ();
> +  crtl->ssa = new function_info (cfun);
> +
> +  hash_set<rtx_insn *> to_delete;
> +  for (const bb_info *bb : crtl->ssa->bbs ())
> +    {
> +      for (insn_info *insn : bb->real_nondebug_insns ())
> +       {
> +         if (vsetvl_discard_result_insn_p (insn->rtl ()))
> +           {
> +             rtx avl = get_avl (insn->rtl ());
> +             if (!REG_P (avl))
> +               continue;
> +
> +             set_info *set = find_access (insn->uses (), REGNO
> (avl))->def ();
> +             insn_info *def_insn = extract_single_source (set);
> +             if (!def_insn)
> +               continue;
> +
> +             /* Handle this case:
> +                vsetvli        a6,zero,e32,m1,ta,mu
> +                li     a5,4096
> +                add    a7,a0,a5
> +                addi   a7,a7,-96
> +                vsetvli        t1,zero,e8,mf8,ta,ma
> +                vle8.v v24,0(a7)
> +                add    a5,a3,a5
> +                addi   a5,a5,-96
> +                vse8.v v24,0(a5)
> +                vsetvli        zero,a6,e32,m1,tu,ma
> +             */
> +             if (vsetvl_insn_p (def_insn->rtl ()))
> +               {
> +                 vl_vtype_info def_info = get_vl_vtype_info (def_insn);
> +                 vl_vtype_info info = get_vl_vtype_info (insn);
> +                 rtx avl = get_avl (def_insn->rtl ());
> +                 rtx vl = get_vl (def_insn->rtl ());
> +                 if (def_info.get_ratio () == info.get_ratio ())
> +                   {
> +                     if (vlmax_avl_p (def_info.get_avl ()))
> +                       {
> +                         info.set_avl_info (
> +                           avl_info (def_info.get_avl (), nullptr));
> +                         rtx new_pat
> +                           = gen_vsetvl_pat (VSETVL_NORMAL, info, vl);
> +                         validate_change (insn->rtl (),
> +                                          &PATTERN (insn->rtl ()),
> new_pat,
> +                                          false);
> +                         continue;
> +                       }
> +                     if (def_info.has_avl_imm () || rtx_equal_p (avl, vl))
> +                       {
> +                         info.set_avl_info (avl_info (avl, nullptr));
> +                         emit_vsetvl_insn (VSETVL_DISCARD_RESULT,
> EMIT_AFTER,
> +                                           info, NULL_RTX, insn->rtl ());
> +                         if (set->single_nondebug_insn_use ())
> +                           {
> +                             to_delete.add (insn->rtl ());
> +                             to_delete.add (def_insn->rtl ());
> +                           }
> +                         continue;
> +                       }
> +                   }
> +               }
> +           }
> +
> +         /* Change vsetvl rd, rs1 --> vsetvl zero, rs1,
> +            if rd is not used by any nondebug instructions.
> +            Even though this PASS runs after RA and it doesn't help to
> +            reduce register pressure, it can help instruction scheduling
> +            since we remove the dependencies.
> +         if (vsetvl_insn_p (insn->rtl ()))
> +           {
> +             rtx vl = get_vl (insn->rtl ());
> +             rtx avl = get_avl (insn->rtl ());
> +             if (vlmax_avl_p (avl))
> +               continue;
> +             def_info *def = find_access (insn->defs (), REGNO (vl));
> +             set_info *set = safe_dyn_cast<set_info *> (def);
> +             gcc_assert (set);
> +             const vl_vtype_info info = get_vl_vtype_info (insn);
> +             rtx new_pat
> +               = gen_vsetvl_pat (VSETVL_DISCARD_RESULT, info, NULL_RTX);
> +             if (!set->has_nondebug_insn_uses ())
> +               {
> +                 validate_change (insn->rtl (), &PATTERN (insn->rtl ()),
> +                                  new_pat, false);
> +                 continue;
> +               }
> +           }
> +       }
> +    }
> +
> +  for (rtx_insn *rinsn : to_delete)
> +    eliminate_insn (rinsn);
> +}
> +
>  void
>  pass_vsetvl::init (void)
>  {
> @@ -3083,6 +3563,12 @@ pass_vsetvl::lazy_vsetvl (void)
>    if (dump_file)
>      fprintf (dump_file, "\nPhase 5: Cleanup AVL and VL operands\n");
>    cleanup_insns ();
> +
> +  /* Phase 6 - Rebuild RTL_SSA to propagate AVL between vsetvls.  */
> +  if (dump_file)
> +    fprintf (dump_file,
> +            "\nPhase 6: Rebuild RTL_SSA to propagate AVL between
> vsetvls\n");
> +  propagate_avl ();
>  }
>
>  /* Main entry point for this pass.  */
> diff --git a/gcc/config/riscv/riscv-vsetvl.h
> b/gcc/config/riscv/riscv-vsetvl.h
> index 3b68bf638ae..4177b3e851b 100644
> --- a/gcc/config/riscv/riscv-vsetvl.h
> +++ b/gcc/config/riscv/riscv-vsetvl.h
> @@ -65,6 +65,21 @@ enum merge_type
>    GLOBAL_MERGE
>  };
>
> +enum def_type
> +{
> +  REAL_SET = 1 << 0,
> +  PHI_SET = 1 << 1,
> +  BB_HEAD_SET = 1 << 2,
> +  BB_END_SET = 1 << 3,
> +  /* ??? TODO: In RTL_SSA framework, we have REAL_SET,
> +     PHI_SET, BB_HEAD_SET, BB_END_SET and
> +     CLOBBER_DEF def_info types. Currently,
> +     we conservatively do not optimize clobber
> +     def since we don't see the case that we
> +     need to optimize it.  */
> +  CLOBBER_DEF = 1 << 4
> +};
> +
>  /* AVL info for RVV instruction. Most RVV instructions have AVL operand in
>     implicit dependency. The AVL comparison between 2 RVV instructions is
>     very important since it affects our decision whether we should insert
> @@ -143,6 +158,7 @@ public:
>    rtx get_value () const { return m_value; }
>    rtl_ssa::set_info *get_source () const { return m_source; }
>    bool single_source_equal_p (const avl_info &) const;
> +  bool multiple_source_equal_p (const avl_info &) const;
>    avl_info &operator= (const avl_info &);
>    bool operator== (const avl_info &) const;
>    bool operator!= (const avl_info &) const;
> @@ -210,6 +226,8 @@ private:
>      VALID,
>      UNKNOWN,
>      EMPTY,
> +    /* The empty block cannot be polluted as dirty.  */
> +    HARD_EMPTY,
>
>      /* The block is polluted as containing VSETVL instruction during dem
>         backward propagation to gain better LCM optimization even though
> @@ -280,7 +298,8 @@ public:
>    bool uninit_p () const { return m_state == UNINITIALIZED; }
>    bool valid_p () const { return m_state == VALID; }
>    bool unknown_p () const { return m_state == UNKNOWN; }
> -  bool empty_p () const { return m_state == EMPTY; }
> +  bool empty_p () const { return m_state == EMPTY || m_state ==
> HARD_EMPTY; }
> +  bool hard_empty_p () const { return m_state == HARD_EMPTY; }
>    bool dirty_p () const
>    {
>      return m_state == DIRTY || m_state == DIRTY_WITH_KILLED_AVL;
> @@ -295,6 +314,7 @@ public:
>      return m_state == VALID || m_state == DIRTY
>            || m_state == DIRTY_WITH_KILLED_AVL;
>    }
> +  bool available_p (const vector_insn_info &) const;
>
>    static vector_insn_info get_unknown ()
>    {
> @@ -303,9 +323,17 @@ public:
>      return info;
>    }
>
> +  static vector_insn_info get_hard_empty ()
> +  {
> +    vector_insn_info info;
> +    info.set_hard_empty ();
> +    return info;
> +  }
> +
>    void set_valid () { m_state = VALID; }
>    void set_unknown () { m_state = UNKNOWN; }
>    void set_empty () { m_state = EMPTY; }
> +  void set_hard_empty () { m_state = HARD_EMPTY; }
>    void set_dirty (enum fusion_type type)
>    {
>      gcc_assert (type == VALID_AVL_FUSION || type == KILLED_AVL_FUSION);
> --
> 2.36.3
>
>

      reply	other threads:[~2023-01-27 12:31 UTC|newest]

Thread overview: 2+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-01-18  3:24 juzhe.zhong
2023-01-27 12:31 ` Kito Cheng [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to='CA+yXCZCs4p7uN2O=Hk+=Am0Ra3dvEDk7vN0zb6VxKNe5uEPx8Q@mail.gmail.com' \
    --to=kito.cheng@gmail.com \
    --cc=gcc-patches@gcc.gnu.org \
    --cc=juzhe.zhong@rivai.ai \
    --cc=palmer@dabbelt.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).