* [PATCH V2] VECT: Support floating-point in-order reduction for length loop control
  2023-07-21 10:05 UTC, juzhe.zhong
  To: gcc-patches; Cc: richard.sandiford, rguenther, Ju-Zhe Zhong

From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

Hi Richard and Richi,

This patch supports floating-point in-order reduction for length-based
loop control.

Consider the following case:

float foo (float *__restrict a, int n)
{
  float result = 1.0;
  for (int i = 0; i < n; i++)
    result += a[i];
  return result;
}

When compiled **without** -ffast-math on ARM SVE, we end up with:

  loop_mask = WHILE_ULT
  result = MASK_FOLD_LEFT_PLUS (...loop_mask...)

RVV uses length-based loop control instead of a mask, so with this
patch we expect to see:

  loop_len = SELECT_VL
  result = MASK_LEN_FOLD_LEFT_PLUS (...loop_len...)

gcc/ChangeLog:

	* tree-vect-loop.cc (get_masked_reduction_fn): Add mask_len_fold_left.
	(vectorize_fold_left_reduction): Ditto.
	(vectorizable_reduction): Ditto.
	(vect_transform_reduction): Ditto.
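The transformation described above can be modelled in scalar C.  This is a
hedged sketch, not GCC code: the helper names are invented, and SELECT_VL is
modelled as min (remaining, vf), whereas real hardware may legally pick other
lengths.  The point it illustrates is why MASK_LEN_FOLD_LEFT_PLUS is usable
without -ffast-math: the accumulation is strictly left to right, so the
result is bit-identical to the scalar loop regardless of the vector length.

```c
#include <assert.h>
#include <stddef.h>

/* Scalar model of an in-order (fold-left) reduction over the first
   LEN elements: strictly left-to-right accumulation, so no FP
   reassociation occurs.  This is the effect of
   MASK_LEN_FOLD_LEFT_PLUS with an all-ones mask.  */
static float
fold_left_plus (float acc, const float *v, size_t len)
{
  for (size_t i = 0; i < len; i++)
    acc += v[i];
  return acc;
}

/* Length-controlled strip-mined loop.  SELECT_VL is modelled here as
   min (remaining, vf); VF stands in for the hardware vector length.  */
static float
foo_length_controlled (const float *a, size_t n, size_t vf)
{
  float result = 1.0f;
  for (size_t i = 0; i < n; )
    {
      size_t len = (n - i) < vf ? (n - i) : vf;  /* SELECT_VL model */
      result = fold_left_plus (result, a + i, len);
      i += len;
    }
  return result;
}
```

Because the accumulation is in order, running the model with different vf
values produces the same result as the plain scalar loop; an out-of-order
(tree) reduction would not have that guarantee for floating point.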
---
 gcc/tree-vect-loop.cc | 41 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 36 insertions(+), 5 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index b44fb9c7712..59ab7879d55 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -6800,11 +6800,13 @@ static internal_fn
 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
 {
   internal_fn mask_reduc_fn;
+  internal_fn mask_len_reduc_fn;

   switch (reduc_fn)
     {
     case IFN_FOLD_LEFT_PLUS:
       mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
+      mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
       break;

     default:
@@ -6814,6 +6816,9 @@ get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
   if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
				       OPTIMIZE_FOR_SPEED))
     return mask_reduc_fn;
+  if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
+				      OPTIMIZE_FOR_SPEED))
+    return mask_len_reduc_fn;
   return IFN_LAST;
 }

@@ -6834,7 +6839,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
			       gimple *reduc_def_stmt,
			       tree_code code, internal_fn reduc_fn,
			       tree ops[3], tree vectype_in,
-			       int reduc_index, vec_loop_masks *masks)
+			       int reduc_index, vec_loop_masks *masks,
+			       vec_loop_lens *lens)
 {
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
@@ -6896,8 +6902,18 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
     {
       gimple *new_stmt;
       tree mask = NULL_TREE;
+      tree len = NULL_TREE;
+      tree bias = NULL_TREE;
       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
	mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
+      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
+	{
+	  len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
+				   i, 1);
+	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+	  bias = build_int_cst (intQI_type_node, biasval);
+	  mask = build_minus_one_cst (truth_type_for (vectype_in));
+	}

       /* Handle MINUS by adding the negative.  */
       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
@@ -6917,7 +6933,10 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
	 the preceding operation.  */
       if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
	{
-	  if (mask && mask_reduc_fn != IFN_LAST)
+	  if (len && mask && mask_reduc_fn != IFN_LAST)
+	    new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
+						   def0, mask, len, bias);
+	  else if (mask && mask_reduc_fn != IFN_LAST)
	    new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
						   def0, mask);
	  else
@@ -7979,6 +7998,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
     {
       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+      vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
       internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);

       if (reduction_type != FOLD_LEFT_REDUCTION
@@ -8006,8 +8026,17 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
	  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
	}
       else
-	vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
-			       vectype_in, NULL);
+	{
+	  internal_fn mask_reduc_fn
+	    = get_masked_reduction_fn (reduc_fn, vectype_in);
+
+	  if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
+	    vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
+				  vectype_in, 1);
+	  else
+	    vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
+				   vectype_in, NULL);
+	}
     }
   return true;
 }
@@ -8137,6 +8166,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
   code_helper code = canonicalize_code (op.code, op.type);
   internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+  vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);

   /* Transform.  */
@@ -8162,7 +8192,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
   gcc_assert (code.is_tree_code ());
   return vectorize_fold_left_reduction
     (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
-     tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks);
+     tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks,
+     lens);
 }

 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
--
2.36.3
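As a rough model of the 5-operand internal call the patch emits,
MASK_LEN_FOLD_LEFT_PLUS (reduc_var, def0, mask, len, bias), the sketch below
assumes the usual convention for GCC's length-controlled internal functions:
the call operates on the first len + bias elements (bias is 0 on most targets
and -1 on targets with biased length operands, mirroring len_load/len_store),
and the patch passes an all-ones mask so only the length limits the
reduction.  The function name is invented for illustration.

```c
#include <assert.h>
#include <stddef.h>

/* Hypothetical scalar model of
     MASK_LEN_FOLD_LEFT_PLUS (reduc_var, def0, mask, len, bias):
   reduce the first (len + bias) elements whose mask bit is set,
   strictly in order.  With the all-ones mask the patch builds, the
   length operand alone decides how many elements participate.  */
static float
mask_len_fold_left_plus_model (float reduc_var, const float *def0,
                               const unsigned char *mask,
                               ptrdiff_t len, int bias)
{
  for (ptrdiff_t i = 0; i < len + bias; i++)
    if (mask[i])
      reduc_var += def0[i];   /* in-order, like the scalar loop */
  return reduc_var;
}
```

Under this model a target with bias -1 passes a length one larger than the
element count, which is why the patch materializes the bias from
LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS as an extra QImode constant operand.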
* Re: [PATCH V2] VECT: Support floating-point in-order reduction for length loop control
  2023-07-21 10:51 UTC, Richard Biener
  To: Ju-Zhe Zhong; Cc: gcc-patches, richard.sandiford

On Fri, 21 Jul 2023, juzhe.zhong@rivai.ai wrote:

> [...]
>
> @@ -6917,7 +6933,10 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
> 	 the preceding operation.  */
>        if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
> 	{
> -	  if (mask && mask_reduc_fn != IFN_LAST)
> +	  if (len && mask && mask_reduc_fn != IFN_LAST)

check mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS instead?

> +	    new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
> +						   def0, mask, len, bias);
> +	  else if (mask && mask_reduc_fn != IFN_LAST)

Likewise.

Otherwise looks good to me.

Richard.

> [...]

--
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)
* Re: [PATCH V2] VECT: Support floating-point in-order reduction for length loop control
  2023-07-21 10:59 UTC, juzhe.zhong
  To: rguenther; Cc: gcc-patches, richard.sandiford

Thanks, Richi.

The comments are addressed in V3:
https://gcc.gnu.org/pipermail/gcc-patches/2023-July/625167.html

Bootstrap and regression testing are on the way.

juzhe.zhong@rivai.ai

On Fri, 21 Jul 2023, Richard Biener wrote:
> [...]
* Re: [PATCH V2] VECT: Support floating-point in-order reduction for length loop control
  2023-07-21 11:08 UTC, juzhe.zhong
  To: rguenther; Cc: gcc-patches, richard.sandiford

Sorry, I missed one of the fixes.  It is now fixed as you suggested in V4:
https://gcc.gnu.org/pipermail/gcc-patches/2023-July/625169.html

The code now reads:

  if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
    new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
					   def0, mask, len, bias);
  else if (mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
    new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
					   def0, mask);
  else
    new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);

Sorry for that.  Bootstrap and regression testing are running.

juzhe.zhong@rivai.ai

On Fri, 21 Jul 2023, Richard Biener wrote:
> [...]
* Re: [PATCH V2] VECT: Support floating-point in-order reduction for length loop control
  2023-07-23  4:32 UTC, Lehua Ding
  To: rguenther; Cc: gcc-patches, richard.sandiford, juzhe.zhong

Hi Richard,

Bootstrap and regression tests passed on x86, and no new test cases fail
on AArch64 with the V5 patch:

https://gcc.gnu.org/pipermail/gcc-patches/2023-July/625293.html

Is the V5 patch OK for trunk?

Best,
Lehua
* Re: [PATCH V2] VECT: Support floating-point in-order reduction for length loop control
  2023-07-24  6:44 UTC, Richard Biener
  To: Lehua Ding; Cc: gcc-patches, richard.sandiford, juzhe.zhong

On Sun, 23 Jul 2023, Lehua Ding wrote:

> Hi Richard,
>
> Bootstrap and regression are passed on X86 and no new testcases fail
> on AArch64 with V5 patch:
>
> https://gcc.gnu.org/pipermail/gcc-patches/2023-July/625293.html
>
> V5 patch is ok for trunk?

Yes.