public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
@ 2023-08-11 14:24 juzhe.zhong
  2023-08-14  6:53 ` Richard Biener
  0 siblings, 1 reply; 19+ messages in thread
From: juzhe.zhong @ 2023-08-11 14:24 UTC (permalink / raw)
  To: gcc-patches; +Cc: rguenther, richard.sandiford, Ju-Zhe Zhong

From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

Hi, Richard and Richi.

This patch add support live vectorization by VEC_EXTRACT for LEN loop control.

Consider this following case:

#include <stdint.h>

#define EXTRACT_LAST(TYPE)			\
  TYPE __attribute__ ((noinline, noclone))	\
  test_##TYPE (TYPE *x, int n, TYPE value)	\
  {						\
    TYPE last;					\
    for (int j = 0; j < n; ++j)			\
      {						\
	last = x[j];				\
	x[j] = last * value;			\
      }						\
    return last;				\
  }

#define TEST_ALL(T)				\
  T (uint8_t)					\

TEST_ALL (EXTRACT_LAST)

ARM SVE IR:

Preheader:
  max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... });

Loop:
  ...
  # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)>
  ...
  vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
  vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
  .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
  ...
  next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
  ...

Epilogue:
  _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23);

For RVV since we prefer len in loop control, after this patch for RVV:

Loop:
  ...
  loop_len_22 = SELECT_VL;
  vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22);
  vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
  .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27);
  ...

Epilogue:
  _25 = .VEC_EXTRACT (loop_len_22 + bias - 1, vect_last_12.8_23);

Details of this approach:

1. Step 1 - Add 'vect_can_vectorize_extract_last_with_len_p'  to enable live vectorization
            for LEN loop control.
   
   This function we check whether target support:
    - Use LEN as the loop control.
    - Support VEC_EXTRACT optab.

2. Step 2 - Record LEN for loop control if 'vect_can_vectorize_extract_last_with_len_p' is true.

3. Step 3 - Gerenate VEC_EXTRACT (v, LEN + BIAS - 1).

The only difference between mask and len is that len is using length generated by SELECT_VL and
use VEC_EXTRACT pattern. The rest of the live vectorization is totally the same ARM SVE.

gcc/ChangeLog:

	* tree-vect-loop.cc (vectorizable_live_operation): Add loop len control.

---
 gcc/tree-vect-loop.cc | 78 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 62 insertions(+), 16 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index bf8d677b584..a011e2dacb2 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10278,17 +10278,7 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
       /* No transformation required.  */
       if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
 	{
-	  if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
-					       OPTIMIZE_FOR_SPEED))
-	    {
-	      if (dump_enabled_p ())
-		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-				 "can't operate on partial vectors "
-				 "because the target doesn't support extract "
-				 "last reduction.\n");
-	      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
-	    }
-	  else if (slp_node)
+	  if (slp_node)
 	    {
 	      if (dump_enabled_p ())
 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -10308,9 +10298,28 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
 	  else
 	    {
 	      gcc_assert (ncopies == 1 && !slp_node);
-	      vect_record_loop_mask (loop_vinfo,
-				     &LOOP_VINFO_MASKS (loop_vinfo),
-				     1, vectype, NULL);
+	      if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
+						  OPTIMIZE_FOR_SPEED))
+		vect_record_loop_mask (loop_vinfo,
+				       &LOOP_VINFO_MASKS (loop_vinfo),
+				       1, vectype, NULL);
+	      else if (convert_optab_handler (vec_extract_optab,
+					      TYPE_MODE (vectype),
+					      TYPE_MODE (TREE_TYPE (vectype)))
+		       != CODE_FOR_nothing)
+		vect_record_loop_len (loop_vinfo,
+				      &LOOP_VINFO_LENS (loop_vinfo),
+				      1, vectype, 1);
+	      else
+		{
+		  if (dump_enabled_p ())
+		    dump_printf_loc (
+		      MSG_MISSED_OPTIMIZATION, vect_location,
+		      "can't operate on partial vectors "
+		      "because the target doesn't support extract "
+		      "last reduction.\n");
+		  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+		}
 	    }
 	}
       /* ???  Enable for loop costing as well.  */
@@ -10336,7 +10345,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
   gimple *vec_stmt;
   if (slp_node)
     {
-      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+      gcc_assert (!loop_vinfo
+		  || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+		      && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
 
       /* Get the correct slp vectorized stmt.  */
       vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
@@ -10380,7 +10391,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
 
       gimple_seq stmts = NULL;
       tree new_tree;
-      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
+	{
+	  /* Emit:
+
+	       SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
+
+	     where VEC_LHS is the vectorized live-out result and MASK is
+	     the loop mask for the final iteration.  */
+	  gcc_assert (ncopies == 1 && !slp_node);
+	  gimple_seq tem = NULL;
+	  gimple_stmt_iterator gsi = gsi_last (tem);
+	  tree len
+	    = vect_get_loop_len (loop_vinfo, &gsi,
+				 &LOOP_VINFO_LENS (loop_vinfo),
+				 1, vectype, 0, 0);
+
+	  /* BIAS - 1.  */
+	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+	  tree bias_minus_one
+	    = int_const_binop (MINUS_EXPR,
+			       build_int_cst (TREE_TYPE (len), biasval),
+			       build_one_cst (TREE_TYPE (len)));
+
+	  /* LAST_INDEX = LEN + (BIAS - 1).  */
+	  tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
+					  len, bias_minus_one);
+
+	  /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
+	  tree scalar_res
+	    = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
+			    vec_lhs_phi, last_index);
+
+	  /* Convert the extracted vector element to the scalar type.  */
+	  new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
+	}
+      else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
 	{
 	  /* Emit:
 
-- 
2.36.3


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
  2023-08-11 14:24 [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization juzhe.zhong
@ 2023-08-14  6:53 ` Richard Biener
  2023-08-14  7:09   ` juzhe.zhong
  0 siblings, 1 reply; 19+ messages in thread
From: Richard Biener @ 2023-08-14  6:53 UTC (permalink / raw)
  To: Ju-Zhe Zhong; +Cc: gcc-patches, richard.sandiford

On Fri, 11 Aug 2023, juzhe.zhong@rivai.ai wrote:

> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> 
> Hi, Richard and Richi.
> 
> This patch add support live vectorization by VEC_EXTRACT for LEN loop control.

OK.

Thanks,
Richard.

> Consider this following case:
> 
> #include <stdint.h>
> 
> #define EXTRACT_LAST(TYPE)			\
>   TYPE __attribute__ ((noinline, noclone))	\
>   test_##TYPE (TYPE *x, int n, TYPE value)	\
>   {						\
>     TYPE last;					\
>     for (int j = 0; j < n; ++j)			\
>       {						\
> 	last = x[j];				\
> 	x[j] = last * value;			\
>       }						\
>     return last;				\
>   }
> 
> #define TEST_ALL(T)				\
>   T (uint8_t)					\
> 
> TEST_ALL (EXTRACT_LAST)
> 
> ARM SVE IR:
> 
> Preheader:
>   max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... });
> 
> Loop:
>   ...
>   # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)>
>   ...
>   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
>   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
>   .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
>   ...
>   next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
>   ...
> 
> Epilogue:
>   _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23);
> 
> For RVV since we prefer len in loop control, after this patch for RVV:
> 
> Loop:
>   ...
>   loop_len_22 = SELECT_VL;
>   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22);
>   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
>   .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27);
>   ...
> 
> Epilogue:
>   _25 = .VEC_EXTRACT (loop_len_22 + bias - 1, vect_last_12.8_23);
> 
> Details of this approach:
> 
> 1. Step 1 - Add 'vect_can_vectorize_extract_last_with_len_p'  to enable live vectorization
>             for LEN loop control.
>    
>    This function we check whether target support:
>     - Use LEN as the loop control.
>     - Support VEC_EXTRACT optab.
> 
> 2. Step 2 - Record LEN for loop control if 'vect_can_vectorize_extract_last_with_len_p' is true.
> 
> 3. Step 3 - Gerenate VEC_EXTRACT (v, LEN + BIAS - 1).
> 
> The only difference between mask and len is that len is using length generated by SELECT_VL and
> use VEC_EXTRACT pattern. The rest of the live vectorization is totally the same ARM SVE.
> 
> gcc/ChangeLog:
> 
> 	* tree-vect-loop.cc (vectorizable_live_operation): Add loop len control.
> 
> ---
>  gcc/tree-vect-loop.cc | 78 ++++++++++++++++++++++++++++++++++---------
>  1 file changed, 62 insertions(+), 16 deletions(-)
> 
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index bf8d677b584..a011e2dacb2 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -10278,17 +10278,7 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>        /* No transformation required.  */
>        if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
>  	{
> -	  if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> -					       OPTIMIZE_FOR_SPEED))
> -	    {
> -	      if (dump_enabled_p ())
> -		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -				 "can't operate on partial vectors "
> -				 "because the target doesn't support extract "
> -				 "last reduction.\n");
> -	      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> -	    }
> -	  else if (slp_node)
> +	  if (slp_node)
>  	    {
>  	      if (dump_enabled_p ())
>  		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -10308,9 +10298,28 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>  	  else
>  	    {
>  	      gcc_assert (ncopies == 1 && !slp_node);
> -	      vect_record_loop_mask (loop_vinfo,
> -				     &LOOP_VINFO_MASKS (loop_vinfo),
> -				     1, vectype, NULL);
> +	      if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> +						  OPTIMIZE_FOR_SPEED))
> +		vect_record_loop_mask (loop_vinfo,
> +				       &LOOP_VINFO_MASKS (loop_vinfo),
> +				       1, vectype, NULL);
> +	      else if (convert_optab_handler (vec_extract_optab,
> +					      TYPE_MODE (vectype),
> +					      TYPE_MODE (TREE_TYPE (vectype)))
> +		       != CODE_FOR_nothing)
> +		vect_record_loop_len (loop_vinfo,
> +				      &LOOP_VINFO_LENS (loop_vinfo),
> +				      1, vectype, 1);
> +	      else
> +		{
> +		  if (dump_enabled_p ())
> +		    dump_printf_loc (
> +		      MSG_MISSED_OPTIMIZATION, vect_location,
> +		      "can't operate on partial vectors "
> +		      "because the target doesn't support extract "
> +		      "last reduction.\n");
> +		  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> +		}
>  	    }
>  	}
>        /* ???  Enable for loop costing as well.  */
> @@ -10336,7 +10345,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>    gimple *vec_stmt;
>    if (slp_node)
>      {
> -      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> +      gcc_assert (!loop_vinfo
> +		  || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> +		      && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
>  
>        /* Get the correct slp vectorized stmt.  */
>        vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
> @@ -10380,7 +10391,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>  
>        gimple_seq stmts = NULL;
>        tree new_tree;
> -      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> +      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> +	{
> +	  /* Emit:
> +
> +	       SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
> +
> +	     where VEC_LHS is the vectorized live-out result and MASK is
> +	     the loop mask for the final iteration.  */
> +	  gcc_assert (ncopies == 1 && !slp_node);
> +	  gimple_seq tem = NULL;
> +	  gimple_stmt_iterator gsi = gsi_last (tem);
> +	  tree len
> +	    = vect_get_loop_len (loop_vinfo, &gsi,
> +				 &LOOP_VINFO_LENS (loop_vinfo),
> +				 1, vectype, 0, 0);
> +
> +	  /* BIAS - 1.  */
> +	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +	  tree bias_minus_one
> +	    = int_const_binop (MINUS_EXPR,
> +			       build_int_cst (TREE_TYPE (len), biasval),
> +			       build_one_cst (TREE_TYPE (len)));
> +
> +	  /* LAST_INDEX = LEN + (BIAS - 1).  */
> +	  tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
> +					  len, bias_minus_one);
> +
> +	  /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
> +	  tree scalar_res
> +	    = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
> +			    vec_lhs_phi, last_index);
> +
> +	  /* Convert the extracted vector element to the scalar type.  */
> +	  new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> +	}
> +      else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
>  	{
>  	  /* Emit:
>  
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
  2023-08-14  6:53 ` Richard Biener
@ 2023-08-14  7:09   ` juzhe.zhong
  2023-08-14  8:51     ` Kewen.Lin
  0 siblings, 1 reply; 19+ messages in thread
From: juzhe.zhong @ 2023-08-14  7:09 UTC (permalink / raw)
  To: rguenther; +Cc: gcc-patches, richard.sandiford, linkw

[-- Attachment #1: Type: text/plain, Size: 7263 bytes --]

Thanks Richi.

CC kewen to see whether this patch is suitable for powerpc and s390.



juzhe.zhong@rivai.ai
 
From: Richard Biener
Date: 2023-08-14 14:53
To: Ju-Zhe Zhong
CC: gcc-patches; richard.sandiford
Subject: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
On Fri, 11 Aug 2023, juzhe.zhong@rivai.ai wrote:
 
> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> 
> Hi, Richard and Richi.
> 
> This patch add support live vectorization by VEC_EXTRACT for LEN loop control.
 
OK.
 
Thanks,
Richard.
 
> Consider this following case:
> 
> #include <stdint.h>
> 
> #define EXTRACT_LAST(TYPE) \
>   TYPE __attribute__ ((noinline, noclone)) \
>   test_##TYPE (TYPE *x, int n, TYPE value) \
>   { \
>     TYPE last; \
>     for (int j = 0; j < n; ++j) \
>       { \
> last = x[j]; \
> x[j] = last * value; \
>       } \
>     return last; \
>   }
> 
> #define TEST_ALL(T) \
>   T (uint8_t) \
> 
> TEST_ALL (EXTRACT_LAST)
> 
> ARM SVE IR:
> 
> Preheader:
>   max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... });
> 
> Loop:
>   ...
>   # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)>
>   ...
>   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
>   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
>   .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
>   ...
>   next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
>   ...
> 
> Epilogue:
>   _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23);
> 
> For RVV since we prefer len in loop control, after this patch for RVV:
> 
> Loop:
>   ...
>   loop_len_22 = SELECT_VL;
>   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22);
>   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
>   .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27);
>   ...
> 
> Epilogue:
>   _25 = .VEC_EXTRACT (loop_len_22 + bias - 1, vect_last_12.8_23);
> 
> Details of this approach:
> 
> 1. Step 1 - Add 'vect_can_vectorize_extract_last_with_len_p'  to enable live vectorization
>             for LEN loop control.
>    
>    This function we check whether target support:
>     - Use LEN as the loop control.
>     - Support VEC_EXTRACT optab.
> 
> 2. Step 2 - Record LEN for loop control if 'vect_can_vectorize_extract_last_with_len_p' is true.
> 
> 3. Step 3 - Gerenate VEC_EXTRACT (v, LEN + BIAS - 1).
> 
> The only difference between mask and len is that len is using length generated by SELECT_VL and
> use VEC_EXTRACT pattern. The rest of the live vectorization is totally the same ARM SVE.
> 
> gcc/ChangeLog:
> 
> * tree-vect-loop.cc (vectorizable_live_operation): Add loop len control.
> 
> ---
>  gcc/tree-vect-loop.cc | 78 ++++++++++++++++++++++++++++++++++---------
>  1 file changed, 62 insertions(+), 16 deletions(-)
> 
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index bf8d677b584..a011e2dacb2 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -10278,17 +10278,7 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>        /* No transformation required.  */
>        if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
>  {
> -   if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> -        OPTIMIZE_FOR_SPEED))
> -     {
> -       if (dump_enabled_p ())
> - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> - "can't operate on partial vectors "
> - "because the target doesn't support extract "
> - "last reduction.\n");
> -       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> -     }
> -   else if (slp_node)
> +   if (slp_node)
>      {
>        if (dump_enabled_p ())
>  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -10308,9 +10298,28 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>    else
>      {
>        gcc_assert (ncopies == 1 && !slp_node);
> -       vect_record_loop_mask (loop_vinfo,
> -      &LOOP_VINFO_MASKS (loop_vinfo),
> -      1, vectype, NULL);
> +       if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> +   OPTIMIZE_FOR_SPEED))
> + vect_record_loop_mask (loop_vinfo,
> +        &LOOP_VINFO_MASKS (loop_vinfo),
> +        1, vectype, NULL);
> +       else if (convert_optab_handler (vec_extract_optab,
> +       TYPE_MODE (vectype),
> +       TYPE_MODE (TREE_TYPE (vectype)))
> +        != CODE_FOR_nothing)
> + vect_record_loop_len (loop_vinfo,
> +       &LOOP_VINFO_LENS (loop_vinfo),
> +       1, vectype, 1);
> +       else
> + {
> +   if (dump_enabled_p ())
> +     dump_printf_loc (
> +       MSG_MISSED_OPTIMIZATION, vect_location,
> +       "can't operate on partial vectors "
> +       "because the target doesn't support extract "
> +       "last reduction.\n");
> +   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> + }
>      }
>  }
>        /* ???  Enable for loop costing as well.  */
> @@ -10336,7 +10345,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>    gimple *vec_stmt;
>    if (slp_node)
>      {
> -      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> +      gcc_assert (!loop_vinfo
> +   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> +       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
>  
>        /* Get the correct slp vectorized stmt.  */
>        vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
> @@ -10380,7 +10391,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>  
>        gimple_seq stmts = NULL;
>        tree new_tree;
> -      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> +      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> + {
> +   /* Emit:
> +
> +        SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
> +
> +      where VEC_LHS is the vectorized live-out result and MASK is
> +      the loop mask for the final iteration.  */
> +   gcc_assert (ncopies == 1 && !slp_node);
> +   gimple_seq tem = NULL;
> +   gimple_stmt_iterator gsi = gsi_last (tem);
> +   tree len
> +     = vect_get_loop_len (loop_vinfo, &gsi,
> + &LOOP_VINFO_LENS (loop_vinfo),
> + 1, vectype, 0, 0);
> +
> +   /* BIAS - 1.  */
> +   signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +   tree bias_minus_one
> +     = int_const_binop (MINUS_EXPR,
> +        build_int_cst (TREE_TYPE (len), biasval),
> +        build_one_cst (TREE_TYPE (len)));
> +
> +   /* LAST_INDEX = LEN + (BIAS - 1).  */
> +   tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
> +   len, bias_minus_one);
> +
> +   /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
> +   tree scalar_res
> +     = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
> +     vec_lhs_phi, last_index);
> +
> +   /* Convert the extracted vector element to the scalar type.  */
> +   new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> + }
> +      else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
>  {
>    /* Emit:
>  
> 
 
-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
  2023-08-14  7:09   ` juzhe.zhong
@ 2023-08-14  8:51     ` Kewen.Lin
  2023-08-14  8:58       ` Robin Dapp
  0 siblings, 1 reply; 19+ messages in thread
From: Kewen.Lin @ 2023-08-14  8:51 UTC (permalink / raw)
  To: juzhe.zhong; +Cc: gcc-patches, richard.sandiford, rguenther

Hi Juzhe,

on 2023/8/14 15:09, juzhe.zhong@rivai.ai wrote:
> Thanks Richi.
> 
> CC kewen to see whether this patch is suitable for powerpc and s390.

I did a bootstrapping and regression testing on Power10 (LE) and found a lot of failures.

A short list looks like:

< FAIL: gcc.c-torture/compile/20150108.c   -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions  (internal compiler error: in expand_vec_extract_optab_fn,
at internal-fn.cc:3164)
< FAIL: gcc.c-torture/compile/20150108.c   -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions  (test for excess errors)
< FAIL: gcc.c-torture/compile/20150108.c   -O3 -g  (internal compiler error: in expand_vec_extract_optab_fn, at internal-fn.cc:3164)
< FAIL: gcc.c-torture/compile/20150108.c   -O3 -g  (test for excess errors)
< FAIL: gcc.c-torture/execute/20011126-2.c   -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions  (internal compiler error: in expand_vec_extract_optab_fn,
at internal-fn.cc:3164)
< FAIL: gcc.c-torture/execute/20011126-2.c   -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions  (test for excess errors)
< FAIL: gcc.c-torture/execute/20011126-2.c   -O3 -g  (internal compiler error: in expand_vec_extract_optab_fn, at internal-fn.cc:3164)
< FAIL: gcc.c-torture/execute/20011126-2.c   -O3 -g  (test for excess errors)
< FAIL: gcc.c-torture/execute/pr58419.c   -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions  (internal compiler error: in expand_vec_extract_optab_fn, at
internal-fn.cc:3164)
< FAIL: gcc.c-torture/execute/pr58419.c   -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions  (test for excess errors)
< FAIL: gcc.c-torture/execute/pr58419.c   -O3 -g  (internal compiler error: in expand_vec_extract_optab_fn, at internal-fn.cc:3164)
< FAIL: gcc.c-torture/execute/pr58419.c   -O3 -g  (test for excess errors)
< FAIL: gcc.dg/pr84321.c (internal compiler error: in expand_vec_extract_optab_fn, at internal-fn.cc:3164)
< FAIL: gcc.dg/pr84321.c (test for excess errors)
< FAIL: gcc.dg/torture/pr108793.c   -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions  (internal compiler error: in expand_vec_extract_optab_fn, at internal-fn.cc:3164)
< FAIL: gcc.dg/torture/pr108793.c   -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions  (test for excess errors)
< FAIL: gcc.dg/torture/pr108793.c   -O3 -g  (internal compiler error: in expand_vec_extract_optab_fn, at internal-fn.cc:3164)
< FAIL: gcc.dg/torture/pr108793.c   -O3 -g  (test for excess errors)
< FAIL: gcc.dg/torture/pr51070-2.c   -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions  (internal compiler error: in expand_vec_extract_optab_fn, at
internal-fn.cc:3164)
< FAIL: gcc.dg/torture/pr51070-2.c   -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions  (test for excess errors)
< FAIL: gcc.dg/torture/pr51070-2.c   -O3 -g  (internal compiler error: in expand_vec_extract_optab_fn, at internal-fn.cc:3164)
< FAIL: gcc.dg/torture/pr51070-2.c   -O3 -g  (test for excess errors)
< FAIL: gcc.dg/torture/pr51070.c   -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions  (internal compiler error: in expand_vec_extract_optab_fn, at internal-fn.cc:3164)
< FAIL: gcc.dg/torture/pr51070.c   -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions  (test for excess errors)
< FAIL: gcc.dg/torture/pr51070.c   -O3 -g  (internal compiler error: in expand_vec_extract_optab_fn, at internal-fn.cc:3164)
....

> 
> ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
> juzhe.zhong@rivai.ai
> 
>      
>     *From:* Richard Biener <mailto:rguenther@suse.de>
>     *Date:* 2023-08-14 14:53
>     *To:* Ju-Zhe Zhong <mailto:juzhe.zhong@rivai.ai>
>     *CC:* gcc-patches <mailto:gcc-patches@gcc.gnu.org>; richard.sandiford <mailto:richard.sandiford@arm.com>
>     *Subject:* Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
>     On Fri, 11 Aug 2023, juzhe.zhong@rivai.ai wrote:
>      
>     > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
>     >
>     > Hi, Richard and Richi.
>     >
>     > This patch add support live vectorization by VEC_EXTRACT for LEN loop control.
>      
>     OK.
>      
>     Thanks,
>     Richard.
>      
>     > Consider this following case:
>     >
>     > #include <stdint.h>
>     >
>     > #define EXTRACT_LAST(TYPE) \
>     >   TYPE __attribute__ ((noinline, noclone)) \
>     >   test_##TYPE (TYPE *x, int n, TYPE value) \
>     >   { \
>     >     TYPE last; \
>     >     for (int j = 0; j < n; ++j) \
>     >       { \
>     > last = x[j]; \
>     > x[j] = last * value; \
>     >       } \
>     >     return last; \
>     >   }
>     >
>     > #define TEST_ALL(T) \
>     >   T (uint8_t) \
>     >
>     > TEST_ALL (EXTRACT_LAST)
>     >
>     > ARM SVE IR:
>     >
>     > Preheader:
>     >   max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... });
>     >
>     > Loop:
>     >   ...
>     >   # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)>
>     >   ...
>     >   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
>     >   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
>     >   .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
>     >   ...
>     >   next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
>     >   ...
>     >
>     > Epilogue:
>     >   _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23);
>     >
>     > For RVV since we prefer len in loop control, after this patch for RVV:
>     >
>     > Loop:
>     >   ...
>     >   loop_len_22 = SELECT_VL;
>     >   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22);
>     >   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
>     >   .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27);
>     >   ...
>     >
>     > Epilogue:
>     >   _25 = .VEC_EXTRACT (loop_len_22 + bias - 1, vect_last_12.8_23);
>     >
>     > Details of this approach:
>     >
>     > 1. Step 1 - Add 'vect_can_vectorize_extract_last_with_len_p'  to enable live vectorization
>     >             for LEN loop control.
>     >   
>     >    This function we check whether target support:
>     >     - Use LEN as the loop control.
>     >     - Support VEC_EXTRACT optab.
>     >
>     > 2. Step 2 - Record LEN for loop control if 'vect_can_vectorize_extract_last_with_len_p' is true.
>     >
>     > 3. Step 3 - Gerenate VEC_EXTRACT (v, LEN + BIAS - 1).
>     >
>     > The only difference between mask and len is that len is using length generated by SELECT_VL and
>     > use VEC_EXTRACT pattern. The rest of the live vectorization is totally the same ARM SVE.
>     >
>     > gcc/ChangeLog:
>     >
>     > * tree-vect-loop.cc (vectorizable_live_operation): Add loop len control.
>     >
>     > ---
>     >  gcc/tree-vect-loop.cc | 78 ++++++++++++++++++++++++++++++++++---------
>     >  1 file changed, 62 insertions(+), 16 deletions(-)
>     >
>     > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
>     > index bf8d677b584..a011e2dacb2 100644
>     > --- a/gcc/tree-vect-loop.cc
>     > +++ b/gcc/tree-vect-loop.cc
>     > @@ -10278,17 +10278,7 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>     >        /* No transformation required.  */
>     >        if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
>     >  {
>     > -   if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
>     > -        OPTIMIZE_FOR_SPEED))
>     > -     {
>     > -       if (dump_enabled_p ())
>     > - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>     > - "can't operate on partial vectors "
>     > - "because the target doesn't support extract "
>     > - "last reduction.\n");
>     > -       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
>     > -     }
>     > -   else if (slp_node)
>     > +   if (slp_node)
>     >      {
>     >        if (dump_enabled_p ())
>     >  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>     > @@ -10308,9 +10298,28 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>     >    else
>     >      {
>     >        gcc_assert (ncopies == 1 && !slp_node);
>     > -       vect_record_loop_mask (loop_vinfo,
>     > -      &LOOP_VINFO_MASKS (loop_vinfo),
>     > -      1, vectype, NULL);
>     > +       if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
>     > +   OPTIMIZE_FOR_SPEED))
>     > + vect_record_loop_mask (loop_vinfo,
>     > +        &LOOP_VINFO_MASKS (loop_vinfo),
>     > +        1, vectype, NULL);
>     > +       else if (convert_optab_handler (vec_extract_optab,
>     > +       TYPE_MODE (vectype),
>     > +       TYPE_MODE (TREE_TYPE (vectype)))
>     > +        != CODE_FOR_nothing)
>     > + vect_record_loop_len (loop_vinfo,
>     > +       &LOOP_VINFO_LENS (loop_vinfo),
>     > +       1, vectype, 1);
>     > +       else
>     > + {
>     > +   if (dump_enabled_p ())
>     > +     dump_printf_loc (
>     > +       MSG_MISSED_OPTIMIZATION, vect_location,
>     > +       "can't operate on partial vectors "
>     > +       "because the target doesn't support extract "
>     > +       "last reduction.\n");
>     > +   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
>     > + }
>     >      }
>     >  }
>     >        /* ???  Enable for loop costing as well.  */
>     > @@ -10336,7 +10345,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>     >    gimple *vec_stmt;
>     >    if (slp_node)
>     >      {
>     > -      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
>     > +      gcc_assert (!loop_vinfo
>     > +   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
>     > +       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
>     > 
>     >        /* Get the correct slp vectorized stmt.  */
>     >        vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
>     > @@ -10380,7 +10391,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>     > 
>     >        gimple_seq stmts = NULL;
>     >        tree new_tree;
>     > -      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
>     > +      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
>     > + {
>     > +   /* Emit:
>     > +
>     > +        SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
>     > +
>     > +      where VEC_LHS is the vectorized live-out result and MASK is
>     > +      the loop mask for the final iteration.  */
>     > +   gcc_assert (ncopies == 1 && !slp_node);
>     > +   gimple_seq tem = NULL;
>     > +   gimple_stmt_iterator gsi = gsi_last (tem);
>     > +   tree len
>     > +     = vect_get_loop_len (loop_vinfo, &gsi,
>     > + &LOOP_VINFO_LENS (loop_vinfo),
>     > + 1, vectype, 0, 0);
>     > +
>     > +   /* BIAS - 1.  */
>     > +   signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>     > +   tree bias_minus_one
>     > +     = int_const_binop (MINUS_EXPR,
>     > +        build_int_cst (TREE_TYPE (len), biasval),
>     > +        build_one_cst (TREE_TYPE (len)));
>     > +
>     > +   /* LAST_INDEX = LEN + (BIAS - 1).  */
>     > +   tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
>     > +   len, bias_minus_one);
>     > +
>     > +   /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
>     > +   tree scalar_res
>     > +     = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
>     > +     vec_lhs_phi, last_index);
>     > +
>     > +   /* Convert the extracted vector element to the scalar type.  */
>     > +   new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
>     > + }
>     > +      else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
>     >  {
>     >    /* Emit:
>     > 
>     >
>      
>     -- 
>     Richard Biener <rguenther@suse.de>
>     SUSE Software Solutions Germany GmbH,
>     Frankenstrasse 146, 90461 Nuernberg, Germany;
>     GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
>      
> 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
  2023-08-14  8:51     ` Kewen.Lin
@ 2023-08-14  8:58       ` Robin Dapp
  2023-08-14  9:15         ` Richard Biener
  2023-08-14  9:19         ` Kewen.Lin
  0 siblings, 2 replies; 19+ messages in thread
From: Robin Dapp @ 2023-08-14  8:58 UTC (permalink / raw)
  To: Kewen.Lin, juzhe.zhong
  Cc: rdapp.gcc, gcc-patches, richard.sandiford, rguenther

Hi Kewen,

> I did a bootstrapping and regression testing on Power10 (LE) and found a lot of failures.

I think the problem is that just like for vec_set we're expecting
the vec_extract expander not to fail.  It is probably passed not a
const int here anymore and therefore fails to expand?

can_vec_extract_var_idx_p is supposed to check if the backend
supports extracting a variable index.

Regards
 Robin

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
  2023-08-14  8:58       ` Robin Dapp
@ 2023-08-14  9:15         ` Richard Biener
  2023-08-14  9:19         ` Kewen.Lin
  1 sibling, 0 replies; 19+ messages in thread
From: Richard Biener @ 2023-08-14  9:15 UTC (permalink / raw)
  To: Robin Dapp; +Cc: Kewen.Lin, juzhe.zhong, gcc-patches, richard.sandiford

On Mon, 14 Aug 2023, Robin Dapp wrote:

> Hi Kewen,
> 
> > I did a bootstrapping and regression testing on Power10 (LE) and found a lot of failures.
> 
> I think the problem is that just like for vec_set we're expecting
> the vec_extract expander not to fail.  It is probably passed not a
> const int here anymore and therefore fails to expand?
> 
> can_vec_extract_var_idx_p is supposed to check if the backend
> supports extracting a variable index.

expansion does

  enum insn_code icode = convert_optab_handler (optab, outermode,
                                                extract_mode);

  if (icode != CODE_FOR_nothing)
    {
      create_output_operand (&ops[0], target, extract_mode);
      create_input_operand (&ops[1], src, outermode);
      create_convert_operand_from (&ops[2], pos,
                                   TYPE_MODE (TREE_TYPE (op1)), true);
      if (maybe_expand_insn (icode, 3, ops))
        {
          if (!rtx_equal_p (target, ops[0].value))
            emit_move_insn (target, ops[0].value);
          return;
        }

<--- here

    }
  gcc_unreachable ();

so if maybe_expand_insn fails that would need to be sth we need
to cover in the predicate to check.  But that looks possibly
target dependent?  What does actually fail here?

Richard.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
  2023-08-14  8:58       ` Robin Dapp
  2023-08-14  9:15         ` Richard Biener
@ 2023-08-14  9:19         ` Kewen.Lin
  2023-08-14  9:26           ` juzhe.zhong
  2023-08-14 12:08           ` juzhe.zhong
  1 sibling, 2 replies; 19+ messages in thread
From: Kewen.Lin @ 2023-08-14  9:19 UTC (permalink / raw)
  To: Robin Dapp; +Cc: gcc-patches, richard.sandiford, rguenther, juzhe.zhong

Hi Robin,

on 2023/8/14 16:58, Robin Dapp wrote:
> Hi Kewen,
> 
>> I did a bootstrapping and regression testing on Power10 (LE) and found a lot of failures.
> 
> I think the problem is that just like for vec_set we're expecting
> the vec_extract expander not to fail.  It is probably passed not a
> const int here anymore and therefore fails to expand?

Thanks for the comments!  Yeah, I think the expectation doesn't hold
on Power, as our vec_extract optab only support const index, that
is:

(define_expand "vec_extract<mode><VEC_base_l>"
  [(match_operand:<VEC_base> 0 "register_operand")
   (match_operand:VEC_E 1 "vlogical_operand")
   (match_operand 2 "const_int_operand")]
  "VECTOR_MEM_ALTIVEC_OR_VSX_P (<MODE>mode)"
{
  rs6000_expand_vector_extract (operands[0], operands[1], operands[2]);
  DONE;
})

> 
> can_vec_extract_var_idx_p is supposed to check if the backend
> supports extracting a variable index.

OK, it sounds that this new capability needs to further check with
function can_vec_extract_var_idx_p to ensure the ifn expanding work
as expected.  I re-spined by adding the below as your comments:

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 07f3717ed9d..80ba5cae84a 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10328,7 +10328,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
               else if (convert_optab_handler (vec_extract_optab,
                                               TYPE_MODE (vectype),
                                               TYPE_MODE (TREE_TYPE (vectype)))
-                       != CODE_FOR_nothing)
+                         != CODE_FOR_nothing
+                       && can_vec_extract_var_idx_p (
+                         TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
                 vect_record_loop_len (loop_vinfo,
                                       &LOOP_VINFO_LENS (loop_vinfo),
                                       1, vectype, 1);

BR,
Kewen

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
  2023-08-14  9:19         ` Kewen.Lin
@ 2023-08-14  9:26           ` juzhe.zhong
  2023-08-14 12:07             ` Richard Biener
  2023-08-14 12:08           ` juzhe.zhong
  1 sibling, 1 reply; 19+ messages in thread
From: juzhe.zhong @ 2023-08-14  9:26 UTC (permalink / raw)
  To: linkw, Robin Dapp; +Cc: gcc-patches, richard.sandiford, rguenther

[-- Attachment #1: Type: text/plain, Size: 2703 bytes --]

-                       != CODE_FOR_nothing)
+                         != CODE_FOR_nothing
+                       && can_vec_extract_var_idx_p (
+                         TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))

I think maybe 'can_vec_extract_var_idx_p' check is enough and remove convert_optab_handler (vec_extract_optab,... check.
Looking forward Richi's more comments.

Thanks.


juzhe.zhong@rivai.ai
 
From: Kewen.Lin
Date: 2023-08-14 17:19
To: Robin Dapp
CC: gcc-patches; richard.sandiford; rguenther; juzhe.zhong@rivai.ai
Subject: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
Hi Robin,
 
on 2023/8/14 16:58, Robin Dapp wrote:
> Hi Kewen,
> 
>> I did a bootstrapping and regression testing on Power10 (LE) and found a lot of failures.
> 
> I think the problem is that just like for vec_set we're expecting
> the vec_extract expander not to fail.  It is probably passed not a
> const int here anymore and therefore fails to expand?
 
Thanks for the comments!  Yeah, I think the expectation doesn't hold
on Power, as our vec_extract optab only support const index, that
is:
 
(define_expand "vec_extract<mode><VEC_base_l>"
  [(match_operand:<VEC_base> 0 "register_operand")
   (match_operand:VEC_E 1 "vlogical_operand")
   (match_operand 2 "const_int_operand")]
  "VECTOR_MEM_ALTIVEC_OR_VSX_P (<MODE>mode)"
{
  rs6000_expand_vector_extract (operands[0], operands[1], operands[2]);
  DONE;
})
 
> 
> can_vec_extract_var_idx_p is supposed to check if the backend
> supports extracting a variable index.
 
OK, it sounds that this new capability needs to further check with
function can_vec_extract_var_idx_p to ensure the ifn expanding work
as expected.  I re-spined by adding the below as your comments:
 
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 07f3717ed9d..80ba5cae84a 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10328,7 +10328,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
               else if (convert_optab_handler (vec_extract_optab,
                                               TYPE_MODE (vectype),
                                               TYPE_MODE (TREE_TYPE (vectype)))
-                       != CODE_FOR_nothing)
+                         != CODE_FOR_nothing
+                       && can_vec_extract_var_idx_p (
+                         TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
                 vect_record_loop_len (loop_vinfo,
                                       &LOOP_VINFO_LENS (loop_vinfo),
                                       1, vectype, 1);
 
BR,
Kewen
 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
  2023-08-14  9:26           ` juzhe.zhong
@ 2023-08-14 12:07             ` Richard Biener
  0 siblings, 0 replies; 19+ messages in thread
From: Richard Biener @ 2023-08-14 12:07 UTC (permalink / raw)
  To: juzhe.zhong; +Cc: linkw, Robin Dapp, gcc-patches, richard.sandiford

On Mon, 14 Aug 2023, juzhe.zhong@rivai.ai wrote:

> -                       != CODE_FOR_nothing)
> +                         != CODE_FOR_nothing
> +                       && can_vec_extract_var_idx_p (
> +                         TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
> 
> I think maybe 'can_vec_extract_var_idx_p' check is enough and remove convert_optab_handler (vec_extract_optab,... check.
> Looking forward Richi's more comments.

Yes, I think can_vec_extract_var_idx_p already does that so no need to
duplicate it here.

Richard.

> Thanks.
> 
> 
> juzhe.zhong@rivai.ai
>  
> From: Kewen.Lin
> Date: 2023-08-14 17:19
> To: Robin Dapp
> CC: gcc-patches; richard.sandiford; rguenther; juzhe.zhong@rivai.ai
> Subject: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
> Hi Robin,
>  
> on 2023/8/14 16:58, Robin Dapp wrote:
> > Hi Kewen,
> > 
> >> I did a bootstrapping and regression testing on Power10 (LE) and found a lot of failures.
> > 
> > I think the problem is that just like for vec_set we're expecting
> > the vec_extract expander not to fail.  It is probably passed not a
> > const int here anymore and therefore fails to expand?
>  
> Thanks for the comments!  Yeah, I think the expectation doesn't hold
> on Power, as our vec_extract optab only support const index, that
> is:
>  
> (define_expand "vec_extract<mode><VEC_base_l>"
>   [(match_operand:<VEC_base> 0 "register_operand")
>    (match_operand:VEC_E 1 "vlogical_operand")
>    (match_operand 2 "const_int_operand")]
>   "VECTOR_MEM_ALTIVEC_OR_VSX_P (<MODE>mode)"
> {
>   rs6000_expand_vector_extract (operands[0], operands[1], operands[2]);
>   DONE;
> })
>  
> > 
> > can_vec_extract_var_idx_p is supposed to check if the backend
> > supports extracting a variable index.
>  
> OK, it sounds that this new capability needs to further check with
> function can_vec_extract_var_idx_p to ensure the ifn expanding work
> as expected.  I re-spined by adding the below as your comments:
>  
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 07f3717ed9d..80ba5cae84a 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -10328,7 +10328,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>                else if (convert_optab_handler (vec_extract_optab,
>                                                TYPE_MODE (vectype),
>                                                TYPE_MODE (TREE_TYPE (vectype)))
> -                       != CODE_FOR_nothing)
> +                         != CODE_FOR_nothing
> +                       && can_vec_extract_var_idx_p (
> +                         TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
>                  vect_record_loop_len (loop_vinfo,
>                                        &LOOP_VINFO_LENS (loop_vinfo),
>                                        1, vectype, 1);
>  
> BR,
> Kewen
>  
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
  2023-08-14  9:19         ` Kewen.Lin
  2023-08-14  9:26           ` juzhe.zhong
@ 2023-08-14 12:08           ` juzhe.zhong
  2023-08-14 12:45             ` Kewen.Lin
  1 sibling, 1 reply; 19+ messages in thread
From: juzhe.zhong @ 2023-08-14 12:08 UTC (permalink / raw)
  To: linkw, Robin Dapp; +Cc: gcc-patches, richard.sandiford, rguenther

[-- Attachment #1: Type: text/plain, Size: 2411 bytes --]

Hi, Kewin.

Could you test 'can_vec_extract_var_idx_p' and send V5 patch when you pass the testing?

Thanks.


juzhe.zhong@rivai.ai
 
From: Kewen.Lin
Date: 2023-08-14 17:19
To: Robin Dapp
CC: gcc-patches; richard.sandiford; rguenther; juzhe.zhong@rivai.ai
Subject: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
Hi Robin,
 
on 2023/8/14 16:58, Robin Dapp wrote:
> Hi Kewen,
> 
>> I did a bootstrapping and regression testing on Power10 (LE) and found a lot of failures.
> 
> I think the problem is that just like for vec_set we're expecting
> the vec_extract expander not to fail.  It is probably passed not a
> const int here anymore and therefore fails to expand?
 
Thanks for the comments!  Yeah, I think the expectation doesn't hold
on Power, as our vec_extract optab only support const index, that
is:
 
(define_expand "vec_extract<mode><VEC_base_l>"
  [(match_operand:<VEC_base> 0 "register_operand")
   (match_operand:VEC_E 1 "vlogical_operand")
   (match_operand 2 "const_int_operand")]
  "VECTOR_MEM_ALTIVEC_OR_VSX_P (<MODE>mode)"
{
  rs6000_expand_vector_extract (operands[0], operands[1], operands[2]);
  DONE;
})
 
> 
> can_vec_extract_var_idx_p is supposed to check if the backend
> supports extracting a variable index.
 
OK, it sounds that this new capability needs to further check with
function can_vec_extract_var_idx_p to ensure the ifn expanding work
as expected.  I re-spined by adding the below as your comments:
 
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 07f3717ed9d..80ba5cae84a 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10328,7 +10328,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
               else if (convert_optab_handler (vec_extract_optab,
                                               TYPE_MODE (vectype),
                                               TYPE_MODE (TREE_TYPE (vectype)))
-                       != CODE_FOR_nothing)
+                         != CODE_FOR_nothing
+                       && can_vec_extract_var_idx_p (
+                         TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
                 vect_record_loop_len (loop_vinfo,
                                       &LOOP_VINFO_LENS (loop_vinfo),
                                       1, vectype, 1);
 
BR,
Kewen
 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
  2023-08-14 12:08           ` juzhe.zhong
@ 2023-08-14 12:45             ` Kewen.Lin
  2023-08-14 13:28               ` 钟居哲
  2023-08-14 18:51               ` Stefan Schulze Frielinghaus
  0 siblings, 2 replies; 19+ messages in thread
From: Kewen.Lin @ 2023-08-14 12:45 UTC (permalink / raw)
  To: juzhe.zhong; +Cc: Robin Dapp, richard.sandiford, rguenther, GCC Patches

Hi Juzhe,

on 2023/8/14 20:08, juzhe.zhong@rivai.ai wrote:
> Hi, Kewin.
> 
> Could you test 'can_vec_extract_var_idx_p' and send V5 patch when you pass the testing?

The below diff was bootstrapped and regress-tested on Power10 LE.  Comparing to the
previous v4, the only changes should be the proposed can_vec_extract_var_idx_p and
its required new includes as below:

+#include "memmodel.h"
+#include "optabs.h"
 
Could you have a double check?

Since I just tested it on Power10, you have the full ownership on the patch, I'd leave
the v5 posting to you.  Thanks!

BR,
Kewen
-----
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index bc3063c3615..5ae9f69c7eb 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -32,6 +32,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-pass.h"
 #include "ssa.h"
 #include "optabs-tree.h"
+#include "memmodel.h"
+#include "optabs.h"
 #include "diagnostic-core.h"
 #include "fold-const.h"
 #include "stor-layout.h"
@@ -10300,17 +10302,7 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
       /* No transformation required.  */
       if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
 	{
-	  if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
-					       OPTIMIZE_FOR_SPEED))
-	    {
-	      if (dump_enabled_p ())
-		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-				 "can't operate on partial vectors "
-				 "because the target doesn't support extract "
-				 "last reduction.\n");
-	      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
-	    }
-	  else if (slp_node)
+	  if (slp_node)
 	    {
 	      if (dump_enabled_p ())
 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -10330,9 +10322,26 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
 	  else
 	    {
 	      gcc_assert (ncopies == 1 && !slp_node);
-	      vect_record_loop_mask (loop_vinfo,
-				     &LOOP_VINFO_MASKS (loop_vinfo),
-				     1, vectype, NULL);
+	      if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
+						  OPTIMIZE_FOR_SPEED))
+		vect_record_loop_mask (loop_vinfo,
+				       &LOOP_VINFO_MASKS (loop_vinfo),
+				       1, vectype, NULL);
+	      else if (can_vec_extract_var_idx_p (
+			 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
+		vect_record_loop_len (loop_vinfo,
+				      &LOOP_VINFO_LENS (loop_vinfo),
+				      1, vectype, 1);
+	      else
+		{
+		  if (dump_enabled_p ())
+		    dump_printf_loc (
+		      MSG_MISSED_OPTIMIZATION, vect_location,
+		      "can't operate on partial vectors "
+		      "because the target doesn't support extract "
+		      "last reduction.\n");
+		  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+		}
 	    }
 	}
       /* ???  Enable for loop costing as well.  */
@@ -10358,7 +10367,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
   gimple *vec_stmt;
   if (slp_node)
     {
-      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+      gcc_assert (!loop_vinfo
+		  || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+		      && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));

       /* Get the correct slp vectorized stmt.  */
       vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
@@ -10402,7 +10413,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,

       gimple_seq stmts = NULL;
       tree new_tree;
-      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
+	{
+	  /* Emit:
+
+	       SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
+
+	     where VEC_LHS is the vectorized live-out result and MASK is
+	     the loop mask for the final iteration.  */
+	  gcc_assert (ncopies == 1 && !slp_node);
+	  gimple_seq tem = NULL;
+	  gimple_stmt_iterator gsi = gsi_last (tem);
+	  tree len
+	    = vect_get_loop_len (loop_vinfo, &gsi,
+				 &LOOP_VINFO_LENS (loop_vinfo),
+				 1, vectype, 0, 0);
+
+	  /* BIAS - 1.  */
+	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+	  tree bias_minus_one
+	    = int_const_binop (MINUS_EXPR,
+			       build_int_cst (TREE_TYPE (len), biasval),
+			       build_one_cst (TREE_TYPE (len)));
+
+	  /* LAST_INDEX = LEN + (BIAS - 1).  */
+	  tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
+					  len, bias_minus_one);
+
+	  /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
+	  tree scalar_res
+	    = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
+			    vec_lhs_phi, last_index);
+
+	  /* Convert the extracted vector element to the scalar type.  */
+	  new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
+	}
+      else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
 	{
 	  /* Emit:

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
  2023-08-14 12:45             ` Kewen.Lin
@ 2023-08-14 13:28               ` 钟居哲
  2023-08-14 13:44                 ` Richard Biener
  2023-08-14 18:51               ` Stefan Schulze Frielinghaus
  1 sibling, 1 reply; 19+ messages in thread
From: 钟居哲 @ 2023-08-14 13:28 UTC (permalink / raw)
  To: linkw; +Cc: rdapp.gcc, richard.sandiford, rguenther, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 5261 bytes --]

Thanks Kewen.

But I saw there is 2 more files include:

+#include "memmodel.h"
+#include "optabs.h"

Not sure whether Richard and Richi ok with that change ?

Thanks.



juzhe.zhong@rivai.ai
 
From: Kewen.Lin
Date: 2023-08-14 20:45
To: juzhe.zhong@rivai.ai
CC: Robin Dapp; richard.sandiford; rguenther; GCC Patches
Subject: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
Hi Juzhe,
 
on 2023/8/14 20:08, juzhe.zhong@rivai.ai wrote:
> Hi, Kewin.
> 
> Could you test 'can_vec_extract_var_idx_p' and send V5 patch when you pass the testing?
 
The below diff was bootstrapped and regress-tested on Power10 LE.  Comparing to the
previous v4, the only changes should be the proposed can_vec_extract_var_idx_p and
its required new includes as below:
 
+#include "memmodel.h"
+#include "optabs.h"
Could you have a double check?
 
Since I just tested it on Power10, you have the full ownership on the patch, I'd leave
the v5 posting to you.  Thanks!
 
BR,
Kewen
-----
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index bc3063c3615..5ae9f69c7eb 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -32,6 +32,8 @@ along with GCC; see the file COPYING3.  If not see
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
+#include "memmodel.h"
+#include "optabs.h"
#include "diagnostic-core.h"
#include "fold-const.h"
#include "stor-layout.h"
@@ -10300,17 +10302,7 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
       /* No transformation required.  */
       if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
{
-   if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
-        OPTIMIZE_FOR_SPEED))
-     {
-       if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "can't operate on partial vectors "
- "because the target doesn't support extract "
- "last reduction.\n");
-       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
-     }
-   else if (slp_node)
+   if (slp_node)
    {
      if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -10330,9 +10322,26 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
  else
    {
      gcc_assert (ncopies == 1 && !slp_node);
-       vect_record_loop_mask (loop_vinfo,
-      &LOOP_VINFO_MASKS (loop_vinfo),
-      1, vectype, NULL);
+       if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
+   OPTIMIZE_FOR_SPEED))
+ vect_record_loop_mask (loop_vinfo,
+        &LOOP_VINFO_MASKS (loop_vinfo),
+        1, vectype, NULL);
+       else if (can_vec_extract_var_idx_p (
+ TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
+ vect_record_loop_len (loop_vinfo,
+       &LOOP_VINFO_LENS (loop_vinfo),
+       1, vectype, 1);
+       else
+ {
+   if (dump_enabled_p ())
+     dump_printf_loc (
+       MSG_MISSED_OPTIMIZATION, vect_location,
+       "can't operate on partial vectors "
+       "because the target doesn't support extract "
+       "last reduction.\n");
+   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ }
    }
}
       /* ???  Enable for loop costing as well.  */
@@ -10358,7 +10367,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
   gimple *vec_stmt;
   if (slp_node)
     {
-      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+      gcc_assert (!loop_vinfo
+   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
 
       /* Get the correct slp vectorized stmt.  */
       vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
@@ -10402,7 +10413,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
 
       gimple_seq stmts = NULL;
       tree new_tree;
-      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
+ {
+   /* Emit:
+
+        SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
+
+      where VEC_LHS is the vectorized live-out result and MASK is
+      the loop mask for the final iteration.  */
+   gcc_assert (ncopies == 1 && !slp_node);
+   gimple_seq tem = NULL;
+   gimple_stmt_iterator gsi = gsi_last (tem);
+   tree len
+     = vect_get_loop_len (loop_vinfo, &gsi,
+ &LOOP_VINFO_LENS (loop_vinfo),
+ 1, vectype, 0, 0);
+
+   /* BIAS - 1.  */
+   signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+   tree bias_minus_one
+     = int_const_binop (MINUS_EXPR,
+        build_int_cst (TREE_TYPE (len), biasval),
+        build_one_cst (TREE_TYPE (len)));
+
+   /* LAST_INDEX = LEN + (BIAS - 1).  */
+   tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
+   len, bias_minus_one);
+
+   /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
+   tree scalar_res
+     = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
+     vec_lhs_phi, last_index);
+
+   /* Convert the extracted vector element to the scalar type.  */
+   new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
+ }
+      else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
{
  /* Emit:
 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
  2023-08-14 13:28               ` 钟居哲
@ 2023-08-14 13:44                 ` Richard Biener
  0 siblings, 0 replies; 19+ messages in thread
From: Richard Biener @ 2023-08-14 13:44 UTC (permalink / raw)
  To: 钟居哲; +Cc: linkw, rdapp.gcc, richard.sandiford, gcc-patches

On Mon, 14 Aug 2023, ??? wrote:

> Thanks Kewen.
> 
> But I saw there is 2 more files include:
> 
> +#include "memmodel.h"
> +#include "optabs.h"
> 
> Not sure whether Richard and Richi ok with that change ?

Yes, please just apply some common sense.

> Thanks.
> 
> 
> 
> juzhe.zhong@rivai.ai
>  
> From: Kewen.Lin
> Date: 2023-08-14 20:45
> To: juzhe.zhong@rivai.ai
> CC: Robin Dapp; richard.sandiford; rguenther; GCC Patches
> Subject: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
> Hi Juzhe,
>  
> on 2023/8/14 20:08, juzhe.zhong@rivai.ai wrote:
> > Hi, Kewin.
> > 
> > Could you test 'can_vec_extract_var_idx_p' and send V5 patch when you pass the testing?
>  
> The below diff was bootstrapped and regress-tested on Power10 LE.  Comparing to the
> previous v4, the only changes should be the proposed can_vec_extract_var_idx_p and
> its required new includes as below:
>  
> +#include "memmodel.h"
> +#include "optabs.h"
> Could you have a double check?
>  
> Since I just tested it on Power10, you have the full ownership on the patch, I'd leave
> the v5 posting to you.  Thanks!
>  
> BR,
> Kewen
> -----
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index bc3063c3615..5ae9f69c7eb 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -32,6 +32,8 @@ along with GCC; see the file COPYING3.  If not see
> #include "tree-pass.h"
> #include "ssa.h"
> #include "optabs-tree.h"
> +#include "memmodel.h"
> +#include "optabs.h"
> #include "diagnostic-core.h"
> #include "fold-const.h"
> #include "stor-layout.h"
> @@ -10300,17 +10302,7 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>        /* No transformation required.  */
>        if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> {
> -   if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> -        OPTIMIZE_FOR_SPEED))
> -     {
> -       if (dump_enabled_p ())
> - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> - "can't operate on partial vectors "
> - "because the target doesn't support extract "
> - "last reduction.\n");
> -       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> -     }
> -   else if (slp_node)
> +   if (slp_node)
>     {
>       if (dump_enabled_p ())
> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -10330,9 +10322,26 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>   else
>     {
>       gcc_assert (ncopies == 1 && !slp_node);
> -       vect_record_loop_mask (loop_vinfo,
> -      &LOOP_VINFO_MASKS (loop_vinfo),
> -      1, vectype, NULL);
> +       if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> +   OPTIMIZE_FOR_SPEED))
> + vect_record_loop_mask (loop_vinfo,
> +        &LOOP_VINFO_MASKS (loop_vinfo),
> +        1, vectype, NULL);
> +       else if (can_vec_extract_var_idx_p (
> + TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
> + vect_record_loop_len (loop_vinfo,
> +       &LOOP_VINFO_LENS (loop_vinfo),
> +       1, vectype, 1);
> +       else
> + {
> +   if (dump_enabled_p ())
> +     dump_printf_loc (
> +       MSG_MISSED_OPTIMIZATION, vect_location,
> +       "can't operate on partial vectors "
> +       "because the target doesn't support extract "
> +       "last reduction.\n");
> +   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> + }
>     }
> }
>        /* ???  Enable for loop costing as well.  */
> @@ -10358,7 +10367,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>    gimple *vec_stmt;
>    if (slp_node)
>      {
> -      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> +      gcc_assert (!loop_vinfo
> +   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> +       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
>  
>        /* Get the correct slp vectorized stmt.  */
>        vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
> @@ -10402,7 +10413,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>  
>        gimple_seq stmts = NULL;
>        tree new_tree;
> -      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> +      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> + {
> +   /* Emit:
> +
> +        SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
> +
> +      where VEC_LHS is the vectorized live-out result and MASK is
> +      the loop mask for the final iteration.  */
> +   gcc_assert (ncopies == 1 && !slp_node);
> +   gimple_seq tem = NULL;
> +   gimple_stmt_iterator gsi = gsi_last (tem);
> +   tree len
> +     = vect_get_loop_len (loop_vinfo, &gsi,
> + &LOOP_VINFO_LENS (loop_vinfo),
> + 1, vectype, 0, 0);
> +
> +   /* BIAS - 1.  */
> +   signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +   tree bias_minus_one
> +     = int_const_binop (MINUS_EXPR,
> +        build_int_cst (TREE_TYPE (len), biasval),
> +        build_one_cst (TREE_TYPE (len)));
> +
> +   /* LAST_INDEX = LEN + (BIAS - 1).  */
> +   tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
> +   len, bias_minus_one);
> +
> +   /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
> +   tree scalar_res
> +     = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
> +     vec_lhs_phi, last_index);
> +
> +   /* Convert the extracted vector element to the scalar type.  */
> +   new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> + }
> +      else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> {
>   /* Emit:
>  
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
  2023-08-14 12:45             ` Kewen.Lin
  2023-08-14 13:28               ` 钟居哲
@ 2023-08-14 18:51               ` Stefan Schulze Frielinghaus
  2023-08-15  6:25                 ` Kewen.Lin
  1 sibling, 1 reply; 19+ messages in thread
From: Stefan Schulze Frielinghaus @ 2023-08-14 18:51 UTC (permalink / raw)
  To: Kewen.Lin
  Cc: juzhe.zhong, Robin Dapp, richard.sandiford, rguenther, GCC Patches

Hi everyone,

I have bootstrapped and regtested the patch below on s390.  For the
64-bit target I do not see any changes regarding the testsuite.  For the
31-bit target I see the following failures:

FAIL: gcc.dg/vect/no-scevccp-outer-14.c (internal compiler error: in require, at machmode.h:313)
FAIL: gcc.dg/vect/no-scevccp-outer-14.c (test for excess errors)
FAIL: gcc.dg/vect/pr50451.c (internal compiler error: in require, at machmode.h:313)
FAIL: gcc.dg/vect/pr50451.c (test for excess errors)
FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (test for excess errors)
FAIL: gcc.dg/vect/pr53773.c (internal compiler error: in require, at machmode.h:313)
FAIL: gcc.dg/vect/pr53773.c (test for excess errors)
FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (test for excess errors)
FAIL: gcc.dg/vect/pr71407.c (internal compiler error: in require, at machmode.h:313)
FAIL: gcc.dg/vect/pr71407.c (test for excess errors)
FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (test for excess errors)
FAIL: gcc.dg/vect/pr71416-1.c (internal compiler error: in require, at machmode.h:313)
FAIL: gcc.dg/vect/pr71416-1.c (test for excess errors)
FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (test for excess errors)
FAIL: gcc.dg/vect/pr94443.c (internal compiler error: in require, at machmode.h:313)
FAIL: gcc.dg/vect/pr94443.c (test for excess errors)
FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (test for excess errors)
FAIL: gcc.dg/vect/pr97558.c (internal compiler error: in require, at machmode.h:313)
FAIL: gcc.dg/vect/pr97558.c (test for excess errors)
FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (test for excess errors)
FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (test for excess errors)
UNRESOLVED: gcc.dg/vect/no-scevccp-outer-14.c compilation failed to produce executable
UNRESOLVED: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects  scan-tree-dump-times optimized "\\* 10" 2
UNRESOLVED: gcc.dg/vect/pr53773.c scan-tree-dump-times optimized "\\* 10" 2
UNRESOLVED: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects compilation failed to produce executable
UNRESOLVED: gcc.dg/vect/pr71416-1.c compilation failed to produce executable
UNRESOLVED: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects compilation failed to produce executable

I've randomely picked pr50451.c and ran gcc against it which results in:

during GIMPLE pass: vect
dump file: pr50451.c.174t.vect
/gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c: In function ‘foo’:
/gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c:5:1: internal compiler error: in require, at machmode.h:313
0x1265d21 opt_mode<scalar_int_mode>::require() const
        /gcc-verify-workdir/patched/src/gcc/machmode.h:313
0x1d7e4e9 opt_mode<machine_mode>::require() const
        /gcc-verify-workdir/patched/src/gcc/vec.h:955
0x1d7e4e9 vect_verify_loop_lens
        /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:1471
0x1da29ab vect_analyze_loop_2
        /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:2929
0x1da40c7 vect_analyze_loop_1
        /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3330
0x1da499d vect_analyze_loop(loop*, vec_info_shared*)
        /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3484
0x1deed27 try_vectorize_loop_1
        /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1064
0x1deed27 try_vectorize_loop
        /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1180
0x1def5c1 execute
        /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1296
Please submit a full bug report, with preprocessed source (by using -freport-bug).
Please include the complete backtrace with any bug report.
See <https://gcc.gnu.org/bugs/> for instructions.

I will come back to this tomorrow.

Cheers,
Stefan

On Mon, Aug 14, 2023 at 08:45:21PM +0800, Kewen.Lin via Gcc-patches wrote:
> Hi Juzhe,
> 
> on 2023/8/14 20:08, juzhe.zhong@rivai.ai wrote:
> > Hi, Kewin.
> > 
> > Could you test 'can_vec_extract_var_idx_p' and send V5 patch when you pass the testing?
> 
> The below diff was bootstrapped and regress-tested on Power10 LE.  Comparing to the
> previous v4, the only changes should be the proposed can_vec_extract_var_idx_p and
> its required new includes as below:
> 
> +#include "memmodel.h"
> +#include "optabs.h"
>  
> Could you have a double check?
> 
> Since I just tested it on Power10, you have the full ownership on the patch, I'd leave
> the v5 posting to you.  Thanks!
> 
> BR,
> Kewen
> -----
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index bc3063c3615..5ae9f69c7eb 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -32,6 +32,8 @@ along with GCC; see the file COPYING3.  If not see
>  #include "tree-pass.h"
>  #include "ssa.h"
>  #include "optabs-tree.h"
> +#include "memmodel.h"
> +#include "optabs.h"
>  #include "diagnostic-core.h"
>  #include "fold-const.h"
>  #include "stor-layout.h"
> @@ -10300,17 +10302,7 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>        /* No transformation required.  */
>        if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
>  	{
> -	  if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> -					       OPTIMIZE_FOR_SPEED))
> -	    {
> -	      if (dump_enabled_p ())
> -		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -				 "can't operate on partial vectors "
> -				 "because the target doesn't support extract "
> -				 "last reduction.\n");
> -	      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> -	    }
> -	  else if (slp_node)
> +	  if (slp_node)
>  	    {
>  	      if (dump_enabled_p ())
>  		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -10330,9 +10322,26 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>  	  else
>  	    {
>  	      gcc_assert (ncopies == 1 && !slp_node);
> -	      vect_record_loop_mask (loop_vinfo,
> -				     &LOOP_VINFO_MASKS (loop_vinfo),
> -				     1, vectype, NULL);
> +	      if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> +						  OPTIMIZE_FOR_SPEED))
> +		vect_record_loop_mask (loop_vinfo,
> +				       &LOOP_VINFO_MASKS (loop_vinfo),
> +				       1, vectype, NULL);
> +	      else if (can_vec_extract_var_idx_p (
> +			 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
> +		vect_record_loop_len (loop_vinfo,
> +				      &LOOP_VINFO_LENS (loop_vinfo),
> +				      1, vectype, 1);
> +	      else
> +		{
> +		  if (dump_enabled_p ())
> +		    dump_printf_loc (
> +		      MSG_MISSED_OPTIMIZATION, vect_location,
> +		      "can't operate on partial vectors "
> +		      "because the target doesn't support extract "
> +		      "last reduction.\n");
> +		  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> +		}
>  	    }
>  	}
>        /* ???  Enable for loop costing as well.  */
> @@ -10358,7 +10367,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>    gimple *vec_stmt;
>    if (slp_node)
>      {
> -      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> +      gcc_assert (!loop_vinfo
> +		  || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> +		      && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
> 
>        /* Get the correct slp vectorized stmt.  */
>        vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
> @@ -10402,7 +10413,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
> 
>        gimple_seq stmts = NULL;
>        tree new_tree;
> -      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> +      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> +	{
> +	  /* Emit:
> +
> +	       SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
> +
> +	     where VEC_LHS is the vectorized live-out result and MASK is
> +	     the loop mask for the final iteration.  */
> +	  gcc_assert (ncopies == 1 && !slp_node);
> +	  gimple_seq tem = NULL;
> +	  gimple_stmt_iterator gsi = gsi_last (tem);
> +	  tree len
> +	    = vect_get_loop_len (loop_vinfo, &gsi,
> +				 &LOOP_VINFO_LENS (loop_vinfo),
> +				 1, vectype, 0, 0);
> +
> +	  /* BIAS - 1.  */
> +	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +	  tree bias_minus_one
> +	    = int_const_binop (MINUS_EXPR,
> +			       build_int_cst (TREE_TYPE (len), biasval),
> +			       build_one_cst (TREE_TYPE (len)));
> +
> +	  /* LAST_INDEX = LEN + (BIAS - 1).  */
> +	  tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
> +					  len, bias_minus_one);
> +
> +	  /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
> +	  tree scalar_res
> +	    = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
> +			    vec_lhs_phi, last_index);
> +
> +	  /* Convert the extracted vector element to the scalar type.  */
> +	  new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> +	}
> +      else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
>  	{
>  	  /* Emit:

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
  2023-08-14 18:51               ` Stefan Schulze Frielinghaus
@ 2023-08-15  6:25                 ` Kewen.Lin
  2023-08-15  6:58                   ` Richard Biener
  0 siblings, 1 reply; 19+ messages in thread
From: Kewen.Lin @ 2023-08-15  6:25 UTC (permalink / raw)
  To: Stefan Schulze Frielinghaus
  Cc: juzhe.zhong, Robin Dapp, richard.sandiford, rguenther, GCC Patches

Hi Stefan,

on 2023/8/15 02:51, Stefan Schulze Frielinghaus wrote:
> Hi everyone,
> 
> I have bootstrapped and regtested the patch below on s390.  For the
> 64-bit target I do not see any changes regarding the testsuite.  For the
> 31-bit target I see the following failures:
> 
> FAIL: gcc.dg/vect/no-scevccp-outer-14.c (internal compiler error: in require, at machmode.h:313)
> FAIL: gcc.dg/vect/no-scevccp-outer-14.c (test for excess errors)
> FAIL: gcc.dg/vect/pr50451.c (internal compiler error: in require, at machmode.h:313)
> FAIL: gcc.dg/vect/pr50451.c (test for excess errors)
> FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (test for excess errors)
> FAIL: gcc.dg/vect/pr53773.c (internal compiler error: in require, at machmode.h:313)
> FAIL: gcc.dg/vect/pr53773.c (test for excess errors)
> FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (test for excess errors)
> FAIL: gcc.dg/vect/pr71407.c (internal compiler error: in require, at machmode.h:313)
> FAIL: gcc.dg/vect/pr71407.c (test for excess errors)
> FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (test for excess errors)
> FAIL: gcc.dg/vect/pr71416-1.c (internal compiler error: in require, at machmode.h:313)
> FAIL: gcc.dg/vect/pr71416-1.c (test for excess errors)
> FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (test for excess errors)
> FAIL: gcc.dg/vect/pr94443.c (internal compiler error: in require, at machmode.h:313)
> FAIL: gcc.dg/vect/pr94443.c (test for excess errors)
> FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (test for excess errors)
> FAIL: gcc.dg/vect/pr97558.c (internal compiler error: in require, at machmode.h:313)
> FAIL: gcc.dg/vect/pr97558.c (test for excess errors)
> FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (test for excess errors)
> FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (test for excess errors)
> UNRESOLVED: gcc.dg/vect/no-scevccp-outer-14.c compilation failed to produce executable
> UNRESOLVED: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects  scan-tree-dump-times optimized "\\* 10" 2
> UNRESOLVED: gcc.dg/vect/pr53773.c scan-tree-dump-times optimized "\\* 10" 2
> UNRESOLVED: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects compilation failed to produce executable
> UNRESOLVED: gcc.dg/vect/pr71416-1.c compilation failed to produce executable
> UNRESOLVED: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects compilation failed to produce executable
> 
> I've randomely picked pr50451.c and ran gcc against it which results in:
> 
> during GIMPLE pass: vect
> dump file: pr50451.c.174t.vect
> /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c: In function ‘foo’:
> /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c:5:1: internal compiler error: in require, at machmode.h:313
> 0x1265d21 opt_mode<scalar_int_mode>::require() const
>         /gcc-verify-workdir/patched/src/gcc/machmode.h:313
> 0x1d7e4e9 opt_mode<machine_mode>::require() const
>         /gcc-verify-workdir/patched/src/gcc/vec.h:955
> 0x1d7e4e9 vect_verify_loop_lens
>         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:1471
> 0x1da29ab vect_analyze_loop_2
>         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:2929
> 0x1da40c7 vect_analyze_loop_1
>         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3330
> 0x1da499d vect_analyze_loop(loop*, vec_info_shared*)
>         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3484
> 0x1deed27 try_vectorize_loop_1
>         /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1064
> 0x1deed27 try_vectorize_loop
>         /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1180
> 0x1def5c1 execute
>         /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1296
> Please submit a full bug report, with preprocessed source (by using -freport-bug).
> Please include the complete backtrace with any bug report.
> See <https://gcc.gnu.org/bugs/> for instructions.
> 

It looks like s390 supports variable index vec_extract at -m31 but
no vector with length.  It seems we need to further check the vector
with length capability, with something like:

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 5ae9f69c7eb..ef754467baf 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10327,10 +10327,11 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
                 vect_record_loop_mask (loop_vinfo,
                                        &LOOP_VINFO_MASKS (loop_vinfo),
                                        1, vectype, NULL);
-              else if (can_vec_extract_var_idx_p (
+              else if (get_len_load_store_mode (TYPE_MODE (vectype), true)
+                         .exists ()
+                       && can_vec_extract_var_idx_p (
                          TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
-                vect_record_loop_len (loop_vinfo,
-                                      &LOOP_VINFO_LENS (loop_vinfo),
+                vect_record_loop_len (loop_vinfo, &LOOP_VINFO_LENS (loop_vinfo),
                                       1, vectype, 1);
               else
                 {

sigh, the formatting looks odd.

BR,
Kewen


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
  2023-08-15  6:25                 ` Kewen.Lin
@ 2023-08-15  6:58                   ` Richard Biener
  2023-08-15  8:52                     ` Richard Sandiford
  0 siblings, 1 reply; 19+ messages in thread
From: Richard Biener @ 2023-08-15  6:58 UTC (permalink / raw)
  To: Kewen.Lin
  Cc: Stefan Schulze Frielinghaus, juzhe.zhong, Robin Dapp,
	richard.sandiford, GCC Patches

On Tue, 15 Aug 2023, Kewen.Lin wrote:

> Hi Stefan,
> 
> on 2023/8/15 02:51, Stefan Schulze Frielinghaus wrote:
> > Hi everyone,
> > 
> > I have bootstrapped and regtested the patch below on s390.  For the
> > 64-bit target I do not see any changes regarding the testsuite.  For the
> > 31-bit target I see the following failures:
> > 
> > FAIL: gcc.dg/vect/no-scevccp-outer-14.c (internal compiler error: in require, at machmode.h:313)
> > FAIL: gcc.dg/vect/no-scevccp-outer-14.c (test for excess errors)
> > FAIL: gcc.dg/vect/pr50451.c (internal compiler error: in require, at machmode.h:313)
> > FAIL: gcc.dg/vect/pr50451.c (test for excess errors)
> > FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> > FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (test for excess errors)
> > FAIL: gcc.dg/vect/pr53773.c (internal compiler error: in require, at machmode.h:313)
> > FAIL: gcc.dg/vect/pr53773.c (test for excess errors)
> > FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> > FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (test for excess errors)
> > FAIL: gcc.dg/vect/pr71407.c (internal compiler error: in require, at machmode.h:313)
> > FAIL: gcc.dg/vect/pr71407.c (test for excess errors)
> > FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> > FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (test for excess errors)
> > FAIL: gcc.dg/vect/pr71416-1.c (internal compiler error: in require, at machmode.h:313)
> > FAIL: gcc.dg/vect/pr71416-1.c (test for excess errors)
> > FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> > FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (test for excess errors)
> > FAIL: gcc.dg/vect/pr94443.c (internal compiler error: in require, at machmode.h:313)
> > FAIL: gcc.dg/vect/pr94443.c (test for excess errors)
> > FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> > FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (test for excess errors)
> > FAIL: gcc.dg/vect/pr97558.c (internal compiler error: in require, at machmode.h:313)
> > FAIL: gcc.dg/vect/pr97558.c (test for excess errors)
> > FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> > FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (test for excess errors)
> > FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> > FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (test for excess errors)
> > UNRESOLVED: gcc.dg/vect/no-scevccp-outer-14.c compilation failed to produce executable
> > UNRESOLVED: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects  scan-tree-dump-times optimized "\\* 10" 2
> > UNRESOLVED: gcc.dg/vect/pr53773.c scan-tree-dump-times optimized "\\* 10" 2
> > UNRESOLVED: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects compilation failed to produce executable
> > UNRESOLVED: gcc.dg/vect/pr71416-1.c compilation failed to produce executable
> > UNRESOLVED: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects compilation failed to produce executable
> > 
> > I've randomely picked pr50451.c and ran gcc against it which results in:
> > 
> > during GIMPLE pass: vect
> > dump file: pr50451.c.174t.vect
> > /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c: In function ?foo?:
> > /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c:5:1: internal compiler error: in require, at machmode.h:313
> > 0x1265d21 opt_mode<scalar_int_mode>::require() const
> >         /gcc-verify-workdir/patched/src/gcc/machmode.h:313
> > 0x1d7e4e9 opt_mode<machine_mode>::require() const
> >         /gcc-verify-workdir/patched/src/gcc/vec.h:955
> > 0x1d7e4e9 vect_verify_loop_lens
> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:1471
> > 0x1da29ab vect_analyze_loop_2
> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:2929
> > 0x1da40c7 vect_analyze_loop_1
> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3330
> > 0x1da499d vect_analyze_loop(loop*, vec_info_shared*)
> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3484
> > 0x1deed27 try_vectorize_loop_1
> >         /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1064
> > 0x1deed27 try_vectorize_loop
> >         /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1180
> > 0x1def5c1 execute
> >         /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1296
> > Please submit a full bug report, with preprocessed source (by using -freport-bug).
> > Please include the complete backtrace with any bug report.
> > See <https://gcc.gnu.org/bugs/> for instructions.
> > 
> 
> It looks like s390 supports variable index vec_extract at -m31 but
> no vector with length.  It seems we need to further check the vector
> with length capability, with something like:
> 
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 5ae9f69c7eb..ef754467baf 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -10327,10 +10327,11 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>                  vect_record_loop_mask (loop_vinfo,
>                                         &LOOP_VINFO_MASKS (loop_vinfo),
>                                         1, vectype, NULL);
> -              else if (can_vec_extract_var_idx_p (
> +              else if (get_len_load_store_mode (TYPE_MODE (vectype), true)
> +                         .exists ()
> +                       && can_vec_extract_var_idx_p (
>                           TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
> -                vect_record_loop_len (loop_vinfo,
> -                                      &LOOP_VINFO_LENS (loop_vinfo),
> +                vect_record_loop_len (loop_vinfo, &LOOP_VINFO_LENS (loop_vinfo),
>                                        1, vectype, 1);
>                else
>                  {
> 
> sigh, the formatting looks odd.

I think the error is in vect_verify_loop_lens which assumes that
when we record _any_ length related op the target has to support
both len_load and len_store.  Now that we have many other _len
functions that's certainly not true.

Instead a vect_verify_loop_lens-local "fix" would be to not use
.require () but instead when !.exists () simply return false.
That would still effectively require both len-load and len-store
for any -len predicated loop, but at least avoid the ICE.

Richard.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
  2023-08-15  6:58                   ` Richard Biener
@ 2023-08-15  8:52                     ` Richard Sandiford
  2023-08-15  8:55                       ` Richard Biener
  0 siblings, 1 reply; 19+ messages in thread
From: Richard Sandiford @ 2023-08-15  8:52 UTC (permalink / raw)
  To: Richard Biener
  Cc: Kewen.Lin, Stefan Schulze Frielinghaus, juzhe.zhong, Robin Dapp,
	GCC Patches

Richard Biener <rguenther@suse.de> writes:
> On Tue, 15 Aug 2023, Kewen.Lin wrote:
>
>> Hi Stefan,
>> 
>> on 2023/8/15 02:51, Stefan Schulze Frielinghaus wrote:
>> > Hi everyone,
>> > 
>> > I have bootstrapped and regtested the patch below on s390.  For the
>> > 64-bit target I do not see any changes regarding the testsuite.  For the
>> > 31-bit target I see the following failures:
>> > 
>> > FAIL: gcc.dg/vect/no-scevccp-outer-14.c (internal compiler error: in require, at machmode.h:313)
>> > FAIL: gcc.dg/vect/no-scevccp-outer-14.c (test for excess errors)
>> > FAIL: gcc.dg/vect/pr50451.c (internal compiler error: in require, at machmode.h:313)
>> > FAIL: gcc.dg/vect/pr50451.c (test for excess errors)
>> > FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
>> > FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (test for excess errors)
>> > FAIL: gcc.dg/vect/pr53773.c (internal compiler error: in require, at machmode.h:313)
>> > FAIL: gcc.dg/vect/pr53773.c (test for excess errors)
>> > FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
>> > FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (test for excess errors)
>> > FAIL: gcc.dg/vect/pr71407.c (internal compiler error: in require, at machmode.h:313)
>> > FAIL: gcc.dg/vect/pr71407.c (test for excess errors)
>> > FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
>> > FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (test for excess errors)
>> > FAIL: gcc.dg/vect/pr71416-1.c (internal compiler error: in require, at machmode.h:313)
>> > FAIL: gcc.dg/vect/pr71416-1.c (test for excess errors)
>> > FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
>> > FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (test for excess errors)
>> > FAIL: gcc.dg/vect/pr94443.c (internal compiler error: in require, at machmode.h:313)
>> > FAIL: gcc.dg/vect/pr94443.c (test for excess errors)
>> > FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
>> > FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (test for excess errors)
>> > FAIL: gcc.dg/vect/pr97558.c (internal compiler error: in require, at machmode.h:313)
>> > FAIL: gcc.dg/vect/pr97558.c (test for excess errors)
>> > FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
>> > FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (test for excess errors)
>> > FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
>> > FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (test for excess errors)
>> > UNRESOLVED: gcc.dg/vect/no-scevccp-outer-14.c compilation failed to produce executable
>> > UNRESOLVED: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects  scan-tree-dump-times optimized "\\* 10" 2
>> > UNRESOLVED: gcc.dg/vect/pr53773.c scan-tree-dump-times optimized "\\* 10" 2
>> > UNRESOLVED: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects compilation failed to produce executable
>> > UNRESOLVED: gcc.dg/vect/pr71416-1.c compilation failed to produce executable
>> > UNRESOLVED: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects compilation failed to produce executable
>> > 
>> > I've randomely picked pr50451.c and ran gcc against it which results in:
>> > 
>> > during GIMPLE pass: vect
>> > dump file: pr50451.c.174t.vect
>> > /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c: In function ?foo?:
>> > /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c:5:1: internal compiler error: in require, at machmode.h:313
>> > 0x1265d21 opt_mode<scalar_int_mode>::require() const
>> >         /gcc-verify-workdir/patched/src/gcc/machmode.h:313
>> > 0x1d7e4e9 opt_mode<machine_mode>::require() const
>> >         /gcc-verify-workdir/patched/src/gcc/vec.h:955
>> > 0x1d7e4e9 vect_verify_loop_lens
>> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:1471
>> > 0x1da29ab vect_analyze_loop_2
>> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:2929
>> > 0x1da40c7 vect_analyze_loop_1
>> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3330
>> > 0x1da499d vect_analyze_loop(loop*, vec_info_shared*)
>> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3484
>> > 0x1deed27 try_vectorize_loop_1
>> >         /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1064
>> > 0x1deed27 try_vectorize_loop
>> >         /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1180
>> > 0x1def5c1 execute
>> >         /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1296
>> > Please submit a full bug report, with preprocessed source (by using -freport-bug).
>> > Please include the complete backtrace with any bug report.
>> > See <https://gcc.gnu.org/bugs/> for instructions.
>> > 
>> 
>> It looks like s390 supports variable index vec_extract at -m31 but
>> no vector with length.  It seems we need to further check the vector
>> with length capability, with something like:
>> 
>> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
>> index 5ae9f69c7eb..ef754467baf 100644
>> --- a/gcc/tree-vect-loop.cc
>> +++ b/gcc/tree-vect-loop.cc
>> @@ -10327,10 +10327,11 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>>                  vect_record_loop_mask (loop_vinfo,
>>                                         &LOOP_VINFO_MASKS (loop_vinfo),
>>                                         1, vectype, NULL);
>> -              else if (can_vec_extract_var_idx_p (
>> +              else if (get_len_load_store_mode (TYPE_MODE (vectype), true)
>> +                         .exists ()
>> +                       && can_vec_extract_var_idx_p (
>>                           TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
>> -                vect_record_loop_len (loop_vinfo,
>> -                                      &LOOP_VINFO_LENS (loop_vinfo),
>> +                vect_record_loop_len (loop_vinfo, &LOOP_VINFO_LENS (loop_vinfo),
>>                                        1, vectype, 1);
>>                else
>>                  {
>> 
>> sigh, the formatting looks odd.
>
> I think the error is in vect_verify_loop_lens which assumes that
> when we record _any_ length related op the target has to support
> both len_load and len_store.  Now that we have many other _len
> functions that's certainly not true.
>
> Instead a vect_verify_loop_lens-local "fix" would be to not use
> .require () but instead when !.exists () simply return false.
> That would still effectively require both len-load and len-store
> for any -len predicated loop, but at least avoid the ICE.

Yeah, agree that would be the simplest workaround.  But I think
instead we should require vectorizable_load and vectorizable_store
to record the bias that they want to use (perhaps in a hash_set?).
Then vect_verify_loop_lens can return false if the set has more
than one element.  It can use a bias of 0 if the set is empty.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
  2023-08-15  8:52                     ` Richard Sandiford
@ 2023-08-15  8:55                       ` Richard Biener
  2023-08-15  9:05                         ` Richard Sandiford
  0 siblings, 1 reply; 19+ messages in thread
From: Richard Biener @ 2023-08-15  8:55 UTC (permalink / raw)
  To: Richard Sandiford
  Cc: Kewen.Lin, Stefan Schulze Frielinghaus, juzhe.zhong, Robin Dapp,
	GCC Patches

On Tue, 15 Aug 2023, Richard Sandiford wrote:

> Richard Biener <rguenther@suse.de> writes:
> > On Tue, 15 Aug 2023, Kewen.Lin wrote:
> >
> >> Hi Stefan,
> >> 
> >> on 2023/8/15 02:51, Stefan Schulze Frielinghaus wrote:
> >> > Hi everyone,
> >> > 
> >> > I have bootstrapped and regtested the patch below on s390.  For the
> >> > 64-bit target I do not see any changes regarding the testsuite.  For the
> >> > 31-bit target I see the following failures:
> >> > 
> >> > FAIL: gcc.dg/vect/no-scevccp-outer-14.c (internal compiler error: in require, at machmode.h:313)
> >> > FAIL: gcc.dg/vect/no-scevccp-outer-14.c (test for excess errors)
> >> > FAIL: gcc.dg/vect/pr50451.c (internal compiler error: in require, at machmode.h:313)
> >> > FAIL: gcc.dg/vect/pr50451.c (test for excess errors)
> >> > FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> >> > FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (test for excess errors)
> >> > FAIL: gcc.dg/vect/pr53773.c (internal compiler error: in require, at machmode.h:313)
> >> > FAIL: gcc.dg/vect/pr53773.c (test for excess errors)
> >> > FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> >> > FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (test for excess errors)
> >> > FAIL: gcc.dg/vect/pr71407.c (internal compiler error: in require, at machmode.h:313)
> >> > FAIL: gcc.dg/vect/pr71407.c (test for excess errors)
> >> > FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> >> > FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (test for excess errors)
> >> > FAIL: gcc.dg/vect/pr71416-1.c (internal compiler error: in require, at machmode.h:313)
> >> > FAIL: gcc.dg/vect/pr71416-1.c (test for excess errors)
> >> > FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> >> > FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (test for excess errors)
> >> > FAIL: gcc.dg/vect/pr94443.c (internal compiler error: in require, at machmode.h:313)
> >> > FAIL: gcc.dg/vect/pr94443.c (test for excess errors)
> >> > FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> >> > FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (test for excess errors)
> >> > FAIL: gcc.dg/vect/pr97558.c (internal compiler error: in require, at machmode.h:313)
> >> > FAIL: gcc.dg/vect/pr97558.c (test for excess errors)
> >> > FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> >> > FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (test for excess errors)
> >> > FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> >> > FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (test for excess errors)
> >> > UNRESOLVED: gcc.dg/vect/no-scevccp-outer-14.c compilation failed to produce executable
> >> > UNRESOLVED: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects  scan-tree-dump-times optimized "\\* 10" 2
> >> > UNRESOLVED: gcc.dg/vect/pr53773.c scan-tree-dump-times optimized "\\* 10" 2
> >> > UNRESOLVED: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects compilation failed to produce executable
> >> > UNRESOLVED: gcc.dg/vect/pr71416-1.c compilation failed to produce executable
> >> > UNRESOLVED: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects compilation failed to produce executable
> >> > 
> >> > I've randomely picked pr50451.c and ran gcc against it which results in:
> >> > 
> >> > during GIMPLE pass: vect
> >> > dump file: pr50451.c.174t.vect
> >> > /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c: In function ?foo?:
> >> > /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c:5:1: internal compiler error: in require, at machmode.h:313
> >> > 0x1265d21 opt_mode<scalar_int_mode>::require() const
> >> >         /gcc-verify-workdir/patched/src/gcc/machmode.h:313
> >> > 0x1d7e4e9 opt_mode<machine_mode>::require() const
> >> >         /gcc-verify-workdir/patched/src/gcc/vec.h:955
> >> > 0x1d7e4e9 vect_verify_loop_lens
> >> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:1471
> >> > 0x1da29ab vect_analyze_loop_2
> >> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:2929
> >> > 0x1da40c7 vect_analyze_loop_1
> >> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3330
> >> > 0x1da499d vect_analyze_loop(loop*, vec_info_shared*)
> >> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3484
> >> > 0x1deed27 try_vectorize_loop_1
> >> >         /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1064
> >> > 0x1deed27 try_vectorize_loop
> >> >         /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1180
> >> > 0x1def5c1 execute
> >> >         /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1296
> >> > Please submit a full bug report, with preprocessed source (by using -freport-bug).
> >> > Please include the complete backtrace with any bug report.
> >> > See <https://gcc.gnu.org/bugs/> for instructions.
> >> > 
> >> 
> >> It looks like s390 supports variable index vec_extract at -m31 but
> >> no vector with length.  It seems we need to further check the vector
> >> with length capability, with something like:
> >> 
> >> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> >> index 5ae9f69c7eb..ef754467baf 100644
> >> --- a/gcc/tree-vect-loop.cc
> >> +++ b/gcc/tree-vect-loop.cc
> >> @@ -10327,10 +10327,11 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
> >>                  vect_record_loop_mask (loop_vinfo,
> >>                                         &LOOP_VINFO_MASKS (loop_vinfo),
> >>                                         1, vectype, NULL);
> >> -              else if (can_vec_extract_var_idx_p (
> >> +              else if (get_len_load_store_mode (TYPE_MODE (vectype), true)
> >> +                         .exists ()
> >> +                       && can_vec_extract_var_idx_p (
> >>                           TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
> >> -                vect_record_loop_len (loop_vinfo,
> >> -                                      &LOOP_VINFO_LENS (loop_vinfo),
> >> +                vect_record_loop_len (loop_vinfo, &LOOP_VINFO_LENS (loop_vinfo),
> >>                                        1, vectype, 1);
> >>                else
> >>                  {
> >> 
> >> sigh, the formatting looks odd.
> >
> > I think the error is in vect_verify_loop_lens which assumes that
> > when we record _any_ length related op the target has to support
> > both len_load and len_store.  Now that we have many other _len
> > functions that's certainly not true.
> >
> > Instead a vect_verify_loop_lens-local "fix" would be to not use
> > .require () but instead when !.exists () simply return false.
> > That would still effectively require both len-load and len-store
> > for any -len predicated loop, but at least avoid the ICE.
> 
> Yeah, agree that would be the simplest workaround.  But I think
> instead we should require vectorizable_load and vectorizable_store
> to record the bias that they want to use (perhaps in a hash_set?).
> Then vect_verify_loop_lens can return false if the set has more
> than one element.  It can use a bias of 0 if the set is empty.

But with all the other _LEN fns now also having a bias, never
quering it but using the one from len_{load,store} having
that supported looks like a requirement?  OTOH if we would require
querying it for each used _LEN fn then supporting multiple
different biases wouldn't be an issue either?

Richard.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
  2023-08-15  8:55                       ` Richard Biener
@ 2023-08-15  9:05                         ` Richard Sandiford
  0 siblings, 0 replies; 19+ messages in thread
From: Richard Sandiford @ 2023-08-15  9:05 UTC (permalink / raw)
  To: Richard Biener
  Cc: Kewen.Lin, Stefan Schulze Frielinghaus, juzhe.zhong, Robin Dapp,
	GCC Patches

Richard Biener <rguenther@suse.de> writes:
> On Tue, 15 Aug 2023, Richard Sandiford wrote:
>
>> Richard Biener <rguenther@suse.de> writes:
>> > On Tue, 15 Aug 2023, Kewen.Lin wrote:
>> >
>> >> Hi Stefan,
>> >> 
>> >> on 2023/8/15 02:51, Stefan Schulze Frielinghaus wrote:
>> >> > Hi everyone,
>> >> > 
>> >> > I have bootstrapped and regtested the patch below on s390.  For the
>> >> > 64-bit target I do not see any changes regarding the testsuite.  For the
>> >> > 31-bit target I see the following failures:
>> >> > 
>> >> > FAIL: gcc.dg/vect/no-scevccp-outer-14.c (internal compiler error: in require, at machmode.h:313)
>> >> > FAIL: gcc.dg/vect/no-scevccp-outer-14.c (test for excess errors)
>> >> > FAIL: gcc.dg/vect/pr50451.c (internal compiler error: in require, at machmode.h:313)
>> >> > FAIL: gcc.dg/vect/pr50451.c (test for excess errors)
>> >> > FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
>> >> > FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (test for excess errors)
>> >> > FAIL: gcc.dg/vect/pr53773.c (internal compiler error: in require, at machmode.h:313)
>> >> > FAIL: gcc.dg/vect/pr53773.c (test for excess errors)
>> >> > FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
>> >> > FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (test for excess errors)
>> >> > FAIL: gcc.dg/vect/pr71407.c (internal compiler error: in require, at machmode.h:313)
>> >> > FAIL: gcc.dg/vect/pr71407.c (test for excess errors)
>> >> > FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
>> >> > FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (test for excess errors)
>> >> > FAIL: gcc.dg/vect/pr71416-1.c (internal compiler error: in require, at machmode.h:313)
>> >> > FAIL: gcc.dg/vect/pr71416-1.c (test for excess errors)
>> >> > FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
>> >> > FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (test for excess errors)
>> >> > FAIL: gcc.dg/vect/pr94443.c (internal compiler error: in require, at machmode.h:313)
>> >> > FAIL: gcc.dg/vect/pr94443.c (test for excess errors)
>> >> > FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
>> >> > FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (test for excess errors)
>> >> > FAIL: gcc.dg/vect/pr97558.c (internal compiler error: in require, at machmode.h:313)
>> >> > FAIL: gcc.dg/vect/pr97558.c (test for excess errors)
>> >> > FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
>> >> > FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (test for excess errors)
>> >> > FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
>> >> > FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (test for excess errors)
>> >> > UNRESOLVED: gcc.dg/vect/no-scevccp-outer-14.c compilation failed to produce executable
>> >> > UNRESOLVED: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects  scan-tree-dump-times optimized "\\* 10" 2
>> >> > UNRESOLVED: gcc.dg/vect/pr53773.c scan-tree-dump-times optimized "\\* 10" 2
>> >> > UNRESOLVED: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects compilation failed to produce executable
>> >> > UNRESOLVED: gcc.dg/vect/pr71416-1.c compilation failed to produce executable
>> >> > UNRESOLVED: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects compilation failed to produce executable
>> >> > 
>> >> > I've randomely picked pr50451.c and ran gcc against it which results in:
>> >> > 
>> >> > during GIMPLE pass: vect
>> >> > dump file: pr50451.c.174t.vect
>> >> > /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c: In function ?foo?:
>> >> > /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c:5:1: internal compiler error: in require, at machmode.h:313
>> >> > 0x1265d21 opt_mode<scalar_int_mode>::require() const
>> >> >         /gcc-verify-workdir/patched/src/gcc/machmode.h:313
>> >> > 0x1d7e4e9 opt_mode<machine_mode>::require() const
>> >> >         /gcc-verify-workdir/patched/src/gcc/vec.h:955
>> >> > 0x1d7e4e9 vect_verify_loop_lens
>> >> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:1471
>> >> > 0x1da29ab vect_analyze_loop_2
>> >> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:2929
>> >> > 0x1da40c7 vect_analyze_loop_1
>> >> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3330
>> >> > 0x1da499d vect_analyze_loop(loop*, vec_info_shared*)
>> >> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3484
>> >> > 0x1deed27 try_vectorize_loop_1
>> >> >         /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1064
>> >> > 0x1deed27 try_vectorize_loop
>> >> >         /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1180
>> >> > 0x1def5c1 execute
>> >> >         /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1296
>> >> > Please submit a full bug report, with preprocessed source (by using -freport-bug).
>> >> > Please include the complete backtrace with any bug report.
>> >> > See <https://gcc.gnu.org/bugs/> for instructions.
>> >> > 
>> >> 
>> >> It looks like s390 supports variable index vec_extract at -m31 but
>> >> no vector with length.  It seems we need to further check the vector
>> >> with length capability, with something like:
>> >> 
>> >> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
>> >> index 5ae9f69c7eb..ef754467baf 100644
>> >> --- a/gcc/tree-vect-loop.cc
>> >> +++ b/gcc/tree-vect-loop.cc
>> >> @@ -10327,10 +10327,11 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>> >>                  vect_record_loop_mask (loop_vinfo,
>> >>                                         &LOOP_VINFO_MASKS (loop_vinfo),
>> >>                                         1, vectype, NULL);
>> >> -              else if (can_vec_extract_var_idx_p (
>> >> +              else if (get_len_load_store_mode (TYPE_MODE (vectype), true)
>> >> +                         .exists ()
>> >> +                       && can_vec_extract_var_idx_p (
>> >>                           TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
>> >> -                vect_record_loop_len (loop_vinfo,
>> >> -                                      &LOOP_VINFO_LENS (loop_vinfo),
>> >> +                vect_record_loop_len (loop_vinfo, &LOOP_VINFO_LENS (loop_vinfo),
>> >>                                        1, vectype, 1);
>> >>                else
>> >>                  {
>> >> 
>> >> sigh, the formatting looks odd.
>> >
>> > I think the error is in vect_verify_loop_lens which assumes that
>> > when we record _any_ length related op the target has to support
>> > both len_load and len_store.  Now that we have many other _len
>> > functions that's certainly not true.
>> >
>> > Instead a vect_verify_loop_lens-local "fix" would be to not use
>> > .require () but instead when !.exists () simply return false.
>> > That would still effectively require both len-load and len-store
>> > for any -len predicated loop, but at least avoid the ICE.
>> 
>> Yeah, agree that would be the simplest workaround.  But I think
>> instead we should require vectorizable_load and vectorizable_store
>> to record the bias that they want to use (perhaps in a hash_set?).
>> Then vect_verify_loop_lens can return false if the set has more
>> than one element.  It can use a bias of 0 if the set is empty.
>
> But with all the other _LEN fns now also having a bias, never
> quering it but using the one from len_{load,store} having
> that supported looks like a requirement?  OTOH if we would require
> querying it for each used _LEN fn then supporting multiple
> different biases wouldn't be an issue either?

Yeah, the OTOH is what I think we should do.  The bias stuff dates
back to when loads and stores were the only LEN_* operations.  The reason
it checks both loads and stores is to ensure consistency between "all"
length operations.

If instead we took the position that LEN_LOAD determines the biases
for everything, we wouldn't need to check LEN_STORE biases.

But the biases need to be consistent because they're built into the IVs.
In principle we could create a different LEN input for each bias,
but that's pointless when no target needs it.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2023-08-15  9:05 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-08-11 14:24 [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization juzhe.zhong
2023-08-14  6:53 ` Richard Biener
2023-08-14  7:09   ` juzhe.zhong
2023-08-14  8:51     ` Kewen.Lin
2023-08-14  8:58       ` Robin Dapp
2023-08-14  9:15         ` Richard Biener
2023-08-14  9:19         ` Kewen.Lin
2023-08-14  9:26           ` juzhe.zhong
2023-08-14 12:07             ` Richard Biener
2023-08-14 12:08           ` juzhe.zhong
2023-08-14 12:45             ` Kewen.Lin
2023-08-14 13:28               ` 钟居哲
2023-08-14 13:44                 ` Richard Biener
2023-08-14 18:51               ` Stefan Schulze Frielinghaus
2023-08-15  6:25                 ` Kewen.Lin
2023-08-15  6:58                   ` Richard Biener
2023-08-15  8:52                     ` Richard Sandiford
2023-08-15  8:55                       ` Richard Biener
2023-08-15  9:05                         ` Richard Sandiford

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).