[PATCH V3] VECT: Enhance SLP of MASK_LEN_GATHER

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PATCH V3] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721]
@ 2023-10-13  4:29 Juzhe-Zhong
  2023-10-16 21:34 ` Richard Sandiford
  0 siblings, 1 reply; 3+ messages in thread
From: Juzhe-Zhong @ 2023-10-13  4:29 UTC (permalink / raw)
  To: gcc-patches; +Cc: richard.sandiford, rguenther, Juzhe-Zhong

This patch fixes this following FAILs in RISC-V regression:

FAIL: gcc.dg/vect/vect-gather-1.c -flto -ffat-lto-objects  scan-tree-dump vect "Loop contains only SLP stmts"
FAIL: gcc.dg/vect/vect-gather-1.c scan-tree-dump vect "Loop contains only SLP stmts"
FAIL: gcc.dg/vect/vect-gather-3.c -flto -ffat-lto-objects  scan-tree-dump vect "Loop contains only SLP stmts"
FAIL: gcc.dg/vect/vect-gather-3.c scan-tree-dump vect "Loop contains only SLP stmts"

The root cause of these FAIL is that GCC SLP failed on MASK_LEN_GATHER_LOAD.

We have 2 following situations of scalar recognized MASK_LEN_GATHER_LOAD:

1. conditional gather load: MASK_LEN_GATHER_LOAD (base, offset, scale, zero, condtional mask).
   
   This situation we just need to leverage the current MASK_GATHER_LOAD which can achieve SLP MASK_LEN_GATHER_LOAD.

2. un-conditional gather load: MASK_LEN_GATHER_LOAD (base, offset, scale, zero, -1)
   
   Current SLP check will failed on dummy mask -1, so we relax the check in tree-vect-slp.cc and allow it to be materialized.
    
Consider this following case:

void __attribute__((noipa))
f (int *restrict y, int *restrict x, int *restrict indices, int n)
{
  for (int i = 0; i < n; ++i)
    {
      y[i * 2] = x[indices[i * 2]] + 1;
      y[i * 2 + 1] = x[indices[i * 2 + 1]] + 2;
    }
}

https://godbolt.org/z/WG3M3n7Mo

GCC unable to SLP using VEC_LOAD_LANES/VEC_STORE_LANES:

f:
        ble     a3,zero,.L5
.L3:
        vsetvli a5,a3,e8,mf4,ta,ma
        vsetvli zero,a5,e32,m1,ta,ma
        vlseg2e32.v     v6,(a2)
        vsetvli a4,zero,e64,m2,ta,ma
        vsext.vf2       v2,v6
        vsll.vi v2,v2,2
        vsetvli zero,a5,e32,m1,ta,ma
        vluxei64.v      v1,(a1),v2
        vsetvli a4,zero,e64,m2,ta,ma
        vsext.vf2       v2,v7
        vsetvli zero,zero,e32,m1,ta,ma
        vadd.vi v4,v1,1
        vsetvli zero,zero,e64,m2,ta,ma
        vsll.vi v2,v2,2
        vsetvli zero,a5,e32,m1,ta,ma
        vluxei64.v      v2,(a1),v2
        vsetvli a4,zero,e32,m1,ta,ma
        slli    a6,a5,3
        vadd.vi v5,v2,2
        sub     a3,a3,a5
        vsetvli zero,a5,e32,m1,ta,ma
        vsseg2e32.v     v4,(a0)
        add     a2,a2,a6
        add     a0,a0,a6
        bne     a3,zero,.L3
.L5:
        ret

After this patch:

f:
	ble	a3,zero,.L5
	li	a5,1
	csrr	t1,vlenb
	slli	a5,a5,33
	srli	a7,t1,2
	addi	a5,a5,1
	slli	a3,a3,1
	neg	t3,a7
	vsetvli	a4,zero,e64,m1,ta,ma
	vmv.v.x	v4,a5
.L3:
	minu	a5,a3,a7
	vsetvli	zero,a5,e32,m1,ta,ma
	vle32.v	v1,0(a2)
	vsetvli	a4,zero,e64,m2,ta,ma
	vsext.vf2	v2,v1
	vsll.vi	v2,v2,2
	vsetvli	zero,a5,e32,m1,ta,ma
	vluxei64.v	v2,(a1),v2
	vsetvli	a4,zero,e32,m1,ta,ma
	mv	a6,a3
	vadd.vv	v2,v2,v4
	vsetvli	zero,a5,e32,m1,ta,ma
	vse32.v	v2,0(a0)
	add	a2,a2,t1
	add	a0,a0,t1
	add	a3,a3,t3
	bgtu	a6,a7,.L3
.L5:
	ret

Note that I found we are missing conditional mask gather_load SLP test, Append a test for it in this patch.

Tested on RISC-V and Bootstrap && Regression on X86 passed.

Ok for trunk ?

gcc/ChangeLog:

	* tree-vect-slp.cc (vect_get_operand_map): Add MASK_LEN_GATHER_LOAD.
	(vect_get_and_check_slp_defs): Ditto.
	(vect_build_slp_tree_1): Ditto.
	(vect_build_slp_tree_2): Ditto.
	* tree-vect-stmts.cc (vectorizable_load): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/vect-gather-6.c: New test.

---
 gcc/testsuite/gcc.dg/vect/vect-gather-6.c | 15 +++++++++++++++
 gcc/tree-vect-slp.cc                      | 22 ++++++++++++++++++----
 gcc/tree-vect-stmts.cc                    | 10 +++++++++-
 3 files changed, 42 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-gather-6.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-gather-6.c b/gcc/testsuite/gcc.dg/vect/vect-gather-6.c
new file mode 100644
index 00000000000..ff55f321854
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-gather-6.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+
+void
+f (int *restrict y, int *restrict x, int *restrict indices, int *restrict cond, int n)
+{
+  for (int i = 0; i < n; ++i)
+    {
+      if (cond[i * 2])
+	y[i * 2] = x[indices[i * 2]] + 1;
+      if (cond[i * 2 + 1])
+	y[i * 2 + 1] = x[indices[i * 2 + 1]] + 2;
+    }
+}
+
+/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" vect { target vect_gather_load_ifn } } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index fa098f9ff4e..38fe6ba6296 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -542,6 +542,7 @@ vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
 	    return arg1_map;
 
 	  case IFN_MASK_GATHER_LOAD:
+	  case IFN_MASK_LEN_GATHER_LOAD:
 	    return arg1_arg4_map;
 
 	  case IFN_MASK_STORE:
@@ -700,8 +701,7 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
 	{
 	  tree type = TREE_TYPE (oprnd);
 	  dt = dts[i];
-	  if ((dt == vect_constant_def
-	       || dt == vect_external_def)
+	  if (dt == vect_external_def
 	      && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
 	      && (TREE_CODE (type) == BOOLEAN_TYPE
 		  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
@@ -713,6 +713,17 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
 				 "for variable-length SLP %T\n", oprnd);
 	      return -1;
 	    }
+	  if (dt == vect_constant_def
+	      && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
+	      && !can_duplicate_and_interleave_p (vinfo, stmts.length (), type))
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				 "Build SLP failed: invalid type of def "
+				 "for variable-length SLP %T\n",
+				 oprnd);
+	      return -1;
+	    }
 
 	  /* For the swapping logic below force vect_reduction_def
 	     for the reduction op in a SLP reduction group.  */
@@ -1077,7 +1088,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
 
 	  if (cfn == CFN_MASK_LOAD
 	      || cfn == CFN_GATHER_LOAD
-	      || cfn == CFN_MASK_GATHER_LOAD)
+	      || cfn == CFN_MASK_GATHER_LOAD
+	      || cfn == CFN_MASK_LEN_GATHER_LOAD)
 	    ldst_p = true;
 	  else if (cfn == CFN_MASK_STORE)
 	    {
@@ -1337,6 +1349,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
 	  if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
 	      && rhs_code != CFN_GATHER_LOAD
 	      && rhs_code != CFN_MASK_GATHER_LOAD
+	      && rhs_code != CFN_MASK_LEN_GATHER_LOAD
 	      /* Not grouped loads are handled as externals for BB
 		 vectorization.  For loop vectorization we can handle
 		 splats the same we handle single element interleaving.  */
@@ -1837,7 +1850,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
       if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
 	gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
 		    || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
-		    || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
+		    || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)
+		    || gimple_call_internal_p (stmt, IFN_MASK_LEN_GATHER_LOAD));
       else
 	{
 	  *max_nunits = this_max_nunits;
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index ce925cc1d53..994234eec08 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -9738,12 +9738,20 @@ vectorizable_load (vec_info *vinfo,
 	return false;
 
       mask_index = internal_fn_mask_index (ifn);
+      slp_tree slp_op = NULL;
       if (mask_index >= 0 && slp_node)
 	mask_index = vect_slp_child_index_for_operand (call, mask_index);
       if (mask_index >= 0
 	  && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index,
-				      &mask, NULL, &mask_dt, &mask_vectype))
+				      &mask, &slp_op, &mask_dt, &mask_vectype))
 	return false;
+      if (slp_node && !vect_maybe_update_slp_op_vectype (slp_op, mask_vectype))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "incompatible vector types for invariants\n");
+	  return false;
+	}
     }
 
   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
-- 
2.36.3


^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH V3] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721]
  2023-10-13  4:29 [PATCH V3] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] Juzhe-Zhong
@ 2023-10-16 21:34 ` Richard Sandiford
  2023-10-17  6:46   ` juzhe.zhong
  0 siblings, 1 reply; 3+ messages in thread
From: Richard Sandiford @ 2023-10-16 21:34 UTC (permalink / raw)
  To: Juzhe-Zhong; +Cc: gcc-patches, rguenther

Juzhe-Zhong <juzhe.zhong@rivai.ai> writes:
> This patch fixes this following FAILs in RISC-V regression:
>
> FAIL: gcc.dg/vect/vect-gather-1.c -flto -ffat-lto-objects  scan-tree-dump vect "Loop contains only SLP stmts"
> FAIL: gcc.dg/vect/vect-gather-1.c scan-tree-dump vect "Loop contains only SLP stmts"
> FAIL: gcc.dg/vect/vect-gather-3.c -flto -ffat-lto-objects  scan-tree-dump vect "Loop contains only SLP stmts"
> FAIL: gcc.dg/vect/vect-gather-3.c scan-tree-dump vect "Loop contains only SLP stmts"
>
> The root cause of these FAIL is that GCC SLP failed on MASK_LEN_GATHER_LOAD.
>
> We have 2 following situations of scalar recognized MASK_LEN_GATHER_LOAD:
>
> 1. conditional gather load: MASK_LEN_GATHER_LOAD (base, offset, scale, zero, condtional mask).
>    
>    This situation we just need to leverage the current MASK_GATHER_LOAD which can achieve SLP MASK_LEN_GATHER_LOAD.
>
> 2. un-conditional gather load: MASK_LEN_GATHER_LOAD (base, offset, scale, zero, -1)
>    
>    Current SLP check will failed on dummy mask -1, so we relax the check in tree-vect-slp.cc and allow it to be materialized.
>     
> Consider this following case:
>
> void __attribute__((noipa))
> f (int *restrict y, int *restrict x, int *restrict indices, int n)
> {
>   for (int i = 0; i < n; ++i)
>     {
>       y[i * 2] = x[indices[i * 2]] + 1;
>       y[i * 2 + 1] = x[indices[i * 2 + 1]] + 2;
>     }
> }
>
> https://godbolt.org/z/WG3M3n7Mo
>
> GCC unable to SLP using VEC_LOAD_LANES/VEC_STORE_LANES:
>
> f:
>         ble     a3,zero,.L5
> .L3:
>         vsetvli a5,a3,e8,mf4,ta,ma
>         vsetvli zero,a5,e32,m1,ta,ma
>         vlseg2e32.v     v6,(a2)
>         vsetvli a4,zero,e64,m2,ta,ma
>         vsext.vf2       v2,v6
>         vsll.vi v2,v2,2
>         vsetvli zero,a5,e32,m1,ta,ma
>         vluxei64.v      v1,(a1),v2
>         vsetvli a4,zero,e64,m2,ta,ma
>         vsext.vf2       v2,v7
>         vsetvli zero,zero,e32,m1,ta,ma
>         vadd.vi v4,v1,1
>         vsetvli zero,zero,e64,m2,ta,ma
>         vsll.vi v2,v2,2
>         vsetvli zero,a5,e32,m1,ta,ma
>         vluxei64.v      v2,(a1),v2
>         vsetvli a4,zero,e32,m1,ta,ma
>         slli    a6,a5,3
>         vadd.vi v5,v2,2
>         sub     a3,a3,a5
>         vsetvli zero,a5,e32,m1,ta,ma
>         vsseg2e32.v     v4,(a0)
>         add     a2,a2,a6
>         add     a0,a0,a6
>         bne     a3,zero,.L3
> .L5:
>         ret
>
> After this patch:
>
> f:
> 	ble	a3,zero,.L5
> 	li	a5,1
> 	csrr	t1,vlenb
> 	slli	a5,a5,33
> 	srli	a7,t1,2
> 	addi	a5,a5,1
> 	slli	a3,a3,1
> 	neg	t3,a7
> 	vsetvli	a4,zero,e64,m1,ta,ma
> 	vmv.v.x	v4,a5
> .L3:
> 	minu	a5,a3,a7
> 	vsetvli	zero,a5,e32,m1,ta,ma
> 	vle32.v	v1,0(a2)
> 	vsetvli	a4,zero,e64,m2,ta,ma
> 	vsext.vf2	v2,v1
> 	vsll.vi	v2,v2,2
> 	vsetvli	zero,a5,e32,m1,ta,ma
> 	vluxei64.v	v2,(a1),v2
> 	vsetvli	a4,zero,e32,m1,ta,ma
> 	mv	a6,a3
> 	vadd.vv	v2,v2,v4
> 	vsetvli	zero,a5,e32,m1,ta,ma
> 	vse32.v	v2,0(a0)
> 	add	a2,a2,t1
> 	add	a0,a0,t1
> 	add	a3,a3,t3
> 	bgtu	a6,a7,.L3
> .L5:
> 	ret
>
> Note that I found we are missing conditional mask gather_load SLP test, Append a test for it in this patch.

Yeah, we're missing a target-independent test.  I'm afraid I used
aarch64-specific tests for a lot of this stuff, since (a) I wanted
to check the quality of the asm output and (b) it's very hard to write
gcc.dg/vect tests that don't fail on some target or other.  Thanks for
picking this up.

>
> Tested on RISC-V and Bootstrap && Regression on X86 passed.
>
> Ok for trunk ?
>
> gcc/ChangeLog:
>
> 	* tree-vect-slp.cc (vect_get_operand_map): Add MASK_LEN_GATHER_LOAD.
> 	(vect_get_and_check_slp_defs): Ditto.
> 	(vect_build_slp_tree_1): Ditto.
> 	(vect_build_slp_tree_2): Ditto.
> 	* tree-vect-stmts.cc (vectorizable_load): Ditto.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.dg/vect/vect-gather-6.c: New test.
>
> ---
>  gcc/testsuite/gcc.dg/vect/vect-gather-6.c | 15 +++++++++++++++
>  gcc/tree-vect-slp.cc                      | 22 ++++++++++++++++++----
>  gcc/tree-vect-stmts.cc                    | 10 +++++++++-
>  3 files changed, 42 insertions(+), 5 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-gather-6.c
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-gather-6.c b/gcc/testsuite/gcc.dg/vect/vect-gather-6.c
> new file mode 100644
> index 00000000000..ff55f321854
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-gather-6.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +
> +void
> +f (int *restrict y, int *restrict x, int *restrict indices, int *restrict cond, int n)
> +{
> +  for (int i = 0; i < n; ++i)
> +    {
> +      if (cond[i * 2])
> +	y[i * 2] = x[indices[i * 2]] + 1;
> +      if (cond[i * 2 + 1])
> +	y[i * 2 + 1] = x[indices[i * 2 + 1]] + 2;
> +    }
> +}
> +
> +/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" vect { target vect_gather_load_ifn } } } */
> diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> index fa098f9ff4e..38fe6ba6296 100644
> --- a/gcc/tree-vect-slp.cc
> +++ b/gcc/tree-vect-slp.cc
> @@ -542,6 +542,7 @@ vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
>  	    return arg1_map;
>  
>  	  case IFN_MASK_GATHER_LOAD:
> +	  case IFN_MASK_LEN_GATHER_LOAD:
>  	    return arg1_arg4_map;
>  
>  	  case IFN_MASK_STORE:
> @@ -700,8 +701,7 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
>  	{
>  	  tree type = TREE_TYPE (oprnd);
>  	  dt = dts[i];
> -	  if ((dt == vect_constant_def
> -	       || dt == vect_external_def)
> +	  if (dt == vect_external_def
>  	      && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
>  	      && (TREE_CODE (type) == BOOLEAN_TYPE
>  		  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
> @@ -713,6 +713,17 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
>  				 "for variable-length SLP %T\n", oprnd);
>  	      return -1;
>  	    }
> +	  if (dt == vect_constant_def
> +	      && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
> +	      && !can_duplicate_and_interleave_p (vinfo, stmts.length (), type))
> +	    {
> +	      if (dump_enabled_p ())
> +		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +				 "Build SLP failed: invalid type of def "
> +				 "for variable-length SLP %T\n",
> +				 oprnd);
> +	      return -1;
> +	    }
>  
>  	  /* For the swapping logic below force vect_reduction_def
>  	     for the reduction op in a SLP reduction group.  */
> @@ -1077,7 +1088,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
>  
>  	  if (cfn == CFN_MASK_LOAD
>  	      || cfn == CFN_GATHER_LOAD
> -	      || cfn == CFN_MASK_GATHER_LOAD)
> +	      || cfn == CFN_MASK_GATHER_LOAD
> +	      || cfn == CFN_MASK_LEN_GATHER_LOAD)
>  	    ldst_p = true;
>  	  else if (cfn == CFN_MASK_STORE)
>  	    {
> @@ -1337,6 +1349,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
>  	  if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
>  	      && rhs_code != CFN_GATHER_LOAD
>  	      && rhs_code != CFN_MASK_GATHER_LOAD
> +	      && rhs_code != CFN_MASK_LEN_GATHER_LOAD
>  	      /* Not grouped loads are handled as externals for BB
>  		 vectorization.  For loop vectorization we can handle
>  		 splats the same we handle single element interleaving.  */
> @@ -1837,7 +1850,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
>        if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
>  	gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
>  		    || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
> -		    || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
> +		    || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)
> +		    || gimple_call_internal_p (stmt, IFN_MASK_LEN_GATHER_LOAD));
>        else
>  	{
>  	  *max_nunits = this_max_nunits;
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index ce925cc1d53..994234eec08 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -9738,12 +9738,20 @@ vectorizable_load (vec_info *vinfo,
>  	return false;
>  
>        mask_index = internal_fn_mask_index (ifn);
> +      slp_tree slp_op = NULL;
>        if (mask_index >= 0 && slp_node)
>  	mask_index = vect_slp_child_index_for_operand (call, mask_index);
>        if (mask_index >= 0
>  	  && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index,
> -				      &mask, NULL, &mask_dt, &mask_vectype))
> +				      &mask, &slp_op, &mask_dt, &mask_vectype))
>  	return false;
> +      if (slp_node && !vect_maybe_update_slp_op_vectype (slp_op, mask_vectype))
> +	{
> +	  if (dump_enabled_p ())
> +	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +			     "incompatible vector types for invariants\n");
> +	  return false;
> +	}

slp_op and mask_vectype are only initialised when mask_index >= 0.
Shouldn't this code be under mask_index >= 0 too?

Also, when do we encounter mismatched mask_vectypes?  Presumably the SLP
node has a known vectype by this point.  I think a comment would be useful.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: Re: [PATCH V3] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721]
  2023-10-16 21:34 ` Richard Sandiford
@ 2023-10-17  6:46   ` juzhe.zhong
  0 siblings, 0 replies; 3+ messages in thread
From: juzhe.zhong @ 2023-10-17  6:46 UTC (permalink / raw)
  To: richard.sandiford; +Cc: gcc-patches, rguenther

[-- Attachment #1: Type: text/plain, Size: 10076 bytes --]

Hi, Richard.

>> slp_op and mask_vectype are only initialised when mask_index >= 0.
>>Shouldn't this code be under mask_index >= 0 too?
 
>>Also, when do we encounter mismatched mask_vectypes?  Presumably the SLP
>>node has a known vectype by this point.  I think a comment would be useful.

Address comment and I think we won't encounter mismatch mask_vectypes.

So, I changed code in V4 as follows:
+      if (mask_index >= 0 && slp_node)
+	{
+	  bool match_p
+	    = vect_maybe_update_slp_op_vectype (slp_op, mask_vectype);
+	  gcc_assert (match_p);
+	}

https://gcc.gnu.org/pipermail/gcc-patches/2023-October/633209.html 

Assert we always match mask_vectype.

Tested on RISC-V and bootstrap && regtest on X86 passed.

Could you confirm it ?


juzhe.zhong@rivai.ai
 
From: Richard Sandiford
Date: 2023-10-17 05:34
To: Juzhe-Zhong
CC: gcc-patches; rguenther
Subject: Re: [PATCH V3] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721]
Juzhe-Zhong <juzhe.zhong@rivai.ai> writes:
> This patch fixes this following FAILs in RISC-V regression:
>
> FAIL: gcc.dg/vect/vect-gather-1.c -flto -ffat-lto-objects  scan-tree-dump vect "Loop contains only SLP stmts"
> FAIL: gcc.dg/vect/vect-gather-1.c scan-tree-dump vect "Loop contains only SLP stmts"
> FAIL: gcc.dg/vect/vect-gather-3.c -flto -ffat-lto-objects  scan-tree-dump vect "Loop contains only SLP stmts"
> FAIL: gcc.dg/vect/vect-gather-3.c scan-tree-dump vect "Loop contains only SLP stmts"
>
> The root cause of these FAIL is that GCC SLP failed on MASK_LEN_GATHER_LOAD.
>
> We have 2 following situations of scalar recognized MASK_LEN_GATHER_LOAD:
>
> 1. conditional gather load: MASK_LEN_GATHER_LOAD (base, offset, scale, zero, condtional mask).
>    
>    This situation we just need to leverage the current MASK_GATHER_LOAD which can achieve SLP MASK_LEN_GATHER_LOAD.
>
> 2. un-conditional gather load: MASK_LEN_GATHER_LOAD (base, offset, scale, zero, -1)
>    
>    Current SLP check will failed on dummy mask -1, so we relax the check in tree-vect-slp.cc and allow it to be materialized.
>     
> Consider this following case:
>
> void __attribute__((noipa))
> f (int *restrict y, int *restrict x, int *restrict indices, int n)
> {
>   for (int i = 0; i < n; ++i)
>     {
>       y[i * 2] = x[indices[i * 2]] + 1;
>       y[i * 2 + 1] = x[indices[i * 2 + 1]] + 2;
>     }
> }
>
> https://godbolt.org/z/WG3M3n7Mo
>
> GCC unable to SLP using VEC_LOAD_LANES/VEC_STORE_LANES:
>
> f:
>         ble     a3,zero,.L5
> .L3:
>         vsetvli a5,a3,e8,mf4,ta,ma
>         vsetvli zero,a5,e32,m1,ta,ma
>         vlseg2e32.v     v6,(a2)
>         vsetvli a4,zero,e64,m2,ta,ma
>         vsext.vf2       v2,v6
>         vsll.vi v2,v2,2
>         vsetvli zero,a5,e32,m1,ta,ma
>         vluxei64.v      v1,(a1),v2
>         vsetvli a4,zero,e64,m2,ta,ma
>         vsext.vf2       v2,v7
>         vsetvli zero,zero,e32,m1,ta,ma
>         vadd.vi v4,v1,1
>         vsetvli zero,zero,e64,m2,ta,ma
>         vsll.vi v2,v2,2
>         vsetvli zero,a5,e32,m1,ta,ma
>         vluxei64.v      v2,(a1),v2
>         vsetvli a4,zero,e32,m1,ta,ma
>         slli    a6,a5,3
>         vadd.vi v5,v2,2
>         sub     a3,a3,a5
>         vsetvli zero,a5,e32,m1,ta,ma
>         vsseg2e32.v     v4,(a0)
>         add     a2,a2,a6
>         add     a0,a0,a6
>         bne     a3,zero,.L3
> .L5:
>         ret
>
> After this patch:
>
> f:
> ble a3,zero,.L5
> li a5,1
> csrr t1,vlenb
> slli a5,a5,33
> srli a7,t1,2
> addi a5,a5,1
> slli a3,a3,1
> neg t3,a7
> vsetvli a4,zero,e64,m1,ta,ma
> vmv.v.x v4,a5
> .L3:
> minu a5,a3,a7
> vsetvli zero,a5,e32,m1,ta,ma
> vle32.v v1,0(a2)
> vsetvli a4,zero,e64,m2,ta,ma
> vsext.vf2 v2,v1
> vsll.vi v2,v2,2
> vsetvli zero,a5,e32,m1,ta,ma
> vluxei64.v v2,(a1),v2
> vsetvli a4,zero,e32,m1,ta,ma
> mv a6,a3
> vadd.vv v2,v2,v4
> vsetvli zero,a5,e32,m1,ta,ma
> vse32.v v2,0(a0)
> add a2,a2,t1
> add a0,a0,t1
> add a3,a3,t3
> bgtu a6,a7,.L3
> .L5:
> ret
>
> Note that I found we are missing conditional mask gather_load SLP test, Append a test for it in this patch.
 
Yeah, we're missing a target-independent test.  I'm afraid I used
aarch64-specific tests for a lot of this stuff, since (a) I wanted
to check the quality of the asm output and (b) it's very hard to write
gcc.dg/vect tests that don't fail on some target or other.  Thanks for
picking this up.
 
>
> Tested on RISC-V and Bootstrap && Regression on X86 passed.
>
> Ok for trunk ?
>
> gcc/ChangeLog:
>
> * tree-vect-slp.cc (vect_get_operand_map): Add MASK_LEN_GATHER_LOAD.
> (vect_get_and_check_slp_defs): Ditto.
> (vect_build_slp_tree_1): Ditto.
> (vect_build_slp_tree_2): Ditto.
> * tree-vect-stmts.cc (vectorizable_load): Ditto.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/vect/vect-gather-6.c: New test.
>
> ---
>  gcc/testsuite/gcc.dg/vect/vect-gather-6.c | 15 +++++++++++++++
>  gcc/tree-vect-slp.cc                      | 22 ++++++++++++++++++----
>  gcc/tree-vect-stmts.cc                    | 10 +++++++++-
>  3 files changed, 42 insertions(+), 5 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-gather-6.c
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-gather-6.c b/gcc/testsuite/gcc.dg/vect/vect-gather-6.c
> new file mode 100644
> index 00000000000..ff55f321854
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-gather-6.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +
> +void
> +f (int *restrict y, int *restrict x, int *restrict indices, int *restrict cond, int n)
> +{
> +  for (int i = 0; i < n; ++i)
> +    {
> +      if (cond[i * 2])
> + y[i * 2] = x[indices[i * 2]] + 1;
> +      if (cond[i * 2 + 1])
> + y[i * 2 + 1] = x[indices[i * 2 + 1]] + 2;
> +    }
> +}
> +
> +/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" vect { target vect_gather_load_ifn } } } */
> diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> index fa098f9ff4e..38fe6ba6296 100644
> --- a/gcc/tree-vect-slp.cc
> +++ b/gcc/tree-vect-slp.cc
> @@ -542,6 +542,7 @@ vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
>      return arg1_map;
>  
>    case IFN_MASK_GATHER_LOAD:
> +   case IFN_MASK_LEN_GATHER_LOAD:
>      return arg1_arg4_map;
>  
>    case IFN_MASK_STORE:
> @@ -700,8 +701,7 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
>  {
>    tree type = TREE_TYPE (oprnd);
>    dt = dts[i];
> -   if ((dt == vect_constant_def
> -        || dt == vect_external_def)
> +   if (dt == vect_external_def
>        && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
>        && (TREE_CODE (type) == BOOLEAN_TYPE
>    || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
> @@ -713,6 +713,17 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
>  "for variable-length SLP %T\n", oprnd);
>        return -1;
>      }
> +   if (dt == vect_constant_def
> +       && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
> +       && !can_duplicate_and_interleave_p (vinfo, stmts.length (), type))
> +     {
> +       if (dump_enabled_p ())
> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> + "Build SLP failed: invalid type of def "
> + "for variable-length SLP %T\n",
> + oprnd);
> +       return -1;
> +     }
>  
>    /* For the swapping logic below force vect_reduction_def
>       for the reduction op in a SLP reduction group.  */
> @@ -1077,7 +1088,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
>  
>    if (cfn == CFN_MASK_LOAD
>        || cfn == CFN_GATHER_LOAD
> -       || cfn == CFN_MASK_GATHER_LOAD)
> +       || cfn == CFN_MASK_GATHER_LOAD
> +       || cfn == CFN_MASK_LEN_GATHER_LOAD)
>      ldst_p = true;
>    else if (cfn == CFN_MASK_STORE)
>      {
> @@ -1337,6 +1349,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
>    if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
>        && rhs_code != CFN_GATHER_LOAD
>        && rhs_code != CFN_MASK_GATHER_LOAD
> +       && rhs_code != CFN_MASK_LEN_GATHER_LOAD
>        /* Not grouped loads are handled as externals for BB
>  vectorization.  For loop vectorization we can handle
>  splats the same we handle single element interleaving.  */
> @@ -1837,7 +1850,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
>        if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
>  gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
>      || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
> -     || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
> +     || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)
> +     || gimple_call_internal_p (stmt, IFN_MASK_LEN_GATHER_LOAD));
>        else
>  {
>    *max_nunits = this_max_nunits;
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index ce925cc1d53..994234eec08 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -9738,12 +9738,20 @@ vectorizable_load (vec_info *vinfo,
>  return false;
>  
>        mask_index = internal_fn_mask_index (ifn);
> +      slp_tree slp_op = NULL;
>        if (mask_index >= 0 && slp_node)
>  mask_index = vect_slp_child_index_for_operand (call, mask_index);
>        if (mask_index >= 0
>    && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index,
> -       &mask, NULL, &mask_dt, &mask_vectype))
> +       &mask, &slp_op, &mask_dt, &mask_vectype))
>  return false;
> +      if (slp_node && !vect_maybe_update_slp_op_vectype (slp_op, mask_vectype))
> + {
> +   if (dump_enabled_p ())
> +     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +      "incompatible vector types for invariants\n");
> +   return false;
> + }
 
slp_op and mask_vectype are only initialised when mask_index >= 0.
Shouldn't this code be under mask_index >= 0 too?
 
Also, when do we encounter mismatched mask_vectypes?  Presumably the SLP
node has a known vectype by this point.  I think a comment would be useful.
 
Thanks,
Richard
 

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2023-10-17  6:46 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-10-13  4:29 [PATCH V3] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] Juzhe-Zhong
2023-10-16 21:34 ` Richard Sandiford
2023-10-17  6:46   ` juzhe.zhong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).