[PATCH] RISC-V: Teach liveness computation loop invariant shift amount[Dynamic LMUL]

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PATCH] RISC-V: Teach liveness computation loop invariant shift amount[Dynamic LMUL]
@ 2024-01-05  1:53 Juzhe-Zhong
  2024-01-05  9:54 ` Robin Dapp
  0 siblings, 1 reply; 5+ messages in thread
From: Juzhe-Zhong @ 2024-01-05  1:53 UTC (permalink / raw)
  To: gcc-patches; +Cc: kito.cheng, kito.cheng, jeffreyalaw, rdapp.gcc, Juzhe-Zhong

1). We have vashl_optab, vashr_optab and vlshr_optab, which vectorize shifts with a vector shift amount,
that is, for the vectorization of 'a[i] >> x[i]', where the shift amount is loop variant.
2). We also have ashl_optab, ashr_optab and lshr_optab, which can vectorize shifts with a scalar shift amount,
that is, for the vectorization of 'a[i] >> x', where the shift amount is loop invariant.

For case 2), we don't need to allocate a vector register group for the shift amount.

So consider the following case:

void
f (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int x,
   int n)
{
  for (int i = 0; i < n; i++)
    {
      int tmp = b[i] >> x;
      int tmp2 = tmp * b[i];
      c[i] = tmp2 * b[i];
      d[i] = tmp * tmp2 * b[i] >> x;
    }
}

Before this patch, we chose LMUL = 4; after this patch, we can choose LMUL = 8:

f:
	ble	a5,zero,.L5
.L3:
	vsetvli	a0,a5,e32,m8,ta,ma
	slli	a6,a0,2
	vle32.v	v16,0(a1)
	vsra.vx	v24,v16,a4
	vmul.vv	v8,v24,v16
	vmul.vv	v0,v8,v16
	vse32.v	v0,0(a2)
	vmul.vv	v8,v8,v24
	vmul.vv	v8,v8,v16
	vsra.vx	v8,v8,a4
	vse32.v	v8,0(a3)
	add	a1,a1,a6
	add	a2,a2,a6
	add	a3,a3,a6
	sub	a5,a5,a0
	bne	a5,zero,.L3
.L5:
	ret

Tested on both RV32 and RV64 with no regressions.  OK for trunk?

Note that we will apply the same heuristic to vadd.vx, etc. once the late-combine pass from
Richard Sandiford is committed (since we need the late-combine pass to do the vv->vx transformation for vadd).

gcc/ChangeLog:

	* config/riscv/riscv-vector-costs.cc (loop_invariant_op_p): New function.
	(variable_vectorized_p): Teach loop invariant.
	(has_unexpected_spills_p): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-12.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-14.c: New test.

---
 gcc/config/riscv/riscv-vector-costs.cc        | 31 +++++++--
 .../costmodel/riscv/rvv/dynamic-lmul4-12.c    | 40 ++++++++++++
 .../costmodel/riscv/rvv/dynamic-lmul8-14.c    | 64 +++++++++++++++++++
 3 files changed, 131 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-12.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-14.c

diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index ec8156fbaf8..00b0b4d64b9 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -230,9 +230,24 @@ get_biggest_mode (machine_mode mode1, machine_mode mode2)
   return mode1_size >= mode2_size ? mode1 : mode2;
 }
 
+/* Return true if OP is invariant.  */
+
+static bool
+loop_invariant_op_p (class loop *loop,
+		     tree op)
+{
+  if (is_gimple_min_invariant (op))
+    return true;
+  if (SSA_NAME_IS_DEFAULT_DEF (op)
+      || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (op))))
+    return true;
+  return gimple_uid (SSA_NAME_DEF_STMT (op)) & 1;
+}
+
 /* Return true if the variable should be counted into liveness.  */
 static bool
-variable_vectorized_p (stmt_vec_info stmt_info, tree var, bool lhs_p)
+variable_vectorized_p (class loop *loop, stmt_vec_info stmt_info, tree var,
+		       bool lhs_p)
 {
   if (!var)
     return false;
@@ -275,6 +290,10 @@ variable_vectorized_p (stmt_vec_info stmt_info, tree var, bool lhs_p)
 		 || !tree_fits_shwi_p (var)
 		 || !IN_RANGE (tree_to_shwi (var), -16, 15)
 		 || gimple_assign_rhs1 (stmt) != var;
+	case LSHIFT_EXPR:
+	case RSHIFT_EXPR:
+	  return gimple_assign_rhs2 (stmt) != var
+		 || !loop_invariant_op_p (loop, var);
 	default:
 	  break;
 	}
@@ -312,10 +331,12 @@ variable_vectorized_p (stmt_vec_info stmt_info, tree var, bool lhs_p)
    The live range of SSA 2 is [0, 4] in bb 3.  */
 static machine_mode
 compute_local_live_ranges (
+  loop_vec_info loop_vinfo,
   const hash_map<basic_block, vec<stmt_point>> &program_points_per_bb,
   hash_map<basic_block, hash_map<tree, pair>> &live_ranges_per_bb)
 {
   machine_mode biggest_mode = QImode;
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   if (!program_points_per_bb.is_empty ())
     {
       auto_vec<tree> visited_vars;
@@ -339,7 +360,8 @@ compute_local_live_ranges (
 	      unsigned int point = program_point.point;
 	      gimple *stmt = program_point.stmt;
 	      tree lhs = gimple_get_lhs (stmt);
-	      if (variable_vectorized_p (program_point.stmt_info, lhs, true))
+	      if (variable_vectorized_p (loop, program_point.stmt_info, lhs,
+					 true))
 		{
 		  biggest_mode = get_biggest_mode (biggest_mode,
 						   TYPE_MODE (TREE_TYPE (lhs)));
@@ -356,7 +378,7 @@ compute_local_live_ranges (
 	      for (i = 0; i < gimple_num_args (stmt); i++)
 		{
 		  tree var = gimple_arg (stmt, i);
-		  if (variable_vectorized_p (program_point.stmt_info, var,
+		  if (variable_vectorized_p (loop, program_point.stmt_info, var,
 					     false))
 		    {
 		      biggest_mode
@@ -781,7 +803,8 @@ has_unexpected_spills_p (loop_vec_info loop_vinfo)
   /* Compute local live ranges.  */
   hash_map<basic_block, hash_map<tree, pair>> live_ranges_per_bb;
   machine_mode biggest_mode
-    = compute_local_live_ranges (program_points_per_bb, live_ranges_per_bb);
+    = compute_local_live_ranges (loop_vinfo, program_points_per_bb,
+				 live_ranges_per_bb);
 
   /* Update live ranges according to PHI.  */
   update_local_live_ranges (loop_vinfo, program_points_per_bb,
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-12.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-12.c
new file mode 100644
index 00000000000..0cb492e611c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-12.c
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param riscv-autovec-lmul=dynamic -fdump-tree-vect-details" } */
+
+void
+f (int *restrict a, int *restrict b, int *restrict c, int *restrict d,
+   int *restrict x, int n)
+{
+  for (int i = 0; i < n; i++)
+    {
+      int tmp = b[i] >> x[i];
+      int tmp2 = tmp * b[i];
+      c[i] = tmp2 * b[i];
+      d[i] = tmp * tmp2 * b[i] >> x[i];
+    }
+}
+
+void
+f2 (int *restrict a, int *restrict b, int *restrict c, int *restrict d,
+    int *restrict x, int n)
+{
+  for (int i = 0; i < n; i++)
+    {
+      int tmp = b[i] << x[i];
+      int tmp2 = tmp * b[i];
+      c[i] = tmp2 * b[i];
+      d[i] = tmp * tmp2 * b[i] >> x[i];
+    }
+}
+
+/* { dg-final { scan-assembler-times {e32,m4} 2 } } */
+/* { dg-final { scan-assembler-not {csrr} } } */
+/* { dg-final { scan-assembler-not {jr} } } */
+/* { dg-final { scan-assembler-not {e32,m8} } } */
+/* { dg-final { scan-assembler-not {e32,m2} } } */
+/* { dg-final { scan-assembler-not {e32,m1} } } */
+/* { dg-final { scan-assembler-times {ret} 2 } } */
+/* { dg-final { scan-tree-dump-times "Preferring smaller LMUL loop because it has unexpected spills" 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 8" 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 4" 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 2" 2 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-14.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-14.c
new file mode 100644
index 00000000000..0d42c3b27cb
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-14.c
@@ -0,0 +1,64 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param riscv-autovec-lmul=dynamic -fdump-tree-vect-details" } */
+
+void
+f (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int x,
+   int n)
+{
+  for (int i = 0; i < n; i++)
+    {
+      int tmp = b[i] >> x;
+      int tmp2 = tmp * b[i];
+      c[i] = tmp2 * b[i];
+      d[i] = tmp * tmp2 * b[i] >> x;
+    }
+}
+
+void
+f2 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int x,
+    int n)
+{
+  for (int i = 0; i < n; i++)
+    {
+      int tmp = b[i] << x;
+      int tmp2 = tmp * b[i];
+      c[i] = tmp2 * b[i];
+      d[i] = tmp * tmp2 * b[i] >> x;
+    }
+}
+
+void
+f3 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n)
+{
+  for (int i = 0; i < n; i++)
+    {
+      int tmp = b[i] >> 17;
+      int tmp2 = tmp * b[i];
+      c[i] = tmp2 * b[i];
+      d[i] = tmp * tmp2 * b[i] >> 17;
+    }
+}
+
+void
+f4 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n)
+{
+  for (int i = 0; i < n; i++)
+    {
+      int tmp = b[i] << 17;
+      int tmp2 = tmp * b[i];
+      c[i] = tmp2 * b[i];
+      d[i] = tmp * tmp2 * b[i] >> 17;
+    }
+}
+
+/* { dg-final { scan-assembler-times {e32,m8} 4 } } */
+/* { dg-final { scan-assembler-not {csrr} } } */
+/* { dg-final { scan-assembler-not {jr} } } */
+/* { dg-final { scan-assembler-not {e32,m4} } } */
+/* { dg-final { scan-assembler-not {e32,m2} } } */
+/* { dg-final { scan-assembler-not {e32,m1} } } */
+/* { dg-final { scan-assembler-times {ret} 4 } } */
+/* { dg-final { scan-tree-dump-not "Preferring smaller LMUL loop because it has unexpected spills" "vect" } } */
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 8" 4 "vect" } } */
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 4" 4 "vect" } } */
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 2" 4 "vect" } } */
-- 
2.36.3


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] RISC-V: Teach liveness computation loop invariant shift amount[Dynamic LMUL]
  2024-01-05  1:53 [PATCH] RISC-V: Teach liveness computation loop invariant shift amount[Dynamic LMUL] Juzhe-Zhong
@ 2024-01-05  9:54 ` Robin Dapp
  2024-01-06  2:13   ` 钟居哲
  0 siblings, 1 reply; 5+ messages in thread
From: Robin Dapp @ 2024-01-05  9:54 UTC (permalink / raw)
  To: Juzhe-Zhong, gcc-patches; +Cc: rdapp.gcc, kito.cheng, kito.cheng, jeffreyalaw

> 1). We not only have vashl_optab,vashr_optab,vlshr_optab which vectorize shift with vector shift amount,
> that is, vectorization of 'a[i] >> x[i]', the shift amount is loop variant.
> 2). But also, we have ashl_optab, ashr_optab, lshr_optab which can vectorize shift with scalar shift amount,
> that is, vectorization of 'a[i] >> x', the shift amount is loop invariant.
> 

> +static bool
> +loop_invariant_op_p (class loop *loop,
> +		     tree op)
> +{
> +  if (is_gimple_min_invariant (op))
> +    return true;
> +  if (SSA_NAME_IS_DEFAULT_DEF (op)
> +      || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (op))))
> +    return true;
> +  return gimple_uid (SSA_NAME_DEF_STMT (op)) & 1;
> +}
> +

Looks like this is straight from tree-ssa-loop-ch.  Do we need
is_gimple_min_invariant (is_gimple_constant could be sufficient?)
and DEFAULT_DEF for our case?  The rhs of a shift should never contain
a default def?

I'm not entirely happy about the "loop invariant" heuristic/proxy
of the shift amount being vectorizable.  That seems like something
that could bite us in the future in case we do slp-like vectorization
on loop-invariant (but varying) data.

As it helps for now and is not a correctness issue I'd still tend to
go forward with it.

Regards
 Robin

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: Re: [PATCH] RISC-V: Teach liveness computation loop invariant shift amount[Dynamic LMUL]
  2024-01-05  9:54 ` Robin Dapp
@ 2024-01-06  2:13   ` 钟居哲
  2024-01-08 16:45     ` Robin Dapp
  0 siblings, 1 reply; 5+ messages in thread
From: 钟居哲 @ 2024-01-06  2:13 UTC (permalink / raw)
  To: rdapp.gcc, gcc-patches; +Cc: rdapp.gcc, kito.cheng, kito.cheng, Jeff Law

[-- Attachment #1: Type: text/plain, Size: 1670 bytes --]

Thanks Robin.

is_gimple_constant makes more sense. Committed after addressing your comments.



juzhe.zhong@rivai.ai
 
From: Robin Dapp
Date: 2024-01-05 17:54
To: Juzhe-Zhong; gcc-patches
CC: rdapp.gcc; kito.cheng; kito.cheng; jeffreyalaw
Subject: Re: [PATCH] RISC-V: Teach liveness computation loop invariant shift amount[Dynamic LMUL]
> 1). We not only have vashl_optab,vashr_optab,vlshr_optab which vectorize shift with vector shift amount,
> that is, vectorization of 'a[i] >> x[i]', the shift amount is loop variant.
> 2). But also, we have ashl_optab, ashr_optab, lshr_optab which can vectorize shift with scalar shift amount,
> that is, vectorization of 'a[i] >> x', the shift amount is loop invariant.
> 
 
> +static bool
> +loop_invariant_op_p (class loop *loop,
> +      tree op)
> +{
> +  if (is_gimple_min_invariant (op))
> +    return true;
> +  if (SSA_NAME_IS_DEFAULT_DEF (op)
> +      || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (op))))
> +    return true;
> +  return gimple_uid (SSA_NAME_DEF_STMT (op)) & 1;
> +}
> +
 
Looks like this is straight from tree-ssa-loop-ch.  Do we need
is_gimple_min_invariant (is_gimple_constant could be sufficient?)
and DEFAULT_DEF for our case?  The rhs of a shift should never contain
a default def?
 
I'm not entirely happy about the "loop invariant" heuristic/proxy
of the shift amount being vectorizable.  That seems like something
that could bite us in the future in case we do slp-like vectorization
on loop-invariant (but varying) data.
 
As it helps for now and is not a correctness issue I'd still tend to
go forward with it.
 
Regards
Robin
 

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] RISC-V: Teach liveness computation loop invariant shift amount[Dynamic LMUL]
  2024-01-06  2:13   ` 钟居哲
@ 2024-01-08 16:45     ` Robin Dapp
  2024-01-09  1:25       ` juzhe.zhong
  0 siblings, 1 reply; 5+ messages in thread
From: Robin Dapp @ 2024-01-08 16:45 UTC (permalink / raw)
  To: 钟居哲, gcc-patches
  Cc: rdapp.gcc, kito.cheng, kito.cheng, Jeff Law

>     > +  if (is_gimple_min_invariant (op))
>     > +    return true;
>     > +  if (SSA_NAME_IS_DEFAULT_DEF (op)
>     > +      || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (op))))
>     > +    return true;
>     > +  return gimple_uid (SSA_NAME_DEF_STMT (op)) & 1;
>     > +}
>     > +

Does gimple_uid ever return something useful for us here?
In tree-ssa-loop-ch it is being populated
before and then used but I don't think we populate it properly?

So my question would be, isn't is_gimple_constant and
flow_bb_inside_loop_p sufficient for our purpose?

Regards
 Robin

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: Re: [PATCH] RISC-V: Teach liveness computation loop invariant shift amount[Dynamic LMUL]
  2024-01-08 16:45     ` Robin Dapp
@ 2024-01-09  1:25       ` juzhe.zhong
  0 siblings, 0 replies; 5+ messages in thread
From: juzhe.zhong @ 2024-01-09  1:25 UTC (permalink / raw)
  To: Robin Dapp, gcc-patches; +Cc: Robin Dapp, kito.cheng, Kito.cheng, jeffreyalaw

[-- Attachment #1: Type: text/plain, Size: 975 bytes --]

Yes, that is sufficient. I have sent a patch:
https://gcc.gnu.org/pipermail/gcc-patches/2024-January/642216.html 




juzhe.zhong@rivai.ai
 
From: Robin Dapp
Date: 2024-01-09 00:45
To: 钟居哲; gcc-patches
CC: rdapp.gcc; kito.cheng; kito.cheng; Jeff Law
Subject: Re: [PATCH] RISC-V: Teach liveness computation loop invariant shift amount[Dynamic LMUL]
>     > +  if (is_gimple_min_invariant (op))
>     > +    return true;
>     > +  if (SSA_NAME_IS_DEFAULT_DEF (op)
>     > +      || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (op))))
>     > +    return true;
>     > +  return gimple_uid (SSA_NAME_DEF_STMT (op)) & 1;
>     > +}
>     > +
 
Does gimple_uid ever return something useful for us here?
In tree-ssa-loop-ch it is being populated
before and then used but I don't think we populate it properly?
 
So my question would be, isn't is_gimple_constant and
flow_bb_inside_loop_p sufficient for our purpose?
 
Regards
Robin
 

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2024-01-09  1:25 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-01-05  1:53 [PATCH] RISC-V: Teach liveness computation loop invariant shift amount[Dynamic LMUL] Juzhe-Zhong
2024-01-05  9:54 ` Robin Dapp
2024-01-06  2:13   ` 钟居哲
2024-01-08 16:45     ` Robin Dapp
2024-01-09  1:25       ` juzhe.zhong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).