public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] RISC-V: Make PHI initial value occupy live V_REG in dynamic LMUL cost model analysis
@ 2023-12-22  9:51 Juzhe-Zhong
  2023-12-22 16:58 ` Jeff Law
  0 siblings, 1 reply; 3+ messages in thread
From: Juzhe-Zhong @ 2023-12-22  9:51 UTC (permalink / raw)
  To: gcc-patches; +Cc: kito.cheng, kito.cheng, jeffreyalaw, rdapp.gcc, Juzhe-Zhong

Consider the following case:

foo:
        ble     a0,zero,.L11
        lui     a2,%hi(.LANCHOR0)
        addi    sp,sp,-128
        addi    a2,a2,%lo(.LANCHOR0)
        mv      a1,a0
        vsetvli a6,zero,e32,m8,ta,ma
        vid.v   v8
        vs8r.v  v8,0(sp)                     ---> spill
.L3:
        vl8re32.v       v16,0(sp)            ---> reload
        vsetvli a4,a1,e8,m2,ta,ma
        li      a3,0
        vsetvli a5,zero,e32,m8,ta,ma
        vmv8r.v v0,v16
        vmv.v.x v8,a4
        vmv.v.i v24,0
        vadd.vv v8,v16,v8
        vmv8r.v v16,v24
        vs8r.v  v8,0(sp)                    ---> spill
.L4:
        addiw   a3,a3,1
        vadd.vv v8,v0,v16
        vadd.vi v16,v16,1
        vadd.vv v24,v24,v8
        bne     a0,a3,.L4
        vsetvli zero,a4,e32,m8,ta,ma
        sub     a1,a1,a4
        vse32.v v24,0(a2)
        slli    a4,a4,2
        add     a2,a2,a4
        bne     a1,zero,.L3
        li      a0,0
        addi    sp,sp,128
        jr      ra
.L11:
        li      a0,0
        ret

The cost model unexpectedly picks LMUL = 8, which leads to the vector register spills and reloads marked above.

The root cause is that we didn't include the PHI initial value in the dynamic LMUL calculation:

  # j_17 = PHI <j_11(9), 0(5)>                       ---> # vect_vec_iv_.8_24 = PHI <_25(9), { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }(5)>

We didn't count { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } as consuming a vector register, but a vector register group does get allocated for it.

This patch fixes the missing count. With this patch applied, we pick the ideal LMUL (LMUL = M4):

foo:
	ble	a0,zero,.L9
	lui	a4,%hi(.LANCHOR0)
	addi	a4,a4,%lo(.LANCHOR0)
	mv	a2,a0
	vsetivli	zero,16,e32,m4,ta,ma
	vid.v	v20
.L3:
	vsetvli	a3,a2,e8,m1,ta,ma
	li	a5,0
	vsetivli	zero,16,e32,m4,ta,ma
	vmv4r.v	v16,v20
	vmv.v.i	v12,0
	vmv.v.x	v4,a3
	vmv4r.v	v8,v12
	vadd.vv	v20,v20,v4
.L4:
	addiw	a5,a5,1
	vmv4r.v	v4,v8
	vadd.vi	v8,v8,1
	vadd.vv	v4,v16,v4
	vadd.vv	v12,v12,v4
	bne	a0,a5,.L4
	slli	a5,a3,2
	vsetvli	zero,a3,e32,m4,ta,ma
	sub	a2,a2,a3
	vse32.v	v12,0(a4)
	add	a4,a4,a5
	bne	a2,zero,.L3
.L9:
	li	a0,0
	ret

Tested on --with-arch=gcv with no regressions. OK for trunk?

	PR target/113112

gcc/ChangeLog:

	* config/riscv/riscv-vector-costs.cc (max_number_of_live_regs): Refine dump information.
	(preferred_new_lmul_p): Make PHI initial value into live regs calculation.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c: New test.

---
 gcc/config/riscv/riscv-vector-costs.cc        | 45 ++++++++++++++++---
 .../vect/costmodel/riscv/rvv/pr113112-1.c     | 31 +++++++++++++
 2 files changed, 71 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c

diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index a316603e207..2d4b82a643a 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -355,10 +355,11 @@ max_number_of_live_regs (const basic_block bb,
     }
 
   if (dump_enabled_p ())
-    dump_printf_loc (MSG_NOTE, vect_location,
-		     "Maximum lmul = %d, %d number of live V_REG at program "
-		     "point %d for bb %d\n",
-		     lmul, max_nregs, live_point, bb->index);
+    dump_printf_loc (
+      MSG_NOTE, vect_location,
+      "Maximum lmul = %d, At most %d number of live V_REG at program "
+      "point %d for bb %d\n",
+      lmul, max_nregs, live_point, bb->index);
   return max_nregs;
 }
 
@@ -472,6 +473,41 @@ update_local_live_ranges (
 	      tree def = gimple_phi_arg_def (phi, j);
 	      auto *live_ranges = live_ranges_per_bb.get (bb);
 	      auto *live_range = live_ranges->get (def);
+	      if (poly_int_tree_p (def))
+		{
+		  /* Insert live range of INTEGER_CST since we will need to
+		     allocate a vector register for it.
+
+		     E.g. # j_17 = PHI <j_11(9), 0(5)> will be transformed
+		     into # vect_vec_iv_.8_24 = PHI <_25(9), { 0, ... }(5)>
+
+		     The live range for such value is short which only lives
+		     at program point 0.  */
+		  if (live_range)
+		    {
+		      unsigned int start = (*live_range).first;
+		      (*live_range).first = 0;
+		      if (dump_enabled_p ())
+			dump_printf_loc (
+			  MSG_NOTE, vect_location,
+			  "Update %T start point from %d to 0:\n", def, start);
+		    }
+		  else
+		    {
+		      live_ranges->put (def, pair (0, 1));
+		      auto &program_points = (*program_points_per_bb.get (bb));
+		      if (program_points.is_empty ())
+			{
+			  stmt_point info = {1, phi};
+			  program_points.safe_push (info);
+			}
+		      if (dump_enabled_p ())
+			dump_printf_loc (MSG_NOTE, vect_location,
+					 "Add %T start point from 0 to 1:\n",
+					 def);
+		    }
+		  continue;
+		}
 	      if (live_range && flow_bb_inside_loop_p (loop, e->src))
 		{
 		  unsigned int start = (*live_range).first;
@@ -580,7 +616,6 @@ preferred_new_lmul_p (loop_vec_info other_loop_vinfo)
 				       biggest_mode, lmul);
 	  if (nregs > max_nregs)
 	    max_nregs = nregs;
-	  live_ranges_per_bb.empty ();
 	}
       live_ranges_per_bb.empty ();
       return max_nregs > V_REG_NUM;
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c
new file mode 100644
index 00000000000..a44a1c041af
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param riscv-autovec-lmul=dynamic --param riscv-autovec-preference=fixed-vlmax -fdump-tree-vect-details" } */
+
+#define N 40
+
+int a[N];
+
+__attribute__ ((noinline)) int
+foo (int n){
+  int i,j;
+  int sum,x;
+
+  for (i = 0; i < n; i++) {
+    sum = 0;
+    for (j = 0; j < n; j++) {
+      sum += (i + j);
+    }
+    a[i] = sum;
+  }
+  return 0;
+}
+
+/* { dg-final { scan-assembler-not {jr} } } */
+/* { dg-final { scan-assembler-times {ret} 1 } } */
+/* { dg-final { scan-tree-dump "Maximum lmul = 8" "vect" } } */
+/* { dg-final { scan-tree-dump "Maximum lmul = 4" "vect" } } */
+/* { dg-final { scan-tree-dump-not "Maximum lmul = 2" "vect" } } */
+/* { dg-final { scan-tree-dump-not "Maximum lmul = 1" "vect" } } */
+/* { dg-final { scan-tree-dump "At most 8 number of live V_REG at program point 0 for bb 4" "vect" } } */
+/* { dg-final { scan-tree-dump "At most 40 number of live V_REG at program point 0 for bb 3" "vect" } } */
+/* { dg-final { scan-tree-dump "At most 8 number of live V_REG at program point 0 for bb 5" "vect" } } */
-- 
2.36.1


^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] RISC-V: Make PHI initial value occupy live V_REG in dynamic LMUL cost model analysis
  2023-12-22  9:51 [PATCH] RISC-V: Make PHI initial value occupy live V_REG in dynamic LMUL cost model analysis Juzhe-Zhong
@ 2023-12-22 16:58 ` Jeff Law
  2023-12-22 23:08   ` 钟居哲
  0 siblings, 1 reply; 3+ messages in thread
From: Jeff Law @ 2023-12-22 16:58 UTC (permalink / raw)
  To: Juzhe-Zhong, gcc-patches; +Cc: kito.cheng, kito.cheng, rdapp.gcc



On 12/22/23 02:51, Juzhe-Zhong wrote:
> Consider this following case:
> 
> foo:
>          ble     a0,zero,.L11
>          lui     a2,%hi(.LANCHOR0)
>          addi    sp,sp,-128
>          addi    a2,a2,%lo(.LANCHOR0)
>          mv      a1,a0
>          vsetvli a6,zero,e32,m8,ta,ma
>          vid.v   v8
>          vs8r.v  v8,0(sp)                     ---> spill
> .L3:
>          vl8re32.v       v16,0(sp)            ---> reload
>          vsetvli a4,a1,e8,m2,ta,ma
>          li      a3,0
>          vsetvli a5,zero,e32,m8,ta,ma
>          vmv8r.v v0,v16
>          vmv.v.x v8,a4
>          vmv.v.i v24,0
>          vadd.vv v8,v16,v8
>          vmv8r.v v16,v24
>          vs8r.v  v8,0(sp)                    ---> spill
> .L4:
>          addiw   a3,a3,1
>          vadd.vv v8,v0,v16
>          vadd.vi v16,v16,1
>          vadd.vv v24,v24,v8
>          bne     a0,a3,.L4
>          vsetvli zero,a4,e32,m8,ta,ma
>          sub     a1,a1,a4
>          vse32.v v24,0(a2)
>          slli    a4,a4,2
>          add     a2,a2,a4
>          bne     a1,zero,.L3
>          li      a0,0
>          addi    sp,sp,128
>          jr      ra
> .L11:
>          li      a0,0
>          ret
> 
> Pick unexpected LMUL = 8.
> 
> The root cause is we didn't involve PHI initial value in the dynamic LMUL calculation:
> 
>    # j_17 = PHI <j_11(9), 0(5)>                       ---> # vect_vec_iv_.8_24 = PHI <_25(9), { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }(5)>
> 
> We didn't count { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } in consuming vector register but it does allocate an vector register group for it.
Yup.  There's analogues in the scalar space.  Depending on the context 
we might consider the value live on the edge, at the end of e->src or at 
the start of e->dest.

In the scalar space we commonly have multiple constant values and we try 
to account for them as best as we can as each distinct constant can 
result in a constant load.  We also try to find pseudos that happen to 
already have the value we want so that they participate in the 
coalescing process.  I doubt either of these cases are particularly 
important for vector though.


> 
> This patch fixes this missing count. Then after this patch we pick up perfect LMUL (LMUL = M4)
> 
> foo:
> 	ble	a0,zero,.L9
> 	lui	a4,%hi(.LANCHOR0)
> 	addi	a4,a4,%lo(.LANCHOR0)
> 	mv	a2,a0
> 	vsetivli	zero,16,e32,m4,ta,ma
> 	vid.v	v20
> .L3:
> 	vsetvli	a3,a2,e8,m1,ta,ma
> 	li	a5,0
> 	vsetivli	zero,16,e32,m4,ta,ma
> 	vmv4r.v	v16,v20
> 	vmv.v.i	v12,0
> 	vmv.v.x	v4,a3
> 	vmv4r.v	v8,v12
> 	vadd.vv	v20,v20,v4
> .L4:
> 	addiw	a5,a5,1
> 	vmv4r.v	v4,v8
> 	vadd.vi	v8,v8,1
> 	vadd.vv	v4,v16,v4
> 	vadd.vv	v12,v12,v4
> 	bne	a0,a5,.L4
> 	slli	a5,a3,2
> 	vsetvli	zero,a3,e32,m4,ta,ma
> 	sub	a2,a2,a3
> 	vse32.v	v12,0(a4)
> 	add	a4,a4,a5
> 	bne	a2,zero,.L3
> .L9:
> 	li	a0,0
> 	ret
> 
> Tested on --with-arch=gcv no regression. Ok for trunk ?
> 
> 	PR target/113112
> 
> gcc/ChangeLog:
> 
> 	* config/riscv/riscv-vector-costs.cc (max_number_of_live_regs): Refine dump information.
> 	(preferred_new_lmul_p): Make PHI initial value into live regs calculation.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c: New test.
OK assuming you've done the necessary regression testing.

jeff

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: Re: [PATCH] RISC-V: Make PHI initial value occupy live V_REG in dynamic LMUL cost model analysis
  2023-12-22 16:58 ` Jeff Law
@ 2023-12-22 23:08   ` 钟居哲
  0 siblings, 0 replies; 3+ messages in thread
From: 钟居哲 @ 2023-12-22 23:08 UTC (permalink / raw)
  To: Jeff Law, gcc-patches; +Cc: kito.cheng, kito.cheng, rdapp.gcc

[-- Attachment #1: Type: text/plain, Size: 3734 bytes --]

Committed. Thanks Jeff.



juzhe.zhong@rivai.ai
 
From: Jeff Law
Date: 2023-12-23 00:58
To: Juzhe-Zhong; gcc-patches
CC: kito.cheng; kito.cheng; rdapp.gcc
Subject: Re: [PATCH] RISC-V: Make PHI initial value occupy live V_REG in dynamic LMUL cost model analysis
 
 
On 12/22/23 02:51, Juzhe-Zhong wrote:
> Consider this following case:
> 
> foo:
>          ble     a0,zero,.L11
>          lui     a2,%hi(.LANCHOR0)
>          addi    sp,sp,-128
>          addi    a2,a2,%lo(.LANCHOR0)
>          mv      a1,a0
>          vsetvli a6,zero,e32,m8,ta,ma
>          vid.v   v8
>          vs8r.v  v8,0(sp)                     ---> spill
> .L3:
>          vl8re32.v       v16,0(sp)            ---> reload
>          vsetvli a4,a1,e8,m2,ta,ma
>          li      a3,0
>          vsetvli a5,zero,e32,m8,ta,ma
>          vmv8r.v v0,v16
>          vmv.v.x v8,a4
>          vmv.v.i v24,0
>          vadd.vv v8,v16,v8
>          vmv8r.v v16,v24
>          vs8r.v  v8,0(sp)                    ---> spill
> .L4:
>          addiw   a3,a3,1
>          vadd.vv v8,v0,v16
>          vadd.vi v16,v16,1
>          vadd.vv v24,v24,v8
>          bne     a0,a3,.L4
>          vsetvli zero,a4,e32,m8,ta,ma
>          sub     a1,a1,a4
>          vse32.v v24,0(a2)
>          slli    a4,a4,2
>          add     a2,a2,a4
>          bne     a1,zero,.L3
>          li      a0,0
>          addi    sp,sp,128
>          jr      ra
> .L11:
>          li      a0,0
>          ret
> 
> Pick unexpected LMUL = 8.
> 
> The root cause is we didn't involve PHI initial value in the dynamic LMUL calculation:
> 
>    # j_17 = PHI <j_11(9), 0(5)>                       ---> # vect_vec_iv_.8_24 = PHI <_25(9), { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }(5)>
> 
> We didn't count { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } in consuming vector register but it does allocate an vector register group for it.
Yup.  There's analogues in the scalar space.  Depending on the context 
we might consider the value live on the edge, at the end of e->src or at 
the start of e->dest.
 
In the scalar space we commonly have multiple constant values and we try 
to account for them as best as we can as each distinct constant can 
result in a constant load.  We also try to find pseudos that happen to 
already have the value we want so that they participate in the 
coalescing process.  I doubt either of these cases are particularly 
important for vector though.
 
 
> 
> This patch fixes this missing count. Then after this patch we pick up perfect LMUL (LMUL = M4)
> 
> foo:
> ble a0,zero,.L9
> lui a4,%hi(.LANCHOR0)
> addi a4,a4,%lo(.LANCHOR0)
> mv a2,a0
> vsetivli zero,16,e32,m4,ta,ma
> vid.v v20
> .L3:
> vsetvli a3,a2,e8,m1,ta,ma
> li a5,0
> vsetivli zero,16,e32,m4,ta,ma
> vmv4r.v v16,v20
> vmv.v.i v12,0
> vmv.v.x v4,a3
> vmv4r.v v8,v12
> vadd.vv v20,v20,v4
> .L4:
> addiw a5,a5,1
> vmv4r.v v4,v8
> vadd.vi v8,v8,1
> vadd.vv v4,v16,v4
> vadd.vv v12,v12,v4
> bne a0,a5,.L4
> slli a5,a3,2
> vsetvli zero,a3,e32,m4,ta,ma
> sub a2,a2,a3
> vse32.v v12,0(a4)
> add a4,a4,a5
> bne a2,zero,.L3
> .L9:
> li a0,0
> ret
> 
> Tested on --with-arch=gcv no regression. Ok for trunk ?
> 
> PR target/113112
> 
> gcc/ChangeLog:
> 
> * config/riscv/riscv-vector-costs.cc (max_number_of_live_regs): Refine dump information.
> (preferred_new_lmul_p): Make PHI initial value into live regs calculation.
> 
> gcc/testsuite/ChangeLog:
> 
> * gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c: New test.
OK assuming you've done the necessary regression testing.
 
jeff
 

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2023-12-22 23:08 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-12-22  9:51 [PATCH] RISC-V: Make PHI initial value occupy live V_REG in dynamic LMUL cost model analysis Juzhe-Zhong
2023-12-22 16:58 ` Jeff Law
2023-12-22 23:08   ` 钟居哲

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).