[PATCH V2] RISC-V: Adjust scalar_to

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PATCH V2] RISC-V: Adjust scalar_to_vec cost accurately
@ 2024-01-11 13:49 Juzhe-Zhong
  2024-01-11 21:22 ` Robin Dapp
  0 siblings, 1 reply; 2+ messages in thread
From: Juzhe-Zhong @ 2024-01-11 13:49 UTC (permalink / raw)
  To: gcc-patches; +Cc: kito.cheng, kito.cheng, jeffreyalaw, rdapp.gcc, Juzhe-Zhong

1. This patch set scalar_to_vec cost as 2 instead 1 since scalar move
   instruction is slightly more costly than normal rvv instructions (e.g. vadd.vv).

2. Adjust scalar_to_vec cost accurately according to the splat value, for example,
   a value like 32872, needs 2 more scalar instructions:
   so the cost = 2 (scalar instructions) + 2 (scalar move).
   We adjust the cost like this since it doesn need such many instructions in vectorized codes,
   wheras they are not needed in scalar codes.

After this patch, no matter -march=rv64gcv_zvl256b or -march=rv64gcv_zvl4096b.
We have optimal codgen:

	lui	a5,%hi(a)
	li	a4,19
	sb	a4,%lo(a)(a5)
	li	a0,0
	ret

	PR target/113281

gcc/ChangeLog:

	* config/riscv/riscv-vector-costs.cc (adjust_stmt_cost): Adjust scalar_to_vec cost accurately.
	(costs::add_stmt_cost): Ditto.
	* config/riscv/riscv.cc: Ditto.
	* config/riscv/t-riscv: Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/autovec/pr113209.c: Adapt test.
	* gcc.target/riscv/rvv/autovec/zve32f-1.c: Ditto.
	* gcc.dg/vect/costmodel/riscv/rvv/pr113281-1.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/pr113281-2.c: New test.

---
 gcc/config/riscv/riscv-vector-costs.cc        | 50 ++++++++++++++++++-
 gcc/config/riscv/riscv.cc                     |  4 +-
 gcc/config/riscv/t-riscv                      |  2 +-
 .../vect/costmodel/riscv/rvv/pr113281-1.c     | 18 +++++++
 .../vect/costmodel/riscv/rvv/pr113281-2.c     | 18 +++++++
 .../gcc.target/riscv/rvv/autovec/pr113209.c   |  2 +-
 .../gcc.target/riscv/rvv/autovec/zve32f-1.c   |  2 +-
 7 files changed, 90 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-2.c

diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index 58ec0b9b503..fc377435e53 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -42,6 +42,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "backend.h"
 #include "tree-data-ref.h"
 #include "tree-ssa-loop-niter.h"
+#include "emit-rtl.h"
 
 /* This file should be included last.  */
 #include "riscv-vector-costs.h"
@@ -1055,6 +1056,50 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const
   return vector_costs::better_main_loop_than_p (other);
 }
 
+/* Adjust vectorization cost after calling
+   targetm.vectorize.builtin_vectorization_cost. For some statement, we would
+   like to further fine-grain tweak the cost on top of
+   targetm.vectorize.builtin_vectorization_cost handling which doesn't have any
+   information on statement operation codes etc.  */
+
+static unsigned
+adjust_stmt_cost (enum vect_cost_for_stmt kind,
+		  struct _stmt_vec_info *stmt_info, int count, int stmt_cost)
+{
+  gimple *stmt = stmt_info->stmt;
+  switch (kind)
+    {
+      case scalar_to_vec: {
+	stmt_cost *= count;
+	gcall *call = dyn_cast<gcall *> (stmt);
+	/* Adjust cost by counting the scalar value initialization.  */
+	unsigned int num
+	  = call ? gimple_call_num_args (call) : gimple_num_ops (stmt);
+	unsigned int start = call ? 0 : 1;
+
+	for (unsigned int i = start; i < num; i++)
+	  {
+	    tree op = call ? gimple_call_arg (call, i) : gimple_op (stmt, i);
+	    if (TREE_CODE (op) == INTEGER_CST)
+	      {
+		HOST_WIDE_INT value = tree_fits_shwi_p (op) ? tree_to_shwi (op)
+							    : tree_to_uhwi (op);
+		/* We don't need to count scalar costs if it
+		   is in range of [-16, 15] since we can use
+		   vmv.v.i.  */
+		if (!IN_RANGE (value, -16, 15))
+		  stmt_cost += riscv_const_insns (gen_int_mode (value, Pmode));
+	      }
+	    /* TODO: We don't count CONST_POLY_INT value for now.  */
+	  }
+	return stmt_cost;
+      }
+    default:
+      break;
+    }
+  return count * stmt_cost;
+}
+
 unsigned
 costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
 		      stmt_vec_info stmt_info, slp_tree, tree vectype,
@@ -1082,9 +1127,12 @@ costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
 	 as one iteration of the VLA loop.  */
       if (where == vect_body && m_unrolled_vls_niters)
 	m_unrolled_vls_stmts += count * m_unrolled_vls_niters;
+
+      if (vectype)
+	stmt_cost = adjust_stmt_cost (kind, stmt_info, count, stmt_cost);
     }
 
-  return record_stmt_cost (stmt_info, where, count * stmt_cost);
+  return record_stmt_cost (stmt_info, where, stmt_cost);
 }
 
 void
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index df9799d9c5e..a14fb36817a 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -366,7 +366,7 @@ static const common_vector_cost rvv_vls_vector_cost = {
   1, /* gather_load_cost  */
   1, /* scatter_store_cost  */
   1, /* vec_to_scalar_cost  */
-  1, /* scalar_to_vec_cost  */
+  2, /* scalar_to_vec_cost  */
   1, /* permute_cost  */
   1, /* align_load_cost  */
   1, /* align_store_cost  */
@@ -382,7 +382,7 @@ static const scalable_vector_cost rvv_vla_vector_cost = {
     1, /* gather_load_cost  */
     1, /* scatter_store_cost  */
     1, /* vec_to_scalar_cost  */
-    1, /* scalar_to_vec_cost  */
+    2, /* scalar_to_vec_cost  */
     1, /* permute_cost  */
     1, /* align_load_cost  */
     1, /* align_store_cost  */
diff --git a/gcc/config/riscv/t-riscv b/gcc/config/riscv/t-riscv
index 32de6b851c1..fb2bf1c155f 100644
--- a/gcc/config/riscv/t-riscv
+++ b/gcc/config/riscv/t-riscv
@@ -73,7 +73,7 @@ riscv-vector-costs.o: $(srcdir)/config/riscv/riscv-vector-costs.cc \
   $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(TARGET_H) $(FUNCTION_H) \
   $(TREE_H) basic-block.h $(RTL_H) gimple.h targhooks.h cfgloop.h \
   fold-const.h $(TM_P_H) tree-vectorizer.h gimple-iterator.h bitmap.h \
-  ssa.h backend.h tree-data-ref.h tree-ssa-loop-niter.h \
+  ssa.h backend.h tree-data-ref.h tree-ssa-loop-niter.h emit-rtl.h \
   $(srcdir)/config/riscv/riscv-vector-costs.h
 	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
 		$(srcdir)/config/riscv/riscv-vector-costs.cc
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-1.c
new file mode 100644
index 00000000000..fdf6ed0334b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-1.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvl256b -mabi=lp64d -O3 -ftree-vectorize" } */
+
+unsigned char a;
+
+int main() {
+  short b = a = 0;
+  for (; a != 19; a++)
+    if (a)
+      b = 32872 >> a;
+
+  if (b == 0)
+    return 0;
+  else
+    return 1;
+}
+
+/* { dg-final { scan-assembler-not {vset} } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-2.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-2.c
new file mode 100644
index 00000000000..706e19116c9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-2.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvl4096b -mabi=lp64d -O3 -ftree-vectorize --param=riscv-autovec-lmul=m8" } */
+
+unsigned char a;
+
+int main() {
+  short b = a = 0;
+  for (; a != 19; a++)
+    if (a)
+      b = 32872 >> a;
+
+  if (b == 0)
+    return 0;
+  else
+    return 1;
+}
+
+/* { dg-final { scan-assembler-not {vset} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113209.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113209.c
index 081ee369394..70aae151000 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113209.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113209.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=rv64gcv_zvl256b -mabi=lp64d -O3" } */
+/* { dg-options "-march=rv64gcv_zvl256b -mabi=lp64d -O3 -fno-vect-cost-model" } */
 
 int b, c, d, f, i, a;
 int e[1] = {0};
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/zve32f-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/zve32f-1.c
index ab57e89b1cd..3a00327dfed 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/zve32f-1.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/zve32f-1.c
@@ -3,4 +3,4 @@
 
 #include "template-1.h"
 
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 3 "vect" } } */
-- 
2.36.3


^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [PATCH V2] RISC-V: Adjust scalar_to_vec cost accurately
  2024-01-11 13:49 [PATCH V2] RISC-V: Adjust scalar_to_vec cost accurately Juzhe-Zhong
@ 2024-01-11 21:22 ` Robin Dapp
  0 siblings, 0 replies; 2+ messages in thread
From: Robin Dapp @ 2024-01-11 21:22 UTC (permalink / raw)
  To: Juzhe-Zhong, gcc-patches; +Cc: rdapp.gcc, kito.cheng, kito.cheng, jeffreyalaw

> 1. This patch set scalar_to_vec cost as 2 instead 1 since scalar move
>    instruction is slightly more costly than normal rvv instructions (e.g. vadd.vv).

We can go with 2 or 3 (if needed) for now but should later
really incorporate reg-move costs in this IMHO.  Just like e.g.

static const struct cpu_regmove_cost cortexa57_regmove_cost =
{
  1, /* GP2GP  */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
  5, /* GP2FP  */
  ...
};

we can add V2FP, V2GP and the reverse.  Then add those to
scalar_to_vec (later vec_to_scalar as well) in adjust_stmt_cost
according to the mode.

> 2. Adjust scalar_to_vec cost accurately according to the splat value, for example,
>    a value like 32872, needs 2 more scalar instructions:
>    so the cost = 2 (scalar instructions) + 2 (scalar move).

>    We adjust the cost like this since it doesn need such many instructions in vectorized codes,
>    wheras they are not needed in scalar codes.

I'm afraid the issue I mentioned (we don't count the constant
synthesis for scalar but would for vector with the change) is
still present.
Even if it does not cause any regressions or problems now it
certainly might in the future, especially with complex constants.
Basically we would not vectorize something containing several
synthesized constants (like popcount) anymore.
Therefore I would advise against it even though the given
example cannot be "solved" unconditionally then.

Regards
 Robin

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2024-01-11 21:22 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-01-11 13:49 [PATCH V2] RISC-V: Adjust scalar_to_vec cost accurately Juzhe-Zhong
2024-01-11 21:22 ` Robin Dapp

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).