public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH]AArch64 RFC: Don't cost all scalar operations during vectorization if scalar will fuse
@ 2021-08-31 13:27 Tamar Christina
  2021-08-31 14:49 ` Richard Sandiford
  0 siblings, 1 reply; 4+ messages in thread
From: Tamar Christina @ 2021-08-31 13:27 UTC (permalink / raw)
  To: gcc-patches
  Cc: nd, Richard.Earnshaw, Marcus.Shawcroft, Kyrylo.Tkachov,
	richard.sandiford

[-- Attachment #1: Type: text/plain, Size: 6220 bytes --]

Hi All,

As the vectorizer has improved over time in capabilities it has started
over-vectorizing.  This has caused regressions in the order of 1-7x on libraries
that Arm produces.

The vector costs actually do make a lot of sense and I don't think that they are
wrong.  I think that the costs for the scalar code are wrong.

In particular the costing doesn't take into account that scalar operations
can/will fuse, as this happens in RTL.  Because of this the costs for the
scalars end up being always higher.

As an example the loop in PR 97984:

void x (long * __restrict a, long * __restrict b)
{
  a[0] *= b[0];
  a[1] *= b[1];
  a[0] += b[0];
  a[1] += b[1];
}

generates:

x:
        ldp     x2, x3, [x0]
        ldr     x4, [x1]
        ldr     q1, [x1]
        mul     x2, x2, x4
        ldr     x4, [x1, 8]
        fmov    d0, x2
        ins     v0.d[1], x3
        mul     x1, x3, x4
        ins     v0.d[1], x1
        add     v0.2d, v0.2d, v1.2d
        str     q0, [x0]
        ret

On an actual loop the prologue costs would make the loop too expensive so we
produce the scalar output, but with SLP there's no loop overhead costs so we end
up trying to vectorize this. Because SLP discovery is started from the stores we
will end up vectorizing and costing the add but not the MUL.

To counter this the patch adjusts the costing when it finds an operation that
can be fused and discounts the cost of the "other" operation being fused in.

The attached testcase shows that even when we discount it we still get
vectorized code when profitable to do so, e.g. SVE.

This also happens with other operations, such as scalar operations where
shifts can be fused in, or e.g. bfxil.  As such, sending this for feedback.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master? If the approach is acceptable I can add support for more.

Thanks,
Tamar

gcc/ChangeLog:

	PR target/97984
	* config/aarch64/aarch64.c (aarch64_add_stmt_cost): Check for fusing
	madd.

gcc/testsuite/ChangeLog:

	PR target/97984
	* gcc.target/aarch64/pr97984-1.c: New test.
	* gcc.target/aarch64/pr97984-2.c: New test.
	* gcc.target/aarch64/pr97984-3.c: New test.
	* gcc.target/aarch64/pr97984-4.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 4cd4b037f2606e515ad8f4669d2cd13a509dd0a4..329b556311310d86aaf546d7b395a3750a9d57d4 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -15536,6 +15536,39 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count,
 	stmt_cost = aarch64_sve_adjust_stmt_cost (vinfo, kind, stmt_info,
 						  vectype, stmt_cost);
 
+      /* Scale costs if operation is fusing.  */
+      if (stmt_info && kind == scalar_stmt)
+      {
+	if (gassign *stmt = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info)))
+	  {
+	    switch (gimple_assign_rhs_code (stmt))
+	    {
+	    case PLUS_EXPR:
+	    case MINUS_EXPR:
+	      {
+		/* Check if operation can fuse into MSUB or MADD.  */
+		tree rhs1 = gimple_assign_rhs1 (stmt);
+		if (gassign *stmt1 = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (rhs1)))
+		  if (gimple_assign_rhs_code (stmt1) == MULT_EXPR)
+		    {
+		      stmt_cost = 0;
+		      break;
+		   }
+		tree rhs2 = gimple_assign_rhs2 (stmt);
+		if (gassign *stmt2 = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (rhs2)))
+		  if (gimple_assign_rhs_code (stmt2) == MULT_EXPR)
+		    {
+		      stmt_cost = 0;
+		      break;
+		    }
+	      }
+	      break;
+	    default:
+	      break;
+	    }
+	  }
+      }
+
       if (stmt_info && aarch64_use_new_vector_costs_p ())
 	{
 	  /* Account for any extra "embedded" costs that apply additively
diff --git a/gcc/testsuite/gcc.target/aarch64/pr97984-1.c b/gcc/testsuite/gcc.target/aarch64/pr97984-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..9d403eb76ec3a72747f47e718a88ed6b062643f9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr97984-1.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -fdump-tree-slp-all" } */
+
+void x (long * __restrict a, long * __restrict b)
+{
+  a[0] *= b[0];
+  a[1] *= b[1];
+  a[0] += b[0];
+  a[1] += b[1];
+}
+
+/* { dg-final { scan-tree-dump-times "not vectorized: vectorization is not profitable" 1 "slp2" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr97984-2.c b/gcc/testsuite/gcc.target/aarch64/pr97984-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..a4086380fd613035f7ce3e8e8c89e853efa1304e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr97984-2.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -std=c99 -fdump-tree-vect-all" } */
+
+void x (long * __restrict a, long * __restrict b, int n)
+{
+  for (int i = 0; i < n; i++) {
+    a[i] *= b[i];
+    a[i] += b[i];
+  }
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 0 loops in function" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr97984-3.c b/gcc/testsuite/gcc.target/aarch64/pr97984-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..c6c8d351834705998b3c87a40edf1a00a4bb80f9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr97984-3.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -fdump-tree-slp-all" } */
+
+void x (long * __restrict a, long * __restrict b, long * __restrict c)
+{
+  a[0] *= b[0];
+  a[1] *= b[1];
+  a[0] += c[0];
+  a[1] += c[1];
+}
+
+/* { dg-final { scan-tree-dump-times "not vectorized: vectorization is not profitable" 1 "slp2" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr97984-4.c b/gcc/testsuite/gcc.target/aarch64/pr97984-4.c
new file mode 100644
index 0000000000000000000000000000000000000000..d03b0bb84dd822ab3b61e229c01be4cd1fc7f1d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr97984-4.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -std=c99 -fdump-tree-vect-all -march=armv8.3-a+sve" } */
+
+void x (long * __restrict a, long * __restrict b, int n)
+{
+  for (int i = 0; i < n; i++) {
+    a[i] *= b[i];
+    a[i] += b[i];
+  }
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 1 "vect" } } */


-- 

[-- Attachment #2: rb14784.patch --]
[-- Type: text/x-diff, Size: 3958 bytes --]

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 4cd4b037f2606e515ad8f4669d2cd13a509dd0a4..329b556311310d86aaf546d7b395a3750a9d57d4 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -15536,6 +15536,39 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count,
 	stmt_cost = aarch64_sve_adjust_stmt_cost (vinfo, kind, stmt_info,
 						  vectype, stmt_cost);
 
+      /* Scale costs if operation is fusing.  */
+      if (stmt_info && kind == scalar_stmt)
+      {
+	if (gassign *stmt = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info)))
+	  {
+	    switch (gimple_assign_rhs_code (stmt))
+	    {
+	    case PLUS_EXPR:
+	    case MINUS_EXPR:
+	      {
+		/* Check if operation can fuse into MSUB or MADD.  */
+		tree rhs1 = gimple_assign_rhs1 (stmt);
+		if (gassign *stmt1 = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (rhs1)))
+		  if (gimple_assign_rhs_code (stmt1) == MULT_EXPR)
+		    {
+		      stmt_cost = 0;
+		      break;
+		   }
+		tree rhs2 = gimple_assign_rhs2 (stmt);
+		if (gassign *stmt2 = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (rhs2)))
+		  if (gimple_assign_rhs_code (stmt2) == MULT_EXPR)
+		    {
+		      stmt_cost = 0;
+		      break;
+		    }
+	      }
+	      break;
+	    default:
+	      break;
+	    }
+	  }
+      }
+
       if (stmt_info && aarch64_use_new_vector_costs_p ())
 	{
 	  /* Account for any extra "embedded" costs that apply additively
diff --git a/gcc/testsuite/gcc.target/aarch64/pr97984-1.c b/gcc/testsuite/gcc.target/aarch64/pr97984-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..9d403eb76ec3a72747f47e718a88ed6b062643f9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr97984-1.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -fdump-tree-slp-all" } */
+
+void x (long * __restrict a, long * __restrict b)
+{
+  a[0] *= b[0];
+  a[1] *= b[1];
+  a[0] += b[0];
+  a[1] += b[1];
+}
+
+/* { dg-final { scan-tree-dump-times "not vectorized: vectorization is not profitable" 1 "slp2" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr97984-2.c b/gcc/testsuite/gcc.target/aarch64/pr97984-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..a4086380fd613035f7ce3e8e8c89e853efa1304e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr97984-2.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -std=c99 -fdump-tree-vect-all" } */
+
+void x (long * __restrict a, long * __restrict b, int n)
+{
+  for (int i = 0; i < n; i++) {
+    a[i] *= b[i];
+    a[i] += b[i];
+  }
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 0 loops in function" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr97984-3.c b/gcc/testsuite/gcc.target/aarch64/pr97984-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..c6c8d351834705998b3c87a40edf1a00a4bb80f9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr97984-3.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -fdump-tree-slp-all" } */
+
+void x (long * __restrict a, long * __restrict b, long * __restrict c)
+{
+  a[0] *= b[0];
+  a[1] *= b[1];
+  a[0] += c[0];
+  a[1] += c[1];
+}
+
+/* { dg-final { scan-tree-dump-times "not vectorized: vectorization is not profitable" 1 "slp2" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr97984-4.c b/gcc/testsuite/gcc.target/aarch64/pr97984-4.c
new file mode 100644
index 0000000000000000000000000000000000000000..d03b0bb84dd822ab3b61e229c01be4cd1fc7f1d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr97984-4.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -std=c99 -fdump-tree-vect-all -march=armv8.3-a+sve" } */
+
+void x (long * __restrict a, long * __restrict b, int n)
+{
+  for (int i = 0; i < n; i++) {
+    a[i] *= b[i];
+    a[i] += b[i];
+  }
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 1 "vect" } } */


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2021-09-02 12:18 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-31 13:27 [PATCH]AArch64 RFC: Don't cost all scalar operations during vectorization if scalar will fuse Tamar Christina
2021-08-31 14:49 ` Richard Sandiford
2021-09-01 13:47   ` Richard Biener
2021-09-02 12:18     ` Richard Biener

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).