public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] Vect: use a small step to calculate induction for the unrolled loop (PR tree-optimization/110449)
@ 2023-07-05  6:43 Hao Liu OS
  2023-07-06 12:44 ` Richard Biener
  0 siblings, 1 reply; 8+ messages in thread
From: Hao Liu OS @ 2023-07-05  6:43 UTC (permalink / raw)
  To: GCC-patches@gcc.gnu.org

Hi,

If a loop is unrolled by n times during vectoriation, two steps are used to
calculate the induction variable:
  - The small step for the unrolled ith-copy: vec_1 = vec_iv + (VF/n * Step)
  - The large step for the whole loop: vec_loop = vec_iv + (VF * Step)

This patch calculates an extra vec_n to replace vec_loop:
  vec_n = vec_prev + (VF/n * S) = vec_iv + (VF/n * S) * n = vec_loop.

So that we can save the large step register and related operations.

gcc/ChangeLog:

	PR tree-optimization/110449
	* tree-vect-loop.cc (vectorizable_induction): use vec_n to replace
	vec_loop for the unrolled loop.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/pr110449.c: New testcase.
---
 gcc/testsuite/gcc.target/aarch64/pr110449.c | 40 +++++++++++++++++++++
 gcc/tree-vect-loop.cc                       | 21 +++++++++--
 2 files changed, 58 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/pr110449.c

diff --git a/gcc/testsuite/gcc.target/aarch64/pr110449.c b/gcc/testsuite/gcc.target/aarch64/pr110449.c
new file mode 100644
index 00000000000..bb3b6dcfe08
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr110449.c
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -mcpu=neoverse-n2 --param aarch64-vect-unroll-limit=2" } */
+/* { dg-final { scan-assembler-not "8.0e\\+0" } } */
+
+/* Calcualte the vectorized induction with smaller step for an unrolled loop.
+
+   before (suggested_unroll_factor=2):
+	  fmov    s30, 8.0e+0
+	  fmov    s31, 4.0e+0
+	  dup     v27.4s, v30.s[0]
+	  dup     v28.4s, v31.s[0]
+     .L6:
+	  mov     v30.16b, v31.16b
+	  fadd    v31.4s, v31.4s, v27.4s
+	  fadd    v29.4s, v30.4s, v28.4s
+	  stp     q30, q29, [x0]
+	  add     x0, x0, 32
+	  cmp     x1, x0
+	  bne     .L6
+
+   after:
+	  fmov    s31, 4.0e+0
+	  dup     v29.4s, v31.s[0]
+     .L6:
+	  fadd    v30.4s, v31.4s, v29.4s
+	  stp     q31, q30, [x0]
+	  add     x0, x0, 32
+	  fadd    v31.4s, v29.4s, v30.4s
+	  cmp     x0, x1
+	  bne     .L6  */
+
+void
+foo2 (float *arr, float freq, float step)
+{
+  for (int i = 0; i < 1024; i++)
+    {
+      arr[i] = freq;
+      freq += step;
+    }
+}
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 3b46c58a8d8..706ecbffd0c 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10114,7 +10114,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
 				   new_vec, step_vectype, NULL);
 
       vec_def = induc_def;
-      for (i = 1; i < ncopies; i++)
+      for (i = 1; i < ncopies + 1; i++)
 	{
 	  /* vec_i = vec_prev + vec_step  */
 	  gimple_seq stmts = NULL;
@@ -10124,8 +10124,23 @@ vectorizable_induction (loop_vec_info loop_vinfo,
 	  vec_def = gimple_convert (&stmts, vectype, vec_def);
  
 	  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
-	  new_stmt = SSA_NAME_DEF_STMT (vec_def);
-	  STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+	  if (i < ncopies)
+	    {
+	      new_stmt = SSA_NAME_DEF_STMT (vec_def);
+	      STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+	    }
+	  else
+	    {
+	      /* vec_1 = vec_iv + (VF/n * S)
+		 vec_2 = vec_1 + (VF/n * S)
+		 ...
+		 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
+
+		 vec_n is used as vec_loop to save the large step register and
+		 related operations.  */
+	      add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
+			   UNKNOWN_LOCATION);
+	    }
 	}
     }
 
-- 
2.34.1

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2023-07-07 10:23 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-07-05  6:43 [PATCH] Vect: use a small step to calculate induction for the unrolled loop (PR tree-optimization/110449) Hao Liu OS
2023-07-06 12:44 ` Richard Biener
2023-07-06 16:06   ` Jeff Law
2023-07-07  2:23     ` Hao Liu OS
2023-07-06 17:50   ` Richard Sandiford
2023-07-07  5:32     ` Richard Biener
2023-07-07  7:53       ` Richard Sandiford
2023-07-07 10:23         ` Richard Biener

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).