public inbox for gcc-patches@gcc.gnu.org
* [PATCH] Don't reduce estimated unrolled size for innermost loop.
@ 2024-05-13  2:27 liuhongt
  2024-05-13  7:40 ` Richard Biener
  0 siblings, 1 reply; 12+ messages in thread
From: liuhongt @ 2024-05-13  2:27 UTC (permalink / raw)
  To: gcc-patches; +Cc: richard.guenther

As shown by the testcase in the PR, at O3 cunrolli may prevent
vectorization of the innermost loop and increase register pressure.
The patch removes the 1/3 reduction of unr_insns for the innermost loop
for UL_ALL.  ul != UL_ALL is needed since complete unrolling of some
small loops at O2 relies on the reduction.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
No big impact for SPEC2017.
Ok for trunk?

gcc/ChangeLog:

	PR tree-optimization/112325
	* tree-ssa-loop-ivcanon.cc (estimated_unrolled_size): Add 2
	new parameters: loop and ul, and remove unr_insns reduction
	for innermost loop.
	(try_unroll_loop_completely): Pass loop and ul to
	estimated_unrolled_size.

gcc/testsuite/ChangeLog:

	* gcc.dg/tree-ssa/pr112325.c: New test.
	* gcc.dg/vect/pr69783.c: Add extra option --param
	max-completely-peeled-insns=300.
---
 gcc/testsuite/gcc.dg/tree-ssa/pr112325.c | 57 ++++++++++++++++++++++++
 gcc/testsuite/gcc.dg/vect/pr69783.c      |  2 +-
 gcc/tree-ssa-loop-ivcanon.cc             | 16 +++++--
 3 files changed, 71 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr112325.c

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
new file mode 100644
index 00000000000..14208b3e7f8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
@@ -0,0 +1,57 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-cunrolli-details" } */
+
+typedef unsigned short ggml_fp16_t;
+static float table_f32_f16[1 << 16];
+
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+    unsigned short s;
+    __builtin_memcpy(&s, &f, sizeof(unsigned short));
+    return table_f32_f16[s];
+}
+
+typedef struct {
+    ggml_fp16_t d;
+    ggml_fp16_t m;
+    unsigned char qh[4];
+    unsigned char qs[32 / 2];
+} block_q5_1;
+
+typedef struct {
+    float d;
+    float s;
+    char qs[32];
+} block_q8_1;
+
+void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+    const int qk = 32;
+    const int nb = n / qk;
+
+    const block_q5_1 * restrict x = vx;
+    const block_q8_1 * restrict y = vy;
+
+    float sumf = 0.0;
+
+    for (int i = 0; i < nb; i++) {
+        unsigned qh;
+        __builtin_memcpy(&qh, x[i].qh, sizeof(qh));
+
+        int sumi = 0;
+
+        for (int j = 0; j < qk/2; ++j) {
+            const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
+            const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
+
+            const int x0 = (x[i].qs[j] & 0xF) | xh_0;
+            const int x1 = (x[i].qs[j] >> 4) | xh_1;
+
+            sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
+        }
+
+        sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
+    }
+
+    *s = sumf;
+}
+
+/* { dg-final { scan-tree-dump {(?n)Not unrolling loop [1-9] \(--param max-completely-peel-times limit reached} "cunrolli"} } */
diff --git a/gcc/testsuite/gcc.dg/vect/pr69783.c b/gcc/testsuite/gcc.dg/vect/pr69783.c
index 5df95d0ce4e..a1f75514d72 100644
--- a/gcc/testsuite/gcc.dg/vect/pr69783.c
+++ b/gcc/testsuite/gcc.dg/vect/pr69783.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-require-effective-target vect_float } */
-/* { dg-additional-options "-Ofast -funroll-loops" } */
+/* { dg-additional-options "-Ofast -funroll-loops --param max-completely-peeled-insns=300" } */
 
 #define NXX 516
 #define NYY 516
diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
index bf017137260..5e0eca647a1 100644
--- a/gcc/tree-ssa-loop-ivcanon.cc
+++ b/gcc/tree-ssa-loop-ivcanon.cc
@@ -444,7 +444,9 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
 
 static unsigned HOST_WIDE_INT
 estimated_unrolled_size (struct loop_size *size,
-			 unsigned HOST_WIDE_INT nunroll)
+			 unsigned HOST_WIDE_INT nunroll,
+			 enum unroll_level ul,
+			 class loop* loop)
 {
   HOST_WIDE_INT unr_insns = ((nunroll)
   			     * (HOST_WIDE_INT) (size->overall
@@ -453,7 +455,15 @@ estimated_unrolled_size (struct loop_size *size,
     unr_insns = 0;
   unr_insns += size->last_iteration - size->last_iteration_eliminated_by_peeling;
 
-  unr_insns = unr_insns * 2 / 3;
+  /* For the innermost loop, the loop body is not likely to be simplified
+     by as much as 1/3, and unrolling may increase register pressure a lot.
+     UL != UL_ALL is needed to still unroll small loops at O2.  */
+  class loop *loop_father = loop_outer (loop);
+  if (loop->inner || !loop_father
+      || loop_father->latch == EXIT_BLOCK_PTR_FOR_FN (cfun)
+      || ul != UL_ALL)
+    unr_insns = unr_insns * 2 / 3;
+
   if (unr_insns <= 0)
     unr_insns = 1;
 
@@ -837,7 +847,7 @@ try_unroll_loop_completely (class loop *loop,
 
 	  unsigned HOST_WIDE_INT ninsns = size.overall;
 	  unsigned HOST_WIDE_INT unr_insns
-	    = estimated_unrolled_size (&size, n_unroll);
+	    = estimated_unrolled_size (&size, n_unroll, ul, loop);
 	  if (dump_file && (dump_flags & TDF_DETAILS))
 	    {
 	      fprintf (dump_file, "  Loop size: %d\n", (int) ninsns);
-- 
2.31.1



* Re: [PATCH] Don't reduce estimated unrolled size for innermost loop.
  2024-05-13  2:27 [PATCH] Don't reduce estimated unrolled size for innermost loop liuhongt
@ 2024-05-13  7:40 ` Richard Biener
  2024-05-15  2:14   ` Hongtao Liu
  0 siblings, 1 reply; 12+ messages in thread
From: Richard Biener @ 2024-05-13  7:40 UTC (permalink / raw)
  To: liuhongt; +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 7193 bytes --]

On Mon, May 13, 2024 at 4:29 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> As shown by the testcase in the PR, at O3 cunrolli may prevent
> vectorization of the innermost loop and increase register pressure.
> The patch removes the 1/3 reduction of unr_insns for the innermost loop
> for UL_ALL.  ul != UL_ALL is needed since complete unrolling of some
> small loops at O2 relies on the reduction.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> No big impact for SPEC2017.
> Ok for trunk?

This removes the 1/3 reduction when unrolling a loop nest (the case I was
concerned about).  Unrolling of a nest is done by iterating in
tree_unroll_loops_completely, so the loop to be unrolled appears
innermost.  So I think you need a new parameter on
tree_unroll_loops_completely_1 indicating whether we're in the first
iteration (or whether to assume innermost loops will "simplify").
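
A rough sketch of what I mean (untested; the first_iteration flag and
where exactly it is consulted are illustrative only):

  static bool
  tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
                                  bitmap father_bbs, class loop *loop,
                                  bool first_iteration)
  {
    /* On the first iteration a loop that appears innermost really is
       innermost, so estimated_unrolled_size should not assume its body
       simplifies by 1/3; on later iterations inner loops have already
       been unrolled into it, so the optimistic estimate is fine.  */
    ...
  }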

Few comments below

> gcc/ChangeLog:
>
>         PR tree-optimization/112325
>         * tree-ssa-loop-ivcanon.cc (estimated_unrolled_size): Add 2
>         new parameters: loop and ul, and remove unr_insns reduction
>         for innermost loop.
>         (try_unroll_loop_completely): Pass loop and ul to
>         estimated_unrolled_size.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.dg/tree-ssa/pr112325.c: New test.
>         * gcc.dg/vect/pr69783.c: Add extra option --param
>         max-completely-peeled-insns=300.
> ---
>  gcc/testsuite/gcc.dg/tree-ssa/pr112325.c | 57 ++++++++++++++++++++++++
>  gcc/testsuite/gcc.dg/vect/pr69783.c      |  2 +-
>  gcc/tree-ssa-loop-ivcanon.cc             | 16 +++++--
>  3 files changed, 71 insertions(+), 4 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> new file mode 100644
> index 00000000000..14208b3e7f8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> @@ -0,0 +1,57 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-cunrolli-details" } */
> +
> +typedef unsigned short ggml_fp16_t;
> +static float table_f32_f16[1 << 16];
> +
> +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
> +    unsigned short s;
> +    __builtin_memcpy(&s, &f, sizeof(unsigned short));
> +    return table_f32_f16[s];
> +}
> +
> +typedef struct {
> +    ggml_fp16_t d;
> +    ggml_fp16_t m;
> +    unsigned char qh[4];
> +    unsigned char qs[32 / 2];
> +} block_q5_1;
> +
> +typedef struct {
> +    float d;
> +    float s;
> +    char qs[32];
> +} block_q8_1;
> +
> +void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
> +    const int qk = 32;
> +    const int nb = n / qk;
> +
> +    const block_q5_1 * restrict x = vx;
> +    const block_q8_1 * restrict y = vy;
> +
> +    float sumf = 0.0;
> +
> +    for (int i = 0; i < nb; i++) {
> +        unsigned qh;
> +        __builtin_memcpy(&qh, x[i].qh, sizeof(qh));
> +
> +        int sumi = 0;
> +
> +        for (int j = 0; j < qk/2; ++j) {
> +            const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
> +            const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
> +
> +            const int x0 = (x[i].qs[j] & 0xF) | xh_0;
> +            const int x1 = (x[i].qs[j] >> 4) | xh_1;
> +
> +            sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
> +        }
> +
> +        sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
> +    }
> +
> +    *s = sumf;
> +}
> +
> +/* { dg-final { scan-tree-dump {(?n)Not unrolling loop [1-9] \(--param max-completely-peel-times limit reached} "cunrolli"} } */
> diff --git a/gcc/testsuite/gcc.dg/vect/pr69783.c b/gcc/testsuite/gcc.dg/vect/pr69783.c
> index 5df95d0ce4e..a1f75514d72 100644
> --- a/gcc/testsuite/gcc.dg/vect/pr69783.c
> +++ b/gcc/testsuite/gcc.dg/vect/pr69783.c
> @@ -1,6 +1,6 @@
>  /* { dg-do compile } */
>  /* { dg-require-effective-target vect_float } */
> -/* { dg-additional-options "-Ofast -funroll-loops" } */
> +/* { dg-additional-options "-Ofast -funroll-loops --param max-completely-peeled-insns=300" } */

If we rely on unrolling of a loop can you put #pragma unroll [N]
before the respective loop
instead?
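
E.g. something like (placement is hypothetical; the loop below is just a
stand-in for the loop in pr69783.c that needs to be fully unrolled):

  #pragma GCC unroll 4
  for (int i = 0; i < 4; i++)
    sum += a[i] * b[i];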

>  #define NXX 516
>  #define NYY 516
> diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
> index bf017137260..5e0eca647a1 100644
> --- a/gcc/tree-ssa-loop-ivcanon.cc
> +++ b/gcc/tree-ssa-loop-ivcanon.cc
> @@ -444,7 +444,9 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
>
>  static unsigned HOST_WIDE_INT
>  estimated_unrolled_size (struct loop_size *size,
> -                        unsigned HOST_WIDE_INT nunroll)
> +                        unsigned HOST_WIDE_INT nunroll,
> +                        enum unroll_level ul,
> +                        class loop* loop)
>  {
>    HOST_WIDE_INT unr_insns = ((nunroll)
>                              * (HOST_WIDE_INT) (size->overall
> @@ -453,7 +455,15 @@ estimated_unrolled_size (struct loop_size *size,
>      unr_insns = 0;
>    unr_insns += size->last_iteration - size->last_iteration_eliminated_by_peeling;
>
> -  unr_insns = unr_insns * 2 / 3;
> +  /* For the innermost loop, the loop body is not likely to be simplified
> +     by as much as 1/3, and unrolling may increase register pressure a lot.
> +     UL != UL_ALL is needed to still unroll small loops at O2.  */
> +  class loop *loop_father = loop_outer (loop);
> +  if (loop->inner || !loop_father

Do we ever get here for !loop_father?  We shouldn't.

> +      || loop_father->latch == EXIT_BLOCK_PTR_FOR_FN (cfun)

This means you exempt all loops that are direct children of the loop
root tree.  That doesn't make much sense.

> +      || ul != UL_ALL)

This is also quite odd - we're being more optimistic for UL_NO_GROWTH
than for UL_ALL?  This doesn't make much sense.

Overall I think this means removal of being optimistic doesn't work so well?

If we need some extra leeway for UL_NO_GROWTH for what we expect
to unroll it might be better to add sth like --param
nogrowth-completely-peeled-insns
specifying a fixed surplus size?  Or we need to look at what's the problem
with the testcases regressing or the one you are trying to fix.
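
A sketch of that idea (the param does not exist today; its name and the
exact placement in try_unroll_loop_completely are hypothetical):

  /* Allow a fixed surplus over the rolled size before declaring growth
     for UL_NO_GROWTH.  */
  if (ul == UL_NO_GROWTH
      && unr_insns
         > ninsns + (unsigned) param_nogrowth_completely_peeled_insns)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file, "Not unrolling loop %d: size would grow.\n",
                 loop->num);
      return false;
    }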

I did experiment with better estimating cleanup done at some point
(see attached),
but didn't get to finishing that (and as said, as we're running VN on the result
we'd ideally do that as part of the estimation somehow).

Richard.

> +    unr_insns = unr_insns * 2 / 3;
> +
>    if (unr_insns <= 0)
>      unr_insns = 1;
>
> @@ -837,7 +847,7 @@ try_unroll_loop_completely (class loop *loop,
>
>           unsigned HOST_WIDE_INT ninsns = size.overall;
>           unsigned HOST_WIDE_INT unr_insns
> -           = estimated_unrolled_size (&size, n_unroll);
> +           = estimated_unrolled_size (&size, n_unroll, ul, loop);
>           if (dump_file && (dump_flags & TDF_DETAILS))
>             {
>               fprintf (dump_file, "  Loop size: %d\n", (int) ninsns);
> --
> 2.31.1
>

[-- Attachment #2: p --]
[-- Type: application/octet-stream, Size: 9048 bytes --]

From 009002633e3acce6031094c8d1910b7d99314f9f Mon Sep 17 00:00:00 2001
From: Richard Biener <rguenther@suse.de>
Date: Mon, 14 Aug 2023 12:02:41 +0200
Subject: [PATCH] test unroll
To: gcc-patches@gcc.gnu.org

---
 .../gcc.dg/fstack-protector-strong.c          |   4 +-
 gcc/tree-ssa-loop-ivcanon.cc                  | 158 ++++++++++++------
 2 files changed, 113 insertions(+), 49 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/fstack-protector-strong.c b/gcc/testsuite/gcc.dg/fstack-protector-strong.c
index 94dc3508f1a..fafa1917449 100644
--- a/gcc/testsuite/gcc.dg/fstack-protector-strong.c
+++ b/gcc/testsuite/gcc.dg/fstack-protector-strong.c
@@ -28,7 +28,7 @@ foo1 ()
 struct ArrayStruct
 {
   int a;
-  int array[10];
+  int array[18];
 };
 
 struct AA
@@ -43,7 +43,7 @@ foo2 ()
 {
   struct AA aa;
   int i;
-  for (i = 0; i < 10; ++i)
+  for (i = 0; i < 18; ++i)
     {
       aa.as.array[i] = i * (i-1) + i / 2;
     }
diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
index bf017137260..58e4168c209 100644
--- a/gcc/tree-ssa-loop-ivcanon.cc
+++ b/gcc/tree-ssa-loop-ivcanon.cc
@@ -159,6 +159,7 @@ struct loop_size
   int num_branches_on_hot_path;
 };
 
+#if 0
 /* Return true if OP in STMT will be constant after peeling LOOP.  */
 
 static bool
@@ -246,6 +247,7 @@ constant_after_peeling (tree op, gimple *stmt, class loop *loop)
     }
   return true;
 }
+#endif
 
 /* Computes an estimated number of insns in LOOP.
    EXIT (if non-NULL) is an exite edge that will be eliminated in all but last
@@ -277,6 +279,31 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
 
   if (dump_file && (dump_flags & TDF_DETAILS))
     fprintf (dump_file, "Estimating sizes for loop %i\n", loop->num);
+
+  static hash_map<tree, tree> *vals;
+  vals = new hash_map<tree, tree>;
+  edge pe = loop_preheader_edge (loop);
+  for (auto si = gsi_start_phis (loop->header);
+       !gsi_end_p (si); gsi_next (&si))
+    {
+      if (virtual_operand_p (gimple_phi_result (*si)))
+	continue;
+      tree val = gimple_phi_arg_def_from_edge (*si, pe);
+      if (CONSTANT_CLASS_P (val))
+	{
+	  vals->put (gimple_phi_result (*si), val);
+	  tree ev = analyze_scalar_evolution (loop, gimple_phi_result (*si));
+	  if (!chrec_contains_undetermined (ev)
+	      && !chrec_contains_symbols (ev))
+	    size->constant_iv = true;
+	}
+    }
+
+  auto els_valueize = [] (tree op) -> tree
+    { if (tree *val = vals->get (op)) return *val; return op; };
+
+  auto process_loop = [&] () -> bool
+    {
   for (i = 0; i < loop->num_nodes; i++)
     {
       if (edge_to_cancel && body[i] != edge_to_cancel->src
@@ -323,57 +350,51 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
 			     "in last copy.\n");
 		  likely_eliminated_last = true;
 		}
-	      /* Sets of IV variables  */
-	      if (gimple_code (stmt) == GIMPLE_ASSIGN
-		  && constant_after_peeling (gimple_assign_lhs (stmt), stmt, loop))
+	      /* Stores are not eliminated.  */
+	      if (gimple_vdef (stmt))
+		goto account;
+	      /* Below we are using constant folding to decide whether
+		 we can elide a stmt.  While for the first iteration we
+		 could use the actual value for the rest we have to
+		 avoid the situation re-using a * 1 or + 0 operand, so
+		 require all SSA operands to be constants here.  */
+	      bool fail = false;
+	      ssa_op_iter iter;
+	      use_operand_p use_p;
+	      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
+		if (!vals->get (USE_FROM_PTR (use_p)))
+		  {
+		    fail = true;
+		    break;
+		  }
+	      if (fail)
+		goto account;
+	      tree val;
+	      /* Switches are not handled by folding.  */
+	      if (gimple_code (stmt) == GIMPLE_SWITCH
+		  && ! is_gimple_min_invariant
+		  (gimple_switch_index (as_a <gswitch *> (stmt)))
+		  && vals->get (gimple_switch_index
+				(as_a <gswitch *> (stmt))))
 		{
 		  if (dump_file && (dump_flags & TDF_DETAILS))
-		    fprintf (dump_file, "   Induction variable computation will"
-			     " be folded away.\n");
+		    fprintf (dump_file, "   Constant conditional.\n");
 		  likely_eliminated = true;
 		}
-	      /* Assignments of IV variables.  */
-	      else if (gimple_code (stmt) == GIMPLE_ASSIGN
-		       && TREE_CODE (gimple_assign_lhs (stmt)) == SSA_NAME
-		       && constant_after_peeling (gimple_assign_rhs1 (stmt),
-						  stmt, loop)
-		       && (gimple_assign_rhs_class (stmt) != GIMPLE_BINARY_RHS
-			   || constant_after_peeling (gimple_assign_rhs2 (stmt),
-						      stmt, loop))
-		       && gimple_assign_rhs_class (stmt) != GIMPLE_TERNARY_RHS)
+	      else if ((val = gimple_fold_stmt_to_constant (stmt, els_valueize))
+		       && CONSTANT_CLASS_P (val))
 		{
-		  size->constant_iv = true;
 		  if (dump_file && (dump_flags & TDF_DETAILS))
 		    fprintf (dump_file,
 			     "   Constant expression will be folded away.\n");
 		  likely_eliminated = true;
-		}
-	      /* Conditionals.  */
-	      else if ((gimple_code (stmt) == GIMPLE_COND
-			&& constant_after_peeling (gimple_cond_lhs (stmt), stmt,
-						   loop)
-			&& constant_after_peeling (gimple_cond_rhs (stmt), stmt,
-						   loop)
-			/* We don't simplify all constant compares so make sure
-			   they are not both constant already.  See PR70288.  */
-			&& (! is_gimple_min_invariant (gimple_cond_lhs (stmt))
-			    || ! is_gimple_min_invariant
-				 (gimple_cond_rhs (stmt))))
-		       || (gimple_code (stmt) == GIMPLE_SWITCH
-			   && constant_after_peeling (gimple_switch_index (
-							as_a <gswitch *>
-							  (stmt)),
-						      stmt, loop)
-			   && ! is_gimple_min_invariant
-				   (gimple_switch_index
-				      (as_a <gswitch *> (stmt)))))
-		{
-		  if (dump_file && (dump_flags & TDF_DETAILS))
-		    fprintf (dump_file, "   Constant conditional.\n");
-		  likely_eliminated = true;
+		  if (tree lhs = gimple_get_lhs (stmt))
+		    if (TREE_CODE (lhs) == SSA_NAME)
+		      vals->put (lhs, val);
 		}
 	    }
 
+account:
 	  size->overall += num;
 	  if (likely_eliminated || likely_eliminated_peeled)
 	    size->eliminated_by_peeling += num;
@@ -386,11 +407,55 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
 	  if ((size->overall * 3 / 2 - size->eliminated_by_peeling
 	      - size->last_iteration_eliminated_by_peeling) > upper_bound)
 	    {
-              free (body);
-	      return true;
+	      free (body);
+	      delete vals;
+	      vals = nullptr;
+	      return false;
 	    }
 	}
     }
+  return true;
+  };
+
+  /* Estimate the size of the unrolled first iteration.  */
+  if (!process_loop ())
+    return true;
+
+  /* Determine whether the IVs will stay constant (we simply assume that
+     if the 2nd iteration receives a constant value the third and all
+     further will so as well).  */
+  for (auto si = gsi_start_phis (loop->header);
+       !gsi_end_p (si); gsi_next (&si))
+    {
+      if (virtual_operand_p (gimple_phi_result (*si)))
+	continue;
+      tree def = gimple_phi_arg_def_from_edge (*si, loop_latch_edge (loop));
+      if (CONSTANT_CLASS_P (def) || vals->get (def))
+	/* ???  If we compute the first iteration size separately we
+	   could also handle an invariant backedge value more
+	   optimistically.
+	   ???  Note the actual value we leave here may still have an
+	   effect on the constant-ness.  */
+	;
+      else
+	vals->remove (gimple_phi_result (*si));
+    }
+
+  /* Reset sizes and compute the size based on the adjustment above.
+     ???  We could keep the more precise and optimistic counts for
+     the first iteration.  */
+  size->overall = 0;
+  size->eliminated_by_peeling = 0;
+  size->last_iteration = 0;
+  size->last_iteration_eliminated_by_peeling = 0;
+  size->num_pure_calls_on_hot_path = 0;
+  size->num_non_pure_calls_on_hot_path = 0;
+  size->non_call_stmts_on_hot_path = 0;
+  size->num_branches_on_hot_path = 0;
+  size->constant_iv = 0;
+  if (!process_loop ())
+    return true;
+
   while (path.length ())
     {
       basic_block bb = path.pop ();
@@ -412,13 +477,10 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
 	  else if (gimple_code (stmt) != GIMPLE_DEBUG)
 	    size->non_call_stmts_on_hot_path++;
 	  if (((gimple_code (stmt) == GIMPLE_COND
-	        && (!constant_after_peeling (gimple_cond_lhs (stmt), stmt, loop)
-		    || !constant_after_peeling (gimple_cond_rhs (stmt), stmt,
-						loop)))
+		&& (!vals->get (gimple_cond_lhs (stmt))
+		    || !vals->get (gimple_cond_rhs (stmt))))
 	       || (gimple_code (stmt) == GIMPLE_SWITCH
-		   && !constant_after_peeling (gimple_switch_index (
-						 as_a <gswitch *> (stmt)),
-					       stmt, loop)))
+		   && !vals->get (gimple_switch_index (as_a <gswitch *> (stmt)))))
 	      && (!exit || bb != exit->src))
 	    size->num_branches_on_hot_path++;
 	}
@@ -430,6 +492,8 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
 	     size->last_iteration_eliminated_by_peeling);
 
   free (body);
+  delete vals;
+  vals = nullptr;
   return false;
 }
 
-- 
2.35.3



* Re: [PATCH] Don't reduce estimated unrolled size for innermost loop.
  2024-05-13  7:40 ` Richard Biener
@ 2024-05-15  2:14   ` Hongtao Liu
  2024-05-15  9:24     ` Richard Biener
  0 siblings, 1 reply; 12+ messages in thread
From: Hongtao Liu @ 2024-05-15  2:14 UTC (permalink / raw)
  To: Richard Biener; +Cc: liuhongt, gcc-patches

On Mon, May 13, 2024 at 3:40 PM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Mon, May 13, 2024 at 4:29 AM liuhongt <hongtao.liu@intel.com> wrote:
> >
> > As shown by the testcase in the PR, at O3 cunrolli may prevent
> > vectorization of the innermost loop and increase register pressure.
> > The patch removes the 1/3 reduction of unr_insns for the innermost loop
> > for UL_ALL.  ul != UL_ALL is needed since complete unrolling of some
> > small loops at O2 relies on the reduction.
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > No big impact for SPEC2017.
> > Ok for trunk?
>
> This removes the 1/3 reduction when unrolling a loop nest (the case I was
> concerned about).  Unrolling of a nest is done by iterating in
> tree_unroll_loops_completely, so the loop to be unrolled appears
> innermost.  So I think you need a new parameter on
> tree_unroll_loops_completely_1 indicating whether we're in the first
> iteration (or whether to assume innermost loops will "simplify").
yes, it would be better.
>
> Few comments below
>
> > gcc/ChangeLog:
> >
> >         PR tree-optimization/112325
> >         * tree-ssa-loop-ivcanon.cc (estimated_unrolled_size): Add 2
> >         new parameters: loop and ul, and remove unr_insns reduction
> >         for innermost loop.
> >         (try_unroll_loop_completely): Pass loop and ul to
> >         estimated_unrolled_size.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.dg/tree-ssa/pr112325.c: New test.
> >         * gcc.dg/vect/pr69783.c: Add extra option --param
> >         max-completely-peeled-insns=300.
> > ---
> >  gcc/testsuite/gcc.dg/tree-ssa/pr112325.c | 57 ++++++++++++++++++++++++
> >  gcc/testsuite/gcc.dg/vect/pr69783.c      |  2 +-
> >  gcc/tree-ssa-loop-ivcanon.cc             | 16 +++++--
> >  3 files changed, 71 insertions(+), 4 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> >
> > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> > new file mode 100644
> > index 00000000000..14208b3e7f8
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> > @@ -0,0 +1,57 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -fdump-tree-cunrolli-details" } */
> > +
> > +typedef unsigned short ggml_fp16_t;
> > +static float table_f32_f16[1 << 16];
> > +
> > +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
> > +    unsigned short s;
> > +    __builtin_memcpy(&s, &f, sizeof(unsigned short));
> > +    return table_f32_f16[s];
> > +}
> > +
> > +typedef struct {
> > +    ggml_fp16_t d;
> > +    ggml_fp16_t m;
> > +    unsigned char qh[4];
> > +    unsigned char qs[32 / 2];
> > +} block_q5_1;
> > +
> > +typedef struct {
> > +    float d;
> > +    float s;
> > +    char qs[32];
> > +} block_q8_1;
> > +
> > +void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
> > +    const int qk = 32;
> > +    const int nb = n / qk;
> > +
> > +    const block_q5_1 * restrict x = vx;
> > +    const block_q8_1 * restrict y = vy;
> > +
> > +    float sumf = 0.0;
> > +
> > +    for (int i = 0; i < nb; i++) {
> > +        unsigned qh;
> > +        __builtin_memcpy(&qh, x[i].qh, sizeof(qh));
> > +
> > +        int sumi = 0;
> > +
> > +        for (int j = 0; j < qk/2; ++j) {
> > +            const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
> > +            const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
> > +
> > +            const int x0 = (x[i].qs[j] & 0xF) | xh_0;
> > +            const int x1 = (x[i].qs[j] >> 4) | xh_1;
> > +
> > +            sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
> > +        }
> > +
> > +        sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
> > +    }
> > +
> > +    *s = sumf;
> > +}
> > +
> > +/* { dg-final { scan-tree-dump {(?n)Not unrolling loop [1-9] \(--param max-completely-peel-times limit reached} "cunrolli"} } */
> > diff --git a/gcc/testsuite/gcc.dg/vect/pr69783.c b/gcc/testsuite/gcc.dg/vect/pr69783.c
> > index 5df95d0ce4e..a1f75514d72 100644
> > --- a/gcc/testsuite/gcc.dg/vect/pr69783.c
> > +++ b/gcc/testsuite/gcc.dg/vect/pr69783.c
> > @@ -1,6 +1,6 @@
> >  /* { dg-do compile } */
> >  /* { dg-require-effective-target vect_float } */
> > -/* { dg-additional-options "-Ofast -funroll-loops" } */
> > +/* { dg-additional-options "-Ofast -funroll-loops --param max-completely-peeled-insns=300" } */
>
> If we rely on unrolling of a loop can you put #pragma unroll [N]
> before the respective loop
> instead?
>
> >  #define NXX 516
> >  #define NYY 516
> > diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
> > index bf017137260..5e0eca647a1 100644
> > --- a/gcc/tree-ssa-loop-ivcanon.cc
> > +++ b/gcc/tree-ssa-loop-ivcanon.cc
> > @@ -444,7 +444,9 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
> >
> >  static unsigned HOST_WIDE_INT
> >  estimated_unrolled_size (struct loop_size *size,
> > -                        unsigned HOST_WIDE_INT nunroll)
> > +                        unsigned HOST_WIDE_INT nunroll,
> > +                        enum unroll_level ul,
> > +                        class loop* loop)
> >  {
> >    HOST_WIDE_INT unr_insns = ((nunroll)
> >                              * (HOST_WIDE_INT) (size->overall
> > @@ -453,7 +455,15 @@ estimated_unrolled_size (struct loop_size *size,
> >      unr_insns = 0;
> >    unr_insns += size->last_iteration - size->last_iteration_eliminated_by_peeling;
> >
> > -  unr_insns = unr_insns * 2 / 3;
> > +  /* For the innermost loop, the loop body is not likely to be simplified
> > +     by as much as 1/3, and unrolling may increase register pressure a lot.
> > +     UL != UL_ALL is needed to still unroll small loops at O2.  */
> > +  class loop *loop_father = loop_outer (loop);
> > +  if (loop->inner || !loop_father
>
> Do we ever get here for !loop_father?  We shouldn't.
>
> > +      || loop_father->latch == EXIT_BLOCK_PTR_FOR_FN (cfun)
>
> This means you exempt all loops that are direct children of the loop
> root tree.  That doesn't make much sense.
>
> > +      || ul != UL_ALL)
>
> This is also quite odd - we're being more optimistic for UL_NO_GROWTH
> than for UL_ALL?  This doesn't make much sense.
>
> Overall I think this means removal of being optimistic doesn't work so well?
They're mostly used to avoid testcase regressions.  The regressed
testcases rely on the loops being completely unrolled by the first
unroll pass, but now they're only unrolled by the second one.
I checked some, and the codegen is the same; I need to go through all
of them.  If the final codegen is the same or optimal, shall I just
adjust the testcases?

g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++14 LP64 note (test for warnings, line 56)
g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++14 note (test for warnings, line 66)
g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++17 LP64 note (test for warnings, line 56)
g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++17 note (test for warnings, line 66)
g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++20 LP64 note (test for warnings, line 56)
g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++20 note (test for warnings, line 66)
g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++98 LP64 note (test for warnings, line 56)
g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++98 note (test for warnings, line 66)
gcc: gcc.dg/Warray-bounds-68.c  (test for warnings, line 18)
gcc: gcc.dg/graphite/interchange-8.c execution test
gcc: gcc.dg/tree-prof/update-cunroll-2.c scan-tree-dump-not optimized "Invalid sum"
gcc: gcc.dg/tree-ssa/cunroll-1.c scan-tree-dump cunrolli "Last iteration exit edge was proved true."
gcc: gcc.dg/tree-ssa/cunroll-1.c scan-tree-dump cunrolli "loop with 2 iterations completely unrolled"
gcc: gcc.dg/tree-ssa/dump-6.c scan-tree-dump store-merging "MEM <unsigned long> \\[\\(char \\*\\)\\&a8] = "
gcc: gcc.dg/tree-ssa/loop-36.c scan-tree-dump-not dce3 "c.array"
gcc: gcc.dg/tree-ssa/ssa-dom-cse-5.c scan-tree-dump-times dom2 "return 3;" 1
gcc: gcc.dg/tree-ssa/update-cunroll.c scan-tree-dump-times optimized "Invalid sum" 0
gcc: gcc.dg/tree-ssa/vrp88.c scan-tree-dump vrp1 "Folded into: if.*"
gcc: gcc.dg/vect/no-vfa-vect-dv-2.c scan-tree-dump-times vect "vectorized 3 loops" 1

>
> If we need some extra leeway for UL_NO_GROWTH for what we expect
> to unroll it might be better to add sth like --param
> nogrowth-completely-peeled-insns
> specifying a fixed surplus size?  Or we need to look at what's the problem
> with the testcases regressing or the one you are trying to fix.
>
> I did experiment with better estimating cleanup done at some point
> (see attached),
> but didn't get to finishing that (and as said, as we're running VN on the result
> we'd ideally do that as part of the estimation somehow).
>
> Richard.
>
> > +    unr_insns = unr_insns * 2 / 3;
> > +
> >    if (unr_insns <= 0)
> >      unr_insns = 1;
> >
> > @@ -837,7 +847,7 @@ try_unroll_loop_completely (class loop *loop,
> >
> >           unsigned HOST_WIDE_INT ninsns = size.overall;
> >           unsigned HOST_WIDE_INT unr_insns
> > -           = estimated_unrolled_size (&size, n_unroll);
> > +           = estimated_unrolled_size (&size, n_unroll, ul, loop);
> >           if (dump_file && (dump_flags & TDF_DETAILS))
> >             {
> >               fprintf (dump_file, "  Loop size: %d\n", (int) ninsns);
> > --
> > 2.31.1
> >



-- 
BR,
Hongtao


* Re: [PATCH] Don't reduce estimated unrolled size for innermost loop.
  2024-05-15  2:14   ` Hongtao Liu
@ 2024-05-15  9:24     ` Richard Biener
  2024-05-15  9:49       ` Hongtao Liu
  2024-05-21  2:35       ` Hongtao Liu
  0 siblings, 2 replies; 12+ messages in thread
From: Richard Biener @ 2024-05-15  9:24 UTC (permalink / raw)
  To: Hongtao Liu; +Cc: liuhongt, gcc-patches, Jan Hubicka

On Wed, May 15, 2024 at 4:15 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Mon, May 13, 2024 at 3:40 PM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> > On Mon, May 13, 2024 at 4:29 AM liuhongt <hongtao.liu@intel.com> wrote:
> > >
> > > As shown by the testcase in the PR, at O3 cunrolli may prevent
> > > vectorization of the innermost loop and increase register pressure.
> > > The patch removes the 1/3 reduction of unr_insns for the innermost loop
> > > for UL_ALL.  ul != UL_ALL is needed since complete unrolling of some
> > > small loops at O2 relies on the reduction.
> > >
> > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > No big impact for SPEC2017.
> > > Ok for trunk?
> >
> > This removes the 1/3 reduction when unrolling a loop nest (the case I was
> > concerned about).  Unrolling of a nest is done by iterating in
> > tree_unroll_loops_completely, so the loop to be unrolled appears
> > innermost.  So I think you need a new parameter on
> > tree_unroll_loops_completely_1 indicating whether we're in the first
> > iteration (or whether to assume innermost loops will "simplify").
> yes, it would be better.
> >
> > Few comments below
> >
> > > gcc/ChangeLog:
> > >
> > >         PR tree-optimization/112325
> > >         * tree-ssa-loop-ivcanon.cc (estimated_unrolled_size): Add 2
> > >         new parameters: loop and ul, and remove unr_insns reduction
> > >         for innermost loop.
> > >         (try_unroll_loop_completely): Pass loop and ul to
> > >         estimated_unrolled_size.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >         * gcc.dg/tree-ssa/pr112325.c: New test.
> > >         * gcc.dg/vect/pr69783.c: Add extra option --param
> > >         max-completely-peeled-insns=300.
> > > ---
> > >  gcc/testsuite/gcc.dg/tree-ssa/pr112325.c | 57 ++++++++++++++++++++++++
> > >  gcc/testsuite/gcc.dg/vect/pr69783.c      |  2 +-
> > >  gcc/tree-ssa-loop-ivcanon.cc             | 16 +++++--
> > >  3 files changed, 71 insertions(+), 4 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> > >
> > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> > > new file mode 100644
> > > index 00000000000..14208b3e7f8
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> > > @@ -0,0 +1,57 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -fdump-tree-cunrolli-details" } */
> > > +
> > > +typedef unsigned short ggml_fp16_t;
> > > +static float table_f32_f16[1 << 16];
> > > +
> > > +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
> > > +    unsigned short s;
> > > +    __builtin_memcpy(&s, &f, sizeof(unsigned short));
> > > +    return table_f32_f16[s];
> > > +}
> > > +
> > > +typedef struct {
> > > +    ggml_fp16_t d;
> > > +    ggml_fp16_t m;
> > > +    unsigned char qh[4];
> > > +    unsigned char qs[32 / 2];
> > > +} block_q5_1;
> > > +
> > > +typedef struct {
> > > +    float d;
> > > +    float s;
> > > +    char qs[32];
> > > +} block_q8_1;
> > > +
> > > +void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
> > > +    const int qk = 32;
> > > +    const int nb = n / qk;
> > > +
> > > +    const block_q5_1 * restrict x = vx;
> > > +    const block_q8_1 * restrict y = vy;
> > > +
> > > +    float sumf = 0.0;
> > > +
> > > +    for (int i = 0; i < nb; i++) {
> > > +        unsigned qh;
> > > +        __builtin_memcpy(&qh, x[i].qh, sizeof(qh));
> > > +
> > > +        int sumi = 0;
> > > +
> > > +        for (int j = 0; j < qk/2; ++j) {
> > > +            const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
> > > +            const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
> > > +
> > > +            const int x0 = (x[i].qs[j] & 0xF) | xh_0;
> > > +            const int x1 = (x[i].qs[j] >> 4) | xh_1;
> > > +
> > > +            sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
> > > +        }
> > > +
> > > +        sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
> > > +    }
> > > +
> > > +    *s = sumf;
> > > +}
> > > +
> > > +/* { dg-final { scan-tree-dump {(?n)Not unrolling loop [1-9] \(--param max-completely-peel-times limit reached} "cunrolli"} } */
> > > diff --git a/gcc/testsuite/gcc.dg/vect/pr69783.c b/gcc/testsuite/gcc.dg/vect/pr69783.c
> > > index 5df95d0ce4e..a1f75514d72 100644
> > > --- a/gcc/testsuite/gcc.dg/vect/pr69783.c
> > > +++ b/gcc/testsuite/gcc.dg/vect/pr69783.c
> > > @@ -1,6 +1,6 @@
> > >  /* { dg-do compile } */
> > >  /* { dg-require-effective-target vect_float } */
> > > -/* { dg-additional-options "-Ofast -funroll-loops" } */
> > > +/* { dg-additional-options "-Ofast -funroll-loops --param max-completely-peeled-insns=300" } */
> >
> > If we rely on unrolling of a loop can you put #pragma unroll [N]
> > before the respective loop
> > instead?
> >
> > >  #define NXX 516
> > >  #define NYY 516
> > > diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
> > > index bf017137260..5e0eca647a1 100644
> > > --- a/gcc/tree-ssa-loop-ivcanon.cc
> > > +++ b/gcc/tree-ssa-loop-ivcanon.cc
> > > @@ -444,7 +444,9 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
> > >
> > >  static unsigned HOST_WIDE_INT
> > >  estimated_unrolled_size (struct loop_size *size,
> > > -                        unsigned HOST_WIDE_INT nunroll)
> > > +                        unsigned HOST_WIDE_INT nunroll,
> > > +                        enum unroll_level ul,
> > > +                        class loop* loop)
> > >  {
> > >    HOST_WIDE_INT unr_insns = ((nunroll)
> > >                              * (HOST_WIDE_INT) (size->overall
> > > @@ -453,7 +455,15 @@ estimated_unrolled_size (struct loop_size *size,
> > >      unr_insns = 0;
> > >    unr_insns += size->last_iteration - size->last_iteration_eliminated_by_peeling;
> > >
> > > -  unr_insns = unr_insns * 2 / 3;
> > > +  /* For the innermost loop, the loop body is not likely to be simplified
> > > +     by as much as 1/3, and unrolling may increase register pressure a lot.
> > > +     UL != UL_ALL is needed to still unroll small loops at O2.  */
> > > +  class loop *loop_father = loop_outer (loop);
> > > +  if (loop->inner || !loop_father
> >
> > Do we ever get here for !loop_father?  We shouldn't.
> >
> > > +      || loop_father->latch == EXIT_BLOCK_PTR_FOR_FN (cfun)
> >
> > This means you exempt all loops that are direct children of the loop
> > root tree.  That doesn't make much sense.
> >
> > > +      || ul != UL_ALL)
> >
> > This is also quite odd - we're being more optimistic for UL_NO_GROWTH
> > than for UL_ALL?  This doesn't make much sense.
> >
> > Overall I think this means removal of being optimistic doesn't work so well?
> They're mostly used to avoid testcase regressions.  The regressed
> testcases rely on the loops being completely unrolled by the first
> unroll pass, but now they're only unrolled by the second one.
> I checked some, and the codegen is the same; I need to go through all
> of them.  If the final codegen is the same or optimal, shall I just
> adjust the testcases?
>
> g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++14 LP64 note (test for warnings, line 56)
> g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++14 note (test for warnings, line 66)
> g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++17 LP64 note (test for warnings, line 56)
> g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++17 note (test for warnings, line 66)
> g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++20 LP64 note (test for warnings, line 56)
> g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++20 note (test for warnings, line 66)
> g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++98 LP64 note (test for warnings, line 56)
> g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++98 note (test for warnings, line 66)

This seems to expect unrolling for an init loop rolling 1 time.  I don't
see 1/3 of the stmts vanishing but it's definitely an interesting corner
case.  That's why I was thinking of maybe adding a --param specifying
an absolute growth we consider "no growth" - of course that's ugly as
well, but it would cover these small loops.

How do the sizes play out here after your change?  Before it's

size: 13-3, last_iteration: 2-2
  Loop size: 13
  Estimated size after unrolling: 13

and the init is quite complex with virtual pointer inits.  We do have

  size:   1 _14 = _5 + -1;
   Induction variable computation will be folded away.
  size:   1 _15 = _4 + 40;
 BB: 3, after_exit: 1

where we don't realize the + 40 of _15 will be folded into the dereferences
but that would only subtract 1.

  size:   3 C::C (_23, &MEM <const void *[8]> [(void *)&_ZTT2D1 + 48B]);

that's the biggest cost.

To diagnose the array bound issue we rely on early unrolling since we avoid
-Warray-bounds after late unrolling due to false positives.

This is definitely not an unrolling that preserves code size.

> gcc: gcc.dg/Warray-bounds-68.c  (test for warnings, line 18)
> gcc: gcc.dg/graphite/interchange-8.c execution test

An execute fail is bad ... can we avoid this (but file a bugreport!) when
placing #pragma GCC unroll before the innermost loop?  We should
probably honor that in early unrolling (not sure if we do).

> gcc: gcc.dg/tree-prof/update-cunroll-2.c scan-tree-dump-not optimized "Invalid sum"
> gcc: gcc.dg/tree-ssa/cunroll-1.c scan-tree-dump cunrolli "Last iteration exit edge was proved true."
> gcc: gcc.dg/tree-ssa/cunroll-1.c scan-tree-dump cunrolli "loop with 2 iterations completely unrolled"

again the current estimate is the same before/after unrolling, here
we expect to retain one compare & branch.

> gcc: gcc.dg/tree-ssa/dump-6.c scan-tree-dump store-merging "MEM <unsigned long> \\[\\(char \\*\\)\\&a8] = "
> gcc: gcc.dg/tree-ssa/loop-36.c scan-tree-dump-not dce3 "c.array"

again the 2/3 scaling is difficult to warrant.  The goal of the early unrolling
pass was abstraction penalty removal which works for low trip-count loops.
So maybe that new --param for allowed growth should scale but instead
of scaling by the loop size as 2/3 does it should scale by the number of
times we peel which means offsetting the body size estimate by a constant.
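
I.e., very roughly, instead of scaling

  unr_insns = unr_insns * 2 / 3;

do something like (X standing for a hypothetical per-copy allowance from
such a new --param):

  unr_insns -= MIN (unr_insns - 1, (HOST_WIDE_INT) nunroll * X);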

Honza?  Any idea how to go forward here?

Thanks,
Richard.

> gcc: gcc.dg/tree-ssa/ssa-dom-cse-5.c scan-tree-dump-times dom2 "return 3;" 1
> gcc: gcc.dg/tree-ssa/update-cunroll.c scan-tree-dump-times optimized "Invalid sum" 0
> gcc: gcc.dg/tree-ssa/vrp88.c scan-tree-dump vrp1 "Folded into: if.*"
> gcc: gcc.dg/vect/no-vfa-vect-dv-2.c scan-tree-dump-times vect "vectorized 3 loops" 1
>
> >
> > If we need some extra leeway for UL_NO_GROWTH for what we expect
> > to unroll it might be better to add sth like --param
> > nogrowth-completely-peeled-insns
> > specifying a fixed surplus size?  Or we need to look at what's the problem
> > with the testcases regressing or the one you are trying to fix.
> >
> > I did experiment with better estimating cleanup done at some point
> > (see attached),
> > but didn't get to finishing that (and as said, as we're running VN on the result
> > we'd ideally do that as part of the estimation somehow).
> >
> > Richard.
> >
> > > +    unr_insns = unr_insns * 2 / 3;
> > > +
> > >    if (unr_insns <= 0)
> > >      unr_insns = 1;
> > >
> > > @@ -837,7 +847,7 @@ try_unroll_loop_completely (class loop *loop,
> > >
> > >           unsigned HOST_WIDE_INT ninsns = size.overall;
> > >           unsigned HOST_WIDE_INT unr_insns
> > > -           = estimated_unrolled_size (&size, n_unroll);
> > > +           = estimated_unrolled_size (&size, n_unroll, ul, loop);
> > >           if (dump_file && (dump_flags & TDF_DETAILS))
> > >             {
> > >               fprintf (dump_file, "  Loop size: %d\n", (int) ninsns);
> > > --
> > > 2.31.1
> > >
>
>
>
> --
> BR,
> Hongtao


* Re: [PATCH] Don't reduce estimated unrolled size for innermost loop.
  2024-05-15  9:24     ` Richard Biener
@ 2024-05-15  9:49       ` Hongtao Liu
  2024-05-21  2:35       ` Hongtao Liu
  1 sibling, 0 replies; 12+ messages in thread
From: Hongtao Liu @ 2024-05-15  9:49 UTC (permalink / raw)
  To: Richard Biener; +Cc: liuhongt, gcc-patches, Jan Hubicka

> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++14 LP64 note (test for warnings, line 56)
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++14 note (test for warnings, line 66)
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++17 LP64 note (test for warnings, line 56)
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++17 note (test for warnings, line 66)
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++20 LP64 note (test for warnings, line 56)
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++20 note (test for warnings, line 66)
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++98 LP64 note (test for warnings, line 56)
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++98 note (test for warnings, line 66)
>
> This seems to expect unrolling for an init loop rolling 1 time.  I don't
> see 1/3 of the stmts vanishing but it's definitely an interesting corner
> case.  That's why I was thinking of maybe adding a --param specifying
> an absolute growth we consider "no growth" - of course that's ugly as
> well, but it would cover these small loops.
>
> How do the sizes play out here after your change?  Before it's
>
> size: 13-3, last_iteration: 2-2
>   Loop size: 13
>   Estimated size after unrolling: 13
After:
size: 13-3, last_iteration: 2-2
  Loop size: 13
  Estimated size after unrolling: 20
Not unrolling loop 1: size would grow.
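
(For reference, those numbers follow from the formula in
estimated_unrolled_size with n_unroll == 2 here:
2 * (13 - 3) + (2 - 2) = 20, and the old optimistic scaling then gave
20 * 2 / 3 = 13 in truncating integer arithmetic, matching the
pre-patch estimate above.)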

>
> and the init is quite complex with virtual pointer inits.  We do have
>
>   size:   1 _14 = _5 + -1;
>    Induction variable computation will be folded away.
>   size:   1 _15 = _4 + 40;
>  BB: 3, after_exit: 1
>
> where we don't realize the + 40 of _15 will be folded into the dereferences
> but that would only subtract 1.
>
>   size:   3 C::C (_23, &MEM <const void *[8]> [(void *)&_ZTT2D1 + 48B]);
>
> that's the biggest cost.
>
> To diagnose the array bound issue we rely on early unrolling since we avoid
> -Warray-bounds after late unrolling due to false positives.
>
> This is definitely not an unrolling that preserves code size.
>
> > gcc: gcc.dg/Warray-bounds-68.c  (test for warnings, line 18)
> > gcc: gcc.dg/graphite/interchange-8.c execution test
>
> An execute fail is bad ... can we avoid this (but file a bugreport!) when
It's PR115101
> placing #pragma GCC unroll before the innermost loop?  We should
> probably honor that in early unrolling (not sure if we do).
>
> > gcc: gcc.dg/tree-prof/update-cunroll-2.c scan-tree-dump-not optimized "Invalid sum"
> > gcc: gcc.dg/tree-ssa/cunroll-1.c scan-tree-dump cunrolli "Last iteration exit edge was proved true."
> > gcc: gcc.dg/tree-ssa/cunroll-1.c scan-tree-dump cunrolli "loop with 2 iterations completely unrolled"
>
> again the current estimate is the same before/after unrolling, here
> we expect to retain one compare & branch.
>
> > gcc: gcc.dg/tree-ssa/dump-6.c scan-tree-dump store-merging "MEM <unsigned long> \\[\\(char \\*\\)\\&a8] = "
> > gcc: gcc.dg/tree-ssa/loop-36.c scan-tree-dump-not dce3 "c.array"
>
> again the 2/3 scaling is difficult to warrant.  The goal of the early unrolling
> pass was abstraction penalty removal which works for low trip-count loops.
> So maybe that new --param for allowed growth should scale but instead
> of scaling by the loop size as 2/3 does it should scale by the number of
> times we peel which means offsetting the body size estimate by a constant.
>
> Honza?  Any idea how to go forward here?
>
> Thanks,
> Richard.
>
> > gcc: gcc.dg/tree-ssa/ssa-dom-cse-5.c scan-tree-dump-times dom2 "return 3;" 1
> > gcc: gcc.dg/tree-ssa/update-cunroll.c scan-tree-dump-times optimized "Invalid sum" 0
> > gcc: gcc.dg/tree-ssa/vrp88.c scan-tree-dump vrp1 "Folded into: if.*"
> > gcc: gcc.dg/vect/no-vfa-vect-dv-2.c scan-tree-dump-times vect "vectorized 3 loops" 1
> >
> > >
> > > If we need some extra leeway for UL_NO_GROWTH for what we expect
> > > to unroll it might be better to add sth like --param
> > > nogrowth-completely-peeled-insns
> > > specifying a fixed surplus size?  Or we need to look at what's the problem
> > > with the testcases regressing or the one you are trying to fix.
> > >
> > > I did experiment with better estimating cleanup done at some point
> > > (see attached),
> > > but didn't get to finishing that (and as said, as we're running VN on the result
> > > we'd ideally do that as part of the estimation somehow).
> > >
> > > Richard.
> > >
> > > > +    unr_insns = unr_insns * 2 / 3;
> > > > +
> > > >    if (unr_insns <= 0)
> > > >      unr_insns = 1;
> > > >
> > > > @@ -837,7 +847,7 @@ try_unroll_loop_completely (class loop *loop,
> > > >
> > > >           unsigned HOST_WIDE_INT ninsns = size.overall;
> > > >           unsigned HOST_WIDE_INT unr_insns
> > > > -           = estimated_unrolled_size (&size, n_unroll);
> > > > +           = estimated_unrolled_size (&size, n_unroll, ul, loop);
> > > >           if (dump_file && (dump_flags & TDF_DETAILS))
> > > >             {
> > > >               fprintf (dump_file, "  Loop size: %d\n", (int) ninsns);
> > > > --
> > > > 2.31.1
> > > >
> >
> >
> >
> > --
> > BR,
> > Hongtao



-- 
BR,
Hongtao


* Re: [PATCH] Don't reduce estimated unrolled size for innermost loop.
  2024-05-15  9:24     ` Richard Biener
  2024-05-15  9:49       ` Hongtao Liu
@ 2024-05-21  2:35       ` Hongtao Liu
  2024-05-21  6:14         ` Richard Biener
  1 sibling, 1 reply; 12+ messages in thread
From: Hongtao Liu @ 2024-05-21  2:35 UTC (permalink / raw)
  To: Richard Biener; +Cc: liuhongt, gcc-patches, Jan Hubicka

On Wed, May 15, 2024 at 5:24 PM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Wed, May 15, 2024 at 4:15 AM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Mon, May 13, 2024 at 3:40 PM Richard Biener
> > <richard.guenther@gmail.com> wrote:
> > >
> > > On Mon, May 13, 2024 at 4:29 AM liuhongt <hongtao.liu@intel.com> wrote:
> > > >
> > > > As shown by the testcase in the PR, at O3 cunrolli may prevent
> > > > vectorization of the innermost loop and increase register pressure.
> > > > The patch removes the 1/3 reduction of unr_insns for the innermost loop
> > > > for UL_ALL.  ul != UL_ALL is needed since complete unrolling of some
> > > > small loops at O2 relies on the reduction.
> > > >
> > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > > No big impact for SPEC2017.
> > > > Ok for trunk?
> > >
> > > This removes the 1/3 reduction when unrolling a loop nest (the case I was
> > > concerned about).  Unrolling of a nest is done by iterating in
> > > tree_unroll_loops_completely, so the loop to be unrolled appears
> > > innermost.  So I think you need a new parameter on
> > > tree_unroll_loops_completely_1 indicating whether we're in the first
> > > iteration (or whether to assume innermost loops will "simplify").
> > yes, it would be better.
> > >
> > > Few comments below
> > >
> > > > gcc/ChangeLog:
> > > >
> > > >         PR tree-optimization/112325
> > > >         * tree-ssa-loop-ivcanon.cc (estimated_unrolled_size): Add 2
> > > >         new parameters: loop and ul, and remove unr_insns reduction
> > > >         for innermost loop.
> > > >         (try_unroll_loop_completely): Pass loop and ul to
> > > >         estimated_unrolled_size.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > >         * gcc.dg/tree-ssa/pr112325.c: New test.
> > > >         * gcc.dg/vect/pr69783.c: Add extra option --param
> > > >         max-completely-peeled-insns=300.
> > > > ---
> > > >  gcc/testsuite/gcc.dg/tree-ssa/pr112325.c | 57 ++++++++++++++++++++++++
> > > >  gcc/testsuite/gcc.dg/vect/pr69783.c      |  2 +-
> > > >  gcc/tree-ssa-loop-ivcanon.cc             | 16 +++++--
> > > >  3 files changed, 71 insertions(+), 4 deletions(-)
> > > >  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> > > >
> > > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> > > > new file mode 100644
> > > > index 00000000000..14208b3e7f8
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> > > > @@ -0,0 +1,57 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O2 -fdump-tree-cunrolli-details" } */
> > > > +
> > > > +typedef unsigned short ggml_fp16_t;
> > > > +static float table_f32_f16[1 << 16];
> > > > +
> > > > +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
> > > > +    unsigned short s;
> > > > +    __builtin_memcpy(&s, &f, sizeof(unsigned short));
> > > > +    return table_f32_f16[s];
> > > > +}
> > > > +
> > > > +typedef struct {
> > > > +    ggml_fp16_t d;
> > > > +    ggml_fp16_t m;
> > > > +    unsigned char qh[4];
> > > > +    unsigned char qs[32 / 2];
> > > > +} block_q5_1;
> > > > +
> > > > +typedef struct {
> > > > +    float d;
> > > > +    float s;
> > > > +    char qs[32];
> > > > +} block_q8_1;
> > > > +
> > > > +void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
> > > > +    const int qk = 32;
> > > > +    const int nb = n / qk;
> > > > +
> > > > +    const block_q5_1 * restrict x = vx;
> > > > +    const block_q8_1 * restrict y = vy;
> > > > +
> > > > +    float sumf = 0.0;
> > > > +
> > > > +    for (int i = 0; i < nb; i++) {
> > > > +        unsigned qh;
> > > > +        __builtin_memcpy(&qh, x[i].qh, sizeof(qh));
> > > > +
> > > > +        int sumi = 0;
> > > > +
> > > > +        for (int j = 0; j < qk/2; ++j) {
> > > > +            const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
> > > > +            const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
> > > > +
> > > > +            const int x0 = (x[i].qs[j] & 0xF) | xh_0;
> > > > +            const int x1 = (x[i].qs[j] >> 4) | xh_1;
> > > > +
> > > > +            sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
> > > > +        }
> > > > +
> > > > +        sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
> > > > +    }
> > > > +
> > > > +    *s = sumf;
> > > > +}
> > > > +
> > > > +/* { dg-final { scan-tree-dump {(?n)Not unrolling loop [1-9] \(--param max-completely-peel-times limit reached} "cunrolli"} } */
> > > > diff --git a/gcc/testsuite/gcc.dg/vect/pr69783.c b/gcc/testsuite/gcc.dg/vect/pr69783.c
> > > > index 5df95d0ce4e..a1f75514d72 100644
> > > > --- a/gcc/testsuite/gcc.dg/vect/pr69783.c
> > > > +++ b/gcc/testsuite/gcc.dg/vect/pr69783.c
> > > > @@ -1,6 +1,6 @@
> > > >  /* { dg-do compile } */
> > > >  /* { dg-require-effective-target vect_float } */
> > > > -/* { dg-additional-options "-Ofast -funroll-loops" } */
> > > > +/* { dg-additional-options "-Ofast -funroll-loops --param max-completely-peeled-insns=300" } */
> > >
> > > If we rely on unrolling of a loop can you put #pragma unroll [N]
> > > before the respective loop
> > > instead?
> > >
> > > >  #define NXX 516
> > > >  #define NYY 516
> > > > diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
> > > > index bf017137260..5e0eca647a1 100644
> > > > --- a/gcc/tree-ssa-loop-ivcanon.cc
> > > > +++ b/gcc/tree-ssa-loop-ivcanon.cc
> > > > @@ -444,7 +444,9 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
> > > >
> > > >  static unsigned HOST_WIDE_INT
> > > >  estimated_unrolled_size (struct loop_size *size,
> > > > -                        unsigned HOST_WIDE_INT nunroll)
> > > > +                        unsigned HOST_WIDE_INT nunroll,
> > > > +                        enum unroll_level ul,
> > > > +                        class loop* loop)
> > > >  {
> > > >    HOST_WIDE_INT unr_insns = ((nunroll)
> > > >                              * (HOST_WIDE_INT) (size->overall
> > > > @@ -453,7 +455,15 @@ estimated_unrolled_size (struct loop_size *size,
> > > >      unr_insns = 0;
> > > >    unr_insns += size->last_iteration - size->last_iteration_eliminated_by_peeling;
> > > >
> > > > -  unr_insns = unr_insns * 2 / 3;
> > > > +  /* For the innermost loop, the loop body is not likely to be simplified
> > > > +     by as much as 1/3, and unrolling may increase register pressure a lot.
> > > > +     UL != UL_ALL is needed to still unroll small loops at O2.  */
> > > > +  class loop *loop_father = loop_outer (loop);
> > > > +  if (loop->inner || !loop_father
> > >
> > > Do we ever get here for !loop_father?  We shouldn't.
> > >
> > > > +      || loop_father->latch == EXIT_BLOCK_PTR_FOR_FN (cfun)
> > >
> > > This means you exempt all loops that are direct children of the loop
> > > root tree.  That doesn't make much sense.
> > >
> > > > +      || ul != UL_ALL)
> > >
> > > This is also quite odd - we're being more optimistic for UL_NO_GROWTH
> > > than for UL_ALL?  This doesn't make much sense.
> > >
> > > Overall I think this means removal of being optimistic doesn't work so well?
> > They're mostly used to avoid testcase regressions.  The regressed
> > testcases rely on the loop being completely unrolled by the first
> > unroll pass, but now it's only unrolled by the second one.
> > I checked some of them and the codegen is the same.  I need to go
> > through all of them; if the final codegen is the same or optimal,
> > shall I just adjust the testcases?
> >
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++14 LP64 note (test for warnings, line 56)
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++14 note (test for warnings, line 66)
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++17 LP64 note (test for warnings, line 56)
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++17 note (test for warnings, line 66)
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++20 LP64 note (test for warnings, line 56)
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++20 note (test for warnings, line 66)
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++98 LP64 note (test for warnings, line 56)
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++98 note (test for warnings, line 66)
>
> This seems to expect unrolling for an init loop rolling 1 time.  I don't
> see 1/3 of the stmts vanishing but it's definitely an interesting corner
> case.  That's why I was thinking of maybe adding a --param specifying
> an absolute growth we consider "no growth" - but of course that's
> ugly as well but it would cover these small loops.
>
> How do the sizes play out here after your change?  Before it's
>
> size: 13-3, last_iteration: 2-2
>   Loop size: 13
>   Estimated size after unrolling: 13
>
> and the init is quite complex with virtual pointer inits.  We do have
>
>   size:   1 _14 = _5 + -1;
>    Induction variable computation will be folded away.
>   size:   1 _15 = _4 + 40;
>  BB: 3, after_exit: 1
>
> where we don't realize the + 40 of _15 will be folded into the dereferences
> but that would only subtract 1.
>
>   size:   3 C::C (_23, &MEM <const void *[8]> [(void *)&_ZTT2D1 + 48B]);
>
> that's the biggest cost.
>
> To diagnose the array bound issue we rely on early unrolling since we avoid
> -Warray-bounds after late unrolling due to false positives.
>
> This is definitely not an unrolling that preserves code size.
>
> > gcc: gcc.dg/Warray-bounds-68.c  (test for warnings, line 18)
> >
> > gcc: gcc.dg/graphite/interchange-8.c execution test
>
> An execute fail is bad ... can we avoid this (but file a bugreport!) when
> placing #pragma GCC unroll before the innermost loop?  We should
> probably honor that in early unrolling (not sure if we do).
>
> > gcc: gcc.dg/tree-prof/update-cunroll-2.c scan-tree-dump-not optimized
> > "Invalid sum"
> >
> > gcc: gcc.dg/tree-ssa/cunroll-1.c scan-tree-dump cunrolli "Last
> > iteration exit edge was proved true."
> >
> > gcc: gcc.dg/tree-ssa/cunroll-1.c scan-tree-dump cunrolli "loop with 2
> > iterations completely unrolled"
>
> again the current estimate is the same before/after unrolling, here
> we expect to retain one compare & branch.
>
> > gcc: gcc.dg/tree-ssa/dump-6.c scan-tree-dump store-merging "MEM
> > <unsigned long> \\[\\(char \\*\\)\\&a8] = "
> >
> > gcc: gcc.dg/tree-ssa/loop-36.c scan-tree-dump-not dce3 "c.array"
>
> again the 2/3 scaling is difficult to warrant.  The goal of the early unrolling
> pass was abstraction penalty removal which works for low trip-count loops.
> So maybe that new --param for allowed growth should scale but instead
> of scaling by the loop size as 2/3 does it should scale by the number of
> times we peel which means offsetting the body size estimate by a constant.
>
> Honza?  Any idea how to go forward here?
>
> Thanks,
> Richard.
>
> > gcc: gcc.dg/tree-ssa/ssa-dom-cse-5.c scan-tree-dump-times dom2 "return 3;" 1
> >
> > gcc: gcc.dg/tree-ssa/update-cunroll.c scan-tree-dump-times optimized
> > "Invalid sum" 0
> >
> > gcc: gcc.dg/tree-ssa/vrp88.c scan-tree-dump vrp1 "Folded into: if.*"
> >
> > gcc: gcc.dg/vect/no-vfa-vect-dv-2.c scan-tree-dump-times vect
> > "vectorized 3 loops" 1
> >
> > >
> > > If we need some extra leeway for UL_NO_GROWTH for what we expect
> > > to unroll it might be better to add sth like --param
> > > nogrowth-completely-peeled-insns
It's hard to find a default value satisfying all testcases.
Some require unrolling a loop with a 7-insn size increase, some don't
want unrolling with even a 5-insn increase.
The original 2/3 reduction happened to meet all those testcases (or the
testcases were constructed based on the old 2/3).
Can we define the parameter as a loop body size threshold, below which
we still do the reduction, so small loops can still be unrolled?
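
For illustration, the check could look like this (the threshold
parameter is hypothetical, not an existing --param):

  /* Keep the optimistic 2/3 reduction only for small bodies.  */
  if (unr_insns <= (HOST_WIDE_INT) param_small_unroll_body_size)
    unr_insns = unr_insns * 2 / 3;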
> > > specifying a fixed surplus size?  Or we need to look at what's the problem
> > > with the testcases regressing or the one you are trying to fix.
> > >
> > > I did experiment with better estimating cleanup done at some point
> > > (see attached),
> > > but didn't get to finishing that (and as said, as we're running VN on the result
> > > we'd ideally do that as part of the estimation somehow).
> > >
> > > Richard.
> > >
> > > > +    unr_insns = unr_insns * 2 / 3;
> > > > +
> > > >    if (unr_insns <= 0)
> > > >      unr_insns = 1;
> > > >
> > > > @@ -837,7 +847,7 @@ try_unroll_loop_completely (class loop *loop,
> > > >
> > > >           unsigned HOST_WIDE_INT ninsns = size.overall;
> > > >           unsigned HOST_WIDE_INT unr_insns
> > > > -           = estimated_unrolled_size (&size, n_unroll);
> > > > +           = estimated_unrolled_size (&size, n_unroll, ul, loop);
> > > >           if (dump_file && (dump_flags & TDF_DETAILS))
> > > >             {
> > > >               fprintf (dump_file, "  Loop size: %d\n", (int) ninsns);
> > > > --
> > > > 2.31.1
> > > >
> >
> >
> >
> > --
> > BR,
> > Hongtao



-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] Don't reduce estimated unrolled size for innermost loop.
  2024-05-21  2:35       ` Hongtao Liu
@ 2024-05-21  6:14         ` Richard Biener
  2024-05-22  5:07           ` [V2 PATCH] Don't reduce estimated unrolled size for innermost loop at cunrolli liuhongt
  0 siblings, 1 reply; 12+ messages in thread
From: Richard Biener @ 2024-05-21  6:14 UTC (permalink / raw)
  To: Hongtao Liu; +Cc: liuhongt, gcc-patches, Jan Hubicka

On Tue, May 21, 2024 at 4:35 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Wed, May 15, 2024 at 5:24 PM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> > On Wed, May 15, 2024 at 4:15 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > On Mon, May 13, 2024 at 3:40 PM Richard Biener
> > > <richard.guenther@gmail.com> wrote:
> > > >
> > > > On Mon, May 13, 2024 at 4:29 AM liuhongt <hongtao.liu@intel.com> wrote:
> > > > >
> > > > > As testcase in the PR, O3 cunrolli may prevent vectorization for the
> > > > > innermost loop and increase register pressure.
> > > > > The patch removes the 1/3 reduction of unr_insn for innermost loop for UL_ALL.
> > > > > ul != UR_ALL is needed since some small loop complete unrolling at O2 relies
> > > > > the reduction.
> > > > >
> > > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > > > No big impact for SPEC2017.
> > > > > Ok for trunk?
> > > >
> > > > This removes the 1/3 reduction when unrolling a loop nest (the case I was
> > > > concerned about).  Unrolling of a nest is by iterating in
> > > > tree_unroll_loops_completely
> > > > so the to be unrolled loop appears innermost.  So I think you need a new
> > > > parameter on tree_unroll_loops_completely_1 indicating whether we're in the
> > > > first iteration (or whether to assume inner most loops will "simplify").
> > > yes, it would be better.
> > > >
> > > > Few comments below
> > > >
> > > > > gcc/ChangeLog:
> > > > >
> > > > >         PR tree-optimization/112325
> > > > >         * tree-ssa-loop-ivcanon.cc (estimated_unrolled_size): Add 2
> > > > >         new parameters: loop and ul, and remove unr_insns reduction
> > > > >         for innermost loop.
> > > > >         (try_unroll_loop_completely): Pass loop and ul to
> > > > >         estimated_unrolled_size.
> > > > >
> > > > > gcc/testsuite/ChangeLog:
> > > > >
> > > > >         * gcc.dg/tree-ssa/pr112325.c: New test.
> > > > >         * gcc.dg/vect/pr69783.c: Add extra option --param
> > > > >         max-completely-peeled-insns=300.
> > > > > ---
> > > > >  gcc/testsuite/gcc.dg/tree-ssa/pr112325.c | 57 ++++++++++++++++++++++++
> > > > >  gcc/testsuite/gcc.dg/vect/pr69783.c      |  2 +-
> > > > >  gcc/tree-ssa-loop-ivcanon.cc             | 16 +++++--
> > > > >  3 files changed, 71 insertions(+), 4 deletions(-)
> > > > >  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> > > > >
> > > > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> > > > > new file mode 100644
> > > > > index 00000000000..14208b3e7f8
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> > > > > @@ -0,0 +1,57 @@
> > > > > +/* { dg-do compile } */
> > > > > +/* { dg-options "-O2 -fdump-tree-cunrolli-details" } */
> > > > > +
> > > > > +typedef unsigned short ggml_fp16_t;
> > > > > +static float table_f32_f16[1 << 16];
> > > > > +
> > > > > +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
> > > > > +    unsigned short s;
> > > > > +    __builtin_memcpy(&s, &f, sizeof(unsigned short));
> > > > > +    return table_f32_f16[s];
> > > > > +}
> > > > > +
> > > > > +typedef struct {
> > > > > +    ggml_fp16_t d;
> > > > > +    ggml_fp16_t m;
> > > > > +    unsigned char qh[4];
> > > > > +    unsigned char qs[32 / 2];
> > > > > +} block_q5_1;
> > > > > +
> > > > > +typedef struct {
> > > > > +    float d;
> > > > > +    float s;
> > > > > +    char qs[32];
> > > > > +} block_q8_1;
> > > > > +
> > > > > +void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
> > > > > +    const int qk = 32;
> > > > > +    const int nb = n / qk;
> > > > > +
> > > > > +    const block_q5_1 * restrict x = vx;
> > > > > +    const block_q8_1 * restrict y = vy;
> > > > > +
> > > > > +    float sumf = 0.0;
> > > > > +
> > > > > +    for (int i = 0; i < nb; i++) {
> > > > > +        unsigned qh;
> > > > > +        __builtin_memcpy(&qh, x[i].qh, sizeof(qh));
> > > > > +
> > > > > +        int sumi = 0;
> > > > > +
> > > > > +        for (int j = 0; j < qk/2; ++j) {
> > > > > +            const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
> > > > > +            const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
> > > > > +
> > > > > +            const int x0 = (x[i].qs[j] & 0xF) | xh_0;
> > > > > +            const int x1 = (x[i].qs[j] >> 4) | xh_1;
> > > > > +
> > > > > +            sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
> > > > > +        }
> > > > > +
> > > > > +        sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
> > > > > +    }
> > > > > +
> > > > > +    *s = sumf;
> > > > > +}
> > > > > +
> > > > > +/* { dg-final { scan-tree-dump {(?n)Not unrolling loop [1-9] \(--param max-completely-peel-times limit reached} "cunrolli"} } */
> > > > > diff --git a/gcc/testsuite/gcc.dg/vect/pr69783.c b/gcc/testsuite/gcc.dg/vect/pr69783.c
> > > > > index 5df95d0ce4e..a1f75514d72 100644
> > > > > --- a/gcc/testsuite/gcc.dg/vect/pr69783.c
> > > > > +++ b/gcc/testsuite/gcc.dg/vect/pr69783.c
> > > > > @@ -1,6 +1,6 @@
> > > > >  /* { dg-do compile } */
> > > > >  /* { dg-require-effective-target vect_float } */
> > > > > -/* { dg-additional-options "-Ofast -funroll-loops" } */
> > > > > +/* { dg-additional-options "-Ofast -funroll-loops --param max-completely-peeled-insns=300" } */
> > > >
> > > > If we rely on unrolling of a loop can you put #pragma unroll [N]
> > > > before the respective loop
> > > > instead?
> > > >
> > > > >  #define NXX 516
> > > > >  #define NYY 516
> > > > > diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
> > > > > index bf017137260..5e0eca647a1 100644
> > > > > --- a/gcc/tree-ssa-loop-ivcanon.cc
> > > > > +++ b/gcc/tree-ssa-loop-ivcanon.cc
> > > > > @@ -444,7 +444,9 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
> > > > >
> > > > >  static unsigned HOST_WIDE_INT
> > > > >  estimated_unrolled_size (struct loop_size *size,
> > > > > -                        unsigned HOST_WIDE_INT nunroll)
> > > > > +                        unsigned HOST_WIDE_INT nunroll,
> > > > > +                        enum unroll_level ul,
> > > > > +                        class loop* loop)
> > > > >  {
> > > > >    HOST_WIDE_INT unr_insns = ((nunroll)
> > > > >                              * (HOST_WIDE_INT) (size->overall
> > > > > @@ -453,7 +455,15 @@ estimated_unrolled_size (struct loop_size *size,
> > > > >      unr_insns = 0;
> > > > >    unr_insns += size->last_iteration - size->last_iteration_eliminated_by_peeling;
> > > > >
> > > > > -  unr_insns = unr_insns * 2 / 3;
> > > > > +  /* For the innermost loop, the loop body is not likely to be simplified
> > > > > +     by as much as 1/3, and unrolling may increase register pressure a lot.
> > > > > +     UL != UL_ALL is needed to unroll small loops at O2.  */
> > > > > +  class loop *loop_father = loop_outer (loop);
> > > > > +  if (loop->inner || !loop_father
> > > >
> > > > Do we ever get here for !loop_father?  We shouldn't.
> > > >
> > > > > +      || loop_father->latch == EXIT_BLOCK_PTR_FOR_FN (cfun)
> > > >
> > > > This means you exempt all loops that are direct children of the loop
> > > > root tree.  That doesn't make much sense.
> > > >
> > > > > +      || ul != UL_ALL)
> > > >
> > > > This is also quite odd - we're being more optimistic for UL_NO_GROWTH
> > > > than for UL_ALL?  This doesn't make much sense.
> > > >
> > > > Overall I think this means removal of being optimistic doesn't work so well?
> > > They're mostly used to avoid testcase regressions.  The regressed
> > > testcases rely on the loop being completely unrolled by the first
> > > unroll pass, but now it's only unrolled by the second one.
> > > I checked some of them and the codegen is the same.  I need to go
> > > through all of them; if the final codegen is the same or optimal,
> > > shall I just adjust the testcases?
> > >
> > > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++14 LP64 note (test for warnings, line 56)
> > > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++14 note (test for warnings, line 66)
> > > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++17 LP64 note (test for warnings, line 56)
> > > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++17 note (test for warnings, line 66)
> > > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++20 LP64 note (test for warnings, line 56)
> > > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++20 note (test for warnings, line 66)
> > > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++98 LP64 note (test for warnings, line 56)
> > > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++98 note (test for warnings, line 66)
> >
> > This seems to expect unrolling for an init loop rolling 1 time.  I don't
> > see 1/3 of the stmts vanishing but it's definitely an interesting corner
> > case.  That's why I was thinking of maybe adding a --param specifying
> > an absolute growth we consider "no growth" - but of course that's
> > ugly as well but it would cover these small loops.
> >
> > How do the sizes play out here after your change?  Before it's
> >
> > size: 13-3, last_iteration: 2-2
> >   Loop size: 13
> >   Estimated size after unrolling: 13
> >
> > and the init is quite complex with virtual pointer inits.  We do have
> >
> >   size:   1 _14 = _5 + -1;
> >    Induction variable computation will be folded away.
> >   size:   1 _15 = _4 + 40;
> >  BB: 3, after_exit: 1
> >
> > where we don't realize the + 40 of _15 will be folded into the dereferences
> > but that would only subtract 1.
> >
> >   size:   3 C::C (_23, &MEM <const void *[8]> [(void *)&_ZTT2D1 + 48B]);
> >
> > that's the biggest cost.
> >
> > To diagnose the array bound issue we rely on early unrolling since we avoid
> > -Warray-bounds after late unrolling due to false positives.
> >
> > This is definitely not an unrolling that preserves code size.
> >
> > > gcc: gcc.dg/Warray-bounds-68.c  (test for warnings, line 18)
> > >
> > > gcc: gcc.dg/graphite/interchange-8.c execution test
> >
> > An execute fail is bad ... can we avoid this (but file a bugreport!) when
> > placing #pragma GCC unroll before the innermost loop?  We should
> > probably honor that in early unrolling (not sure if we do).
> >
> > > gcc: gcc.dg/tree-prof/update-cunroll-2.c scan-tree-dump-not optimized
> > > "Invalid sum"
> > >
> > > gcc: gcc.dg/tree-ssa/cunroll-1.c scan-tree-dump cunrolli "Last
> > > iteration exit edge was proved true."
> > >
> > > gcc: gcc.dg/tree-ssa/cunroll-1.c scan-tree-dump cunrolli "loop with 2
> > > iterations completely unrolled"
> >
> > again the current estimate is the same before/after unrolling, here
> > we expect to retain one compare & branch.
> >
> > > gcc: gcc.dg/tree-ssa/dump-6.c scan-tree-dump store-merging "MEM
> > > <unsigned long> \\[\\(char \\*\\)\\&a8] = "
> > >
> > > gcc: gcc.dg/tree-ssa/loop-36.c scan-tree-dump-not dce3 "c.array"
> >
> > again the 2/3 scaling is difficult to warrant.  The goal of the early unrolling
> > pass was abstraction penalty removal which works for low trip-count loops.
> > So maybe that new --param for allowed growth should scale but instead
> > of scaling by the loop size as 2/3 does it should scale by the number of
> > times we peel which means offsetting the body size estimate by a constant.
> >
> > Honza?  Any idea how to go forward here?
> >
> > Thanks,
> > Richard.
> >
> > > gcc: gcc.dg/tree-ssa/ssa-dom-cse-5.c scan-tree-dump-times dom2 "return 3;" 1
> > >
> > > gcc: gcc.dg/tree-ssa/update-cunroll.c scan-tree-dump-times optimized
> > > "Invalid sum" 0
> > >
> > > gcc: gcc.dg/tree-ssa/vrp88.c scan-tree-dump vrp1 "Folded into: if.*"
> > >
> > > gcc: gcc.dg/vect/no-vfa-vect-dv-2.c scan-tree-dump-times vect
> > > "vectorized 3 loops" 1
> > >
> > > >
> > > > If we need some extra leeway for UL_NO_GROWTH for what we expect
> > > > to unroll it might be better to add sth like --param
> > > > nogrowth-completely-peeled-insns
> It's hard to find a default value satisfying all testcases.
> Some require unrolling a loop with a 7-insn size increase, some don't
> want unrolling with even a 5-insn increase.
> The original 2/3 reduction happened to meet all those testcases (or the
> testcases were constructed based on the old 2/3).
> Can we define the parameter as a loop body size threshold, below which
> we still do the reduction, so small loops can still be unrolled?

Yeah, that's also a sensible possibility.  Does it work to have a parameter
for the unrolled body size?  Thus, amend the existing
--param max-completely-peeled-insns with a --param
max-completely-peeled-insns-nogrowth?
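
In params.opt that might look like (the name and Init value are just
placeholders):

-param=max-completely-peeled-insns-nogrowth=
Common Joined UInteger Var(param_max_completely_peeled_insns_nogrowth) Init(100) Param Optimization
The maximum number of insns of a completely peeled loop when no size growth is allowed.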

Thanks,
Richard.

> > > > specifying a fixed surplus size?  Or we need to look at what's the problem
> > > > with the testcases regressing or the one you are trying to fix.
> > > >
> > > > I did experiment with better estimating cleanup done at some point
> > > > (see attached),
> > > > but didn't get to finishing that (and as said, as we're running VN on the result
> > > > we'd ideally do that as part of the estimation somehow).
> > > >
> > > > Richard.
> > > >
> > > > > +    unr_insns = unr_insns * 2 / 3;
> > > > > +
> > > > >    if (unr_insns <= 0)
> > > > >      unr_insns = 1;
> > > > >
> > > > > @@ -837,7 +847,7 @@ try_unroll_loop_completely (class loop *loop,
> > > > >
> > > > >           unsigned HOST_WIDE_INT ninsns = size.overall;
> > > > >           unsigned HOST_WIDE_INT unr_insns
> > > > > -           = estimated_unrolled_size (&size, n_unroll);
> > > > > +           = estimated_unrolled_size (&size, n_unroll, ul, loop);
> > > > >           if (dump_file && (dump_flags & TDF_DETAILS))
> > > > >             {
> > > > >               fprintf (dump_file, "  Loop size: %d\n", (int) ninsns);
> > > > > --
> > > > > 2.31.1
> > > > >
> > >
> > >
> > >
> > > --
> > > BR,
> > > Hongtao
>
>
>
> --
> BR,
> Hongtao

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [V2 PATCH] Don't reduce estimated unrolled size for innermost loop at cunrolli.
  2024-05-21  6:14         ` Richard Biener
@ 2024-05-22  5:07           ` liuhongt
  2024-05-23  1:55             ` Hongtao Liu
  2024-05-23 11:59             ` Richard Biener
  0 siblings, 2 replies; 12+ messages in thread
From: liuhongt @ 2024-05-22  5:07 UTC (permalink / raw)
  To: gcc-patches; +Cc: richard.guenther

>> It's hard to find a default value satisfying all testcases.
>> Some require unrolling a loop with a 7-insn size increase, some don't
>> want unrolling with even a 5-insn increase.
>> The original 2/3 reduction happened to meet all those testcases (or the
>> testcases were constructed based on the old 2/3).
>> Can we define the parameter as a loop body size threshold, below which
>> we still do the reduction, so small loops can still be unrolled?

>Yeah, that's also a sensible possibility.  Does it work to have a parameter
>for the unrolled body size?  Thus, amend the existing
>--param max-completely-peeled-insns with a --param
>max-completely-peeled-insns-nogrowth?

Update V2:
It's still hard to find a default value for the loop body size. So I
moved the 2 / 3 reduction from estimated_unrolled_size to
try_unroll_loop_completely.
For the check of whether the body size shrinks, the 2 / 3 reduction is
kept, so small loops can still be unrolled.
For the comparison of the body size against
param_max_completely_peeled_insns, the 2 / 3 reduction is only applied
when loop->inner || !cunrolli.
This way the patch avoids gcc testsuite regressions and also prevents
big inner loops from being completely unrolled at cunrolli.

------------------

For the innermost loop, complete unrolling will most likely not reduce
the body size to 2/3. The current 2/3 reduction causes some of the
larger loops to be completely unrolled during cunrolli, which then
prevents them from being vectorized. It also increases register
pressure. The patch moves the 2/3 reduction out of
estimated_unrolled_size and applies it conditionally at cunrolli.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

	PR tree-optimization/112325
	* tree-ssa-loop-ivcanon.cc (estimated_unrolled_size): Move the
	2 / 3 loop body size reduction to ..
	(try_unroll_loop_completely): .. here, add it for the check of
	body size shrink, and the check of comparison against
	param_max_completely_peeled_insns when
	(!cunrolli ||loop->inner).
	(canonicalize_loop_induction_variables): Add new parameter
	cunrolli and pass down.
	(tree_unroll_loops_completely_1): Ditto.
	(tree_unroll_loops_completely): Ditto.
	(canonicalize_induction_variables): Handle new parameter.
	(pass_complete_unrolli::execute): Ditto.
	(pass_complete_unroll::execute): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.dg/tree-ssa/pr112325.c: New test.
	* gcc.dg/vect/pr69783.c: Add extra option --param
	max-completely-peeled-insns=300.
---
 gcc/testsuite/gcc.dg/tree-ssa/pr112325.c | 57 ++++++++++++++++++++++++
 gcc/testsuite/gcc.dg/vect/pr69783.c      |  2 +-
 gcc/tree-ssa-loop-ivcanon.cc             | 45 ++++++++++---------
 3 files changed, 83 insertions(+), 21 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr112325.c

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
new file mode 100644
index 00000000000..14208b3e7f8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
@@ -0,0 +1,57 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-cunrolli-details" } */
+
+typedef unsigned short ggml_fp16_t;
+static float table_f32_f16[1 << 16];
+
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+    unsigned short s;
+    __builtin_memcpy(&s, &f, sizeof(unsigned short));
+    return table_f32_f16[s];
+}
+
+typedef struct {
+    ggml_fp16_t d;
+    ggml_fp16_t m;
+    unsigned char qh[4];
+    unsigned char qs[32 / 2];
+} block_q5_1;
+
+typedef struct {
+    float d;
+    float s;
+    char qs[32];
+} block_q8_1;
+
+void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+    const int qk = 32;
+    const int nb = n / qk;
+
+    const block_q5_1 * restrict x = vx;
+    const block_q8_1 * restrict y = vy;
+
+    float sumf = 0.0;
+
+    for (int i = 0; i < nb; i++) {
+        unsigned qh;
+        __builtin_memcpy(&qh, x[i].qh, sizeof(qh));
+
+        int sumi = 0;
+
+        for (int j = 0; j < qk/2; ++j) {
+            const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
+            const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
+
+            const int x0 = (x[i].qs[j] & 0xF) | xh_0;
+            const int x1 = (x[i].qs[j] >> 4) | xh_1;
+
+            sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
+        }
+
+        sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
+    }
+
+    *s = sumf;
+}
+
+/* { dg-final { scan-tree-dump {(?n)Not unrolling loop [1-9] \(--param max-completely-peel-times limit reached} "cunrolli"} } */
diff --git a/gcc/testsuite/gcc.dg/vect/pr69783.c b/gcc/testsuite/gcc.dg/vect/pr69783.c
index 5df95d0ce4e..a1f75514d72 100644
--- a/gcc/testsuite/gcc.dg/vect/pr69783.c
+++ b/gcc/testsuite/gcc.dg/vect/pr69783.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-require-effective-target vect_float } */
-/* { dg-additional-options "-Ofast -funroll-loops" } */
+/* { dg-additional-options "-Ofast -funroll-loops --param max-completely-peeled-insns=300" } */
 
 #define NXX 516
 #define NYY 516
diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
index bf017137260..cc53eee1301 100644
--- a/gcc/tree-ssa-loop-ivcanon.cc
+++ b/gcc/tree-ssa-loop-ivcanon.cc
@@ -437,11 +437,7 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
    It is (NUNROLL + 1) * size of loop body with taking into account
    the fact that in last copy everything after exit conditional
    is dead and that some instructions will be eliminated after
-   peeling.
-
-   Loop body is likely going to simplify further, this is difficult
-   to guess, we just decrease the result by 1/3.  */
-
+   peeling.  */
 static unsigned HOST_WIDE_INT
 estimated_unrolled_size (struct loop_size *size,
 			 unsigned HOST_WIDE_INT nunroll)
@@ -453,7 +449,6 @@ estimated_unrolled_size (struct loop_size *size,
     unr_insns = 0;
   unr_insns += size->last_iteration - size->last_iteration_eliminated_by_peeling;
 
-  unr_insns = unr_insns * 2 / 3;
   if (unr_insns <= 0)
     unr_insns = 1;
 
@@ -734,7 +729,8 @@ try_unroll_loop_completely (class loop *loop,
 			    edge exit, tree niter, bool may_be_zero,
 			    enum unroll_level ul,
 			    HOST_WIDE_INT maxiter,
-			    dump_user_location_t locus, bool allow_peel)
+			    dump_user_location_t locus, bool allow_peel,
+			    bool cunrolli)
 {
   unsigned HOST_WIDE_INT n_unroll = 0;
   bool n_unroll_found = false;
@@ -847,8 +843,9 @@ try_unroll_loop_completely (class loop *loop,
 
 	  /* If the code is going to shrink, we don't need to be extra
 	     cautious on guessing if the unrolling is going to be
-	     profitable.  */
-	  if (unr_insns
+	     profitable.
+	     Move the 2 / 3 reduction here from estimated_unrolled_size so small loops keep being unrolled.  */
+	  if (unr_insns * 2 / 3
 	      /* If there is IV variable that will become constant, we
 		 save one instruction in the loop prologue we do not
 		 account otherwise.  */
@@ -919,7 +916,13 @@ try_unroll_loop_completely (class loop *loop,
 			 loop->num);
 	      return false;
 	    }
-	  else if (unr_insns
+	  /* Move the 2 / 3 reduction from estimated_unrolled_size, but don't
+	     reduce the unrolled size for the innermost loop at cunrolli.
+	     1) It could increase register pressure.
+	     2) A big loop after complete unrolling may not be vectorized
+	     by the BB vectorizer.  */
+	  else if ((cunrolli && !loop->inner
+		    ? unr_insns : unr_insns * 2 / 3)
 		   > (unsigned) param_max_completely_peeled_insns)
 	    {
 	      if (dump_file && (dump_flags & TDF_DETAILS))
@@ -1227,7 +1230,7 @@ try_peel_loop (class loop *loop,
 static bool
 canonicalize_loop_induction_variables (class loop *loop,
 				       bool create_iv, enum unroll_level ul,
-				       bool try_eval, bool allow_peel)
+				       bool try_eval, bool allow_peel, bool cunrolli)
 {
   edge exit = NULL;
   tree niter;
@@ -1314,7 +1317,7 @@ canonicalize_loop_induction_variables (class loop *loop,
 
   dump_user_location_t locus = find_loop_location (loop);
   if (try_unroll_loop_completely (loop, exit, niter, may_be_zero, ul,
-				  maxiter, locus, allow_peel))
+				  maxiter, locus, allow_peel, cunrolli))
     return true;
 
   if (create_iv
@@ -1358,7 +1361,7 @@ canonicalize_induction_variables (void)
     {
       changed |= canonicalize_loop_induction_variables (loop,
 							true, UL_SINGLE_ITER,
-							true, false);
+							true, false, false);
     }
   gcc_assert (!need_ssa_update_p (cfun));
 
@@ -1392,7 +1395,7 @@ canonicalize_induction_variables (void)
 
 static bool
 tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
-				bitmap father_bbs, class loop *loop)
+				bitmap father_bbs, class loop *loop, bool cunrolli)
 {
   class loop *loop_father;
   bool changed = false;
@@ -1410,7 +1413,7 @@ tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
 	if (!child_father_bbs)
 	  child_father_bbs = BITMAP_ALLOC (NULL);
 	if (tree_unroll_loops_completely_1 (may_increase_size, unroll_outer,
-					    child_father_bbs, inner))
+					    child_father_bbs, inner, cunrolli))
 	  {
 	    bitmap_ior_into (father_bbs, child_father_bbs);
 	    bitmap_clear (child_father_bbs);
@@ -1456,7 +1459,7 @@ tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
     ul = UL_NO_GROWTH;
 
   if (canonicalize_loop_induction_variables
-        (loop, false, ul, !flag_tree_loop_ivcanon, unroll_outer))
+      (loop, false, ul, !flag_tree_loop_ivcanon, unroll_outer, cunrolli))
     {
       /* If we'll continue unrolling, we need to propagate constants
 	 within the new basic blocks to fold away induction variable
@@ -1485,7 +1488,8 @@ tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
    size of the code does not increase.  */
 
 static unsigned int
-tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer)
+tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer,
+			      bool cunrolli)
 {
   bitmap father_bbs = BITMAP_ALLOC (NULL);
   bool changed;
@@ -1507,7 +1511,8 @@ tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer)
 
       changed = tree_unroll_loops_completely_1 (may_increase_size,
 						unroll_outer, father_bbs,
-						current_loops->tree_root);
+						current_loops->tree_root,
+						cunrolli);
       if (changed)
 	{
 	  unsigned i;
@@ -1671,7 +1676,7 @@ pass_complete_unroll::execute (function *fun)
   if (flag_peel_loops)
     peeled_loops = BITMAP_ALLOC (NULL);
   unsigned int val = tree_unroll_loops_completely (flag_cunroll_grow_size, 
-						   true);
+						   true, false);
   if (peeled_loops)
     {
       BITMAP_FREE (peeled_loops);
@@ -1727,7 +1732,7 @@ pass_complete_unrolli::execute (function *fun)
   if (number_of_loops (fun) > 1)
     {
       scev_initialize ();
-      ret = tree_unroll_loops_completely (optimize >= 3, false);
+      ret = tree_unroll_loops_completely (optimize >= 3, false, true);
       scev_finalize ();
     }
   loop_optimizer_finalize ();
-- 
2.31.1


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [V2 PATCH] Don't reduce estimated unrolled size for innermost loop at cunrolli.
  2024-05-22  5:07           ` [V2 PATCH] Don't reduce estimated unrolled size for innermost loop at cunrolli liuhongt
@ 2024-05-23  1:55             ` Hongtao Liu
  2024-05-23 11:59             ` Richard Biener
  1 sibling, 0 replies; 12+ messages in thread
From: Hongtao Liu @ 2024-05-23  1:55 UTC (permalink / raw)
  To: liuhongt; +Cc: gcc-patches, richard.guenther

On Wed, May 22, 2024 at 1:07 PM liuhongt <hongtao.liu@intel.com> wrote:
>
> >> It's hard to find a default value satisfying all testcases.
> >> Some require unrolling a loop with a 7-insn size increase, some don't
> >> want unrolling with even a 5-insn increase.
> >> The original 2/3 reduction happened to meet all those testcases (or the
> >> testcases were constructed based on the old 2/3).
> >> Can we define the parameter as a loop body size threshold, below which
> >> we still do the reduction, so small loops can still be unrolled?
>
> >Yeah, that's also a sensible possibility.  Does it work to have a parameter
> >for the unrolled body size?  Thus, amend the existing
> >--param max-completely-peeled-insns with a --param
> >max-completely-peeled-insns-nogrowth?
>
> Update V2:
> It's still hard to find a default value for the loop body size. So I
> moved the 2 / 3 reduction from estimated_unrolled_size to
> try_unroll_loop_completely.
> For the check of whether the body size shrinks, the 2 / 3 reduction is
> kept, so small loops can still be unrolled.
> For the comparison of the body size against
> param_max_completely_peeled_insns, the 2 / 3 reduction is only applied
> when loop->inner || !cunrolli.
> This way the patch avoids gcc testsuite regressions and also prevents
> big inner loops from being completely unrolled at cunrolli.
The patch regressed arm-*-eabi with 3 regressions:

regressions.sum:
				=== gcc tests ===
Running gcc:gcc.dg/tree-ssa/tree-ssa.exp ...
FAIL: gcc.dg/tree-ssa/pr83403-1.c scan-tree-dump-times lim2 "Executing store motion of" 10
FAIL: gcc.dg/tree-ssa/pr83403-2.c scan-tree-dump-times lim2 "Executing store motion of" 10

				=== gfortran tests ===
Running gfortran:gfortran.dg/dg.exp ...
FAIL: gfortran.dg/reassoc_4.f -O   scan-tree-dump-times reassoc1 "[0-9] \\* " 22

For 32-bit arm, estimate_num_insns_seq returns a larger size for
loads/stores of double.

The loop in pr83403-1.c:
Estimating sizes for loop 4
 BB: 6, after_exit: 0
  size:   2 if (m_23 != 10)
   Exit condition will be eliminated in peeled copies.
   Exit condition will be eliminated in last copy.
   Constant conditional.
 BB: 5, after_exit: 1
  size:   1 _5 = n_24 * 10;
  size:   1 _6 = _5 + m_23;
  size:   1 _7 = _6 * 8;
  size:   1 _8 = C_35 + _7;
  size:   2 _9 = *_8;
  size:   1 _10 = k_25 * 20;
  size:   1 _11 = _10 + m_23;
  size:   1 _12 = _11 * 8;
  size:   1 _13 = A_31 + _12;
  size:   2 _14 = *_13;
  size:   1 _15 = n_24 * 20;
  size:   1 _16 = _15 + k_25;
  size:   1 _17 = _16 * 8;
  size:   1 _18 = B_33 + _17;
  size:   2 _19 = *_18;
  size:   1 _20 = _14 * _19;
  size:   1 _21 = _9 + _20;
  size:   2 *_8 = _21;
  size:   1 m_40 = m_23 + 1;
   Induction variable computation will be folded away.
size: 25-3, last_iteration: 2-2
  Loop size: 25
  Estimated size after unrolling: 220

For aarch64 and x86, it's OK:

Estimating sizes for loop 4
 BB: 6, after_exit: 0
  size:   2 if (m_27 != 10)
   Exit condition will be eliminated in peeled copies.
   Exit condition will be eliminated in last copy.
   Constant conditional.
 BB: 5, after_exit: 1
  size:   1 _6 = n_28 * 10;
  size:   1 _7 = _6 + m_27;
  size:   0 _8 = (long unsigned int) _7;
  size:   1 _9 = _8 * 8;
  size:   1 _10 = C_39 + _9;
  size:   1 _11 = *_10;
  size:   1 _12 = k_29 * 20;
  size:   1 _13 = _12 + m_27;
  size:   0 _14 = (long unsigned int) _13;
  size:   1 _15 = _14 * 8;
  size:   1 _16 = A_35 + _15;
  size:   1 _17 = *_16;
  size:   1 _18 = n_28 * 20;
  size:   1 _19 = _18 + k_29;
  size:   0 _20 = (long unsigned int) _19;
  size:   1 _21 = _20 * 8;
  size:   1 _22 = B_37 + _21;
  size:   1 _23 = *_22;
  size:   1 _24 = _17 * _23;
  size:   1 _25 = _11 + _24;
  size:   1 *_10 = _25;
  size:   1 m_44 = m_27 + 1;
   Induction variable computation will be folded away.
size: 21-3, last_iteration: 2-2
  Loop size: 21
  Estimated size after unrolling: 180
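
That is, with 10 iterations the estimate is 10 * (25 - 3) = 220 insns on
arm vs. 10 * (21 - 3) = 180 on aarch64/x86, so against the default
--param max-completely-peeled-insns=200 the inner loop now stops being
completely unrolled at cunrolli only on arm, hence the diverging
testsuite results.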

>
> ------------------
>
> For the innermost loop, complete unrolling will most likely not reduce
> the body size to 2/3. The current 2/3 reduction causes some of the
> larger loops to be completely unrolled during cunrolli, which then
> prevents them from being vectorized. It also increases register
> pressure. The patch moves the 2/3 reduction out of
> estimated_unrolled_size and applies it conditionally at cunrolli.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
>
> gcc/ChangeLog:
>
>         PR tree-optimization/112325
>         * tree-ssa-loop-ivcanon.cc (estimated_unrolled_size): Move the
>         2 / 3 loop body size reduction to ..
>         (try_unroll_loop_completely): .. here, add it for the check of
>         body size shrink, and the check of comparison against
>         param_max_completely_peeled_insns when
>         (!cunrolli ||loop->inner).
>         (canonicalize_loop_induction_variables): Add new parameter
>         cunrolli and pass down.
>         (tree_unroll_loops_completely_1): Ditto.
>         (tree_unroll_loops_completely): Ditto.
>         (canonicalize_induction_variables): Handle new parameter.
>         (pass_complete_unrolli::execute): Ditto.
>         (pass_complete_unroll::execute): Ditto.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.dg/tree-ssa/pr112325.c: New test.
>         * gcc.dg/vect/pr69783.c: Add extra option --param
>         max-completely-peeled-insns=300.
> ---
>  gcc/testsuite/gcc.dg/tree-ssa/pr112325.c | 57 ++++++++++++++++++++++++
>  gcc/testsuite/gcc.dg/vect/pr69783.c      |  2 +-
>  gcc/tree-ssa-loop-ivcanon.cc             | 45 ++++++++++---------
>  3 files changed, 83 insertions(+), 21 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> new file mode 100644
> index 00000000000..14208b3e7f8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> @@ -0,0 +1,57 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-cunrolli-details" } */
> +
> +typedef unsigned short ggml_fp16_t;
> +static float table_f32_f16[1 << 16];
> +
> +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
> +    unsigned short s;
> +    __builtin_memcpy(&s, &f, sizeof(unsigned short));
> +    return table_f32_f16[s];
> +}
> +
> +typedef struct {
> +    ggml_fp16_t d;
> +    ggml_fp16_t m;
> +    unsigned char qh[4];
> +    unsigned char qs[32 / 2];
> +} block_q5_1;
> +
> +typedef struct {
> +    float d;
> +    float s;
> +    char qs[32];
> +} block_q8_1;
> +
> +void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
> +    const int qk = 32;
> +    const int nb = n / qk;
> +
> +    const block_q5_1 * restrict x = vx;
> +    const block_q8_1 * restrict y = vy;
> +
> +    float sumf = 0.0;
> +
> +    for (int i = 0; i < nb; i++) {
> +        unsigned qh;
> +        __builtin_memcpy(&qh, x[i].qh, sizeof(qh));
> +
> +        int sumi = 0;
> +
> +        for (int j = 0; j < qk/2; ++j) {
> +            const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
> +            const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
> +
> +            const int x0 = (x[i].qs[j] & 0xF) | xh_0;
> +            const int x1 = (x[i].qs[j] >> 4) | xh_1;
> +
> +            sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
> +        }
> +
> +        sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
> +    }
> +
> +    *s = sumf;
> +}
> +
> +/* { dg-final { scan-tree-dump {(?n)Not unrolling loop [1-9] \(--param max-completely-peel-times limit reached} "cunrolli"} } */
> diff --git a/gcc/testsuite/gcc.dg/vect/pr69783.c b/gcc/testsuite/gcc.dg/vect/pr69783.c
> index 5df95d0ce4e..a1f75514d72 100644
> --- a/gcc/testsuite/gcc.dg/vect/pr69783.c
> +++ b/gcc/testsuite/gcc.dg/vect/pr69783.c
> @@ -1,6 +1,6 @@
>  /* { dg-do compile } */
>  /* { dg-require-effective-target vect_float } */
> -/* { dg-additional-options "-Ofast -funroll-loops" } */
> +/* { dg-additional-options "-Ofast -funroll-loops --param max-completely-peeled-insns=300" } */
>
>  #define NXX 516
>  #define NYY 516
> diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
> index bf017137260..cc53eee1301 100644
> --- a/gcc/tree-ssa-loop-ivcanon.cc
> +++ b/gcc/tree-ssa-loop-ivcanon.cc
> @@ -437,11 +437,7 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
>     It is (NUNROLL + 1) * size of loop body with taking into account
>     the fact that in last copy everything after exit conditional
>     is dead and that some instructions will be eliminated after
> -   peeling.
> -
> -   Loop body is likely going to simplify further, this is difficult
> -   to guess, we just decrease the result by 1/3.  */
> -
> +   peeling.  */
>  static unsigned HOST_WIDE_INT
>  estimated_unrolled_size (struct loop_size *size,
>                          unsigned HOST_WIDE_INT nunroll)
> @@ -453,7 +449,6 @@ estimated_unrolled_size (struct loop_size *size,
>      unr_insns = 0;
>    unr_insns += size->last_iteration - size->last_iteration_eliminated_by_peeling;
>
> -  unr_insns = unr_insns * 2 / 3;
>    if (unr_insns <= 0)
>      unr_insns = 1;
>
> @@ -734,7 +729,8 @@ try_unroll_loop_completely (class loop *loop,
>                             edge exit, tree niter, bool may_be_zero,
>                             enum unroll_level ul,
>                             HOST_WIDE_INT maxiter,
> -                           dump_user_location_t locus, bool allow_peel)
> +                           dump_user_location_t locus, bool allow_peel,
> +                           bool cunrolli)
>  {
>    unsigned HOST_WIDE_INT n_unroll = 0;
>    bool n_unroll_found = false;
> @@ -847,8 +843,9 @@ try_unroll_loop_completely (class loop *loop,
>
>           /* If the code is going to shrink, we don't need to be extra
>              cautious on guessing if the unrolling is going to be
> -            profitable.  */
> -         if (unr_insns
> +            profitable.
> +            Move the 2 / 3 reduction here from estimated_unrolled_size so small loops keep being unrolled.  */
> +         if (unr_insns * 2 / 3
>               /* If there is IV variable that will become constant, we
>                  save one instruction in the loop prologue we do not
>                  account otherwise.  */
> @@ -919,7 +916,13 @@ try_unroll_loop_completely (class loop *loop,
>                          loop->num);
>               return false;
>             }
> -         else if (unr_insns
> +         /* Move the 2 / 3 reduction from estimated_unrolled_size, but don't
> +            reduce the unrolled size for the innermost loop at cunrolli.
> +            1) It could increase register pressure.
> +            2) A big loop after complete unrolling may not be vectorized
> +            by the BB vectorizer.  */
> +         else if ((cunrolli && !loop->inner
> +                   ? unr_insns : unr_insns * 2 / 3)
>                    > (unsigned) param_max_completely_peeled_insns)
>             {
>               if (dump_file && (dump_flags & TDF_DETAILS))
> @@ -1227,7 +1230,7 @@ try_peel_loop (class loop *loop,
>  static bool
>  canonicalize_loop_induction_variables (class loop *loop,
>                                        bool create_iv, enum unroll_level ul,
> -                                      bool try_eval, bool allow_peel)
> +                                      bool try_eval, bool allow_peel, bool cunrolli)
>  {
>    edge exit = NULL;
>    tree niter;
> @@ -1314,7 +1317,7 @@ canonicalize_loop_induction_variables (class loop *loop,
>
>    dump_user_location_t locus = find_loop_location (loop);
>    if (try_unroll_loop_completely (loop, exit, niter, may_be_zero, ul,
> -                                 maxiter, locus, allow_peel))
> +                                 maxiter, locus, allow_peel, cunrolli))
>      return true;
>
>    if (create_iv
> @@ -1358,7 +1361,7 @@ canonicalize_induction_variables (void)
>      {
>        changed |= canonicalize_loop_induction_variables (loop,
>                                                         true, UL_SINGLE_ITER,
> -                                                       true, false);
> +                                                       true, false, false);
>      }
>    gcc_assert (!need_ssa_update_p (cfun));
>
> @@ -1392,7 +1395,7 @@ canonicalize_induction_variables (void)
>
>  static bool
>  tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
> -                               bitmap father_bbs, class loop *loop)
> +                               bitmap father_bbs, class loop *loop, bool cunrolli)
>  {
>    class loop *loop_father;
>    bool changed = false;
> @@ -1410,7 +1413,7 @@ tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
>         if (!child_father_bbs)
>           child_father_bbs = BITMAP_ALLOC (NULL);
>         if (tree_unroll_loops_completely_1 (may_increase_size, unroll_outer,
> -                                           child_father_bbs, inner))
> +                                           child_father_bbs, inner, cunrolli))
>           {
>             bitmap_ior_into (father_bbs, child_father_bbs);
>             bitmap_clear (child_father_bbs);
> @@ -1456,7 +1459,7 @@ tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
>      ul = UL_NO_GROWTH;
>
>    if (canonicalize_loop_induction_variables
> -        (loop, false, ul, !flag_tree_loop_ivcanon, unroll_outer))
> +      (loop, false, ul, !flag_tree_loop_ivcanon, unroll_outer, cunrolli))
>      {
>        /* If we'll continue unrolling, we need to propagate constants
>          within the new basic blocks to fold away induction variable
> @@ -1485,7 +1488,8 @@ tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
>     size of the code does not increase.  */
>
>  static unsigned int
> -tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer)
> +tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer,
> +                             bool cunrolli)
>  {
>    bitmap father_bbs = BITMAP_ALLOC (NULL);
>    bool changed;
> @@ -1507,7 +1511,8 @@ tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer)
>
>        changed = tree_unroll_loops_completely_1 (may_increase_size,
>                                                 unroll_outer, father_bbs,
> -                                               current_loops->tree_root);
> +                                               current_loops->tree_root,
> +                                               cunrolli);
>        if (changed)
>         {
>           unsigned i;
> @@ -1671,7 +1676,7 @@ pass_complete_unroll::execute (function *fun)
>    if (flag_peel_loops)
>      peeled_loops = BITMAP_ALLOC (NULL);
>    unsigned int val = tree_unroll_loops_completely (flag_cunroll_grow_size,
> -                                                  true);
> +                                                  true, false);
>    if (peeled_loops)
>      {
>        BITMAP_FREE (peeled_loops);
> @@ -1727,7 +1732,7 @@ pass_complete_unrolli::execute (function *fun)
>    if (number_of_loops (fun) > 1)
>      {
>        scev_initialize ();
> -      ret = tree_unroll_loops_completely (optimize >= 3, false);
> +      ret = tree_unroll_loops_completely (optimize >= 3, false, true);
>        scev_finalize ();
>      }
>    loop_optimizer_finalize ();
> --
> 2.31.1
>


-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [V2 PATCH] Don't reduce estimated unrolled size for innermost loop at cunrolli.
  2024-05-22  5:07           ` [V2 PATCH] Don't reduce estimated unrolled size for innermost loop at cunrolli liuhongt
  2024-05-23  1:55             ` Hongtao Liu
@ 2024-05-23 11:59             ` Richard Biener
  2024-05-24  7:29               ` [V3 PATCH] Don't reduce estimated unrolled size for innermost loop liuhongt
  1 sibling, 1 reply; 12+ messages in thread
From: Richard Biener @ 2024-05-23 11:59 UTC (permalink / raw)
  To: liuhongt; +Cc: gcc-patches

On Wed, May 22, 2024 at 7:07 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> >> It's hard to find a default value satisfying all testcases.
> >> Some require unrolling a loop with a 7-insn size increase, some don't
> >> want unrolling with even a 5-insn increase.
> >> The original 2/3 reduction happened to meet all those testcases (or the
> >> testcases were constructed based on the old 2/3).
> >> Can we define the parameter as a loop body size threshold, below which
> >> we still do the reduction, so small loops can still be unrolled?
>
> >Yeah, that's also a sensible possibility.  Does it work to have a parameter
> >for the unrolled body size?  Thus, amend the existing
> >--param max-completely-peeled-insns with a --param
> >max-completely-peeled-insns-nogrowth?
>
> Update V2:
> It's still hard to find a default value for the loop body size. So I
> moved the 2 / 3 reduction from estimated_unrolled_size to
> try_unroll_loop_completely.
> For the check of whether the body size shrinks, the 2 / 3 reduction is
> kept, so small loops can still be unrolled.
> For the comparison of the body size against
> param_max_completely_peeled_insns, the 2 / 3 reduction is only applied
> when loop->inner || !cunrolli.
> This way the patch avoids gcc testsuite regressions and also prevents
> big inner loops from being completely unrolled at cunrolli.
>
> ------------------
>
> For the innermost loop, complete unrolling will most likely not reduce
> the body size to 2/3. The current 2/3 reduction causes some of the
> larger loops to be completely unrolled during cunrolli, which then
> prevents them from being vectorized. It also increases register
> pressure. The patch moves the 2/3 reduction out of
> estimated_unrolled_size and applies it conditionally at cunrolli.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
>
> gcc/ChangeLog:
>
>         PR tree-optimization/112325
>         * tree-ssa-loop-ivcanon.cc (estimated_unrolled_size): Move the
>         2 / 3 loop body size reduction to ..
>         (try_unroll_loop_completely): .. here, add it for the check of
>         body size shrink, and the check of comparison against
>         param_max_completely_peeled_insns when
>         (!cunrolli ||loop->inner).
>         (canonicalize_loop_induction_variables): Add new parameter
>         cunrolli and pass down.
>         (tree_unroll_loops_completely_1): Ditto.
>         (tree_unroll_loops_completely): Ditto.
>         (canonicalize_induction_variables): Handle new parameter.
>         (pass_complete_unrolli::execute): Ditto.
>         (pass_complete_unroll::execute): Ditto.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.dg/tree-ssa/pr112325.c: New test.
>         * gcc.dg/vect/pr69783.c: Add extra option --param
>         max-completely-peeled-insns=300.
> ---
>  gcc/testsuite/gcc.dg/tree-ssa/pr112325.c | 57 ++++++++++++++++++++++++
>  gcc/testsuite/gcc.dg/vect/pr69783.c      |  2 +-
>  gcc/tree-ssa-loop-ivcanon.cc             | 45 ++++++++++---------
>  3 files changed, 83 insertions(+), 21 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> new file mode 100644
> index 00000000000..14208b3e7f8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> @@ -0,0 +1,57 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-cunrolli-details" } */
> +
> +typedef unsigned short ggml_fp16_t;
> +static float table_f32_f16[1 << 16];
> +
> +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
> +    unsigned short s;
> +    __builtin_memcpy(&s, &f, sizeof(unsigned short));
> +    return table_f32_f16[s];
> +}
> +
> +typedef struct {
> +    ggml_fp16_t d;
> +    ggml_fp16_t m;
> +    unsigned char qh[4];
> +    unsigned char qs[32 / 2];
> +} block_q5_1;
> +
> +typedef struct {
> +    float d;
> +    float s;
> +    char qs[32];
> +} block_q8_1;
> +
> +void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
> +    const int qk = 32;
> +    const int nb = n / qk;
> +
> +    const block_q5_1 * restrict x = vx;
> +    const block_q8_1 * restrict y = vy;
> +
> +    float sumf = 0.0;
> +
> +    for (int i = 0; i < nb; i++) {
> +        unsigned qh;
> +        __builtin_memcpy(&qh, x[i].qh, sizeof(qh));
> +
> +        int sumi = 0;
> +
> +        for (int j = 0; j < qk/2; ++j) {
> +            const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
> +            const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
> +
> +            const int x0 = (x[i].qs[j] & 0xF) | xh_0;
> +            const int x1 = (x[i].qs[j] >> 4) | xh_1;
> +
> +            sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
> +        }
> +
> +        sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
> +    }
> +
> +    *s = sumf;
> +}
> +
> +/* { dg-final { scan-tree-dump {(?n)Not unrolling loop [1-9] \(--param max-completely-peel-times limit reached} "cunrolli"} } */

Since this was about vectorization, can you instead add a testcase to
gcc.dg/vect/ and check that vectorization happens?
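
A minimal sketch of the directives such a test could use (the scan
string assumes the loop gets vectorized exactly once; the kernel itself
would be the one from the PR):

/* { dg-do compile } */
/* { dg-require-effective-target vect_int } */
/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */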

> diff --git a/gcc/testsuite/gcc.dg/vect/pr69783.c b/gcc/testsuite/gcc.dg/vect/pr69783.c
> index 5df95d0ce4e..a1f75514d72 100644
> --- a/gcc/testsuite/gcc.dg/vect/pr69783.c
> +++ b/gcc/testsuite/gcc.dg/vect/pr69783.c
> @@ -1,6 +1,6 @@
>  /* { dg-do compile } */
>  /* { dg-require-effective-target vect_float } */
> -/* { dg-additional-options "-Ofast -funroll-loops" } */
> +/* { dg-additional-options "-Ofast -funroll-loops --param max-completely-peeled-insns=300" } */

It _looks_ like this was maybe also vectorizer related?  Can you
double-check the PR?
We don't seem to check whether we vectorize; does this change with the
default --param max-completely-peeled-insns?

I'd rather have a #pragma GCC unroll before the loop we need unrolled
than an adjusted --param max-completely-peeled-insns.  But if we just
trade one vectorized loop for another, I'm not so sure about the patch.
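
E.g. (a standalone illustration of the pragma, not the pr69783.c loop
itself, and the unroll factor is purely made up):

float
sum4 (float *a)
{
  float sum = 0.f;
  /* Request complete unrolling of the following loop.  */
  #pragma GCC unroll 4
  for (int j = 0; j < 4; j++)
    sum += a[j];
  return sum;
}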

>  #define NXX 516
>  #define NYY 516
> diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
> index bf017137260..cc53eee1301 100644
> --- a/gcc/tree-ssa-loop-ivcanon.cc
> +++ b/gcc/tree-ssa-loop-ivcanon.cc
> @@ -437,11 +437,7 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
>     It is (NUNROLL + 1) * size of loop body with taking into account
>     the fact that in last copy everything after exit conditional
>     is dead and that some instructions will be eliminated after
> -   peeling.
> -
> -   Loop body is likely going to simplify further, this is difficult
> -   to guess, we just decrease the result by 1/3.  */
> -
> +   peeling.  */
>  static unsigned HOST_WIDE_INT
>  estimated_unrolled_size (struct loop_size *size,
>                          unsigned HOST_WIDE_INT nunroll)
> @@ -453,7 +449,6 @@ estimated_unrolled_size (struct loop_size *size,
>      unr_insns = 0;
>    unr_insns += size->last_iteration - size->last_iteration_eliminated_by_peeling;
>
> -  unr_insns = unr_insns * 2 / 3;
>    if (unr_insns <= 0)
>      unr_insns = 1;

I believe the if (unr_insns <= 0) check can go as well.
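
I.e. estimated_unrolled_size would then reduce to something like this
(untested sketch, field names as in struct loop_size):

static unsigned HOST_WIDE_INT
estimated_unrolled_size (struct loop_size *size,
                         unsigned HOST_WIDE_INT nunroll)
{
  HOST_WIDE_INT unr_insns
    = nunroll * (HOST_WIDE_INT) (size->overall
                                 - size->eliminated_by_peeling);
  if (unr_insns <= 0)
    unr_insns = 0;
  unr_insns += size->last_iteration
               - size->last_iteration_eliminated_by_peeling;
  return unr_insns;
}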

> @@ -734,7 +729,8 @@ try_unroll_loop_completely (class loop *loop,
>                             edge exit, tree niter, bool may_be_zero,
>                             enum unroll_level ul,
>                             HOST_WIDE_INT maxiter,
> -                           dump_user_location_t locus, bool allow_peel)
> +                           dump_user_location_t locus, bool allow_peel,
> +                           bool cunrolli)
>  {
>    unsigned HOST_WIDE_INT n_unroll = 0;
>    bool n_unroll_found = false;
> @@ -847,8 +843,9 @@ try_unroll_loop_completely (class loop *loop,
>
>           /* If the code is going to shrink, we don't need to be extra
>              cautious on guessing if the unrolling is going to be
> -            profitable.  */
> -         if (unr_insns
> +            profitable.
> +            Move from estimated_unrolled_size to unroll small loops.  */
> +         if (unr_insns * 2 / 3
>               /* If there is IV variable that will become constant, we
>                  save one instruction in the loop prologue we do not
>                  account otherwise.  */
> @@ -919,7 +916,13 @@ try_unroll_loop_completely (class loop *loop,
>                          loop->num);
>               return false;
>             }
> -         else if (unr_insns
> +         /* Move 2 / 3 reduction from estimated_unrolled_size, but don't reduce
> +            unrolled size for innermost loop when cunrolli.
> +            1) It could increase register pressure.
> +            2) Big loop after completely unroll may not be vectorized
> +            by BB vectorizer.  */
> +         else if ((cunrolli && !loop->inner
> +                   ? unr_insns : unr_insns * 2 / 3)
>                    > (unsigned) param_max_completely_peeled_insns)
>             {
>               if (dump_file && (dump_flags & TDF_DETAILS))
> @@ -1227,7 +1230,7 @@ try_peel_loop (class loop *loop,
>  static bool
>  canonicalize_loop_induction_variables (class loop *loop,
>                                        bool create_iv, enum unroll_level ul,
> -                                      bool try_eval, bool allow_peel)
> +                                      bool try_eval, bool allow_peel, bool cunrolli)
>  {
>    edge exit = NULL;
>    tree niter;
> @@ -1314,7 +1317,7 @@ canonicalize_loop_induction_variables (class loop *loop,
>
>    dump_user_location_t locus = find_loop_location (loop);
>    if (try_unroll_loop_completely (loop, exit, niter, may_be_zero, ul,
> -                                 maxiter, locus, allow_peel))
> +                                 maxiter, locus, allow_peel, cunrolli))
>      return true;
>
>    if (create_iv
> @@ -1358,7 +1361,7 @@ canonicalize_induction_variables (void)
>      {
>        changed |= canonicalize_loop_induction_variables (loop,
>                                                         true, UL_SINGLE_ITER,
> -                                                       true, false);
> +                                                       true, false, false);
>      }
>    gcc_assert (!need_ssa_update_p (cfun));
>
> @@ -1392,7 +1395,7 @@ canonicalize_induction_variables (void)
>
>  static bool
>  tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
> -                               bitmap father_bbs, class loop *loop)
> +                               bitmap father_bbs, class loop *loop, bool cunrolli)
>  {
>    class loop *loop_father;
>    bool changed = false;
> @@ -1410,7 +1413,7 @@ tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
>         if (!child_father_bbs)
>           child_father_bbs = BITMAP_ALLOC (NULL);
>         if (tree_unroll_loops_completely_1 (may_increase_size, unroll_outer,
> -                                           child_father_bbs, inner))
> +                                           child_father_bbs, inner, cunrolli))
>           {
>             bitmap_ior_into (father_bbs, child_father_bbs);
>             bitmap_clear (child_father_bbs);
> @@ -1456,7 +1459,7 @@ tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
>      ul = UL_NO_GROWTH;
>
>    if (canonicalize_loop_induction_variables
> -        (loop, false, ul, !flag_tree_loop_ivcanon, unroll_outer))
> +      (loop, false, ul, !flag_tree_loop_ivcanon, unroll_outer, cunrolli))
>      {
>        /* If we'll continue unrolling, we need to propagate constants
>          within the new basic blocks to fold away induction variable
> @@ -1485,7 +1488,8 @@ tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
>     size of the code does not increase.  */
>
>  static unsigned int
> -tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer)
> +tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer,
> +                             bool cunrolli)
>  {
>    bitmap father_bbs = BITMAP_ALLOC (NULL);
>    bool changed;
> @@ -1507,7 +1511,8 @@ tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer)
>
>        changed = tree_unroll_loops_completely_1 (may_increase_size,
>                                                 unroll_outer, father_bbs,
> -                                               current_loops->tree_root);
> +                                               current_loops->tree_root,
> +                                               cunrolli);

as said, you want to do

          cunrolli = false;

after the above, since we are iterating: for a subsequent unrolling of an
outer loop of an unrolled inner loop we _do_ want to apply the 2/3
reduction, since there are likely inter-loop redundancies exposed (as
happens in SPEC calculix, for example).

Not sure if that changes any of the testsuite outcome - it possibly avoids the
gcc.dg/vect/pr69783.c FAIL?

Not sure about the arm fallout.

Richard.

>        if (changed)
>         {
>           unsigned i;
> @@ -1671,7 +1676,7 @@ pass_complete_unroll::execute (function *fun)
>    if (flag_peel_loops)
>      peeled_loops = BITMAP_ALLOC (NULL);
>    unsigned int val = tree_unroll_loops_completely (flag_cunroll_grow_size,
> -                                                  true);
> +                                                  true, false);
>    if (peeled_loops)
>      {
>        BITMAP_FREE (peeled_loops);
> @@ -1727,7 +1732,7 @@ pass_complete_unrolli::execute (function *fun)
>    if (number_of_loops (fun) > 1)
>      {
>        scev_initialize ();
> -      ret = tree_unroll_loops_completely (optimize >= 3, false);
> +      ret = tree_unroll_loops_completely (optimize >= 3, false, true);
>        scev_finalize ();
>      }
>    loop_optimizer_finalize ();
> --
> 2.31.1
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [V3 PATCH] Don't reduce estimated unrolled size for innermost loop.
  2024-05-23 11:59             ` Richard Biener
@ 2024-05-24  7:29               ` liuhongt
  2024-05-29 11:22                 ` Richard Biener
  0 siblings, 1 reply; 12+ messages in thread
From: liuhongt @ 2024-05-24  7:29 UTC (permalink / raw)
  To: gcc-patches; +Cc: richard.guenther

Update in V3:
> Since this was about vectorization, can you instead add a testcase to
> gcc.dg/vect/ and check that vectorization happens?
Move to vect/pr112325.c.
>
> I believe the if (unr_insns <= 0) check can go as well.
Removed.

> as said, you want to do
>
>           cunrolli = false;
>
> after the above, since we are iterating: for a subsequent unrolling of an
> outer loop of an unrolled inner loop we _do_ want to apply the 2/3
> reduction, since there are likely inter-loop redundancies exposed (as
> happens in SPEC calculix, for example).
>
> Not sure if that changes any of the testsuite outcome - it possibly avoids the
> gcc.dg/vect/pr69783.c FAIL?
Yes, it avoids that, cunrolli is set to false when CHANGED is true.

> Not sure about the arm fallout.
It's the same reason as pr69783.c: there's subsequent unrolling of an outer
loop of an unrolled inner loop, and since the inner loop is completely
unrolled, outer_loop->inner is false and it escapes the check.
The change also fixes 2 arm fallouts.
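
A made-up example of the shape involved:

    void foo (int *restrict a, const int *restrict b)
    {
      for (int i = 0; i < 1024; i++)   /* outer loop, vectorizable */
        for (int j = 0; j < 4; j++)    /* tiny inner loop, completely
                                          unrolled on the first iteration */
          a[i] += b[i * 4 + j];
      /* Once the inner loop is gone the outer loop has loop->inner == NULL,
         so without resetting cunrolli it would also skip the 2/3 reduction
         on the next iteration.  */
    }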

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

For the innermost loop, after complete loop unrolling it will most likely
not be possible to reduce the body size to 2/3.  The current 2/3 reduction
makes some of the larger loops completely unrolled during cunrolli, which
then prevents them from being vectorized.  It also increases register
pressure.

The patch moves the 2/3 reduction from estimated_unrolled_size to
tree_unroll_loops_completely.
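
An illustrative calculation (the numbers are made up): with --param
max-completely-peeled-insns=300 and an estimated unrolled size of 400
insns, the current code compares 400 * 2 / 3 = 266 <= 300 and completely
unrolls the loop at cunrolli; with the patch, an innermost loop is checked
against the full 400 > 300 and is left for the vectorizer.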

gcc/ChangeLog:

	PR tree-optimization/112325
	* tree-ssa-loop-ivcanon.cc (estimated_unrolled_size): Move the
	2 / 3 loop body size reduction to ..
	(try_unroll_loop_completely): .. here, add it for the check of
	body size shrink, and the check of comparison against
	param_max_completely_peeled_insns when
	(!cunrolli || loop->inner).
	(canonicalize_loop_induction_variables): Add new parameter
	cunrolli and pass down.
	(tree_unroll_loops_completely_1): Ditto.
	(canonicalize_induction_variables): Pass cunrolli as false to
	canonicalize_loop_induction_variables.
	(tree_unroll_loops_completely): Set cunrolli to true at
	beginning and set it to false after CHANGED is true.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/pr112325.c: New test.
---
 gcc/testsuite/gcc.dg/vect/pr112325.c | 59 ++++++++++++++++++++++++++++
 gcc/tree-ssa-loop-ivcanon.cc         | 46 +++++++++++-----------
 2 files changed, 83 insertions(+), 22 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/pr112325.c

diff --git a/gcc/testsuite/gcc.dg/vect/pr112325.c b/gcc/testsuite/gcc.dg/vect/pr112325.c
new file mode 100644
index 00000000000..71cf4099253
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr112325.c
@@ -0,0 +1,59 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -funroll-loops -fdump-tree-vect-details" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-mavx2" { target x86_64-*-* i?86-*-* } } */
+
+typedef unsigned short ggml_fp16_t;
+static float table_f32_f16[1 << 16];
+
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+    unsigned short s;
+    __builtin_memcpy(&s, &f, sizeof(unsigned short));
+    return table_f32_f16[s];
+}
+
+typedef struct {
+    ggml_fp16_t d;
+    ggml_fp16_t m;
+    unsigned char qh[4];
+    unsigned char qs[32 / 2];
+} block_q5_1;
+
+typedef struct {
+    float d;
+    float s;
+    char qs[32];
+} block_q8_1;
+
+void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+    const int qk = 32;
+    const int nb = n / qk;
+
+    const block_q5_1 * restrict x = vx;
+    const block_q8_1 * restrict y = vy;
+
+    float sumf = 0.0;
+
+    for (int i = 0; i < nb; i++) {
+        unsigned qh;
+        __builtin_memcpy(&qh, x[i].qh, sizeof(qh));
+
+        int sumi = 0;
+
+        for (int j = 0; j < qk/2; ++j) {
+            const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
+            const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
+
+            const int x0 = (x[i].qs[j] & 0xF) | xh_0;
+            const int x1 = (x[i].qs[j] >> 4) | xh_1;
+
+            sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
+        }
+
+        sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
+    }
+
+    *s = sumf;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
index bf017137260..216e81ef15f 100644
--- a/gcc/tree-ssa-loop-ivcanon.cc
+++ b/gcc/tree-ssa-loop-ivcanon.cc
@@ -437,11 +437,7 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
    It is (NUNROLL + 1) * size of loop body with taking into account
    the fact that in last copy everything after exit conditional
    is dead and that some instructions will be eliminated after
-   peeling.
-
-   Loop body is likely going to simplify further, this is difficult
-   to guess, we just decrease the result by 1/3.  */
-
+   peeling.  */
 static unsigned HOST_WIDE_INT
 estimated_unrolled_size (struct loop_size *size,
 			 unsigned HOST_WIDE_INT nunroll)
@@ -453,10 +449,6 @@ estimated_unrolled_size (struct loop_size *size,
     unr_insns = 0;
   unr_insns += size->last_iteration - size->last_iteration_eliminated_by_peeling;
 
-  unr_insns = unr_insns * 2 / 3;
-  if (unr_insns <= 0)
-    unr_insns = 1;
-
   return unr_insns;
 }
 
@@ -734,7 +726,8 @@ try_unroll_loop_completely (class loop *loop,
 			    edge exit, tree niter, bool may_be_zero,
 			    enum unroll_level ul,
 			    HOST_WIDE_INT maxiter,
-			    dump_user_location_t locus, bool allow_peel)
+			    dump_user_location_t locus, bool allow_peel,
+			    bool cunrolli)
 {
   unsigned HOST_WIDE_INT n_unroll = 0;
   bool n_unroll_found = false;
@@ -847,8 +840,9 @@ try_unroll_loop_completely (class loop *loop,
 
 	  /* If the code is going to shrink, we don't need to be extra
 	     cautious on guessing if the unrolling is going to be
-	     profitable.  */
-	  if (unr_insns
+	     profitable.
+	     Move from estimated_unrolled_size to unroll small loops.  */
+	  if (unr_insns * 2 / 3
 	      /* If there is IV variable that will become constant, we
 		 save one instruction in the loop prologue we do not
 		 account otherwise.  */
@@ -919,7 +913,13 @@ try_unroll_loop_completely (class loop *loop,
 			 loop->num);
 	      return false;
 	    }
-	  else if (unr_insns
+	  /* Move 2 / 3 reduction from estimated_unrolled_size, but don't reduce
+	     unrolled size for innermost loop.
+	     1) It could increase register pressure.
+	     2) Big loop after completely unroll may not be vectorized
+	     by BB vectorizer.  */
+	  else if ((cunrolli && !loop->inner
+		    ? unr_insns : unr_insns * 2 / 3)
 		   > (unsigned) param_max_completely_peeled_insns)
 	    {
 	      if (dump_file && (dump_flags & TDF_DETAILS))
@@ -1227,7 +1227,7 @@ try_peel_loop (class loop *loop,
 static bool
 canonicalize_loop_induction_variables (class loop *loop,
 				       bool create_iv, enum unroll_level ul,
-				       bool try_eval, bool allow_peel)
+				       bool try_eval, bool allow_peel, bool cunrolli)
 {
   edge exit = NULL;
   tree niter;
@@ -1314,7 +1314,7 @@ canonicalize_loop_induction_variables (class loop *loop,
 
   dump_user_location_t locus = find_loop_location (loop);
   if (try_unroll_loop_completely (loop, exit, niter, may_be_zero, ul,
-				  maxiter, locus, allow_peel))
+				  maxiter, locus, allow_peel, cunrolli))
     return true;
 
   if (create_iv
@@ -1358,7 +1358,7 @@ canonicalize_induction_variables (void)
     {
       changed |= canonicalize_loop_induction_variables (loop,
 							true, UL_SINGLE_ITER,
-							true, false);
+							true, false, false);
     }
   gcc_assert (!need_ssa_update_p (cfun));
 
@@ -1392,7 +1392,7 @@ canonicalize_induction_variables (void)
 
 static bool
 tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
-				bitmap father_bbs, class loop *loop)
+				bitmap father_bbs, class loop *loop, bool cunrolli)
 {
   class loop *loop_father;
   bool changed = false;
@@ -1410,7 +1410,7 @@ tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
 	if (!child_father_bbs)
 	  child_father_bbs = BITMAP_ALLOC (NULL);
 	if (tree_unroll_loops_completely_1 (may_increase_size, unroll_outer,
-					    child_father_bbs, inner))
+					    child_father_bbs, inner, cunrolli))
 	  {
 	    bitmap_ior_into (father_bbs, child_father_bbs);
 	    bitmap_clear (child_father_bbs);
@@ -1456,7 +1456,7 @@ tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
     ul = UL_NO_GROWTH;
 
   if (canonicalize_loop_induction_variables
-        (loop, false, ul, !flag_tree_loop_ivcanon, unroll_outer))
+      (loop, false, ul, !flag_tree_loop_ivcanon, unroll_outer, cunrolli))
     {
       /* If we'll continue unrolling, we need to propagate constants
 	 within the new basic blocks to fold away induction variable
@@ -1491,6 +1491,7 @@ tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer)
   bool changed;
   int iteration = 0;
   bool irred_invalidated = false;
+  bool cunrolli = true;
 
   estimate_numbers_of_iterations (cfun);
 
@@ -1507,10 +1508,12 @@ tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer)
 
       changed = tree_unroll_loops_completely_1 (may_increase_size,
 						unroll_outer, father_bbs,
-						current_loops->tree_root);
+						current_loops->tree_root,
+						cunrolli);
       if (changed)
 	{
 	  unsigned i;
+	  cunrolli = false;
 
 	  unloop_loops (loops_to_unloop, loops_to_unloop_nunroll,
 			edges_to_remove, loop_closed_ssa_invalidated,
@@ -1670,8 +1673,7 @@ pass_complete_unroll::execute (function *fun)
      re-peeling the same loop multiple times.  */
   if (flag_peel_loops)
     peeled_loops = BITMAP_ALLOC (NULL);
-  unsigned int val = tree_unroll_loops_completely (flag_cunroll_grow_size, 
-						   true);
+  unsigned int val = tree_unroll_loops_completely (flag_cunroll_grow_size, true);
   if (peeled_loops)
     {
       BITMAP_FREE (peeled_loops);
-- 
2.31.1


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [V3 PATCH] Don't reduce estimated unrolled size for innermost loop.
  2024-05-24  7:29               ` [V3 PATCH] Don't reduce estimated unrolled size for innermost loop liuhongt
@ 2024-05-29 11:22                 ` Richard Biener
  0 siblings, 0 replies; 12+ messages in thread
From: Richard Biener @ 2024-05-29 11:22 UTC (permalink / raw)
  To: liuhongt; +Cc: gcc-patches

On Fri, May 24, 2024 at 9:29 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> Update in V3:
> > Since this was about vectorization, can you instead add a testcase to
> > gcc.dg/vect/ and check that vectorization happens?
> Move to vect/pr112325.c.
> >
> > I believe the if (unr_insns <= 0) check can go as well.
> Removed.
>
> > as said, you want to do
> >
> >           cunrolli = false;
> >
> > after the above, since we are iterating: for a subsequent unrolling of an
> > outer loop of an unrolled inner loop we _do_ want to apply the 2/3
> > reduction, since there are likely inter-loop redundancies exposed (as
> > happens in SPEC calculix, for example).
> >
> > Not sure if that changes any of the testsuite outcome - it possibly avoids the
> > gcc.dg/vect/pr69783.c FAIL?
> Yes, it avoids that, cunrolli is set to false when CHANGED is true.
>
> > Not sure about the arm fallout.
> It's the same reason as pr69783.c: there's subsequent unrolling of an outer
> loop of an unrolled inner loop, and since the inner loop is completely
> unrolled, outer_loop->inner is false and it escapes the check.
> The change also fixes 2 arm fallouts.

Perfect!

> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?

Can you place a comment before the

         cunrolli = false;

line indicating that we do not want to restrict subsequent outer (now
innermost) loop unrollings?
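
E.g. something along these lines (exact wording at your discretion):

	  /* Later iterations unroll outer loops that have become
	     innermost; we do want the 2/3 reduction for those, so only
	     the first iteration runs with cunrolli set.  */
	  cunrolli = false;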

OK with such added comment.

Thanks,
Richard.

> For the innermost loop, after complete loop unrolling it will most likely
> not be possible to reduce the body size to 2/3.  The current 2/3 reduction
> makes some of the larger loops completely unrolled during cunrolli, which
> then prevents them from being vectorized.  It also increases register
> pressure.
>
> The patch moves the 2/3 reduction from estimated_unrolled_size to
> tree_unroll_loops_completely.
>
> gcc/ChangeLog:
>
>         PR tree-optimization/112325
>         * tree-ssa-loop-ivcanon.cc (estimated_unrolled_size): Move the
>         2 / 3 loop body size reduction to ..
>         (try_unroll_loop_completely): .. here, add it for the check of
>         body size shrink, and the check of comparison against
>         param_max_completely_peeled_insns when
>         (!cunrolli || loop->inner).
>         (canonicalize_loop_induction_variables): Add new parameter
>         cunrolli and pass down.
>         (tree_unroll_loops_completely_1): Ditto.
>         (canonicalize_induction_variables): Pass cunrolli as false to
>         canonicalize_loop_induction_variables.
>         (tree_unroll_loops_completely): Set cunrolli to true at
>         beginning and set it to false after CHANGED is true.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.dg/vect/pr112325.c: New test.
> ---
>  gcc/testsuite/gcc.dg/vect/pr112325.c | 59 ++++++++++++++++++++++++++++
>  gcc/tree-ssa-loop-ivcanon.cc         | 46 +++++++++++-----------
>  2 files changed, 83 insertions(+), 22 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/pr112325.c
>
> diff --git a/gcc/testsuite/gcc.dg/vect/pr112325.c b/gcc/testsuite/gcc.dg/vect/pr112325.c
> new file mode 100644
> index 00000000000..71cf4099253
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/pr112325.c
> @@ -0,0 +1,59 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -funroll-loops -fdump-tree-vect-details" } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-additional-options "-mavx2" { target x86_64-*-* i?86-*-* } } */
> +
> +typedef unsigned short ggml_fp16_t;
> +static float table_f32_f16[1 << 16];
> +
> +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
> +    unsigned short s;
> +    __builtin_memcpy(&s, &f, sizeof(unsigned short));
> +    return table_f32_f16[s];
> +}
> +
> +typedef struct {
> +    ggml_fp16_t d;
> +    ggml_fp16_t m;
> +    unsigned char qh[4];
> +    unsigned char qs[32 / 2];
> +} block_q5_1;
> +
> +typedef struct {
> +    float d;
> +    float s;
> +    char qs[32];
> +} block_q8_1;
> +
> +void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
> +    const int qk = 32;
> +    const int nb = n / qk;
> +
> +    const block_q5_1 * restrict x = vx;
> +    const block_q8_1 * restrict y = vy;
> +
> +    float sumf = 0.0;
> +
> +    for (int i = 0; i < nb; i++) {
> +        unsigned qh;
> +        __builtin_memcpy(&qh, x[i].qh, sizeof(qh));
> +
> +        int sumi = 0;
> +
> +        for (int j = 0; j < qk/2; ++j) {
> +            const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
> +            const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
> +
> +            const int x0 = (x[i].qs[j] & 0xF) | xh_0;
> +            const int x1 = (x[i].qs[j] >> 4) | xh_1;
> +
> +            sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
> +        }
> +
> +        sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
> +    }
> +
> +    *s = sumf;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
> diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
> index bf017137260..216e81ef15f 100644
> --- a/gcc/tree-ssa-loop-ivcanon.cc
> +++ b/gcc/tree-ssa-loop-ivcanon.cc
> @@ -437,11 +437,7 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
>     It is (NUNROLL + 1) * size of loop body with taking into account
>     the fact that in last copy everything after exit conditional
>     is dead and that some instructions will be eliminated after
> -   peeling.
> -
> -   Loop body is likely going to simplify further, this is difficult
> -   to guess, we just decrease the result by 1/3.  */
> -
> +   peeling.  */
>  static unsigned HOST_WIDE_INT
>  estimated_unrolled_size (struct loop_size *size,
>                          unsigned HOST_WIDE_INT nunroll)
> @@ -453,10 +449,6 @@ estimated_unrolled_size (struct loop_size *size,
>      unr_insns = 0;
>    unr_insns += size->last_iteration - size->last_iteration_eliminated_by_peeling;
>
> -  unr_insns = unr_insns * 2 / 3;
> -  if (unr_insns <= 0)
> -    unr_insns = 1;
> -
>    return unr_insns;
>  }
>
> @@ -734,7 +726,8 @@ try_unroll_loop_completely (class loop *loop,
>                             edge exit, tree niter, bool may_be_zero,
>                             enum unroll_level ul,
>                             HOST_WIDE_INT maxiter,
> -                           dump_user_location_t locus, bool allow_peel)
> +                           dump_user_location_t locus, bool allow_peel,
> +                           bool cunrolli)
>  {
>    unsigned HOST_WIDE_INT n_unroll = 0;
>    bool n_unroll_found = false;
> @@ -847,8 +840,9 @@ try_unroll_loop_completely (class loop *loop,
>
>           /* If the code is going to shrink, we don't need to be extra
>              cautious on guessing if the unrolling is going to be
> -            profitable.  */
> -         if (unr_insns
> +            profitable.
> +            Move from estimated_unrolled_size to unroll small loops.  */
> +         if (unr_insns * 2 / 3
>               /* If there is IV variable that will become constant, we
>                  save one instruction in the loop prologue we do not
>                  account otherwise.  */
> @@ -919,7 +913,13 @@ try_unroll_loop_completely (class loop *loop,
>                          loop->num);
>               return false;
>             }
> -         else if (unr_insns
> +         /* Move 2 / 3 reduction from estimated_unrolled_size, but don't reduce
> +            unrolled size for innermost loop.
> +            1) It could increase register pressure.
> +            2) Big loop after completely unroll may not be vectorized
> +            by BB vectorizer.  */
> +         else if ((cunrolli && !loop->inner
> +                   ? unr_insns : unr_insns * 2 / 3)
>                    > (unsigned) param_max_completely_peeled_insns)
>             {
>               if (dump_file && (dump_flags & TDF_DETAILS))
> @@ -1227,7 +1227,7 @@ try_peel_loop (class loop *loop,
>  static bool
>  canonicalize_loop_induction_variables (class loop *loop,
>                                        bool create_iv, enum unroll_level ul,
> -                                      bool try_eval, bool allow_peel)
> +                                      bool try_eval, bool allow_peel, bool cunrolli)
>  {
>    edge exit = NULL;
>    tree niter;
> @@ -1314,7 +1314,7 @@ canonicalize_loop_induction_variables (class loop *loop,
>
>    dump_user_location_t locus = find_loop_location (loop);
>    if (try_unroll_loop_completely (loop, exit, niter, may_be_zero, ul,
> -                                 maxiter, locus, allow_peel))
> +                                 maxiter, locus, allow_peel, cunrolli))
>      return true;
>
>    if (create_iv
> @@ -1358,7 +1358,7 @@ canonicalize_induction_variables (void)
>      {
>        changed |= canonicalize_loop_induction_variables (loop,
>                                                         true, UL_SINGLE_ITER,
> -                                                       true, false);
> +                                                       true, false, false);
>      }
>    gcc_assert (!need_ssa_update_p (cfun));
>
> @@ -1392,7 +1392,7 @@ canonicalize_induction_variables (void)
>
>  static bool
>  tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
> -                               bitmap father_bbs, class loop *loop)
> +                               bitmap father_bbs, class loop *loop, bool cunrolli)
>  {
>    class loop *loop_father;
>    bool changed = false;
> @@ -1410,7 +1410,7 @@ tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
>         if (!child_father_bbs)
>           child_father_bbs = BITMAP_ALLOC (NULL);
>         if (tree_unroll_loops_completely_1 (may_increase_size, unroll_outer,
> -                                           child_father_bbs, inner))
> +                                           child_father_bbs, inner, cunrolli))
>           {
>             bitmap_ior_into (father_bbs, child_father_bbs);
>             bitmap_clear (child_father_bbs);
> @@ -1456,7 +1456,7 @@ tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
>      ul = UL_NO_GROWTH;
>
>    if (canonicalize_loop_induction_variables
> -        (loop, false, ul, !flag_tree_loop_ivcanon, unroll_outer))
> +      (loop, false, ul, !flag_tree_loop_ivcanon, unroll_outer, cunrolli))
>      {
>        /* If we'll continue unrolling, we need to propagate constants
>          within the new basic blocks to fold away induction variable
> @@ -1491,6 +1491,7 @@ tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer)
>    bool changed;
>    int iteration = 0;
>    bool irred_invalidated = false;
> +  bool cunrolli = true;
>
>    estimate_numbers_of_iterations (cfun);
>
> @@ -1507,10 +1508,12 @@ tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer)
>
>        changed = tree_unroll_loops_completely_1 (may_increase_size,
>                                                 unroll_outer, father_bbs,
> -                                               current_loops->tree_root);
> +                                               current_loops->tree_root,
> +                                               cunrolli);
>        if (changed)
>         {
>           unsigned i;
> +         cunrolli = false;
>
>           unloop_loops (loops_to_unloop, loops_to_unloop_nunroll,
>                         edges_to_remove, loop_closed_ssa_invalidated,
> @@ -1670,8 +1673,7 @@ pass_complete_unroll::execute (function *fun)
>       re-peeling the same loop multiple times.  */
>    if (flag_peel_loops)
>      peeled_loops = BITMAP_ALLOC (NULL);
> -  unsigned int val = tree_unroll_loops_completely (flag_cunroll_grow_size,
> -                                                  true);
> +  unsigned int val = tree_unroll_loops_completely (flag_cunroll_grow_size, true);
>    if (peeled_loops)
>      {
>        BITMAP_FREE (peeled_loops);
> --
> 2.31.1
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2024-05-29 11:23 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-13  2:27 [PATCH] Don't reduce estimated unrolled size for innermost loop liuhongt
2024-05-13  7:40 ` Richard Biener
2024-05-15  2:14   ` Hongtao Liu
2024-05-15  9:24     ` Richard Biener
2024-05-15  9:49       ` Hongtao Liu
2024-05-21  2:35       ` Hongtao Liu
2024-05-21  6:14         ` Richard Biener
2024-05-22  5:07           ` [V2 PATCH] Don't reduce estimated unrolled size for innermost loop at cunrolli liuhongt
2024-05-23  1:55             ` Hongtao Liu
2024-05-23 11:59             ` Richard Biener
2024-05-24  7:29               ` [V3 PATCH] Don't reduce estimated unrolled size for innermost loop liuhongt
2024-05-29 11:22                 ` Richard Biener

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).