When transforming multiple lane-reducing operations in a loop reduction
chain, the code so far generates the corresponding vectorized statements
into def-use cycles starting from cycle 0.  As a consequence, a def-use
cycle with a smaller index contains more statements, which means longer
instruction dependency chains.  For example:

  int sum = 0;
  for (i)
    {
      sum += d0[i] * d1[i];      // dot-prod
      sum += w[i];               // widen-sum
      sum += abs(s0[i] - s1[i]); // sad
    }

Original transformation result:

  for (i / 16)
    {
      sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
      sum_v1 = sum_v1;  // copy
      sum_v2 = sum_v2;  // copy
      sum_v3 = sum_v3;  // copy

      sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0);
      sum_v1 = sum_v1;  // copy
      sum_v2 = sum_v2;  // copy
      sum_v3 = sum_v3;  // copy

      sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
      sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
      sum_v2 = sum_v2;  // copy
      sum_v3 = sum_v3;  // copy
    }

To get higher instruction parallelism in the final vectorized loop, it is
better to distribute the effective vectorized lane-reducing statements
evenly among all def-use cycles.  In the transformation below, DOT_PROD,
WIDEN_SUM and the SADs are generated into separate cycles, so the
instruction dependencies among them are eliminated:

  for (i / 16)
    {
      sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
      sum_v1 = sum_v1;  // copy
      sum_v2 = sum_v2;  // copy
      sum_v3 = sum_v3;  // copy

      sum_v0 = sum_v0;  // copy
      sum_v1 = WIDEN_SUM (w_v1[i: 0 ~ 15], sum_v1);
      sum_v2 = sum_v2;  // copy
      sum_v3 = sum_v3;  // copy

      sum_v0 = sum_v0;  // copy
      sum_v1 = sum_v1;  // copy
      sum_v2 = SAD (s0_v2[i: 0 ~ 7 ], s1_v2[i: 0 ~ 7 ], sum_v2);
      sum_v3 = SAD (s0_v3[i: 8 ~ 15], s1_v3[i: 8 ~ 15], sum_v3);
    }
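To make the placement policy concrete, below is a standalone sketch (not
part of the patch; the statement names and copy counts are just the ones
from the example above).  Each lane-reducing statement occupies
stmt_ncopies consecutive def-use cycles starting at a rotating position,
which is what the new reduc_result_pos field tracks.  The sketch wraps the
position with a modulo for simplicity, whereas the patch clamps the
operand block so it does not run past the last cycle:

  /* Standalone illustration of the round-robin placement policy;
     not GCC code.  */
  #include <cstdio>

  int main ()
  {
    const int ncopies = 4;	/* def-use cycles of the reduction PHI  */
    const char *names[] = { "DOT_PROD", "WIDEN_SUM", "SAD" };
    const int stmt_ncopies[] = { 1, 1, 2 };	/* effective copies each  */
    int pos = 0;		/* plays the role of reduc_result_pos  */

    for (int s = 0; s < 3; s++)
      {
	for (int c = 0; c < stmt_ncopies[s]; c++)
	  printf ("%s copy %d -> cycle sum_v%d\n", names[s], c,
		  (pos + c) % ncopies);
	/* Rotate the start position for the next lane-reducing
	   statement, as the patch does with reduc_result_pos.  */
	pos = (pos + stmt_ncopies[s]) % ncopies;
      }
    return 0;
  }

Running it with ncopies = 4 prints DOT_PROD -> sum_v0, WIDEN_SUM ->
sum_v1, and the two SAD copies -> sum_v2 and sum_v3, matching the
transformed loop above.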
Thanks,
Feng

---
gcc/
	PR tree-optimization/114440
	* tree-vectorizer.h (struct _stmt_vec_info): Add a new field
	reduc_result_pos.
	* tree-vect-loop.cc (vect_transform_reduction): Generate
	lane-reducing statements in an optimized order.
---
 gcc/tree-vect-loop.cc | 51 ++++++++++++++++++++++++++++++++++++++-----
 gcc/tree-vectorizer.h |  6 +++++
 2 files changed, 51 insertions(+), 6 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index b5849dbb08a..4807f529506 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -8703,7 +8703,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
     }
 
   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
-  gcc_assert (single_defuse_cycle || lane_reducing_op_p (code));
+  bool lane_reducing = lane_reducing_op_p (code);
+  gcc_assert (single_defuse_cycle || lane_reducing);
 
   /* Create the destination vector  */
   tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
@@ -8720,6 +8721,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
     }
   else
     {
+      int result_pos = 0;
+
       /* The input vectype of the reduction PHI determines copies of
 	 vectorized def-use cycles, which might be more than effective copies
 	 of vectorized lane-reducing reduction statements.  This could be
@@ -8749,9 +8752,9 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
	     sum_v2 = sum_v2;  // copy
	     sum_v3 = sum_v3;  // copy
 
-	     sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
-	     sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
-	     sum_v2 = sum_v2;  // copy
+	     sum_v0 = sum_v0;  // copy
+	     sum_v1 = SAD (s0_v1[i: 0 ~ 7 ], s1_v1[i: 0 ~ 7 ], sum_v1);
+	     sum_v2 = SAD (s0_v2[i: 8 ~ 15], s1_v2[i: 8 ~ 15], sum_v2);
	     sum_v3 = sum_v3;  // copy
 
	     sum_v0 += n_v0[i: 0 ~ 3 ];
@@ -8759,7 +8762,20 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
	     sum_v2 += n_v2[i: 8 ~ 11];
	     sum_v3 += n_v3[i: 12 ~ 15];
	   }
-   */
+
+	 Moreover, for higher instruction parallelism in the final
+	 vectorized loop, the effective vectorized lane-reducing statements
+	 are distributed evenly among all def-use cycles.  In the above
+	 example, the SADs are generated into cycles other than that of
+	 the DOT_PROD.  */
+
+      if (stmt_ncopies < ncopies)
+	{
+	  gcc_assert (lane_reducing);
+	  result_pos = reduc_info->reduc_result_pos;
+	  reduc_info->reduc_result_pos = (result_pos + stmt_ncopies) % ncopies;
+	  gcc_assert (result_pos >= 0 && result_pos < ncopies);
+	}
 
       for (i = 0; i < MIN (3, (int) op.num_ops); i++)
	{
@@ -8792,7 +8808,30 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
						 op.ops[i], &vec_oprnds[i],
						 vectype);
 
	  if (used_ncopies < ncopies)
-	    vec_oprnds[i].safe_grow_cleared (ncopies);
+	    {
+	      vec_oprnds[i].safe_grow_cleared (ncopies);
+
+	      /* Find suitable def-use cycles to generate vectorized
+		 statements into, and reorder operands based on the
+		 selection.  */
+	      if (i != reduc_index && result_pos)
+		{
+		  int count = ncopies - used_ncopies;
+		  int start = result_pos - count;
+
+		  if (start < 0)
+		    {
+		      count = result_pos;
+		      start = 0;
+		    }
+
+		  for (int j = used_ncopies - 1; j >= start; j--)
+		    {
+		      std::swap (vec_oprnds[i][j], vec_oprnds[i][j + count]);
+		      gcc_assert (!vec_oprnds[i][j]);
+		    }
+		}
+	    }
	}
     }
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index ca810869592..d64729ac953 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1402,6 +1402,12 @@ public:
   /* The vector type for performing the actual reduction.  */
   tree reduc_vectype;
 
+  /* For a loop reduction with multiple vectorized results (ncopies > 1),
+     a lane-reducing operation participating in it may not use all of those
+     results.  This field specifies the result index at which the next
+     lane-reducing operation should be assigned.  */
+  int reduc_result_pos;
+
   /* If IS_REDUC_INFO is true and if the vector code is performing N scalar
      reductions in parallel, this variable gives the initial scalar values of
      those N reductions.  */
-- 
2.17.1