From ecde6210ff483ad899a2eeba91aa1b623c49173a Mon Sep 17 00:00:00 2001
From: Feng Xue
Date: Fri, 22 Mar 2024 19:57:45 +0800
Subject: [PATCH] vect: Support multiple lane-reducing operations for loop reduction [PR114440]

gcc/
	PR tree-optimization/114440
	* tree-vectorizer.h (struct _stmt_vec_info): Add a new field
	reduc_result_pos.
	(vectorizable_lane_reducing): New function declaration.
	* tree-vect-stmts.cc (vectorizable_condition): Treat the condition
	statement that is pointed to by the stmt_vec_info of a reduction PHI
	as the real "for_reduction" statement.
	(vect_analyze_stmt): Call new function vectorizable_lane_reducing
	to analyze lane-reducing operations.
	* tree-vect-loop.cc (vect_is_emulated_mixed_dot_prod): Remove
	parameter loop_vinfo.  Get input vectype from stmt_info instead of
	reduction PHI.
	(vect_model_reduction_cost): Remove cost computation code related to
	emulated_mixed_dot_prod.
	(vect_reduction_use_partial_vector): New function.
	(vectorizable_lane_reducing): New function.
	(vectorizable_reduction): Allow multiple lane-reducing operations in
	loop reduction.  Move some original lane-reducing related code to
	vectorizable_lane_reducing, and move partial vectorization checking
	code to vect_reduction_use_partial_vector.
	(vect_transform_reduction): Extend transformation to support reduction
	statements with mixed input vectypes.

gcc/testsuite/
	PR tree-optimization/114440
	* gcc.dg/vect/vect-reduc-chain-1.c: New test.
	* gcc.dg/vect/vect-reduc-chain-2.c: New test.
	* gcc.dg/vect/vect-reduc-chain-3.c: New test.
	* gcc.dg/vect/vect-reduc-dot-slp-1.c: New test.
	* gcc.dg/vect/vect-reduc-dot-slp-2.c: New test.
---
 .../gcc.dg/vect/vect-reduc-chain-1.c          |  62 ++
 .../gcc.dg/vect/vect-reduc-chain-2.c          |  77 ++
 .../gcc.dg/vect/vect-reduc-chain-3.c          |  66 ++
 .../gcc.dg/vect/vect-reduc-dot-slp-1.c        |  97 +++
 .../gcc.dg/vect/vect-reduc-dot-slp-2.c        |  81 +++
 gcc/tree-vect-loop.cc                         | 668 ++++++++++++------
 gcc/tree-vect-stmts.cc                        |  13 +-
 gcc/tree-vectorizer.h                         |   8 +
 8 files changed, 863 insertions(+), 209 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-2.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
new file mode 100644
index 00000000000..04bfc419dbd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
@@ -0,0 +1,62 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon } */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res,
+   SIGNEDNESS_2 char *restrict a,
+   SIGNEDNESS_2 char *restrict b,
+   SIGNEDNESS_2 char *restrict c,
+   SIGNEDNESS_2 char *restrict d,
+   SIGNEDNESS_1 int *restrict e)
+{
+  for (int i = 0; i < N; ++i)
+    {
+      res += a[i] * b[i];
+      res += c[i] * d[i];
+      res += e[i];
+    }
+  return res;
+}
+
+#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? 
-126 : 4) +#define OFFSET 20 + +int +main (void) +{ + check_vect (); + + SIGNEDNESS_2 char a[N], b[N]; + SIGNEDNESS_2 char c[N], d[N]; + SIGNEDNESS_1 int e[N]; + int expected = 0x12345; + for (int i = 0; i < N; ++i) + { + a[i] = BASE + i * 5; + b[i] = BASE + OFFSET + i * 4; + c[i] = BASE + i * 2; + d[i] = BASE + OFFSET + i * 3; + e[i] = i; + asm volatile ("" ::: "memory"); + expected += a[i] * b[i]; + expected += c[i] * d[i]; + expected += e[i]; + } + if (f (0x12345, a, b, c, d, e) != expected) + __builtin_abort (); +} + +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 2 "vect" { target vect_sdot_qi } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c new file mode 100644 index 00000000000..6c803b80120 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c @@ -0,0 +1,77 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_dotprod_neon } */ + +#include "tree-vect.h" + +#define N 50 + +#ifndef SIGNEDNESS_1 +#define SIGNEDNESS_1 signed +#define SIGNEDNESS_2 unsigned +#define SIGNEDNESS_3 signed +#define SIGNEDNESS_4 signed +#endif + +SIGNEDNESS_1 int __attribute__ ((noipa)) +fn (SIGNEDNESS_1 int res, + SIGNEDNESS_2 char *restrict a, + SIGNEDNESS_2 char *restrict b, + SIGNEDNESS_3 char *restrict c, + SIGNEDNESS_3 char *restrict d, + SIGNEDNESS_4 short *restrict e, + SIGNEDNESS_4 short *restrict f, + SIGNEDNESS_1 int *restrict g) +{ + for (int i = 0; i < N; ++i) + { + res += a[i] * b[i]; + res += i + 1; + res += c[i] * d[i]; + res += e[i] * f[i]; + res += g[i]; + } + return res; +} + +#define BASE2 ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4) +#define BASE3 ((SIGNEDNESS_3 int) -1 < 0 ? -126 : 4) +#define BASE4 ((SIGNEDNESS_4 int) -1 < 0 ? 
-1026 : 373) +#define OFFSET 20 + +int +main (void) +{ + check_vect (); + + SIGNEDNESS_2 char a[N], b[N]; + SIGNEDNESS_3 char c[N], d[N]; + SIGNEDNESS_4 short e[N], f[N]; + SIGNEDNESS_1 int g[N]; + int expected = 0x12345; + for (int i = 0; i < N; ++i) + { + a[i] = BASE2 + i * 5; + b[i] = BASE2 + OFFSET + i * 4; + c[i] = BASE3 + i * 2; + d[i] = BASE3 + OFFSET + i * 3; + e[i] = BASE4 + i * 6; + f[i] = BASE4 + OFFSET + i * 5; + g[i] = i; + asm volatile ("" ::: "memory"); + expected += a[i] * b[i]; + expected += i + 1; + expected += c[i] * d[i]; + expected += e[i] * f[i]; + expected += g[i]; + } + if (fn (0x12345, a, b, c, d, e, f, g) != expected) + __builtin_abort (); +} + +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_sdot_qi } } } } */ +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_udot_qi } } } } */ +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_sdot_hi } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c new file mode 100644 index 00000000000..a41e4b176c4 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c @@ -0,0 +1,66 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-require-effective-target vect_int } */ + +#include "tree-vect.h" + +#define N 50 + +#ifndef SIGNEDNESS_1 +#define SIGNEDNESS_1 signed +#define SIGNEDNESS_2 unsigned +#define SIGNEDNESS_3 signed +#endif + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res, + SIGNEDNESS_2 char *restrict a, + SIGNEDNESS_2 char *restrict b, + SIGNEDNESS_3 short *restrict c, + SIGNEDNESS_3 short *restrict d, + SIGNEDNESS_1 int *restrict e) +{ + for (int i = 0; i < N; ++i) + { + short diff = a[i] - b[i]; + SIGNEDNESS_2 short abs = diff < 0 ? -diff : diff; + res += abs; + res += c[i] * d[i]; + res += e[i]; + } + return res; +} + +#define BASE2 ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4) +#define BASE3 ((SIGNEDNESS_3 int) -1 < 0 ? -1236 : 373) +#define OFFSET 20 + +int +main (void) +{ + check_vect (); + + SIGNEDNESS_2 char a[N], b[N]; + SIGNEDNESS_3 short c[N], d[N]; + SIGNEDNESS_1 int e[N]; + int expected = 0x12345; + for (int i = 0; i < N; ++i) + { + a[i] = BASE2 + i * 5; + b[i] = BASE2 - i * 4; + c[i] = BASE3 + i * 2; + d[i] = BASE3 + OFFSET + i * 3; + e[i] = i; + asm volatile ("" ::: "memory"); + short diff = a[i] - b[i]; + SIGNEDNESS_2 short abs = diff < 0 ? -diff : diff; + expected += abs; + expected += c[i] * d[i]; + expected += e[i]; + } + if (f (0x12345, a, b, c, d, e) != expected) + __builtin_abort (); +} + +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = SAD_EXPR" "vect" { target vect_udot_qi } } } */ +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target vect_sdot_hi } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c new file mode 100644 index 00000000000..51ef4eaaed8 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c @@ -0,0 +1,97 @@ +/* Disabling epilogues until we find a better way to deal with scans. 
*/ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_dotprod_neon } */ + +#include "tree-vect.h" + +#define N 50 + +#ifndef SIGNEDNESS_1 +#define SIGNEDNESS_1 signed +#define SIGNEDNESS_2 signed +#endif + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res, + SIGNEDNESS_2 char *a, + SIGNEDNESS_2 char *b, + int step, int n) +{ + for (int i = 0; i < n; i++) + { + res += a[0] * b[0]; + res += a[1] * b[1]; + res += a[2] * b[2]; + res += a[3] * b[3]; + res += a[4] * b[4]; + res += a[5] * b[5]; + res += a[6] * b[6]; + res += a[7] * b[7]; + res += a[8] * b[8]; + res += a[9] * b[9]; + res += a[10] * b[10]; + res += a[11] * b[11]; + res += a[12] * b[12]; + res += a[13] * b[13]; + res += a[14] * b[14]; + res += a[15] * b[15]; + + a += step; + b += step; + } + + return res; +} + +#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4) +#define OFFSET 20 + +int +main (void) +{ + check_vect (); + + SIGNEDNESS_2 char a[100], b[100]; + int expected = 0x12345; + int step = 16; + int n = 2; + int t = 0; + + for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i) + { + a[i] = BASE + i * 5; + b[i] = BASE + OFFSET + i * 4; + asm volatile ("" ::: "memory"); + } + + for (int i = 0; i < n; i++) + { + asm volatile ("" ::: "memory"); + expected += a[t + 0] * b[t + 0]; + expected += a[t + 1] * b[t + 1]; + expected += a[t + 2] * b[t + 2]; + expected += a[t + 3] * b[t + 3]; + expected += a[t + 4] * b[t + 4]; + expected += a[t + 5] * b[t + 5]; + expected += a[t + 6] * b[t + 6]; + expected += a[t + 7] * b[t + 7]; + expected += a[t + 8] * b[t + 8]; + expected += a[t + 9] * b[t + 9]; + expected += a[t + 10] * b[t + 10]; + expected += a[t + 11] * b[t + 11]; + expected += a[t + 12] * b[t + 12]; + expected += a[t + 13] * b[t + 13]; + expected += a[t + 14] * b[t + 14]; + expected += a[t + 15] * b[t + 15]; + t += step; + } + + if (f (0x12345, a, b, step, n) != expected) + __builtin_abort (); +} + +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 16 "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-2.c new file mode 100644 index 00000000000..1532833c3ae --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-2.c @@ -0,0 +1,81 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_dotprod_neon } */ + +#include "tree-vect.h" + +#define N 50 + +#ifndef SIGNEDNESS_1 +#define SIGNEDNESS_1 signed +#define SIGNEDNESS_2 signed +#endif + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res, + SIGNEDNESS_2 short *a, + SIGNEDNESS_2 short *b, + int step, int n) +{ + for (int i = 0; i < n; i++) + { + res += a[0] * b[0]; + res += a[1] * b[1]; + res += a[2] * b[2]; + res += a[3] * b[3]; + res += a[4] * b[4]; + res += a[5] * b[5]; + res += a[6] * b[6]; + res += a[7] * b[7]; + + a += step; + b += step; + } + + return res; +} + +#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? 
-1026 : 373) +#define OFFSET 20 + +int +main (void) +{ + check_vect (); + + SIGNEDNESS_2 short a[100], b[100]; + int expected = 0x12345; + int step = 8; + int n = 2; + int t = 0; + + for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i) + { + a[i] = BASE + i * 5; + b[i] = BASE + OFFSET + i * 4; + asm volatile ("" ::: "memory"); + } + + for (int i = 0; i < n; i++) + { + asm volatile ("" ::: "memory"); + expected += a[t + 0] * b[t + 0]; + expected += a[t + 1] * b[t + 1]; + expected += a[t + 2] * b[t + 2]; + expected += a[t + 3] * b[t + 3]; + expected += a[t + 4] * b[t + 4]; + expected += a[t + 5] * b[t + 5]; + expected += a[t + 6] * b[t + 6]; + expected += a[t + 7] * b[t + 7]; + t += step; + } + + if (f (0x12345, a, b, step, n) != expected) + __builtin_abort (); +} + +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 8 "vect" { target vect_sdot_hi } } } */ diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 984636edbc5..5a3339b6594 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -5269,8 +5269,7 @@ have_whole_vector_shift (machine_mode mode) See vect_emulate_mixed_dot_prod for the actual sequence used. */ static bool -vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo, - stmt_vec_info stmt_info) +vect_is_emulated_mixed_dot_prod (stmt_vec_info stmt_info) { gassign *assign = dyn_cast (stmt_info->stmt); if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR) @@ -5281,10 +5280,9 @@ vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo, if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2))) return false; - stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info); - gcc_assert (reduc_info->is_reduc_info); + gcc_assert (STMT_VINFO_REDUC_VECTYPE_IN (stmt_info)); return !directly_supported_p (DOT_PROD_EXPR, - STMT_VINFO_REDUC_VECTYPE_IN (reduc_info), + STMT_VINFO_REDUC_VECTYPE_IN (stmt_info), optab_vector_mixed_sign); } @@ -5323,8 +5321,6 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, if (!gimple_extract_op (orig_stmt_info->stmt, &op)) gcc_unreachable (); - bool emulated_mixed_dot_prod - = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info); if (reduction_type == EXTRACT_LAST_REDUCTION) /* No extra instructions are needed in the prologue. The loop body operations are costed in vectorizable_condition. */ @@ -5359,12 +5355,8 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, initial result of the data reduction, initial value of the index reduction. */ prologue_stmts = 4; - else if (emulated_mixed_dot_prod) - /* We need the initial reduction value and two invariants: - one that contains the minimum signed value and one that - contains half of its negative. */ - prologue_stmts = 3; else + /* We need the initial reduction value. */ prologue_stmts = 1; prologue_cost += record_stmt_cost (cost_vec, prologue_stmts, scalar_to_vec, stmt_info, 0, @@ -7376,6 +7368,244 @@ build_vect_cond_expr (code_helper code, tree vop[3], tree mask, } } +/* Given an operation with CODE in loop reduction path whose reduction PHI is + specified by REDUC_INFO, the operation has TYPE of scalar result, and its + input vectype is represented by VECTYPE_IN. The vectype of vectorized result + may be different from VECTYPE_IN, either in base type or vectype lanes, + lane-reducing operation is the case. 
This function checks whether it is possible,
+   and how, to perform partial vectorization on the operation in the context
+   of LOOP_VINFO.  */
+
+static void
+vect_reduction_use_partial_vector (loop_vec_info loop_vinfo,
+				   stmt_vec_info reduc_info,
+				   slp_tree slp_node, code_helper code,
+				   tree type, tree vectype_in)
+{
+  if (!LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+    return;
+
+  enum vect_reduction_type reduc_type = STMT_VINFO_REDUC_TYPE (reduc_info);
+  internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
+  internal_fn cond_fn = get_conditional_internal_fn (code, type);
+
+  if (reduc_type != FOLD_LEFT_REDUCTION
+      && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
+      && (cond_fn == IFN_LAST
+	  || !direct_internal_fn_supported_p (cond_fn, vectype_in,
+					      OPTIMIZE_FOR_SPEED)))
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "can't operate on partial vectors because"
+			 " no conditional operation is available.\n");
+      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+    }
+  else if (reduc_type == FOLD_LEFT_REDUCTION
+	   && reduc_fn == IFN_LAST
+	   && !expand_vec_cond_expr_p (vectype_in, truth_type_for (vectype_in),
+				       SSA_NAME))
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "can't operate on partial vectors because"
+			 " no conditional operation is available.\n");
+      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+    }
+  else if (reduc_type == FOLD_LEFT_REDUCTION
+	   && internal_fn_mask_index (reduc_fn) == -1
+	   && FLOAT_TYPE_P (vectype_in)
+	   && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "can't operate on partial vectors because"
+			 " signed zeros cannot be preserved.\n");
+      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+    }
+  else
+    {
+      internal_fn mask_reduc_fn
+	= get_masked_reduction_fn (reduc_fn, vectype_in);
+      vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+      vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
+      unsigned nvectors;
+
+      if (slp_node)
+	nvectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
+      else
+	nvectors = vect_get_num_copies (loop_vinfo, vectype_in);
+
+      if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
+	vect_record_loop_len (loop_vinfo, lens, nvectors, vectype_in, 1);
+      else
+	vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_in, NULL);
+    }
+}
+
+/* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
+   the context of LOOP_VINFO, and record the vector cost in COST_VEC.  There
+   are currently three such kinds of operations: dot-prod/widen-sum/sad
+   (sum-of-absolute-differences).
+
+   For a lane-reducing operation, the loop reduction path that it lies in
+   may contain a normal operation, or other lane-reducing operations of
+   different input type sizes, for example:
+
+     int sum = 0;
+     for (i)
+       {
+         ...
+         sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
+         sum += w[i];               // widen-sum <vector(16) char>
+         sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
+         sum += n[i];               // normal <vector(4) int>
+         ...
+       }
+
+   The vectorization factor is essentially determined by the operation whose
+   input vectype has the most lanes ("vector(16) char" in the example), while
+   we need to choose the input vectype with the least lanes ("vector(4) int"
+   in the example) for the reduction PHI statement.
*/
+
+bool
+vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
+			    slp_tree slp_node, stmt_vector_for_cost *cost_vec)
+{
+  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
+  if (!stmt)
+    return false;
+
+  enum tree_code code = gimple_assign_rhs_code (stmt);
+
+  if (code != DOT_PROD_EXPR && code != WIDEN_SUM_EXPR && code != SAD_EXPR)
+    return false;
+
+  tree type = TREE_TYPE (gimple_assign_lhs (stmt));
+
+  if (!INTEGRAL_TYPE_P (type) && !SCALAR_FLOAT_TYPE_P (type))
+    return false;
+
+  /* Do not try to vectorize bit-precision reductions.  */
+  if (!type_has_mode_precision_p (type))
+    return false;
+
+  tree vectype_in = NULL_TREE;
+
+  for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
+    {
+      stmt_vec_info def_stmt_info;
+      slp_tree slp_op;
+      tree op;
+      tree vectype;
+      enum vect_def_type dt;
+
+      if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_node, i, &op,
+			       &slp_op, &dt, &vectype, &def_stmt_info))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "use not simple.\n");
+	  return false;
+	}
+
+      if (!vectype)
+	{
+	  vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
+						 slp_op);
+	  if (!vectype)
+	    return false;
+	}
+
+      if (slp_node && !vect_maybe_update_slp_op_vectype (slp_op, vectype))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "incompatible vector types for invariants\n");
+	  return false;
+	}
+
+      if (i == STMT_VINFO_REDUC_IDX (stmt_info))
+	continue;
+
+      /* There should be at most one cycle def in the stmt.  */
+      if (VECTORIZABLE_CYCLE_DEF (dt))
+	return false;
+
+      /* To properly compute ncopies we are interested in the widest
+	 non-reduction input type in case we're looking at a widening
+	 accumulation that we later handle in vect transformation.  */
+      if (!vectype_in
+	  || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
+	      < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype)))))
+	vectype_in = vectype;
+    }
+
+  STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in;
+
+  stmt_vec_info reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
+
+  /* TODO: Support lane-reducing operations that do not directly participate
+     in loop reduction.  */
+  if (!reduc_info || STMT_VINFO_REDUC_IDX (stmt_info) < 0)
+    return false;
+
+  /* Lane-reducing patterns inside any inner loop of LOOP_VINFO are not
+     recognized.  */
+  gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_reduction_def);
+  gcc_assert (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION);
+
+  tree vphi_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
+
+  /* To accommodate lane-reducing operations of mixed input vectypes, choose
+     the input vectype with the least lanes for the reduction PHI statement,
+     which would result in the most ncopies for vectorized reduction results.  */
+  if (!vphi_vectype_in
+      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
+	  > GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vphi_vectype_in)))))
+    STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
+
+  int ncopies_for_cost;
+
+  if (slp_node)
+    {
+      /* For now, lane-reducing operations in an SLP node should only come
+	 from the same loop reduction path.
*/
+      gcc_assert (REDUC_GROUP_FIRST_ELEMENT (stmt_info));
+      ncopies_for_cost = 1;
+    }
+  else
+    {
+      ncopies_for_cost = vect_get_num_copies (loop_vinfo, vectype_in);
+      gcc_assert (ncopies_for_cost >= 1);
+    }
+
+  if (vect_is_emulated_mixed_dot_prod (stmt_info))
+    {
+      /* We need two extra invariants: one that contains the minimum signed
+	 value and one that contains half of its negative.  */
+      int prologue_stmts = 2;
+      unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
+					scalar_to_vec, stmt_info, 0,
+					vect_prologue);
+      if (dump_enabled_p ())
+	dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
+		     "extra prologue_cost = %d .\n", cost);
+
+      /* Three dot-products and a subtraction.  */
+      ncopies_for_cost *= 4;
+    }
+
+  record_stmt_cost (cost_vec, ncopies_for_cost, vector_stmt, stmt_info, 0,
+		    vect_body);
+
+  vect_reduction_use_partial_vector (loop_vinfo, reduc_info, slp_node, code,
+				     type, vectype_in);
+
+  STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
+  return true;
+}
+
 /* Function vectorizable_reduction.
 
    Check if STMT_INFO performs a reduction operation that can be vectorized.
@@ -7441,7 +7671,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   bool single_defuse_cycle = false;
   bool nested_cycle = false;
   bool double_reduc = false;
-  int vec_num;
   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
   tree cond_reduc_val = NULL_TREE;
@@ -7522,6 +7751,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	  (gimple_bb (reduc_def_phi)->loop_father));
   unsigned reduc_chain_length = 0;
   bool only_slp_reduc_chain = true;
+  bool only_lane_reduc_code_p = true;
   stmt_info = NULL;
   slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
   while (reduc_def != PHI_RESULT (reduc_def_phi))
@@ -7543,14 +7773,15 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	     all lanes here - even though we only will vectorize from
 	     the SLP node with live lane zero the other live lanes also
 	     need to be identified as part of a reduction to be able
-	     to skip code generation for them.  */
+	     to skip code generation for them.  For lane-reducing operations
+	     the vectorizable analysis needs the reduction PHI information.  */
 	  if (slp_for_stmt_info)
 	    {
 	      for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
 		if (STMT_VINFO_LIVE_P (s))
 		  STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
 	    }
-	  else if (STMT_VINFO_LIVE_P (vdef))
+	  else
 	    STMT_VINFO_REDUC_DEF (def) = phi_info;
 	  gimple_match_op op;
 	  if (!gimple_extract_op (vdef->stmt, &op))
@@ -7571,9 +7802,18 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 		  return false;
 		}
 	    }
-      else if (!stmt_info)
-	/* First non-conversion stmt.  */
-	stmt_info = vdef;
+      else
+	{
+	  /* First non-conversion stmt.  */
+	  if (!stmt_info)
+	    stmt_info = vdef;
+
+	  if (op.code != DOT_PROD_EXPR
+	      && op.code != WIDEN_SUM_EXPR
+	      && op.code != SAD_EXPR)
+	    only_lane_reduc_code_p = false;
+	}
+
       reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
       reduc_chain_length++;
       if (!stmt_info && slp_node)
@@ -7647,18 +7887,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   if (!type_has_mode_precision_p (op.type))
     return false;
 
-  /* For lane-reducing ops we're reducing the number of reduction PHIs
-     which means the only use of that may be in the lane-reducing operation.  */
-  if (lane_reduc_code_p
-      && reduc_chain_length != 1
-      && !only_slp_reduc_chain)
-    {
-      if (dump_enabled_p ())
-	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			 "lane-reducing reduction with extra stmts.\n");
-      return false;
-    }
-
   /* All uses but the last are expected to be defined in the loop.
The last use is the reduction variable.  In case of nested cycle this
     assumption is not true: we use reduc_index to record the index of the
@@ -7687,9 +7915,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 			     "use not simple.\n");
 	  return false;
 	}
-      if (i == STMT_VINFO_REDUC_IDX (stmt_info))
-	continue;
-
       /* For an IFN_COND_OP we might hit the reduction definition operand
 	 twice (once as definition, once as else).  */
       if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
@@ -7735,12 +7960,21 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
     }
   if (!vectype_in)
     vectype_in = STMT_VINFO_VECTYPE (phi_info);
-  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
 
-  enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
-  STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
+  /* If there is a normal (non-lane-reducing) operation in the loop reduction
+     path, then to ensure there will be enough copies to hold vectorized
+     results of the operation, we need to set the input vectype of the
+     reduction PHI to be the same as the reduction output vectype somewhere;
+     here is a suitable place.  Otherwise the input vectype is set to the one
+     with the least lanes, which can only be determined in the vectorizable
+     analysis routine of a lane-reducing operation.  */
+  if (!only_lane_reduc_code_p)
+    STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = STMT_VINFO_VECTYPE (phi_info);
+
+  enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info);
+  STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type;
   /* If we have a condition reduction, see if we can simplify it further.  */
-  if (v_reduc_type == COND_REDUCTION)
+  if (reduction_type == COND_REDUCTION)
     {
       if (slp_node)
 	return false;
@@ -7906,8 +8140,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
     }
 
   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
+  reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
 
-  vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
   if (reduction_type == TREE_CODE_REDUCTION)
     {
       /* Check whether it's ok to change the order of the computation.
@@ -8181,14 +8415,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
       && loop_vinfo->suggested_unroll_factor == 1)
     single_defuse_cycle = true;
 
-  if (single_defuse_cycle || lane_reduc_code_p)
+  if (single_defuse_cycle && !lane_reduc_code_p)
     {
       gcc_assert (op.code != COND_EXPR);
 
-      /* 4. Supportable by target?  */
-      bool ok = true;
-
-      /* 4.1. check support for the operation in the loop
+      /* 4. check support for the operation in the loop
 
 	 This isn't necessary for the lane reduction codes, since they
 	 can only be produced by pattern matching, and it's up to the
@@ -8197,14 +8428,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	 mixed-sign dot-products can be implemented using signed
 	 dot-products.  */
       machine_mode vec_mode = TYPE_MODE (vectype_in);
-      if (!lane_reduc_code_p
-	  && !directly_supported_p (op.code, vectype_in, optab_vector))
+      if (!directly_supported_p (op.code, vectype_in, optab_vector))
 	{
 	  if (dump_enabled_p ())
 	    dump_printf (MSG_NOTE, "op not supported by target.\n");
 	  if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
 	      || !vect_can_vectorize_without_simd_p (op.code))
-	    ok = false;
+	    single_defuse_cycle = false;
 	  else if (dump_enabled_p ())
 	    dump_printf (MSG_NOTE, "proceeding using word mode.\n");
@@ -8217,35 +8447,12 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	  dump_printf (MSG_NOTE, "using word mode not possible.\n");
 	  return false;
 	}
-
-      /* lane-reducing operations have to go through vect_transform_reduction.
- For the other cases try without the single cycle optimization. */ - if (!ok) - { - if (lane_reduc_code_p) - return false; - else - single_defuse_cycle = false; - } } STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle; - /* If the reduction stmt is one of the patterns that have lane - reduction embedded we cannot handle the case of ! single_defuse_cycle. */ - if ((ncopies > 1 && ! single_defuse_cycle) - && lane_reduc_code_p) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "multi def-use cycle not possible for lane-reducing " - "reduction operation\n"); - return false; - } - - if (slp_node - && !(!single_defuse_cycle - && !lane_reduc_code_p - && reduction_type != FOLD_LEFT_REDUCTION)) + /* Reduction type of lane-reducing operation is TREE_CODE_REDUCTION, the + below processing will be done in its own vectorizable function. */ + if (slp_node && reduction_type == FOLD_LEFT_REDUCTION) for (i = 0; i < (int) op.num_ops; i++) if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i])) { @@ -8255,36 +8462,24 @@ vectorizable_reduction (loop_vec_info loop_vinfo, return false; } - if (slp_node) - vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); - else - vec_num = 1; - vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn, reduction_type, ncopies, cost_vec); /* Cost the reduction op inside the loop if transformed via - vect_transform_reduction. Otherwise this is costed by the - separate vectorizable_* routines. */ - if (single_defuse_cycle || lane_reduc_code_p) - { - int factor = 1; - if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info)) - /* Three dot-products and a subtraction. */ - factor = 4; - record_stmt_cost (cost_vec, ncopies * factor, vector_stmt, - stmt_info, 0, vect_body); - } + vect_transform_reduction for non-lane-reducing operation. Otherwise + this is costed by the separate vectorizable_* routines. */ + if (single_defuse_cycle && !lane_reduc_code_p) + record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body); if (dump_enabled_p () && reduction_type == FOLD_LEFT_REDUCTION) dump_printf_loc (MSG_NOTE, vect_location, "using an in-order (fold-left) reduction.\n"); STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type; - /* All but single defuse-cycle optimized, lane-reducing and fold-left - reductions go through their own vectorizable_* routines. */ - if (!single_defuse_cycle - && !lane_reduc_code_p - && reduction_type != FOLD_LEFT_REDUCTION) + + /* All but single defuse-cycle optimized and fold-left reductions go + through their own vectorizable_* routines. 
*/ + if ((!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION) + || lane_reduc_code_p) { stmt_vec_info tem = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info)); @@ -8296,60 +8491,10 @@ vectorizable_reduction (loop_vec_info loop_vinfo, STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def; STMT_VINFO_DEF_TYPE (tem) = vect_internal_def; } - else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) - { - vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); - vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo); - internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type); - - if (reduction_type != FOLD_LEFT_REDUCTION - && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in) - && (cond_fn == IFN_LAST - || !direct_internal_fn_supported_p (cond_fn, vectype_in, - OPTIMIZE_FOR_SPEED))) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "can't operate on partial vectors because" - " no conditional operation is available.\n"); - LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; - } - else if (reduction_type == FOLD_LEFT_REDUCTION - && reduc_fn == IFN_LAST - && !expand_vec_cond_expr_p (vectype_in, - truth_type_for (vectype_in), - SSA_NAME)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "can't operate on partial vectors because" - " no conditional operation is available.\n"); - LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; - } - else if (reduction_type == FOLD_LEFT_REDUCTION - && internal_fn_mask_index (reduc_fn) == -1 - && FLOAT_TYPE_P (vectype_in) - && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "can't operate on partial vectors because" - " signed zeros cannot be preserved.\n"); - LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; - } - else - { - internal_fn mask_reduc_fn - = get_masked_reduction_fn (reduc_fn, vectype_in); + else + vect_reduction_use_partial_vector (loop_vinfo, reduc_info, slp_node, + op.code, op.type, vectype_in); - if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS) - vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num, - vectype_in, 1); - else - vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num, - vectype_in, NULL); - } - } return true; } @@ -8440,6 +8585,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo, class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); int i; int ncopies; + int stmt_ncopies; int vec_num; stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info); @@ -8463,15 +8609,28 @@ vect_transform_reduction (loop_vec_info loop_vinfo, gphi *reduc_def_phi = as_a (phi_info->stmt); int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info); tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info); + tree stmt_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info); + + /* Get input vectypes from the reduction PHI and the statement to be + transformed, these two vectypes may have different lanes when + lane-reducing operation is present. 
*/ + if (!vectype_in) + vectype_in = STMT_VINFO_REDUC_VECTYPE (reduc_info); + + if (!stmt_vectype_in) + stmt_vectype_in = STMT_VINFO_VECTYPE (stmt_info); if (slp_node) { ncopies = 1; + stmt_ncopies = 1; vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); } else { ncopies = vect_get_num_copies (loop_vinfo, vectype_in); + stmt_ncopies = vect_get_num_copies (loop_vinfo, stmt_vectype_in); + gcc_assert (stmt_ncopies >= 1 && stmt_ncopies <= ncopies); vec_num = 1; } @@ -8480,14 +8639,10 @@ vect_transform_reduction (loop_vec_info loop_vinfo, vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo); - bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in); - + bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, + stmt_vectype_in); /* Transform. */ - tree new_temp = NULL_TREE; - auto_vec vec_oprnds0; - auto_vec vec_oprnds1; - auto_vec vec_oprnds2; - tree def0; + auto_vec vec_oprnds[3]; if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n"); @@ -8510,8 +8665,6 @@ vect_transform_reduction (loop_vec_info loop_vinfo, == op.ops[internal_fn_else_index ((internal_fn) code)])); } - bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo); - vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); if (reduction_type == FOLD_LEFT_REDUCTION) { @@ -8519,7 +8672,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo, gcc_assert (code.is_tree_code () || cond_fn_p); return vectorize_fold_left_reduction (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, - code, reduc_fn, op.ops, op.num_ops, vectype_in, + code, reduc_fn, op.ops, op.num_ops, stmt_vectype_in, reduc_index, masks, lens); } @@ -8533,55 +8686,160 @@ vect_transform_reduction (loop_vec_info loop_vinfo, tree scalar_dest = gimple_get_lhs (stmt_info->stmt); tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out); - /* Get NCOPIES vector definitions for all operands except the reduction - definition. */ - if (!cond_fn_p) + gcc_assert (reduc_index < 3); + + if (slp_node) { - vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies, - single_defuse_cycle && reduc_index == 0 - ? NULL_TREE : op.ops[0], &vec_oprnds0, - single_defuse_cycle && reduc_index == 1 - ? NULL_TREE : op.ops[1], &vec_oprnds1, - op.num_ops == 3 - && !(single_defuse_cycle && reduc_index == 2) - ? op.ops[2] : NULL_TREE, &vec_oprnds2); + gcc_assert (!single_defuse_cycle && op.num_ops <= 3); + + for (i = 0; i < (int) op.num_ops; i++) + vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds[i]); } else { - /* For a conditional operation pass the truth type as mask - vectype. */ - gcc_assert (single_defuse_cycle - && (reduc_index == 1 || reduc_index == 2)); - vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies, - op.ops[0], truth_type_for (vectype_in), &vec_oprnds0, - reduc_index == 1 ? NULL_TREE : op.ops[1], - NULL_TREE, &vec_oprnds1, - reduc_index == 2 ? NULL_TREE : op.ops[2], - NULL_TREE, &vec_oprnds2); - } + int result_pos = 0; + + /* The input vectype of the reduction PHI determines copies of + vectorized def-use cycles, which might be more than effective copies + of vectorized lane-reducing reduction statements. This could be + complemented by generating extra trivial pass-through copies. For + example: + + int sum = 0; + for (i) + { + sum += d0[i] * d1[i]; // dot-prod + sum += abs(s0[i] - s1[i]); // sad + sum += n[i]; // normal + } + + The vector size is 128-bit,vectorization factor is 16. 
Reduction
+	 statements would be transformed as:
+
+	   vector<4> int sum_v0 = { 0, 0, 0, 0 };
+	   vector<4> int sum_v1 = { 0, 0, 0, 0 };
+	   vector<4> int sum_v2 = { 0, 0, 0, 0 };
+	   vector<4> int sum_v3 = { 0, 0, 0, 0 };
+
+	   for (i / 16)
+	     {
+	       sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
+	       sum_v1 = sum_v1;  // copy
+	       sum_v2 = sum_v2;  // copy
+	       sum_v3 = sum_v3;  // copy
+
+	       sum_v0 = sum_v0;  // copy
+	       sum_v1 = SAD (s0_v1[i: 0 ~ 7 ], s1_v1[i: 0 ~ 7 ], sum_v1);
+	       sum_v2 = SAD (s0_v2[i: 8 ~ 15], s1_v2[i: 8 ~ 15], sum_v2);
+	       sum_v3 = sum_v3;  // copy
+
+	       sum_v0 += n_v0[i: 0  ~ 3 ];
+	       sum_v1 += n_v1[i: 4  ~ 7 ];
+	       sum_v2 += n_v2[i: 8  ~ 11];
+	       sum_v3 += n_v3[i: 12 ~ 15];
+	     }
+
+	 Moreover, for higher instruction parallelism in the final vectorized
+	 loop, the effective vectorized lane-reducing statements are
+	 distributed evenly among all the def-use cycles.  In the above
+	 example, the SADs are generated into cycles other than that of the
+	 DOT_PROD.  */
+
+      if (stmt_ncopies < ncopies)
+	{
+	  gcc_assert (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR
+		      || code == SAD_EXPR);
+	  result_pos = reduc_info->reduc_result_pos;
+	  reduc_info->reduc_result_pos = (result_pos + stmt_ncopies) % ncopies;
+	  gcc_assert (result_pos >= 0 && result_pos < ncopies);
+	}
+
+      for (i = 0; i < MIN (3, (int) op.num_ops); i++)
+	{
+	  tree vectype = NULL_TREE;
+	  int used_ncopies = ncopies;
+
+	  if (cond_fn_p && i == 0)
+	    {
+	      /* For a conditional operation pass the truth type as mask
+		 vectype.  */
+	      gcc_assert (single_defuse_cycle && reduc_index > 0);
+	      vectype = truth_type_for (vectype_in);
+	    }
+
+	  if (i != reduc_index)
+	    {
+	      /* For a non-reduction operand, deduce the effective copies
+		 that are involved in vectorized def-use cycles based on the
+		 input vectype of the reduction statement.  */
+	      used_ncopies = stmt_ncopies;
+	    }
+	  else if (single_defuse_cycle)
+	    {
+	      /* For single def-use cycles get one copy of the vectorized
+		 reduction definition.  */
+	      used_ncopies = 1;
+	    }
+
+	  vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, used_ncopies,
+					 op.ops[i], &vec_oprnds[i], vectype);
+
+	  if (used_ncopies < ncopies)
+	    {
+	      vec_oprnds[i].safe_grow_cleared (ncopies);
+
+	      /* Find suitable def-use cycles to generate vectorized
+		 statements into, and reorder operands based on the
+		 selection.
*/ + if (i != reduc_index && result_pos) + { + int count = ncopies - used_ncopies; + int start = result_pos - count; + + if (start < 0) + { + count = result_pos; + start = 0; + } + + for (int j = used_ncopies - 1; j >= start; j--) + { + std::swap (vec_oprnds[i][j], vec_oprnds[i][j + count]); + gcc_assert (!vec_oprnds[i][j]); + } + } + } + } } - bool emulated_mixed_dot_prod - = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info); - FOR_EACH_VEC_ELT (vec_oprnds0, i, def0) + bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo); + bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info); + tree def0; + + FOR_EACH_VEC_ELT (vec_oprnds[0], i, def0) { gimple *new_stmt; - tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE }; - if (masked_loop_p && !mask_by_cond_expr) + tree new_temp = NULL_TREE; + tree vop[3] = { def0, vec_oprnds[1][i], NULL_TREE }; + + if (!vop[0] || !vop[1]) + { + tree reduc_vop = vec_oprnds[reduc_index][i]; + + /* Insert trivial copy if no need to generate vectorized + statement. */ + gcc_assert (reduc_vop && stmt_ncopies < ncopies); + + new_stmt = gimple_build_assign (vec_dest, reduc_vop); + new_temp = make_ssa_name (vec_dest, new_stmt); + gimple_set_lhs (new_stmt, new_temp); + vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi); + } + else if (masked_loop_p && !mask_by_cond_expr) { - /* No conditional ifns have been defined for dot-product yet. */ - gcc_assert (code != DOT_PROD_EXPR); + /* No conditional ifns have been defined for dot-product and sad + yet. */ + gcc_assert (code != DOT_PROD_EXPR && code != SAD_EXPR); /* Make sure that the reduction accumulator is vop[0]. */ if (reduc_index == 1) @@ -8590,7 +8848,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo, std::swap (vop[0], vop[1]); } tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks, - vec_num * ncopies, vectype_in, i); + vec_num * stmt_ncopies, + stmt_vectype_in, i); gcall *call = gimple_build_call_internal (cond_fn, 4, mask, vop[0], vop[1], vop[0]); new_temp = make_ssa_name (vec_dest, call); @@ -8602,12 +8861,13 @@ vect_transform_reduction (loop_vec_info loop_vinfo, else { if (op.num_ops >= 3) - vop[2] = vec_oprnds2[i]; + vop[2] = vec_oprnds[2][i]; if (masked_loop_p && mask_by_cond_expr) { tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks, - vec_num * ncopies, vectype_in, i); + vec_num * stmt_ncopies, + stmt_vectype_in, i); build_vect_cond_expr (code, vop, mask, gsi); } @@ -8634,16 +8894,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo, if (slp_node) slp_node->push_vec_def (new_stmt); - else if (single_defuse_cycle - && i < ncopies - 1) - { - if (reduc_index == 0) - vec_oprnds0.safe_push (gimple_get_lhs (new_stmt)); - else if (reduc_index == 1) - vec_oprnds1.safe_push (gimple_get_lhs (new_stmt)); - else if (reduc_index == 2) - vec_oprnds2.safe_push (gimple_get_lhs (new_stmt)); - } + else if (single_defuse_cycle && i < ncopies - 1) + vec_oprnds[reduc_index][i + 1] = gimple_get_lhs (new_stmt); else STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt); } diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index f8d8636b139..15331ca87f2 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -12093,11 +12093,20 @@ vectorizable_condition (vec_info *vinfo, vect_reduction_type reduction_type = TREE_CODE_REDUCTION; bool for_reduction = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)) != NULL; + if (for_reduction) + { + reduc_info = info_for_reduction (vinfo, stmt_info); + if (STMT_VINFO_REDUC_DEF (reduc_info) != vect_orig_stmt (stmt_info)) 
+ { + for_reduction = false; + reduc_info = NULL; + } + } + if (for_reduction) { if (slp_node) return false; - reduc_info = info_for_reduction (vinfo, stmt_info); reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); reduc_index = STMT_VINFO_REDUC_IDX (stmt_info); gcc_assert (reduction_type != EXTRACT_LAST_REDUCTION @@ -13273,6 +13282,8 @@ vect_analyze_stmt (vec_info *vinfo, NULL, NULL, node, cost_vec) || vectorizable_load (vinfo, stmt_info, NULL, NULL, node, cost_vec) || vectorizable_store (vinfo, stmt_info, NULL, NULL, node, cost_vec) + || vectorizable_lane_reducing (as_a (vinfo), + stmt_info, node, cost_vec) || vectorizable_reduction (as_a (vinfo), stmt_info, node, node_instance, cost_vec) || vectorizable_induction (as_a (vinfo), stmt_info, diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index db44d730b70..a923e1cd657 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -1399,6 +1399,12 @@ public: /* The vector type for performing the actual reduction. */ tree reduc_vectype; + /* For loop reduction with multiple vectorized results (ncopies > 1), a + lane-reducing operation participating in it may not use all of those + results, this field specifies result index starting from which any + following land-reducing operation would be assigned to. */ + int reduc_result_pos; + /* If IS_REDUC_INFO is true and if the vector code is performing N scalar reductions in parallel, this variable gives the initial scalar values of those N reductions. */ @@ -2430,6 +2436,8 @@ extern loop_vec_info vect_create_loop_vinfo (class loop *, vec_info_shared *, extern bool vectorizable_live_operation (vec_info *, stmt_vec_info, slp_tree, slp_instance, int, bool, stmt_vector_for_cost *); +extern bool vectorizable_lane_reducing (loop_vec_info, stmt_vec_info, + slp_tree, stmt_vector_for_cost *); extern bool vectorizable_reduction (loop_vec_info, stmt_vec_info, slp_tree, slp_instance, stmt_vector_for_cost *); -- 2.17.1