Updated the patch to support single-lane SLP nodes, and to determine the
input vectype for the reduction PHI during traversal of the reduction
statements.

Thanks,
Feng
---
 gcc/
	PR tree-optimization/114440
	* tree-vectorizer.h (vectorizable_lane_reducing): New function
	declaration.
	* tree-vect-stmts.cc (vect_analyze_stmt): Call new function
	vectorizable_lane_reducing to analyze lane-reducing operation.
	* tree-vect-loop.cc (vect_model_reduction_cost): Remove cost computation
	code related to emulated_mixed_dot_prod.
	(vect_reduction_update_partial_vector_usage): Compute ncopies in the
	original way for single-lane SLP nodes.
	(vectorizable_lane_reducing): New function.
	(vectorizable_reduction): Allow multiple lane-reducing operations in
	loop reduction.  Move some original lane-reducing related code to
	vectorizable_lane_reducing.
	(vect_transform_reduction): Extend transformation to support reduction
	statements with mixed input vectypes.

 gcc/testsuite/
	PR tree-optimization/114440
	* gcc.dg/vect/vect-reduc-chain-1.c: New test.
	* gcc.dg/vect/vect-reduc-chain-2.c: New test.
	* gcc.dg/vect/vect-reduc-chain-3.c: New test.
	* gcc.dg/vect/vect-reduc-chain-dot-slp-1.c: New test.
	* gcc.dg/vect/vect-reduc-chain-dot-slp-2.c: New test.
	* gcc.dg/vect/vect-reduc-chain-dot-slp-3.c: New test.
	* gcc.dg/vect/vect-reduc-chain-dot-slp-4.c: New test.
	* gcc.dg/vect/vect-reduc-dot-slp-1.c: New test.
---
 .../gcc.dg/vect/vect-reduc-chain-1.c          |  62 +++
 .../gcc.dg/vect/vect-reduc-chain-2.c          |  77 +++
 .../gcc.dg/vect/vect-reduc-chain-3.c          |  66 +++
 .../gcc.dg/vect/vect-reduc-chain-dot-slp-1.c  |  95 ++++
 .../gcc.dg/vect/vect-reduc-chain-dot-slp-2.c  |  67 +++
 .../gcc.dg/vect/vect-reduc-chain-dot-slp-3.c  |  79 +++
 .../gcc.dg/vect/vect-reduc-chain-dot-slp-4.c  |  63 +++
 .../gcc.dg/vect/vect-reduc-dot-slp-1.c        |  35 ++
 gcc/tree-vect-loop.cc                         | 501 ++++++++++++------
 gcc/tree-vect-stmts.cc                        |   2 +
 gcc/tree-vectorizer.h                         |   2 +
 11 files changed, 888 insertions(+), 161 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-3.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-4.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
new file mode 100644
index 00000000000..04bfc419dbd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
@@ -0,0 +1,62 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon } */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res,
+   SIGNEDNESS_2 char *restrict a,
+   SIGNEDNESS_2 char *restrict b,
+   SIGNEDNESS_2 char *restrict c,
+   SIGNEDNESS_2 char *restrict d,
+   SIGNEDNESS_1 int *restrict e)
+{
+  for (int i = 0; i < N; ++i)
+    {
+      res += a[i] * b[i];
+      res += c[i] * d[i];
+      res += e[i];
+    }
+  return res;
+}
+
+#define BASE ((SIGNEDNESS_2 int) -1 < 0 ?
-126 : 4) +#define OFFSET 20 + +int +main (void) +{ + check_vect (); + + SIGNEDNESS_2 char a[N], b[N]; + SIGNEDNESS_2 char c[N], d[N]; + SIGNEDNESS_1 int e[N]; + int expected = 0x12345; + for (int i = 0; i < N; ++i) + { + a[i] = BASE + i * 5; + b[i] = BASE + OFFSET + i * 4; + c[i] = BASE + i * 2; + d[i] = BASE + OFFSET + i * 3; + e[i] = i; + asm volatile ("" ::: "memory"); + expected += a[i] * b[i]; + expected += c[i] * d[i]; + expected += e[i]; + } + if (f (0x12345, a, b, c, d, e) != expected) + __builtin_abort (); +} + +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 2 "vect" { target vect_sdot_qi } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c new file mode 100644 index 00000000000..6c803b80120 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c @@ -0,0 +1,77 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_dotprod_neon } */ + +#include "tree-vect.h" + +#define N 50 + +#ifndef SIGNEDNESS_1 +#define SIGNEDNESS_1 signed +#define SIGNEDNESS_2 unsigned +#define SIGNEDNESS_3 signed +#define SIGNEDNESS_4 signed +#endif + +SIGNEDNESS_1 int __attribute__ ((noipa)) +fn (SIGNEDNESS_1 int res, + SIGNEDNESS_2 char *restrict a, + SIGNEDNESS_2 char *restrict b, + SIGNEDNESS_3 char *restrict c, + SIGNEDNESS_3 char *restrict d, + SIGNEDNESS_4 short *restrict e, + SIGNEDNESS_4 short *restrict f, + SIGNEDNESS_1 int *restrict g) +{ + for (int i = 0; i < N; ++i) + { + res += a[i] * b[i]; + res += i + 1; + res += c[i] * d[i]; + res += e[i] * f[i]; + res += g[i]; + } + return res; +} + +#define BASE2 ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4) +#define BASE3 ((SIGNEDNESS_3 int) -1 < 0 ? -126 : 4) +#define BASE4 ((SIGNEDNESS_4 int) -1 < 0 ? 
-1026 : 373) +#define OFFSET 20 + +int +main (void) +{ + check_vect (); + + SIGNEDNESS_2 char a[N], b[N]; + SIGNEDNESS_3 char c[N], d[N]; + SIGNEDNESS_4 short e[N], f[N]; + SIGNEDNESS_1 int g[N]; + int expected = 0x12345; + for (int i = 0; i < N; ++i) + { + a[i] = BASE2 + i * 5; + b[i] = BASE2 + OFFSET + i * 4; + c[i] = BASE3 + i * 2; + d[i] = BASE3 + OFFSET + i * 3; + e[i] = BASE4 + i * 6; + f[i] = BASE4 + OFFSET + i * 5; + g[i] = i; + asm volatile ("" ::: "memory"); + expected += a[i] * b[i]; + expected += i + 1; + expected += c[i] * d[i]; + expected += e[i] * f[i]; + expected += g[i]; + } + if (fn (0x12345, a, b, c, d, e, f, g) != expected) + __builtin_abort (); +} + +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_sdot_qi } } } } */ +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_udot_qi } } } } */ +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_sdot_hi } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c new file mode 100644 index 00000000000..a41e4b176c4 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c @@ -0,0 +1,66 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-require-effective-target vect_int } */ + +#include "tree-vect.h" + +#define N 50 + +#ifndef SIGNEDNESS_1 +#define SIGNEDNESS_1 signed +#define SIGNEDNESS_2 unsigned +#define SIGNEDNESS_3 signed +#endif + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res, + SIGNEDNESS_2 char *restrict a, + SIGNEDNESS_2 char *restrict b, + SIGNEDNESS_3 short *restrict c, + SIGNEDNESS_3 short *restrict d, + SIGNEDNESS_1 int *restrict e) +{ + for (int i = 0; i < N; ++i) + { + short diff = a[i] - b[i]; + SIGNEDNESS_2 short abs = diff < 0 ? -diff : diff; + res += abs; + res += c[i] * d[i]; + res += e[i]; + } + return res; +} + +#define BASE2 ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4) +#define BASE3 ((SIGNEDNESS_3 int) -1 < 0 ? -1236 : 373) +#define OFFSET 20 + +int +main (void) +{ + check_vect (); + + SIGNEDNESS_2 char a[N], b[N]; + SIGNEDNESS_3 short c[N], d[N]; + SIGNEDNESS_1 int e[N]; + int expected = 0x12345; + for (int i = 0; i < N; ++i) + { + a[i] = BASE2 + i * 5; + b[i] = BASE2 - i * 4; + c[i] = BASE3 + i * 2; + d[i] = BASE3 + OFFSET + i * 3; + e[i] = i; + asm volatile ("" ::: "memory"); + short diff = a[i] - b[i]; + SIGNEDNESS_2 short abs = diff < 0 ? -diff : diff; + expected += abs; + expected += c[i] * d[i]; + expected += e[i]; + } + if (f (0x12345, a, b, c, d, e) != expected) + __builtin_abort (); +} + +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = SAD_EXPR" "vect" { target vect_udot_qi } } } */ +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target vect_sdot_hi } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c new file mode 100644 index 00000000000..c2831fbcc8e --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c @@ -0,0 +1,95 @@ +/* Disabling epilogues until we find a better way to deal with scans. 
*/ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_dotprod_neon } */ + +#include "tree-vect.h" + +#ifndef SIGNEDNESS_1 +#define SIGNEDNESS_1 signed +#define SIGNEDNESS_2 signed +#endif + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res, + SIGNEDNESS_2 char *a, + SIGNEDNESS_2 char *b, + int step, int n) +{ + for (int i = 0; i < n; i++) + { + res += a[0] * b[0]; + res += a[1] * b[1]; + res += a[2] * b[2]; + res += a[3] * b[3]; + res += a[4] * b[4]; + res += a[5] * b[5]; + res += a[6] * b[6]; + res += a[7] * b[7]; + res += a[8] * b[8]; + res += a[9] * b[9]; + res += a[10] * b[10]; + res += a[11] * b[11]; + res += a[12] * b[12]; + res += a[13] * b[13]; + res += a[14] * b[14]; + res += a[15] * b[15]; + + a += step; + b += step; + } + + return res; +} + +#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4) +#define OFFSET 20 + +int +main (void) +{ + check_vect (); + + SIGNEDNESS_2 char a[100], b[100]; + int expected = 0x12345; + int step = 16; + int n = 2; + int t = 0; + + for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i) + { + a[i] = BASE + i * 5; + b[i] = BASE + OFFSET + i * 4; + asm volatile ("" ::: "memory"); + } + + for (int i = 0; i < n; i++) + { + asm volatile ("" ::: "memory"); + expected += a[t + 0] * b[t + 0]; + expected += a[t + 1] * b[t + 1]; + expected += a[t + 2] * b[t + 2]; + expected += a[t + 3] * b[t + 3]; + expected += a[t + 4] * b[t + 4]; + expected += a[t + 5] * b[t + 5]; + expected += a[t + 6] * b[t + 6]; + expected += a[t + 7] * b[t + 7]; + expected += a[t + 8] * b[t + 8]; + expected += a[t + 9] * b[t + 9]; + expected += a[t + 10] * b[t + 10]; + expected += a[t + 11] * b[t + 11]; + expected += a[t + 12] * b[t + 12]; + expected += a[t + 13] * b[t + 13]; + expected += a[t + 14] * b[t + 14]; + expected += a[t + 15] * b[t + 15]; + t += step; + } + + if (f (0x12345, a, b, step, n) != expected) + __builtin_abort (); +} + +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 16 "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c new file mode 100644 index 00000000000..4114264a364 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c @@ -0,0 +1,67 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_dotprod_neon } */ + +#include "tree-vect.h" + +#ifndef SIGNEDNESS_1 +#define SIGNEDNESS_1 signed +#define SIGNEDNESS_2 signed +#endif + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res, + SIGNEDNESS_2 char *a, + SIGNEDNESS_2 char *b, + int n) +{ + for (int i = 0; i < n; i++) + { + res += a[5 * i + 0] * b[5 * i + 0]; + res += a[5 * i + 1] * b[5 * i + 1]; + res += a[5 * i + 2] * b[5 * i + 2]; + res += a[5 * i + 3] * b[5 * i + 3]; + res += a[5 * i + 4] * b[5 * i + 4]; + } + + return res; +} + +#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? 
-126 : 4) +#define OFFSET 20 + +int +main (void) +{ + check_vect (); + + SIGNEDNESS_2 char a[100], b[100]; + int expected = 0x12345; + int n = 18; + + for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i) + { + a[i] = BASE + i * 5; + b[i] = BASE + OFFSET + i * 4; + asm volatile ("" ::: "memory"); + } + + for (int i = 0; i < n; i++) + { + asm volatile ("" ::: "memory"); + expected += a[5 * i + 0] * b[5 * i + 0]; + expected += a[5 * i + 1] * b[5 * i + 1]; + expected += a[5 * i + 2] * b[5 * i + 2]; + expected += a[5 * i + 3] * b[5 * i + 3]; + expected += a[5 * i + 4] * b[5 * i + 4]; + } + + if (f (0x12345, a, b, n) != expected) + __builtin_abort (); +} + +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 5 "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-3.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-3.c new file mode 100644 index 00000000000..2cdecc36d16 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-3.c @@ -0,0 +1,79 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_dotprod_neon } */ + +#include "tree-vect.h" + +#ifndef SIGNEDNESS_1 +#define SIGNEDNESS_1 signed +#define SIGNEDNESS_2 signed +#endif + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res, + SIGNEDNESS_2 short *a, + SIGNEDNESS_2 short *b, + int step, int n) +{ + for (int i = 0; i < n; i++) + { + res += a[0] * b[0]; + res += a[1] * b[1]; + res += a[2] * b[2]; + res += a[3] * b[3]; + res += a[4] * b[4]; + res += a[5] * b[5]; + res += a[6] * b[6]; + res += a[7] * b[7]; + + a += step; + b += step; + } + + return res; +} + +#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -1026 : 373) +#define OFFSET 20 + +int +main (void) +{ + check_vect (); + + SIGNEDNESS_2 short a[100], b[100]; + int expected = 0x12345; + int step = 8; + int n = 2; + int t = 0; + + for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i) + { + a[i] = BASE + i * 5; + b[i] = BASE + OFFSET + i * 4; + asm volatile ("" ::: "memory"); + } + + for (int i = 0; i < n; i++) + { + asm volatile ("" ::: "memory"); + expected += a[t + 0] * b[t + 0]; + expected += a[t + 1] * b[t + 1]; + expected += a[t + 2] * b[t + 2]; + expected += a[t + 3] * b[t + 3]; + expected += a[t + 4] * b[t + 4]; + expected += a[t + 5] * b[t + 5]; + expected += a[t + 6] * b[t + 6]; + expected += a[t + 7] * b[t + 7]; + t += step; + } + + if (f (0x12345, a, b, step, n) != expected) + __builtin_abort (); +} + +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 8 "vect" { target vect_sdot_hi } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-4.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-4.c new file mode 100644 index 00000000000..32c0f30c77b --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-4.c @@ -0,0 +1,63 @@ +/* Disabling epilogues until we find a better way to deal with scans. 
*/ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_dotprod_neon } */ + +#include "tree-vect.h" + +#ifndef SIGNEDNESS_1 +#define SIGNEDNESS_1 signed +#define SIGNEDNESS_2 signed +#endif + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res, + SIGNEDNESS_2 short *a, + SIGNEDNESS_2 short *b, + int n) +{ + for (int i = 0; i < n; i++) + { + res += a[3 * i + 0] * b[3 * i + 0]; + res += a[3 * i + 1] * b[3 * i + 1]; + res += a[3 * i + 2] * b[3 * i + 2]; + } + + return res; +} + +#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -1026 : 373) +#define OFFSET 20 + +int +main (void) +{ + check_vect (); + + SIGNEDNESS_2 short a[100], b[100]; + int expected = 0x12345; + int n = 18; + + for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i) + { + a[i] = BASE + i * 5; + b[i] = BASE + OFFSET + i * 4; + asm volatile ("" ::: "memory"); + } + + for (int i = 0; i < n; i++) + { + asm volatile ("" ::: "memory"); + expected += a[3 * i + 0] * b[3 * i + 0]; + expected += a[3 * i + 1] * b[3 * i + 1]; + expected += a[3 * i + 2] * b[3 * i + 2]; + } + + if (f (0x12345, a, b, n) != expected) + __builtin_abort (); +} + +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 3 "vect" { target vect_sdot_hi } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c new file mode 100644 index 00000000000..e17d6291f75 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c @@ -0,0 +1,35 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-do compile } */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_dotprod_neon } */ + +#include "tree-vect.h" + +#ifndef SIGNEDNESS_1 +#define SIGNEDNESS_1 signed +#define SIGNEDNESS_2 signed +#endif + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res0, + SIGNEDNESS_1 int res1, + SIGNEDNESS_1 int res2, + SIGNEDNESS_1 int res3, + SIGNEDNESS_2 short *a, + SIGNEDNESS_2 short *b) +{ + for (int i = 0; i < 64; i += 4) + { + res0 += a[i + 0] * b[i + 0]; + res1 += a[i + 1] * b[i + 1]; + res2 += a[i + 2] * b[i + 2]; + res3 += a[i + 3] * b[i + 3]; + } + + return res0 ^ res1 ^ res2 ^ res3; +} + +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump-not "vectorizing stmts using SLP" "vect" } } */ diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 35c50eb72cb..fb9259d115c 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -5324,8 +5324,6 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, if (!gimple_extract_op (orig_stmt_info->stmt, &op)) gcc_unreachable (); - bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info); - if (reduction_type == EXTRACT_LAST_REDUCTION) /* No extra instructions are needed in the prologue. The loop body operations are costed in vectorizable_condition. 
*/
@@ -5360,12 +5358,8 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
 	   initial result of the data reduction, initial value of the index
 	   reduction.  */
 	prologue_stmts = 4;
-      else if (emulated_mixed_dot_prod)
-	/* We need the initial reduction value and two invariants:
-	   one that contains the minimum signed value and one that
-	   contains half of its negative.  */
-	prologue_stmts = 3;
       else
+	/* We need the initial reduction value.  */
 	prologue_stmts = 1;
       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
 					 scalar_to_vec, stmt_info, 0,
@@ -7466,7 +7460,7 @@ vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo,
       vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
       unsigned nvectors;
 
-      if (slp_node)
+      if (slp_node && SLP_TREE_LANES (slp_node) > 1)
 	nvectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
       else
 	nvectors = vect_get_num_copies (loop_vinfo, vectype_in);
@@ -7478,6 +7472,150 @@ vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo,
     }
 }
 
+/* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
+   the context of LOOP_VINFO, and vector cost will be recorded in COST_VEC.
+   Now there are three such kinds of operations: dot-prod/widen-sum/sad
+   (sum-of-absolute-differences).
+
+   For a lane-reducing operation, the loop reduction path that it lies in
+   may contain a normal operation, or another lane-reducing operation of
+   different input type size, for example:
+
+     int sum = 0;
+     for (i)
+       {
+	 ...
+	 sum += d0[i] * d1[i];      // dot-prod
+	 sum += w[i];               // widen-sum
+	 sum += abs(s0[i] - s1[i]); // sad
+	 sum += n[i];               // normal
+	 ...
+       }
+
+   Vectorization factor is essentially determined by the operation whose input
+   vectype has the most lanes ("vector(16) char" in the example), while we
+   need to choose the input vectype with the least lanes ("vector(4) int" in
+   the example) for the reduction PHI statement.  */
+
+bool
+vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
+			    slp_tree slp_node, stmt_vector_for_cost *cost_vec)
+{
+  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
+  if (!stmt)
+    return false;
+
+  enum tree_code code = gimple_assign_rhs_code (stmt);
+
+  if (!lane_reducing_op_p (code))
+    return false;
+
+  tree type = TREE_TYPE (gimple_assign_lhs (stmt));
+
+  if (!INTEGRAL_TYPE_P (type) && !SCALAR_FLOAT_TYPE_P (type))
+    return false;
+
+  /* Do not try to vectorize bit-precision reductions.  */
+  if (!type_has_mode_precision_p (type))
+    return false;
+
+  for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
+    {
+      stmt_vec_info def_stmt_info;
+      slp_tree slp_op;
+      tree op;
+      tree vectype;
+      enum vect_def_type dt;
+
+      if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_node, i, &op,
+			       &slp_op, &dt, &vectype, &def_stmt_info))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "use not simple.\n");
+	  return false;
+	}
+
+      if (!vectype)
+	{
+	  vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
+						 slp_op);
+	  if (!vectype)
+	    return false;
+	}
+
+      if (slp_node && !vect_maybe_update_slp_op_vectype (slp_op, vectype))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "incompatible vector types for invariants\n");
+	  return false;
+	}
+
+      if (i == STMT_VINFO_REDUC_IDX (stmt_info))
+	continue;
+
+      /* There should be at most one cycle def in the stmt.  */
+      if (VECTORIZABLE_CYCLE_DEF (dt))
+	return false;
+    }
+
+  stmt_vec_info reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
+
+  /* TODO: Support lane-reducing operation that does not directly participate
+     in loop reduction.  */
+  if (!reduc_info || STMT_VINFO_REDUC_IDX (stmt_info) < 0)
+    return false;
+
+  /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not
+     recognized.  */
+  gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_reduction_def);
+  gcc_assert (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION);
+
+  tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info);
+  int ncopies_for_cost;
+
+  if (slp_node && SLP_TREE_LANES (slp_node) > 1)
+    {
+      /* Now lane-reducing operations in a non-single-lane SLP node should
+	 only come from the same loop reduction path.  */
+      gcc_assert (REDUC_GROUP_FIRST_ELEMENT (stmt_info));
+      ncopies_for_cost = 1;
+    }
+  else
+    {
+      ncopies_for_cost = vect_get_num_copies (loop_vinfo, vectype_in);
+      gcc_assert (ncopies_for_cost >= 1);
+    }
+
+  if (vect_is_emulated_mixed_dot_prod (stmt_info))
+    {
+      /* We need two extra invariants: one that contains the minimum signed
+	 value and one that contains half of its negative.  */
+      int prologue_stmts = 2;
+      unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
+					scalar_to_vec, stmt_info, 0,
+					vect_prologue);
+      if (dump_enabled_p ())
+	dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
+		     "extra prologue_cost = %d .\n", cost);
+
+      /* Three dot-products and a subtraction.  */
+      ncopies_for_cost *= 4;
+    }
+
+  record_stmt_cost (cost_vec, ncopies_for_cost, vector_stmt, stmt_info, 0,
+		    vect_body);
+
+  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+    vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
+						slp_node, code, type,
+						vectype_in);
+
+  STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
+  return true;
+}
+
 /* Function vectorizable_reduction.
 
    Check if STMT_INFO performs a reduction operation that can be vectorized.
@@ -7643,7 +7781,9 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
     {
       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
-      if (STMT_VINFO_REDUC_IDX (vdef) == -1)
+      int reduc_idx = STMT_VINFO_REDUC_IDX (vdef);
+
+      if (reduc_idx == -1)
 	{
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -7689,10 +7829,43 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	      return false;
 	    }
 	}
-      else if (!stmt_info)
-	/* First non-conversion stmt.  */
-	stmt_info = vdef;
-      reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
+      else
+	{
+	  /* First non-conversion stmt.  */
+	  if (!stmt_info)
+	    stmt_info = vdef;
+
+	  if (lane_reducing_op_p (op.code))
+	    {
+	      unsigned group_size = slp_node ? SLP_TREE_LANES (slp_node) : 0;
+	      tree op_type = TREE_TYPE (op.ops[0]);
+	      tree new_vectype_in = get_vectype_for_scalar_type (loop_vinfo,
+								 op_type,
+								 group_size);
+
+	      /* The last operand of a lane-reducing operation must be the
+		 addend for the reduction.  */
+	      gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);
+
+	      if (!new_vectype_in)
+		return false;
+
+	      STMT_VINFO_REDUC_VECTYPE_IN (vdef) = new_vectype_in;
+
+	      /* To accommodate lane-reducing operations of mixed input
+		 vectypes, choose input vectype with the least lanes for the
+		 reduction PHI statement, which would result in the most
+		 ncopies for vectorized reduction results.
*/ + if (!vectype_in + || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) + < GET_MODE_SIZE (SCALAR_TYPE_MODE (op_type)))) + vectype_in = new_vectype_in; + } + else + vectype_in = STMT_VINFO_VECTYPE (phi_info); + } + + reduc_def = op.ops[reduc_idx]; reduc_chain_length++; if (!stmt_info && slp_node) slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0]; @@ -7750,6 +7923,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo, tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out; + STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in; + gimple_match_op op; if (!gimple_extract_op (stmt_info->stmt, &op)) gcc_unreachable (); @@ -7763,18 +7938,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo, if (!type_has_mode_precision_p (op.type)) return false; - /* For lane-reducing ops we're reducing the number of reduction PHIs - which means the only use of that may be in the lane-reducing operation. */ - if (lane_reducing - && reduc_chain_length != 1 - && !only_slp_reduc_chain) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "lane-reducing reduction with extra stmts.\n"); - return false; - } - /* Lane-reducing ops also never can be used in a SLP reduction group since we'll mix lanes belonging to different reductions. But it's OK to use them in a reduction chain or when the reduction group @@ -7818,9 +7981,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo, "use not simple.\n"); return false; } - if (i == STMT_VINFO_REDUC_IDX (stmt_info)) - continue; - /* For an IFN_COND_OP we might hit the reduction definition operand twice (once as definition, once as else). */ if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)]) @@ -7836,16 +7996,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo, = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op.ops[i]), slp_op[i]); - /* To properly compute ncopies we are interested in the widest - non-reduction input type in case we're looking at a widening - accumulation that we later handle in vect_transform_reduction. */ - if (lane_reducing - && vectype_op[i] - && (!vectype_in - || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) - < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i])))))) - vectype_in = vectype_op[i]; - /* Record how the non-reduction-def value of COND_EXPR is defined. ??? For a chain of multiple CONDs we'd have to match them up all. */ if (op.code == COND_EXPR && reduc_chain_length == 1) @@ -7864,19 +8014,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo, } } } - if (!vectype_in) - vectype_in = STMT_VINFO_VECTYPE (phi_info); - STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in; - - /* Each lane-reducing operation has its own input vectype, while reduction - PHI records the input vectype with least lanes. */ - if (lane_reducing) - STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in; - enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info); - STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type; + enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info); + STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type; /* If we have a condition reduction, see if we can simplify it further. 
*/ - if (v_reduc_type == COND_REDUCTION) + if (reduction_type == COND_REDUCTION) { if (slp_node && SLP_TREE_LANES (slp_node) != 1) return false; @@ -8042,8 +8184,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo, } STMT_VINFO_REDUC_CODE (reduc_info) = orig_code; + reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); - vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); if (reduction_type == TREE_CODE_REDUCTION) { /* Check whether it's ok to change the order of the computation. @@ -8329,14 +8471,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo, && loop_vinfo->suggested_unroll_factor == 1) single_defuse_cycle = true; - if (single_defuse_cycle || lane_reducing) + if (single_defuse_cycle && !lane_reducing) { gcc_assert (op.code != COND_EXPR); - /* 4. Supportable by target? */ - bool ok = true; - - /* 4.1. check support for the operation in the loop + /* 4. check support for the operation in the loop This isn't necessary for the lane reduction codes, since they can only be produced by pattern matching, and it's up to the @@ -8345,14 +8484,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo, mixed-sign dot-products can be implemented using signed dot-products. */ machine_mode vec_mode = TYPE_MODE (vectype_in); - if (!lane_reducing - && !directly_supported_p (op.code, vectype_in, optab_vector)) + if (!directly_supported_p (op.code, vectype_in, optab_vector)) { if (dump_enabled_p ()) dump_printf (MSG_NOTE, "op not supported by target.\n"); if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD) || !vect_can_vectorize_without_simd_p (op.code)) - ok = false; + single_defuse_cycle = false; else if (dump_enabled_p ()) dump_printf (MSG_NOTE, "proceeding using word mode.\n"); @@ -8365,35 +8503,12 @@ vectorizable_reduction (loop_vec_info loop_vinfo, dump_printf (MSG_NOTE, "using word mode not possible.\n"); return false; } - - /* lane-reducing operations have to go through vect_transform_reduction. - For the other cases try without the single cycle optimization. */ - if (!ok) - { - if (lane_reducing) - return false; - else - single_defuse_cycle = false; - } } STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle; - /* If the reduction stmt is one of the patterns that have lane - reduction embedded we cannot handle the case of ! single_defuse_cycle. */ - if ((ncopies > 1 && ! single_defuse_cycle) - && lane_reducing) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "multi def-use cycle not possible for lane-reducing " - "reduction operation\n"); - return false; - } - - if (slp_node - && !(!single_defuse_cycle - && !lane_reducing - && reduction_type != FOLD_LEFT_REDUCTION)) + /* Reduction type of lane-reducing operation is TREE_CODE_REDUCTION, the + below processing will be done in its own vectorizable function. */ + if (slp_node && reduction_type == FOLD_LEFT_REDUCTION) for (i = 0; i < (int) op.num_ops; i++) if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i])) { @@ -8406,28 +8521,21 @@ vectorizable_reduction (loop_vec_info loop_vinfo, vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn, reduction_type, ncopies, cost_vec); /* Cost the reduction op inside the loop if transformed via - vect_transform_reduction. Otherwise this is costed by the - separate vectorizable_* routines. */ - if (single_defuse_cycle || lane_reducing) - { - int factor = 1; - if (vect_is_emulated_mixed_dot_prod (stmt_info)) - /* Three dot-products and a subtraction. 
*/ - factor = 4; - record_stmt_cost (cost_vec, ncopies * factor, vector_stmt, - stmt_info, 0, vect_body); - } + vect_transform_reduction for non-lane-reducing operation. Otherwise + this is costed by the separate vectorizable_* routines. */ + if (single_defuse_cycle && !lane_reducing) + record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body); if (dump_enabled_p () && reduction_type == FOLD_LEFT_REDUCTION) dump_printf_loc (MSG_NOTE, vect_location, "using an in-order (fold-left) reduction.\n"); STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type; - /* All but single defuse-cycle optimized, lane-reducing and fold-left - reductions go through their own vectorizable_* routines. */ - if (!single_defuse_cycle - && !lane_reducing - && reduction_type != FOLD_LEFT_REDUCTION) + + /* All but single defuse-cycle optimized and fold-left reductions go + through their own vectorizable_* routines. */ + if ((!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION) + || lane_reducing) { stmt_vec_info tem = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info)); @@ -8533,6 +8641,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo, class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); int i; int ncopies; + int stmt_ncopies; int vec_num; stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info); @@ -8556,15 +8665,28 @@ vect_transform_reduction (loop_vec_info loop_vinfo, gphi *reduc_def_phi = as_a (phi_info->stmt); int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info); tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info); + tree stmt_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info); - if (slp_node) + /* Get input vectypes from the reduction PHI and the statement to be + transformed, these two vectypes may have different lanes when + lane-reducing operation is present. */ + if (!vectype_in) + vectype_in = STMT_VINFO_REDUC_VECTYPE (reduc_info); + + if (!stmt_vectype_in) + stmt_vectype_in = STMT_VINFO_VECTYPE (stmt_info); + + if (slp_node && SLP_TREE_LANES (slp_node) > 1) { ncopies = 1; + stmt_ncopies = 1; vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); } else { ncopies = vect_get_num_copies (loop_vinfo, vectype_in); + stmt_ncopies = vect_get_num_copies (loop_vinfo, stmt_vectype_in); + gcc_assert (stmt_ncopies >= 1 && stmt_ncopies <= ncopies); vec_num = 1; } @@ -8573,14 +8695,10 @@ vect_transform_reduction (loop_vec_info loop_vinfo, vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo); - bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in); - + bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, + stmt_vectype_in); /* Transform. 
*/
-  tree new_temp = NULL_TREE;
-  auto_vec<tree> vec_oprnds0;
-  auto_vec<tree> vec_oprnds1;
-  auto_vec<tree> vec_oprnds2;
-  tree def0;
+  auto_vec<tree> vec_oprnds[3];
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
@@ -8604,8 +8722,6 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 		     == op.ops[internal_fn_else_index ((internal_fn) code)]));
     }
 
-  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
-
   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
   if (reduction_type == FOLD_LEFT_REDUCTION)
     {
@@ -8613,7 +8729,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
       gcc_assert (code.is_tree_code () || cond_fn_p);
       return vectorize_fold_left_reduction
 	  (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
-	   code, reduc_fn, op.ops, op.num_ops, vectype_in,
+	   code, reduc_fn, op.ops, op.num_ops, stmt_vectype_in,
 	   reduc_index, masks, lens);
     }
 
@@ -8624,55 +8740,124 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
   tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
 
-  /* Get NCOPIES vector definitions for all operands except the reduction
-     definition.  */
-  if (!cond_fn_p)
+  gcc_assert (reduc_index < 3);
+
+  if (slp_node && SLP_TREE_LANES (slp_node) > 1)
     {
-      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
-			 single_defuse_cycle && reduc_index == 0
-			 ? NULL_TREE : op.ops[0], &vec_oprnds0,
-			 single_defuse_cycle && reduc_index == 1
-			 ? NULL_TREE : op.ops[1], &vec_oprnds1,
-			 op.num_ops == 3
-			 && !(single_defuse_cycle && reduc_index == 2)
-			 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
+      gcc_assert (!single_defuse_cycle);
+
+      for (i = 0; i < MIN (3, (int) op.num_ops); i++)
+	vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds[i]);
     }
   else
     {
-      /* For a conditional operation pass the truth type as mask
-	 vectype.  */
-      gcc_assert (single_defuse_cycle
-		  && (reduc_index == 1 || reduc_index == 2));
-      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
-			 op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
-			 reduc_index == 1 ? NULL_TREE : op.ops[1],
-			 NULL_TREE, &vec_oprnds1,
-			 reduc_index == 2 ? NULL_TREE : op.ops[2],
-			 NULL_TREE, &vec_oprnds2);
-    }
+      /* The input vectype of the reduction PHI determines copies of
+	 vectorized def-use cycles, which might be more than effective copies
+	 of vectorized lane-reducing reduction statements.  This could be
+	 complemented by generating extra trivial pass-through copies.  For
+	 example:
+
+	   int sum = 0;
+	   for (i)
+	     {
+	       sum += d0[i] * d1[i];      // dot-prod
+	       sum += abs(s0[i] - s1[i]); // sad
+	       sum += n[i];               // normal
+	     }
+
+	 The vector size is 128-bit, vectorization factor is 16.  Reduction
+	 statements would be transformed as:
+
+	   vector<4> int sum_v0 = { 0, 0, 0, 0 };
+	   vector<4> int sum_v1 = { 0, 0, 0, 0 };
+	   vector<4> int sum_v2 = { 0, 0, 0, 0 };
+	   vector<4> int sum_v3 = { 0, 0, 0, 0 };
+
+	   for (i / 16)
+	     {
+	       sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
+	       sum_v1 = sum_v1;  // copy
+	       sum_v2 = sum_v2;  // copy
+	       sum_v3 = sum_v3;  // copy
+
+	       sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
+	       sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
+	       sum_v2 = sum_v2;  // copy
+	       sum_v3 = sum_v3;  // copy
+
+	       sum_v0 += n_v0[i: 0 ~ 3 ];
+	       sum_v1 += n_v1[i: 4 ~ 7 ];
+	       sum_v2 += n_v2[i: 8 ~ 11];
+	       sum_v3 += n_v3[i: 12 ~ 15];
+	     }
+       */
+
+      for (i = 0; i < MIN (3, (int) op.num_ops); i++)
+	{
+	  tree vectype = NULL_TREE;
+	  int used_ncopies = ncopies;
+
+	  if (cond_fn_p && i == 0)
+	    {
+	      /* For a conditional operation pass the truth type as mask
+		 vectype.  */
+	      gcc_assert (single_defuse_cycle && reduc_index > 0);
+	      vectype = truth_type_for (vectype_in);
+	    }
 
-  /* For single def-use cycles get one copy of the vectorized reduction
-     definition.  */
-  if (single_defuse_cycle)
-    {
-      gcc_assert (!slp_node);
-      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
-				     op.ops[reduc_index],
-				     reduc_index == 0 ? &vec_oprnds0
-				     : (reduc_index == 1 ? &vec_oprnds1
-					: &vec_oprnds2));
+
+	  if (i != reduc_index)
+	    {
+	      /* For non-reduction operand, deduce effective copies that are
+		 involved in vectorized def-use cycles based on the input
+		 vectype of the reduction statement.  */
+	      used_ncopies = stmt_ncopies;
+	    }
+	  else if (single_defuse_cycle)
+	    {
+	      /* For single def-use cycles get one copy of the vectorized
+		 reduction definition.  */
+	      used_ncopies = 1;
+	    }
+
+	  if (slp_node)
+	    vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds[i]);
+	  else
+	    vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, used_ncopies,
+					   op.ops[i], &vec_oprnds[i], vectype);
+
+	  if (used_ncopies < ncopies)
+	    vec_oprnds[i].safe_grow_cleared (ncopies);
+	}
+    }
 
+  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
   bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
+  tree def0;
 
-  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
+  FOR_EACH_VEC_ELT (vec_oprnds[0], i, def0)
     {
       gimple *new_stmt;
-      tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
-      if (masked_loop_p && !mask_by_cond_expr)
+      tree new_temp = NULL_TREE;
+      tree vop[3] = { def0, vec_oprnds[1][i], NULL_TREE };
+
+      if (!vop[0] || !vop[1])
+	{
+	  tree reduc_vop = vec_oprnds[reduc_index][i];
+
+	  /* Insert trivial copy if no need to generate vectorized
+	     statement.  */
+	  gcc_assert (reduc_vop && stmt_ncopies < ncopies);
+
+	  new_stmt = gimple_build_assign (vec_dest, reduc_vop);
+	  new_temp = make_ssa_name (vec_dest, new_stmt);
+	  gimple_set_lhs (new_stmt, new_temp);
+	  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
+	}
+      else if (masked_loop_p && !mask_by_cond_expr)
 	{
-	  /* No conditional ifns have been defined for dot-product yet.  */
-	  gcc_assert (code != DOT_PROD_EXPR);
+	  /* No conditional ifns have been defined for dot-product and sad
+	     yet.  */
+	  gcc_assert (code != DOT_PROD_EXPR && code != SAD_EXPR);
 
 	  /* Make sure that the reduction accumulator is vop[0].
*/ if (reduc_index == 1) @@ -8681,7 +8866,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo, std::swap (vop[0], vop[1]); } tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks, - vec_num * ncopies, vectype_in, i); + vec_num * stmt_ncopies, + stmt_vectype_in, i); gcall *call = gimple_build_call_internal (cond_fn, 4, mask, vop[0], vop[1], vop[0]); new_temp = make_ssa_name (vec_dest, call); @@ -8693,12 +8879,13 @@ vect_transform_reduction (loop_vec_info loop_vinfo, else { if (op.num_ops >= 3) - vop[2] = vec_oprnds2[i]; + vop[2] = vec_oprnds[2][i]; if (masked_loop_p && mask_by_cond_expr) { tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks, - vec_num * ncopies, vectype_in, i); + vec_num * stmt_ncopies, + stmt_vectype_in, i); build_vect_cond_expr (code, vop, mask, gsi); } @@ -8725,16 +8912,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo, if (slp_node) slp_node->push_vec_def (new_stmt); - else if (single_defuse_cycle - && i < ncopies - 1) - { - if (reduc_index == 0) - vec_oprnds0.safe_push (gimple_get_lhs (new_stmt)); - else if (reduc_index == 1) - vec_oprnds1.safe_push (gimple_get_lhs (new_stmt)); - else if (reduc_index == 2) - vec_oprnds2.safe_push (gimple_get_lhs (new_stmt)); - } + else if (single_defuse_cycle && i < ncopies - 1) + vec_oprnds[reduc_index][i + 1] = gimple_get_lhs (new_stmt); else STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt); } diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index dbdb59054e0..81036235a27 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -13357,6 +13357,8 @@ vect_analyze_stmt (vec_info *vinfo, NULL, NULL, node, cost_vec) || vectorizable_load (vinfo, stmt_info, NULL, NULL, node, cost_vec) || vectorizable_store (vinfo, stmt_info, NULL, NULL, node, cost_vec) + || vectorizable_lane_reducing (as_a (vinfo), + stmt_info, node, cost_vec) || vectorizable_reduction (as_a (vinfo), stmt_info, node, node_instance, cost_vec) || vectorizable_induction (as_a (vinfo), stmt_info, diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 6bb0f5c3a56..3f7db707d97 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -2443,6 +2443,8 @@ extern loop_vec_info vect_create_loop_vinfo (class loop *, vec_info_shared *, extern bool vectorizable_live_operation (vec_info *, stmt_vec_info, slp_tree, slp_instance, int, bool, stmt_vector_for_cost *); +extern bool vectorizable_lane_reducing (loop_vec_info, stmt_vec_info, + slp_tree, stmt_vector_for_cost *); extern bool vectorizable_reduction (loop_vec_info, stmt_vec_info, slp_tree, slp_instance, stmt_vector_for_cost *); -- 2.17.1 ________________________________________ From: Richard Biener Sent: Tuesday, June 4, 2024 9:17 PM To: Feng Xue OS Cc: Tamar Christina; gcc-patches@gcc.gnu.org Subject: Re: [PATCH 5/6] vect: Support multiple lane-reducing operations for loop reduction [PR114440] On Sun, Jun 2, 2024 at 4:13?PM Feng Xue OS wrote: > > Please see my comments below. > > Thanks, > Feng > > > On Thu, May 30, 2024 at 4:55?PM Feng Xue OS wrote: > >> > >> For lane-reducing operation(dot-prod/widen-sum/sad) in loop reduction, current > >> vectorizer could only handle the pattern if the reduction chain does not > >> contain other operation, no matter the other is normal or lane-reducing. > >> > >> Actually, to allow multiple arbitray lane-reducing operations, we need to > >> support vectorization of loop reduction chain with mixed input vectypes. 
Since > >> lanes of vectype may vary with operation, the effective ncopies of vectorized > >> statements for operation also may not be same to each other, this causes > >> mismatch on vectorized def-use cycles. A simple way is to align all operations > >> with the one that has the most ncopies, the gap could be complemented by > >> generating extra trival pass-through copies. For example: > >> > >> int sum = 0; > >> for (i) > >> { > >> sum += d0[i] * d1[i]; // dot-prod > >> sum += w[i]; // widen-sum > >> sum += abs(s0[i] - s1[i]); // sad > >> sum += n[i]; // normal > >> } > >> > >> The vector size is 128-bit?vectorization factor is 16. Reduction statements > >> would be transformed as: > >> > >> vector<4> int sum_v0 = { 0, 0, 0, 0 }; > >> vector<4> int sum_v1 = { 0, 0, 0, 0 }; > >> vector<4> int sum_v2 = { 0, 0, 0, 0 }; > >> vector<4> int sum_v3 = { 0, 0, 0, 0 }; > >> > >> for (i / 16) > >> { > >> sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0); > >> sum_v1 = sum_v1; // copy > >> sum_v2 = sum_v2; // copy > >> sum_v3 = sum_v3; // copy > >> > >> sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0); > >> sum_v1 = sum_v1; // copy > >> sum_v2 = sum_v2; // copy > >> sum_v3 = sum_v3; // copy > >> > >> sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0); > >> sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1); > >> sum_v2 = sum_v2; // copy > >> sum_v3 = sum_v3; // copy > >> > >> sum_v0 += n_v0[i: 0 ~ 3 ]; > >> sum_v1 += n_v1[i: 4 ~ 7 ]; > >> sum_v2 += n_v2[i: 8 ~ 11]; > >> sum_v3 += n_v3[i: 12 ~ 15]; > >> } > >> > >> Thanks, > >> Feng > >> > >> ... > >> > >> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > >> index 20c99f11e9a..b5849dbb08a 100644 > >> --- a/gcc/tree-vect-loop.cc > >> +++ b/gcc/tree-vect-loop.cc > >> @@ -5322,8 +5322,6 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, > >> if (!gimple_extract_op (orig_stmt_info->stmt, &op)) > >> gcc_unreachable (); > >> > >> - bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info); > >> - > >> if (reduction_type == EXTRACT_LAST_REDUCTION) > >> /* No extra instructions are needed in the prologue. The loop body > >> operations are costed in vectorizable_condition. */ > >> @@ -5358,12 +5356,8 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, > >> initial result of the data reduction, initial value of the index > >> reduction. */ > >> prologue_stmts = 4; > >> - else if (emulated_mixed_dot_prod) > >> - /* We need the initial reduction value and two invariants: > >> - one that contains the minimum signed value and one that > >> - contains half of its negative. */ > >> - prologue_stmts = 3; > >> else > >> + /* We need the initial reduction value. */ > >> prologue_stmts = 1; > >> prologue_cost += record_stmt_cost (cost_vec, prologue_stmts, > >> scalar_to_vec, stmt_info, 0, > >> @@ -7464,6 +7458,169 @@ vect_reduction_use_partial_vector (loop_vec_info loop_vinfo, > >> } > >> } > >> > >> +/* Check if STMT_INFO is a lane-reducing operation that can be vectorized in > >> + the context of LOOP_VINFO, and vector cost will be recorded in COST_VEC. > >> + Now there are three such kinds of operations: dot-prod/widen-sum/sad > >> + (sum-of-absolute-differences). > >> + > >> + For a lane-reducing operation, the loop reduction path that it lies in, > >> + may contain normal operation, or other lane-reducing operation of different > >> + input type size, an example as: > >> + > >> + int sum = 0; > >> + for (i) > >> + { > >> + ... 
> >> + sum += d0[i] * d1[i]; // dot-prod > >> + sum += w[i]; // widen-sum > >> + sum += abs(s0[i] - s1[i]); // sad > >> + sum += n[i]; // normal > >> + ... > >> + } > >> + > >> + Vectorization factor is essentially determined by operation whose input > >> + vectype has the most lanes ("vector(16) char" in the example), while we > >> + need to choose input vectype with the least lanes ("vector(4) int" in the > >> + example) for the reduction PHI statement. */ > >> + > >> +bool > >> +vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info, > >> + slp_tree slp_node, stmt_vector_for_cost *cost_vec) > >> +{ > >> + gassign *stmt = dyn_cast (stmt_info->stmt); > >> + if (!stmt) > >> + return false; > >> + > >> + enum tree_code code = gimple_assign_rhs_code (stmt); > >> + > >> + if (!lane_reducing_op_p (code)) > >> + return false; > > > > Can you make sure to return false if STMT_VINFO_REDUC_IDX == -1 > > thus the op is not part of a reduction chain/path? > > > > As I planed, in the 2nd stage patches WIP, this function will also handle > lane-reducing operation that does not directly participate reduction, like: > > temp = dot_prod1 + dot_prod2; > sum += temp; > > In this case, STMT_VINFO_REDUC_IDX of dot_prod1/2 == -1 > > For current work, the check is needed to filter out non-reduction statement, > but since it is expected to be removed later, so the check is placed at a late > point. > > >> + tree type = TREE_TYPE (gimple_assign_lhs (stmt)); > >> + > >> + if (!INTEGRAL_TYPE_P (type) && !SCALAR_FLOAT_TYPE_P (type)) > >> + return false; > >> + > >> + /* Do not try to vectorize bit-precision reductions. */ > >> + if (!type_has_mode_precision_p (type)) > >> + return false; > >> + > >> + tree vectype_in = NULL_TREE; > >> + > >> + for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++) > >> + { > >> + stmt_vec_info def_stmt_info; > >> + slp_tree slp_op; > >> + tree op; > >> + tree vectype; > >> + enum vect_def_type dt; > >> + > >> + if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_node, i, &op, > >> + &slp_op, &dt, &vectype, &def_stmt_info)) > >> + { > >> + if (dump_enabled_p ()) > >> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > >> + "use not simple.\n"); > >> + return false; > >> + } > >> + > >> + if (!vectype) > >> + { > >> + vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op), > >> + slp_op); > >> + if (!vectype) > >> + return false; > >> + } > >> + > >> + if (slp_node && !vect_maybe_update_slp_op_vectype (slp_op, vectype)) > > > > Please avoid this during transform. > > This function is only for analysis not transform. > > >> + { > >> + if (dump_enabled_p ()) > >> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > >> + "incompatible vector types for invariants\n"); > >> + return false; > >> + } > >> + > >> + if (i == STMT_VINFO_REDUC_IDX (stmt_info)) > >> + continue; > >> + > >> + /* There should be at most one cycle def in the stmt. */ > >> + if (VECTORIZABLE_CYCLE_DEF (dt)) > >> + return false; > >> + > >> + /* To properly compute ncopies we are interested in the widest > >> + non-reduction input type in case we're looking at a widening > >> + accumulation that we later handle in vect transformation. 
*/ > >> + if (!vectype_in > >> + || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) > >> + < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))))) > >> + vectype_in = vectype; > >> + } > >> + > >> + STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in; > > > > As said below I wonder where we would need STMT_VINFO_REDUC_VECTYPE_IN. > > At least you should avoid re-setting this when !cost_vec aka during transform, > > possibly instead asserting you re-compute the same type (or simply > > skip the above > > loop and set vectype_in from STMT_VINFO_REDUC_VECTYPE_IN which then > > gets a good use). > > Likewise. > > > > >> + stmt_vec_info reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)); > >> + > >> + /* TODO: Support lane-reducing operation that does not directly participate > >> + in loop reduction. */ > >> + if (!reduc_info || STMT_VINFO_REDUC_IDX (stmt_info) < 0) > >> + return false; > >> + > >> + /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not > >> + recoginized. */ > >> + gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_reduction_def); > >> + gcc_assert (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION); > >> + > >> + tree vphi_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info); > >> + > >> + /* To accommodate lane-reducing operations of mixed input vectypes, choose > >> + input vectype with the least lanes for the reduction PHI statement, which > >> + would result in the most ncopies for vectorized reduction results. */ > >> + if (!vphi_vectype_in > >> + || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) > >> + > GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vphi_vectype_in))))) > >> + STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in; > > > > Likewise. > > > >> + int ncopies_for_cost; > >> + > >> + if (slp_node) > >> + { > >> + /* Now lane-reducing operations in a slp node should only come from > >> + the same loop reduction path. */ > >> + gcc_assert (REDUC_GROUP_FIRST_ELEMENT (stmt_info)); > >> + ncopies_for_cost = 1; > >> + } > >> + else > >> + { > >> + ncopies_for_cost = vect_get_num_copies (loop_vinfo, vectype_in); > >> + gcc_assert (ncopies_for_cost >= 1); > >> + } > >> + > >> + if (vect_is_emulated_mixed_dot_prod (stmt_info)) > >> + { > >> + /* We need extra two invariants: one that contains the minimum signed > >> + value and one that contains half of its negative. */ > >> + int prologue_stmts = 2; > >> + unsigned cost = record_stmt_cost (cost_vec, prologue_stmts, > >> + scalar_to_vec, stmt_info, 0, > >> + vect_prologue); > >> + if (dump_enabled_p ()) > >> + dump_printf (MSG_NOTE, "vectorizable_lane_reducing: " > >> + "extra prologue_cost = %d .\n", cost); > >> + > >> + /* Three dot-products and a subtraction. */ > >> + ncopies_for_cost *= 4; > >> + } > >> + > >> + record_stmt_cost (cost_vec, ncopies_for_cost, vector_stmt, stmt_info, 0, > >> + vect_body); > >> + > >> + vect_reduction_use_partial_vector (loop_vinfo, reduc_info, slp_node, code, > >> + type, vectype_in); > >> + > >> + STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; > > > > Uh, so those all go through vect_transform_reduction. I see. > > > > I fail to see a check for whether the target supports the lane-reducing op. > > vectorizable_reduction only checks the last one. Currently the check > > might be redundant with what pattern recognition checks but it's still > > incomplete compared to the check in vectorizable_reduction. 
> > In the original vectorizable_reduction, the target support check is deliberately > skipped for lane-reducing operations. The reason is part as you said, moreover, > other check would always not be executed. > > if (single_defuse_cycle || lane_reduc_code_p) > { > gcc_assert (op.code != COND_EXPR); > > /* 4. Supportable by target? */ > bool ok = true; > > /* 4.1. check support for the operation in the loop > > This isn't necessary for the lane reduction codes, since they > can only be produced by pattern matching, and it's up to the > pattern matcher to test for support. The main reason for > specifically skipping this step is to avoid rechecking whether > mixed-sign dot-products can be implemented using signed > dot-products. */ > machine_mode vec_mode = TYPE_MODE (vectype_in); > if (!lane_reduc_code_p //<----------- skip > && !directly_supported_p (op.code, vectype_in, optab_vector)) > { > if (dump_enabled_p ()) > dump_printf (MSG_NOTE, "op not supported by target.\n"); > if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD) > || !vect_can_vectorize_without_simd_p (op.code)) > ok = false; > else > if (dump_enabled_p ()) > dump_printf (MSG_NOTE, "proceeding using word mode.\n"); > } > > // <----- always false for lane-reducing op > > if (vect_emulated_vector_p (vectype_in) > && !vect_can_vectorize_without_simd_p (op.code)) > { > if (dump_enabled_p ()) > dump_printf (MSG_NOTE, "using word mode not possible.\n"); > return false; > } > > > > >> + return true; > >> +} > >> + > >> /* Function vectorizable_reduction. > >> > >> Check if STMT_INFO performs a reduction operation that can be vectorized. > >> @@ -7609,6 +7766,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > >> (gimple_bb (reduc_def_phi)->loop_father)); > >> unsigned reduc_chain_length = 0; > >> bool only_slp_reduc_chain = true; > >> + bool only_lane_reducing = true; > >> stmt_info = NULL; > >> slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL; > >> while (reduc_def != PHI_RESULT (reduc_def_phi)) > >> @@ -7659,9 +7817,16 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > >> return false; > >> } > >> } > >> - else if (!stmt_info) > >> - /* First non-conversion stmt. */ > >> - stmt_info = vdef; > >> + else > >> + { > >> + /* First non-conversion stmt. */ > >> + if (!stmt_info) > >> + stmt_info = vdef; > >> + > >> + if (!lane_reducing_op_p (op.code)) > >> + only_lane_reducing = false; > >> + } > >> + > >> reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)]; > >> reduc_chain_length++; > >> if (!stmt_info && slp_node) > >> @@ -7733,18 +7898,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > >> if (!type_has_mode_precision_p (op.type)) > >> return false; > >> > >> - /* For lane-reducing ops we're reducing the number of reduction PHIs > >> - which means the only use of that may be in the lane-reducing operation. */ > >> - if (lane_reducing > >> - && reduc_chain_length != 1 > >> - && !only_slp_reduc_chain) > >> - { > >> - if (dump_enabled_p ()) > >> - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > >> - "lane-reducing reduction with extra stmts.\n"); > >> - return false; > >> - } > >> - > >> /* Lane-reducing ops also never can be used in a SLP reduction group > >> since we'll mix lanes belonging to different reductions. 
But it's
> >>       OK to use them in a reduction chain or when the reduction group
> >> @@ -7788,9 +7941,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>  			     "use not simple.\n");
> >>  	  return false;
> >>  	}
> >> -      if (i == STMT_VINFO_REDUC_IDX (stmt_info))
> >> -	continue;
> >> -
> >
> > So within this loop we analyze the "main" operation, and while I do not
> > exactly remember why we skip the op leading to the PHI, I don't understand
> > why you want to look at it for the multi lane-reducing case (the
> > accumulator always has the same type, no?).
> >
> > In any case this just looks at a single (the last) lane-reducing or even
> > not lane-reducing op.
> >
>
> This comparison is redundant, since it could be covered by the following
> comparison statement.  The change should have been placed in a separate
> patch, but for convenience I made it here.
>
>   /* For an IFN_COND_OP we might hit the reduction definition operand
>      twice (once as definition, once as else).  */
>   if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
>     continue;
>
>   /* There should be only one cycle def in the stmt, the one
>      leading to reduc_def.  */
>   if (VECTORIZABLE_CYCLE_DEF (dt))
>     return false;
>
> >>        /* For an IFN_COND_OP we might hit the reduction definition operand
> >>  	  twice (once as definition, once as else).  */
> >>        if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
> >> @@ -7836,17 +7986,21 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>      }
> >>    if (!vectype_in)
> >>      vectype_in = STMT_VINFO_VECTYPE (phi_info);
> >> -  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
> >>
> >> -  /* Each lane-reducing operation has its own input vectype, while reduction
> >> -     PHI records the input vectype with least lanes.  */
> >> -  if (lane_reducing)
> >> -    STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in;
> >> -
> >> -  enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
> >> -  STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
> >> +  /* If there is a normal (non-lane-reducing) operation in the loop reduction
> >> +     path, to ensure there will be enough copies to hold vectorized results
> >> +     of the operation, we need to set the input vectype of the reduction PHI
> >> +     to be the same as the reduction output vectype somewhere; here is a
> >> +     suitable place.  Otherwise the input vectype is set to the one with the
> >> +     least lanes, which can only be determined in the vectorizable analysis
> >> +     routine of a lane-reducing operation.  */
> >
> > But we are using vectype_in to compute ncopies which is used in cost analysis.
>
> The vectype_in only impacts the cost analysis for a lane-reducing op, since
> the function vect_is_emulated_mixed_dot_prod needs it, and that function is
> referenced by the cost analysis.  In the previous patch, we bound the
> vectype_in to each lane-reducing op and also adjusted the code of the
> function accordingly, so this would not be a problem.
>
> > You say this might not be the final ncopies?  Note the vectorization
> > factor is already fixed as well as (output) vector types of the
> > lane-reducing ops.  So
>
> The vectype_in is incrementally updated while analyzing the vectorizability
> of lane-reducing ops.  So before transform, the type should be determined.
>
> > shouldn't we simply pick that up in the loop walking the use-def chain via
> > REDUC_IDX at the start of this function?
>
> I thought about doing it that way.  OK, will consider it again.
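A rough sketch of that suggestion, folded into the existing use-def walk
(all names except get_stmt_input_vectype appear in the patch;
get_stmt_input_vectype is a hypothetical placeholder for however operand
analysis exposes a statement's input vector type):

    tree least_lane_vectype = NULL_TREE;

    /* Inside the while (reduc_def != PHI_RESULT (reduc_def_phi)) walk,
       track the least-lane input vectype seen on the reduction path.  */
    if (lane_reducing_op_p (op.code))
      {
	tree vt = get_stmt_input_vectype (vdef);   /* hypothetical helper */
	if (!least_lane_vectype
	    || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vt)))
		> GET_MODE_SIZE (SCALAR_TYPE_MODE
				   (TREE_TYPE (least_lane_vectype)))))
	  least_lane_vectype = vt;
      }
    else
      only_lane_reducing = false;

    /* After the walk, the PHI's input vectype is fully determined.  */
    STMT_VINFO_REDUC_VECTYPE_IN (reduc_info)
      = only_lane_reducing ? least_lane_vectype
			   : STMT_VINFO_VECTYPE (phi_info);

This would make the type available up front instead of updating it
incrementally from each vectorizable_lane_reducing call.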
>
> > I'm unsure as to why we need STMT_VINFO_REDUC_VECTYPE_IN at all (I don't
> > remember adding that); it should be readily available from operand
> > analysis.  The docs for that aren't very enlightening either (there's
> > also REDUC_VECTYPE, in addition to VECTYPE - huh).
>
> In the old code, in which only one lane-reducing op is allowed in a loop
> reduction, this type could be computed on demand.
>
> But for multiple lane-reducing ops, we need to know the vectype_in types
> of all ops in order to determine a proper vectype_in for the PHI statement,
> and traversing those ops to compute the types on demand would not be a
> good way.  Additionally, during transform, the original CFG flow is broken
> and could not be used.
>
> >> +  if (!only_lane_reducing)
> >> +    STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = STMT_VINFO_VECTYPE (phi_info);
> >> +
> >> +  enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info);
> >> +  STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type;
> >>    /* If we have a condition reduction, see if we can simplify it further.  */
> >> -  if (v_reduc_type == COND_REDUCTION)
> >> +  if (reduction_type == COND_REDUCTION)
> >>      {
> >>        if (slp_node)
> >>  	return false;
> >> @@ -8012,8 +8166,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>      }
> >>
> >>    STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
> >> +  reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
> >>
> >> -  vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
> >>    if (reduction_type == TREE_CODE_REDUCTION)
> >>      {
> >>        /* Check whether it's ok to change the order of the computation.
> >> @@ -8287,14 +8441,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>        && loop_vinfo->suggested_unroll_factor == 1)
> >>      single_defuse_cycle = true;
> >>
> >> -  if (single_defuse_cycle || lane_reducing)
> >> +  if (single_defuse_cycle && !lane_reducing)
> >>      {
> >>        gcc_assert (op.code != COND_EXPR);
> >>
> >> -      /* 4. Supportable by target?  */
> >> -      bool ok = true;
> >> -
> >> -      /* 4.1. check support for the operation in the loop
> >> +      /* 4. check support for the operation in the loop
> >>
> >>  	  This isn't necessary for the lane reduction codes, since they
> >>  	  can only be produced by pattern matching, and it's up to the
> >> @@ -8303,14 +8454,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>  	  mixed-sign dot-products can be implemented using signed
> >>  	  dot-products.  */
> >>        machine_mode vec_mode = TYPE_MODE (vectype_in);
> >> -      if (!lane_reducing
> >> -	  && !directly_supported_p (op.code, vectype_in, optab_vector))
> >> +      if (!directly_supported_p (op.code, vectype_in, optab_vector))
> >>  	{
> >>  	  if (dump_enabled_p ())
> >>  	    dump_printf (MSG_NOTE, "op not supported by target.\n");
> >>  	  if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
> >>  	      || !vect_can_vectorize_without_simd_p (op.code))
> >> -	    ok = false;
> >> +	    single_defuse_cycle = false;
> >>  	  else
> >>  	    if (dump_enabled_p ())
> >>  	      dump_printf (MSG_NOTE, "proceeding using word mode.\n");
> >> @@ -8323,35 +8473,12 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>  	  dump_printf (MSG_NOTE, "using word mode not possible.\n");
> >>  	  return false;
> >>  	}
> >> -
> >> -      /* lane-reducing operations have to go through vect_transform_reduction.
> >> -	  For the other cases try without the single cycle optimization.  */
> >> -      if (!ok)
> >> -	{
> >> -	  if (lane_reducing)
> >> -	    return false;
> >> -	  else
> >> -	    single_defuse_cycle = false;
> >> -	}
> >>      }
> >>    STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
> >>
> >> -  /* If the reduction stmt is one of the patterns that have lane
> >> -     reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
> >> -  if ((ncopies > 1 && ! single_defuse_cycle)
> >> -      && lane_reducing)
> >> -    {
> >> -      if (dump_enabled_p ())
> >> -	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> >> -			 "multi def-use cycle not possible for lane-reducing "
> >> -			 "reduction operation\n");
> >> -      return false;
> >> -    }
> >> -
> >> -  if (slp_node
> >> -      && !(!single_defuse_cycle
> >> -	   && !lane_reducing
> >> -	   && reduction_type != FOLD_LEFT_REDUCTION))
> >> +  /* The reduction type of a lane-reducing operation is TREE_CODE_REDUCTION;
> >> +     the processing below will be done in its own vectorizable function.  */
> >> +  if (slp_node && reduction_type == FOLD_LEFT_REDUCTION)
> >>      for (i = 0; i < (int) op.num_ops; i++)
> >>        if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
> >>  	{
> >> @@ -8364,28 +8491,21 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>    vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
> >>  			      reduction_type, ncopies, cost_vec);
> >>    /* Cost the reduction op inside the loop if transformed via
> >> -     vect_transform_reduction.  Otherwise this is costed by the
> >> -     separate vectorizable_* routines.  */
> >> -  if (single_defuse_cycle || lane_reducing)
> >> -    {
> >> -      int factor = 1;
> >> -      if (vect_is_emulated_mixed_dot_prod (stmt_info))
> >> -	/* Three dot-products and a subtraction.  */
> >> -	factor = 4;
> >> -      record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
> >> -			stmt_info, 0, vect_body);
> >> -    }
> >> +     vect_transform_reduction for a non-lane-reducing operation.  Otherwise
> >> +     this is costed by the separate vectorizable_* routines.  */
> >> +  if (single_defuse_cycle && !lane_reducing)
> >> +    record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
> >>
> >>    if (dump_enabled_p ()
> >>        && reduction_type == FOLD_LEFT_REDUCTION)
> >>      dump_printf_loc (MSG_NOTE, vect_location,
> >>  		     "using an in-order (fold-left) reduction.\n");
> >>    STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
> >> -  /* All but single defuse-cycle optimized, lane-reducing and fold-left
> >> -     reductions go through their own vectorizable_* routines.  */
> >> -  if (!single_defuse_cycle
> >> -      && !lane_reducing
> >> -      && reduction_type != FOLD_LEFT_REDUCTION)
> >> +
> >> +  /* All but single defuse-cycle optimized and fold-left reductions go
> >> +     through their own vectorizable_* routines.  */
> >> +  if ((!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
> >> +      || lane_reducing)
> >
> > So single-def-use-cycle but lane-reducing ops no longer need
> > to go through vect_transform_reduction?  How do you handle those
> > but fail to handle non-lane-reducing ops this way?
>
> Emm, all kinds of lane-reducing ops go into vectorizable_lane_reducing (),
> no matter whether single-def-use or not; in that function, STMT_VINFO_TYPE
> is set to reduc_vec_info_type, so the transform will be done inside
> vect_transform_reduction.
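(A simplified sketch of that dispatch, trimmed to the relevant case; the
switch shape follows vect_transform_stmt on trunk, so treat details as
approximate:)

    /* Analysis: vect_analyze_stmt tries vectorizable_lane_reducing before
       vectorizable_reduction; on success the statement is marked with
       reduc_vec_info_type.  Transform, in vect_transform_stmt:  */
    switch (STMT_VINFO_TYPE (stmt_info))
      {
      case reduc_vec_info_type:
	done = vect_transform_reduction (as_a <loop_vec_info> (vinfo),
					 stmt_info, gsi, &vec_stmt, slp_node);
	gcc_assert (done);
	break;
      /* ... other statement kinds ...  */
      }

So a lane-reducing statement reaches vect_transform_reduction whether or
not the reduction is a single def-use cycle.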
> >
>
> >>      {
> >>        stmt_vec_info tem
> >>  	 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
> >> @@ -8490,6 +8610,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> >>    int i;
> >>    int ncopies;
> >> +  int stmt_ncopies;
> >>    int vec_num;
> >>
> >>    stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
> >> @@ -8513,15 +8634,28 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>    gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
> >>    int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
> >>    tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
> >> +  tree stmt_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info);
> >> +
> >> +  /* Get input vectypes from the reduction PHI and the statement to be
> >> +     transformed; these two vectypes may have different lanes when a
> >> +     lane-reducing operation is present.  */
> >> +  if (!vectype_in)
> >> +    vectype_in = STMT_VINFO_REDUC_VECTYPE (reduc_info);
> >> +
> >> +  if (!stmt_vectype_in)
> >> +    stmt_vectype_in = STMT_VINFO_VECTYPE (stmt_info);
> >>
> >>    if (slp_node)
> >>      {
> >>        ncopies = 1;
> >> +      stmt_ncopies = 1;
> >>        vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
> >>      }
> >>    else
> >>      {
> >>        ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
> >> +      stmt_ncopies = vect_get_num_copies (loop_vinfo, stmt_vectype_in);
> >> +      gcc_assert (stmt_ncopies >= 1 && stmt_ncopies <= ncopies);
> >>        vec_num = 1;
> >>      }
> >>
> >> @@ -8530,14 +8664,10 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>
> >>    vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
> >>    vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
> >> -  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
> >> -
> >> +  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn,
> >> +						    stmt_vectype_in);
> >>    /* Transform.  */
> >> -  tree new_temp = NULL_TREE;
> >> -  auto_vec<tree> vec_oprnds0;
> >> -  auto_vec<tree> vec_oprnds1;
> >> -  auto_vec<tree> vec_oprnds2;
> >> -  tree def0;
> >> +  auto_vec<tree> vec_oprnds[3];
> >>
> >>    if (dump_enabled_p ())
> >>      dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
> >> @@ -8561,8 +8691,6 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>  		  == op.ops[internal_fn_else_index ((internal_fn) code)]));
> >>      }
> >>
> >> -  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> >> -
> >>    vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
> >>    if (reduction_type == FOLD_LEFT_REDUCTION)
> >>      {
> >> @@ -8570,7 +8698,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>        gcc_assert (code.is_tree_code () || cond_fn_p);
> >>        return vectorize_fold_left_reduction
> >>  	 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
> >> -	  code, reduc_fn, op.ops, op.num_ops, vectype_in,
> >> +	  code, reduc_fn, op.ops, op.num_ops, stmt_vectype_in,
> >>  	  reduc_index, masks, lens);
> >>      }
> >>
> >> @@ -8581,55 +8709,121 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>    tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
> >>    tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
> >>
> >> -  /* Get NCOPIES vector definitions for all operands except the reduction
> >> -     definition.  */
> >> -  if (!cond_fn_p)
> >> +  gcc_assert (reduc_index < 3);
> >> +
> >> +  if (slp_node)
> >>      {
> >> -      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
> >> -			 single_defuse_cycle && reduc_index == 0
> >> -			 ?
NULL_TREE : op.ops[0], &vec_oprnds0,
> >> -			 single_defuse_cycle && reduc_index == 1
> >> -			 ? NULL_TREE : op.ops[1], &vec_oprnds1,
> >> -			 op.num_ops == 3
> >> -			 && !(single_defuse_cycle && reduc_index == 2)
> >> -			 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
> >> +      gcc_assert (!single_defuse_cycle && op.num_ops <= 3);
> >
> > I think that's going to fail.  Mind v3 of the series I posted to enable
> > SLP discovery for single-lane reductions.  Basically everything is
> > going to be SLP for GCC 15.
> >
>
> Has the v3 already landed on trunk?  Then by default, any statement that
> has no isomorphic partner will become a single-lane SLP node?  And for such
> a node, can I just reuse the old non-SLP transformation code?

As of this morning, r15-1006-gd93353e6423eca, it is on trunk.  Note the
fallback is still non-SLP in case vectorizable_reduction FAILs with SLP.
I have a set of changes queued to allow some more kinds of reductions with
SLP but IIRC the lane-reducing variant is already supported.

Richard.

> >> +
> >> +      for (i = 0; i < (int) op.num_ops; i++)
> >> +	vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds[i]);
> >>      }
> >>    else
> >>      {
> >> -      /* For a conditional operation pass the truth type as mask
> >> -	  vectype.  */
> >> -      gcc_assert (single_defuse_cycle
> >> -		  && (reduc_index == 1 || reduc_index == 2));
> >> -      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
> >> -			 op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
> >> -			 reduc_index == 1 ? NULL_TREE : op.ops[1],
> >> -			 NULL_TREE, &vec_oprnds1,
> >> -			 reduc_index == 2 ? NULL_TREE : op.ops[2],
> >> -			 NULL_TREE, &vec_oprnds2);
> >> -    }
> >> +      /* The input vectype of the reduction PHI determines copies of
> >> +	  vectorized def-use cycles, which might be more than the effective
> >> +	  copies of vectorized lane-reducing reduction statements.  This could
> >> +	  be complemented by generating extra trivial pass-through copies.
> >> +	  For example:
> >> +
> >
> > That also means you need to handle SLP here, but you can assert there's
> > only a single lane.
> >
> > Btw, you can push the patches I approved if they independently test OK.
> >
> >> +	    int sum = 0;
> >> +	    for (i)
> >> +	      {
> >> +		sum += d0[i] * d1[i];      // dot-prod
> >> +		sum += abs(s0[i] - s1[i]); // sad
> >> +		sum += n[i];               // normal
> >> +	      }
> >> +
> >> +	  The vector size is 128-bit, and the vectorization factor is 16.
> >> +	  Reduction statements would be transformed as:
> >> +
> >> +	    vector<4> int sum_v0 = { 0, 0, 0, 0 };
> >> +	    vector<4> int sum_v1 = { 0, 0, 0, 0 };
> >> +	    vector<4> int sum_v2 = { 0, 0, 0, 0 };
> >> +	    vector<4> int sum_v3 = { 0, 0, 0, 0 };
> >> +
> >> +	    for (i / 16)
> >> +	      {
> >> +		sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
> >> +		sum_v1 = sum_v1;  // copy
> >> +		sum_v2 = sum_v2;  // copy
> >> +		sum_v3 = sum_v3;  // copy
> >> +
> >> +		sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
> >> +		sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
> >> +		sum_v2 = sum_v2;  // copy
> >> +		sum_v3 = sum_v3;  // copy
> >> +
> >> +		sum_v0 += n_v0[i: 0 ~ 3 ];
> >> +		sum_v1 += n_v1[i: 4 ~ 7 ];
> >> +		sum_v2 += n_v2[i: 8 ~ 11];
> >> +		sum_v3 += n_v3[i: 12 ~ 15];
> >> +	      }
> >> +	  */
> >> +
> >> +      for (i = 0; i < MIN (3, (int) op.num_ops); i++)
> >> +	{
> >> +	  tree vectype = NULL_TREE;
> >> +	  int used_ncopies = ncopies;
> >> +
> >> +	  if (cond_fn_p && i == 0)
> >> +	    {
> >> +	      /* For a conditional operation pass the truth type as mask
> >> +		 vectype.  */
> >> +	      gcc_assert (single_defuse_cycle && reduc_index > 0);
> >> +	      vectype = truth_type_for (vectype_in);
> >> +	    }
> >>
> >> -  /* For single def-use cycles get one copy of the vectorized reduction
> >> -     definition.  */
> >> -  if (single_defuse_cycle)
> >> -    {
> >> -      gcc_assert (!slp_node);
> >> -      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
> >> -				     op.ops[reduc_index],
> >> -				     reduc_index == 0 ? &vec_oprnds0
> >> -				     : (reduc_index == 1 ? &vec_oprnds1
> >> -					: &vec_oprnds2));
> >> +	  if (i != reduc_index)
> >> +	    {
> >> +	      /* For a non-reduction operand, deduce the effective copies that
> >> +		 are involved in vectorized def-use cycles based on the input
> >> +		 vectype of the reduction statement.  */
> >> +	      used_ncopies = stmt_ncopies;
> >> +	    }
> >> +	  else if (single_defuse_cycle)
> >> +	    {
> >> +	      /* For single def-use cycles get one copy of the vectorized
> >> +		 reduction definition.  */
> >> +	      used_ncopies = 1;
> >> +	    }
> >> +
> >> +	  vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, used_ncopies,
> >> +					 op.ops[i], &vec_oprnds[i], vectype);
> >> +
> >> +	  if (used_ncopies < ncopies)
> >> +	    vec_oprnds[i].safe_grow_cleared (ncopies);
> >> +	}
> >>      }
> >>
> >> +  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> >>    bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
> >> +  tree def0;
> >>
> >> -  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
> >> +  FOR_EACH_VEC_ELT (vec_oprnds[0], i, def0)
> >>      {
> >>        gimple *new_stmt;
> >> -      tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
> >> -      if (masked_loop_p && !mask_by_cond_expr)
> >> +      tree new_temp = NULL_TREE;
> >> +      tree vop[3] = { def0, vec_oprnds[1][i], NULL_TREE };
> >> +
> >> +      if (!vop[0] || !vop[1])
> >> +	{
> >> +	  tree reduc_vop = vec_oprnds[reduc_index][i];
> >> +
> >> +	  /* Insert a trivial copy when there is no need to generate a
> >> +	     vectorized statement.  */
> >> +	  gcc_assert (reduc_vop && stmt_ncopies < ncopies);
> >> +
> >> +	  new_stmt = gimple_build_assign (vec_dest, reduc_vop);
> >> +	  new_temp = make_ssa_name (vec_dest, new_stmt);
> >> +	  gimple_set_lhs (new_stmt, new_temp);
> >> +	  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
> >> +	}
> >> +      else if (masked_loop_p && !mask_by_cond_expr)
> >>  	{
> >> -	  /* No conditional ifns have been defined for dot-product yet.  */
> >> -	  gcc_assert (code != DOT_PROD_EXPR);
> >> +	  /* No conditional ifns have been defined for dot-product and sad
> >> +	     yet.  */
> >> +	  gcc_assert (code != DOT_PROD_EXPR && code != SAD_EXPR);
> >>
> >>  	  /* Make sure that the reduction accumulator is vop[0].  */
> >>  	  if (reduc_index == 1)
> >> @@ -8638,7 +8832,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>  	      std::swap (vop[0], vop[1]);
> >>  	    }
> >>  	  tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
> >> -					  vec_num * ncopies, vectype_in, i);
> >> +					  vec_num * stmt_ncopies,
> >> +					  stmt_vectype_in, i);
> >>  	  gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
> >>  						    vop[0], vop[1], vop[0]);
> >>  	  new_temp = make_ssa_name (vec_dest, call);
> >> @@ -8650,12 +8845,13 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>        else
> >>  	{
> >>  	  if (op.num_ops >= 3)
> >> -	    vop[2] = vec_oprnds2[i];
> >> +	    vop[2] = vec_oprnds[2][i];
> >>
> >>  	  if (masked_loop_p && mask_by_cond_expr)
> >>  	    {
> >>  	      tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
> >> -					      vec_num * ncopies, vectype_in, i);
> >> +					      vec_num * stmt_ncopies,
> >> +					      stmt_vectype_in, i);
> >>  	      build_vect_cond_expr (code, vop, mask, gsi);
> >>  	    }
> >>
> >> @@ -8682,16 +8878,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>
> >>        if (slp_node)
> >>  	slp_node->push_vec_def (new_stmt);
> >> -      else if (single_defuse_cycle
> >> -	       && i < ncopies - 1)
> >> -	{
> >> -	  if (reduc_index == 0)
> >> -	    vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
> >> -	  else if (reduc_index == 1)
> >> -	    vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
> >> -	  else if (reduc_index == 2)
> >> -	    vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
> >> -	}
> >> +      else if (single_defuse_cycle && i < ncopies - 1)
> >> +	vec_oprnds[reduc_index][i + 1] = gimple_get_lhs (new_stmt);
> >>        else
> >>  	STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
> >>      }
> >> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> >> index 2e0be763abb..cc0a832f71b 100644
> >> --- a/gcc/tree-vect-stmts.cc
> >> +++ b/gcc/tree-vect-stmts.cc
> >> @@ -13296,6 +13296,8 @@ vect_analyze_stmt (vec_info *vinfo,
> >>  					   NULL, NULL, node, cost_vec)
> >>  	  || vectorizable_load (vinfo, stmt_info, NULL, NULL, node, cost_vec)
> >>  	  || vectorizable_store (vinfo, stmt_info, NULL, NULL, node, cost_vec)
> >> +	  || vectorizable_lane_reducing (as_a <loop_vec_info> (vinfo),
> >> +					 stmt_info, node, cost_vec)
> >>  	  || vectorizable_reduction (as_a <loop_vec_info> (vinfo), stmt_info,
> >>  				     node, node_instance, cost_vec)
> >>  	  || vectorizable_induction (as_a <loop_vec_info> (vinfo), stmt_info,
> >> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> >> index 97ec9c341e7..ca810869592 100644
> >> --- a/gcc/tree-vectorizer.h
> >> +++ b/gcc/tree-vectorizer.h
> >> @@ -2443,6 +2443,8 @@ extern loop_vec_info vect_create_loop_vinfo (class loop *, vec_info_shared *,
> >>  extern bool vectorizable_live_operation (vec_info *, stmt_vec_info,
> >>  					 slp_tree, slp_instance, int,
> >>  					 bool, stmt_vector_for_cost *);
> >> +extern bool vectorizable_lane_reducing (loop_vec_info, stmt_vec_info,
> >> +					slp_tree, stmt_vector_for_cost *);
> >>  extern bool vectorizable_reduction (loop_vec_info, stmt_vec_info,
> >>  				    slp_tree, slp_instance,
> >>  				    stmt_vector_for_cost *);
> >> --
> >> 2.17.1