public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH 5/6] vect: Support multiple lane-reducing operations for loop reduction [PR114440]
@ 2024-05-30 14:54 Feng Xue OS
  2024-05-31 14:57 ` Richard Biener
  0 siblings, 1 reply; 5+ messages in thread
From: Feng Xue OS @ 2024-05-30 14:54 UTC (permalink / raw)
  To: Richard Biener; +Cc: Tamar Christina, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 45080 bytes --]

For a lane-reducing operation (dot-prod/widen-sum/sad) in loop reduction, the
current vectorizer could only handle the pattern if the reduction chain does
not contain any other operation, no matter whether the other is normal or
lane-reducing.

Actually, to allow multiple arbitrary lane-reducing operations, we need to
support vectorization of a loop reduction chain with mixed input vectypes. Since
the lanes of a vectype may vary with the operation, the effective ncopies of
vectorized statements for each operation also may not be the same, which causes
a mismatch on vectorized def-use cycles. A simple way is to align all operations
with the one that has the most ncopies; the gap could be complemented by
generating extra trivial pass-through copies. For example:

   int sum = 0;
   for (i)
     {
       sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
       sum += w[i];               // widen-sum <vector(16) char>
       sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
       sum += n[i];               // normal <vector(4) int>
     }

The vector size is 128-bit, and the vectorization factor is 16. Reduction
statements would be transformed as:

   vector<4> int sum_v0 = { 0, 0, 0, 0 };
   vector<4> int sum_v1 = { 0, 0, 0, 0 };
   vector<4> int sum_v2 = { 0, 0, 0, 0 };
   vector<4> int sum_v3 = { 0, 0, 0, 0 };

   for (i / 16)
     {
       sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
       sum_v1 = sum_v1;  // copy
       sum_v2 = sum_v2;  // copy
       sum_v3 = sum_v3;  // copy

       sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0);
       sum_v1 = sum_v1;  // copy
       sum_v2 = sum_v2;  // copy
       sum_v3 = sum_v3;  // copy

       sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
       sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
       sum_v2 = sum_v2;  // copy
       sum_v3 = sum_v3;  // copy

       sum_v0 += n_v0[i: 0  ~ 3 ];
       sum_v1 += n_v1[i: 4  ~ 7 ];
       sum_v2 += n_v2[i: 8  ~ 11];
       sum_v3 += n_v3[i: 12 ~ 15];
     }

Thanks,
Feng
---
gcc/
	PR tree-optimization/114440
	* tree-vectorizer.h (vectorizable_lane_reducing): New function
	declaration.
	* tree-vect-stmts.cc (vect_analyze_stmt): Call new function
	vectorizable_lane_reducing to analyze lane-reducing operation.
	* tree-vect-loop.cc (vect_model_reduction_cost): Remove cost computation
	code related to emulated_mixed_dot_prod.
	(vectorizable_lane_reducing): New function.
	(vectorizable_reduction): Allow multiple lane-reducing operations in
	loop reduction. Move some original lane-reducing related code to
	vectorizable_lane_reducing.
	(vect_transform_reduction): Extend transformation to support reduction
	statements with mixed input vectypes.

gcc/testsuite/
	PR tree-optimization/114440
	* gcc.dg/vect/vect-reduc-chain-1.c: New test.
	* gcc.dg/vect/vect-reduc-chain-2.c: New test.
	* gcc.dg/vect/vect-reduc-chain-3.c: New test.
	* gcc.dg/vect/vect-reduc-chain-dot-slp-1.c: New test.
	* gcc.dg/vect/vect-reduc-chain-dot-slp-2.c: New test.
	* gcc.dg/vect/vect-reduc-dot-slp-1.c: New test.
---
 .../gcc.dg/vect/vect-reduc-chain-1.c          |  62 +++
 .../gcc.dg/vect/vect-reduc-chain-2.c          |  77 +++
 .../gcc.dg/vect/vect-reduc-chain-3.c          |  66 +++
 .../gcc.dg/vect/vect-reduc-chain-dot-slp-1.c  |  97 ++++
 .../gcc.dg/vect/vect-reduc-chain-dot-slp-2.c  |  81 +++
 .../gcc.dg/vect/vect-reduc-dot-slp-1.c        |  35 ++
 gcc/tree-vect-loop.cc                         | 478 ++++++++++++------
 gcc/tree-vect-stmts.cc                        |   2 +
 gcc/tree-vectorizer.h                         |   2 +
 9 files changed, 755 insertions(+), 145 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
new file mode 100644
index 00000000000..04bfc419dbd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
@@ -0,0 +1,62 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res,
+   SIGNEDNESS_2 char *restrict a,
+   SIGNEDNESS_2 char *restrict b,
+   SIGNEDNESS_2 char *restrict c,
+   SIGNEDNESS_2 char *restrict d,
+   SIGNEDNESS_1 int *restrict e)
+{
+  for (int i = 0; i < N; ++i)
+    {
+      res += a[i] * b[i];
+      res += c[i] * d[i];
+      res += e[i];
+    }
+  return res;
+}
+
+#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
+#define OFFSET 20
+
+int
+main (void)
+{
+  check_vect ();
+
+  SIGNEDNESS_2 char a[N], b[N];
+  SIGNEDNESS_2 char c[N], d[N];
+  SIGNEDNESS_1 int e[N];
+  int expected = 0x12345;
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 5;
+      b[i] = BASE + OFFSET + i * 4;
+      c[i] = BASE + i * 2;
+      d[i] = BASE + OFFSET + i * 3;
+      e[i] = i;
+      asm volatile ("" ::: "memory");
+      expected += a[i] * b[i];
+      expected += c[i] * d[i];
+      expected += e[i];
+    }
+  if (f (0x12345, a, b, c, d, e) != expected)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 2 "vect" { target vect_sdot_qi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c
new file mode 100644
index 00000000000..6c803b80120
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c
@@ -0,0 +1,77 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 unsigned
+#define SIGNEDNESS_3 signed
+#define SIGNEDNESS_4 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+fn (SIGNEDNESS_1 int res,
+   SIGNEDNESS_2 char *restrict a,
+   SIGNEDNESS_2 char *restrict b,
+   SIGNEDNESS_3 char *restrict c,
+   SIGNEDNESS_3 char *restrict d,
+   SIGNEDNESS_4 short *restrict e,
+   SIGNEDNESS_4 short *restrict f,
+   SIGNEDNESS_1 int *restrict g)
+{
+  for (int i = 0; i < N; ++i)
+    {
+      res += a[i] * b[i];
+      res += i + 1;
+      res += c[i] * d[i];
+      res += e[i] * f[i];
+      res += g[i];
+    }
+  return res;
+}
+
+#define BASE2 ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
+#define BASE3 ((SIGNEDNESS_3 int) -1 < 0 ? -126 : 4)
+#define BASE4 ((SIGNEDNESS_4 int) -1 < 0 ? -1026 : 373)
+#define OFFSET 20
+
+int
+main (void)
+{
+  check_vect ();
+
+  SIGNEDNESS_2 char a[N], b[N];
+  SIGNEDNESS_3 char c[N], d[N];
+  SIGNEDNESS_4 short e[N], f[N];
+  SIGNEDNESS_1 int g[N];
+  int expected = 0x12345;
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE2 + i * 5;
+      b[i] = BASE2 + OFFSET + i * 4;
+      c[i] = BASE3 + i * 2;
+      d[i] = BASE3 + OFFSET + i * 3;
+      e[i] = BASE4 + i * 6;
+      f[i] = BASE4 + OFFSET + i * 5;
+      g[i] = i;
+      asm volatile ("" ::: "memory");
+      expected += a[i] * b[i];
+      expected += i + 1;
+      expected += c[i] * d[i];
+      expected += e[i] * f[i];
+      expected += g[i];
+    }
+  if (fn (0x12345, a, b, c, d, e, f, g) != expected)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_sdot_qi } } } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_udot_qi } } } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_sdot_hi } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c
new file mode 100644
index 00000000000..a41e4b176c4
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c
@@ -0,0 +1,66 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 unsigned
+#define SIGNEDNESS_3 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res,
+   SIGNEDNESS_2 char *restrict a,
+   SIGNEDNESS_2 char *restrict b,
+   SIGNEDNESS_3 short *restrict c,
+   SIGNEDNESS_3 short *restrict d,
+   SIGNEDNESS_1 int *restrict e)
+{
+  for (int i = 0; i < N; ++i)
+    {
+      short diff = a[i] - b[i];
+      SIGNEDNESS_2 short abs = diff < 0 ? -diff : diff;
+      res += abs;
+      res += c[i] * d[i];
+      res += e[i];
+    }
+  return res;
+}
+
+#define BASE2 ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
+#define BASE3 ((SIGNEDNESS_3 int) -1 < 0 ? -1236 : 373)
+#define OFFSET 20
+
+int
+main (void)
+{
+  check_vect ();
+
+  SIGNEDNESS_2 char a[N], b[N];
+  SIGNEDNESS_3 short c[N], d[N];
+  SIGNEDNESS_1 int e[N];
+  int expected = 0x12345;
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE2 + i * 5;
+      b[i] = BASE2 - i * 4;
+      c[i] = BASE3 + i * 2;
+      d[i] = BASE3 + OFFSET + i * 3;
+      e[i] = i;
+      asm volatile ("" ::: "memory");
+      short diff = a[i] - b[i];
+      SIGNEDNESS_2 short abs = diff < 0 ? -diff : diff;
+      expected += abs;
+      expected += c[i] * d[i];
+      expected += e[i];
+    }
+  if (f (0x12345, a, b, c, d, e) != expected)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = SAD_EXPR" "vect" { target vect_udot_qi } } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target vect_sdot_hi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c
new file mode 100644
index 00000000000..51ef4eaaed8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c
@@ -0,0 +1,97 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res,
+   SIGNEDNESS_2 char *a,
+   SIGNEDNESS_2 char *b,
+   int step, int n)
+{
+  for (int i = 0; i < n; i++)
+    {
+      res += a[0] * b[0];
+      res += a[1] * b[1];
+      res += a[2] * b[2];
+      res += a[3] * b[3];
+      res += a[4] * b[4];
+      res += a[5] * b[5];
+      res += a[6] * b[6];
+      res += a[7] * b[7];
+      res += a[8] * b[8];
+      res += a[9] * b[9];
+      res += a[10] * b[10];
+      res += a[11] * b[11];
+      res += a[12] * b[12];
+      res += a[13] * b[13];
+      res += a[14] * b[14];
+      res += a[15] * b[15];
+
+      a += step;
+      b += step;
+    }
+
+  return res;
+}
+
+#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
+#define OFFSET 20
+
+int
+main (void)
+{
+  check_vect ();
+
+  SIGNEDNESS_2 char a[100], b[100];
+  int expected = 0x12345;
+  int step = 16;
+  int n = 2;
+  int t = 0;
+
+  for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i)
+    {
+      a[i] = BASE + i * 5;
+      b[i] = BASE + OFFSET + i * 4;
+      asm volatile ("" ::: "memory");
+    }
+
+  for (int i = 0; i < n; i++)
+    {
+      asm volatile ("" ::: "memory");
+      expected += a[t + 0] * b[t + 0];
+      expected += a[t + 1] * b[t + 1];
+      expected += a[t + 2] * b[t + 2];
+      expected += a[t + 3] * b[t + 3];
+      expected += a[t + 4] * b[t + 4];
+      expected += a[t + 5] * b[t + 5];
+      expected += a[t + 6] * b[t + 6];
+      expected += a[t + 7] * b[t + 7];
+      expected += a[t + 8] * b[t + 8];
+      expected += a[t + 9] * b[t + 9];
+      expected += a[t + 10] * b[t + 10];
+      expected += a[t + 11] * b[t + 11];
+      expected += a[t + 12] * b[t + 12];
+      expected += a[t + 13] * b[t + 13];
+      expected += a[t + 14] * b[t + 14];
+      expected += a[t + 15] * b[t + 15];
+      t += step;
+    }
+
+  if (f (0x12345, a, b, step, n) != expected)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 16 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c
new file mode 100644
index 00000000000..1532833c3ae
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c
@@ -0,0 +1,81 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res,
+   SIGNEDNESS_2 short *a,
+   SIGNEDNESS_2 short *b,
+   int step, int n)
+{
+  for (int i = 0; i < n; i++)
+    {
+      res += a[0] * b[0];
+      res += a[1] * b[1];
+      res += a[2] * b[2];
+      res += a[3] * b[3];
+      res += a[4] * b[4];
+      res += a[5] * b[5];
+      res += a[6] * b[6];
+      res += a[7] * b[7];
+
+      a += step;
+      b += step;
+    }
+
+  return res;
+}
+
+#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -1026 : 373)
+#define OFFSET 20
+
+int
+main (void)
+{
+  check_vect ();
+
+  SIGNEDNESS_2 short a[100], b[100];
+  int expected = 0x12345;
+  int step = 8;
+  int n = 2;
+  int t = 0;
+
+  for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i)
+    {
+      a[i] = BASE + i * 5;
+      b[i] = BASE + OFFSET + i * 4;
+      asm volatile ("" ::: "memory");
+    }
+
+  for (int i = 0; i < n; i++)
+    {
+      asm volatile ("" ::: "memory");
+      expected += a[t + 0] * b[t + 0];
+      expected += a[t + 1] * b[t + 1];
+      expected += a[t + 2] * b[t + 2];
+      expected += a[t + 3] * b[t + 3];
+      expected += a[t + 4] * b[t + 4];
+      expected += a[t + 5] * b[t + 5];
+      expected += a[t + 6] * b[t + 6];
+      expected += a[t + 7] * b[t + 7];
+      t += step;
+    }
+
+  if (f (0x12345, a, b, step, n) != expected)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 8 "vect"  { target vect_sdot_hi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c
new file mode 100644
index 00000000000..e17d6291f75
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c
@@ -0,0 +1,35 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-do compile } */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+
+#include "tree-vect.h"
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res0,
+   SIGNEDNESS_1 int res1,
+   SIGNEDNESS_1 int res2,
+   SIGNEDNESS_1 int res3,
+   SIGNEDNESS_2 short *a,
+   SIGNEDNESS_2 short *b)
+{
+  for (int i = 0; i < 64; i += 4)
+    {
+      res0 += a[i + 0] * b[i + 0];
+      res1 += a[i + 1] * b[i + 1];
+      res2 += a[i + 2] * b[i + 2];
+      res3 += a[i + 3] * b[i + 3];
+    }
+
+  return res0 ^ res1 ^ res2 ^ res3;
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump-not "vectorizing stmts using SLP" "vect" } } */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 20c99f11e9a..b5849dbb08a 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -5322,8 +5322,6 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
   if (!gimple_extract_op (orig_stmt_info->stmt, &op))
     gcc_unreachable ();
 
-  bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
-
   if (reduction_type == EXTRACT_LAST_REDUCTION)
     /* No extra instructions are needed in the prologue.  The loop body
        operations are costed in vectorizable_condition.  */
@@ -5358,12 +5356,8 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
 	   initial result of the data reduction, initial value of the index
 	   reduction.  */
 	prologue_stmts = 4;
-      else if (emulated_mixed_dot_prod)
-	/* We need the initial reduction value and two invariants:
-	   one that contains the minimum signed value and one that
-	   contains half of its negative.  */
-	prologue_stmts = 3;
       else
+	/* We need the initial reduction value.  */
 	prologue_stmts = 1;
       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
 					 scalar_to_vec, stmt_info, 0,
@@ -7464,6 +7458,169 @@ vect_reduction_use_partial_vector (loop_vec_info loop_vinfo,
     }
 }
 
+/* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
+   the context of LOOP_VINFO, and vector cost will be recorded in COST_VEC.
+   Now there are three such kinds of operations: dot-prod/widen-sum/sad
+   (sum-of-absolute-differences).
+
+   For a lane-reducing operation, the loop reduction path that it lies in,
+   may contain normal operation, or other lane-reducing operation of different
+   input type size, an example as:
+
+     int sum = 0;
+     for (i)
+       {
+         ...
+         sum += d0[i] * d1[i];       // dot-prod <vector(16) char>
+         sum += w[i];                // widen-sum <vector(16) char>
+         sum += abs(s0[i] - s1[i]);  // sad <vector(8) short>
+         sum += n[i];                // normal <vector(4) int>
+         ...
+       }
+
+   Vectorization factor is essentially determined by operation whose input
+   vectype has the most lanes ("vector(16) char" in the example), while we
+   need to choose input vectype with the least lanes ("vector(4) int" in the
+   example) for the reduction PHI statement.  */
+
+bool
+vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
+			    slp_tree slp_node, stmt_vector_for_cost *cost_vec)
+{
+  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
+  if (!stmt)
+    return false;
+
+  enum tree_code code = gimple_assign_rhs_code (stmt);
+
+  if (!lane_reducing_op_p (code))
+    return false;
+
+  tree type = TREE_TYPE (gimple_assign_lhs (stmt));
+
+  if (!INTEGRAL_TYPE_P (type) && !SCALAR_FLOAT_TYPE_P (type))
+    return false;
+
+  /* Do not try to vectorize bit-precision reductions.  */
+  if (!type_has_mode_precision_p (type))
+    return false;
+
+  tree vectype_in = NULL_TREE;
+
+  for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
+    {
+      stmt_vec_info def_stmt_info;
+      slp_tree slp_op;
+      tree op;
+      tree vectype;
+      enum vect_def_type dt;
+
+      if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_node, i, &op,
+			       &slp_op, &dt, &vectype, &def_stmt_info))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "use not simple.\n");
+	  return false;
+	}
+
+      if (!vectype)
+	{
+	  vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
+						 slp_op);
+	  if (!vectype)
+	    return false;
+	}
+
+      if (slp_node && !vect_maybe_update_slp_op_vectype (slp_op, vectype))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "incompatible vector types for invariants\n");
+	  return false;
+	}
+
+      if (i == STMT_VINFO_REDUC_IDX (stmt_info))
+	continue;
+
+      /* There should be at most one cycle def in the stmt.  */
+      if (VECTORIZABLE_CYCLE_DEF (dt))
+	return false;
+
+      /* To properly compute ncopies we are interested in the widest
+	 non-reduction input type in case we're looking at a widening
+	 accumulation that we later handle in vect transformation.  */
+      if (!vectype_in
+	  || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
+	      < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype)))))
+	vectype_in = vectype;
+    }
+
+  STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in;
+
+  stmt_vec_info reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
+
+  /* TODO: Support lane-reducing operation that does not directly participate
+     in loop reduction.  */
+  if (!reduc_info || STMT_VINFO_REDUC_IDX (stmt_info) < 0)
+    return false;
+
+  /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not
+     recognized.  */
+  gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_reduction_def);
+  gcc_assert (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION);
+
+  tree vphi_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
+
+  /* To accommodate lane-reducing operations of mixed input vectypes, choose
+     input vectype with the least lanes for the reduction PHI statement, which
+     would result in the most ncopies for vectorized reduction results.  */
+  if (!vphi_vectype_in
+      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
+	  > GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vphi_vectype_in)))))
+    STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
+
+  int ncopies_for_cost;
+
+  if (slp_node)
+    {
+      /* Now lane-reducing operations in an SLP node should only come from
+	 the same loop reduction path.  */
+      gcc_assert (REDUC_GROUP_FIRST_ELEMENT (stmt_info));
+      ncopies_for_cost = 1;
+    }
+  else
+    {
+      ncopies_for_cost = vect_get_num_copies (loop_vinfo, vectype_in);
+      gcc_assert (ncopies_for_cost >= 1);
+    }
+
+  if (vect_is_emulated_mixed_dot_prod (stmt_info))
+    {
+      /* We need extra two invariants: one that contains the minimum signed
+	 value and one that contains half of its negative.  */
+      int prologue_stmts = 2;
+      unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
+					scalar_to_vec, stmt_info, 0,
+					vect_prologue);
+      if (dump_enabled_p ())
+	dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
+		     "extra prologue_cost = %d .\n", cost);
+
+      /* Three dot-products and a subtraction.  */
+      ncopies_for_cost *= 4;
+    }
+
+  record_stmt_cost (cost_vec, ncopies_for_cost, vector_stmt, stmt_info, 0,
+		    vect_body);
+
+  vect_reduction_use_partial_vector (loop_vinfo, reduc_info, slp_node, code,
+				     type, vectype_in);
+
+  STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
+  return true;
+}
+
 /* Function vectorizable_reduction.
 
    Check if STMT_INFO performs a reduction operation that can be vectorized.
@@ -7609,6 +7766,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 			       (gimple_bb (reduc_def_phi)->loop_father));
   unsigned reduc_chain_length = 0;
   bool only_slp_reduc_chain = true;
+  bool only_lane_reducing = true;
   stmt_info = NULL;
   slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
   while (reduc_def != PHI_RESULT (reduc_def_phi))
@@ -7659,9 +7817,16 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	      return false;
 	    }
 	}
-      else if (!stmt_info)
-	/* First non-conversion stmt.  */
-	stmt_info = vdef;
+      else
+	{
+	  /* First non-conversion stmt.  */
+	  if (!stmt_info)
+	    stmt_info = vdef;
+
+	  if (!lane_reducing_op_p (op.code))
+	    only_lane_reducing = false;
+	}
+
       reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
       reduc_chain_length++;
       if (!stmt_info && slp_node)
@@ -7733,18 +7898,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   if (!type_has_mode_precision_p (op.type))
     return false;
 
-  /* For lane-reducing ops we're reducing the number of reduction PHIs
-     which means the only use of that may be in the lane-reducing operation.  */
-  if (lane_reducing
-      && reduc_chain_length != 1
-      && !only_slp_reduc_chain)
-    {
-      if (dump_enabled_p ())
-	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			 "lane-reducing reduction with extra stmts.\n");
-      return false;
-    }
-
   /* Lane-reducing ops also never can be used in a SLP reduction group
      since we'll mix lanes belonging to different reductions.  But it's
      OK to use them in a reduction chain or when the reduction group
@@ -7788,9 +7941,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 			     "use not simple.\n");
 	  return false;
 	}
-      if (i == STMT_VINFO_REDUC_IDX (stmt_info))
-	continue;
-
       /* For an IFN_COND_OP we might hit the reduction definition operand
 	 twice (once as definition, once as else).  */
       if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
@@ -7836,17 +7986,21 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
     }
   if (!vectype_in)
     vectype_in = STMT_VINFO_VECTYPE (phi_info);
-  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
 
-  /* Each lane-reducing operation has its own input vectype, while reduction
-     PHI records the input vectype with least lanes.  */
-  if (lane_reducing)
-    STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in;
-
-  enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
-  STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
+  /* If there is a normal (non-lane-reducing) operation in the loop reduction
+     path, to ensure there will be enough copies to hold vectorized results of
+     the operation, we need set the input vectype of the reduction PHI to be
+     same as the reduction output vectype somewhere, here is a suitable place.
+     Otherwise the input vectype is set to the one with the least lanes, which
+     can only be determined in vectorizable analysis routine of lane-reducing
+     operation.  */
+  if (!only_lane_reducing)
+    STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = STMT_VINFO_VECTYPE (phi_info);
+
+  enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info);
+  STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type;
   /* If we have a condition reduction, see if we can simplify it further.  */
-  if (v_reduc_type == COND_REDUCTION)
+  if (reduction_type == COND_REDUCTION)
     {
       if (slp_node)
 	return false;
@@ -8012,8 +8166,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
     }
 
   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
+  reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
 
-  vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
   if (reduction_type == TREE_CODE_REDUCTION)
     {
       /* Check whether it's ok to change the order of the computation.
@@ -8287,14 +8441,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
       && loop_vinfo->suggested_unroll_factor == 1)
     single_defuse_cycle = true;
 
-  if (single_defuse_cycle || lane_reducing)
+  if (single_defuse_cycle && !lane_reducing)
     {
       gcc_assert (op.code != COND_EXPR);
 
-      /* 4. Supportable by target?  */
-      bool ok = true;
-
-      /* 4.1. check support for the operation in the loop
+      /* 4. check support for the operation in the loop
 
 	 This isn't necessary for the lane reduction codes, since they
 	 can only be produced by pattern matching, and it's up to the
@@ -8303,14 +8454,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	 mixed-sign dot-products can be implemented using signed
 	 dot-products.  */
       machine_mode vec_mode = TYPE_MODE (vectype_in);
-      if (!lane_reducing
-	  && !directly_supported_p (op.code, vectype_in, optab_vector))
+      if (!directly_supported_p (op.code, vectype_in, optab_vector))
         {
           if (dump_enabled_p ())
             dump_printf (MSG_NOTE, "op not supported by target.\n");
 	  if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
 	      || !vect_can_vectorize_without_simd_p (op.code))
-	    ok = false;
+	    single_defuse_cycle = false;
 	  else
 	    if (dump_enabled_p ())
 	      dump_printf (MSG_NOTE, "proceeding using word mode.\n");
@@ -8323,35 +8473,12 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	    dump_printf (MSG_NOTE, "using word mode not possible.\n");
 	  return false;
 	}
-
-      /* lane-reducing operations have to go through vect_transform_reduction.
-         For the other cases try without the single cycle optimization.  */
-      if (!ok)
-	{
-	  if (lane_reducing)
-	    return false;
-	  else
-	    single_defuse_cycle = false;
-	}
     }
   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
 
-  /* If the reduction stmt is one of the patterns that have lane
-     reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
-  if ((ncopies > 1 && ! single_defuse_cycle)
-      && lane_reducing)
-    {
-      if (dump_enabled_p ())
-	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			 "multi def-use cycle not possible for lane-reducing "
-			 "reduction operation\n");
-      return false;
-    }
-
-  if (slp_node
-      && !(!single_defuse_cycle
-	   && !lane_reducing
-	   && reduction_type != FOLD_LEFT_REDUCTION))
+  /* Reduction type of lane-reducing operation is TREE_CODE_REDUCTION, the
+     below processing will be done in its own vectorizable function.  */
+  if (slp_node && reduction_type == FOLD_LEFT_REDUCTION)
     for (i = 0; i < (int) op.num_ops; i++)
       if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
 	{
@@ -8364,28 +8491,21 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
 			     reduction_type, ncopies, cost_vec);
   /* Cost the reduction op inside the loop if transformed via
-     vect_transform_reduction.  Otherwise this is costed by the
-     separate vectorizable_* routines.  */
-  if (single_defuse_cycle || lane_reducing)
-    {
-      int factor = 1;
-      if (vect_is_emulated_mixed_dot_prod (stmt_info))
-	/* Three dot-products and a subtraction.  */
-	factor = 4;
-      record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
-			stmt_info, 0, vect_body);
-    }
+     vect_transform_reduction for non-lane-reducing operation.  Otherwise
+     this is costed by the separate vectorizable_* routines.  */
+  if (single_defuse_cycle && !lane_reducing)
+    record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
 
   if (dump_enabled_p ()
       && reduction_type == FOLD_LEFT_REDUCTION)
     dump_printf_loc (MSG_NOTE, vect_location,
 		     "using an in-order (fold-left) reduction.\n");
   STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
-  /* All but single defuse-cycle optimized, lane-reducing and fold-left
-     reductions go through their own vectorizable_* routines.  */
-  if (!single_defuse_cycle
-      && !lane_reducing
-      && reduction_type != FOLD_LEFT_REDUCTION)
+
+  /* All but single defuse-cycle optimized and fold-left reductions go
+     through their own vectorizable_* routines.  */
+  if ((!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
+      || lane_reducing)
     {
       stmt_vec_info tem
 	= vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
@@ -8490,6 +8610,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   int i;
   int ncopies;
+  int stmt_ncopies;
   int vec_num;
 
   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
@@ -8513,15 +8634,28 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
   int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
+  tree stmt_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info);
+
+  /* Get input vectypes from the reduction PHI and the statement to be
+     transformed, these two vectypes may have different lanes when
+     lane-reducing operation is present.  */
+  if (!vectype_in)
+    vectype_in = STMT_VINFO_REDUC_VECTYPE (reduc_info);
+
+  if (!stmt_vectype_in)
+    stmt_vectype_in = STMT_VINFO_VECTYPE (stmt_info);
 
   if (slp_node)
     {
       ncopies = 1;
+      stmt_ncopies = 1;
       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
     }
   else
     {
       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
+      stmt_ncopies = vect_get_num_copies (loop_vinfo, stmt_vectype_in);
+      gcc_assert (stmt_ncopies >= 1 && stmt_ncopies <= ncopies);
       vec_num = 1;
     }
 
@@ -8530,14 +8664,10 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 
   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
   vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
-  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
-
+  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn,
+						    stmt_vectype_in);
   /* Transform.  */
-  tree new_temp = NULL_TREE;
-  auto_vec<tree> vec_oprnds0;
-  auto_vec<tree> vec_oprnds1;
-  auto_vec<tree> vec_oprnds2;
-  tree def0;
+  auto_vec<tree> vec_oprnds[3];
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
@@ -8561,8 +8691,6 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 		      == op.ops[internal_fn_else_index ((internal_fn) code)]));
     }
 
-  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
-
   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
   if (reduction_type == FOLD_LEFT_REDUCTION)
     {
@@ -8570,7 +8698,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
       gcc_assert (code.is_tree_code () || cond_fn_p);
       return vectorize_fold_left_reduction
 	  (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
-	   code, reduc_fn, op.ops, op.num_ops, vectype_in,
+	   code, reduc_fn, op.ops, op.num_ops, stmt_vectype_in,
 	   reduc_index, masks, lens);
     }
 
@@ -8581,55 +8709,121 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
   tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
 
-  /* Get NCOPIES vector definitions for all operands except the reduction
-     definition.  */
-  if (!cond_fn_p)
+  gcc_assert (reduc_index < 3);
+
+  if (slp_node)
     {
-      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
-			 single_defuse_cycle && reduc_index == 0
-			 ? NULL_TREE : op.ops[0], &vec_oprnds0,
-			 single_defuse_cycle && reduc_index == 1
-			 ? NULL_TREE : op.ops[1], &vec_oprnds1,
-			 op.num_ops == 3
-			 && !(single_defuse_cycle && reduc_index == 2)
-			 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
+      gcc_assert (!single_defuse_cycle && op.num_ops <= 3);
+
+      for (i = 0; i < (int) op.num_ops; i++)
+	vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds[i]);
     }
   else
     {
-      /* For a conditional operation pass the truth type as mask
-	 vectype.  */
-      gcc_assert (single_defuse_cycle
-		  && (reduc_index == 1 || reduc_index == 2));
-      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
-			 op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
-			 reduc_index == 1 ? NULL_TREE : op.ops[1],
-			 NULL_TREE, &vec_oprnds1,
-			 reduc_index == 2 ? NULL_TREE : op.ops[2],
-			 NULL_TREE, &vec_oprnds2);
-    }
+      /* The input vectype of the reduction PHI determines copies of
+	 vectorized def-use cycles, which might be more than effective copies
+	 of vectorized lane-reducing reduction statements.  This could be
+	 complemented by generating extra trivial pass-through copies.  For
+	 example:
+
+	   int sum = 0;
+	   for (i)
+	     {
+	       sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
+	       sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
+	       sum += n[i];               // normal <vector(4) int>
+	     }
+
+	 The vector size is 128-bit, and the vectorization factor is 16.
+	 Reduction statements would be transformed as:
+
+	   vector<4> int sum_v0 = { 0, 0, 0, 0 };
+	   vector<4> int sum_v1 = { 0, 0, 0, 0 };
+	   vector<4> int sum_v2 = { 0, 0, 0, 0 };
+	   vector<4> int sum_v3 = { 0, 0, 0, 0 };
+
+	   for (i / 16)
+	     {
+	       sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
+	       sum_v1 = sum_v1;  // copy
+	       sum_v2 = sum_v2;  // copy
+	       sum_v3 = sum_v3;  // copy
+
+	       sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
+	       sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
+	       sum_v2 = sum_v2;  // copy
+	       sum_v3 = sum_v3;  // copy
+
+	       sum_v0 += n_v0[i: 0  ~ 3 ];
+	       sum_v1 += n_v1[i: 4  ~ 7 ];
+	       sum_v2 += n_v2[i: 8  ~ 11];
+	       sum_v3 += n_v3[i: 12 ~ 15];
+	     }
+	*/
+
+      for (i = 0; i < MIN (3, (int) op.num_ops); i++)
+	{
+	  tree vectype = NULL_TREE;
+	  int used_ncopies = ncopies;
+
+	  if (cond_fn_p && i == 0)
+	    {
+	      /* For a conditional operation pass the truth type as mask
+		 vectype.  */
+	      gcc_assert (single_defuse_cycle && reduc_index > 0);
+	      vectype = truth_type_for (vectype_in);
+	    }
 
-  /* For single def-use cycles get one copy of the vectorized reduction
-     definition.  */
-  if (single_defuse_cycle)
-    {
-      gcc_assert (!slp_node);
-      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
-				     op.ops[reduc_index],
-				     reduc_index == 0 ? &vec_oprnds0
-				     : (reduc_index == 1 ? &vec_oprnds1
-					: &vec_oprnds2));
+	  if (i != reduc_index)
+	    {
+	      /* For non-reduction operand, deduce effective copies that are
+		 involved in vectorized def-use cycles based on the input
+		 vectype of the reduction statement.  */
+	      used_ncopies = stmt_ncopies;
+	    }
+	  else if (single_defuse_cycle)
+	    {
+	      /* For single def-use cycles get one copy of the vectorized
+		 reduction definition.  */
+	      used_ncopies = 1;
+	    }
+
+	  vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, used_ncopies,
+					 op.ops[i], &vec_oprnds[i], vectype);
+
+	  if (used_ncopies < ncopies)
+	    vec_oprnds[i].safe_grow_cleared (ncopies);
+	}
     }
 
+  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
   bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
+  tree def0;
 
-  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
+  FOR_EACH_VEC_ELT (vec_oprnds[0], i, def0)
     {
       gimple *new_stmt;
-      tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
-      if (masked_loop_p && !mask_by_cond_expr)
+      tree new_temp = NULL_TREE;
+      tree vop[3] = { def0, vec_oprnds[1][i], NULL_TREE };
+
+      if (!vop[0] || !vop[1])
+	{
+	  tree reduc_vop = vec_oprnds[reduc_index][i];
+
+	  /* Insert trivial copy if no need to generate vectorized
+	     statement.  */
+	  gcc_assert (reduc_vop && stmt_ncopies < ncopies);
+
+	  new_stmt = gimple_build_assign (vec_dest, reduc_vop);
+	  new_temp = make_ssa_name (vec_dest, new_stmt);
+	  gimple_set_lhs (new_stmt, new_temp);
+	  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
+	}
+      else if (masked_loop_p && !mask_by_cond_expr)
 	{
-	  /* No conditional ifns have been defined for dot-product yet.  */
-	  gcc_assert (code != DOT_PROD_EXPR);
+	  /* No conditional ifns have been defined for dot-product and sad
+	     yet.  */
+	  gcc_assert (code != DOT_PROD_EXPR && code != SAD_EXPR);
 
 	  /* Make sure that the reduction accumulator is vop[0].  */
 	  if (reduc_index == 1)
@@ -8638,7 +8832,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 	      std::swap (vop[0], vop[1]);
 	    }
 	  tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
-					  vec_num * ncopies, vectype_in, i);
+					  vec_num * stmt_ncopies,
+					  stmt_vectype_in, i);
 	  gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
 						    vop[0], vop[1], vop[0]);
 	  new_temp = make_ssa_name (vec_dest, call);
@@ -8650,12 +8845,13 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
       else
 	{
 	  if (op.num_ops >= 3)
-	    vop[2] = vec_oprnds2[i];
+	    vop[2] = vec_oprnds[2][i];
 
 	  if (masked_loop_p && mask_by_cond_expr)
 	    {
 	      tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
-					      vec_num * ncopies, vectype_in, i);
+					      vec_num * stmt_ncopies,
+					      stmt_vectype_in, i);
 	      build_vect_cond_expr (code, vop, mask, gsi);
 	    }
 
@@ -8682,16 +8878,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 
       if (slp_node)
 	slp_node->push_vec_def (new_stmt);
-      else if (single_defuse_cycle
-	       && i < ncopies - 1)
-	{
-	  if (reduc_index == 0)
-	    vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
-	  else if (reduc_index == 1)
-	    vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
-	  else if (reduc_index == 2)
-	    vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
-	}
+      else if (single_defuse_cycle && i < ncopies - 1)
+	vec_oprnds[reduc_index][i + 1] = gimple_get_lhs (new_stmt);
       else
 	STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
     }
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 2e0be763abb..cc0a832f71b 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -13296,6 +13296,8 @@ vect_analyze_stmt (vec_info *vinfo,
 				      NULL, NULL, node, cost_vec)
 	  || vectorizable_load (vinfo, stmt_info, NULL, NULL, node, cost_vec)
 	  || vectorizable_store (vinfo, stmt_info, NULL, NULL, node, cost_vec)
+	  || vectorizable_lane_reducing (as_a <loop_vec_info> (vinfo),
+					 stmt_info, node, cost_vec)
 	  || vectorizable_reduction (as_a <loop_vec_info> (vinfo), stmt_info,
 				     node, node_instance, cost_vec)
 	  || vectorizable_induction (as_a <loop_vec_info> (vinfo), stmt_info,
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 97ec9c341e7..ca810869592 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2443,6 +2443,8 @@ extern loop_vec_info vect_create_loop_vinfo (class loop *, vec_info_shared *,
 extern bool vectorizable_live_operation (vec_info *, stmt_vec_info,
 					 slp_tree, slp_instance, int,
 					 bool, stmt_vector_for_cost *);
+extern bool vectorizable_lane_reducing (loop_vec_info, stmt_vec_info,
+					slp_tree, stmt_vector_for_cost *);
 extern bool vectorizable_reduction (loop_vec_info, stmt_vec_info,
 				    slp_tree, slp_instance,
 				    stmt_vector_for_cost *);
-- 
2.17.1

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0005-vect-Support-multiple-lane-reducing-operations-for-l.patch --]
[-- Type: text/x-patch; name="0005-vect-Support-multiple-lane-reducing-operations-for-l.patch", Size: 44189 bytes --]

From 61c8a5c4e3bd3362b60f99cb8c0a3fd1d484014c Mon Sep 17 00:00:00 2001
From: Feng Xue <fxue@os.amperecomputing.com>
Date: Wed, 29 May 2024 17:22:36 +0800
Subject: [PATCH 5/6] vect: Support multiple lane-reducing operations for loop
 reduction [PR114440]
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For a lane-reducing operation (dot-prod/widen-sum/sad) in loop reduction, the
current vectorizer can only handle the pattern if the reduction chain does not
contain any other operation, whether normal or lane-reducing.

Actually, to allow multiple arbitrary lane-reducing operations, we need to
support vectorization of a loop reduction chain with mixed input vectypes. Since
the lanes of a vectype may vary with the operation, the effective ncopies of
vectorized statements for each operation may also differ, which causes a
mismatch in the vectorized def-use cycles. A simple way is to align all
operations with the one that has the most ncopies; the gap can be filled by
generating extra trivial pass-through copies. For example:

   int sum = 0;
   for (i)
     {
       sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
       sum += w[i];               // widen-sum <vector(16) char>
       sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
       sum += n[i];               // normal <vector(4) int>
     }

The vector size is 128-bit, and the vectorization factor is 16. Reduction
statements would be transformed as:

   vector<4> int sum_v0 = { 0, 0, 0, 0 };
   vector<4> int sum_v1 = { 0, 0, 0, 0 };
   vector<4> int sum_v2 = { 0, 0, 0, 0 };
   vector<4> int sum_v3 = { 0, 0, 0, 0 };

   for (i / 16)
     {
       sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
       sum_v1 = sum_v1;  // copy
       sum_v2 = sum_v2;  // copy
       sum_v3 = sum_v3;  // copy

       sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0);
       sum_v1 = sum_v1;  // copy
       sum_v2 = sum_v2;  // copy
       sum_v3 = sum_v3;  // copy

       sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
       sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
       sum_v2 = sum_v2;  // copy
       sum_v3 = sum_v3;  // copy

       sum_v0 += n_v0[i: 0  ~ 3 ];
       sum_v1 += n_v1[i: 4  ~ 7 ];
       sum_v2 += n_v2[i: 8  ~ 11];
       sum_v3 += n_v3[i: 12 ~ 15];
     }

2024-03-22 Feng Xue <fxue@os.amperecomputing.com>

gcc/
	PR tree-optimization/114440
	* tree-vectorizer.h (vectorizable_lane_reducing): New function
	declaration.
	* tree-vect-stmts.cc (vect_analyze_stmt): Call new function
	vectorizable_lane_reducing to analyze lane-reducing operation.
	* tree-vect-loop.cc (vect_model_reduction_cost): Remove cost computation
	code related to	emulated_mixed_dot_prod.
	(vectorizable_lane_reducing): New function.
	(vectorizable_reduction): Allow multiple lane-reducing operations in
	loop reduction. Move some original lane-reducing related code to
	vectorizable_lane_reducing.
	(vect_transform_reduction): Extend transformation to support reduction
	statements with mixed input vectypes.

gcc/testsuite/
	PR tree-optimization/114440
	* gcc.dg/vect/vect-reduc-chain-1.c
	* gcc.dg/vect/vect-reduc-chain-2.c
	* gcc.dg/vect/vect-reduc-chain-3.c
	* gcc.dg/vect/vect-reduc-chain-dot-slp-1.c
	* gcc.dg/vect/vect-reduc-chain-dot-slp-2.c
	* gcc.dg/vect/vect-reduc-dot-slp-1.c
---
 .../gcc.dg/vect/vect-reduc-chain-1.c          |  62 +++
 .../gcc.dg/vect/vect-reduc-chain-2.c          |  77 +++
 .../gcc.dg/vect/vect-reduc-chain-3.c          |  66 +++
 .../gcc.dg/vect/vect-reduc-chain-dot-slp-1.c  |  97 ++++
 .../gcc.dg/vect/vect-reduc-chain-dot-slp-2.c  |  81 +++
 .../gcc.dg/vect/vect-reduc-dot-slp-1.c        |  35 ++
 gcc/tree-vect-loop.cc                         | 478 ++++++++++++------
 gcc/tree-vect-stmts.cc                        |   2 +
 gcc/tree-vectorizer.h                         |   2 +
 9 files changed, 755 insertions(+), 145 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
new file mode 100644
index 00000000000..04bfc419dbd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
@@ -0,0 +1,62 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res,
+   SIGNEDNESS_2 char *restrict a,
+   SIGNEDNESS_2 char *restrict b,
+   SIGNEDNESS_2 char *restrict c,
+   SIGNEDNESS_2 char *restrict d,
+   SIGNEDNESS_1 int *restrict e)
+{
+  for (int i = 0; i < N; ++i)
+    {
+      res += a[i] * b[i];
+      res += c[i] * d[i];
+      res += e[i];
+    }
+  return res;
+}
+
+#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
+#define OFFSET 20
+
+int
+main (void)
+{
+  check_vect ();
+
+  SIGNEDNESS_2 char a[N], b[N];
+  SIGNEDNESS_2 char c[N], d[N];
+  SIGNEDNESS_1 int e[N];
+  int expected = 0x12345;
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 5;
+      b[i] = BASE + OFFSET + i * 4;
+      c[i] = BASE + i * 2;
+      d[i] = BASE + OFFSET + i * 3;
+      e[i] = i;
+      asm volatile ("" ::: "memory");
+      expected += a[i] * b[i];
+      expected += c[i] * d[i];
+      expected += e[i];
+    }
+  if (f (0x12345, a, b, c, d, e) != expected)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 2 "vect" { target vect_sdot_qi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c
new file mode 100644
index 00000000000..6c803b80120
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c
@@ -0,0 +1,77 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 unsigned
+#define SIGNEDNESS_3 signed
+#define SIGNEDNESS_4 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+fn (SIGNEDNESS_1 int res,
+   SIGNEDNESS_2 char *restrict a,
+   SIGNEDNESS_2 char *restrict b,
+   SIGNEDNESS_3 char *restrict c,
+   SIGNEDNESS_3 char *restrict d,
+   SIGNEDNESS_4 short *restrict e,
+   SIGNEDNESS_4 short *restrict f,
+   SIGNEDNESS_1 int *restrict g)
+{
+  for (int i = 0; i < N; ++i)
+    {
+      res += a[i] * b[i];
+      res += i + 1;
+      res += c[i] * d[i];
+      res += e[i] * f[i];
+      res += g[i];
+    }
+  return res;
+}
+
+#define BASE2 ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
+#define BASE3 ((SIGNEDNESS_3 int) -1 < 0 ? -126 : 4)
+#define BASE4 ((SIGNEDNESS_4 int) -1 < 0 ? -1026 : 373)
+#define OFFSET 20
+
+int
+main (void)
+{
+  check_vect ();
+
+  SIGNEDNESS_2 char a[N], b[N];
+  SIGNEDNESS_3 char c[N], d[N];
+  SIGNEDNESS_4 short e[N], f[N];
+  SIGNEDNESS_1 int g[N];
+  int expected = 0x12345;
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE2 + i * 5;
+      b[i] = BASE2 + OFFSET + i * 4;
+      c[i] = BASE3 + i * 2;
+      d[i] = BASE3 + OFFSET + i * 3;
+      e[i] = BASE4 + i * 6;
+      f[i] = BASE4 + OFFSET + i * 5;
+      g[i] = i;
+      asm volatile ("" ::: "memory");
+      expected += a[i] * b[i];
+      expected += i + 1;
+      expected += c[i] * d[i];
+      expected += e[i] * f[i];
+      expected += g[i];
+    }
+  if (fn (0x12345, a, b, c, d, e, f, g) != expected)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_sdot_qi } } } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_udot_qi } } } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_sdot_hi } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c
new file mode 100644
index 00000000000..a41e4b176c4
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c
@@ -0,0 +1,66 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 unsigned
+#define SIGNEDNESS_3 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res,
+   SIGNEDNESS_2 char *restrict a,
+   SIGNEDNESS_2 char *restrict b,
+   SIGNEDNESS_3 short *restrict c,
+   SIGNEDNESS_3 short *restrict d,
+   SIGNEDNESS_1 int *restrict e)
+{
+  for (int i = 0; i < N; ++i)
+    {
+      short diff = a[i] - b[i];
+      SIGNEDNESS_2 short abs = diff < 0 ? -diff : diff;
+      res += abs;
+      res += c[i] * d[i];
+      res += e[i];
+    }
+  return res;
+}
+
+#define BASE2 ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
+#define BASE3 ((SIGNEDNESS_3 int) -1 < 0 ? -1236 : 373)
+#define OFFSET 20
+
+int
+main (void)
+{
+  check_vect ();
+
+  SIGNEDNESS_2 char a[N], b[N];
+  SIGNEDNESS_3 short c[N], d[N];
+  SIGNEDNESS_1 int e[N];
+  int expected = 0x12345;
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE2 + i * 5;
+      b[i] = BASE2 - i * 4;
+      c[i] = BASE3 + i * 2;
+      d[i] = BASE3 + OFFSET + i * 3;
+      e[i] = i;
+      asm volatile ("" ::: "memory");
+      short diff = a[i] - b[i];
+      SIGNEDNESS_2 short abs = diff < 0 ? -diff : diff;
+      expected += abs;
+      expected += c[i] * d[i];
+      expected += e[i];
+    }
+  if (f (0x12345, a, b, c, d, e) != expected)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = SAD_EXPR" "vect" { target vect_udot_qi } } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target vect_sdot_hi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c
new file mode 100644
index 00000000000..51ef4eaaed8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c
@@ -0,0 +1,97 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res,
+   SIGNEDNESS_2 char *a,
+   SIGNEDNESS_2 char *b,
+   int step, int n)
+{
+  for (int i = 0; i < n; i++)
+    {
+      res += a[0] * b[0];
+      res += a[1] * b[1];
+      res += a[2] * b[2];
+      res += a[3] * b[3];
+      res += a[4] * b[4];
+      res += a[5] * b[5];
+      res += a[6] * b[6];
+      res += a[7] * b[7];
+      res += a[8] * b[8];
+      res += a[9] * b[9];
+      res += a[10] * b[10];
+      res += a[11] * b[11];
+      res += a[12] * b[12];
+      res += a[13] * b[13];
+      res += a[14] * b[14];
+      res += a[15] * b[15];
+
+      a += step;
+      b += step;
+    }
+
+  return res;
+}
+
+#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
+#define OFFSET 20
+
+int
+main (void)
+{
+  check_vect ();
+
+  SIGNEDNESS_2 char a[100], b[100];
+  int expected = 0x12345;
+  int step = 16;
+  int n = 2;
+  int t = 0;
+
+  for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i)
+    {
+      a[i] = BASE + i * 5;
+      b[i] = BASE + OFFSET + i * 4;
+      asm volatile ("" ::: "memory");
+    }
+
+  for (int i = 0; i < n; i++)
+    {
+      asm volatile ("" ::: "memory");
+      expected += a[t + 0] * b[t + 0];
+      expected += a[t + 1] * b[t + 1];
+      expected += a[t + 2] * b[t + 2];
+      expected += a[t + 3] * b[t + 3];
+      expected += a[t + 4] * b[t + 4];
+      expected += a[t + 5] * b[t + 5];
+      expected += a[t + 6] * b[t + 6];
+      expected += a[t + 7] * b[t + 7];
+      expected += a[t + 8] * b[t + 8];
+      expected += a[t + 9] * b[t + 9];
+      expected += a[t + 10] * b[t + 10];
+      expected += a[t + 11] * b[t + 11];
+      expected += a[t + 12] * b[t + 12];
+      expected += a[t + 13] * b[t + 13];
+      expected += a[t + 14] * b[t + 14];
+      expected += a[t + 15] * b[t + 15];
+      t += step;
+    }
+
+  if (f (0x12345, a, b, step, n) != expected)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 16 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c
new file mode 100644
index 00000000000..1532833c3ae
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c
@@ -0,0 +1,81 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res,
+   SIGNEDNESS_2 short *a,
+   SIGNEDNESS_2 short *b,
+   int step, int n)
+{
+  for (int i = 0; i < n; i++)
+    {
+      res += a[0] * b[0];
+      res += a[1] * b[1];
+      res += a[2] * b[2];
+      res += a[3] * b[3];
+      res += a[4] * b[4];
+      res += a[5] * b[5];
+      res += a[6] * b[6];
+      res += a[7] * b[7];
+
+      a += step;
+      b += step;
+    }
+
+  return res;
+}
+
+#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -1026 : 373)
+#define OFFSET 20
+
+int
+main (void)
+{
+  check_vect ();
+
+  SIGNEDNESS_2 short a[100], b[100];
+  int expected = 0x12345;
+  int step = 8;
+  int n = 2;
+  int t = 0;
+
+  for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i)
+    {
+      a[i] = BASE + i * 5;
+      b[i] = BASE + OFFSET + i * 4;
+      asm volatile ("" ::: "memory");
+    }
+
+  for (int i = 0; i < n; i++)
+    {
+      asm volatile ("" ::: "memory");
+      expected += a[t + 0] * b[t + 0];
+      expected += a[t + 1] * b[t + 1];
+      expected += a[t + 2] * b[t + 2];
+      expected += a[t + 3] * b[t + 3];
+      expected += a[t + 4] * b[t + 4];
+      expected += a[t + 5] * b[t + 5];
+      expected += a[t + 6] * b[t + 6];
+      expected += a[t + 7] * b[t + 7];
+      t += step;
+    }
+
+  if (f (0x12345, a, b, step, n) != expected)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 8 "vect"  { target vect_sdot_hi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c
new file mode 100644
index 00000000000..e17d6291f75
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c
@@ -0,0 +1,35 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-do compile } */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+
+#include "tree-vect.h"
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res0,
+   SIGNEDNESS_1 int res1,
+   SIGNEDNESS_1 int res2,
+   SIGNEDNESS_1 int res3,
+   SIGNEDNESS_2 short *a,
+   SIGNEDNESS_2 short *b)
+{
+  for (int i = 0; i < 64; i += 4)
+    {
+      res0 += a[i + 0] * b[i + 0];
+      res1 += a[i + 1] * b[i + 1];
+      res2 += a[i + 2] * b[i + 2];
+      res3 += a[i + 3] * b[i + 3];
+    }
+
+  return res0 ^ res1 ^ res2 ^ res3;
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump-not "vectorizing stmts using SLP" "vect" } } */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 20c99f11e9a..b5849dbb08a 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -5322,8 +5322,6 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
   if (!gimple_extract_op (orig_stmt_info->stmt, &op))
     gcc_unreachable ();
 
-  bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
-
   if (reduction_type == EXTRACT_LAST_REDUCTION)
     /* No extra instructions are needed in the prologue.  The loop body
        operations are costed in vectorizable_condition.  */
@@ -5358,12 +5356,8 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
 	   initial result of the data reduction, initial value of the index
 	   reduction.  */
 	prologue_stmts = 4;
-      else if (emulated_mixed_dot_prod)
-	/* We need the initial reduction value and two invariants:
-	   one that contains the minimum signed value and one that
-	   contains half of its negative.  */
-	prologue_stmts = 3;
       else
+	/* We need the initial reduction value.  */
 	prologue_stmts = 1;
       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
 					 scalar_to_vec, stmt_info, 0,
@@ -7464,6 +7458,169 @@ vect_reduction_use_partial_vector (loop_vec_info loop_vinfo,
     }
 }
 
+/* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
+   the context of LOOP_VINFO, and vector cost will be recorded in COST_VEC.
+   Now there are three such kinds of operations: dot-prod/widen-sum/sad
+   (sum-of-absolute-differences).
+
+   For a lane-reducing operation, the loop reduction path that it lies in,
+   may contain normal operation, or other lane-reducing operation of different
+   input type size, an example as:
+
+     int sum = 0;
+     for (i)
+       {
+         ...
+         sum += d0[i] * d1[i];       // dot-prod <vector(16) char>
+         sum += w[i];                // widen-sum <vector(16) char>
+         sum += abs(s0[i] - s1[i]);  // sad <vector(8) short>
+         sum += n[i];                // normal <vector(4) int>
+         ...
+       }
+
+   Vectorization factor is essentially determined by operation whose input
+   vectype has the most lanes ("vector(16) char" in the example), while we
+   need to choose input vectype with the least lanes ("vector(4) int" in the
+   example) for the reduction PHI statement.  */
+
+bool
+vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
+			    slp_tree slp_node, stmt_vector_for_cost *cost_vec)
+{
+  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
+  if (!stmt)
+    return false;
+
+  enum tree_code code = gimple_assign_rhs_code (stmt);
+
+  if (!lane_reducing_op_p (code))
+    return false;
+
+  tree type = TREE_TYPE (gimple_assign_lhs (stmt));
+
+  if (!INTEGRAL_TYPE_P (type) && !SCALAR_FLOAT_TYPE_P (type))
+    return false;
+
+  /* Do not try to vectorize bit-precision reductions.  */
+  if (!type_has_mode_precision_p (type))
+    return false;
+
+  tree vectype_in = NULL_TREE;
+
+  for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
+    {
+      stmt_vec_info def_stmt_info;
+      slp_tree slp_op;
+      tree op;
+      tree vectype;
+      enum vect_def_type dt;
+
+      if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_node, i, &op,
+			       &slp_op, &dt, &vectype, &def_stmt_info))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "use not simple.\n");
+	  return false;
+	}
+
+      if (!vectype)
+	{
+	  vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
+						 slp_op);
+	  if (!vectype)
+	    return false;
+	}
+
+      if (slp_node && !vect_maybe_update_slp_op_vectype (slp_op, vectype))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "incompatible vector types for invariants\n");
+	  return false;
+	}
+
+      if (i == STMT_VINFO_REDUC_IDX (stmt_info))
+	continue;
+
+      /* There should be at most one cycle def in the stmt.  */
+      if (VECTORIZABLE_CYCLE_DEF (dt))
+	return false;
+
+      /* To properly compute ncopies we are interested in the widest
+	 non-reduction input type in case we're looking at a widening
+	 accumulation that we later handle in vect transformation.  */
+      if (!vectype_in
+	  || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
+	      < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype)))))
+	vectype_in = vectype;
+    }
+
+  STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in;
+
+  stmt_vec_info reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
+
+  /* TODO: Support lane-reducing operation that does not directly participate
+     in loop reduction. */
+  if (!reduc_info || STMT_VINFO_REDUC_IDX (stmt_info) < 0)
+    return false;
+
+  /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not
+     recognized.  */
+  gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_reduction_def);
+  gcc_assert (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION);
+
+  tree vphi_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
+
+  /* To accommodate lane-reducing operations of mixed input vectypes, choose
+     input vectype with the least lanes for the reduction PHI statement, which
+     would result in the most ncopies for vectorized reduction results.  */
+  if (!vphi_vectype_in
+      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
+	  > GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vphi_vectype_in)))))
+    STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
+
+  int ncopies_for_cost;
+
+  if (slp_node)
+    {
+      /* Now lane-reducing operations in a slp node should only come from
+	 the same loop reduction path.  */
+      gcc_assert (REDUC_GROUP_FIRST_ELEMENT (stmt_info));
+      ncopies_for_cost = 1;
+    }
+  else
+    {
+      ncopies_for_cost = vect_get_num_copies (loop_vinfo, vectype_in);
+      gcc_assert (ncopies_for_cost >= 1);
+    }
+
+  if (vect_is_emulated_mixed_dot_prod (stmt_info))
+    {
+      /* We need extra two invariants: one that contains the minimum signed
+	 value and one that contains half of its negative.  */
+      int prologue_stmts = 2;
+      unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
+					scalar_to_vec, stmt_info, 0,
+					vect_prologue);
+      if (dump_enabled_p ())
+	dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
+		     "extra prologue_cost = %d .\n", cost);
+
+      /* Three dot-products and a subtraction.  */
+      ncopies_for_cost *= 4;
+    }
+
+  record_stmt_cost (cost_vec, ncopies_for_cost, vector_stmt, stmt_info, 0,
+		    vect_body);
+
+  vect_reduction_use_partial_vector (loop_vinfo, reduc_info, slp_node, code,
+				     type, vectype_in);
+
+  STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
+  return true;
+}
+
 /* Function vectorizable_reduction.
 
    Check if STMT_INFO performs a reduction operation that can be vectorized.
@@ -7609,6 +7766,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 			       (gimple_bb (reduc_def_phi)->loop_father));
   unsigned reduc_chain_length = 0;
   bool only_slp_reduc_chain = true;
+  bool only_lane_reducing = true;
   stmt_info = NULL;
   slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
   while (reduc_def != PHI_RESULT (reduc_def_phi))
@@ -7659,9 +7817,16 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	      return false;
 	    }
 	}
-      else if (!stmt_info)
-	/* First non-conversion stmt.  */
-	stmt_info = vdef;
+      else
+	{
+	  /* First non-conversion stmt.  */
+	  if (!stmt_info)
+	    stmt_info = vdef;
+
+	  if (!lane_reducing_op_p (op.code))
+	    only_lane_reducing = false;
+	}
+
       reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
       reduc_chain_length++;
       if (!stmt_info && slp_node)
@@ -7733,18 +7898,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   if (!type_has_mode_precision_p (op.type))
     return false;
 
-  /* For lane-reducing ops we're reducing the number of reduction PHIs
-     which means the only use of that may be in the lane-reducing operation.  */
-  if (lane_reducing
-      && reduc_chain_length != 1
-      && !only_slp_reduc_chain)
-    {
-      if (dump_enabled_p ())
-	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			 "lane-reducing reduction with extra stmts.\n");
-      return false;
-    }
-
   /* Lane-reducing ops also never can be used in a SLP reduction group
      since we'll mix lanes belonging to different reductions.  But it's
      OK to use them in a reduction chain or when the reduction group
@@ -7788,9 +7941,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 			     "use not simple.\n");
 	  return false;
 	}
-      if (i == STMT_VINFO_REDUC_IDX (stmt_info))
-	continue;
-
       /* For an IFN_COND_OP we might hit the reduction definition operand
 	 twice (once as definition, once as else).  */
       if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
@@ -7836,17 +7986,21 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
     }
   if (!vectype_in)
     vectype_in = STMT_VINFO_VECTYPE (phi_info);
-  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
 
-  /* Each lane-reducing operation has its own input vectype, while reduction
-     PHI records the input vectype with least lanes.  */
-  if (lane_reducing)
-    STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in;
-
-  enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
-  STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
+  /* If there is a normal (non-lane-reducing) operation in the loop reduction
+     path, to ensure there will be enough copies to hold vectorized results of
+     the operation, we need to set the input vectype of the reduction PHI to
+     be same as the reduction output vectype somewhere, here is a suitable
+     place.  Otherwise the input vectype is set to the one with the least
+     lanes, which can only be determined in vectorizable analysis routine of
+     lane-reducing operation.  */
+  if (!only_lane_reducing)
+    STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = STMT_VINFO_VECTYPE (phi_info);
+
+  enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info);
+  STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type;
   /* If we have a condition reduction, see if we can simplify it further.  */
-  if (v_reduc_type == COND_REDUCTION)
+  if (reduction_type == COND_REDUCTION)
     {
       if (slp_node)
 	return false;
@@ -8012,8 +8166,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
     }
 
   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
+  reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
 
-  vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
   if (reduction_type == TREE_CODE_REDUCTION)
     {
       /* Check whether it's ok to change the order of the computation.
@@ -8287,14 +8441,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
       && loop_vinfo->suggested_unroll_factor == 1)
     single_defuse_cycle = true;
 
-  if (single_defuse_cycle || lane_reducing)
+  if (single_defuse_cycle && !lane_reducing)
     {
       gcc_assert (op.code != COND_EXPR);
 
-      /* 4. Supportable by target?  */
-      bool ok = true;
-
-      /* 4.1. check support for the operation in the loop
+      /* 4. check support for the operation in the loop
 
 	 This isn't necessary for the lane reduction codes, since they
 	 can only be produced by pattern matching, and it's up to the
@@ -8303,14 +8454,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	 mixed-sign dot-products can be implemented using signed
 	 dot-products.  */
       machine_mode vec_mode = TYPE_MODE (vectype_in);
-      if (!lane_reducing
-	  && !directly_supported_p (op.code, vectype_in, optab_vector))
+      if (!directly_supported_p (op.code, vectype_in, optab_vector))
         {
           if (dump_enabled_p ())
             dump_printf (MSG_NOTE, "op not supported by target.\n");
 	  if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
 	      || !vect_can_vectorize_without_simd_p (op.code))
-	    ok = false;
+	    single_defuse_cycle = false;
 	  else
 	    if (dump_enabled_p ())
 	      dump_printf (MSG_NOTE, "proceeding using word mode.\n");
@@ -8323,35 +8473,12 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	    dump_printf (MSG_NOTE, "using word mode not possible.\n");
 	  return false;
 	}
-
-      /* lane-reducing operations have to go through vect_transform_reduction.
-         For the other cases try without the single cycle optimization.  */
-      if (!ok)
-	{
-	  if (lane_reducing)
-	    return false;
-	  else
-	    single_defuse_cycle = false;
-	}
     }
   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
 
-  /* If the reduction stmt is one of the patterns that have lane
-     reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
-  if ((ncopies > 1 && ! single_defuse_cycle)
-      && lane_reducing)
-    {
-      if (dump_enabled_p ())
-	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			 "multi def-use cycle not possible for lane-reducing "
-			 "reduction operation\n");
-      return false;
-    }
-
-  if (slp_node
-      && !(!single_defuse_cycle
-	   && !lane_reducing
-	   && reduction_type != FOLD_LEFT_REDUCTION))
+  /* Reduction type of lane-reducing operation is TREE_CODE_REDUCTION, the
+     below processing will be done in its own vectorizable function.  */
+  if (slp_node && reduction_type == FOLD_LEFT_REDUCTION)
     for (i = 0; i < (int) op.num_ops; i++)
       if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
 	{
@@ -8364,28 +8491,21 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
 			     reduction_type, ncopies, cost_vec);
   /* Cost the reduction op inside the loop if transformed via
-     vect_transform_reduction.  Otherwise this is costed by the
-     separate vectorizable_* routines.  */
-  if (single_defuse_cycle || lane_reducing)
-    {
-      int factor = 1;
-      if (vect_is_emulated_mixed_dot_prod (stmt_info))
-	/* Three dot-products and a subtraction.  */
-	factor = 4;
-      record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
-			stmt_info, 0, vect_body);
-    }
+     vect_transform_reduction for non-lane-reducing operation.  Otherwise
+     this is costed by the separate vectorizable_* routines.  */
+  if (single_defuse_cycle && !lane_reducing)
+    record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
 
   if (dump_enabled_p ()
       && reduction_type == FOLD_LEFT_REDUCTION)
     dump_printf_loc (MSG_NOTE, vect_location,
 		     "using an in-order (fold-left) reduction.\n");
   STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
-  /* All but single defuse-cycle optimized, lane-reducing and fold-left
-     reductions go through their own vectorizable_* routines.  */
-  if (!single_defuse_cycle
-      && !lane_reducing
-      && reduction_type != FOLD_LEFT_REDUCTION)
+
+  /* All but single defuse-cycle optimized and fold-left reductions go
+     through their own vectorizable_* routines.  */
+  if ((!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
+      || lane_reducing)
     {
       stmt_vec_info tem
 	= vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
@@ -8490,6 +8610,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   int i;
   int ncopies;
+  int stmt_ncopies;
   int vec_num;
 
   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
@@ -8513,15 +8634,28 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
   int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
+  tree stmt_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info);
+
+  /* Get input vectypes from the reduction PHI and the statement to be
+     transformed, these two vectypes may have different lanes when
+     lane-reducing operation is present.  */
+  if (!vectype_in)
+    vectype_in = STMT_VINFO_REDUC_VECTYPE (reduc_info);
+
+  if (!stmt_vectype_in)
+    stmt_vectype_in = STMT_VINFO_VECTYPE (stmt_info);
 
   if (slp_node)
     {
       ncopies = 1;
+      stmt_ncopies = 1;
       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
     }
   else
     {
       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
+      stmt_ncopies = vect_get_num_copies (loop_vinfo, stmt_vectype_in);
+      gcc_assert (stmt_ncopies >= 1 && stmt_ncopies <= ncopies);
       vec_num = 1;
     }
 
@@ -8530,14 +8664,10 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 
   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
   vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
-  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
-
+  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn,
+						    stmt_vectype_in);
   /* Transform.  */
-  tree new_temp = NULL_TREE;
-  auto_vec<tree> vec_oprnds0;
-  auto_vec<tree> vec_oprnds1;
-  auto_vec<tree> vec_oprnds2;
-  tree def0;
+  auto_vec<tree> vec_oprnds[3];
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
@@ -8561,8 +8691,6 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 		      == op.ops[internal_fn_else_index ((internal_fn) code)]));
     }
 
-  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
-
   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
   if (reduction_type == FOLD_LEFT_REDUCTION)
     {
@@ -8570,7 +8698,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
       gcc_assert (code.is_tree_code () || cond_fn_p);
       return vectorize_fold_left_reduction
 	  (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
-	   code, reduc_fn, op.ops, op.num_ops, vectype_in,
+	   code, reduc_fn, op.ops, op.num_ops, stmt_vectype_in,
 	   reduc_index, masks, lens);
     }
 
@@ -8581,55 +8709,121 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
   tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
 
-  /* Get NCOPIES vector definitions for all operands except the reduction
-     definition.  */
-  if (!cond_fn_p)
+  gcc_assert (reduc_index < 3);
+
+  if (slp_node)
     {
-      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
-			 single_defuse_cycle && reduc_index == 0
-			 ? NULL_TREE : op.ops[0], &vec_oprnds0,
-			 single_defuse_cycle && reduc_index == 1
-			 ? NULL_TREE : op.ops[1], &vec_oprnds1,
-			 op.num_ops == 3
-			 && !(single_defuse_cycle && reduc_index == 2)
-			 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
+      gcc_assert (!single_defuse_cycle && op.num_ops <= 3);
+
+      for (i = 0; i < (int) op.num_ops; i++)
+	vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds[i]);
     }
   else
     {
-      /* For a conditional operation pass the truth type as mask
-	 vectype.  */
-      gcc_assert (single_defuse_cycle
-		  && (reduc_index == 1 || reduc_index == 2));
-      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
-			 op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
-			 reduc_index == 1 ? NULL_TREE : op.ops[1],
-			 NULL_TREE, &vec_oprnds1,
-			 reduc_index == 2 ? NULL_TREE : op.ops[2],
-			 NULL_TREE, &vec_oprnds2);
-    }
+      /* The input vectype of the reduction PHI determines copies of
+	 vectorized def-use cycles, which might be more than effective copies
+	 of vectorized lane-reducing reduction statements.  This could be
+	 complemented by generating extra trivial pass-through copies.  For
+	 example:
+
+	   int sum = 0;
+	   for (i)
+	     {
+	       sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
+	       sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
+	       sum += n[i];               // normal <vector(4) int>
+	     }
+
+	 The vector size is 128-bit, vectorization factor is 16.  Reduction
+	 statements would be transformed as:
+
+	   vector<4> int sum_v0 = { 0, 0, 0, 0 };
+	   vector<4> int sum_v1 = { 0, 0, 0, 0 };
+	   vector<4> int sum_v2 = { 0, 0, 0, 0 };
+	   vector<4> int sum_v3 = { 0, 0, 0, 0 };
+
+	   for (i / 16)
+	     {
+	       sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
+	       sum_v1 = sum_v1;  // copy
+	       sum_v2 = sum_v2;  // copy
+	       sum_v3 = sum_v3;  // copy
+
+	       sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
+	       sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
+	       sum_v2 = sum_v2;  // copy
+	       sum_v3 = sum_v3;  // copy
+
+	       sum_v0 += n_v0[i: 0  ~ 3 ];
+	       sum_v1 += n_v1[i: 4  ~ 7 ];
+	       sum_v2 += n_v2[i: 8  ~ 11];
+	       sum_v3 += n_v3[i: 12 ~ 15];
+	     }
+	*/
+
+      for (i = 0; i < MIN (3, (int) op.num_ops); i++)
+	{
+	  tree vectype = NULL_TREE;
+	  int used_ncopies = ncopies;
+
+	  if (cond_fn_p && i == 0)
+	    {
+	      /* For a conditional operation pass the truth type as mask
+		 vectype.  */
+	      gcc_assert (single_defuse_cycle && reduc_index > 0);
+	      vectype = truth_type_for (vectype_in);
+	    }
 
-  /* For single def-use cycles get one copy of the vectorized reduction
-     definition.  */
-  if (single_defuse_cycle)
-    {
-      gcc_assert (!slp_node);
-      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
-				     op.ops[reduc_index],
-				     reduc_index == 0 ? &vec_oprnds0
-				     : (reduc_index == 1 ? &vec_oprnds1
-					: &vec_oprnds2));
+	  if (i != reduc_index)
+	    {
+	      /* For non-reduction operand, deduce effective copies that are
+		 involved in vectorized def-use cycles based on the input
+		 vectype of the reduction statement.  */
+	      used_ncopies = stmt_ncopies;
+	    }
+	  else if (single_defuse_cycle)
+	    {
+	      /* For single def-use cycles get one copy of the vectorized
+		 reduction definition.  */
+	      used_ncopies = 1;
+	    }
+
+	  vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, used_ncopies,
+					 op.ops[i], &vec_oprnds[i], vectype);
+
+	  if (used_ncopies < ncopies)
+	    vec_oprnds[i].safe_grow_cleared (ncopies);
+	}
     }
 
+  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
   bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
+  tree def0;
 
-  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
+  FOR_EACH_VEC_ELT (vec_oprnds[0], i, def0)
     {
       gimple *new_stmt;
-      tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
-      if (masked_loop_p && !mask_by_cond_expr)
+      tree new_temp = NULL_TREE;
+      tree vop[3] = { def0, vec_oprnds[1][i], NULL_TREE };
+
+      if (!vop[0] || !vop[1])
+	{
+	  tree reduc_vop = vec_oprnds[reduc_index][i];
+
+	  /* Insert trivial copy if no need to generate vectorized
+	     statement.  */
+	  gcc_assert (reduc_vop && stmt_ncopies < ncopies);
+
+	  new_stmt = gimple_build_assign (vec_dest, reduc_vop);
+	  new_temp = make_ssa_name (vec_dest, new_stmt);
+	  gimple_set_lhs (new_stmt, new_temp);
+	  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
+	}
+      else if (masked_loop_p && !mask_by_cond_expr)
 	{
-	  /* No conditional ifns have been defined for dot-product yet.  */
-	  gcc_assert (code != DOT_PROD_EXPR);
+	  /* No conditional ifns have been defined for dot-product and sad
+	     yet.  */
+	  gcc_assert (code != DOT_PROD_EXPR && code != SAD_EXPR);
 
 	  /* Make sure that the reduction accumulator is vop[0].  */
 	  if (reduc_index == 1)
@@ -8638,7 +8832,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 	      std::swap (vop[0], vop[1]);
 	    }
 	  tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
-					  vec_num * ncopies, vectype_in, i);
+					  vec_num * stmt_ncopies,
+					  stmt_vectype_in, i);
 	  gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
 						    vop[0], vop[1], vop[0]);
 	  new_temp = make_ssa_name (vec_dest, call);
@@ -8650,12 +8845,13 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
       else
 	{
 	  if (op.num_ops >= 3)
-	    vop[2] = vec_oprnds2[i];
+	    vop[2] = vec_oprnds[2][i];
 
 	  if (masked_loop_p && mask_by_cond_expr)
 	    {
 	      tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
-					      vec_num * ncopies, vectype_in, i);
+					      vec_num * stmt_ncopies,
+					      stmt_vectype_in, i);
 	      build_vect_cond_expr (code, vop, mask, gsi);
 	    }
 
@@ -8682,16 +8878,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 
       if (slp_node)
 	slp_node->push_vec_def (new_stmt);
-      else if (single_defuse_cycle
-	       && i < ncopies - 1)
-	{
-	  if (reduc_index == 0)
-	    vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
-	  else if (reduc_index == 1)
-	    vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
-	  else if (reduc_index == 2)
-	    vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
-	}
+      else if (single_defuse_cycle && i < ncopies - 1)
+	vec_oprnds[reduc_index][i + 1] = gimple_get_lhs (new_stmt);
       else
 	STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
     }
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 2e0be763abb..cc0a832f71b 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -13296,6 +13296,8 @@ vect_analyze_stmt (vec_info *vinfo,
 				      NULL, NULL, node, cost_vec)
 	  || vectorizable_load (vinfo, stmt_info, NULL, NULL, node, cost_vec)
 	  || vectorizable_store (vinfo, stmt_info, NULL, NULL, node, cost_vec)
+	  || vectorizable_lane_reducing (as_a <loop_vec_info> (vinfo),
+					 stmt_info, node, cost_vec)
 	  || vectorizable_reduction (as_a <loop_vec_info> (vinfo), stmt_info,
 				     node, node_instance, cost_vec)
 	  || vectorizable_induction (as_a <loop_vec_info> (vinfo), stmt_info,
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 97ec9c341e7..ca810869592 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2443,6 +2443,8 @@ extern loop_vec_info vect_create_loop_vinfo (class loop *, vec_info_shared *,
 extern bool vectorizable_live_operation (vec_info *, stmt_vec_info,
 					 slp_tree, slp_instance, int,
 					 bool, stmt_vector_for_cost *);
+extern bool vectorizable_lane_reducing (loop_vec_info, stmt_vec_info,
+					slp_tree, stmt_vector_for_cost *);
 extern bool vectorizable_reduction (loop_vec_info, stmt_vec_info,
 				    slp_tree, slp_instance,
 				    stmt_vector_for_cost *);
-- 
2.17.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 5/6] vect: Support multiple lane-reducing operations for loop reduction [PR114440]
  2024-05-30 14:54 [PATCH 5/6] vect: Support multiple lane-reducing operations for loop reduction [PR114440] Feng Xue OS
@ 2024-05-31 14:57 ` Richard Biener
  2024-06-02 14:13   ` Feng Xue OS
  0 siblings, 1 reply; 5+ messages in thread
From: Richard Biener @ 2024-05-31 14:57 UTC (permalink / raw)
  To: Feng Xue OS; +Cc: Tamar Christina, gcc-patches

On Thu, May 30, 2024 at 4:55 PM Feng Xue OS <fxue@os.amperecomputing.com> wrote:
>
> For a lane-reducing operation (dot-prod/widen-sum/sad) in loop reduction, the current
> vectorizer could only handle the pattern if the reduction chain does not
> contain any other operation, no matter whether the other one is normal or lane-reducing.
>
> Actually, to allow multiple arbitrary lane-reducing operations, we need to
> support vectorization of a loop reduction chain with mixed input vectypes. Since
> the lanes of a vectype may vary with the operation, the effective ncopies of
> vectorized statements for each operation may also differ, which causes a
> mismatch on vectorized def-use cycles. A simple way is to align all operations
> with the one that has the most ncopies; the gap could be filled in by
> generating extra trivial pass-through copies. For example:
>
>    int sum = 0;
>    for (i)
>      {
>        sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
>        sum += w[i];               // widen-sum <vector(16) char>
>        sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
>        sum += n[i];               // normal <vector(4) int>
>      }
>
> The vector size is 128-bit, and the vectorization factor is 16. Reduction statements
> would be transformed as:
>
>    vector<4> int sum_v0 = { 0, 0, 0, 0 };
>    vector<4> int sum_v1 = { 0, 0, 0, 0 };
>    vector<4> int sum_v2 = { 0, 0, 0, 0 };
>    vector<4> int sum_v3 = { 0, 0, 0, 0 };
>
>    for (i / 16)
>      {
>        sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
>        sum_v1 = sum_v1;  // copy
>        sum_v2 = sum_v2;  // copy
>        sum_v3 = sum_v3;  // copy
>
>        sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0);
>        sum_v1 = sum_v1;  // copy
>        sum_v2 = sum_v2;  // copy
>        sum_v3 = sum_v3;  // copy
>
>        sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
>        sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
>        sum_v2 = sum_v2;  // copy
>        sum_v3 = sum_v3;  // copy
>
>        sum_v0 += n_v0[i: 0  ~ 3 ];
>        sum_v1 += n_v1[i: 4  ~ 7 ];
>        sum_v2 += n_v2[i: 8  ~ 11];
>        sum_v3 += n_v3[i: 12 ~ 15];
>      }
>
> Thanks,
> Feng
> ---
> gcc/
>         PR tree-optimization/114440
>         * tree-vectorizer.h (vectorizable_lane_reducing): New function
>         declaration.
>         * tree-vect-stmts.cc (vect_analyze_stmt): Call new function
>         vectorizable_lane_reducing to analyze lane-reducing operation.
>         * tree-vect-loop.cc (vect_model_reduction_cost): Remove cost computation
>         code related to emulated_mixed_dot_prod.
>         (vectorizable_lane_reducing): New function.
>         (vectorizable_reduction): Allow multiple lane-reducing operations in
>         loop reduction. Move some original lane-reducing related code to
>         vectorizable_lane_reducing.
>         (vect_transform_reduction): Extend transformation to support reduction
>         statements with mixed input vectypes.
>
> gcc/testsuite/
>         PR tree-optimization/114440
>         * gcc.dg/vect/vect-reduc-chain-1.c
>         * gcc.dg/vect/vect-reduc-chain-2.c
>         * gcc.dg/vect/vect-reduc-chain-3.c
>         * gcc.dg/vect/vect-reduc-chain-dot-slp-1.c
>         * gcc.dg/vect/vect-reduc-chain-dot-slp-2.c
>         * gcc.dg/vect/vect-reduc-dot-slp-1.c
> ---
>  .../gcc.dg/vect/vect-reduc-chain-1.c          |  62 +++
>  .../gcc.dg/vect/vect-reduc-chain-2.c          |  77 +++
>  .../gcc.dg/vect/vect-reduc-chain-3.c          |  66 +++
>  .../gcc.dg/vect/vect-reduc-chain-dot-slp-1.c  |  97 ++++
>  .../gcc.dg/vect/vect-reduc-chain-dot-slp-2.c  |  81 +++
>  .../gcc.dg/vect/vect-reduc-dot-slp-1.c        |  35 ++
>  gcc/tree-vect-loop.cc                         | 478 ++++++++++++------
>  gcc/tree-vect-stmts.cc                        |   2 +
>  gcc/tree-vectorizer.h                         |   2 +
>  9 files changed, 755 insertions(+), 145 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
> new file mode 100644
> index 00000000000..04bfc419dbd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
> @@ -0,0 +1,62 @@
> +/* Disabling epilogues until we find a better way to deal with scans.  */
> +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
> +
> +#include "tree-vect.h"
> +
> +#define N 50
> +
> +#ifndef SIGNEDNESS_1
> +#define SIGNEDNESS_1 signed
> +#define SIGNEDNESS_2 signed
> +#endif
> +
> +SIGNEDNESS_1 int __attribute__ ((noipa))
> +f (SIGNEDNESS_1 int res,
> +   SIGNEDNESS_2 char *restrict a,
> +   SIGNEDNESS_2 char *restrict b,
> +   SIGNEDNESS_2 char *restrict c,
> +   SIGNEDNESS_2 char *restrict d,
> +   SIGNEDNESS_1 int *restrict e)
> +{
> +  for (int i = 0; i < N; ++i)
> +    {
> +      res += a[i] * b[i];
> +      res += c[i] * d[i];
> +      res += e[i];
> +    }
> +  return res;
> +}
> +
> +#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
> +#define OFFSET 20
> +
> +int
> +main (void)
> +{
> +  check_vect ();
> +
> +  SIGNEDNESS_2 char a[N], b[N];
> +  SIGNEDNESS_2 char c[N], d[N];
> +  SIGNEDNESS_1 int e[N];
> +  int expected = 0x12345;
> +  for (int i = 0; i < N; ++i)
> +    {
> +      a[i] = BASE + i * 5;
> +      b[i] = BASE + OFFSET + i * 4;
> +      c[i] = BASE + i * 2;
> +      d[i] = BASE + OFFSET + i * 3;
> +      e[i] = i;
> +      asm volatile ("" ::: "memory");
> +      expected += a[i] * b[i];
> +      expected += c[i] * d[i];
> +      expected += e[i];
> +    }
> +  if (f (0x12345, a, b, c, d, e) != expected)
> +    __builtin_abort ();
> +}
> +
> +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
> +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 2 "vect" { target vect_sdot_qi } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c
> new file mode 100644
> index 00000000000..6c803b80120
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c
> @@ -0,0 +1,77 @@
> +/* Disabling epilogues until we find a better way to deal with scans.  */
> +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
> +
> +#include "tree-vect.h"
> +
> +#define N 50
> +
> +#ifndef SIGNEDNESS_1
> +#define SIGNEDNESS_1 signed
> +#define SIGNEDNESS_2 unsigned
> +#define SIGNEDNESS_3 signed
> +#define SIGNEDNESS_4 signed
> +#endif
> +
> +SIGNEDNESS_1 int __attribute__ ((noipa))
> +fn (SIGNEDNESS_1 int res,
> +   SIGNEDNESS_2 char *restrict a,
> +   SIGNEDNESS_2 char *restrict b,
> +   SIGNEDNESS_3 char *restrict c,
> +   SIGNEDNESS_3 char *restrict d,
> +   SIGNEDNESS_4 short *restrict e,
> +   SIGNEDNESS_4 short *restrict f,
> +   SIGNEDNESS_1 int *restrict g)
> +{
> +  for (int i = 0; i < N; ++i)
> +    {
> +      res += a[i] * b[i];
> +      res += i + 1;
> +      res += c[i] * d[i];
> +      res += e[i] * f[i];
> +      res += g[i];
> +    }
> +  return res;
> +}
> +
> +#define BASE2 ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
> +#define BASE3 ((SIGNEDNESS_3 int) -1 < 0 ? -126 : 4)
> +#define BASE4 ((SIGNEDNESS_4 int) -1 < 0 ? -1026 : 373)
> +#define OFFSET 20
> +
> +int
> +main (void)
> +{
> +  check_vect ();
> +
> +  SIGNEDNESS_2 char a[N], b[N];
> +  SIGNEDNESS_3 char c[N], d[N];
> +  SIGNEDNESS_4 short e[N], f[N];
> +  SIGNEDNESS_1 int g[N];
> +  int expected = 0x12345;
> +  for (int i = 0; i < N; ++i)
> +    {
> +      a[i] = BASE2 + i * 5;
> +      b[i] = BASE2 + OFFSET + i * 4;
> +      c[i] = BASE3 + i * 2;
> +      d[i] = BASE3 + OFFSET + i * 3;
> +      e[i] = BASE4 + i * 6;
> +      f[i] = BASE4 + OFFSET + i * 5;
> +      g[i] = i;
> +      asm volatile ("" ::: "memory");
> +      expected += a[i] * b[i];
> +      expected += i + 1;
> +      expected += c[i] * d[i];
> +      expected += e[i] * f[i];
> +      expected += g[i];
> +    }
> +  if (fn (0x12345, a, b, c, d, e, f, g) != expected)
> +    __builtin_abort ();
> +}
> +
> +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
> +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_sdot_qi } } } } */
> +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_udot_qi } } } } */
> +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_sdot_hi } } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c
> new file mode 100644
> index 00000000000..a41e4b176c4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c
> @@ -0,0 +1,66 @@
> +/* Disabling epilogues until we find a better way to deal with scans.  */
> +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
> +/* { dg-require-effective-target vect_int } */
> +
> +#include "tree-vect.h"
> +
> +#define N 50
> +
> +#ifndef SIGNEDNESS_1
> +#define SIGNEDNESS_1 signed
> +#define SIGNEDNESS_2 unsigned
> +#define SIGNEDNESS_3 signed
> +#endif
> +
> +SIGNEDNESS_1 int __attribute__ ((noipa))
> +f (SIGNEDNESS_1 int res,
> +   SIGNEDNESS_2 char *restrict a,
> +   SIGNEDNESS_2 char *restrict b,
> +   SIGNEDNESS_3 short *restrict c,
> +   SIGNEDNESS_3 short *restrict d,
> +   SIGNEDNESS_1 int *restrict e)
> +{
> +  for (int i = 0; i < N; ++i)
> +    {
> +      short diff = a[i] - b[i];
> +      SIGNEDNESS_2 short abs = diff < 0 ? -diff : diff;
> +      res += abs;
> +      res += c[i] * d[i];
> +      res += e[i];
> +    }
> +  return res;
> +}
> +
> +#define BASE2 ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
> +#define BASE3 ((SIGNEDNESS_3 int) -1 < 0 ? -1236 : 373)
> +#define OFFSET 20
> +
> +int
> +main (void)
> +{
> +  check_vect ();
> +
> +  SIGNEDNESS_2 char a[N], b[N];
> +  SIGNEDNESS_3 short c[N], d[N];
> +  SIGNEDNESS_1 int e[N];
> +  int expected = 0x12345;
> +  for (int i = 0; i < N; ++i)
> +    {
> +      a[i] = BASE2 + i * 5;
> +      b[i] = BASE2 - i * 4;
> +      c[i] = BASE3 + i * 2;
> +      d[i] = BASE3 + OFFSET + i * 3;
> +      e[i] = i;
> +      asm volatile ("" ::: "memory");
> +      short diff = a[i] - b[i];
> +      SIGNEDNESS_2 short abs = diff < 0 ? -diff : diff;
> +      expected += abs;
> +      expected += c[i] * d[i];
> +      expected += e[i];
> +    }
> +  if (f (0x12345, a, b, c, d, e) != expected)
> +    __builtin_abort ();
> +}
> +
> +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = SAD_EXPR" "vect" { target vect_udot_qi } } } */
> +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target vect_sdot_hi } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c
> new file mode 100644
> index 00000000000..51ef4eaaed8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c
> @@ -0,0 +1,97 @@
> +/* Disabling epilogues until we find a better way to deal with scans.  */
> +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
> +
> +#include "tree-vect.h"
> +
> +#define N 50
> +
> +#ifndef SIGNEDNESS_1
> +#define SIGNEDNESS_1 signed
> +#define SIGNEDNESS_2 signed
> +#endif
> +
> +SIGNEDNESS_1 int __attribute__ ((noipa))
> +f (SIGNEDNESS_1 int res,
> +   SIGNEDNESS_2 char *a,
> +   SIGNEDNESS_2 char *b,
> +   int step, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    {
> +      res += a[0] * b[0];
> +      res += a[1] * b[1];
> +      res += a[2] * b[2];
> +      res += a[3] * b[3];
> +      res += a[4] * b[4];
> +      res += a[5] * b[5];
> +      res += a[6] * b[6];
> +      res += a[7] * b[7];
> +      res += a[8] * b[8];
> +      res += a[9] * b[9];
> +      res += a[10] * b[10];
> +      res += a[11] * b[11];
> +      res += a[12] * b[12];
> +      res += a[13] * b[13];
> +      res += a[14] * b[14];
> +      res += a[15] * b[15];
> +
> +      a += step;
> +      b += step;
> +    }
> +
> +  return res;
> +}
> +
> +#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
> +#define OFFSET 20
> +
> +int
> +main (void)
> +{
> +  check_vect ();
> +
> +  SIGNEDNESS_2 char a[100], b[100];
> +  int expected = 0x12345;
> +  int step = 16;
> +  int n = 2;
> +  int t = 0;
> +
> +  for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i)
> +    {
> +      a[i] = BASE + i * 5;
> +      b[i] = BASE + OFFSET + i * 4;
> +      asm volatile ("" ::: "memory");
> +    }
> +
> +  for (int i = 0; i < n; i++)
> +    {
> +      asm volatile ("" ::: "memory");
> +      expected += a[t + 0] * b[t + 0];
> +      expected += a[t + 1] * b[t + 1];
> +      expected += a[t + 2] * b[t + 2];
> +      expected += a[t + 3] * b[t + 3];
> +      expected += a[t + 4] * b[t + 4];
> +      expected += a[t + 5] * b[t + 5];
> +      expected += a[t + 6] * b[t + 6];
> +      expected += a[t + 7] * b[t + 7];
> +      expected += a[t + 8] * b[t + 8];
> +      expected += a[t + 9] * b[t + 9];
> +      expected += a[t + 10] * b[t + 10];
> +      expected += a[t + 11] * b[t + 11];
> +      expected += a[t + 12] * b[t + 12];
> +      expected += a[t + 13] * b[t + 13];
> +      expected += a[t + 14] * b[t + 14];
> +      expected += a[t + 15] * b[t + 15];
> +      t += step;
> +    }
> +
> +  if (f (0x12345, a, b, step, n) != expected)
> +    __builtin_abort ();
> +}
> +
> +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
> +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
> +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 16 "vect" } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c
> new file mode 100644
> index 00000000000..1532833c3ae
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c
> @@ -0,0 +1,81 @@
> +/* Disabling epilogues until we find a better way to deal with scans.  */
> +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
> +
> +#include "tree-vect.h"
> +
> +#define N 50
> +
> +#ifndef SIGNEDNESS_1
> +#define SIGNEDNESS_1 signed
> +#define SIGNEDNESS_2 signed
> +#endif
> +
> +SIGNEDNESS_1 int __attribute__ ((noipa))
> +f (SIGNEDNESS_1 int res,
> +   SIGNEDNESS_2 short *a,
> +   SIGNEDNESS_2 short *b,
> +   int step, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    {
> +      res += a[0] * b[0];
> +      res += a[1] * b[1];
> +      res += a[2] * b[2];
> +      res += a[3] * b[3];
> +      res += a[4] * b[4];
> +      res += a[5] * b[5];
> +      res += a[6] * b[6];
> +      res += a[7] * b[7];
> +
> +      a += step;
> +      b += step;
> +    }
> +
> +  return res;
> +}
> +
> +#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -1026 : 373)
> +#define OFFSET 20
> +
> +int
> +main (void)
> +{
> +  check_vect ();
> +
> +  SIGNEDNESS_2 short a[100], b[100];
> +  int expected = 0x12345;
> +  int step = 8;
> +  int n = 2;
> +  int t = 0;
> +
> +  for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i)
> +    {
> +      a[i] = BASE + i * 5;
> +      b[i] = BASE + OFFSET + i * 4;
> +      asm volatile ("" ::: "memory");
> +    }
> +
> +  for (int i = 0; i < n; i++)
> +    {
> +      asm volatile ("" ::: "memory");
> +      expected += a[t + 0] * b[t + 0];
> +      expected += a[t + 1] * b[t + 1];
> +      expected += a[t + 2] * b[t + 2];
> +      expected += a[t + 3] * b[t + 3];
> +      expected += a[t + 4] * b[t + 4];
> +      expected += a[t + 5] * b[t + 5];
> +      expected += a[t + 6] * b[t + 6];
> +      expected += a[t + 7] * b[t + 7];
> +      t += step;
> +    }
> +
> +  if (f (0x12345, a, b, step, n) != expected)
> +    __builtin_abort ();
> +}
> +
> +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
> +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
> +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 8 "vect"  { target vect_sdot_hi } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c
> new file mode 100644
> index 00000000000..e17d6291f75
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c
> @@ -0,0 +1,35 @@
> +/* Disabling epilogues until we find a better way to deal with scans.  */
> +/* { dg-do compile } */
> +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
> +
> +#include "tree-vect.h"
> +
> +#ifndef SIGNEDNESS_1
> +#define SIGNEDNESS_1 signed
> +#define SIGNEDNESS_2 signed
> +#endif
> +
> +SIGNEDNESS_1 int __attribute__ ((noipa))
> +f (SIGNEDNESS_1 int res0,
> +   SIGNEDNESS_1 int res1,
> +   SIGNEDNESS_1 int res2,
> +   SIGNEDNESS_1 int res3,
> +   SIGNEDNESS_2 short *a,
> +   SIGNEDNESS_2 short *b)
> +{
> +  for (int i = 0; i < 64; i += 4)
> +    {
> +      res0 += a[i + 0] * b[i + 0];
> +      res1 += a[i + 1] * b[i + 1];
> +      res2 += a[i + 2] * b[i + 2];
> +      res3 += a[i + 3] * b[i + 3];
> +    }
> +
> +  return res0 ^ res1 ^ res2 ^ res3;
> +}
> +
> +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
> +/* { dg-final { scan-tree-dump-not "vectorizing stmts using SLP" "vect" } } */
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 20c99f11e9a..b5849dbb08a 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -5322,8 +5322,6 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
>    if (!gimple_extract_op (orig_stmt_info->stmt, &op))
>      gcc_unreachable ();
>
> -  bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
> -
>    if (reduction_type == EXTRACT_LAST_REDUCTION)
>      /* No extra instructions are needed in the prologue.  The loop body
>         operations are costed in vectorizable_condition.  */
> @@ -5358,12 +5356,8 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
>            initial result of the data reduction, initial value of the index
>            reduction.  */
>         prologue_stmts = 4;
> -      else if (emulated_mixed_dot_prod)
> -       /* We need the initial reduction value and two invariants:
> -          one that contains the minimum signed value and one that
> -          contains half of its negative.  */
> -       prologue_stmts = 3;
>        else
> +       /* We need the initial reduction value.  */
>         prologue_stmts = 1;
>        prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
>                                          scalar_to_vec, stmt_info, 0,
> @@ -7464,6 +7458,169 @@ vect_reduction_use_partial_vector (loop_vec_info loop_vinfo,
>      }
>  }
>
> +/* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
> +   the context of LOOP_VINFO, and vector cost will be recorded in COST_VEC.
> +   Now there are three such kinds of operations: dot-prod/widen-sum/sad
> +   (sum-of-absolute-differences).
> +
> +   For a lane-reducing operation, the loop reduction path that it lies in,
> +   may contain normal operation, or other lane-reducing operation of different
> +   input type size, an example as:
> +
> +     int sum = 0;
> +     for (i)
> +       {
> +         ...
> +         sum += d0[i] * d1[i];       // dot-prod <vector(16) char>
> +         sum += w[i];                // widen-sum <vector(16) char>
> +         sum += abs(s0[i] - s1[i]);  // sad <vector(8) short>
> +         sum += n[i];                // normal <vector(4) int>
> +         ...
> +       }
> +
> +   Vectorization factor is essentially determined by operation whose input
> +   vectype has the most lanes ("vector(16) char" in the example), while we
> +   need to choose input vectype with the least lanes ("vector(4) int" in the
> +   example) for the reduction PHI statement.  */
> +
> +bool
> +vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
> +                           slp_tree slp_node, stmt_vector_for_cost *cost_vec)
> +{
> +  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
> +  if (!stmt)
> +    return false;
> +
> +  enum tree_code code = gimple_assign_rhs_code (stmt);
> +
> +  if (!lane_reducing_op_p (code))
> +    return false;

Can you make sure to return false if STMT_VINFO_REDUC_IDX == -1
thus the op is not part of a reduction chain/path?

> +  tree type = TREE_TYPE (gimple_assign_lhs (stmt));
> +
> +  if (!INTEGRAL_TYPE_P (type) && !SCALAR_FLOAT_TYPE_P (type))
> +    return false;
> +
> +  /* Do not try to vectorize bit-precision reductions.  */
> +  if (!type_has_mode_precision_p (type))
> +    return false;
> +
> +  tree vectype_in = NULL_TREE;
> +
> +  for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
> +    {
> +      stmt_vec_info def_stmt_info;
> +      slp_tree slp_op;
> +      tree op;
> +      tree vectype;
> +      enum vect_def_type dt;
> +
> +      if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_node, i, &op,
> +                              &slp_op, &dt, &vectype, &def_stmt_info))
> +       {
> +         if (dump_enabled_p ())
> +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                            "use not simple.\n");
> +         return false;
> +       }
> +
> +      if (!vectype)
> +       {
> +         vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
> +                                                slp_op);
> +         if (!vectype)
> +           return false;
> +       }
> +
> +      if (slp_node && !vect_maybe_update_slp_op_vectype (slp_op, vectype))

Please avoid this during transform.

> +       {
> +         if (dump_enabled_p ())
> +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                            "incompatible vector types for invariants\n");
> +         return false;
> +       }
> +
> +      if (i == STMT_VINFO_REDUC_IDX (stmt_info))
> +       continue;
> +
> +      /* There should be at most one cycle def in the stmt.  */
> +      if (VECTORIZABLE_CYCLE_DEF (dt))
> +       return false;
> +
> +      /* To properly compute ncopies we are interested in the widest
> +        non-reduction input type in case we're looking at a widening
> +        accumulation that we later handle in vect transformation.  */
> +      if (!vectype_in
> +         || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
> +             < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype)))))
> +       vectype_in = vectype;
> +    }
> +
> +  STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in;

As said below I wonder where we would need STMT_VINFO_REDUC_VECTYPE_IN.
At least you should avoid re-setting this when !cost_vec aka during transform,
possibly instead asserting you re-compute the same type (or simply
skip the above
loop and set vectype_in from STMT_VINFO_REDUC_VECTYPE_IN which then
gets a good use).

> +  stmt_vec_info reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
> +
> +  /* TODO: Support lane-reducing operation that does not directly participate
> +     in loop reduction. */
> +  if (!reduc_info || STMT_VINFO_REDUC_IDX (stmt_info) < 0)
> +    return false;
> +
> +  /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not
> +     recoginized.  */
> +  gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_reduction_def);
> +  gcc_assert (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION);
> +
> +  tree vphi_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
> +
> +  /* To accommodate lane-reducing operations of mixed input vectypes, choose
> +     input vectype with the least lanes for the reduction PHI statement, which
> +     would result in the most ncopies for vectorized reduction results.  */
> +  if (!vphi_vectype_in
> +      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
> +         > GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vphi_vectype_in)))))
> +    STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;

Likewise.

> +  int ncopies_for_cost;
> +
> +  if (slp_node)
> +    {
> +      /* Now lane-reducing operations in a slp node should only come from
> +        the same loop reduction path.  */
> +      gcc_assert (REDUC_GROUP_FIRST_ELEMENT (stmt_info));
> +      ncopies_for_cost = 1;
> +    }
> +  else
> +    {
> +      ncopies_for_cost = vect_get_num_copies (loop_vinfo, vectype_in);
> +      gcc_assert (ncopies_for_cost >= 1);
> +    }
> +
> +  if (vect_is_emulated_mixed_dot_prod (stmt_info))
> +    {
> +      /* We need extra two invariants: one that contains the minimum signed
> +        value and one that contains half of its negative.  */
> +      int prologue_stmts = 2;
> +      unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
> +                                       scalar_to_vec, stmt_info, 0,
> +                                       vect_prologue);
> +      if (dump_enabled_p ())
> +       dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
> +                    "extra prologue_cost = %d .\n", cost);
> +
> +      /* Three dot-products and a subtraction.  */
> +      ncopies_for_cost *= 4;
> +    }
> +
> +  record_stmt_cost (cost_vec, ncopies_for_cost, vector_stmt, stmt_info, 0,
> +                   vect_body);
> +
> +  vect_reduction_use_partial_vector (loop_vinfo, reduc_info, slp_node, code,
> +                                    type, vectype_in);
> +
> +  STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;

Uh, so those all go through vect_transform_reduction.  I see.

I fail to see a check for whether the target supports the lane-reducing op.
vectorizable_reduction only checks the last one.  Currently the check
might be redundant with what pattern recognition checks but it's still
incomplete compared to the check in vectorizable_reduction.

> +  return true;
> +}
> +
>  /* Function vectorizable_reduction.
>
>     Check if STMT_INFO performs a reduction operation that can be vectorized.
> @@ -7609,6 +7766,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>                                (gimple_bb (reduc_def_phi)->loop_father));
>    unsigned reduc_chain_length = 0;
>    bool only_slp_reduc_chain = true;
> +  bool only_lane_reducing = true;
>    stmt_info = NULL;
>    slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
>    while (reduc_def != PHI_RESULT (reduc_def_phi))
> @@ -7659,9 +7817,16 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>               return false;
>             }
>         }
> -      else if (!stmt_info)
> -       /* First non-conversion stmt.  */
> -       stmt_info = vdef;
> +      else
> +       {
> +         /* First non-conversion stmt.  */
> +         if (!stmt_info)
> +           stmt_info = vdef;
> +
> +         if (!lane_reducing_op_p (op.code))
> +           only_lane_reducing = false;
> +       }
> +
>        reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
>        reduc_chain_length++;
>        if (!stmt_info && slp_node)
> @@ -7733,18 +7898,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>    if (!type_has_mode_precision_p (op.type))
>      return false;
>
> -  /* For lane-reducing ops we're reducing the number of reduction PHIs
> -     which means the only use of that may be in the lane-reducing operation.  */
> -  if (lane_reducing
> -      && reduc_chain_length != 1
> -      && !only_slp_reduc_chain)
> -    {
> -      if (dump_enabled_p ())
> -       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                        "lane-reducing reduction with extra stmts.\n");
> -      return false;
> -    }
> -
>    /* Lane-reducing ops also never can be used in a SLP reduction group
>       since we'll mix lanes belonging to different reductions.  But it's
>       OK to use them in a reduction chain or when the reduction group
> @@ -7788,9 +7941,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>                              "use not simple.\n");
>           return false;
>         }
> -      if (i == STMT_VINFO_REDUC_IDX (stmt_info))
> -       continue;
> -

So within this loop we analyze the "main" operation.  While I do not exactly
remember why we skip the op leading to the PHI, I don't understand why you
want to look at it for the multi lane-reducing case (the accumulator
always has the same type, no?).

In any case this just looks at a single (the last) lane-reducing or even
not lane-reducing op.

>        /* For an IFN_COND_OP we might hit the reduction definition operand
>          twice (once as definition, once as else).  */
>        if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
> @@ -7836,17 +7986,21 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>      }
>    if (!vectype_in)
>      vectype_in = STMT_VINFO_VECTYPE (phi_info);
> -  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
>
> -  /* Each lane-reducing operation has its own input vectype, while reduction
> -     PHI records the input vectype with least lanes.  */
> -  if (lane_reducing)
> -    STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in;
> -
> -  enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
> -  STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
> +  /* If there is a normal (non-lane-reducing) operation in the loop reduction
> +     path, to ensure there will be enough copies to hold vectorized results of
> +     the operation, we need set the input vectype of the reduction PHI to be
> +     same as the reduction output vectype somewhere, here is a suitable place.
> +     Otherwise the input vectype is set to the one with the least lanes, which
> +     can only be determined in vectorizable analysis routine of lane-reducing
> +     operation.  */

But we are using vectype_in to compute ncopies which is used in cost analysis.
You say this might not be the final ncopies?  Note the vectorization factor is
already fixed as well as (output) vector types of the lane-reducing ops.  So
shouldn't we simply pick that up in the loop walking the use-def chain via
REDUC_IDX at the start of this function?  I'm unsure as to why we need
STMT_VINFO_REDUC_VECTYPE_IN at all (I don't remember adding that),
it should be readily available from operand analysis.  The docs for that
isn't very enlightening either (there's also REDUC_VECTYPE, in addition
to VECTYPE - huh).

> +  if (!only_lane_reducing)
> +    STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = STMT_VINFO_VECTYPE (phi_info);
> +
> +  enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info);
> +  STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type;
>    /* If we have a condition reduction, see if we can simplify it further.  */
> -  if (v_reduc_type == COND_REDUCTION)
> +  if (reduction_type == COND_REDUCTION)
>      {
>        if (slp_node)
>         return false;
> @@ -8012,8 +8166,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>      }
>
>    STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
> +  reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
>
> -  vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
>    if (reduction_type == TREE_CODE_REDUCTION)
>      {
>        /* Check whether it's ok to change the order of the computation.
> @@ -8287,14 +8441,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>        && loop_vinfo->suggested_unroll_factor == 1)
>      single_defuse_cycle = true;
>
> -  if (single_defuse_cycle || lane_reducing)
> +  if (single_defuse_cycle && !lane_reducing)
>      {
>        gcc_assert (op.code != COND_EXPR);
>
> -      /* 4. Supportable by target?  */
> -      bool ok = true;
> -
> -      /* 4.1. check support for the operation in the loop
> +      /* 4. check support for the operation in the loop
>
>          This isn't necessary for the lane reduction codes, since they
>          can only be produced by pattern matching, and it's up to the
> @@ -8303,14 +8454,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>          mixed-sign dot-products can be implemented using signed
>          dot-products.  */
>        machine_mode vec_mode = TYPE_MODE (vectype_in);
> -      if (!lane_reducing
> -         && !directly_supported_p (op.code, vectype_in, optab_vector))
> +      if (!directly_supported_p (op.code, vectype_in, optab_vector))
>          {
>            if (dump_enabled_p ())
>              dump_printf (MSG_NOTE, "op not supported by target.\n");
>           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
>               || !vect_can_vectorize_without_simd_p (op.code))
> -           ok = false;
> +           single_defuse_cycle = false;
>           else
>             if (dump_enabled_p ())
>               dump_printf (MSG_NOTE, "proceeding using word mode.\n");
> @@ -8323,35 +8473,12 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>             dump_printf (MSG_NOTE, "using word mode not possible.\n");
>           return false;
>         }
> -
> -      /* lane-reducing operations have to go through vect_transform_reduction.
> -         For the other cases try without the single cycle optimization.  */
> -      if (!ok)
> -       {
> -         if (lane_reducing)
> -           return false;
> -         else
> -           single_defuse_cycle = false;
> -       }
>      }
>    STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
>
> -  /* If the reduction stmt is one of the patterns that have lane
> -     reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
> -  if ((ncopies > 1 && ! single_defuse_cycle)
> -      && lane_reducing)
> -    {
> -      if (dump_enabled_p ())
> -       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                        "multi def-use cycle not possible for lane-reducing "
> -                        "reduction operation\n");
> -      return false;
> -    }
> -
> -  if (slp_node
> -      && !(!single_defuse_cycle
> -          && !lane_reducing
> -          && reduction_type != FOLD_LEFT_REDUCTION))
> +  /* Reduction type of lane-reducing operation is TREE_CODE_REDUCTION, the
> +     below processing will be done in its own vectorizable function.  */
> +  if (slp_node && reduction_type == FOLD_LEFT_REDUCTION)
>      for (i = 0; i < (int) op.num_ops; i++)
>        if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
>         {
> @@ -8364,28 +8491,21 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>    vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
>                              reduction_type, ncopies, cost_vec);
>    /* Cost the reduction op inside the loop if transformed via
> -     vect_transform_reduction.  Otherwise this is costed by the
> -     separate vectorizable_* routines.  */
> -  if (single_defuse_cycle || lane_reducing)
> -    {
> -      int factor = 1;
> -      if (vect_is_emulated_mixed_dot_prod (stmt_info))
> -       /* Three dot-products and a subtraction.  */
> -       factor = 4;
> -      record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
> -                       stmt_info, 0, vect_body);
> -    }
> +     vect_transform_reduction for non-lane-reducing operation.  Otherwise
> +     this is costed by the separate vectorizable_* routines.  */
> +  if (single_defuse_cycle && !lane_reducing)
> +    record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
>
>    if (dump_enabled_p ()
>        && reduction_type == FOLD_LEFT_REDUCTION)
>      dump_printf_loc (MSG_NOTE, vect_location,
>                      "using an in-order (fold-left) reduction.\n");
>    STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
> -  /* All but single defuse-cycle optimized, lane-reducing and fold-left
> -     reductions go through their own vectorizable_* routines.  */
> -  if (!single_defuse_cycle
> -      && !lane_reducing
> -      && reduction_type != FOLD_LEFT_REDUCTION)
> +
> +  /* All but single defuse-cycle optimized and fold-left reductions go
> +     through their own vectorizable_* routines.  */
> +  if ((!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
> +      || lane_reducing)

So single-def-use-cycle but lane-reducing ops no longer need
to go through vect_transform_reduction?  How do you handle those
but fail to handle non-lane-reducing ops this way?

>      {
>        stmt_vec_info tem
>         = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
> @@ -8490,6 +8610,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
>    int i;
>    int ncopies;
> +  int stmt_ncopies;
>    int vec_num;
>
>    stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
> @@ -8513,15 +8634,28 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>    gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
>    int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
>    tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
> +  tree stmt_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info);
> +
> +  /* Get input vectypes from the reduction PHI and the statement to be
> +     transformed, these two vectypes may have different lanes when
> +     lane-reducing operation is present.  */
> +  if (!vectype_in)
> +    vectype_in = STMT_VINFO_REDUC_VECTYPE (reduc_info);
> +
> +  if (!stmt_vectype_in)
> +    stmt_vectype_in = STMT_VINFO_VECTYPE (stmt_info);
>
>    if (slp_node)
>      {
>        ncopies = 1;
> +      stmt_ncopies = 1;
>        vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
>      }
>    else
>      {
>        ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
> +      stmt_ncopies = vect_get_num_copies (loop_vinfo, stmt_vectype_in);
> +      gcc_assert (stmt_ncopies >= 1 && stmt_ncopies <= ncopies);
>        vec_num = 1;
>      }
>
> @@ -8530,14 +8664,10 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>
>    vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
>    vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
> -  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
> -
> +  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn,
> +                                                   stmt_vectype_in);
>    /* Transform.  */
> -  tree new_temp = NULL_TREE;
> -  auto_vec<tree> vec_oprnds0;
> -  auto_vec<tree> vec_oprnds1;
> -  auto_vec<tree> vec_oprnds2;
> -  tree def0;
> +  auto_vec<tree> vec_oprnds[3];
>
>    if (dump_enabled_p ())
>      dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
> @@ -8561,8 +8691,6 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>                       == op.ops[internal_fn_else_index ((internal_fn) code)]));
>      }
>
> -  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> -
>    vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
>    if (reduction_type == FOLD_LEFT_REDUCTION)
>      {
> @@ -8570,7 +8698,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>        gcc_assert (code.is_tree_code () || cond_fn_p);
>        return vectorize_fold_left_reduction
>           (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
> -          code, reduc_fn, op.ops, op.num_ops, vectype_in,
> +          code, reduc_fn, op.ops, op.num_ops, stmt_vectype_in,
>            reduc_index, masks, lens);
>      }
>
> @@ -8581,55 +8709,121 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>    tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
>    tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
>
> -  /* Get NCOPIES vector definitions for all operands except the reduction
> -     definition.  */
> -  if (!cond_fn_p)
> +  gcc_assert (reduc_index < 3);
> +
> +  if (slp_node)
>      {
> -      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
> -                        single_defuse_cycle && reduc_index == 0
> -                        ? NULL_TREE : op.ops[0], &vec_oprnds0,
> -                        single_defuse_cycle && reduc_index == 1
> -                        ? NULL_TREE : op.ops[1], &vec_oprnds1,
> -                        op.num_ops == 3
> -                        && !(single_defuse_cycle && reduc_index == 2)
> -                        ? op.ops[2] : NULL_TREE, &vec_oprnds2);
> +      gcc_assert (!single_defuse_cycle && op.num_ops <= 3);

I think that's going to fail.  Mind v3 of the series I posted to enable
SLP discovery for single-lane reductions.  Basically everything is
going to be SLP for GCC 15.

> +
> +      for (i = 0; i < (int) op.num_ops; i++)
> +       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds[i]);
>      }
>    else
>      {
> -      /* For a conditional operation pass the truth type as mask
> -        vectype.  */
> -      gcc_assert (single_defuse_cycle
> -                 && (reduc_index == 1 || reduc_index == 2));
> -      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
> -                        op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
> -                        reduc_index == 1 ? NULL_TREE : op.ops[1],
> -                        NULL_TREE, &vec_oprnds1,
> -                        reduc_index == 2 ? NULL_TREE : op.ops[2],
> -                        NULL_TREE, &vec_oprnds2);
> -    }
> +      /* The input vectype of the reduction PHI determines copies of
> +        vectorized def-use cycles, which might be more than effective copies
> +        of vectorized lane-reducing reduction statements.  This could be
> +        complemented by generating extra trivial pass-through copies.  For
> +        example:
> +

That also means you need to handle SLP here, but you can assert there's
only a single lane.

Btw, you can push the patches I approved if they independently test OK.

> +          int sum = 0;
> +          for (i)
> +            {
> +              sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
> +              sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
> +              sum += n[i];               // normal <vector(4) int>
> +            }
> +
> +        The vector size is 128-bit,vectorization factor is 16.  Reduction
> +        statements would be transformed as:
> +
> +          vector<4> int sum_v0 = { 0, 0, 0, 0 };
> +          vector<4> int sum_v1 = { 0, 0, 0, 0 };
> +          vector<4> int sum_v2 = { 0, 0, 0, 0 };
> +          vector<4> int sum_v3 = { 0, 0, 0, 0 };
> +
> +          for (i / 16)
> +            {
> +              sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
> +              sum_v1 = sum_v1;  // copy
> +              sum_v2 = sum_v2;  // copy
> +              sum_v3 = sum_v3;  // copy
> +
> +              sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
> +              sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
> +              sum_v2 = sum_v2;  // copy
> +              sum_v3 = sum_v3;  // copy
> +
> +              sum_v0 += n_v0[i: 0  ~ 3 ];
> +              sum_v1 += n_v1[i: 4  ~ 7 ];
> +              sum_v2 += n_v2[i: 8  ~ 11];
> +              sum_v3 += n_v3[i: 12 ~ 15];
> +            }
> +       */
> +
> +      for (i = 0; i < MIN (3, (int) op.num_ops); i++)
> +       {
> +         tree vectype = NULL_TREE;
> +         int used_ncopies = ncopies;
> +
> +         if (cond_fn_p && i == 0)
> +           {
> +             /* For a conditional operation pass the truth type as mask
> +                vectype.  */
> +             gcc_assert (single_defuse_cycle && reduc_index > 0);
> +             vectype = truth_type_for (vectype_in);
> +           }
>
> -  /* For single def-use cycles get one copy of the vectorized reduction
> -     definition.  */
> -  if (single_defuse_cycle)
> -    {
> -      gcc_assert (!slp_node);
> -      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
> -                                    op.ops[reduc_index],
> -                                    reduc_index == 0 ? &vec_oprnds0
> -                                    : (reduc_index == 1 ? &vec_oprnds1
> -                                       : &vec_oprnds2));
> +         if (i != reduc_index)
> +           {
> +             /* For non-reduction operand, deduce effictive copies that are
> +                involved in vectorized def-use cycles based on the input
> +                vectype of the reduction statement.  */
> +             used_ncopies = stmt_ncopies;
> +           }
> +         else if (single_defuse_cycle)
> +           {
> +             /* For single def-use cycles get one copy of the vectorized
> +                reduction definition.  */
> +             used_ncopies = 1;
> +           }
> +
> +         vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, used_ncopies,
> +                                        op.ops[i], &vec_oprnds[i], vectype);
> +
> +         if (used_ncopies < ncopies)
> +           vec_oprnds[i].safe_grow_cleared (ncopies);
> +       }
>      }
>
> +  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
>    bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
> +  tree def0;
>
> -  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
> +  FOR_EACH_VEC_ELT (vec_oprnds[0], i, def0)
>      {
>        gimple *new_stmt;
> -      tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
> -      if (masked_loop_p && !mask_by_cond_expr)
> +      tree new_temp = NULL_TREE;
> +      tree vop[3] = { def0, vec_oprnds[1][i], NULL_TREE };
> +
> +      if (!vop[0] || !vop[1])
> +       {
> +         tree reduc_vop = vec_oprnds[reduc_index][i];
> +
> +         /* Insert trivial copy if no need to generate vectorized
> +            statement.  */
> +         gcc_assert (reduc_vop && stmt_ncopies < ncopies);
> +
> +         new_stmt = gimple_build_assign (vec_dest, reduc_vop);
> +         new_temp = make_ssa_name (vec_dest, new_stmt);
> +         gimple_set_lhs (new_stmt, new_temp);
> +         vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
> +       }
> +      else if (masked_loop_p && !mask_by_cond_expr)
>         {
> -         /* No conditional ifns have been defined for dot-product yet.  */
> -         gcc_assert (code != DOT_PROD_EXPR);
> +         /* No conditional ifns have been defined for dot-product and sad
> +            yet.  */
> +         gcc_assert (code != DOT_PROD_EXPR && code != SAD_EXPR);
>
>           /* Make sure that the reduction accumulator is vop[0].  */
>           if (reduc_index == 1)
> @@ -8638,7 +8832,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>               std::swap (vop[0], vop[1]);
>             }
>           tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
> -                                         vec_num * ncopies, vectype_in, i);
> +                                         vec_num * stmt_ncopies,
> +                                         stmt_vectype_in, i);
>           gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
>                                                     vop[0], vop[1], vop[0]);
>           new_temp = make_ssa_name (vec_dest, call);
> @@ -8650,12 +8845,13 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>        else
>         {
>           if (op.num_ops >= 3)
> -           vop[2] = vec_oprnds2[i];
> +           vop[2] = vec_oprnds[2][i];
>
>           if (masked_loop_p && mask_by_cond_expr)
>             {
>               tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
> -                                             vec_num * ncopies, vectype_in, i);
> +                                             vec_num * stmt_ncopies,
> +                                             stmt_vectype_in, i);
>               build_vect_cond_expr (code, vop, mask, gsi);
>             }
>
> @@ -8682,16 +8878,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>
>        if (slp_node)
>         slp_node->push_vec_def (new_stmt);
> -      else if (single_defuse_cycle
> -              && i < ncopies - 1)
> -       {
> -         if (reduc_index == 0)
> -           vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
> -         else if (reduc_index == 1)
> -           vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
> -         else if (reduc_index == 2)
> -           vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
> -       }
> +      else if (single_defuse_cycle && i < ncopies - 1)
> +       vec_oprnds[reduc_index][i + 1] = gimple_get_lhs (new_stmt);
>        else
>         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
>      }
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 2e0be763abb..cc0a832f71b 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -13296,6 +13296,8 @@ vect_analyze_stmt (vec_info *vinfo,
>                                       NULL, NULL, node, cost_vec)
>           || vectorizable_load (vinfo, stmt_info, NULL, NULL, node, cost_vec)
>           || vectorizable_store (vinfo, stmt_info, NULL, NULL, node, cost_vec)
> +         || vectorizable_lane_reducing (as_a <loop_vec_info> (vinfo),
> +                                        stmt_info, node, cost_vec)
>           || vectorizable_reduction (as_a <loop_vec_info> (vinfo), stmt_info,
>                                      node, node_instance, cost_vec)
>           || vectorizable_induction (as_a <loop_vec_info> (vinfo), stmt_info,
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 97ec9c341e7..ca810869592 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2443,6 +2443,8 @@ extern loop_vec_info vect_create_loop_vinfo (class loop *, vec_info_shared *,
>  extern bool vectorizable_live_operation (vec_info *, stmt_vec_info,
>                                          slp_tree, slp_instance, int,
>                                          bool, stmt_vector_for_cost *);
> +extern bool vectorizable_lane_reducing (loop_vec_info, stmt_vec_info,
> +                                       slp_tree, stmt_vector_for_cost *);
>  extern bool vectorizable_reduction (loop_vec_info, stmt_vec_info,
>                                     slp_tree, slp_instance,
>                                     stmt_vector_for_cost *);
> --
> 2.17.1

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 5/6] vect: Support multiple lane-reducing operations for loop reduction [PR114440]
  2024-05-31 14:57 ` Richard Biener
@ 2024-06-02 14:13   ` Feng Xue OS
  2024-06-04 13:17     ` Richard Biener
  0 siblings, 1 reply; 5+ messages in thread
From: Feng Xue OS @ 2024-06-02 14:13 UTC (permalink / raw)
  To: Richard Biener; +Cc: Tamar Christina, gcc-patches

Please see my comments below.

Thanks,
Feng

> On Thu, May 30, 2024 at 4:55 PM Feng Xue OS <fxue@os.amperecomputing.com> wrote:
>>
>> For lane-reducing operation(dot-prod/widen-sum/sad) in loop reduction, current
>> vectorizer could only handle the pattern if the reduction chain does not
>> contain other operation, no matter the other is normal or lane-reducing.
>>
>> Actually, to allow multiple arbitray lane-reducing operations, we need to
>> support vectorization of loop reduction chain with mixed input vectypes. Since
>> lanes of vectype may vary with operation, the effective ncopies of vectorized
>> statements for operation also may not be same to each other, this causes
>> mismatch on vectorized def-use cycles. A simple way is to align all operations
>> with the one that has the most ncopies, the gap could be complemented by
>> generating extra trival pass-through copies. For example:
>>
>>    int sum = 0;
>>    for (i)
>>      {
>>        sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
>>        sum += w[i];               // widen-sum <vector(16) char>
>>        sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
>>        sum += n[i];               // normal <vector(4) int>
>>      }
>>
>> The vector size is 128-bit,vectorization factor is 16. Reduction statements
>> would be transformed as:
>>
>>    vector<4> int sum_v0 = { 0, 0, 0, 0 };
>>    vector<4> int sum_v1 = { 0, 0, 0, 0 };
>>    vector<4> int sum_v2 = { 0, 0, 0, 0 };
>>    vector<4> int sum_v3 = { 0, 0, 0, 0 };
>>
>>    for (i / 16)
>>      {
>>        sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
>>        sum_v1 = sum_v1;  // copy
>>        sum_v2 = sum_v2;  // copy
>>        sum_v3 = sum_v3;  // copy
>>
>>        sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0);
>>        sum_v1 = sum_v1;  // copy
>>        sum_v2 = sum_v2;  // copy
>>        sum_v3 = sum_v3;  // copy
>>
>>        sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
>>        sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
>>        sum_v2 = sum_v2;  // copy
>>        sum_v3 = sum_v3;  // copy
>>
>>        sum_v0 += n_v0[i: 0  ~ 3 ];
>>        sum_v1 += n_v1[i: 4  ~ 7 ];
>>        sum_v2 += n_v2[i: 8  ~ 11];
>>        sum_v3 += n_v3[i: 12 ~ 15];
>>      }
>>
>> Thanks,
>> Feng
>>
>> ...
>>
>> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
>> index 20c99f11e9a..b5849dbb08a 100644
>> --- a/gcc/tree-vect-loop.cc
>> +++ b/gcc/tree-vect-loop.cc
>> @@ -5322,8 +5322,6 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
>>    if (!gimple_extract_op (orig_stmt_info->stmt, &op))
>>      gcc_unreachable ();
>>
>> -  bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
>> -
>>    if (reduction_type == EXTRACT_LAST_REDUCTION)
>>      /* No extra instructions are needed in the prologue.  The loop body
>>         operations are costed in vectorizable_condition.  */
>> @@ -5358,12 +5356,8 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
>>            initial result of the data reduction, initial value of the index
>>            reduction.  */
>>         prologue_stmts = 4;
>> -      else if (emulated_mixed_dot_prod)
>> -       /* We need the initial reduction value and two invariants:
>> -          one that contains the minimum signed value and one that
>> -          contains half of its negative.  */
>> -       prologue_stmts = 3;
>>        else
>> +       /* We need the initial reduction value.  */
>>         prologue_stmts = 1;
>>        prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
>>                                          scalar_to_vec, stmt_info, 0,
>> @@ -7464,6 +7458,169 @@ vect_reduction_use_partial_vector (loop_vec_info loop_vinfo,
>>      }
>>  }
>>
>> +/* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
>> +   the context of LOOP_VINFO, and vector cost will be recorded in COST_VEC.
>> +   Now there are three such kinds of operations: dot-prod/widen-sum/sad
>> +   (sum-of-absolute-differences).
>> +
>> +   For a lane-reducing operation, the loop reduction path that it lies in,
>> +   may contain normal operation, or other lane-reducing operation of different
>> +   input type size, an example as:
>> +
>> +     int sum = 0;
>> +     for (i)
>> +       {
>> +         ...
>> +         sum += d0[i] * d1[i];       // dot-prod <vector(16) char>
>> +         sum += w[i];                // widen-sum <vector(16) char>
>> +         sum += abs(s0[i] - s1[i]);  // sad <vector(8) short>
>> +         sum += n[i];                // normal <vector(4) int>
>> +         ...
>> +       }
>> +
>> +   Vectorization factor is essentially determined by operation whose input
>> +   vectype has the most lanes ("vector(16) char" in the example), while we
>> +   need to choose input vectype with the least lanes ("vector(4) int" in the
>> +   example) for the reduction PHI statement.  */
>> +
>> +bool
>> +vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
>> +                           slp_tree slp_node, stmt_vector_for_cost *cost_vec)
>> +{
>> +  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
>> +  if (!stmt)
>> +    return false;
>> +
>> +  enum tree_code code = gimple_assign_rhs_code (stmt);
>> +
>> +  if (!lane_reducing_op_p (code))
>> +    return false;
> 
> Can you make sure to return false if STMT_VINFO_REDUC_IDX == -1
> thus the op is not part of a reduction chain/path?
>

As I planned, in the 2nd-stage patches (WIP), this function will also handle
lane-reducing operations that do not directly participate in reduction, like:

 temp = dot_prod1 + dot_prod2;
 sum += temp;

In this case, STMT_VINFO_REDUC_IDX of dot_prod1/2 == -1

For the current work, the check is needed to filter out non-reduction statements,
but since it is expected to be removed later, the check is placed at a late
point.
 
>> +  tree type = TREE_TYPE (gimple_assign_lhs (stmt));
>> +
>> +  if (!INTEGRAL_TYPE_P (type) && !SCALAR_FLOAT_TYPE_P (type))
>> +    return false;
>> +
>> +  /* Do not try to vectorize bit-precision reductions.  */
>> +  if (!type_has_mode_precision_p (type))
>> +    return false;
>> +
>> +  tree vectype_in = NULL_TREE;
>> +
>> +  for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
>> +    {
>> +      stmt_vec_info def_stmt_info;
>> +      slp_tree slp_op;
>> +      tree op;
>> +      tree vectype;
>> +      enum vect_def_type dt;
>> +
>> +      if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_node, i, &op,
>> +                              &slp_op, &dt, &vectype, &def_stmt_info))
>> +       {
>> +         if (dump_enabled_p ())
>> +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> +                            "use not simple.\n");
>> +         return false;
>> +       }
>> +
>> +      if (!vectype)
>> +       {
>> +         vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
>> +                                                slp_op);
>> +         if (!vectype)
>> +           return false;
>> +       }
>> +
>> +      if (slp_node && !vect_maybe_update_slp_op_vectype (slp_op, vectype))
> 
> Please avoid this during transform.

This function is only for analysis not transform.
 
>> +       {
>> +         if (dump_enabled_p ())
>> +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> +                            "incompatible vector types for invariants\n");
>> +         return false;
>> +       }
>> +
>> +      if (i == STMT_VINFO_REDUC_IDX (stmt_info))
>> +       continue;
>> +
>> +      /* There should be at most one cycle def in the stmt.  */
>> +      if (VECTORIZABLE_CYCLE_DEF (dt))
>> +       return false;
>> +
>> +      /* To properly compute ncopies we are interested in the widest
>> +        non-reduction input type in case we're looking at a widening
>> +        accumulation that we later handle in vect transformation.  */
>> +      if (!vectype_in
>> +         || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
>> +             < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype)))))
>> +       vectype_in = vectype;
>> +    }
>> +
>> +  STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in;
> 
> As said below I wonder where we would need STMT_VINFO_REDUC_VECTYPE_IN.
> At least you should avoid re-setting this when !cost_vec aka during transform,
> possibly instead asserting you re-compute the same type (or simply
> skip the above
> loop and set vectype_in from STMT_VINFO_REDUC_VECTYPE_IN which then
> gets a good use).

Likewise.

> 
>> +  stmt_vec_info reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
>> +
>> +  /* TODO: Support lane-reducing operation that does not directly participate
>> +     in loop reduction. */
>> +  if (!reduc_info || STMT_VINFO_REDUC_IDX (stmt_info) < 0)
>> +    return false;
>> +
>> +  /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not
>> +     recoginized.  */
>> +  gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_reduction_def);
>> +  gcc_assert (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION);
>> +
>> +  tree vphi_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
>> +
>> +  /* To accommodate lane-reducing operations of mixed input vectypes, choose
>> +     input vectype with the least lanes for the reduction PHI statement, which
>> +     would result in the most ncopies for vectorized reduction results.  */
>> +  if (!vphi_vectype_in
>> +      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
>> +         > GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vphi_vectype_in)))))
>> +    STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
> 
> Likewise.
> 
>> +  int ncopies_for_cost;
>> +
>> +  if (slp_node)
>> +    {
>> +      /* Now lane-reducing operations in a slp node should only come from
>> +        the same loop reduction path.  */
>> +      gcc_assert (REDUC_GROUP_FIRST_ELEMENT (stmt_info));
>> +      ncopies_for_cost = 1;
>> +    }
>> +  else
>> +    {
>> +      ncopies_for_cost = vect_get_num_copies (loop_vinfo, vectype_in);
>> +      gcc_assert (ncopies_for_cost >= 1);
>> +    }
>> +
>> +  if (vect_is_emulated_mixed_dot_prod (stmt_info))
>> +    {
>> +      /* We need extra two invariants: one that contains the minimum signed
>> +        value and one that contains half of its negative.  */
>> +      int prologue_stmts = 2;
>> +      unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
>> +                                       scalar_to_vec, stmt_info, 0,
>> +                                       vect_prologue);
>> +      if (dump_enabled_p ())
>> +       dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
>> +                    "extra prologue_cost = %d .\n", cost);
>> +
>> +      /* Three dot-products and a subtraction.  */
>> +      ncopies_for_cost *= 4;
>> +    }
>> +
>> +  record_stmt_cost (cost_vec, ncopies_for_cost, vector_stmt, stmt_info, 0,
>> +                   vect_body);
>> +
>> +  vect_reduction_use_partial_vector (loop_vinfo, reduc_info, slp_node, code,
>> +                                    type, vectype_in);
>> +
>> +  STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
> 
> Uh, so those all go through vect_transform_reduction.  I see.
> 
> I fail to see a check for whether the target supports the lane-reducing op.
> vectorizable_reduction only checks the last one.  Currently the check
> might be redundant with what pattern recognition checks but it's still
> incomplete compared to the check in vectorizable_reduction.

In the original vectorizable_reduction, the target support check is deliberately
skipped for lane-reducing operations. The reason is partly as you said; moreover,
the other check would never be executed for them.

  if (single_defuse_cycle || lane_reduc_code_p)
    {
      gcc_assert (op.code != COND_EXPR);

      /* 4. Supportable by target?  */
      bool ok = true;

      /* 4.1. check support for the operation in the loop

	 This isn't necessary for the lane reduction codes, since they
	 can only be produced by pattern matching, and it's up to the
	 pattern matcher to test for support.  The main reason for
	 specifically skipping this step is to avoid rechecking whether
	 mixed-sign dot-products can be implemented using signed
	 dot-products.  */
      machine_mode vec_mode = TYPE_MODE (vectype_in);
      if (!lane_reduc_code_p                              //<----------- skip
	  && !directly_supported_p (op.code, vectype_in, optab_vector))
        {
          if (dump_enabled_p ())
            dump_printf (MSG_NOTE, "op not supported by target.\n");
	  if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
	      || !vect_can_vectorize_without_simd_p (op.code))
	    ok = false;
	  else
	    if (dump_enabled_p ())
	      dump_printf (MSG_NOTE, "proceeding using word mode.\n");
        }

      // <----- always false for lane-reducing op

      if (vect_emulated_vector_p (vectype_in)
	  && !vect_can_vectorize_without_simd_p (op.code))
	{
	  if (dump_enabled_p ())
	    dump_printf (MSG_NOTE, "using word mode not possible.\n");
	  return false;
	}

> 
>> +  return true;
>> +}
>> +
>>  /* Function vectorizable_reduction.
>>
>>     Check if STMT_INFO performs a reduction operation that can be vectorized.
>> @@ -7609,6 +7766,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>>                                (gimple_bb (reduc_def_phi)->loop_father));
>>    unsigned reduc_chain_length = 0;
>>    bool only_slp_reduc_chain = true;
>> +  bool only_lane_reducing = true;
>>    stmt_info = NULL;
>>    slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
>>    while (reduc_def != PHI_RESULT (reduc_def_phi))
>> @@ -7659,9 +7817,16 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>>               return false;
>>             }
>>         }
>> -      else if (!stmt_info)
>> -       /* First non-conversion stmt.  */
>> -       stmt_info = vdef;
>> +      else
>> +       {
>> +         /* First non-conversion stmt.  */
>> +         if (!stmt_info)
>> +           stmt_info = vdef;
>> +
>> +         if (!lane_reducing_op_p (op.code))
>> +           only_lane_reducing = false;
>> +       }
>> +
>>        reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
>>        reduc_chain_length++;
>>        if (!stmt_info && slp_node)
>> @@ -7733,18 +7898,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>>    if (!type_has_mode_precision_p (op.type))
>>      return false;
>>
>> -  /* For lane-reducing ops we're reducing the number of reduction PHIs
>> -     which means the only use of that may be in the lane-reducing operation.  */
>> -  if (lane_reducing
>> -      && reduc_chain_length != 1
>> -      && !only_slp_reduc_chain)
>> -    {
>> -      if (dump_enabled_p ())
>> -       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> -                        "lane-reducing reduction with extra stmts.\n");
>> -      return false;
>> -    }
>> -
>>    /* Lane-reducing ops also never can be used in a SLP reduction group
>>       since we'll mix lanes belonging to different reductions.  But it's
>>       OK to use them in a reduction chain or when the reduction group
>> @@ -7788,9 +7941,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>>                              "use not simple.\n");
>>           return false;
>>         }
>> -      if (i == STMT_VINFO_REDUC_IDX (stmt_info))
>> -       continue;
>> -
> 
> So within this loop we analyze the "main" operation, while I do not exactly
> remember why we skip the op leading to the PHI I don't understand why you
> want to look at it for the multi lane-reducing case (the accumulator
> always has the same type, no?).
> 
> In any case this just looks at a single (the last) lane-reducing or even
> not lane-reducing op.
> 

This comparison is redundant, since it is covered by the following
comparison statement. The change should have been placed in a separate
patch, but for convenience I made it here.

      /* For an IFN_COND_OP we might hit the reduction definition operand
	 twice (once as definition, once as else).  */
      if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
	continue;

      /* There should be only one cycle def in the stmt, the one
	 leading to reduc_def.  */
      if (VECTORIZABLE_CYCLE_DEF (dt))
	return false;

>>        /* For an IFN_COND_OP we might hit the reduction definition operand
>>          twice (once as definition, once as else).  */
>>        if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
>> @@ -7836,17 +7986,21 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>>      }
>>    if (!vectype_in)
>>      vectype_in = STMT_VINFO_VECTYPE (phi_info);
>> -  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
>>
>> -  /* Each lane-reducing operation has its own input vectype, while reduction
>> -     PHI records the input vectype with least lanes.  */
>> -  if (lane_reducing)
>> -    STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in;
>> -
>> -  enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
>> -  STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
>> +  /* If there is a normal (non-lane-reducing) operation in the loop reduction
>> +     path, to ensure there will be enough copies to hold vectorized results of
>> +     the operation, we need set the input vectype of the reduction PHI to be
>> +     same as the reduction output vectype somewhere, here is a suitable place.
>> +     Otherwise the input vectype is set to the one with the least lanes, which
>> +     can only be determined in vectorizable analysis routine of lane-reducing
>> +     operation.  */
> 
> But we are using vectype_in to compute ncopies which is used in cost analysis.

The vectype_in only impacts the cost analysis for a lane-reducing op, since the
function vect_is_emulated_mixed_dot_prod needs it, and that function is used
by the cost analysis. In the previous patch, we bind the vectype_in to each
lane-reducing op and also adjust the code of the function accordingly, so this
would not be a problem.

> You say this might not be the final ncopies?  Note the vectorization factor is
> already fixed as well as (output) vector types of the lane-reducing ops.  So

The vectype_in is incrementally updated while analyzing the vectorizability of
the lane-reducing ops. So before the transform, the type should be determined.

> shouldn't we simply pick that up in the loop walking the use-def chain via
> REDUC_IDX at the start of this function? 

I thought about doing it in that way. Ok. will consider it again.

> I'm unsure as to why we need
> STMT_VINFO_REDUC_VECTYPE_IN at all (I don't remember adding that),
> it should be readily available from operand analysis.  The docs for that
> isn't very enlightening either (there's also REDUC_VECTYPE, in addition
> to VECTYPE - huh).

For old code, in which only one lane-reducing op is allowed in loop
reduction, this type might be computed on-demand.

But for multiple lane-reducing ops, we need to know the vectype_in types
of all the ops in order to determine a proper vectype_in for the PHI statement,
so traversing those ops and computing the types on-demand would not be a good
way.  Additionally, during the transform, the original cfg flow is broken and
could not be used.

>> +  if (!only_lane_reducing)
>> +    STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = STMT_VINFO_VECTYPE (phi_info);
>> +
>> +  enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info);
>> +  STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type;
>>    /* If we have a condition reduction, see if we can simplify it further.  */
>> -  if (v_reduc_type == COND_REDUCTION)
>> +  if (reduction_type == COND_REDUCTION)
>>      {
>>        if (slp_node)
>>         return false;
>> @@ -8012,8 +8166,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>>      }
>>
>>    STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
>> +  reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
>>
>> -  vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
>>    if (reduction_type == TREE_CODE_REDUCTION)
>>      {
>>        /* Check whether it's ok to change the order of the computation.
>> @@ -8287,14 +8441,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>>        && loop_vinfo->suggested_unroll_factor == 1)
>>      single_defuse_cycle = true;
>>
>> -  if (single_defuse_cycle || lane_reducing)
>> +  if (single_defuse_cycle && !lane_reducing)
>>      {
>>        gcc_assert (op.code != COND_EXPR);
>>
>> -      /* 4. Supportable by target?  */
>> -      bool ok = true;
>> -
>> -      /* 4.1. check support for the operation in the loop
>> +      /* 4. check support for the operation in the loop
>>
>>          This isn't necessary for the lane reduction codes, since they
>>          can only be produced by pattern matching, and it's up to the
>> @@ -8303,14 +8454,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>>          mixed-sign dot-products can be implemented using signed
>>          dot-products.  */
>>        machine_mode vec_mode = TYPE_MODE (vectype_in);
>> -      if (!lane_reducing
>> -         && !directly_supported_p (op.code, vectype_in, optab_vector))
>> +      if (!directly_supported_p (op.code, vectype_in, optab_vector))
>>          {
>>            if (dump_enabled_p ())
>>              dump_printf (MSG_NOTE, "op not supported by target.\n");
>>           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
>>               || !vect_can_vectorize_without_simd_p (op.code))
>> -           ok = false;
>> +           single_defuse_cycle = false;
>>           else
>>             if (dump_enabled_p ())
>>               dump_printf (MSG_NOTE, "proceeding using word mode.\n");
>> @@ -8323,35 +8473,12 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>>             dump_printf (MSG_NOTE, "using word mode not possible.\n");
>>           return false;
>>         }
>> -
>> -      /* lane-reducing operations have to go through vect_transform_reduction.
>> -         For the other cases try without the single cycle optimization.  */
>> -      if (!ok)
>> -       {
>> -         if (lane_reducing)
>> -           return false;
>> -         else
>> -           single_defuse_cycle = false;
>> -       }
>>      }
>>    STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
>>
>> -  /* If the reduction stmt is one of the patterns that have lane
>> -     reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
>> -  if ((ncopies > 1 && ! single_defuse_cycle)
>> -      && lane_reducing)
>> -    {
>> -      if (dump_enabled_p ())
>> -       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> -                        "multi def-use cycle not possible for lane-reducing "
>> -                        "reduction operation\n");
>> -      return false;
>> -    }
>> -
>> -  if (slp_node
>> -      && !(!single_defuse_cycle
>> -          && !lane_reducing
>> -          && reduction_type != FOLD_LEFT_REDUCTION))
>> +  /* Reduction type of lane-reducing operation is TREE_CODE_REDUCTION, the
>> +     below processing will be done in its own vectorizable function.  */
>> +  if (slp_node && reduction_type == FOLD_LEFT_REDUCTION)
>>      for (i = 0; i < (int) op.num_ops; i++)
>>        if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
>>         {
>> @@ -8364,28 +8491,21 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>>    vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
>>                              reduction_type, ncopies, cost_vec);
>>    /* Cost the reduction op inside the loop if transformed via
>> -     vect_transform_reduction.  Otherwise this is costed by the
>> -     separate vectorizable_* routines.  */
>> -  if (single_defuse_cycle || lane_reducing)
>> -    {
>> -      int factor = 1;
>> -      if (vect_is_emulated_mixed_dot_prod (stmt_info))
>> -       /* Three dot-products and a subtraction.  */
>> -       factor = 4;
>> -      record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
>> -                       stmt_info, 0, vect_body);
>> -    }
>> +     vect_transform_reduction for non-lane-reducing operation.  Otherwise
>> +     this is costed by the separate vectorizable_* routines.  */
>> +  if (single_defuse_cycle && !lane_reducing)
>> +    record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
>>
>>    if (dump_enabled_p ()
>>        && reduction_type == FOLD_LEFT_REDUCTION)
>>      dump_printf_loc (MSG_NOTE, vect_location,
>>                      "using an in-order (fold-left) reduction.\n");
>>    STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
>> -  /* All but single defuse-cycle optimized, lane-reducing and fold-left
>> -     reductions go through their own vectorizable_* routines.  */
>> -  if (!single_defuse_cycle
>> -      && !lane_reducing
>> -      && reduction_type != FOLD_LEFT_REDUCTION)
>> +
>> +  /* All but single defuse-cycle optimized and fold-left reductions go
>> +     through their own vectorizable_* routines.  */
>> +  if ((!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
>> +      || lane_reducing)

> 
> So single-def-use-cycle but lane-reducing ops no longer need
> to go through vect_transform_reduction?  How do you handle those
> but fail to handle non-lane-reducing ops this way?

Emm, all kinds of lane-reducing ops will go into vectorizable_lane_reducing(),
no matter whether they are single-def-use or not. In that function, the
STMT_VINFO_TYPE is set to reduc_vec_info_type, so the transform will be done
inside vect_transform_reduction.

> 
>>      {
>>        stmt_vec_info tem
>>         = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
>> @@ -8490,6 +8610,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>>    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
>>    int i;
>>    int ncopies;
>> +  int stmt_ncopies;
>>    int vec_num;
>>
>>    stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
>> @@ -8513,15 +8634,28 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>>    gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
>>    int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
>>    tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
>> +  tree stmt_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info);
>> +
>> +  /* Get input vectypes from the reduction PHI and the statement to be
>> +     transformed, these two vectypes may have different lanes when
>> +     lane-reducing operation is present.  */
>> +  if (!vectype_in)
>> +    vectype_in = STMT_VINFO_REDUC_VECTYPE (reduc_info);
>> +
>> +  if (!stmt_vectype_in)
>> +    stmt_vectype_in = STMT_VINFO_VECTYPE (stmt_info);
>>
>>    if (slp_node)
>>      {
>>        ncopies = 1;
>> +      stmt_ncopies = 1;
>>        vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
>>      }
>>    else
>>      {
>>        ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
>> +      stmt_ncopies = vect_get_num_copies (loop_vinfo, stmt_vectype_in);
>> +      gcc_assert (stmt_ncopies >= 1 && stmt_ncopies <= ncopies);
>>        vec_num = 1;
>>      }
>>
>> @@ -8530,14 +8664,10 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>>
>>    vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
>>    vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
>> -  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
>> -
>> +  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn,
>> +                                                   stmt_vectype_in);
>>    /* Transform.  */
>> -  tree new_temp = NULL_TREE;
>> -  auto_vec<tree> vec_oprnds0;
>> -  auto_vec<tree> vec_oprnds1;
>> -  auto_vec<tree> vec_oprnds2;
>> -  tree def0;
>> +  auto_vec<tree> vec_oprnds[3];
>>
>>    if (dump_enabled_p ())
>>      dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
>> @@ -8561,8 +8691,6 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>>                       == op.ops[internal_fn_else_index ((internal_fn) code)]));
>>      }
>>
>> -  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
>> -
>>    vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
>>    if (reduction_type == FOLD_LEFT_REDUCTION)
>>      {
>> @@ -8570,7 +8698,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>>        gcc_assert (code.is_tree_code () || cond_fn_p);
>>        return vectorize_fold_left_reduction
>>           (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
>> -          code, reduc_fn, op.ops, op.num_ops, vectype_in,
>> +          code, reduc_fn, op.ops, op.num_ops, stmt_vectype_in,
>>            reduc_index, masks, lens);
>>      }
>>
>> @@ -8581,55 +8709,121 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>>    tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
>>    tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
>>
>> -  /* Get NCOPIES vector definitions for all operands except the reduction
>> -     definition.  */
>> -  if (!cond_fn_p)
>> +  gcc_assert (reduc_index < 3);
>> +
>> +  if (slp_node)
>>      {
>> -      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
>> -                        single_defuse_cycle && reduc_index == 0
>> -                        ? NULL_TREE : op.ops[0], &vec_oprnds0,
>> -                        single_defuse_cycle && reduc_index == 1
>> -                        ? NULL_TREE : op.ops[1], &vec_oprnds1,
>> -                        op.num_ops == 3
>> -                        && !(single_defuse_cycle && reduc_index == 2)
>> -                        ? op.ops[2] : NULL_TREE, &vec_oprnds2);
>> +      gcc_assert (!single_defuse_cycle && op.num_ops <= 3);
> 
> I think that's going to fail.  Mind v3 of the series I posted to enable
> SLP discovery for single-lane reductions.  Basically everything is
> going to be SLP for GCC 15.
> 

Has v3 already landed on trunk? Then by default, any statement that has
no isomorphic partner will become a single-lane SLP node?  And for such a node,
can I just reuse the old non-SLP transformation code?

>> +
>> +      for (i = 0; i < (int) op.num_ops; i++)
>> +       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds[i]);
>>      }
>>    else
>>      {
>> -      /* For a conditional operation pass the truth type as mask
>> -        vectype.  */
>> -      gcc_assert (single_defuse_cycle
>> -                 && (reduc_index == 1 || reduc_index == 2));
>> -      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
>> -                        op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
>> -                        reduc_index == 1 ? NULL_TREE : op.ops[1],
>> -                        NULL_TREE, &vec_oprnds1,
>> -                        reduc_index == 2 ? NULL_TREE : op.ops[2],
>> -                        NULL_TREE, &vec_oprnds2);
>> -    }
>> +      /* The input vectype of the reduction PHI determines copies of
>> +        vectorized def-use cycles, which might be more than effective copies
>> +        of vectorized lane-reducing reduction statements.  This could be
>> +        complemented by generating extra trivial pass-through copies.  For
>> +        example:
>> +
> 
> That also means you need to handle SLP here, but you can assert there's
> only a single lane.
> 
> Btw, you can push the patches I approved if they independently test OK.
> 

>> +          int sum = 0;
>> +          for (i)
>> +            {
>> +              sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
>> +              sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
>> +              sum += n[i];               // normal <vector(4) int>
>> +            }
>> +
>> +        The vector size is 128-bit,vectorization factor is 16.  Reduction
>> +        statements would be transformed as:
>> +
>> +          vector<4> int sum_v0 = { 0, 0, 0, 0 };
>> +          vector<4> int sum_v1 = { 0, 0, 0, 0 };
>> +          vector<4> int sum_v2 = { 0, 0, 0, 0 };
>> +          vector<4> int sum_v3 = { 0, 0, 0, 0 };
>> +
>> +          for (i / 16)
>> +            {
>> +              sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
>> +              sum_v1 = sum_v1;  // copy
>> +              sum_v2 = sum_v2;  // copy
>> +              sum_v3 = sum_v3;  // copy
>> +
>> +              sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
>> +              sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
>> +              sum_v2 = sum_v2;  // copy
>> +              sum_v3 = sum_v3;  // copy
>> +
>> +              sum_v0 += n_v0[i: 0  ~ 3 ];
>> +              sum_v1 += n_v1[i: 4  ~ 7 ];
>> +              sum_v2 += n_v2[i: 8  ~ 11];
>> +              sum_v3 += n_v3[i: 12 ~ 15];
>> +            }
>> +       */
>> +
>> +      for (i = 0; i < MIN (3, (int) op.num_ops); i++)
>> +       {
>> +         tree vectype = NULL_TREE;
>> +         int used_ncopies = ncopies;
>> +
>> +         if (cond_fn_p && i == 0)
>> +           {
>> +             /* For a conditional operation pass the truth type as mask
>> +                vectype.  */
>> +             gcc_assert (single_defuse_cycle && reduc_index > 0);
>> +             vectype = truth_type_for (vectype_in);
>> +           }
>>
>> -  /* For single def-use cycles get one copy of the vectorized reduction
>> -     definition.  */
>> -  if (single_defuse_cycle)
>> -    {
>> -      gcc_assert (!slp_node);
>> -      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
>> -                                    op.ops[reduc_index],
>> -                                    reduc_index == 0 ? &vec_oprnds0
>> -                                    : (reduc_index == 1 ? &vec_oprnds1
>> -                                       : &vec_oprnds2));
>> +         if (i != reduc_index)
>> +           {
>> +             /* For non-reduction operand, deduce effictive copies that are
>> +                involved in vectorized def-use cycles based on the input
>> +                vectype of the reduction statement.  */
>> +             used_ncopies = stmt_ncopies;
>> +           }
>> +         else if (single_defuse_cycle)
>> +           {
>> +             /* For single def-use cycles get one copy of the vectorized
>> +                reduction definition.  */
>> +             used_ncopies = 1;
>> +           }
>> +
>> +         vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, used_ncopies,
>> +                                        op.ops[i], &vec_oprnds[i], vectype);
>> +
>> +         if (used_ncopies < ncopies)
>> +           vec_oprnds[i].safe_grow_cleared (ncopies);
>> +       }
>>      }
>>
>> +  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
>>    bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
>> +  tree def0;
>>
>> -  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
>> +  FOR_EACH_VEC_ELT (vec_oprnds[0], i, def0)
>>      {
>>        gimple *new_stmt;
>> -      tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
>> -      if (masked_loop_p && !mask_by_cond_expr)
>> +      tree new_temp = NULL_TREE;
>> +      tree vop[3] = { def0, vec_oprnds[1][i], NULL_TREE };
>> +
>> +      if (!vop[0] || !vop[1])
>> +       {
>> +         tree reduc_vop = vec_oprnds[reduc_index][i];
>> +
>> +         /* Insert trivial copy if no need to generate vectorized
>> +            statement.  */
>> +         gcc_assert (reduc_vop && stmt_ncopies < ncopies);
>> +
>> +         new_stmt = gimple_build_assign (vec_dest, reduc_vop);
>> +         new_temp = make_ssa_name (vec_dest, new_stmt);
>> +         gimple_set_lhs (new_stmt, new_temp);
>> +         vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
>> +       }
>> +      else if (masked_loop_p && !mask_by_cond_expr)
>>         {
>> -         /* No conditional ifns have been defined for dot-product yet.  */
>> -         gcc_assert (code != DOT_PROD_EXPR);
>> +         /* No conditional ifns have been defined for dot-product and sad
>> +            yet.  */
>> +         gcc_assert (code != DOT_PROD_EXPR && code != SAD_EXPR);
>>
>>           /* Make sure that the reduction accumulator is vop[0].  */
>>           if (reduc_index == 1)
>> @@ -8638,7 +8832,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>>               std::swap (vop[0], vop[1]);
>>             }
>>           tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
>> -                                         vec_num * ncopies, vectype_in, i);
>> +                                         vec_num * stmt_ncopies,
>> +                                         stmt_vectype_in, i);
>>           gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
>>                                                     vop[0], vop[1], vop[0]);
>>           new_temp = make_ssa_name (vec_dest, call);
>> @@ -8650,12 +8845,13 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>>        else
>>         {
>>           if (op.num_ops >= 3)
>> -           vop[2] = vec_oprnds2[i];
>> +           vop[2] = vec_oprnds[2][i];
>>
>>           if (masked_loop_p && mask_by_cond_expr)
>>             {
>>               tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
>> -                                             vec_num * ncopies, vectype_in, i);
>> +                                             vec_num * stmt_ncopies,
>> +                                             stmt_vectype_in, i);
>>               build_vect_cond_expr (code, vop, mask, gsi);
>>             }
>>
>> @@ -8682,16 +8878,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>>
>>        if (slp_node)
>>         slp_node->push_vec_def (new_stmt);
>> -      else if (single_defuse_cycle
>> -              && i < ncopies - 1)
>> -       {
>> -         if (reduc_index == 0)
>> -           vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
>> -         else if (reduc_index == 1)
>> -           vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
>> -         else if (reduc_index == 2)
>> -           vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
>> -       }
>> +      else if (single_defuse_cycle && i < ncopies - 1)
>> +       vec_oprnds[reduc_index][i + 1] = gimple_get_lhs (new_stmt);
>>        else
>>         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
>>      }
>> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
>> index 2e0be763abb..cc0a832f71b 100644
>> --- a/gcc/tree-vect-stmts.cc
>> +++ b/gcc/tree-vect-stmts.cc
>> @@ -13296,6 +13296,8 @@ vect_analyze_stmt (vec_info *vinfo,
>>                                       NULL, NULL, node, cost_vec)
>>           || vectorizable_load (vinfo, stmt_info, NULL, NULL, node, cost_vec)
>>           || vectorizable_store (vinfo, stmt_info, NULL, NULL, node, cost_vec)
>> +         || vectorizable_lane_reducing (as_a <loop_vec_info> (vinfo),
>> +                                        stmt_info, node, cost_vec)
>>           || vectorizable_reduction (as_a <loop_vec_info> (vinfo), stmt_info,
>>                                      node, node_instance, cost_vec)
>>           || vectorizable_induction (as_a <loop_vec_info> (vinfo), stmt_info,
>> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
>> index 97ec9c341e7..ca810869592 100644
>> --- a/gcc/tree-vectorizer.h
>> +++ b/gcc/tree-vectorizer.h
>> @@ -2443,6 +2443,8 @@ extern loop_vec_info vect_create_loop_vinfo (class loop *, vec_info_shared *,
>>  extern bool vectorizable_live_operation (vec_info *, stmt_vec_info,
>>                                          slp_tree, slp_instance, int,
>>                                          bool, stmt_vector_for_cost *);
>> +extern bool vectorizable_lane_reducing (loop_vec_info, stmt_vec_info,
>> +                                       slp_tree, stmt_vector_for_cost *);
>>  extern bool vectorizable_reduction (loop_vec_info, stmt_vec_info,
>>                                     slp_tree, slp_instance,
>>                                     stmt_vector_for_cost *);
>> --
>> 2.17.1

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 5/6] vect: Support multiple lane-reducing operations for loop reduction [PR114440]
  2024-06-02 14:13   ` Feng Xue OS
@ 2024-06-04 13:17     ` Richard Biener
  2024-06-14  4:00       ` Feng Xue OS
  0 siblings, 1 reply; 5+ messages in thread
From: Richard Biener @ 2024-06-04 13:17 UTC (permalink / raw)
  To: Feng Xue OS; +Cc: Tamar Christina, gcc-patches

On Sun, Jun 2, 2024 at 4:13 PM Feng Xue OS <fxue@os.amperecomputing.com> wrote:
>
> Please see my comments below.
>
> Thanks,
> Feng
>
> > On Thu, May 30, 2024 at 4:55 PM Feng Xue OS <fxue@os.amperecomputing.com> wrote:
> >>
> >> For lane-reducing operation(dot-prod/widen-sum/sad) in loop reduction, current
> >> vectorizer could only handle the pattern if the reduction chain does not
> >> contain any other operation, no matter whether it is normal or lane-reducing.
> >>
> >> Actually, to allow multiple arbitrary lane-reducing operations, we need to
> >> support vectorization of loop reduction chain with mixed input vectypes. Since
> >> lanes of vectype may vary with operation, the effective ncopies of vectorized
> >> statements for each operation may also not be the same, which causes a
> >> mismatch on vectorized def-use cycles. A simple way is to align all operations
> >> with the one that has the most ncopies, the gap could be complemented by
> >> generating extra trivial pass-through copies. For example:
> >>
> >>    int sum = 0;
> >>    for (i)
> >>      {
> >>        sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
> >>        sum += w[i];               // widen-sum <vector(16) char>
> >>        sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
> >>        sum += n[i];               // normal <vector(4) int>
> >>      }
> >>
> >> The vector size is 128-bit, vectorization factor is 16. Reduction statements
> >> would be transformed as:
> >>
> >>    vector<4> int sum_v0 = { 0, 0, 0, 0 };
> >>    vector<4> int sum_v1 = { 0, 0, 0, 0 };
> >>    vector<4> int sum_v2 = { 0, 0, 0, 0 };
> >>    vector<4> int sum_v3 = { 0, 0, 0, 0 };
> >>
> >>    for (i / 16)
> >>      {
> >>        sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
> >>        sum_v1 = sum_v1;  // copy
> >>        sum_v2 = sum_v2;  // copy
> >>        sum_v3 = sum_v3;  // copy
> >>
> >>        sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0);
> >>        sum_v1 = sum_v1;  // copy
> >>        sum_v2 = sum_v2;  // copy
> >>        sum_v3 = sum_v3;  // copy
> >>
> >>        sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
> >>        sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
> >>        sum_v2 = sum_v2;  // copy
> >>        sum_v3 = sum_v3;  // copy
> >>
> >>        sum_v0 += n_v0[i: 0  ~ 3 ];
> >>        sum_v1 += n_v1[i: 4  ~ 7 ];
> >>        sum_v2 += n_v2[i: 8  ~ 11];
> >>        sum_v3 += n_v3[i: 12 ~ 15];
> >>      }
> >>
> >> Thanks,
> >> Feng
> >>
> >> ...
> >>
> >> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> >> index 20c99f11e9a..b5849dbb08a 100644
> >> --- a/gcc/tree-vect-loop.cc
> >> +++ b/gcc/tree-vect-loop.cc
> >> @@ -5322,8 +5322,6 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
> >>    if (!gimple_extract_op (orig_stmt_info->stmt, &op))
> >>      gcc_unreachable ();
> >>
> >> -  bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
> >> -
> >>    if (reduction_type == EXTRACT_LAST_REDUCTION)
> >>      /* No extra instructions are needed in the prologue.  The loop body
> >>         operations are costed in vectorizable_condition.  */
> >> @@ -5358,12 +5356,8 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
> >>            initial result of the data reduction, initial value of the index
> >>            reduction.  */
> >>         prologue_stmts = 4;
> >> -      else if (emulated_mixed_dot_prod)
> >> -       /* We need the initial reduction value and two invariants:
> >> -          one that contains the minimum signed value and one that
> >> -          contains half of its negative.  */
> >> -       prologue_stmts = 3;
> >>        else
> >> +       /* We need the initial reduction value.  */
> >>         prologue_stmts = 1;
> >>        prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
> >>                                          scalar_to_vec, stmt_info, 0,
> >> @@ -7464,6 +7458,169 @@ vect_reduction_use_partial_vector (loop_vec_info loop_vinfo,
> >>      }
> >>  }
> >>
> >> +/* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
> >> +   the context of LOOP_VINFO, and vector cost will be recorded in COST_VEC.
> >> +   Now there are three such kinds of operations: dot-prod/widen-sum/sad
> >> +   (sum-of-absolute-differences).
> >> +
> >> +   For a lane-reducing operation, the loop reduction path that it lies in,
> >> +   may contain normal operation, or other lane-reducing operation of different
> >> +   input type size, an example as:
> >> +
> >> +     int sum = 0;
> >> +     for (i)
> >> +       {
> >> +         ...
> >> +         sum += d0[i] * d1[i];       // dot-prod <vector(16) char>
> >> +         sum += w[i];                // widen-sum <vector(16) char>
> >> +         sum += abs(s0[i] - s1[i]);  // sad <vector(8) short>
> >> +         sum += n[i];                // normal <vector(4) int>
> >> +         ...
> >> +       }
> >> +
> >> +   Vectorization factor is essentially determined by operation whose input
> >> +   vectype has the most lanes ("vector(16) char" in the example), while we
> >> +   need to choose input vectype with the least lanes ("vector(4) int" in the
> >> +   example) for the reduction PHI statement.  */
> >> +
> >> +bool
> >> +vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
> >> +                           slp_tree slp_node, stmt_vector_for_cost *cost_vec)
> >> +{
> >> +  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
> >> +  if (!stmt)
> >> +    return false;
> >> +
> >> +  enum tree_code code = gimple_assign_rhs_code (stmt);
> >> +
> >> +  if (!lane_reducing_op_p (code))
> >> +    return false;
> >
> > Can you make sure to return false if STMT_VINFO_REDUC_IDX == -1
> > thus the op is not part of a reduction chain/path?
> >
>
> As I planed, in the 2nd stage patches WIP, this function will also handle
> lane-reducing operation that does not directly participate reduction, like:
>
>  temp = dot_prod1 + dot_prod2;
>  sum += temp;
>
> In this case, STMT_VINFO_REDUC_IDX of dot_prod1/2 == -1
>
> For current work, the check is needed to filter out non-reduction statement,
> but since it is expected to be removed later, so the check is placed at a late
> point.
>
> >> +  tree type = TREE_TYPE (gimple_assign_lhs (stmt));
> >> +
> >> +  if (!INTEGRAL_TYPE_P (type) && !SCALAR_FLOAT_TYPE_P (type))
> >> +    return false;
> >> +
> >> +  /* Do not try to vectorize bit-precision reductions.  */
> >> +  if (!type_has_mode_precision_p (type))
> >> +    return false;
> >> +
> >> +  tree vectype_in = NULL_TREE;
> >> +
> >> +  for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
> >> +    {
> >> +      stmt_vec_info def_stmt_info;
> >> +      slp_tree slp_op;
> >> +      tree op;
> >> +      tree vectype;
> >> +      enum vect_def_type dt;
> >> +
> >> +      if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_node, i, &op,
> >> +                              &slp_op, &dt, &vectype, &def_stmt_info))
> >> +       {
> >> +         if (dump_enabled_p ())
> >> +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> >> +                            "use not simple.\n");
> >> +         return false;
> >> +       }
> >> +
> >> +      if (!vectype)
> >> +       {
> >> +         vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
> >> +                                                slp_op);
> >> +         if (!vectype)
> >> +           return false;
> >> +       }
> >> +
> >> +      if (slp_node && !vect_maybe_update_slp_op_vectype (slp_op, vectype))
> >
> > Please avoid this during transform.
>
> This function is only for analysis not transform.
>
> >> +       {
> >> +         if (dump_enabled_p ())
> >> +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> >> +                            "incompatible vector types for invariants\n");
> >> +         return false;
> >> +       }
> >> +
> >> +      if (i == STMT_VINFO_REDUC_IDX (stmt_info))
> >> +       continue;
> >> +
> >> +      /* There should be at most one cycle def in the stmt.  */
> >> +      if (VECTORIZABLE_CYCLE_DEF (dt))
> >> +       return false;
> >> +
> >> +      /* To properly compute ncopies we are interested in the widest
> >> +        non-reduction input type in case we're looking at a widening
> >> +        accumulation that we later handle in vect transformation.  */
> >> +      if (!vectype_in
> >> +         || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
> >> +             < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype)))))
> >> +       vectype_in = vectype;
> >> +    }
> >> +
> >> +  STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in;
> >
> > As said below I wonder where we would need STMT_VINFO_REDUC_VECTYPE_IN.
> > At least you should avoid re-setting this when !cost_vec aka during transform,
> > possibly instead asserting you re-compute the same type (or simply
> > skip the above
> > loop and set vectype_in from STMT_VINFO_REDUC_VECTYPE_IN which then
> > gets a good use).
>
> Likewise.
>
> >
> >> +  stmt_vec_info reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
> >> +
> >> +  /* TODO: Support lane-reducing operation that does not directly participate
> >> +     in loop reduction. */
> >> +  if (!reduc_info || STMT_VINFO_REDUC_IDX (stmt_info) < 0)
> >> +    return false;
> >> +
> >> +  /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not
> >> +     recognized.  */
> >> +  gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_reduction_def);
> >> +  gcc_assert (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION);
> >> +
> >> +  tree vphi_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
> >> +
> >> +  /* To accommodate lane-reducing operations of mixed input vectypes, choose
> >> +     input vectype with the least lanes for the reduction PHI statement, which
> >> +     would result in the most ncopies for vectorized reduction results.  */
> >> +  if (!vphi_vectype_in
> >> +      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
> >> +         > GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vphi_vectype_in)))))
> >> +    STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
> >
> > Likewise.
> >
> >> +  int ncopies_for_cost;
> >> +
> >> +  if (slp_node)
> >> +    {
> >> +      /* Now lane-reducing operations in a slp node should only come from
> >> +        the same loop reduction path.  */
> >> +      gcc_assert (REDUC_GROUP_FIRST_ELEMENT (stmt_info));
> >> +      ncopies_for_cost = 1;
> >> +    }
> >> +  else
> >> +    {
> >> +      ncopies_for_cost = vect_get_num_copies (loop_vinfo, vectype_in);
> >> +      gcc_assert (ncopies_for_cost >= 1);
> >> +    }
> >> +
> >> +  if (vect_is_emulated_mixed_dot_prod (stmt_info))
> >> +    {
> >> +      /* We need extra two invariants: one that contains the minimum signed
> >> +        value and one that contains half of its negative.  */
> >> +      int prologue_stmts = 2;
> >> +      unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
> >> +                                       scalar_to_vec, stmt_info, 0,
> >> +                                       vect_prologue);
> >> +      if (dump_enabled_p ())
> >> +       dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
> >> +                    "extra prologue_cost = %d .\n", cost);
> >> +
> >> +      /* Three dot-products and a subtraction.  */
> >> +      ncopies_for_cost *= 4;
> >> +    }
> >> +
> >> +  record_stmt_cost (cost_vec, ncopies_for_cost, vector_stmt, stmt_info, 0,
> >> +                   vect_body);
> >> +
> >> +  vect_reduction_use_partial_vector (loop_vinfo, reduc_info, slp_node, code,
> >> +                                    type, vectype_in);
> >> +
> >> +  STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
> >
> > Uh, so those all go through vect_transform_reduction.  I see.
> >
> > I fail to see a check for whether the target supports the lane-reducing op.
> > vectorizable_reduction only checks the last one.  Currently the check
> > might be redundant with what pattern recognition checks but it's still
> > incomplete compared to the check in vectorizable_reduction.
>
> In the original vectorizable_reduction, the target support check is deliberately
> skipped for lane-reducing operations. The reason is partly as you said; moreover,
> the other check would never be executed.
>
>   if (single_defuse_cycle || lane_reduc_code_p)
>     {
>       gcc_assert (op.code != COND_EXPR);
>
>       /* 4. Supportable by target?  */
>       bool ok = true;
>
>       /* 4.1. check support for the operation in the loop
>
>          This isn't necessary for the lane reduction codes, since they
>          can only be produced by pattern matching, and it's up to the
>          pattern matcher to test for support.  The main reason for
>          specifically skipping this step is to avoid rechecking whether
>          mixed-sign dot-products can be implemented using signed
>          dot-products.  */
>       machine_mode vec_mode = TYPE_MODE (vectype_in);
>       if (!lane_reduc_code_p                              //<----------- skip
>           && !directly_supported_p (op.code, vectype_in, optab_vector))
>         {
>           if (dump_enabled_p ())
>             dump_printf (MSG_NOTE, "op not supported by target.\n");
>           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
>               || !vect_can_vectorize_without_simd_p (op.code))
>             ok = false;
>           else
>             if (dump_enabled_p ())
>               dump_printf (MSG_NOTE, "proceeding using word mode.\n");
>         }
>
>       // <----- always false for lane-reducing op
>
>       if (vect_emulated_vector_p (vectype_in)
>           && !vect_can_vectorize_without_simd_p (op.code))
>         {
>           if (dump_enabled_p ())
>             dump_printf (MSG_NOTE, "using word mode not possible.\n");
>           return false;
>         }
>
> >
> >> +  return true;
> >> +}
> >> +
> >>  /* Function vectorizable_reduction.
> >>
> >>     Check if STMT_INFO performs a reduction operation that can be vectorized.
> >> @@ -7609,6 +7766,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>                                (gimple_bb (reduc_def_phi)->loop_father));
> >>    unsigned reduc_chain_length = 0;
> >>    bool only_slp_reduc_chain = true;
> >> +  bool only_lane_reducing = true;
> >>    stmt_info = NULL;
> >>    slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
> >>    while (reduc_def != PHI_RESULT (reduc_def_phi))
> >> @@ -7659,9 +7817,16 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>               return false;
> >>             }
> >>         }
> >> -      else if (!stmt_info)
> >> -       /* First non-conversion stmt.  */
> >> -       stmt_info = vdef;
> >> +      else
> >> +       {
> >> +         /* First non-conversion stmt.  */
> >> +         if (!stmt_info)
> >> +           stmt_info = vdef;
> >> +
> >> +         if (!lane_reducing_op_p (op.code))
> >> +           only_lane_reducing = false;
> >> +       }
> >> +
> >>        reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
> >>        reduc_chain_length++;
> >>        if (!stmt_info && slp_node)
> >> @@ -7733,18 +7898,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>    if (!type_has_mode_precision_p (op.type))
> >>      return false;
> >>
> >> -  /* For lane-reducing ops we're reducing the number of reduction PHIs
> >> -     which means the only use of that may be in the lane-reducing operation.  */
> >> -  if (lane_reducing
> >> -      && reduc_chain_length != 1
> >> -      && !only_slp_reduc_chain)
> >> -    {
> >> -      if (dump_enabled_p ())
> >> -       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> >> -                        "lane-reducing reduction with extra stmts.\n");
> >> -      return false;
> >> -    }
> >> -
> >>    /* Lane-reducing ops also never can be used in a SLP reduction group
> >>       since we'll mix lanes belonging to different reductions.  But it's
> >>       OK to use them in a reduction chain or when the reduction group
> >> @@ -7788,9 +7941,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>                              "use not simple.\n");
> >>           return false;
> >>         }
> >> -      if (i == STMT_VINFO_REDUC_IDX (stmt_info))
> >> -       continue;
> >> -
> >
> > So within this loop we analyze the "main" operation, while I do not exactly
> > remember why we skip the op leading to the PHI I don't understand why you
> > want to look at it for the multi lane-reducing case (the accumulator
> > always has the same type, no?).
> >
> > In any case this just looks at a single (the last) lane-reducing or even
> > not lane-reducing op.
> >
>
> This comparison is redundant, since it could be covered by the following
> comparison statement. The change should have been placed to a separate
> patch, but for convenience I made it here.
>
>       /* For an IFN_COND_OP we might hit the reduction definition operand
>          twice (once as definition, once as else).  */
>       if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
>         continue;
>
>       /* There should be only one cycle def in the stmt, the one
>          leading to reduc_def.  */
>       if (VECTORIZABLE_CYCLE_DEF (dt))
>         return false;
>
> >>        /* For an IFN_COND_OP we might hit the reduction definition operand
> >>          twice (once as definition, once as else).  */
> >>        if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
> >> @@ -7836,17 +7986,21 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>      }
> >>    if (!vectype_in)
> >>      vectype_in = STMT_VINFO_VECTYPE (phi_info);
> >> -  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
> >>
> >> -  /* Each lane-reducing operation has its own input vectype, while reduction
> >> -     PHI records the input vectype with least lanes.  */
> >> -  if (lane_reducing)
> >> -    STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in;
> >> -
> >> -  enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
> >> -  STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
> >> +  /* If there is a normal (non-lane-reducing) operation in the loop reduction
> >> +     path, to ensure there will be enough copies to hold vectorized results of
> >> +     the operation, we need set the input vectype of the reduction PHI to be
> >> +     same as the reduction output vectype somewhere, here is a suitable place.
> >> +     Otherwise the input vectype is set to the one with the least lanes, which
> >> +     can only be determined in vectorizable analysis routine of lane-reducing
> >> +     operation.  */
> >
> > But we are using vectype_in to compute ncopies which is used in cost analysis.
>
> The vectype_in only impacts the cost analysis for a lane-reducing op, since the
> function vect_is_emulated_mixed_dot_prod needs it, and this function is referenced
> by cost analysis. In the previous patch, we bind the vectype_in to each
> lane-reducing op and also adjust code of the function accordingly, then this
> would not be a problem.
>
> > You say this might not be the final ncopies?  Note the vectorization factor is
> > already fixed as well as (output) vector types of the lane-reducing ops.  So
>
> The vectype_in is incrementally updated during analyzing vectorizablility of
> lane-reducing ops. So before transform, the type should be determined.
>
> > shouldn't we simply pick that up in the loop walking the use-def chain via
> > REDUC_IDX at the start of this function?
>
> I thought about doing it in that way. Ok. will consider it again.
>
> > I'm unsure as to why we need
> > STMT_VINFO_REDUC_VECTYPE_IN at all (I don't remember adding that),
> > it should be readily available from operand analysis.  The docs for that
> > isn't very enlightening either (there's also REDUC_VECTYPE, in addition
> > to VECTYPE - huh).
>
> For old code, in which only one lane-reducing op is allowed in loop
> reduction, this type might be computed on-demand.
>
> But for multiple lane-reducing ops, we need to know the vectype_in types
> of all ops in order to determine a proper vectype_in for the PHI statement, and
> traversing those ops and computing the types on-demand would not be a good
> way.  Additionally, during transform, the original cfg flow is broken and could
> not be used.
>
> >> +  if (!only_lane_reducing)
> >> +    STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = STMT_VINFO_VECTYPE (phi_info);
> >> +
> >> +  enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info);
> >> +  STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type;
> >>    /* If we have a condition reduction, see if we can simplify it further.  */
> >> -  if (v_reduc_type == COND_REDUCTION)
> >> +  if (reduction_type == COND_REDUCTION)
> >>      {
> >>        if (slp_node)
> >>         return false;
> >> @@ -8012,8 +8166,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>      }
> >>
> >>    STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
> >> +  reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
> >>
> >> -  vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
> >>    if (reduction_type == TREE_CODE_REDUCTION)
> >>      {
> >>        /* Check whether it's ok to change the order of the computation.
> >> @@ -8287,14 +8441,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>        && loop_vinfo->suggested_unroll_factor == 1)
> >>      single_defuse_cycle = true;
> >>
> >> -  if (single_defuse_cycle || lane_reducing)
> >> +  if (single_defuse_cycle && !lane_reducing)
> >>      {
> >>        gcc_assert (op.code != COND_EXPR);
> >>
> >> -      /* 4. Supportable by target?  */
> >> -      bool ok = true;
> >> -
> >> -      /* 4.1. check support for the operation in the loop
> >> +      /* 4. check support for the operation in the loop
> >>
> >>          This isn't necessary for the lane reduction codes, since they
> >>          can only be produced by pattern matching, and it's up to the
> >> @@ -8303,14 +8454,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>          mixed-sign dot-products can be implemented using signed
> >>          dot-products.  */
> >>        machine_mode vec_mode = TYPE_MODE (vectype_in);
> >> -      if (!lane_reducing
> >> -         && !directly_supported_p (op.code, vectype_in, optab_vector))
> >> +      if (!directly_supported_p (op.code, vectype_in, optab_vector))
> >>          {
> >>            if (dump_enabled_p ())
> >>              dump_printf (MSG_NOTE, "op not supported by target.\n");
> >>           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
> >>               || !vect_can_vectorize_without_simd_p (op.code))
> >> -           ok = false;
> >> +           single_defuse_cycle = false;
> >>           else
> >>             if (dump_enabled_p ())
> >>               dump_printf (MSG_NOTE, "proceeding using word mode.\n");
> >> @@ -8323,35 +8473,12 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>             dump_printf (MSG_NOTE, "using word mode not possible.\n");
> >>           return false;
> >>         }
> >> -
> >> -      /* lane-reducing operations have to go through vect_transform_reduction.
> >> -         For the other cases try without the single cycle optimization.  */
> >> -      if (!ok)
> >> -       {
> >> -         if (lane_reducing)
> >> -           return false;
> >> -         else
> >> -           single_defuse_cycle = false;
> >> -       }
> >>      }
> >>    STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
> >>
> >> -  /* If the reduction stmt is one of the patterns that have lane
> >> -     reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
> >> -  if ((ncopies > 1 && ! single_defuse_cycle)
> >> -      && lane_reducing)
> >> -    {
> >> -      if (dump_enabled_p ())
> >> -       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> >> -                        "multi def-use cycle not possible for lane-reducing "
> >> -                        "reduction operation\n");
> >> -      return false;
> >> -    }
> >> -
> >> -  if (slp_node
> >> -      && !(!single_defuse_cycle
> >> -          && !lane_reducing
> >> -          && reduction_type != FOLD_LEFT_REDUCTION))
> >> +  /* Reduction type of lane-reducing operation is TREE_CODE_REDUCTION, the
> >> +     below processing will be done in its own vectorizable function.  */
> >> +  if (slp_node && reduction_type == FOLD_LEFT_REDUCTION)
> >>      for (i = 0; i < (int) op.num_ops; i++)
> >>        if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
> >>         {
> >> @@ -8364,28 +8491,21 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>    vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
> >>                              reduction_type, ncopies, cost_vec);
> >>    /* Cost the reduction op inside the loop if transformed via
> >> -     vect_transform_reduction.  Otherwise this is costed by the
> >> -     separate vectorizable_* routines.  */
> >> -  if (single_defuse_cycle || lane_reducing)
> >> -    {
> >> -      int factor = 1;
> >> -      if (vect_is_emulated_mixed_dot_prod (stmt_info))
> >> -       /* Three dot-products and a subtraction.  */
> >> -       factor = 4;
> >> -      record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
> >> -                       stmt_info, 0, vect_body);
> >> -    }
> >> +     vect_transform_reduction for non-lane-reducing operation.  Otherwise
> >> +     this is costed by the separate vectorizable_* routines.  */
> >> +  if (single_defuse_cycle && !lane_reducing)
> >> +    record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
> >>
> >>    if (dump_enabled_p ()
> >>        && reduction_type == FOLD_LEFT_REDUCTION)
> >>      dump_printf_loc (MSG_NOTE, vect_location,
> >>                      "using an in-order (fold-left) reduction.\n");
> >>    STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
> >> -  /* All but single defuse-cycle optimized, lane-reducing and fold-left
> >> -     reductions go through their own vectorizable_* routines.  */
> >> -  if (!single_defuse_cycle
> >> -      && !lane_reducing
> >> -      && reduction_type != FOLD_LEFT_REDUCTION)
> >> +
> >> +  /* All but single defuse-cycle optimized and fold-left reductions go
> >> +     through their own vectorizable_* routines.  */
> >> +  if ((!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
> >> +      || lane_reducing)
>
> >
> > So single-def-use-cycle but lane-reducing ops no longer need
> > to go through vect_transform_reduction?  How do you handle those
> > but fail to handle non-lane-reducing ops this way?
>
> Emm, all kinds of lane-reducing ops will go into vectorizable_lane_reducing(),
> no matter it is single-def-use or not, at that function, the STMT_VINFO_TYPE
> is set to reduc_vec_info_type, so transform will be done inside
> vect_transform_reduction.
>
> >
> >>      {
> >>        stmt_vec_info tem
> >>         = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
> >> @@ -8490,6 +8610,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> >>    int i;
> >>    int ncopies;
> >> +  int stmt_ncopies;
> >>    int vec_num;
> >>
> >>    stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
> >> @@ -8513,15 +8634,28 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>    gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
> >>    int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
> >>    tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
> >> +  tree stmt_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info);
> >> +
> >> +  /* Get input vectypes from the reduction PHI and the statement to be
> >> +     transformed, these two vectypes may have different lanes when
> >> +     lane-reducing operation is present.  */
> >> +  if (!vectype_in)
> >> +    vectype_in = STMT_VINFO_REDUC_VECTYPE (reduc_info);
> >> +
> >> +  if (!stmt_vectype_in)
> >> +    stmt_vectype_in = STMT_VINFO_VECTYPE (stmt_info);
> >>
> >>    if (slp_node)
> >>      {
> >>        ncopies = 1;
> >> +      stmt_ncopies = 1;
> >>        vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
> >>      }
> >>    else
> >>      {
> >>        ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
> >> +      stmt_ncopies = vect_get_num_copies (loop_vinfo, stmt_vectype_in);
> >> +      gcc_assert (stmt_ncopies >= 1 && stmt_ncopies <= ncopies);
> >>        vec_num = 1;
> >>      }
> >>
> >> @@ -8530,14 +8664,10 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>
> >>    vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
> >>    vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
> >> -  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
> >> -
> >> +  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn,
> >> +                                                   stmt_vectype_in);
> >>    /* Transform.  */
> >> -  tree new_temp = NULL_TREE;
> >> -  auto_vec<tree> vec_oprnds0;
> >> -  auto_vec<tree> vec_oprnds1;
> >> -  auto_vec<tree> vec_oprnds2;
> >> -  tree def0;
> >> +  auto_vec<tree> vec_oprnds[3];
> >>
> >>    if (dump_enabled_p ())
> >>      dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
> >> @@ -8561,8 +8691,6 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>                       == op.ops[internal_fn_else_index ((internal_fn) code)]));
> >>      }
> >>
> >> -  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> >> -
> >>    vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
> >>    if (reduction_type == FOLD_LEFT_REDUCTION)
> >>      {
> >> @@ -8570,7 +8698,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>        gcc_assert (code.is_tree_code () || cond_fn_p);
> >>        return vectorize_fold_left_reduction
> >>           (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
> >> -          code, reduc_fn, op.ops, op.num_ops, vectype_in,
> >> +          code, reduc_fn, op.ops, op.num_ops, stmt_vectype_in,
> >>            reduc_index, masks, lens);
> >>      }
> >>
> >> @@ -8581,55 +8709,121 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>    tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
> >>    tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
> >>
> >> -  /* Get NCOPIES vector definitions for all operands except the reduction
> >> -     definition.  */
> >> -  if (!cond_fn_p)
> >> +  gcc_assert (reduc_index < 3);
> >> +
> >> +  if (slp_node)
> >>      {
> >> -      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
> >> -                        single_defuse_cycle && reduc_index == 0
> >> -                        ? NULL_TREE : op.ops[0], &vec_oprnds0,
> >> -                        single_defuse_cycle && reduc_index == 1
> >> -                        ? NULL_TREE : op.ops[1], &vec_oprnds1,
> >> -                        op.num_ops == 3
> >> -                        && !(single_defuse_cycle && reduc_index == 2)
> >> -                        ? op.ops[2] : NULL_TREE, &vec_oprnds2);
> >> +      gcc_assert (!single_defuse_cycle && op.num_ops <= 3);
> >
> > I think that's going to fail.  Mind v3 of the series I posted to enable
> > SLP discovery for single-lane reductions.  Basically everything is
> > going to be SLP for GCC 15.
> >
>
> Have the v3 already been in the trunk? Then by default, any statement that has
> no isomorphic partner will become a single-lane SLP node?  And for such node,
> can I just reuse the old non-SLP transformation code?

As of this morning, r15-1006-gd93353e6423eca, it is on trunk.  Note the fallback
is still non-SLP in case vectorizable_reduction FAILs with SLP.  I have a set of
changes queued to allow some more kind of reductions with SLP but IIRC the
lane-reducing variant is already supported.

Richard.

> >> +
> >> +      for (i = 0; i < (int) op.num_ops; i++)
> >> +       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds[i]);
> >>      }
> >>    else
> >>      {
> >> -      /* For a conditional operation pass the truth type as mask
> >> -        vectype.  */
> >> -      gcc_assert (single_defuse_cycle
> >> -                 && (reduc_index == 1 || reduc_index == 2));
> >> -      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
> >> -                        op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
> >> -                        reduc_index == 1 ? NULL_TREE : op.ops[1],
> >> -                        NULL_TREE, &vec_oprnds1,
> >> -                        reduc_index == 2 ? NULL_TREE : op.ops[2],
> >> -                        NULL_TREE, &vec_oprnds2);
> >> -    }
> >> +      /* The input vectype of the reduction PHI determines copies of
> >> +        vectorized def-use cycles, which might be more than effective copies
> >> +        of vectorized lane-reducing reduction statements.  This could be
> >> +        complemented by generating extra trivial pass-through copies.  For
> >> +        example:
> >> +
> >
> > That also means you need to handle SLP here, but you can assert there's
> > only a single lane.
> >
> > Btw, you can push the patches I approved if they independently test OK.
> >
>
> >> +          int sum = 0;
> >> +          for (i)
> >> +            {
> >> +              sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
> >> +              sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
> >> +              sum += n[i];               // normal <vector(4) int>
> >> +            }
> >> +
> >> +        The vector size is 128-bit,vectorization factor is 16.  Reduction
> >> +        statements would be transformed as:
> >> +
> >> +          vector<4> int sum_v0 = { 0, 0, 0, 0 };
> >> +          vector<4> int sum_v1 = { 0, 0, 0, 0 };
> >> +          vector<4> int sum_v2 = { 0, 0, 0, 0 };
> >> +          vector<4> int sum_v3 = { 0, 0, 0, 0 };
> >> +
> >> +          for (i / 16)
> >> +            {
> >> +              sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
> >> +              sum_v1 = sum_v1;  // copy
> >> +              sum_v2 = sum_v2;  // copy
> >> +              sum_v3 = sum_v3;  // copy
> >> +
> >> +              sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
> >> +              sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
> >> +              sum_v2 = sum_v2;  // copy
> >> +              sum_v3 = sum_v3;  // copy
> >> +
> >> +              sum_v0 += n_v0[i: 0  ~ 3 ];
> >> +              sum_v1 += n_v1[i: 4  ~ 7 ];
> >> +              sum_v2 += n_v2[i: 8  ~ 11];
> >> +              sum_v3 += n_v3[i: 12 ~ 15];
> >> +            }
> >> +       */
> >> +
> >> +      for (i = 0; i < MIN (3, (int) op.num_ops); i++)
> >> +       {
> >> +         tree vectype = NULL_TREE;
> >> +         int used_ncopies = ncopies;
> >> +
> >> +         if (cond_fn_p && i == 0)
> >> +           {
> >> +             /* For a conditional operation pass the truth type as mask
> >> +                vectype.  */
> >> +             gcc_assert (single_defuse_cycle && reduc_index > 0);
> >> +             vectype = truth_type_for (vectype_in);
> >> +           }
> >>
> >> -  /* For single def-use cycles get one copy of the vectorized reduction
> >> -     definition.  */
> >> -  if (single_defuse_cycle)
> >> -    {
> >> -      gcc_assert (!slp_node);
> >> -      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
> >> -                                    op.ops[reduc_index],
> >> -                                    reduc_index == 0 ? &vec_oprnds0
> >> -                                    : (reduc_index == 1 ? &vec_oprnds1
> >> -                                       : &vec_oprnds2));
> >> +         if (i != reduc_index)
> >> +           {
> >> +             /* For non-reduction operand, deduce effictive copies that are
> >> +                involved in vectorized def-use cycles based on the input
> >> +                vectype of the reduction statement.  */
> >> +             used_ncopies = stmt_ncopies;
> >> +           }
> >> +         else if (single_defuse_cycle)
> >> +           {
> >> +             /* For single def-use cycles get one copy of the vectorized
> >> +                reduction definition.  */
> >> +             used_ncopies = 1;
> >> +           }
> >> +
> >> +         vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, used_ncopies,
> >> +                                        op.ops[i], &vec_oprnds[i], vectype);
> >> +
> >> +         if (used_ncopies < ncopies)
> >> +           vec_oprnds[i].safe_grow_cleared (ncopies);
> >> +       }
> >>      }
> >>
> >> +  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> >>    bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
> >> +  tree def0;
> >>
> >> -  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
> >> +  FOR_EACH_VEC_ELT (vec_oprnds[0], i, def0)
> >>      {
> >>        gimple *new_stmt;
> >> -      tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
> >> -      if (masked_loop_p && !mask_by_cond_expr)
> >> +      tree new_temp = NULL_TREE;
> >> +      tree vop[3] = { def0, vec_oprnds[1][i], NULL_TREE };
> >> +
> >> +      if (!vop[0] || !vop[1])
> >> +       {
> >> +         tree reduc_vop = vec_oprnds[reduc_index][i];
> >> +
> >> +         /* Insert trivial copy if no need to generate vectorized
> >> +            statement.  */
> >> +         gcc_assert (reduc_vop && stmt_ncopies < ncopies);
> >> +
> >> +         new_stmt = gimple_build_assign (vec_dest, reduc_vop);
> >> +         new_temp = make_ssa_name (vec_dest, new_stmt);
> >> +         gimple_set_lhs (new_stmt, new_temp);
> >> +         vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
> >> +       }
> >> +      else if (masked_loop_p && !mask_by_cond_expr)
> >>         {
> >> -         /* No conditional ifns have been defined for dot-product yet.  */
> >> -         gcc_assert (code != DOT_PROD_EXPR);
> >> +         /* No conditional ifns have been defined for dot-product and sad
> >> +            yet.  */
> >> +         gcc_assert (code != DOT_PROD_EXPR && code != SAD_EXPR);
> >>
> >>           /* Make sure that the reduction accumulator is vop[0].  */
> >>           if (reduc_index == 1)
> >> @@ -8638,7 +8832,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>               std::swap (vop[0], vop[1]);
> >>             }
> >>           tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
> >> -                                         vec_num * ncopies, vectype_in, i);
> >> +                                         vec_num * stmt_ncopies,
> >> +                                         stmt_vectype_in, i);
> >>           gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
> >>                                                     vop[0], vop[1], vop[0]);
> >>           new_temp = make_ssa_name (vec_dest, call);
> >> @@ -8650,12 +8845,13 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>        else
> >>         {
> >>           if (op.num_ops >= 3)
> >> -           vop[2] = vec_oprnds2[i];
> >> +           vop[2] = vec_oprnds[2][i];
> >>
> >>           if (masked_loop_p && mask_by_cond_expr)
> >>             {
> >>               tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
> >> -                                             vec_num * ncopies, vectype_in, i);
> >> +                                             vec_num * stmt_ncopies,
> >> +                                             stmt_vectype_in, i);
> >>               build_vect_cond_expr (code, vop, mask, gsi);
> >>             }
> >>
> >> @@ -8682,16 +8878,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>
> >>        if (slp_node)
> >>         slp_node->push_vec_def (new_stmt);
> >> -      else if (single_defuse_cycle
> >> -              && i < ncopies - 1)
> >> -       {
> >> -         if (reduc_index == 0)
> >> -           vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
> >> -         else if (reduc_index == 1)
> >> -           vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
> >> -         else if (reduc_index == 2)
> >> -           vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
> >> -       }
> >> +      else if (single_defuse_cycle && i < ncopies - 1)
> >> +       vec_oprnds[reduc_index][i + 1] = gimple_get_lhs (new_stmt);
> >>        else
> >>         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
> >>      }
> >> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> >> index 2e0be763abb..cc0a832f71b 100644
> >> --- a/gcc/tree-vect-stmts.cc
> >> +++ b/gcc/tree-vect-stmts.cc
> >> @@ -13296,6 +13296,8 @@ vect_analyze_stmt (vec_info *vinfo,
> >>                                       NULL, NULL, node, cost_vec)
> >>           || vectorizable_load (vinfo, stmt_info, NULL, NULL, node, cost_vec)
> >>           || vectorizable_store (vinfo, stmt_info, NULL, NULL, node, cost_vec)
> >> +         || vectorizable_lane_reducing (as_a <loop_vec_info> (vinfo),
> >> +                                        stmt_info, node, cost_vec)
> >>           || vectorizable_reduction (as_a <loop_vec_info> (vinfo), stmt_info,
> >>                                      node, node_instance, cost_vec)
> >>           || vectorizable_induction (as_a <loop_vec_info> (vinfo), stmt_info,
> >> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> >> index 97ec9c341e7..ca810869592 100644
> >> --- a/gcc/tree-vectorizer.h
> >> +++ b/gcc/tree-vectorizer.h
> >> @@ -2443,6 +2443,8 @@ extern loop_vec_info vect_create_loop_vinfo (class loop *, vec_info_shared *,
> >>  extern bool vectorizable_live_operation (vec_info *, stmt_vec_info,
> >>                                          slp_tree, slp_instance, int,
> >>                                          bool, stmt_vector_for_cost *);
> >> +extern bool vectorizable_lane_reducing (loop_vec_info, stmt_vec_info,
> >> +                                       slp_tree, stmt_vector_for_cost *);
> >>  extern bool vectorizable_reduction (loop_vec_info, stmt_vec_info,
> >>                                     slp_tree, slp_instance,
> >>                                     stmt_vector_for_cost *);
> >> --
> >> 2.17.1

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 5/6] vect: Support multiple lane-reducing operations for loop reduction [PR114440]
  2024-06-04 13:17     ` Richard Biener
@ 2024-06-14  4:00       ` Feng Xue OS
  0 siblings, 0 replies; 5+ messages in thread
From: Feng Xue OS @ 2024-06-14  4:00 UTC (permalink / raw)
  To: Richard Biener; +Cc: Tamar Christina, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 96131 bytes --]

Updated the patch for single-lane SLP node support, and changed it to determine the input vectype for the reduction PHI during traversal of the reduction statements.

Thanks,
Feng
---

gcc/
        PR tree-optimization/114440
        * tree-vectorizer.h (vectorizable_lane_reducing): New function
        declaration.
        * tree-vect-stmts.cc (vect_analyze_stmt): Call new function
        vectorizable_lane_reducing to analyze lane-reducing operation.
        * tree-vect-loop.cc (vect_model_reduction_cost): Remove cost computation
        code related to emulated_mixed_dot_prod.
        (vect_reduction_update_partial_vector_usage): Compute ncopies as the
        original means for single-lane slp node.
        (vectorizable_lane_reducing): New function.
        (vectorizable_reduction): Allow multiple lane-reducing operations in
        loop reduction. Move some original lane-reducing related code to
        vectorizable_lane_reducing.
        (vect_transform_reduction): Extend transformation to support reduction
        statements with mixed input vectypes.

gcc/testsuite/
        PR tree-optimization/114440
        * gcc.dg/vect/vect-reduc-chain-1.c: New test.
        * gcc.dg/vect/vect-reduc-chain-2.c: New test.
        * gcc.dg/vect/vect-reduc-chain-3.c: New test.
        * gcc.dg/vect/vect-reduc-chain-dot-slp-1.c: New test.
        * gcc.dg/vect/vect-reduc-chain-dot-slp-2.c: New test.
        * gcc.dg/vect/vect-reduc-chain-dot-slp-3.c: New test.
        * gcc.dg/vect/vect-reduc-chain-dot-slp-4.c: New test.
        * gcc.dg/vect/vect-reduc-dot-slp-1.c: New test.
---
 .../gcc.dg/vect/vect-reduc-chain-1.c          |  62 +++
 .../gcc.dg/vect/vect-reduc-chain-2.c          |  77 +++
 .../gcc.dg/vect/vect-reduc-chain-3.c          |  66 +++
 .../gcc.dg/vect/vect-reduc-chain-dot-slp-1.c  |  95 ++++
 .../gcc.dg/vect/vect-reduc-chain-dot-slp-2.c  |  67 +++
 .../gcc.dg/vect/vect-reduc-chain-dot-slp-3.c  |  79 +++
 .../gcc.dg/vect/vect-reduc-chain-dot-slp-4.c  |  63 +++
 .../gcc.dg/vect/vect-reduc-dot-slp-1.c        |  35 ++
 gcc/tree-vect-loop.cc                         | 501 ++++++++++++------
 gcc/tree-vect-stmts.cc                        |   2 +
 gcc/tree-vectorizer.h                         |   2 +
 11 files changed, 888 insertions(+), 161 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-3.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-4.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
new file mode 100644
index 00000000000..04bfc419dbd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
@@ -0,0 +1,62 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res,
+   SIGNEDNESS_2 char *restrict a,
+   SIGNEDNESS_2 char *restrict b,
+   SIGNEDNESS_2 char *restrict c,
+   SIGNEDNESS_2 char *restrict d,
+   SIGNEDNESS_1 int *restrict e)
+{
+  for (int i = 0; i < N; ++i)
+    {
+      res += a[i] * b[i];
+      res += c[i] * d[i];
+      res += e[i];
+    }
+  return res;
+}
+
+#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
+#define OFFSET 20
+
+int
+main (void)
+{
+  check_vect ();
+
+  SIGNEDNESS_2 char a[N], b[N];
+  SIGNEDNESS_2 char c[N], d[N];
+  SIGNEDNESS_1 int e[N];
+  int expected = 0x12345;
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 5;
+      b[i] = BASE + OFFSET + i * 4;
+      c[i] = BASE + i * 2;
+      d[i] = BASE + OFFSET + i * 3;
+      e[i] = i;
+      asm volatile ("" ::: "memory");
+      expected += a[i] * b[i];
+      expected += c[i] * d[i];
+      expected += e[i];
+    }
+  if (f (0x12345, a, b, c, d, e) != expected)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 2 "vect" { target vect_sdot_qi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c
new file mode 100644
index 00000000000..6c803b80120
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c
@@ -0,0 +1,77 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 unsigned
+#define SIGNEDNESS_3 signed
+#define SIGNEDNESS_4 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+fn (SIGNEDNESS_1 int res,
+   SIGNEDNESS_2 char *restrict a,
+   SIGNEDNESS_2 char *restrict b,
+   SIGNEDNESS_3 char *restrict c,
+   SIGNEDNESS_3 char *restrict d,
+   SIGNEDNESS_4 short *restrict e,
+   SIGNEDNESS_4 short *restrict f,
+   SIGNEDNESS_1 int *restrict g)
+{
+  for (int i = 0; i < N; ++i)
+    {
+      res += a[i] * b[i];
+      res += i + 1;
+      res += c[i] * d[i];
+      res += e[i] * f[i];
+      res += g[i];
+    }
+  return res;
+}
+
+#define BASE2 ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
+#define BASE3 ((SIGNEDNESS_3 int) -1 < 0 ? -126 : 4)
+#define BASE4 ((SIGNEDNESS_4 int) -1 < 0 ? -1026 : 373)
+#define OFFSET 20
+
+int
+main (void)
+{
+  check_vect ();
+
+  SIGNEDNESS_2 char a[N], b[N];
+  SIGNEDNESS_3 char c[N], d[N];
+  SIGNEDNESS_4 short e[N], f[N];
+  SIGNEDNESS_1 int g[N];
+  int expected = 0x12345;
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE2 + i * 5;
+      b[i] = BASE2 + OFFSET + i * 4;
+      c[i] = BASE3 + i * 2;
+      d[i] = BASE3 + OFFSET + i * 3;
+      e[i] = BASE4 + i * 6;
+      f[i] = BASE4 + OFFSET + i * 5;
+      g[i] = i;
+      asm volatile ("" ::: "memory");
+      expected += a[i] * b[i];
+      expected += i + 1;
+      expected += c[i] * d[i];
+      expected += e[i] * f[i];
+      expected += g[i];
+    }
+  if (fn (0x12345, a, b, c, d, e, f, g) != expected)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_sdot_qi } } } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_udot_qi } } } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_sdot_hi } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c
new file mode 100644
index 00000000000..a41e4b176c4
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c
@@ -0,0 +1,66 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 unsigned
+#define SIGNEDNESS_3 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res,
+   SIGNEDNESS_2 char *restrict a,
+   SIGNEDNESS_2 char *restrict b,
+   SIGNEDNESS_3 short *restrict c,
+   SIGNEDNESS_3 short *restrict d,
+   SIGNEDNESS_1 int *restrict e)
+{
+  for (int i = 0; i < N; ++i)
+    {
+      short diff = a[i] - b[i];
+      SIGNEDNESS_2 short abs = diff < 0 ? -diff : diff;
+      res += abs;
+      res += c[i] * d[i];
+      res += e[i];
+    }
+  return res;
+}
+
+#define BASE2 ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
+#define BASE3 ((SIGNEDNESS_3 int) -1 < 0 ? -1236 : 373)
+#define OFFSET 20
+
+int
+main (void)
+{
+  check_vect ();
+
+  SIGNEDNESS_2 char a[N], b[N];
+  SIGNEDNESS_3 short c[N], d[N];
+  SIGNEDNESS_1 int e[N];
+  int expected = 0x12345;
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE2 + i * 5;
+      b[i] = BASE2 - i * 4;
+      c[i] = BASE3 + i * 2;
+      d[i] = BASE3 + OFFSET + i * 3;
+      e[i] = i;
+      asm volatile ("" ::: "memory");
+      short diff = a[i] - b[i];
+      SIGNEDNESS_2 short abs = diff < 0 ? -diff : diff;
+      expected += abs;
+      expected += c[i] * d[i];
+      expected += e[i];
+    }
+  if (f (0x12345, a, b, c, d, e) != expected)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = SAD_EXPR" "vect" { target vect_udot_qi } } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target vect_sdot_hi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c
new file mode 100644
index 00000000000..c2831fbcc8e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c
@@ -0,0 +1,95 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+
+#include "tree-vect.h"
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res,
+   SIGNEDNESS_2 char *a,
+   SIGNEDNESS_2 char *b,
+   int step, int n)
+{
+  for (int i = 0; i < n; i++)
+    {
+      res += a[0] * b[0];
+      res += a[1] * b[1];
+      res += a[2] * b[2];
+      res += a[3] * b[3];
+      res += a[4] * b[4];
+      res += a[5] * b[5];
+      res += a[6] * b[6];
+      res += a[7] * b[7];
+      res += a[8] * b[8];
+      res += a[9] * b[9];
+      res += a[10] * b[10];
+      res += a[11] * b[11];
+      res += a[12] * b[12];
+      res += a[13] * b[13];
+      res += a[14] * b[14];
+      res += a[15] * b[15];
+
+      a += step;
+      b += step;
+    }
+
+  return res;
+}
+
+#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
+#define OFFSET 20
+
+int
+main (void)
+{
+  check_vect ();
+
+  SIGNEDNESS_2 char a[100], b[100];
+  int expected = 0x12345;
+  int step = 16;
+  int n = 2;
+  int t = 0;
+
+  for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i)
+    {
+      a[i] = BASE + i * 5;
+      b[i] = BASE + OFFSET + i * 4;
+      asm volatile ("" ::: "memory");
+    }
+
+  for (int i = 0; i < n; i++)
+    {
+      asm volatile ("" ::: "memory");
+      expected += a[t + 0] * b[t + 0];
+      expected += a[t + 1] * b[t + 1];
+      expected += a[t + 2] * b[t + 2];
+      expected += a[t + 3] * b[t + 3];
+      expected += a[t + 4] * b[t + 4];
+      expected += a[t + 5] * b[t + 5];
+      expected += a[t + 6] * b[t + 6];
+      expected += a[t + 7] * b[t + 7];
+      expected += a[t + 8] * b[t + 8];
+      expected += a[t + 9] * b[t + 9];
+      expected += a[t + 10] * b[t + 10];
+      expected += a[t + 11] * b[t + 11];
+      expected += a[t + 12] * b[t + 12];
+      expected += a[t + 13] * b[t + 13];
+      expected += a[t + 14] * b[t + 14];
+      expected += a[t + 15] * b[t + 15];
+      t += step;
+    }
+
+  if (f (0x12345, a, b, step, n) != expected)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 16 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c
new file mode 100644
index 00000000000..4114264a364
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c
@@ -0,0 +1,67 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+
+#include "tree-vect.h"
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res,
+   SIGNEDNESS_2 char *a,
+   SIGNEDNESS_2 char *b,
+   int n)
+{
+  for (int i = 0; i < n; i++)
+    {
+      res += a[5 * i + 0] * b[5 * i + 0];
+      res += a[5 * i + 1] * b[5 * i + 1];
+      res += a[5 * i + 2] * b[5 * i + 2];
+      res += a[5 * i + 3] * b[5 * i + 3];
+      res += a[5 * i + 4] * b[5 * i + 4];
+    }
+
+  return res;
+}
+
+#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
+#define OFFSET 20
+
+int
+main (void)
+{
+  check_vect ();
+
+  SIGNEDNESS_2 char a[100], b[100];
+  int expected = 0x12345;
+  int n = 18;
+
+  for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i)
+    {
+      a[i] = BASE + i * 5;
+      b[i] = BASE + OFFSET + i * 4;
+      asm volatile ("" ::: "memory");
+    }
+
+  for (int i = 0; i < n; i++)
+    {
+      asm volatile ("" ::: "memory");
+      expected += a[5 * i + 0] * b[5 * i + 0];
+      expected += a[5 * i + 1] * b[5 * i + 1];
+      expected += a[5 * i + 2] * b[5 * i + 2];
+      expected += a[5 * i + 3] * b[5 * i + 3];
+      expected += a[5 * i + 4] * b[5 * i + 4];
+    }
+
+  if (f (0x12345, a, b, n) != expected)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 5 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-3.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-3.c
new file mode 100644
index 00000000000..2cdecc36d16
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-3.c
@@ -0,0 +1,79 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+
+#include "tree-vect.h"
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res,
+   SIGNEDNESS_2 short *a,
+   SIGNEDNESS_2 short *b,
+   int step, int n)
+{
+  for (int i = 0; i < n; i++)
+    {
+      res += a[0] * b[0];
+      res += a[1] * b[1];
+      res += a[2] * b[2];
+      res += a[3] * b[3];
+      res += a[4] * b[4];
+      res += a[5] * b[5];
+      res += a[6] * b[6];
+      res += a[7] * b[7];
+
+      a += step;
+      b += step;
+    }
+
+  return res;
+}
+
+#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -1026 : 373)
+#define OFFSET 20
+
+int
+main (void)
+{
+  check_vect ();
+
+  SIGNEDNESS_2 short a[100], b[100];
+  int expected = 0x12345;
+  int step = 8;
+  int n = 2;
+  int t = 0;
+
+  for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i)
+    {
+      a[i] = BASE + i * 5;
+      b[i] = BASE + OFFSET + i * 4;
+      asm volatile ("" ::: "memory");
+    }
+
+  for (int i = 0; i < n; i++)
+    {
+      asm volatile ("" ::: "memory");
+      expected += a[t + 0] * b[t + 0];
+      expected += a[t + 1] * b[t + 1];
+      expected += a[t + 2] * b[t + 2];
+      expected += a[t + 3] * b[t + 3];
+      expected += a[t + 4] * b[t + 4];
+      expected += a[t + 5] * b[t + 5];
+      expected += a[t + 6] * b[t + 6];
+      expected += a[t + 7] * b[t + 7];
+      t += step;
+    }
+
+  if (f (0x12345, a, b, step, n) != expected)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 8 "vect"  { target vect_sdot_hi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-4.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-4.c
new file mode 100644
index 00000000000..32c0f30c77b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-4.c
@@ -0,0 +1,63 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+
+#include "tree-vect.h"
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res,
+   SIGNEDNESS_2 short *a,
+   SIGNEDNESS_2 short *b,
+   int n)
+{
+  for (int i = 0; i < n; i++)
+    {
+      res += a[3 * i + 0] * b[3 * i + 0];
+      res += a[3 * i + 1] * b[3 * i + 1];
+      res += a[3 * i + 2] * b[3 * i + 2];
+    }
+
+  return res;
+}
+
+#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -1026 : 373)
+#define OFFSET 20
+
+int
+main (void)
+{
+  check_vect ();
+
+  SIGNEDNESS_2 short a[100], b[100];
+  int expected = 0x12345;
+  int n = 18;
+
+  for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i)
+    {
+      a[i] = BASE + i * 5;
+      b[i] = BASE + OFFSET + i * 4;
+      asm volatile ("" ::: "memory");
+    }
+
+  for (int i = 0; i < n; i++)
+    {
+      asm volatile ("" ::: "memory");
+      expected += a[3 * i + 0] * b[3 * i + 0];
+      expected += a[3 * i + 1] * b[3 * i + 1];
+      expected += a[3 * i + 2] * b[3 * i + 2];
+    }
+
+  if (f (0x12345, a, b, n) != expected)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 3 "vect"  { target vect_sdot_hi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c
new file mode 100644
index 00000000000..e17d6291f75
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c
@@ -0,0 +1,35 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-do compile } */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+
+#include "tree-vect.h"
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res0,
+   SIGNEDNESS_1 int res1,
+   SIGNEDNESS_1 int res2,
+   SIGNEDNESS_1 int res3,
+   SIGNEDNESS_2 short *a,
+   SIGNEDNESS_2 short *b)
+{
+  for (int i = 0; i < 64; i += 4)
+    {
+      res0 += a[i + 0] * b[i + 0];
+      res1 += a[i + 1] * b[i + 1];
+      res2 += a[i + 2] * b[i + 2];
+      res3 += a[i + 3] * b[i + 3];
+    }
+
+  return res0 ^ res1 ^ res2 ^ res3;
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump-not "vectorizing stmts using SLP" "vect" } } */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 35c50eb72cb..fb9259d115c 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -5324,8 +5324,6 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
   if (!gimple_extract_op (orig_stmt_info->stmt, &op))
     gcc_unreachable ();

-  bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
-
   if (reduction_type == EXTRACT_LAST_REDUCTION)
     /* No extra instructions are needed in the prologue.  The loop body
        operations are costed in vectorizable_condition.  */
@@ -5360,12 +5358,8 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
           initial result of the data reduction, initial value of the index
           reduction.  */
        prologue_stmts = 4;
-      else if (emulated_mixed_dot_prod)
-       /* We need the initial reduction value and two invariants:
-          one that contains the minimum signed value and one that
-          contains half of its negative.  */
-       prologue_stmts = 3;
       else
+       /* We need the initial reduction value.  */
        prologue_stmts = 1;
       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
                                         scalar_to_vec, stmt_info, 0,
@@ -7466,7 +7460,7 @@ vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo,
       vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
       unsigned nvectors;

-      if (slp_node)
+      if (slp_node && SLP_TREE_LANES (slp_node) > 1)
        nvectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
       else
        nvectors = vect_get_num_copies (loop_vinfo, vectype_in);
@@ -7478,6 +7472,150 @@ vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo,
     }
 }

+/* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
+   the context of LOOP_VINFO, and vector cost will be recorded in COST_VEC.
+   Now there are three such kinds of operations: dot-prod/widen-sum/sad
+   (sum-of-absolute-differences).
+
+   For a lane-reducing operation, the loop reduction path that it lies in,
+   may contain normal operation, or other lane-reducing operation of different
+   input type size, an example as:
+
+     int sum = 0;
+     for (i)
+       {
+         ...
+         sum += d0[i] * d1[i];       // dot-prod <vector(16) char>
+         sum += w[i];                // widen-sum <vector(16) char>
+         sum += abs(s0[i] - s1[i]);  // sad <vector(8) short>
+         sum += n[i];                // normal <vector(4) int>
+         ...
+       }
+
+   Vectorization factor is essentially determined by operation whose input
+   vectype has the most lanes ("vector(16) char" in the example), while we
+   need to choose input vectype with the least lanes ("vector(4) int" in the
+   example) for the reduction PHI statement.  */
+
+bool
+vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
+                           slp_tree slp_node, stmt_vector_for_cost *cost_vec)
+{
+  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
+  if (!stmt)
+    return false;
+
+  enum tree_code code = gimple_assign_rhs_code (stmt);
+
+  if (!lane_reducing_op_p (code))
+    return false;
+
+  tree type = TREE_TYPE (gimple_assign_lhs (stmt));
+
+  if (!INTEGRAL_TYPE_P (type) && !SCALAR_FLOAT_TYPE_P (type))
+    return false;
+
+  /* Do not try to vectorize bit-precision reductions.  */
+  if (!type_has_mode_precision_p (type))
+    return false;
+
+  for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
+    {
+      stmt_vec_info def_stmt_info;
+      slp_tree slp_op;
+      tree op;
+      tree vectype;
+      enum vect_def_type dt;
+
+      if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_node, i, &op,
+                              &slp_op, &dt, &vectype, &def_stmt_info))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "use not simple.\n");
+         return false;
+       }
+
+      if (!vectype)
+       {
+         vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
+                                                slp_op);
+         if (!vectype)
+           return false;
+       }
+
+      if (slp_node && !vect_maybe_update_slp_op_vectype (slp_op, vectype))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "incompatible vector types for invariants\n");
+         return false;
+       }
+
+      if (i == STMT_VINFO_REDUC_IDX (stmt_info))
+       continue;
+
+      /* There should be at most one cycle def in the stmt.  */
+      if (VECTORIZABLE_CYCLE_DEF (dt))
+       return false;
+    }
+
+  stmt_vec_info reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
+
+  /* TODO: Support lane-reducing operation that does not directly participate
+     in loop reduction.  */
+  if (!reduc_info || STMT_VINFO_REDUC_IDX (stmt_info) < 0)
+    return false;
+
+  /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not
+     recognized.  */
+  gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_reduction_def);
+  gcc_assert (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION);
+
+  tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info);
+  int ncopies_for_cost;
+
+  if (slp_node && SLP_TREE_LANES (slp_node) > 1)
+    {
+      /* Now lane-reducing operations in a non-single-lane slp node should only
+        come from the same loop reduction path.  */
+      gcc_assert (REDUC_GROUP_FIRST_ELEMENT (stmt_info));
+      ncopies_for_cost = 1;
+    }
+  else
+    {
+      ncopies_for_cost = vect_get_num_copies (loop_vinfo, vectype_in);
+      gcc_assert (ncopies_for_cost >= 1);
+    }
+
+  if (vect_is_emulated_mixed_dot_prod (stmt_info))
+    {
+      /* We need extra two invariants: one that contains the minimum signed
+        value and one that contains half of its negative.  */
+      int prologue_stmts = 2;
+      unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
+                                       scalar_to_vec, stmt_info, 0,
+                                       vect_prologue);
+      if (dump_enabled_p ())
+       dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
+                    "extra prologue_cost = %d .\n", cost);
+
+      /* Three dot-products and a subtraction.  */
+      ncopies_for_cost *= 4;
+    }
+
+  record_stmt_cost (cost_vec, ncopies_for_cost, vector_stmt, stmt_info, 0,
+                   vect_body);
+
+  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+    vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
+                                               slp_node, code, type,
+                                               vectype_in);
+
+  STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
+  return true;
+}
+
 /* Function vectorizable_reduction.

    Check if STMT_INFO performs a reduction operation that can be vectorized.
@@ -7643,7 +7781,9 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
     {
       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
-      if (STMT_VINFO_REDUC_IDX (vdef) == -1)
+      int reduc_idx = STMT_VINFO_REDUC_IDX (vdef);
+
+      if (reduc_idx == -1)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -7689,10 +7829,43 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
              return false;
            }
        }
-      else if (!stmt_info)
-       /* First non-conversion stmt.  */
-       stmt_info = vdef;
-      reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
+      else
+       {
+         /* First non-conversion stmt.  */
+         if (!stmt_info)
+           stmt_info = vdef;
+
+         if (lane_reducing_op_p (op.code))
+           {
+             unsigned group_size = slp_node ? SLP_TREE_LANES (slp_node) : 0;
+             tree op_type = TREE_TYPE (op.ops[0]);
+             tree new_vectype_in = get_vectype_for_scalar_type (loop_vinfo,
+                                                                op_type,
+                                                                group_size);
+
+             /* The last operand of lane-reducing operation must be addend
+                for reduction.  */
+             gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);
+
+             if (!new_vectype_in)
+               return false;
+
+             STMT_VINFO_REDUC_VECTYPE_IN (vdef) = new_vectype_in;
+
+             /* To accommodate lane-reducing operations of mixed input
+                vectypes, choose input vectype with the least lanes for the
+                reduction PHI statement, which would result in the most
+                ncopies for vectorized reduction results.  */
+             if (!vectype_in
+                 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
+                      < GET_MODE_SIZE (SCALAR_TYPE_MODE (op_type))))
+               vectype_in = new_vectype_in;
+           }
+         else
+           vectype_in = STMT_VINFO_VECTYPE (phi_info);
+       }
+
+      reduc_def = op.ops[reduc_idx];
       reduc_chain_length++;
       if (!stmt_info && slp_node)
        slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
@@ -7750,6 +7923,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,

   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
   STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
+  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
+
   gimple_match_op op;
   if (!gimple_extract_op (stmt_info->stmt, &op))
     gcc_unreachable ();
@@ -7763,18 +7938,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   if (!type_has_mode_precision_p (op.type))
     return false;

-  /* For lane-reducing ops we're reducing the number of reduction PHIs
-     which means the only use of that may be in the lane-reducing operation.  */
-  if (lane_reducing
-      && reduc_chain_length != 1
-      && !only_slp_reduc_chain)
-    {
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "lane-reducing reduction with extra stmts.\n");
-      return false;
-    }
-
   /* Lane-reducing ops also never can be used in a SLP reduction group
      since we'll mix lanes belonging to different reductions.  But it's
      OK to use them in a reduction chain or when the reduction group
@@ -7818,9 +7981,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
                             "use not simple.\n");
          return false;
        }
-      if (i == STMT_VINFO_REDUC_IDX (stmt_info))
-       continue;
-
       /* For an IFN_COND_OP we might hit the reduction definition operand
         twice (once as definition, once as else).  */
       if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
@@ -7836,16 +7996,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
          = get_vectype_for_scalar_type (loop_vinfo,
                                         TREE_TYPE (op.ops[i]), slp_op[i]);

-      /* To properly compute ncopies we are interested in the widest
-        non-reduction input type in case we're looking at a widening
-        accumulation that we later handle in vect_transform_reduction.  */
-      if (lane_reducing
-         && vectype_op[i]
-         && (!vectype_in
-             || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
-                 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
-       vectype_in = vectype_op[i];
-
       /* Record how the non-reduction-def value of COND_EXPR is defined.
         ???  For a chain of multiple CONDs we'd have to match them up all.  */
       if (op.code == COND_EXPR && reduc_chain_length == 1)
@@ -7864,19 +8014,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
            }
        }
     }
-  if (!vectype_in)
-    vectype_in = STMT_VINFO_VECTYPE (phi_info);
-  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
-
-  /* Each lane-reducing operation has its own input vectype, while reduction
-     PHI records the input vectype with least lanes.  */
-  if (lane_reducing)
-    STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in;

-  enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
-  STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
+  enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info);
+  STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type;
   /* If we have a condition reduction, see if we can simplify it further.  */
-  if (v_reduc_type == COND_REDUCTION)
+  if (reduction_type == COND_REDUCTION)
     {
       if (slp_node && SLP_TREE_LANES (slp_node) != 1)
        return false;
@@ -8042,8 +8184,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
     }

   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
+  reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);

-  vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
   if (reduction_type == TREE_CODE_REDUCTION)
     {
       /* Check whether it's ok to change the order of the computation.
@@ -8329,14 +8471,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
       && loop_vinfo->suggested_unroll_factor == 1)
     single_defuse_cycle = true;

-  if (single_defuse_cycle || lane_reducing)
+  if (single_defuse_cycle && !lane_reducing)
     {
       gcc_assert (op.code != COND_EXPR);

-      /* 4. Supportable by target?  */
-      bool ok = true;
-
-      /* 4.1. check support for the operation in the loop
+      /* 4. check support for the operation in the loop

         This isn't necessary for the lane reduction codes, since they
         can only be produced by pattern matching, and it's up to the
@@ -8345,14 +8484,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
         mixed-sign dot-products can be implemented using signed
         dot-products.  */
       machine_mode vec_mode = TYPE_MODE (vectype_in);
-      if (!lane_reducing
-         && !directly_supported_p (op.code, vectype_in, optab_vector))
+      if (!directly_supported_p (op.code, vectype_in, optab_vector))
         {
           if (dump_enabled_p ())
             dump_printf (MSG_NOTE, "op not supported by target.\n");
          if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
              || !vect_can_vectorize_without_simd_p (op.code))
-           ok = false;
+           single_defuse_cycle = false;
          else
            if (dump_enabled_p ())
              dump_printf (MSG_NOTE, "proceeding using word mode.\n");
@@ -8365,35 +8503,12 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
            dump_printf (MSG_NOTE, "using word mode not possible.\n");
          return false;
        }
-
-      /* lane-reducing operations have to go through vect_transform_reduction.
-         For the other cases try without the single cycle optimization.  */
-      if (!ok)
-       {
-         if (lane_reducing)
-           return false;
-         else
-           single_defuse_cycle = false;
-       }
     }
   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;

-  /* If the reduction stmt is one of the patterns that have lane
-     reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
-  if ((ncopies > 1 && ! single_defuse_cycle)
-      && lane_reducing)
-    {
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "multi def-use cycle not possible for lane-reducing "
-                        "reduction operation\n");
-      return false;
-    }
-
-  if (slp_node
-      && !(!single_defuse_cycle
-          && !lane_reducing
-          && reduction_type != FOLD_LEFT_REDUCTION))
+  /* Reduction type of lane-reducing operation is TREE_CODE_REDUCTION, the
+     below processing will be done in its own vectorizable function.  */
+  if (slp_node && reduction_type == FOLD_LEFT_REDUCTION)
     for (i = 0; i < (int) op.num_ops; i++)
       if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
        {
@@ -8406,28 +8521,21 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
                             reduction_type, ncopies, cost_vec);
   /* Cost the reduction op inside the loop if transformed via
-     vect_transform_reduction.  Otherwise this is costed by the
-     separate vectorizable_* routines.  */
-  if (single_defuse_cycle || lane_reducing)
-    {
-      int factor = 1;
-      if (vect_is_emulated_mixed_dot_prod (stmt_info))
-       /* Three dot-products and a subtraction.  */
-       factor = 4;
-      record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
-                       stmt_info, 0, vect_body);
-    }
+     vect_transform_reduction for non-lane-reducing operation.  Otherwise
+     this is costed by the separate vectorizable_* routines.  */
+  if (single_defuse_cycle && !lane_reducing)
+    record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);

   if (dump_enabled_p ()
       && reduction_type == FOLD_LEFT_REDUCTION)
     dump_printf_loc (MSG_NOTE, vect_location,
                     "using an in-order (fold-left) reduction.\n");
   STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
-  /* All but single defuse-cycle optimized, lane-reducing and fold-left
-     reductions go through their own vectorizable_* routines.  */
-  if (!single_defuse_cycle
-      && !lane_reducing
-      && reduction_type != FOLD_LEFT_REDUCTION)
+
+  /* All but single defuse-cycle optimized and fold-left reductions go
+     through their own vectorizable_* routines.  */
+  if ((!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
+      || lane_reducing)
     {
       stmt_vec_info tem
        = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
@@ -8533,6 +8641,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   int i;
   int ncopies;
+  int stmt_ncopies;
   int vec_num;

   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
@@ -8556,15 +8665,28 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
   int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
+  tree stmt_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info);

-  if (slp_node)
+  /* Get input vectypes from the reduction PHI and the statement to be
+     transformed, these two vectypes may have different lanes when
+     lane-reducing operation is present.  */
+  if (!vectype_in)
+    vectype_in = STMT_VINFO_REDUC_VECTYPE (reduc_info);
+
+  if (!stmt_vectype_in)
+    stmt_vectype_in = STMT_VINFO_VECTYPE (stmt_info);
+
+  if (slp_node && SLP_TREE_LANES (slp_node) > 1)
     {
       ncopies = 1;
+      stmt_ncopies = 1;
       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
     }
   else
     {
       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
+      stmt_ncopies = vect_get_num_copies (loop_vinfo, stmt_vectype_in);
+      gcc_assert (stmt_ncopies >= 1 && stmt_ncopies <= ncopies);
       vec_num = 1;
     }

@@ -8573,14 +8695,10 @@ vect_transform_reduction (loop_vec_info loop_vinfo,

   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
   vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
-  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
-
+  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn,
+                                                   stmt_vectype_in);
   /* Transform.  */
-  tree new_temp = NULL_TREE;
-  auto_vec<tree> vec_oprnds0;
-  auto_vec<tree> vec_oprnds1;
-  auto_vec<tree> vec_oprnds2;
-  tree def0;
+  auto_vec<tree> vec_oprnds[3];

   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
@@ -8604,8 +8722,6 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
                      == op.ops[internal_fn_else_index ((internal_fn) code)]));
     }

-  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
-
   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
   if (reduction_type == FOLD_LEFT_REDUCTION)
     {
@@ -8613,7 +8729,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
       gcc_assert (code.is_tree_code () || cond_fn_p);
       return vectorize_fold_left_reduction
          (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
-          code, reduc_fn, op.ops, op.num_ops, vectype_in,
+          code, reduc_fn, op.ops, op.num_ops, stmt_vectype_in,
           reduc_index, masks, lens);
     }

@@ -8624,55 +8740,124 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
   tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);

-  /* Get NCOPIES vector definitions for all operands except the reduction
-     definition.  */
-  if (!cond_fn_p)
+  gcc_assert (reduc_index < 3);
+
+  if (slp_node && SLP_TREE_LANES (slp_node) > 1)
     {
-      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
-                        single_defuse_cycle && reduc_index == 0
-                        ? NULL_TREE : op.ops[0], &vec_oprnds0,
-                        single_defuse_cycle && reduc_index == 1
-                        ? NULL_TREE : op.ops[1], &vec_oprnds1,
-                        op.num_ops == 3
-                        && !(single_defuse_cycle && reduc_index == 2)
-                        ? op.ops[2] : NULL_TREE, &vec_oprnds2);
+      gcc_assert (!single_defuse_cycle);
+
+      for (i = 0; i < MIN (3, (int) op.num_ops); i++)
+       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds[i]);
     }
   else
     {
-      /* For a conditional operation pass the truth type as mask
-        vectype.  */
-      gcc_assert (single_defuse_cycle
-                 && (reduc_index == 1 || reduc_index == 2));
-      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
-                        op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
-                        reduc_index == 1 ? NULL_TREE : op.ops[1],
-                        NULL_TREE, &vec_oprnds1,
-                        reduc_index == 2 ? NULL_TREE : op.ops[2],
-                        NULL_TREE, &vec_oprnds2);
-    }
+      /* The input vectype of the reduction PHI determines copies of
+        vectorized def-use cycles, which might be more than effective copies
+        of vectorized lane-reducing reduction statements.  This could be
+        complemented by generating extra trivial pass-through copies.  For
+        example:
+
+          int sum = 0;
+          for (i)
+            {
+              sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
+              sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
+              sum += n[i];               // normal <vector(4) int>
+            }
+
+        The vector size is 128-bit, vectorization factor is 16.  Reduction
+        statements would be transformed as:
+
+          vector<4> int sum_v0 = { 0, 0, 0, 0 };
+          vector<4> int sum_v1 = { 0, 0, 0, 0 };
+          vector<4> int sum_v2 = { 0, 0, 0, 0 };
+          vector<4> int sum_v3 = { 0, 0, 0, 0 };
+
+          for (i / 16)
+            {
+              sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
+              sum_v1 = sum_v1;  // copy
+              sum_v2 = sum_v2;  // copy
+              sum_v3 = sum_v3;  // copy
+
+              sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
+              sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
+              sum_v2 = sum_v2;  // copy
+              sum_v3 = sum_v3;  // copy
+
+              sum_v0 += n_v0[i: 0  ~ 3 ];
+              sum_v1 += n_v1[i: 4  ~ 7 ];
+              sum_v2 += n_v2[i: 8  ~ 11];
+              sum_v3 += n_v3[i: 12 ~ 15];
+            }
+       */
+
+      for (i = 0; i < MIN (3, (int) op.num_ops); i++)
+       {
+         tree vectype = NULL_TREE;
+         int used_ncopies = ncopies;
+
+         if (cond_fn_p && i == 0)
+           {
+             /* For a conditional operation pass the truth type as mask
+                vectype.  */
+             gcc_assert (single_defuse_cycle && reduc_index > 0);
+             vectype = truth_type_for (vectype_in);
+           }

-  /* For single def-use cycles get one copy of the vectorized reduction
-     definition.  */
-  if (single_defuse_cycle)
-    {
-      gcc_assert (!slp_node);
-      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
-                                    op.ops[reduc_index],
-                                    reduc_index == 0 ? &vec_oprnds0
-                                    : (reduc_index == 1 ? &vec_oprnds1
-                                       : &vec_oprnds2));
+         if (i != reduc_index)
+           {
+             /* For non-reduction operand, deduce effective copies that are
+                involved in vectorized def-use cycles based on the input
+                vectype of the reduction statement.  */
+             used_ncopies = stmt_ncopies;
+           }
+         else if (single_defuse_cycle)
+           {
+             /* For single def-use cycles get one copy of the vectorized
+                reduction definition.  */
+             used_ncopies = 1;
+           }
+
+         if (slp_node)
+           vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds[i]);
+         else
+           vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, used_ncopies,
+                                          op.ops[i], &vec_oprnds[i], vectype);
+
+         if (used_ncopies < ncopies)
+           vec_oprnds[i].safe_grow_cleared (ncopies);
+       }
     }

+  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
   bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
+  tree def0;

-  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
+  FOR_EACH_VEC_ELT (vec_oprnds[0], i, def0)
     {
       gimple *new_stmt;
-      tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
-      if (masked_loop_p && !mask_by_cond_expr)
+      tree new_temp = NULL_TREE;
+      tree vop[3] = { def0, vec_oprnds[1][i], NULL_TREE };
+
+      if (!vop[0] || !vop[1])
+       {
+         tree reduc_vop = vec_oprnds[reduc_index][i];
+
+         /* Insert trivial copy if no need to generate vectorized
+            statement.  */
+         gcc_assert (reduc_vop && stmt_ncopies < ncopies);
+
+         new_stmt = gimple_build_assign (vec_dest, reduc_vop);
+         new_temp = make_ssa_name (vec_dest, new_stmt);
+         gimple_set_lhs (new_stmt, new_temp);
+         vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
+       }
+      else if (masked_loop_p && !mask_by_cond_expr)
        {
-         /* No conditional ifns have been defined for dot-product yet.  */
-         gcc_assert (code != DOT_PROD_EXPR);
+         /* No conditional ifns have been defined for dot-product and sad
+            yet.  */
+         gcc_assert (code != DOT_PROD_EXPR && code != SAD_EXPR);

          /* Make sure that the reduction accumulator is vop[0].  */
          if (reduc_index == 1)
@@ -8681,7 +8866,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
              std::swap (vop[0], vop[1]);
            }
          tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
-                                         vec_num * ncopies, vectype_in, i);
+                                         vec_num * stmt_ncopies,
+                                         stmt_vectype_in, i);
          gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
                                                    vop[0], vop[1], vop[0]);
          new_temp = make_ssa_name (vec_dest, call);
@@ -8693,12 +8879,13 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
       else
        {
          if (op.num_ops >= 3)
-           vop[2] = vec_oprnds2[i];
+           vop[2] = vec_oprnds[2][i];

          if (masked_loop_p && mask_by_cond_expr)
            {
              tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
-                                             vec_num * ncopies, vectype_in, i);
+                                             vec_num * stmt_ncopies,
+                                             stmt_vectype_in, i);
              build_vect_cond_expr (code, vop, mask, gsi);
            }

@@ -8725,16 +8912,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,

       if (slp_node)
        slp_node->push_vec_def (new_stmt);
-      else if (single_defuse_cycle
-              && i < ncopies - 1)
-       {
-         if (reduc_index == 0)
-           vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
-         else if (reduc_index == 1)
-           vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
-         else if (reduc_index == 2)
-           vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
-       }
+      else if (single_defuse_cycle && i < ncopies - 1)
+       vec_oprnds[reduc_index][i + 1] = gimple_get_lhs (new_stmt);
       else
        STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
     }
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index dbdb59054e0..81036235a27 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -13357,6 +13357,8 @@ vect_analyze_stmt (vec_info *vinfo,
                                      NULL, NULL, node, cost_vec)
          || vectorizable_load (vinfo, stmt_info, NULL, NULL, node, cost_vec)
          || vectorizable_store (vinfo, stmt_info, NULL, NULL, node, cost_vec)
+         || vectorizable_lane_reducing (as_a <loop_vec_info> (vinfo),
+                                        stmt_info, node, cost_vec)
          || vectorizable_reduction (as_a <loop_vec_info> (vinfo), stmt_info,
                                     node, node_instance, cost_vec)
          || vectorizable_induction (as_a <loop_vec_info> (vinfo), stmt_info,
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 6bb0f5c3a56..3f7db707d97 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2443,6 +2443,8 @@ extern loop_vec_info vect_create_loop_vinfo (class loop *, vec_info_shared *,
 extern bool vectorizable_live_operation (vec_info *, stmt_vec_info,
                                         slp_tree, slp_instance, int,
                                         bool, stmt_vector_for_cost *);
+extern bool vectorizable_lane_reducing (loop_vec_info, stmt_vec_info,
+                                       slp_tree, stmt_vector_for_cost *);
 extern bool vectorizable_reduction (loop_vec_info, stmt_vec_info,
                                    slp_tree, slp_instance,
                                    stmt_vector_for_cost *);
--
2.17.1

________________________________________
From: Richard Biener <richard.guenther@gmail.com>
Sent: Tuesday, June 4, 2024 9:17 PM
To: Feng Xue OS
Cc: Tamar Christina; gcc-patches@gcc.gnu.org
Subject: Re: [PATCH 5/6] vect: Support multiple lane-reducing operations for loop reduction [PR114440]

On Sun, Jun 2, 2024 at 4:13 PM Feng Xue OS <fxue@os.amperecomputing.com> wrote:
>
> Please see my comments below.
>
> Thanks,
> Feng
>
> > On Thu, May 30, 2024 at 4:55 PM Feng Xue OS <fxue@os.amperecomputing.com> wrote:
> >>
> >> For lane-reducing operation(dot-prod/widen-sum/sad) in loop reduction, current
> >> vectorizer could only handle the pattern if the reduction chain does not
> >> contain other operation, no matter the other is normal or lane-reducing.
> >>
> >> Actually, to allow multiple arbitrary lane-reducing operations, we need to
> >> support vectorization of loop reduction chain with mixed input vectypes. Since
> >> lanes of vectype may vary with operation, the effective ncopies of vectorized
> >> statements for operation also may not be same to each other, this causes
> >> mismatch on vectorized def-use cycles. A simple way is to align all operations
> >> with the one that has the most ncopies, the gap could be complemented by
> >> generating extra trivial pass-through copies. For example:
> >>
> >>    int sum = 0;
> >>    for (i)
> >>      {
> >>        sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
> >>        sum += w[i];               // widen-sum <vector(16) char>
> >>        sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
> >>        sum += n[i];               // normal <vector(4) int>
> >>      }
> >>
> >> The vector size is 128-bit, vectorization factor is 16. Reduction statements
> >> would be transformed as:
> >>
> >>    vector<4> int sum_v0 = { 0, 0, 0, 0 };
> >>    vector<4> int sum_v1 = { 0, 0, 0, 0 };
> >>    vector<4> int sum_v2 = { 0, 0, 0, 0 };
> >>    vector<4> int sum_v3 = { 0, 0, 0, 0 };
> >>
> >>    for (i / 16)
> >>      {
> >>        sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
> >>        sum_v1 = sum_v1;  // copy
> >>        sum_v2 = sum_v2;  // copy
> >>        sum_v3 = sum_v3;  // copy
> >>
> >>        sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0);
> >>        sum_v1 = sum_v1;  // copy
> >>        sum_v2 = sum_v2;  // copy
> >>        sum_v3 = sum_v3;  // copy
> >>
> >>        sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
> >>        sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
> >>        sum_v2 = sum_v2;  // copy
> >>        sum_v3 = sum_v3;  // copy
> >>
> >>        sum_v0 += n_v0[i: 0  ~ 3 ];
> >>        sum_v1 += n_v1[i: 4  ~ 7 ];
> >>        sum_v2 += n_v2[i: 8  ~ 11];
> >>        sum_v3 += n_v3[i: 12 ~ 15];
> >>      }
> >>
> >> Thanks,
> >> Feng
> >>
> >> ...
> >>
> >> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> >> index 20c99f11e9a..b5849dbb08a 100644
> >> --- a/gcc/tree-vect-loop.cc
> >> +++ b/gcc/tree-vect-loop.cc
> >> @@ -5322,8 +5322,6 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
> >>    if (!gimple_extract_op (orig_stmt_info->stmt, &op))
> >>      gcc_unreachable ();
> >>
> >> -  bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
> >> -
> >>    if (reduction_type == EXTRACT_LAST_REDUCTION)
> >>      /* No extra instructions are needed in the prologue.  The loop body
> >>         operations are costed in vectorizable_condition.  */
> >> @@ -5358,12 +5356,8 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
> >>            initial result of the data reduction, initial value of the index
> >>            reduction.  */
> >>         prologue_stmts = 4;
> >> -      else if (emulated_mixed_dot_prod)
> >> -       /* We need the initial reduction value and two invariants:
> >> -          one that contains the minimum signed value and one that
> >> -          contains half of its negative.  */
> >> -       prologue_stmts = 3;
> >>        else
> >> +       /* We need the initial reduction value.  */
> >>         prologue_stmts = 1;
> >>        prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
> >>                                          scalar_to_vec, stmt_info, 0,
> >> @@ -7464,6 +7458,169 @@ vect_reduction_use_partial_vector (loop_vec_info loop_vinfo,
> >>      }
> >>  }
> >>
> >> +/* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
> >> +   the context of LOOP_VINFO, and vector cost will be recorded in COST_VEC.
> >> +   Now there are three such kinds of operations: dot-prod/widen-sum/sad
> >> +   (sum-of-absolute-differences).
> >> +
> >> +   For a lane-reducing operation, the loop reduction path that it lies in,
> >> +   may contain normal operation, or other lane-reducing operation of different
> >> +   input type size, an example as:
> >> +
> >> +     int sum = 0;
> >> +     for (i)
> >> +       {
> >> +         ...
> >> +         sum += d0[i] * d1[i];       // dot-prod <vector(16) char>
> >> +         sum += w[i];                // widen-sum <vector(16) char>
> >> +         sum += abs(s0[i] - s1[i]);  // sad <vector(8) short>
> >> +         sum += n[i];                // normal <vector(4) int>
> >> +         ...
> >> +       }
> >> +
> >> +   Vectorization factor is essentially determined by operation whose input
> >> +   vectype has the most lanes ("vector(16) char" in the example), while we
> >> +   need to choose input vectype with the least lanes ("vector(4) int" in the
> >> +   example) for the reduction PHI statement.  */
> >> +
> >> +bool
> >> +vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
> >> +                           slp_tree slp_node, stmt_vector_for_cost *cost_vec)
> >> +{
> >> +  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
> >> +  if (!stmt)
> >> +    return false;
> >> +
> >> +  enum tree_code code = gimple_assign_rhs_code (stmt);
> >> +
> >> +  if (!lane_reducing_op_p (code))
> >> +    return false;
> >
> > Can you make sure to return false if STMT_VINFO_REDUC_IDX == -1
> > thus the op is not part of a reduction chain/path?
> >
>
> As I planned, in the 2nd stage patches WIP, this function will also handle
> lane-reducing operation that does not directly participate reduction, like:
>
>  temp = dot_prod1 + dot_prod2;
>  sum += temp;
>
> In this case, STMT_VINFO_REDUC_IDX of dot_prod1/2 == -1
>
> For current work, the check is needed to filter out non-reduction statement,
> but since it is expected to be removed later, so the check is placed at a late
> point.
>
> >> +  tree type = TREE_TYPE (gimple_assign_lhs (stmt));
> >> +
> >> +  if (!INTEGRAL_TYPE_P (type) && !SCALAR_FLOAT_TYPE_P (type))
> >> +    return false;
> >> +
> >> +  /* Do not try to vectorize bit-precision reductions.  */
> >> +  if (!type_has_mode_precision_p (type))
> >> +    return false;
> >> +
> >> +  tree vectype_in = NULL_TREE;
> >> +
> >> +  for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
> >> +    {
> >> +      stmt_vec_info def_stmt_info;
> >> +      slp_tree slp_op;
> >> +      tree op;
> >> +      tree vectype;
> >> +      enum vect_def_type dt;
> >> +
> >> +      if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_node, i, &op,
> >> +                              &slp_op, &dt, &vectype, &def_stmt_info))
> >> +       {
> >> +         if (dump_enabled_p ())
> >> +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> >> +                            "use not simple.\n");
> >> +         return false;
> >> +       }
> >> +
> >> +      if (!vectype)
> >> +       {
> >> +         vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
> >> +                                                slp_op);
> >> +         if (!vectype)
> >> +           return false;
> >> +       }
> >> +
> >> +      if (slp_node && !vect_maybe_update_slp_op_vectype (slp_op, vectype))
> >
> > Please avoid this during transform.
>
> This function is only for analysis not transform.
>
> >> +       {
> >> +         if (dump_enabled_p ())
> >> +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> >> +                            "incompatible vector types for invariants\n");
> >> +         return false;
> >> +       }
> >> +
> >> +      if (i == STMT_VINFO_REDUC_IDX (stmt_info))
> >> +       continue;
> >> +
> >> +      /* There should be at most one cycle def in the stmt.  */
> >> +      if (VECTORIZABLE_CYCLE_DEF (dt))
> >> +       return false;
> >> +
> >> +      /* To properly compute ncopies we are interested in the widest
> >> +        non-reduction input type in case we're looking at a widening
> >> +        accumulation that we later handle in vect transformation.  */
> >> +      if (!vectype_in
> >> +         || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
> >> +             < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype)))))
> >> +       vectype_in = vectype;
> >> +    }
> >> +
> >> +  STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in;
> >
> > As said below I wonder where we would need STMT_VINFO_REDUC_VECTYPE_IN.
> > At least you should avoid re-setting this when !cost_vec aka during transform,
> > possibly instead asserting you re-compute the same type (or simply
> > skip the above
> > loop and set vectype_in from STMT_VINFO_REDUC_VECTYPE_IN which then
> > gets a good use).
>
> Likewise.
>
> >
> >> +  stmt_vec_info reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
> >> +
> >> +  /* TODO: Support lane-reducing operation that does not directly participate
> >> +     in loop reduction. */
> >> +  if (!reduc_info || STMT_VINFO_REDUC_IDX (stmt_info) < 0)
> >> +    return false;
> >> +
> >> +  /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not
> >> +     recognized.  */
> >> +  gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_reduction_def);
> >> +  gcc_assert (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION);
> >> +
> >> +  tree vphi_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
> >> +
> >> +  /* To accommodate lane-reducing operations of mixed input vectypes, choose
> >> +     input vectype with the least lanes for the reduction PHI statement, which
> >> +     would result in the most ncopies for vectorized reduction results.  */
> >> +  if (!vphi_vectype_in
> >> +      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
> >> +         > GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vphi_vectype_in)))))
> >> +    STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
> >
> > Likewise.
> >
> >> +  int ncopies_for_cost;
> >> +
> >> +  if (slp_node)
> >> +    {
> >> +      /* Now lane-reducing operations in a slp node should only come from
> >> +        the same loop reduction path.  */
> >> +      gcc_assert (REDUC_GROUP_FIRST_ELEMENT (stmt_info));
> >> +      ncopies_for_cost = 1;
> >> +    }
> >> +  else
> >> +    {
> >> +      ncopies_for_cost = vect_get_num_copies (loop_vinfo, vectype_in);
> >> +      gcc_assert (ncopies_for_cost >= 1);
> >> +    }
> >> +
> >> +  if (vect_is_emulated_mixed_dot_prod (stmt_info))
> >> +    {
> >> +      /* We need extra two invariants: one that contains the minimum signed
> >> +        value and one that contains half of its negative.  */
> >> +      int prologue_stmts = 2;
> >> +      unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
> >> +                                       scalar_to_vec, stmt_info, 0,
> >> +                                       vect_prologue);
> >> +      if (dump_enabled_p ())
> >> +       dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
> >> +                    "extra prologue_cost = %d .\n", cost);
> >> +
> >> +      /* Three dot-products and a subtraction.  */
> >> +      ncopies_for_cost *= 4;
> >> +    }
> >> +
> >> +  record_stmt_cost (cost_vec, ncopies_for_cost, vector_stmt, stmt_info, 0,
> >> +                   vect_body);
> >> +
> >> +  vect_reduction_use_partial_vector (loop_vinfo, reduc_info, slp_node, code,
> >> +                                    type, vectype_in);
> >> +
> >> +  STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
> >
> > Uh, so those all go through vect_transform_reduction.  I see.
> >
> > I fail to see a check for whether the target supports the lane-reducing op.
> > vectorizable_reduction only checks the last one.  Currently the check
> > might be redundant with what pattern recognition checks but it's still
> > incomplete compared to the check in vectorizable_reduction.
>
> In the original vectorizable_reduction, the target support check is deliberately
> skipped for lane-reducing operations. The reason is part as you said, moreover,
> other check would always not be executed.
>
>   if (single_defuse_cycle || lane_reduc_code_p)
>     {
>       gcc_assert (op.code != COND_EXPR);
>
>       /* 4. Supportable by target?  */
>       bool ok = true;
>
>       /* 4.1. check support for the operation in the loop
>
>          This isn't necessary for the lane reduction codes, since they
>          can only be produced by pattern matching, and it's up to the
>          pattern matcher to test for support.  The main reason for
>          specifically skipping this step is to avoid rechecking whether
>          mixed-sign dot-products can be implemented using signed
>          dot-products.  */
>       machine_mode vec_mode = TYPE_MODE (vectype_in);
>       if (!lane_reduc_code_p                              //<----------- skip
>           && !directly_supported_p (op.code, vectype_in, optab_vector))
>         {
>           if (dump_enabled_p ())
>             dump_printf (MSG_NOTE, "op not supported by target.\n");
>           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
>               || !vect_can_vectorize_without_simd_p (op.code))
>             ok = false;
>           else
>             if (dump_enabled_p ())
>               dump_printf (MSG_NOTE, "proceeding using word mode.\n");
>         }
>
>       // <----- always false for lane-reducing op
>
>       if (vect_emulated_vector_p (vectype_in)
>           && !vect_can_vectorize_without_simd_p (op.code))
>         {
>           if (dump_enabled_p ())
>             dump_printf (MSG_NOTE, "using word mode not possible.\n");
>           return false;
>         }
>
> >
> >> +  return true;
> >> +}
> >> +
> >>  /* Function vectorizable_reduction.
> >>
> >>     Check if STMT_INFO performs a reduction operation that can be vectorized.
> >> @@ -7609,6 +7766,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>                                (gimple_bb (reduc_def_phi)->loop_father));
> >>    unsigned reduc_chain_length = 0;
> >>    bool only_slp_reduc_chain = true;
> >> +  bool only_lane_reducing = true;
> >>    stmt_info = NULL;
> >>    slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
> >>    while (reduc_def != PHI_RESULT (reduc_def_phi))
> >> @@ -7659,9 +7817,16 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>               return false;
> >>             }
> >>         }
> >> -      else if (!stmt_info)
> >> -       /* First non-conversion stmt.  */
> >> -       stmt_info = vdef;
> >> +      else
> >> +       {
> >> +         /* First non-conversion stmt.  */
> >> +         if (!stmt_info)
> >> +           stmt_info = vdef;
> >> +
> >> +         if (!lane_reducing_op_p (op.code))
> >> +           only_lane_reducing = false;
> >> +       }
> >> +
> >>        reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
> >>        reduc_chain_length++;
> >>        if (!stmt_info && slp_node)
> >> @@ -7733,18 +7898,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>    if (!type_has_mode_precision_p (op.type))
> >>      return false;
> >>
> >> -  /* For lane-reducing ops we're reducing the number of reduction PHIs
> >> -     which means the only use of that may be in the lane-reducing operation.  */
> >> -  if (lane_reducing
> >> -      && reduc_chain_length != 1
> >> -      && !only_slp_reduc_chain)
> >> -    {
> >> -      if (dump_enabled_p ())
> >> -       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> >> -                        "lane-reducing reduction with extra stmts.\n");
> >> -      return false;
> >> -    }
> >> -
> >>    /* Lane-reducing ops also never can be used in a SLP reduction group
> >>       since we'll mix lanes belonging to different reductions.  But it's
> >>       OK to use them in a reduction chain or when the reduction group
> >> @@ -7788,9 +7941,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>                              "use not simple.\n");
> >>           return false;
> >>         }
> >> -      if (i == STMT_VINFO_REDUC_IDX (stmt_info))
> >> -       continue;
> >> -
> >
> > So within this loop we analyze the "main" operation, while I do not exactly
> > remember why we skip the op leading to the PHI I don't understand why you
> > want to look at it for the multi lane-reducing case (the accumulator
> > always has the same type, no?).
> >
> > In any case this just looks at a single (the last) lane-reducing or even
> > not lane-reducing op.
> >
>
> This comparison is redundant, since it could be covered by the following
> comparison statement. The change should have been placed to a separate
> patch, but for convenience I made it here.
>
>       /* For an IFN_COND_OP we might hit the reduction definition operand
>          twice (once as definition, once as else).  */
>       if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
>         continue;
>
>       /* There should be only one cycle def in the stmt, the one
>          leading to reduc_def.  */
>       if (VECTORIZABLE_CYCLE_DEF (dt))
>         return false;
>
> >>        /* For an IFN_COND_OP we might hit the reduction definition operand
> >>          twice (once as definition, once as else).  */
> >>        if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
> >> @@ -7836,17 +7986,21 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>      }
> >>    if (!vectype_in)
> >>      vectype_in = STMT_VINFO_VECTYPE (phi_info);
> >> -  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
> >>
> >> -  /* Each lane-reducing operation has its own input vectype, while reduction
> >> -     PHI records the input vectype with least lanes.  */
> >> -  if (lane_reducing)
> >> -    STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in;
> >> -
> >> -  enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
> >> -  STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
> >> +  /* If there is a normal (non-lane-reducing) operation in the loop reduction
> >> +     path, to ensure there will be enough copies to hold vectorized results of
> >> +     the operation, we need set the input vectype of the reduction PHI to be
> >> +     same as the reduction output vectype somewhere, here is a suitable place.
> >> +     Otherwise the input vectype is set to the one with the least lanes, which
> >> +     can only be determined in vectorizable analysis routine of lane-reducing
> >> +     operation.  */
> >
> > But we are using vectype_in to compute ncopies which is used in cost analysis.
>
> The vectype_in only impacts the cost analysis for lane-reducing op, since the
> function vect_is_emulated_mixed_dot_prod need it, and this function is referred
> by cost analysis. In the previous patch, we bind the vectype_in to each
> lane-reducing op and also adjust code of the function accordingly, then this
> would not be a problem.
>
> > You say this might not be the final ncopies?  Note the vectorization factor is
> > already fixed as well as (output) vector types of the lane-reducing ops.  So
>
> The vectype_in is incrementally updated during analyzing vectorizablility of
> lane-reducing ops. So before transform, the type should be determined.
>
> > shouldn't we simply pick that up in the loop walking the use-def chain via
> > REDUC_IDX at the start of this function?
>
> I thought about doing it in that way. Ok. will consider it again.
>
> > I'm unsure as to why we need
> > STMT_VINFO_REDUC_VECTYPE_IN at all (I don't remember adding that),
> > it should be readily available from operand analysis.  The docs for that
> > isn't very enlightening either (there's also REDUC_VECTYPE, in addition
> > to VECTYPE - huh).
>
> For old code, in which only one lane-reducing op is allowed in loop
> reduction, this type might be computed on-demand.
>
> But for multiple lane-reducing ops, we need to know the vectype_in types
> of all ops in order to determine a proper vectype_in for PHI statement, if
> traversing those ops and computing types on-demand would not be a good
> way.  Additionally, during transform, the original cfg flow is broken and could
> not be used.
>
> >> +  if (!only_lane_reducing)
> >> +    STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = STMT_VINFO_VECTYPE (phi_info);
> >> +
> >> +  enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info);
> >> +  STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type;
> >>    /* If we have a condition reduction, see if we can simplify it further.  */
> >> -  if (v_reduc_type == COND_REDUCTION)
> >> +  if (reduction_type == COND_REDUCTION)
> >>      {
> >>        if (slp_node)
> >>         return false;
> >> @@ -8012,8 +8166,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>      }
> >>
> >>    STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
> >> +  reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
> >>
> >> -  vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
> >>    if (reduction_type == TREE_CODE_REDUCTION)
> >>      {
> >>        /* Check whether it's ok to change the order of the computation.
> >> @@ -8287,14 +8441,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>        && loop_vinfo->suggested_unroll_factor == 1)
> >>      single_defuse_cycle = true;
> >>
> >> -  if (single_defuse_cycle || lane_reducing)
> >> +  if (single_defuse_cycle && !lane_reducing)
> >>      {
> >>        gcc_assert (op.code != COND_EXPR);
> >>
> >> -      /* 4. Supportable by target?  */
> >> -      bool ok = true;
> >> -
> >> -      /* 4.1. check support for the operation in the loop
> >> +      /* 4. check support for the operation in the loop
> >>
> >>          This isn't necessary for the lane reduction codes, since they
> >>          can only be produced by pattern matching, and it's up to the
> >> @@ -8303,14 +8454,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>          mixed-sign dot-products can be implemented using signed
> >>          dot-products.  */
> >>        machine_mode vec_mode = TYPE_MODE (vectype_in);
> >> -      if (!lane_reducing
> >> -         && !directly_supported_p (op.code, vectype_in, optab_vector))
> >> +      if (!directly_supported_p (op.code, vectype_in, optab_vector))
> >>          {
> >>            if (dump_enabled_p ())
> >>              dump_printf (MSG_NOTE, "op not supported by target.\n");
> >>           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
> >>               || !vect_can_vectorize_without_simd_p (op.code))
> >> -           ok = false;
> >> +           single_defuse_cycle = false;
> >>           else
> >>             if (dump_enabled_p ())
> >>               dump_printf (MSG_NOTE, "proceeding using word mode.\n");
> >> @@ -8323,35 +8473,12 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>             dump_printf (MSG_NOTE, "using word mode not possible.\n");
> >>           return false;
> >>         }
> >> -
> >> -      /* lane-reducing operations have to go through vect_transform_reduction.
> >> -         For the other cases try without the single cycle optimization.  */
> >> -      if (!ok)
> >> -       {
> >> -         if (lane_reducing)
> >> -           return false;
> >> -         else
> >> -           single_defuse_cycle = false;
> >> -       }
> >>      }
> >>    STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
> >>
> >> -  /* If the reduction stmt is one of the patterns that have lane
> >> -     reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
> >> -  if ((ncopies > 1 && ! single_defuse_cycle)
> >> -      && lane_reducing)
> >> -    {
> >> -      if (dump_enabled_p ())
> >> -       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> >> -                        "multi def-use cycle not possible for lane-reducing "
> >> -                        "reduction operation\n");
> >> -      return false;
> >> -    }
> >> -
> >> -  if (slp_node
> >> -      && !(!single_defuse_cycle
> >> -          && !lane_reducing
> >> -          && reduction_type != FOLD_LEFT_REDUCTION))
> >> +  /* Reduction type of lane-reducing operation is TREE_CODE_REDUCTION, the
> >> +     below processing will be done in its own vectorizable function.  */
> >> +  if (slp_node && reduction_type == FOLD_LEFT_REDUCTION)
> >>      for (i = 0; i < (int) op.num_ops; i++)
> >>        if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
> >>         {
> >> @@ -8364,28 +8491,21 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>    vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
> >>                              reduction_type, ncopies, cost_vec);
> >>    /* Cost the reduction op inside the loop if transformed via
> >> -     vect_transform_reduction.  Otherwise this is costed by the
> >> -     separate vectorizable_* routines.  */
> >> -  if (single_defuse_cycle || lane_reducing)
> >> -    {
> >> -      int factor = 1;
> >> -      if (vect_is_emulated_mixed_dot_prod (stmt_info))
> >> -       /* Three dot-products and a subtraction.  */
> >> -       factor = 4;
> >> -      record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
> >> -                       stmt_info, 0, vect_body);
> >> -    }
> >> +     vect_transform_reduction for non-lane-reducing operation.  Otherwise
> >> +     this is costed by the separate vectorizable_* routines.  */
> >> +  if (single_defuse_cycle && !lane_reducing)
> >> +    record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
> >>
> >>    if (dump_enabled_p ()
> >>        && reduction_type == FOLD_LEFT_REDUCTION)
> >>      dump_printf_loc (MSG_NOTE, vect_location,
> >>                      "using an in-order (fold-left) reduction.\n");
> >>    STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
> >> -  /* All but single defuse-cycle optimized, lane-reducing and fold-left
> >> -     reductions go through their own vectorizable_* routines.  */
> >> -  if (!single_defuse_cycle
> >> -      && !lane_reducing
> >> -      && reduction_type != FOLD_LEFT_REDUCTION)
> >> +
> >> +  /* All but single defuse-cycle optimized and fold-left reductions go
> >> +     through their own vectorizable_* routines.  */
> >> +  if ((!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
> >> +      || lane_reducing)
>
> >
> > So single-def-use-cycle but lane-reducing ops no longer need
> > to go through vect_transform_reduction?  How do you handle those
> > but fail to handle non-lane-reducing ops this way?
>
> Emm, all kinds of lane-reducing ops will go into vectorizable_lane_reducing(),
> no matter it is single-def-use or not, at that function, the STMT_VINFO_TYPE
> is set to reduc_vec_info_type, so transform will be done inside
> vect_transform_reduction.
>
> >
> >>      {
> >>        stmt_vec_info tem
> >>         = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
> >> @@ -8490,6 +8610,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> >>    int i;
> >>    int ncopies;
> >> +  int stmt_ncopies;
> >>    int vec_num;
> >>
> >>    stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
> >> @@ -8513,15 +8634,28 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>    gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
> >>    int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
> >>    tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
> >> +  tree stmt_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info);
> >> +
> >> +  /* Get input vectypes from the reduction PHI and the statement to be
> >> +     transformed, these two vectypes may have different lanes when
> >> +     lane-reducing operation is present.  */
> >> +  if (!vectype_in)
> >> +    vectype_in = STMT_VINFO_REDUC_VECTYPE (reduc_info);
> >> +
> >> +  if (!stmt_vectype_in)
> >> +    stmt_vectype_in = STMT_VINFO_VECTYPE (stmt_info);
> >>
> >>    if (slp_node)
> >>      {
> >>        ncopies = 1;
> >> +      stmt_ncopies = 1;
> >>        vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
> >>      }
> >>    else
> >>      {
> >>        ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
> >> +      stmt_ncopies = vect_get_num_copies (loop_vinfo, stmt_vectype_in);
> >> +      gcc_assert (stmt_ncopies >= 1 && stmt_ncopies <= ncopies);
> >>        vec_num = 1;
> >>      }
> >>
> >> @@ -8530,14 +8664,10 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>
> >>    vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
> >>    vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
> >> -  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
> >> -
> >> +  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn,
> >> +                                                   stmt_vectype_in);
> >>    /* Transform.  */
> >> -  tree new_temp = NULL_TREE;
> >> -  auto_vec<tree> vec_oprnds0;
> >> -  auto_vec<tree> vec_oprnds1;
> >> -  auto_vec<tree> vec_oprnds2;
> >> -  tree def0;
> >> +  auto_vec<tree> vec_oprnds[3];
> >>
> >>    if (dump_enabled_p ())
> >>      dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
> >> @@ -8561,8 +8691,6 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>                       == op.ops[internal_fn_else_index ((internal_fn) code)]));
> >>      }
> >>
> >> -  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> >> -
> >>    vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
> >>    if (reduction_type == FOLD_LEFT_REDUCTION)
> >>      {
> >> @@ -8570,7 +8698,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>        gcc_assert (code.is_tree_code () || cond_fn_p);
> >>        return vectorize_fold_left_reduction
> >>           (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
> >> -          code, reduc_fn, op.ops, op.num_ops, vectype_in,
> >> +          code, reduc_fn, op.ops, op.num_ops, stmt_vectype_in,
> >>            reduc_index, masks, lens);
> >>      }
> >>
> >> @@ -8581,55 +8709,121 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>    tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
> >>    tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
> >>
> >> -  /* Get NCOPIES vector definitions for all operands except the reduction
> >> -     definition.  */
> >> -  if (!cond_fn_p)
> >> +  gcc_assert (reduc_index < 3);
> >> +
> >> +  if (slp_node)
> >>      {
> >> -      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
> >> -                        single_defuse_cycle && reduc_index == 0
> >> -                        ? NULL_TREE : op.ops[0], &vec_oprnds0,
> >> -                        single_defuse_cycle && reduc_index == 1
> >> -                        ? NULL_TREE : op.ops[1], &vec_oprnds1,
> >> -                        op.num_ops == 3
> >> -                        && !(single_defuse_cycle && reduc_index == 2)
> >> -                        ? op.ops[2] : NULL_TREE, &vec_oprnds2);
> >> +      gcc_assert (!single_defuse_cycle && op.num_ops <= 3);
> >
> > I think that's going to fail.  Mind v3 of the series I posted to enable
> > SLP discovery for single-lane reductions.  Basically everything is
> > going to be SLP for GCC 15.
> >
>
> Has the v3 series already landed on trunk? Then, by default, any statement that
> has no isomorphic partner will become a single-lane SLP node?  And for such a
> node, can I just reuse the old non-SLP transformation code?

As of this morning, r15-1006-gd93353e6423eca, it is on trunk.  Note the fallback
is still non-SLP in case vectorizable_reduction FAILs with SLP.  I have a set of
changes queued to allow some more kind of reductions with SLP but IIRC the
lane-reducing variant is already supported.

Richard.

> >> +
> >> +      for (i = 0; i < (int) op.num_ops; i++)
> >> +       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds[i]);
> >>      }
> >>    else
> >>      {
> >> -      /* For a conditional operation pass the truth type as mask
> >> -        vectype.  */
> >> -      gcc_assert (single_defuse_cycle
> >> -                 && (reduc_index == 1 || reduc_index == 2));
> >> -      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
> >> -                        op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
> >> -                        reduc_index == 1 ? NULL_TREE : op.ops[1],
> >> -                        NULL_TREE, &vec_oprnds1,
> >> -                        reduc_index == 2 ? NULL_TREE : op.ops[2],
> >> -                        NULL_TREE, &vec_oprnds2);
> >> -    }
> >> +      /* The input vectype of the reduction PHI determines copies of
> >> +        vectorized def-use cycles, which might be more than effective copies
> >> +        of vectorized lane-reducing reduction statements.  This could be
> >> +        complemented by generating extra trivial pass-through copies.  For
> >> +        example:
> >> +
> >
> > That also means you need to handle SLP here, but you can assert there's
> > only a single lane.
> >
> > Btw, you can push the patches I approved if they independently test OK.
> >
>
> >> +          int sum = 0;
> >> +          for (i)
> >> +            {
> >> +              sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
> >> +              sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
> >> +              sum += n[i];               // normal <vector(4) int>
> >> +            }
> >> +
> >> +        The vector size is 128-bit, vectorization factor is 16.  Reduction
> >> +        statements would be transformed as:
> >> +
> >> +          vector<4> int sum_v0 = { 0, 0, 0, 0 };
> >> +          vector<4> int sum_v1 = { 0, 0, 0, 0 };
> >> +          vector<4> int sum_v2 = { 0, 0, 0, 0 };
> >> +          vector<4> int sum_v3 = { 0, 0, 0, 0 };
> >> +
> >> +          for (i / 16)
> >> +            {
> >> +              sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
> >> +              sum_v1 = sum_v1;  // copy
> >> +              sum_v2 = sum_v2;  // copy
> >> +              sum_v3 = sum_v3;  // copy
> >> +
> >> +              sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
> >> +              sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
> >> +              sum_v2 = sum_v2;  // copy
> >> +              sum_v3 = sum_v3;  // copy
> >> +
> >> +              sum_v0 += n_v0[i: 0  ~ 3 ];
> >> +              sum_v1 += n_v1[i: 4  ~ 7 ];
> >> +              sum_v2 += n_v2[i: 8  ~ 11];
> >> +              sum_v3 += n_v3[i: 12 ~ 15];
> >> +            }
> >> +       */
> >> +
> >> +      for (i = 0; i < MIN (3, (int) op.num_ops); i++)
> >> +       {
> >> +         tree vectype = NULL_TREE;
> >> +         int used_ncopies = ncopies;
> >> +
> >> +         if (cond_fn_p && i == 0)
> >> +           {
> >> +             /* For a conditional operation pass the truth type as mask
> >> +                vectype.  */
> >> +             gcc_assert (single_defuse_cycle && reduc_index > 0);
> >> +             vectype = truth_type_for (vectype_in);
> >> +           }
> >>
> >> -  /* For single def-use cycles get one copy of the vectorized reduction
> >> -     definition.  */
> >> -  if (single_defuse_cycle)
> >> -    {
> >> -      gcc_assert (!slp_node);
> >> -      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
> >> -                                    op.ops[reduc_index],
> >> -                                    reduc_index == 0 ? &vec_oprnds0
> >> -                                    : (reduc_index == 1 ? &vec_oprnds1
> >> -                                       : &vec_oprnds2));
> >> +         if (i != reduc_index)
> >> +           {
> >> +             /* For non-reduction operand, deduce effective copies that are
> >> +                involved in vectorized def-use cycles based on the input
> >> +                vectype of the reduction statement.  */
> >> +             used_ncopies = stmt_ncopies;
> >> +           }
> >> +         else if (single_defuse_cycle)
> >> +           {
> >> +             /* For single def-use cycles get one copy of the vectorized
> >> +                reduction definition.  */
> >> +             used_ncopies = 1;
> >> +           }
> >> +
> >> +         vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, used_ncopies,
> >> +                                        op.ops[i], &vec_oprnds[i], vectype);
> >> +
> >> +         if (used_ncopies < ncopies)
> >> +           vec_oprnds[i].safe_grow_cleared (ncopies);
> >> +       }
> >>      }
> >>
> >> +  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> >>    bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
> >> +  tree def0;
> >>
> >> -  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
> >> +  FOR_EACH_VEC_ELT (vec_oprnds[0], i, def0)
> >>      {
> >>        gimple *new_stmt;
> >> -      tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
> >> -      if (masked_loop_p && !mask_by_cond_expr)
> >> +      tree new_temp = NULL_TREE;
> >> +      tree vop[3] = { def0, vec_oprnds[1][i], NULL_TREE };
> >> +
> >> +      if (!vop[0] || !vop[1])
> >> +       {
> >> +         tree reduc_vop = vec_oprnds[reduc_index][i];
> >> +
> >> +         /* Insert trivial copy if no need to generate vectorized
> >> +            statement.  */
> >> +         gcc_assert (reduc_vop && stmt_ncopies < ncopies);
> >> +
> >> +         new_stmt = gimple_build_assign (vec_dest, reduc_vop);
> >> +         new_temp = make_ssa_name (vec_dest, new_stmt);
> >> +         gimple_set_lhs (new_stmt, new_temp);
> >> +         vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
> >> +       }
> >> +      else if (masked_loop_p && !mask_by_cond_expr)
> >>         {
> >> -         /* No conditional ifns have been defined for dot-product yet.  */
> >> -         gcc_assert (code != DOT_PROD_EXPR);
> >> +         /* No conditional ifns have been defined for dot-product and sad
> >> +            yet.  */
> >> +         gcc_assert (code != DOT_PROD_EXPR && code != SAD_EXPR);
> >>
> >>           /* Make sure that the reduction accumulator is vop[0].  */
> >>           if (reduc_index == 1)
> >> @@ -8638,7 +8832,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>               std::swap (vop[0], vop[1]);
> >>             }
> >>           tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
> >> -                                         vec_num * ncopies, vectype_in, i);
> >> +                                         vec_num * stmt_ncopies,
> >> +                                         stmt_vectype_in, i);
> >>           gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
> >>                                                     vop[0], vop[1], vop[0]);
> >>           new_temp = make_ssa_name (vec_dest, call);
> >> @@ -8650,12 +8845,13 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>        else
> >>         {
> >>           if (op.num_ops >= 3)
> >> -           vop[2] = vec_oprnds2[i];
> >> +           vop[2] = vec_oprnds[2][i];
> >>
> >>           if (masked_loop_p && mask_by_cond_expr)
> >>             {
> >>               tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
> >> -                                             vec_num * ncopies, vectype_in, i);
> >> +                                             vec_num * stmt_ncopies,
> >> +                                             stmt_vectype_in, i);
> >>               build_vect_cond_expr (code, vop, mask, gsi);
> >>             }
> >>
> >> @@ -8682,16 +8878,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>
> >>        if (slp_node)
> >>         slp_node->push_vec_def (new_stmt);
> >> -      else if (single_defuse_cycle
> >> -              && i < ncopies - 1)
> >> -       {
> >> -         if (reduc_index == 0)
> >> -           vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
> >> -         else if (reduc_index == 1)
> >> -           vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
> >> -         else if (reduc_index == 2)
> >> -           vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
> >> -       }
> >> +      else if (single_defuse_cycle && i < ncopies - 1)
> >> +       vec_oprnds[reduc_index][i + 1] = gimple_get_lhs (new_stmt);
> >>        else
> >>         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
> >>      }
> >> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> >> index 2e0be763abb..cc0a832f71b 100644
> >> --- a/gcc/tree-vect-stmts.cc
> >> +++ b/gcc/tree-vect-stmts.cc
> >> @@ -13296,6 +13296,8 @@ vect_analyze_stmt (vec_info *vinfo,
> >>                                       NULL, NULL, node, cost_vec)
> >>           || vectorizable_load (vinfo, stmt_info, NULL, NULL, node, cost_vec)
> >>           || vectorizable_store (vinfo, stmt_info, NULL, NULL, node, cost_vec)
> >> +         || vectorizable_lane_reducing (as_a <loop_vec_info> (vinfo),
> >> +                                        stmt_info, node, cost_vec)
> >>           || vectorizable_reduction (as_a <loop_vec_info> (vinfo), stmt_info,
> >>                                      node, node_instance, cost_vec)
> >>           || vectorizable_induction (as_a <loop_vec_info> (vinfo), stmt_info,
> >> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> >> index 97ec9c341e7..ca810869592 100644
> >> --- a/gcc/tree-vectorizer.h
> >> +++ b/gcc/tree-vectorizer.h
> >> @@ -2443,6 +2443,8 @@ extern loop_vec_info vect_create_loop_vinfo (class loop *, vec_info_shared *,
> >>  extern bool vectorizable_live_operation (vec_info *, stmt_vec_info,
> >>                                          slp_tree, slp_instance, int,
> >>                                          bool, stmt_vector_for_cost *);
> >> +extern bool vectorizable_lane_reducing (loop_vec_info, stmt_vec_info,
> >> +                                       slp_tree, stmt_vector_for_cost *);
> >>  extern bool vectorizable_reduction (loop_vec_info, stmt_vec_info,
> >>                                     slp_tree, slp_instance,
> >>                                     stmt_vector_for_cost *);
> >> --
> >> 2.17.1

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0002-vect-Support-multiple-lane-reducing-operations-for-l.patch --]
[-- Type: text/x-patch; name="0002-vect-Support-multiple-lane-reducing-operations-for-l.patch", Size: 50203 bytes --]

From 2240426bd61a91bb1b8d10bf74fb78b1a91f3107 Mon Sep 17 00:00:00 2001
From: Feng Xue <fxue@os.amperecomputing.com>
Date: Wed, 29 May 2024 17:22:36 +0800
Subject: [PATCH 2/3] vect: Support multiple lane-reducing operations for loop
 reduction [PR114440]

For a lane-reducing operation (dot-prod/widen-sum/sad) in a loop reduction, the
current vectorizer can only handle the pattern if the reduction chain does not
contain any other operation, no matter whether that other is normal or lane-reducing.

Actually, to allow multiple arbitrary lane-reducing operations, we need to
support vectorization of a loop reduction chain with mixed input vectypes. Since
the lanes of a vectype may vary with the operation, the effective ncopies of the
vectorized statements for each operation may also differ, which causes a
mismatch among vectorized def-use cycles. A simple way is to align all operations
with the one that has the most ncopies; the gap can be filled by
generating extra trivial pass-through copies. For example:

   int sum = 0;
   for (i)
     {
       sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
       sum += w[i];               // widen-sum <vector(16) char>
       sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
       sum += n[i];               // normal <vector(4) int>
     }

The vector size is 128-bit, vectorization factor is 16. Reduction statements
would be transformed as:

   vector<4> int sum_v0 = { 0, 0, 0, 0 };
   vector<4> int sum_v1 = { 0, 0, 0, 0 };
   vector<4> int sum_v2 = { 0, 0, 0, 0 };
   vector<4> int sum_v3 = { 0, 0, 0, 0 };

   for (i / 16)
     {
       sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
       sum_v1 = sum_v1;  // copy
       sum_v2 = sum_v2;  // copy
       sum_v3 = sum_v3;  // copy

       sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0);
       sum_v1 = sum_v1;  // copy
       sum_v2 = sum_v2;  // copy
       sum_v3 = sum_v3;  // copy

       sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
       sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
       sum_v2 = sum_v2;  // copy
       sum_v3 = sum_v3;  // copy

       sum_v0 += n_v0[i: 0  ~ 3 ];
       sum_v1 += n_v1[i: 4  ~ 7 ];
       sum_v2 += n_v2[i: 8  ~ 11];
       sum_v3 += n_v3[i: 12 ~ 15];
     }

2024-03-22 Feng Xue <fxue@os.amperecomputing.com>

gcc/
	PR tree-optimization/114440
	* tree-vectorizer.h (vectorizable_lane_reducing): New function
	declaration.
	* tree-vect-stmts.cc (vect_analyze_stmt): Call new function
	vectorizable_lane_reducing to analyze lane-reducing operation.
	* tree-vect-loop.cc (vect_model_reduction_cost): Remove cost computation
	code related to	emulated_mixed_dot_prod.
	(vect_reduction_update_partial_vector_usage): Compute ncopies as the
	original means for single-lane slp node.
	(vectorizable_lane_reducing): New function.
	(vectorizable_reduction): Allow multiple lane-reducing operations in
	loop reduction. Move some original lane-reducing related code to
	vectorizable_lane_reducing.
	(vect_transform_reduction): Extend transformation to support reduction
	statements with mixed input vectypes.

gcc/testsuite/
	PR tree-optimization/114440
	* gcc.dg/vect/vect-reduc-chain-1.c
	* gcc.dg/vect/vect-reduc-chain-2.c
	* gcc.dg/vect/vect-reduc-chain-3.c
	* gcc.dg/vect/vect-reduc-chain-dot-slp-1.c
	* gcc.dg/vect/vect-reduc-chain-dot-slp-2.c
	* gcc.dg/vect/vect-reduc-chain-dot-slp-3.c
	* gcc.dg/vect/vect-reduc-chain-dot-slp-4.c
	* gcc.dg/vect/vect-reduc-dot-slp-1.c

temp
---
 .../gcc.dg/vect/vect-reduc-chain-1.c          |  62 +++
 .../gcc.dg/vect/vect-reduc-chain-2.c          |  77 +++
 .../gcc.dg/vect/vect-reduc-chain-3.c          |  66 +++
 .../gcc.dg/vect/vect-reduc-chain-dot-slp-1.c  |  95 ++++
 .../gcc.dg/vect/vect-reduc-chain-dot-slp-2.c  |  67 +++
 .../gcc.dg/vect/vect-reduc-chain-dot-slp-3.c  |  79 +++
 .../gcc.dg/vect/vect-reduc-chain-dot-slp-4.c  |  63 +++
 .../gcc.dg/vect/vect-reduc-dot-slp-1.c        |  35 ++
 gcc/tree-vect-loop.cc                         | 501 ++++++++++++------
 gcc/tree-vect-stmts.cc                        |   2 +
 gcc/tree-vectorizer.h                         |   2 +
 11 files changed, 888 insertions(+), 161 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-3.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-4.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
new file mode 100644
index 00000000000..04bfc419dbd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
@@ -0,0 +1,62 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res,
+   SIGNEDNESS_2 char *restrict a,
+   SIGNEDNESS_2 char *restrict b,
+   SIGNEDNESS_2 char *restrict c,
+   SIGNEDNESS_2 char *restrict d,
+   SIGNEDNESS_1 int *restrict e)
+{
+  for (int i = 0; i < N; ++i)
+    {
+      res += a[i] * b[i];
+      res += c[i] * d[i];
+      res += e[i];
+    }
+  return res;
+}
+
+#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
+#define OFFSET 20
+
+int
+main (void)
+{
+  check_vect ();
+
+  SIGNEDNESS_2 char a[N], b[N];
+  SIGNEDNESS_2 char c[N], d[N];
+  SIGNEDNESS_1 int e[N];
+  int expected = 0x12345;
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 5;
+      b[i] = BASE + OFFSET + i * 4;
+      c[i] = BASE + i * 2;
+      d[i] = BASE + OFFSET + i * 3;
+      e[i] = i;
+      asm volatile ("" ::: "memory");
+      expected += a[i] * b[i];
+      expected += c[i] * d[i];
+      expected += e[i];
+    }
+  if (f (0x12345, a, b, c, d, e) != expected)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 2 "vect" { target vect_sdot_qi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c
new file mode 100644
index 00000000000..6c803b80120
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c
@@ -0,0 +1,77 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 unsigned
+#define SIGNEDNESS_3 signed
+#define SIGNEDNESS_4 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+fn (SIGNEDNESS_1 int res,
+   SIGNEDNESS_2 char *restrict a,
+   SIGNEDNESS_2 char *restrict b,
+   SIGNEDNESS_3 char *restrict c,
+   SIGNEDNESS_3 char *restrict d,
+   SIGNEDNESS_4 short *restrict e,
+   SIGNEDNESS_4 short *restrict f,
+   SIGNEDNESS_1 int *restrict g)
+{
+  for (int i = 0; i < N; ++i)
+    {
+      res += a[i] * b[i];
+      res += i + 1;
+      res += c[i] * d[i];
+      res += e[i] * f[i];
+      res += g[i];
+    }
+  return res;
+}
+
+#define BASE2 ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
+#define BASE3 ((SIGNEDNESS_3 int) -1 < 0 ? -126 : 4)
+#define BASE4 ((SIGNEDNESS_4 int) -1 < 0 ? -1026 : 373)
+#define OFFSET 20
+
+int
+main (void)
+{
+  check_vect ();
+
+  SIGNEDNESS_2 char a[N], b[N];
+  SIGNEDNESS_3 char c[N], d[N];
+  SIGNEDNESS_4 short e[N], f[N];
+  SIGNEDNESS_1 int g[N];
+  int expected = 0x12345;
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE2 + i * 5;
+      b[i] = BASE2 + OFFSET + i * 4;
+      c[i] = BASE3 + i * 2;
+      d[i] = BASE3 + OFFSET + i * 3;
+      e[i] = BASE4 + i * 6;
+      f[i] = BASE4 + OFFSET + i * 5;
+      g[i] = i;
+      asm volatile ("" ::: "memory");
+      expected += a[i] * b[i];
+      expected += i + 1;
+      expected += c[i] * d[i];
+      expected += e[i] * f[i];
+      expected += g[i];
+    }
+  if (fn (0x12345, a, b, c, d, e, f, g) != expected)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_sdot_qi } } } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_udot_qi } } } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_sdot_hi } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c
new file mode 100644
index 00000000000..a41e4b176c4
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c
@@ -0,0 +1,66 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 unsigned
+#define SIGNEDNESS_3 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res,
+   SIGNEDNESS_2 char *restrict a,
+   SIGNEDNESS_2 char *restrict b,
+   SIGNEDNESS_3 short *restrict c,
+   SIGNEDNESS_3 short *restrict d,
+   SIGNEDNESS_1 int *restrict e)
+{
+  for (int i = 0; i < N; ++i)
+    {
+      short diff = a[i] - b[i];
+      SIGNEDNESS_2 short abs = diff < 0 ? -diff : diff;
+      res += abs;
+      res += c[i] * d[i];
+      res += e[i];
+    }
+  return res;
+}
+
+#define BASE2 ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
+#define BASE3 ((SIGNEDNESS_3 int) -1 < 0 ? -1236 : 373)
+#define OFFSET 20
+
+int
+main (void)
+{
+  check_vect ();
+
+  SIGNEDNESS_2 char a[N], b[N];
+  SIGNEDNESS_3 short c[N], d[N];
+  SIGNEDNESS_1 int e[N];
+  int expected = 0x12345;
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE2 + i * 5;
+      b[i] = BASE2 - i * 4;
+      c[i] = BASE3 + i * 2;
+      d[i] = BASE3 + OFFSET + i * 3;
+      e[i] = i;
+      asm volatile ("" ::: "memory");
+      short diff = a[i] - b[i];
+      SIGNEDNESS_2 short abs = diff < 0 ? -diff : diff;
+      expected += abs;
+      expected += c[i] * d[i];
+      expected += e[i];
+    }
+  if (f (0x12345, a, b, c, d, e) != expected)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = SAD_EXPR" "vect" { target vect_udot_qi } } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target vect_sdot_hi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c
new file mode 100644
index 00000000000..c2831fbcc8e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c
@@ -0,0 +1,95 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+
+#include "tree-vect.h"
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res,
+   SIGNEDNESS_2 char *a,
+   SIGNEDNESS_2 char *b,
+   int step, int n)
+{
+  for (int i = 0; i < n; i++)
+    {
+      res += a[0] * b[0];
+      res += a[1] * b[1];
+      res += a[2] * b[2];
+      res += a[3] * b[3];
+      res += a[4] * b[4];
+      res += a[5] * b[5];
+      res += a[6] * b[6];
+      res += a[7] * b[7];
+      res += a[8] * b[8];
+      res += a[9] * b[9];
+      res += a[10] * b[10];
+      res += a[11] * b[11];
+      res += a[12] * b[12];
+      res += a[13] * b[13];
+      res += a[14] * b[14];
+      res += a[15] * b[15];
+
+      a += step;
+      b += step;
+    }
+
+  return res;
+}
+
+#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
+#define OFFSET 20
+
+int
+main (void)
+{
+  check_vect ();
+
+  SIGNEDNESS_2 char a[100], b[100];
+  int expected = 0x12345;
+  int step = 16;
+  int n = 2;
+  int t = 0;
+
+  for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i)
+    {
+      a[i] = BASE + i * 5;
+      b[i] = BASE + OFFSET + i * 4;
+      asm volatile ("" ::: "memory");
+    }
+
+  for (int i = 0; i < n; i++)
+    {
+      asm volatile ("" ::: "memory");
+      expected += a[t + 0] * b[t + 0];
+      expected += a[t + 1] * b[t + 1];
+      expected += a[t + 2] * b[t + 2];
+      expected += a[t + 3] * b[t + 3];
+      expected += a[t + 4] * b[t + 4];
+      expected += a[t + 5] * b[t + 5];
+      expected += a[t + 6] * b[t + 6];
+      expected += a[t + 7] * b[t + 7];
+      expected += a[t + 8] * b[t + 8];
+      expected += a[t + 9] * b[t + 9];
+      expected += a[t + 10] * b[t + 10];
+      expected += a[t + 11] * b[t + 11];
+      expected += a[t + 12] * b[t + 12];
+      expected += a[t + 13] * b[t + 13];
+      expected += a[t + 14] * b[t + 14];
+      expected += a[t + 15] * b[t + 15];
+      t += step;
+    }
+
+  if (f (0x12345, a, b, step, n) != expected)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 16 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c
new file mode 100644
index 00000000000..4114264a364
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c
@@ -0,0 +1,67 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+
+#include "tree-vect.h"
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res,
+   SIGNEDNESS_2 char *a,
+   SIGNEDNESS_2 char *b,
+   int n)
+{
+  for (int i = 0; i < n; i++)
+    {
+      res += a[5 * i + 0] * b[5 * i + 0];
+      res += a[5 * i + 1] * b[5 * i + 1];
+      res += a[5 * i + 2] * b[5 * i + 2];
+      res += a[5 * i + 3] * b[5 * i + 3];
+      res += a[5 * i + 4] * b[5 * i + 4];
+    }
+
+  return res;
+}
+
+#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
+#define OFFSET 20
+
+int
+main (void)
+{
+  check_vect ();
+
+  SIGNEDNESS_2 char a[100], b[100];
+  int expected = 0x12345;
+  int n = 18;
+
+  for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i)
+    {
+      a[i] = BASE + i * 5;
+      b[i] = BASE + OFFSET + i * 4;
+      asm volatile ("" ::: "memory");
+    }
+
+  for (int i = 0; i < n; i++)
+    {
+      asm volatile ("" ::: "memory");
+      expected += a[5 * i + 0] * b[5 * i + 0];
+      expected += a[5 * i + 1] * b[5 * i + 1];
+      expected += a[5 * i + 2] * b[5 * i + 2];
+      expected += a[5 * i + 3] * b[5 * i + 3];
+      expected += a[5 * i + 4] * b[5 * i + 4];
+    }
+
+  if (f (0x12345, a, b, n) != expected)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 5 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-3.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-3.c
new file mode 100644
index 00000000000..2cdecc36d16
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-3.c
@@ -0,0 +1,79 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+
+#include "tree-vect.h"
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res,
+   SIGNEDNESS_2 short *a,
+   SIGNEDNESS_2 short *b,
+   int step, int n)
+{
+  for (int i = 0; i < n; i++)
+    {
+      res += a[0] * b[0];
+      res += a[1] * b[1];
+      res += a[2] * b[2];
+      res += a[3] * b[3];
+      res += a[4] * b[4];
+      res += a[5] * b[5];
+      res += a[6] * b[6];
+      res += a[7] * b[7];
+
+      a += step;
+      b += step;
+    }
+
+  return res;
+}
+
+#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -1026 : 373)
+#define OFFSET 20
+
+int
+main (void)
+{
+  check_vect ();
+
+  SIGNEDNESS_2 short a[100], b[100];
+  int expected = 0x12345;
+  int step = 8;
+  int n = 2;
+  int t = 0;
+
+  for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i)
+    {
+      a[i] = BASE + i * 5;
+      b[i] = BASE + OFFSET + i * 4;
+      asm volatile ("" ::: "memory");
+    }
+
+  for (int i = 0; i < n; i++)
+    {
+      asm volatile ("" ::: "memory");
+      expected += a[t + 0] * b[t + 0];
+      expected += a[t + 1] * b[t + 1];
+      expected += a[t + 2] * b[t + 2];
+      expected += a[t + 3] * b[t + 3];
+      expected += a[t + 4] * b[t + 4];
+      expected += a[t + 5] * b[t + 5];
+      expected += a[t + 6] * b[t + 6];
+      expected += a[t + 7] * b[t + 7];
+      t += step;
+    }
+
+  if (f (0x12345, a, b, step, n) != expected)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 8 "vect"  { target vect_sdot_hi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-4.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-4.c
new file mode 100644
index 00000000000..32c0f30c77b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-4.c
@@ -0,0 +1,63 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+
+#include "tree-vect.h"
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res,
+   SIGNEDNESS_2 short *a,
+   SIGNEDNESS_2 short *b,
+   int n)
+{
+  for (int i = 0; i < n; i++)
+    {
+      res += a[3 * i + 0] * b[3 * i + 0];
+      res += a[3 * i + 1] * b[3 * i + 1];
+      res += a[3 * i + 2] * b[3 * i + 2];
+    }
+
+  return res;
+}
+
+#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -1026 : 373)
+#define OFFSET 20
+
+int
+main (void)
+{
+  check_vect ();
+
+  SIGNEDNESS_2 short a[100], b[100];
+  int expected = 0x12345;
+  int n = 18;
+
+  for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i)
+    {
+      a[i] = BASE + i * 5;
+      b[i] = BASE + OFFSET + i * 4;
+      asm volatile ("" ::: "memory");
+    }
+
+  for (int i = 0; i < n; i++)
+    {
+      asm volatile ("" ::: "memory");
+      expected += a[3 * i + 0] * b[3 * i + 0];
+      expected += a[3 * i + 1] * b[3 * i + 1];
+      expected += a[3 * i + 2] * b[3 * i + 2];
+    }
+
+  if (f (0x12345, a, b, n) != expected)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 3 "vect"  { target vect_sdot_hi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c
new file mode 100644
index 00000000000..e17d6291f75
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c
@@ -0,0 +1,35 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-do compile } */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+
+#include "tree-vect.h"
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res0,
+   SIGNEDNESS_1 int res1,
+   SIGNEDNESS_1 int res2,
+   SIGNEDNESS_1 int res3,
+   SIGNEDNESS_2 short *a,
+   SIGNEDNESS_2 short *b)
+{
+  for (int i = 0; i < 64; i += 4)
+    {
+      res0 += a[i + 0] * b[i + 0];
+      res1 += a[i + 1] * b[i + 1];
+      res2 += a[i + 2] * b[i + 2];
+      res3 += a[i + 3] * b[i + 3];
+    }
+
+  return res0 ^ res1 ^ res2 ^ res3;
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump-not "vectorizing stmts using SLP" "vect" } } */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 35c50eb72cb..fb9259d115c 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -5324,8 +5324,6 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
   if (!gimple_extract_op (orig_stmt_info->stmt, &op))
     gcc_unreachable ();
 
-  bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
-
   if (reduction_type == EXTRACT_LAST_REDUCTION)
     /* No extra instructions are needed in the prologue.  The loop body
        operations are costed in vectorizable_condition.  */
@@ -5360,12 +5358,8 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
 	   initial result of the data reduction, initial value of the index
 	   reduction.  */
 	prologue_stmts = 4;
-      else if (emulated_mixed_dot_prod)
-	/* We need the initial reduction value and two invariants:
-	   one that contains the minimum signed value and one that
-	   contains half of its negative.  */
-	prologue_stmts = 3;
       else
+	/* We need the initial reduction value.  */
 	prologue_stmts = 1;
       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
 					 scalar_to_vec, stmt_info, 0,
@@ -7466,7 +7460,7 @@ vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo,
       vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
       unsigned nvectors;
 
-      if (slp_node)
+      if (slp_node && SLP_TREE_LANES (slp_node) > 1)
 	nvectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
       else
 	nvectors = vect_get_num_copies (loop_vinfo, vectype_in);
@@ -7478,6 +7472,150 @@ vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo,
     }
 }
 
+/* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
+   the context of LOOP_VINFO, and vector cost will be recorded in COST_VEC.
+   Now there are three such kinds of operations: dot-prod/widen-sum/sad
+   (sum-of-absolute-differences).
+
+   For a lane-reducing operation, the loop reduction path that it lies in,
+   may contain normal operation, or other lane-reducing operation of different
+   input type size, an example as:
+
+     int sum = 0;
+     for (i)
+       {
+         ...
+         sum += d0[i] * d1[i];       // dot-prod <vector(16) char>
+         sum += w[i];                // widen-sum <vector(16) char>
+         sum += abs(s0[i] - s1[i]);  // sad <vector(8) short>
+         sum += n[i];                // normal <vector(4) int>
+         ...
+       }
+
+   Vectorization factor is essentially determined by operation whose input
+   vectype has the most lanes ("vector(16) char" in the example), while we
+   need to choose input vectype with the least lanes ("vector(4) int" in the
+   example) for the reduction PHI statement.  */
+
+bool
+vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
+			    slp_tree slp_node, stmt_vector_for_cost *cost_vec)
+{
+  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
+  if (!stmt)
+    return false;
+
+  enum tree_code code = gimple_assign_rhs_code (stmt);
+
+  if (!lane_reducing_op_p (code))
+    return false;
+
+  tree type = TREE_TYPE (gimple_assign_lhs (stmt));
+
+  if (!INTEGRAL_TYPE_P (type) && !SCALAR_FLOAT_TYPE_P (type))
+    return false;
+
+  /* Do not try to vectorize bit-precision reductions.  */
+  if (!type_has_mode_precision_p (type))
+    return false;
+
+  for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
+    {
+      stmt_vec_info def_stmt_info;
+      slp_tree slp_op;
+      tree op;
+      tree vectype;
+      enum vect_def_type dt;
+
+      if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_node, i, &op,
+			       &slp_op, &dt, &vectype, &def_stmt_info))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "use not simple.\n");
+	  return false;
+	}
+
+      if (!vectype)
+	{
+	  vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
+						 slp_op);
+	  if (!vectype)
+	    return false;
+	}
+
+      if (slp_node && !vect_maybe_update_slp_op_vectype (slp_op, vectype))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "incompatible vector types for invariants\n");
+	  return false;
+	}
+
+      if (i == STMT_VINFO_REDUC_IDX (stmt_info))
+	continue;
+
+      /* There should be at most one cycle def in the stmt.  */
+      if (VECTORIZABLE_CYCLE_DEF (dt))
+	return false;
+    }
+
+  stmt_vec_info reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
+
+  /* TODO: Support lane-reducing operation that does not directly participate
+     in loop reduction.  */
+  if (!reduc_info || STMT_VINFO_REDUC_IDX (stmt_info) < 0)
+    return false;
+
+  /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not
+     recognized.  */
+  gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_reduction_def);
+  gcc_assert (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION);
+
+  tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info);
+  int ncopies_for_cost;
+
+  if (slp_node && SLP_TREE_LANES (slp_node) > 1)
+    {
+      /* Now lane-reducing operations in a non-single-lane slp node should only
+	 come from the same loop reduction path.  */
+      gcc_assert (REDUC_GROUP_FIRST_ELEMENT (stmt_info));
+      ncopies_for_cost = 1;
+    }
+  else
+    {
+      ncopies_for_cost = vect_get_num_copies (loop_vinfo, vectype_in);
+      gcc_assert (ncopies_for_cost >= 1);
+    }
+
+  if (vect_is_emulated_mixed_dot_prod (stmt_info))
+    {
+      /* We need extra two invariants: one that contains the minimum signed
+	 value and one that contains half of its negative.  */
+      int prologue_stmts = 2;
+      unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
+					scalar_to_vec, stmt_info, 0,
+					vect_prologue);
+      if (dump_enabled_p ())
+	dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
+		     "extra prologue_cost = %d .\n", cost);
+
+      /* Three dot-products and a subtraction.  */
+      ncopies_for_cost *= 4;
+    }
+
+  record_stmt_cost (cost_vec, ncopies_for_cost, vector_stmt, stmt_info, 0,
+		    vect_body);
+
+  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+    vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
+						slp_node, code, type,
+						vectype_in);
+
+  STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
+  return true;
+}
+
 /* Function vectorizable_reduction.
 
    Check if STMT_INFO performs a reduction operation that can be vectorized.
@@ -7643,7 +7781,9 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
     {
       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
-      if (STMT_VINFO_REDUC_IDX (vdef) == -1)
+      int reduc_idx = STMT_VINFO_REDUC_IDX (vdef);
+
+      if (reduc_idx == -1)
 	{
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -7689,10 +7829,43 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	      return false;
 	    }
 	}
-      else if (!stmt_info)
-	/* First non-conversion stmt.  */
-	stmt_info = vdef;
-      reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
+      else
+	{
+	  /* First non-conversion stmt.  */
+	  if (!stmt_info)
+	    stmt_info = vdef;
+
+	  if (lane_reducing_op_p (op.code))
+	    {
+	      unsigned group_size = slp_node ? SLP_TREE_LANES (slp_node) : 0;
+	      tree op_type = TREE_TYPE (op.ops[0]);
+	      tree new_vectype_in = get_vectype_for_scalar_type (loop_vinfo,
+								 op_type,
+								 group_size);
+
+	      /* The last operand of lane-reducing operation must be addend
+		 for reduction.  */
+	      gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);
+
+	      if (!new_vectype_in)
+		return false;
+
+	      STMT_VINFO_REDUC_VECTYPE_IN (vdef) = new_vectype_in;
+
+	      /* To accommodate lane-reducing operations of mixed input
+		 vectypes, choose input vectype with the least lanes for the
+		 reduction PHI statement, which would result in the most
+		 ncopies for vectorized reduction results.  */
+	      if (!vectype_in
+		  || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
+		       < GET_MODE_SIZE (SCALAR_TYPE_MODE (op_type))))
+		vectype_in = new_vectype_in;
+	    }
+	  else
+	    vectype_in = STMT_VINFO_VECTYPE (phi_info);
+	}
+
+      reduc_def = op.ops[reduc_idx];
       reduc_chain_length++;
       if (!stmt_info && slp_node)
 	slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
@@ -7750,6 +7923,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 
   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
   STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
+  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
+
   gimple_match_op op;
   if (!gimple_extract_op (stmt_info->stmt, &op))
     gcc_unreachable ();
@@ -7763,18 +7938,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   if (!type_has_mode_precision_p (op.type))
     return false;
 
-  /* For lane-reducing ops we're reducing the number of reduction PHIs
-     which means the only use of that may be in the lane-reducing operation.  */
-  if (lane_reducing
-      && reduc_chain_length != 1
-      && !only_slp_reduc_chain)
-    {
-      if (dump_enabled_p ())
-	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			 "lane-reducing reduction with extra stmts.\n");
-      return false;
-    }
-
   /* Lane-reducing ops also never can be used in a SLP reduction group
      since we'll mix lanes belonging to different reductions.  But it's
      OK to use them in a reduction chain or when the reduction group
@@ -7818,9 +7981,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 			     "use not simple.\n");
 	  return false;
 	}
-      if (i == STMT_VINFO_REDUC_IDX (stmt_info))
-	continue;
-
       /* For an IFN_COND_OP we might hit the reduction definition operand
 	 twice (once as definition, once as else).  */
       if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
@@ -7836,16 +7996,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	  = get_vectype_for_scalar_type (loop_vinfo,
 					 TREE_TYPE (op.ops[i]), slp_op[i]);
 
-      /* To properly compute ncopies we are interested in the widest
-	 non-reduction input type in case we're looking at a widening
-	 accumulation that we later handle in vect_transform_reduction.  */
-      if (lane_reducing
-	  && vectype_op[i]
-	  && (!vectype_in
-	      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
-		  < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
-	vectype_in = vectype_op[i];
-
       /* Record how the non-reduction-def value of COND_EXPR is defined.
 	 ???  For a chain of multiple CONDs we'd have to match them up all.  */
       if (op.code == COND_EXPR && reduc_chain_length == 1)
@@ -7864,19 +8014,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	    }
 	}
     }
-  if (!vectype_in)
-    vectype_in = STMT_VINFO_VECTYPE (phi_info);
-  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
-
-  /* Each lane-reducing operation has its own input vectype, while reduction
-     PHI records the input vectype with least lanes.  */
-  if (lane_reducing)
-    STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in;
 
-  enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
-  STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
+  enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info);
+  STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type;
   /* If we have a condition reduction, see if we can simplify it further.  */
-  if (v_reduc_type == COND_REDUCTION)
+  if (reduction_type == COND_REDUCTION)
     {
       if (slp_node && SLP_TREE_LANES (slp_node) != 1)
 	return false;
@@ -8042,8 +8184,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
     }
 
   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
+  reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
 
-  vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
   if (reduction_type == TREE_CODE_REDUCTION)
     {
       /* Check whether it's ok to change the order of the computation.
@@ -8329,14 +8471,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
       && loop_vinfo->suggested_unroll_factor == 1)
     single_defuse_cycle = true;
 
-  if (single_defuse_cycle || lane_reducing)
+  if (single_defuse_cycle && !lane_reducing)
     {
       gcc_assert (op.code != COND_EXPR);
 
-      /* 4. Supportable by target?  */
-      bool ok = true;
-
-      /* 4.1. check support for the operation in the loop
+      /* 4. check support for the operation in the loop
 
 	 This isn't necessary for the lane reduction codes, since they
 	 can only be produced by pattern matching, and it's up to the
@@ -8345,14 +8484,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	 mixed-sign dot-products can be implemented using signed
 	 dot-products.  */
       machine_mode vec_mode = TYPE_MODE (vectype_in);
-      if (!lane_reducing
-	  && !directly_supported_p (op.code, vectype_in, optab_vector))
+      if (!directly_supported_p (op.code, vectype_in, optab_vector))
         {
           if (dump_enabled_p ())
             dump_printf (MSG_NOTE, "op not supported by target.\n");
 	  if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
 	      || !vect_can_vectorize_without_simd_p (op.code))
-	    ok = false;
+	    single_defuse_cycle = false;
 	  else
 	    if (dump_enabled_p ())
 	      dump_printf (MSG_NOTE, "proceeding using word mode.\n");
@@ -8365,35 +8503,12 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	    dump_printf (MSG_NOTE, "using word mode not possible.\n");
 	  return false;
 	}
-
-      /* lane-reducing operations have to go through vect_transform_reduction.
-         For the other cases try without the single cycle optimization.  */
-      if (!ok)
-	{
-	  if (lane_reducing)
-	    return false;
-	  else
-	    single_defuse_cycle = false;
-	}
     }
   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
 
-  /* If the reduction stmt is one of the patterns that have lane
-     reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
-  if ((ncopies > 1 && ! single_defuse_cycle)
-      && lane_reducing)
-    {
-      if (dump_enabled_p ())
-	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			 "multi def-use cycle not possible for lane-reducing "
-			 "reduction operation\n");
-      return false;
-    }
-
-  if (slp_node
-      && !(!single_defuse_cycle
-	   && !lane_reducing
-	   && reduction_type != FOLD_LEFT_REDUCTION))
+  /* Reduction type of lane-reducing operation is TREE_CODE_REDUCTION, the
+     below processing will be done in its own vectorizable function.  */
+  if (slp_node && reduction_type == FOLD_LEFT_REDUCTION)
     for (i = 0; i < (int) op.num_ops; i++)
       if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
 	{
@@ -8406,28 +8521,21 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
 			     reduction_type, ncopies, cost_vec);
   /* Cost the reduction op inside the loop if transformed via
-     vect_transform_reduction.  Otherwise this is costed by the
-     separate vectorizable_* routines.  */
-  if (single_defuse_cycle || lane_reducing)
-    {
-      int factor = 1;
-      if (vect_is_emulated_mixed_dot_prod (stmt_info))
-	/* Three dot-products and a subtraction.  */
-	factor = 4;
-      record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
-			stmt_info, 0, vect_body);
-    }
+     vect_transform_reduction for non-lane-reducing operation.  Otherwise
+     this is costed by the separate vectorizable_* routines.  */
+  if (single_defuse_cycle && !lane_reducing)
+    record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
 
   if (dump_enabled_p ()
       && reduction_type == FOLD_LEFT_REDUCTION)
     dump_printf_loc (MSG_NOTE, vect_location,
 		     "using an in-order (fold-left) reduction.\n");
   STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
-  /* All but single defuse-cycle optimized, lane-reducing and fold-left
-     reductions go through their own vectorizable_* routines.  */
-  if (!single_defuse_cycle
-      && !lane_reducing
-      && reduction_type != FOLD_LEFT_REDUCTION)
+
+  /* All but single defuse-cycle optimized and fold-left reductions go
+     through their own vectorizable_* routines.  */
+  if ((!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
+      || lane_reducing)
     {
       stmt_vec_info tem
 	= vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
@@ -8533,6 +8641,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   int i;
   int ncopies;
+  int stmt_ncopies;
   int vec_num;
 
   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
@@ -8556,15 +8665,28 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
   int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
+  tree stmt_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info);
 
-  if (slp_node)
+  /* Get input vectypes from the reduction PHI and the statement to be
+     transformed, these two vectypes may have different lanes when
+     lane-reducing operation is present.  */
+  if (!vectype_in)
+    vectype_in = STMT_VINFO_REDUC_VECTYPE (reduc_info);
+
+  if (!stmt_vectype_in)
+    stmt_vectype_in = STMT_VINFO_VECTYPE (stmt_info);
+
+  if (slp_node && SLP_TREE_LANES (slp_node) > 1)
     {
       ncopies = 1;
+      stmt_ncopies = 1;
       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
     }
   else
     {
       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
+      stmt_ncopies = vect_get_num_copies (loop_vinfo, stmt_vectype_in);
+      gcc_assert (stmt_ncopies >= 1 && stmt_ncopies <= ncopies);
       vec_num = 1;
     }
 
@@ -8573,14 +8695,10 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 
   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
   vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
-  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
-
+  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn,
+						    stmt_vectype_in);
   /* Transform.  */
-  tree new_temp = NULL_TREE;
-  auto_vec<tree> vec_oprnds0;
-  auto_vec<tree> vec_oprnds1;
-  auto_vec<tree> vec_oprnds2;
-  tree def0;
+  auto_vec<tree> vec_oprnds[3];
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
@@ -8604,8 +8722,6 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 		      == op.ops[internal_fn_else_index ((internal_fn) code)]));
     }
 
-  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
-
   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
   if (reduction_type == FOLD_LEFT_REDUCTION)
     {
@@ -8613,7 +8729,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
       gcc_assert (code.is_tree_code () || cond_fn_p);
       return vectorize_fold_left_reduction
 	  (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
-	   code, reduc_fn, op.ops, op.num_ops, vectype_in,
+	   code, reduc_fn, op.ops, op.num_ops, stmt_vectype_in,
 	   reduc_index, masks, lens);
     }
 
@@ -8624,55 +8740,124 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
   tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
 
-  /* Get NCOPIES vector definitions for all operands except the reduction
-     definition.  */
-  if (!cond_fn_p)
+  gcc_assert (reduc_index < 3);
+
+  if (slp_node && SLP_TREE_LANES (slp_node) > 1)
     {
-      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
-			 single_defuse_cycle && reduc_index == 0
-			 ? NULL_TREE : op.ops[0], &vec_oprnds0,
-			 single_defuse_cycle && reduc_index == 1
-			 ? NULL_TREE : op.ops[1], &vec_oprnds1,
-			 op.num_ops == 3
-			 && !(single_defuse_cycle && reduc_index == 2)
-			 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
+      gcc_assert (!single_defuse_cycle);
+
+      for (i = 0; i < MIN (3, (int) op.num_ops); i++)
+	vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds[i]);
     }
   else
     {
-      /* For a conditional operation pass the truth type as mask
-	 vectype.  */
-      gcc_assert (single_defuse_cycle
-		  && (reduc_index == 1 || reduc_index == 2));
-      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
-			 op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
-			 reduc_index == 1 ? NULL_TREE : op.ops[1],
-			 NULL_TREE, &vec_oprnds1,
-			 reduc_index == 2 ? NULL_TREE : op.ops[2],
-			 NULL_TREE, &vec_oprnds2);
-    }
+      /* The input vectype of the reduction PHI determines copies of
+	 vectorized def-use cycles, which might be more than effective copies
+	 of vectorized lane-reducing reduction statements.  This could be
+	 complemented by generating extra trivial pass-through copies.  For
+	 example:
+
+	   int sum = 0;
+	   for (i)
+	     {
+	       sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
+	       sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
+	       sum += n[i];               // normal <vector(4) int>
+	     }
+
+	 The vector size is 128-bit, vectorization factor is 16.  Reduction
+	 statements would be transformed as:
+
+	   vector<4> int sum_v0 = { 0, 0, 0, 0 };
+	   vector<4> int sum_v1 = { 0, 0, 0, 0 };
+	   vector<4> int sum_v2 = { 0, 0, 0, 0 };
+	   vector<4> int sum_v3 = { 0, 0, 0, 0 };
+
+	   for (i / 16)
+	     {
+	       sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
+	       sum_v1 = sum_v1;  // copy
+	       sum_v2 = sum_v2;  // copy
+	       sum_v3 = sum_v3;  // copy
+
+	       sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
+	       sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
+	       sum_v2 = sum_v2;  // copy
+	       sum_v3 = sum_v3;  // copy
+
+	       sum_v0 += n_v0[i: 0  ~ 3 ];
+	       sum_v1 += n_v1[i: 4  ~ 7 ];
+	       sum_v2 += n_v2[i: 8  ~ 11];
+	       sum_v3 += n_v3[i: 12 ~ 15];
+	     }
+	*/
+
+      for (i = 0; i < MIN (3, (int) op.num_ops); i++)
+	{
+	  tree vectype = NULL_TREE;
+	  int used_ncopies = ncopies;
+
+	  if (cond_fn_p && i == 0)
+	    {
+	      /* For a conditional operation pass the truth type as mask
+		 vectype.  */
+	      gcc_assert (single_defuse_cycle && reduc_index > 0);
+	      vectype = truth_type_for (vectype_in);
+	    }
 
-  /* For single def-use cycles get one copy of the vectorized reduction
-     definition.  */
-  if (single_defuse_cycle)
-    {
-      gcc_assert (!slp_node);
-      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
-				     op.ops[reduc_index],
-				     reduc_index == 0 ? &vec_oprnds0
-				     : (reduc_index == 1 ? &vec_oprnds1
-					: &vec_oprnds2));
+	  if (i != reduc_index)
+	    {
+	      /* For non-reduction operand, deduce effective copies that are
+		 involved in vectorized def-use cycles based on the input
+		 vectype of the reduction statement.  */
+	      used_ncopies = stmt_ncopies;
+	    }
+	  else if (single_defuse_cycle)
+	    {
+	      /* For single def-use cycles get one copy of the vectorized
+		 reduction definition.  */
+	      used_ncopies = 1;
+	    }
+
+	  if (slp_node)
+	    vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds[i]);
+	  else
+	    vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, used_ncopies,
+					   op.ops[i], &vec_oprnds[i], vectype);
+
+	  if (used_ncopies < ncopies)
+	    vec_oprnds[i].safe_grow_cleared (ncopies);
+	}
     }
 
+  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
   bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
+  tree def0;
 
-  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
+  FOR_EACH_VEC_ELT (vec_oprnds[0], i, def0)
     {
       gimple *new_stmt;
-      tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
-      if (masked_loop_p && !mask_by_cond_expr)
+      tree new_temp = NULL_TREE;
+      tree vop[3] = { def0, vec_oprnds[1][i], NULL_TREE };
+
+      if (!vop[0] || !vop[1])
+	{
+	  tree reduc_vop = vec_oprnds[reduc_index][i];
+
+	  /* Insert trivial copy if no need to generate vectorized
+	     statement.  */
+	  gcc_assert (reduc_vop && stmt_ncopies < ncopies);
+
+	  new_stmt = gimple_build_assign (vec_dest, reduc_vop);
+	  new_temp = make_ssa_name (vec_dest, new_stmt);
+	  gimple_set_lhs (new_stmt, new_temp);
+	  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
+	}
+      else if (masked_loop_p && !mask_by_cond_expr)
 	{
-	  /* No conditional ifns have been defined for dot-product yet.  */
-	  gcc_assert (code != DOT_PROD_EXPR);
+	  /* No conditional ifns have been defined for dot-product and sad
+	     yet.  */
+	  gcc_assert (code != DOT_PROD_EXPR && code != SAD_EXPR);
 
 	  /* Make sure that the reduction accumulator is vop[0].  */
 	  if (reduc_index == 1)
@@ -8681,7 +8866,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 	      std::swap (vop[0], vop[1]);
 	    }
 	  tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
-					  vec_num * ncopies, vectype_in, i);
+					  vec_num * stmt_ncopies,
+					  stmt_vectype_in, i);
 	  gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
 						    vop[0], vop[1], vop[0]);
 	  new_temp = make_ssa_name (vec_dest, call);
@@ -8693,12 +8879,13 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
       else
 	{
 	  if (op.num_ops >= 3)
-	    vop[2] = vec_oprnds2[i];
+	    vop[2] = vec_oprnds[2][i];
 
 	  if (masked_loop_p && mask_by_cond_expr)
 	    {
 	      tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
-					      vec_num * ncopies, vectype_in, i);
+					      vec_num * stmt_ncopies,
+					      stmt_vectype_in, i);
 	      build_vect_cond_expr (code, vop, mask, gsi);
 	    }
 
@@ -8725,16 +8912,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 
       if (slp_node)
 	slp_node->push_vec_def (new_stmt);
-      else if (single_defuse_cycle
-	       && i < ncopies - 1)
-	{
-	  if (reduc_index == 0)
-	    vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
-	  else if (reduc_index == 1)
-	    vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
-	  else if (reduc_index == 2)
-	    vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
-	}
+      else if (single_defuse_cycle && i < ncopies - 1)
+	vec_oprnds[reduc_index][i + 1] = gimple_get_lhs (new_stmt);
       else
 	STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
     }
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index dbdb59054e0..81036235a27 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -13357,6 +13357,8 @@ vect_analyze_stmt (vec_info *vinfo,
 				      NULL, NULL, node, cost_vec)
 	  || vectorizable_load (vinfo, stmt_info, NULL, NULL, node, cost_vec)
 	  || vectorizable_store (vinfo, stmt_info, NULL, NULL, node, cost_vec)
+	  || vectorizable_lane_reducing (as_a <loop_vec_info> (vinfo),
+					 stmt_info, node, cost_vec)
 	  || vectorizable_reduction (as_a <loop_vec_info> (vinfo), stmt_info,
 				     node, node_instance, cost_vec)
 	  || vectorizable_induction (as_a <loop_vec_info> (vinfo), stmt_info,
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 6bb0f5c3a56..3f7db707d97 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2443,6 +2443,8 @@ extern loop_vec_info vect_create_loop_vinfo (class loop *, vec_info_shared *,
 extern bool vectorizable_live_operation (vec_info *, stmt_vec_info,
 					 slp_tree, slp_instance, int,
 					 bool, stmt_vector_for_cost *);
+extern bool vectorizable_lane_reducing (loop_vec_info, stmt_vec_info,
+					slp_tree, stmt_vector_for_cost *);
 extern bool vectorizable_reduction (loop_vec_info, stmt_vec_info,
 				    slp_tree, slp_instance,
 				    stmt_vector_for_cost *);
-- 
2.17.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2024-06-14  4:00 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-30 14:54 [PATCH 5/6] vect: Support multiple lane-reducing operations for loop reduction [PR114440] Feng Xue OS
2024-05-31 14:57 ` Richard Biener
2024-06-02 14:13   ` Feng Xue OS
2024-06-04 13:17     ` Richard Biener
2024-06-14  4:00       ` Feng Xue OS

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).