[PATCH 00/10] vect: Reuse reduction accumulators between loops

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PATCH 00/10] vect: Reuse reduction accumulators between loops
@ 2021-07-08 12:38 Richard Sandiford
  2021-07-08 12:39 ` [PATCH 01/10] vect: Simplify epilogue reduction code Richard Sandiford
                   ` (10 more replies)
  0 siblings, 11 replies; 30+ messages in thread
From: Richard Sandiford @ 2021-07-08 12:38 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 3764 bytes --]

Quoting from the final patch in the series:

------------------------------------------------------------------------
This patch adds support for reusing a main loop's reduction accumulator
in an epilogue loop.  This in turn lets the loops share a single piece
of vector->scalar reduction code.

The patch has the following restrictions:

(1) The epilogue reduction can only operate on a single vector
    (e.g. ncopies must be 1 for non-SLP reductions, and the group size
    must be <= the element count for SLP reductions).

(2) Both loops must use the same vector mode for their accumulators.
    This means that the patch is restricted to targets that support
    --param vect-partial-vector-usage=1.

(3) The reduction must be a standard “tree code” reduction.

However, these restrictions could be lifted in future.  For example,
if the main loop operates on 128-bit vectors and the epilogue loop
operates on 64-bit vectors, we could in future reduce the 128-bit
vector by one stage and use the 64-bit result as the starting point
for the epilogue result.

The patch tries to handle chained SLP reductions, unchained SLP
reductions and non-SLP reductions.  It also handles cases in which
the epilogue loop is entered directly (rather than via the main loop)
and cases in which the epilogue loop can be skipped.
------------------------------------------------------------------------

However, it ended up being difficult to do that without some preparatory
clean-ups.  Some of them could probably stand on their own, but others
are a bit “meh” without the final patch to justify them.

The diff below shows the effect of the patch when compiling:

  unsigned short __attribute__((noipa))
  add_loop (unsigned short *x, int n)
  {
    unsigned short res = 0;
    for (int i = 0; i < n; ++i)
      res += x[i];
    return res;
  }

with -O3 --param vect-partial-vector-usage=1 on an SVE target:

add_loop:				add_loop:
.LFB0:					.LFB0:
	.cfi_startproc				.cfi_startproc
	mov	x4, x0		      <
	cmp	w1, 0				cmp	w1, 0
	ble	.L7				ble	.L7
	cnth	x0		      |		cnth	x4
	sub	w2, w1, #1			sub	w2, w1, #1
	sub	w3, w0, #1	      |		sub	w3, w4, #1
	cmp	w2, w3				cmp	w2, w3
	bcc	.L8				bcc	.L8
	sub	w0, w1, w0	      |		sub	w4, w1, w4
	mov	x3, 0				mov	x3, 0
	cnth	x5				cnth	x5
	mov	z0.b, #0			mov	z0.b, #0
	ptrue	p0.b, all			ptrue	p0.b, all
	.p2align 3,,7				.p2align 3,,7
.L4:					.L4:
	ld1h	z1.h, p0/z, [x4, x3, lsl 1] |	ld1h	z1.h, p0/z, [x0, x3, lsl 1]
	mov	x2, x3				mov	x2, x3
	add	x3, x3, x5			add	x3, x3, x5
	add	z0.h, z0.h, z1.h		add	z0.h, z0.h, z1.h
	cmp	w0, w3		      |		cmp	w4, w3
	bcs	.L4				bcs	.L4
	uaddv	d0, p0, z0.h	      <
	umov	w0, v0.h[0]	      <
	inch	x2				inch	x2
	and	w0, w0, 65535	      <
	cmp	w1, w2				cmp	w1, w2
	beq	.L2		      |		beq	.L6
.L3:					.L3:
	sub	w1, w1, w2			sub	w1, w1, w2
	mov	z1.b, #0	      |		add	x2, x0, w2, uxtw 1
	whilelo	p0.h, wzr, w1			whilelo	p0.h, wzr, w1
	add	x2, x4, w2, uxtw 1    |		ld1h	z1.h, p0/z, [x2]
	ptrue	p1.b, all	      |		add	z0.h, p0/m, z0.h, z1.h
	ld1h	z0.h, p0/z, [x2]      |	.L6:
	sel	z0.h, p0, z0.h, z1.h  |		ptrue	p0.b, all
	uaddv	d0, p1, z0.h	      |		uaddv	d0, p0, z0.h
	fmov	x1, d0		      |		umov	w0, v0.h[0]
	add	w0, w0, w1, uxth      <
	and	w0, w0, 65535			and	w0, w0, 65535
.L2:				      <
	ret					ret
	.p2align 2,,3				.p2align 2,,3
.L7:					.L7:
	mov	w0, 0				mov	w0, 0
	ret					ret
.L8:					.L8:
	mov	w2, 0				mov	w2, 0
	mov	w0, 0		      |		mov	z0.b, #0
	b	.L3				b	.L3
	.cfi_endproc				.cfi_endproc

Kewen, could you give this a spin on Power 10 to see whether it
works/helps there?  I've attached a combined diff.

Series tested on aarch64-linux-gnu and x86_64-linux-gnu.

Richard



[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: combined.diff --]
[-- Type: text/x-diff, Size: 87151 bytes --]

diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c
new file mode 100644
index 00000000000..fb817b73d77
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c
@@ -0,0 +1,77 @@
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+unsigned short __attribute__((noipa))
+add_loop (unsigned short *x, int n)
+{
+  unsigned short res = 0;
+  for (int i = 0; i < n; ++i)
+    res += x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+min_loop (unsigned short *x, int n)
+{
+  unsigned short res = ~0;
+  for (int i = 0; i < n; ++i)
+    res = res < x[i] ? res : x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+max_loop (unsigned short *x, int n)
+{
+  unsigned short res = 0;
+  for (int i = 0; i < n; ++i)
+    res = res > x[i] ? res : x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+and_loop (unsigned short *x, int n)
+{
+  unsigned short res = ~0;
+  for (int i = 0; i < n; ++i)
+    res &= x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+or_loop (unsigned short *x, int n)
+{
+  unsigned short res = 0;
+  for (int i = 0; i < n; ++i)
+    res |= x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+eor_loop (unsigned short *x, int n)
+{
+  unsigned short res = 0;
+  for (int i = 0; i < n; ++i)
+    res ^= x[i];
+  return res;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c
new file mode 100644
index 00000000000..1dd579be701
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c
@@ -0,0 +1,49 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_10.c"
+
+int
+main (void)
+{
+  unsigned short x[N];
+  for (int i = 0; i < N; ++i)
+    x[i] = (i + 1) * (i + 2);
+
+  if (add_loop (x, 0) != 0
+      || add_loop (x, 11) != 572
+      || add_loop (x, 0x100) != 22016
+      || add_loop (x, 0xfff) != 20480
+      || max_loop (x, 0) != 0
+      || max_loop (x, 11) != 132
+      || max_loop (x, 0x100) != 65280
+      || max_loop (x, 0xfff) != 65504
+      || or_loop (x, 0) != 0
+      || or_loop (x, 11) != 0xfe
+      || or_loop (x, 0x80) != 0x7ffe
+      || or_loop (x, 0xb4) != 0x7ffe
+      || or_loop (x, 0xb5) != 0xfffe
+      || eor_loop (x, 0) != 0
+      || eor_loop (x, 11) != 0xe8
+      || eor_loop (x, 0x100) != 0xcf00
+      || eor_loop (x, 0xfff) != 0xa000)
+    __builtin_abort ();
+
+  for (int i = 0; i < N; ++i)
+    x[i] = ~x[i];
+
+  if (min_loop (x, 0) != 65535
+      || min_loop (x, 11) != 65403
+      || min_loop (x, 0x100) != 255
+      || min_loop (x, 0xfff) != 31
+      || and_loop (x, 0) != 0xffff
+      || and_loop (x, 11) != 0xff01
+      || and_loop (x, 0x80) != 0x8001
+      || and_loop (x, 0xb4) != 0x8001
+      || and_loop (x, 0xb5) != 1)
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c
new file mode 100644
index 00000000000..f99ef4aa865
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c
@@ -0,0 +1,71 @@
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+unsigned short __attribute__((noipa))
+add_loop (unsigned short *x, unsigned short res)
+{
+  for (int i = 0; i < 0xfff; ++i)
+    res += x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+min_loop (unsigned short *x, unsigned short res)
+{
+  for (int i = 0; i < 0xfff; ++i)
+    res = res < x[i] ? res : x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+max_loop (unsigned short *x, unsigned short res)
+{
+  for (int i = 0; i < 0xfff; ++i)
+    res = res > x[i] ? res : x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+and_loop (unsigned short *x, unsigned short res)
+{
+  for (int i = 0; i < 0xfff; ++i)
+    res &= x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+or_loop (unsigned short *x, unsigned short res)
+{
+  for (int i = 0; i < 0xfff; ++i)
+    res |= x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+eor_loop (unsigned short *x, unsigned short res)
+{
+  for (int i = 0; i < 0xfff; ++i)
+    res ^= x[i];
+  return res;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c
new file mode 100644
index 00000000000..5b41560d2ef
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c
@@ -0,0 +1,34 @@
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_11.c"
+
+int
+main (void)
+{
+  unsigned short x[N];
+  for (int i = 0; i < N; ++i)
+    x[i] = (i + 1) * (i + 2);
+
+  if (add_loop (x, 42) != 20522
+      || max_loop (x, 65503) != 65504
+      || max_loop (x, 65505) != 65505
+      || or_loop (x, 0) != 0xfffe
+      || or_loop (x, 1) != 0xffff
+      || eor_loop (x, 0) != 0xa000
+      || eor_loop (x, 0xbfff) != 0x1fff)
+    __builtin_abort ();
+
+  for (int i = 0; i < N; ++i)
+    x[i] = ~x[i];
+
+  if (min_loop (x, 32) != 31
+      || min_loop (x, 30) != 30
+      || and_loop (x, 0xff) != 1
+      || and_loop (x, 0) != 0)
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c
new file mode 100644
index 00000000000..d32b81a61bc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c
@@ -0,0 +1,71 @@
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+unsigned short __attribute__((noipa))
+add_loop (unsigned short *x, int n, unsigned short res)
+{
+  for (int i = 0; i < n; ++i)
+    res += x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+min_loop (unsigned short *x, int n, unsigned short res)
+{
+  for (int i = 0; i < n; ++i)
+    res = res < x[i] ? res : x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+max_loop (unsigned short *x, int n, unsigned short res)
+{
+  for (int i = 0; i < n; ++i)
+    res = res > x[i] ? res : x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+and_loop (unsigned short *x, int n, unsigned short res)
+{
+  for (int i = 0; i < n; ++i)
+    res &= x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+or_loop (unsigned short *x, int n, unsigned short res)
+{
+  for (int i = 0; i < n; ++i)
+    res |= x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+eor_loop (unsigned short *x, int n, unsigned short res)
+{
+  for (int i = 0; i < n; ++i)
+    res ^= x[i];
+  return res;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c
new file mode 100644
index 00000000000..929b81a9705
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c
@@ -0,0 +1,66 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_12.c"
+
+int
+main (void)
+{
+  unsigned short x[N];
+  for (int i = 0; i < N; ++i)
+    x[i] = (i + 1) * (i + 2);
+
+  if (add_loop (x, 0, 10) != 10
+      || add_loop (x, 11, 42) != 614
+      || add_loop (x, 0x100, 84) != 22100
+      || add_loop (x, 0xfff, 20) != 20500
+      || max_loop (x, 0, 10) != 10
+      || max_loop (x, 11, 131) != 132
+      || max_loop (x, 11, 133) != 133
+      || max_loop (x, 0x100, 65279) != 65280
+      || max_loop (x, 0x100, 65281) != 65281
+      || max_loop (x, 0xfff, 65503) != 65504
+      || max_loop (x, 0xfff, 65505) != 65505
+      || or_loop (x, 0, 0x71) != 0x71
+      || or_loop (x, 11, 0) != 0xfe
+      || or_loop (x, 11, 0xb3c) != 0xbfe
+      || or_loop (x, 0x80, 0) != 0x7ffe
+      || or_loop (x, 0x80, 1) != 0x7fff
+      || or_loop (x, 0xb4, 0) != 0x7ffe
+      || or_loop (x, 0xb4, 1) != 0x7fff
+      || or_loop (x, 0xb5, 0) != 0xfffe
+      || or_loop (x, 0xb5, 1) != 0xffff
+      || eor_loop (x, 0, 0x3e) != 0x3e
+      || eor_loop (x, 11, 0) != 0xe8
+      || eor_loop (x, 11, 0x1ff) != 0x117
+      || eor_loop (x, 0x100, 0) != 0xcf00
+      || eor_loop (x, 0x100, 0xeee) != 0xc1ee
+      || eor_loop (x, 0xfff, 0) != 0xa000
+      || eor_loop (x, 0xfff, 0x8888) != 0x2888)
+    __builtin_abort ();
+
+  for (int i = 0; i < N; ++i)
+    x[i] = ~x[i];
+
+  if (min_loop (x, 0, 10000) != 10000
+      || min_loop (x, 11, 65404) != 65403
+      || min_loop (x, 11, 65402) != 65402
+      || min_loop (x, 0x100, 256) != 255
+      || min_loop (x, 0x100, 254) != 254
+      || min_loop (x, 0xfff, 32) != 31
+      || min_loop (x, 0xfff, 30) != 30
+      || and_loop (x, 0, 0x1234) != 0x1234
+      || and_loop (x, 11, 0xffff) != 0xff01
+      || and_loop (x, 11, 0xcdef) != 0xcd01
+      || and_loop (x, 0x80, 0xffff) != 0x8001
+      || and_loop (x, 0x80, 0xfffe) != 0x8000
+      || and_loop (x, 0xb4, 0xffff) != 0x8001
+      || and_loop (x, 0xb4, 0xfffe) != 0x8000
+      || and_loop (x, 0xb5, 0xffff) != 1
+      || and_loop (x, 0xb5, 0xfffe) != 0)
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c
new file mode 100644
index 00000000000..ce2b8f2fcdc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c
@@ -0,0 +1,101 @@
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+void __attribute__((noipa))
+add_loop (unsigned int *x, unsigned int *res)
+{
+  unsigned int res0 = res[0];
+  unsigned int res1 = res[1];
+  for (int i = 0; i < 0x7ff; ++i)
+    {
+      res0 += x[i * 2];
+      res1 += x[i * 2 + 1];
+    }
+  res[0] = res0;
+  res[1] = res1;
+}
+
+void __attribute__((noipa))
+min_loop (unsigned int *x, unsigned int *res)
+{
+  unsigned int res0 = res[0];
+  unsigned int res1 = res[1];
+  for (int i = 0; i < 0x7ff; ++i)
+    {
+      res0 = res0 < x[i * 2] ? res0 : x[i * 2];
+      res1 = res1 < x[i * 2 + 1] ? res1 : x[i * 2 + 1];
+    }
+  res[0] = res0;
+  res[1] = res1;
+}
+
+void __attribute__((noipa))
+max_loop (unsigned int *x, unsigned int *res)
+{
+  unsigned int res0 = res[0];
+  unsigned int res1 = res[1];
+  for (int i = 0; i < 0x7ff; ++i)
+    {
+      res0 = res0 > x[i * 2] ? res0 : x[i * 2];
+      res1 = res1 > x[i * 2 + 1] ? res1 : x[i * 2 + 1];
+    }
+  res[0] = res0;
+  res[1] = res1;
+}
+
+void __attribute__((noipa))
+and_loop (unsigned int *x, unsigned int *res)
+{
+  unsigned int res0 = res[0];
+  unsigned int res1 = res[1];
+  for (int i = 0; i < 0x7ff; ++i)
+    {
+      res0 &= x[i * 2];
+      res1 &= x[i * 2 + 1];
+    }
+  res[0] = res0;
+  res[1] = res1;
+}
+
+void __attribute__((noipa))
+or_loop (unsigned int *x, unsigned int *res)
+{
+  unsigned int res0 = res[0];
+  unsigned int res1 = res[1];
+  for (int i = 0; i < 0x7ff; ++i)
+    {
+      res0 |= x[i * 2];
+      res1 |= x[i * 2 + 1];
+    }
+  res[0] = res0;
+  res[1] = res1;
+}
+
+void __attribute__((noipa))
+eor_loop (unsigned int *x, unsigned int *res)
+{
+  unsigned int res0 = res[0];
+  unsigned int res1 = res[1];
+  for (int i = 0; i < 0x7ff; ++i)
+    {
+      res0 ^= x[i * 2];
+      res1 ^= x[i * 2 + 1];
+    }
+  res[0] = res0;
+  res[1] = res1;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c
new file mode 100644
index 00000000000..5514d8d6b3b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c
@@ -0,0 +1,61 @@
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_13.c"
+
+int
+main (void)
+{
+  unsigned int x[N];
+  for (int i = 0; i < N; ++i)
+    x[i] = ((i + 1) * (i + 2)) & 0xfffff;
+
+  unsigned int add_res[2] = { 42, 1111 };
+  add_loop (x, add_res);
+  if (add_res[0] != 968538154
+      || add_res[1] != 964340823)
+    __builtin_abort ();
+
+  unsigned int max_res1[2] = { 0, 0 };
+  max_loop (x, max_res1);
+  if (max_res1[0] != 1048150
+      || max_res1[1] != 1045506)
+    __builtin_abort ();
+
+  unsigned int max_res2[2] = { 1048151, 1045507 };
+  max_loop (x, max_res2);
+  if (max_res2[0] != 1048151
+      || max_res2[1] != 1045507)
+    __builtin_abort ();
+
+  unsigned int or_res[2] = { 0x1000000, 0x2000000 };
+  or_loop (x, or_res);
+  if (or_res[0] != 0x10ffffe
+      || or_res[1] != 0x20ffffe)
+    __builtin_abort ();
+
+  unsigned int eor_res[2] = { 0x1000000, 0x2000000 };
+  eor_loop (x, eor_res);
+  if (eor_res[0] != 0x1010000
+      || eor_res[1] != 0x20b5000)
+    __builtin_abort ();
+
+  for (int i = 0; i < N; ++i)
+    x[i] = ~x[i] & 0xfffff;
+
+  unsigned int min_res1[2] = { 500, 4000 };
+  min_loop (x, min_res1);
+  if (min_res1[0] != 425
+      || min_res1[1] != 3069)
+    __builtin_abort ();
+
+  unsigned int min_res2[2] = { 424, 3068 };
+  min_loop (x, min_res2);
+  if (min_res2[0] != 424
+      || min_res2[1] != 3068)
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c
new file mode 100644
index 00000000000..3be611e4b37
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c
@@ -0,0 +1,107 @@
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+void __attribute__((noipa))
+add_loop (unsigned int *x, int n, unsigned int *res)
+{
+  unsigned int res0 = res[0];
+  unsigned int res1 = res[1];
+  for (int i = 0; i < n; ++i)
+    {
+      res0 += x[i * 2];
+      res1 += x[i * 2 + 1];
+    }
+  res[0] = res0;
+  res[1] = res1;
+}
+
+void __attribute__((noipa))
+min_loop (unsigned int *x, int n, unsigned int *res)
+{
+  unsigned int res0 = res[0];
+  unsigned int res1 = res[1];
+  for (int i = 0; i < n; ++i)
+    {
+      res0 = res0 < x[i * 2] ? res0 : x[i * 2];
+      res1 = res1 < x[i * 2 + 1] ? res1 : x[i * 2 + 1];
+    }
+  res[0] = res0;
+  res[1] = res1;
+}
+
+void __attribute__((noipa))
+max_loop (unsigned int *x, int n, unsigned int *res)
+{
+  unsigned int res0 = res[0];
+  unsigned int res1 = res[1];
+  for (int i = 0; i < n; ++i)
+    {
+      res0 = res0 > x[i * 2] ? res0 : x[i * 2];
+      res1 = res1 > x[i * 2 + 1] ? res1 : x[i * 2 + 1];
+    }
+  res[0] = res0;
+  res[1] = res1;
+}
+
+void __attribute__((noipa))
+and_loop (unsigned int *x, int n, unsigned int *res)
+{
+  unsigned int res0 = res[0];
+  unsigned int res1 = res[1];
+  for (int i = 0; i < n; ++i)
+    {
+      res0 &= x[i * 2];
+      res1 &= x[i * 2 + 1];
+    }
+  res[0] = res0;
+  res[1] = res1;
+}
+
+void __attribute__((noipa))
+or_loop (unsigned int *x, int n, unsigned int *res)
+{
+  unsigned int res0 = res[0];
+  unsigned int res1 = res[1];
+  for (int i = 0; i < n; ++i)
+    {
+      res0 |= x[i * 2];
+      res1 |= x[i * 2 + 1];
+    }
+  res[0] = res0;
+  res[1] = res1;
+}
+
+void __attribute__((noipa))
+eor_loop (unsigned int *x, int n, unsigned int *res)
+{
+  unsigned int res0 = res[0];
+  unsigned int res1 = res[1];
+  for (int i = 0; i < n; ++i)
+    {
+      res0 ^= x[i * 2];
+      res1 ^= x[i * 2 + 1];
+    }
+  res[0] = res0;
+  res[1] = res1;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\t} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tuminv\t} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\t} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tandv\t} 2 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torv\t} 2 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teorv\t} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c
new file mode 100644
index 00000000000..ccaa770e9b2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c
@@ -0,0 +1,187 @@
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_14.c"
+
+int
+main (void)
+{
+  unsigned int x[N];
+  for (int i = 0; i < N; ++i)
+    x[i] = ((i + 1) * (i + 2)) & 0xfffff;
+
+  unsigned int add_res1[2] = { 11, 22 };
+  add_loop (x, 0, add_res1);
+  if (add_res1[0] != 11
+      || add_res1[1] != 22)
+    __builtin_abort ();
+
+  unsigned int add_res2[2] = { 10, 20 };
+  add_loop (x, 11, add_res2);
+  if (add_res2[0] != 1902
+      || add_res2[1] != 2176)
+    __builtin_abort ();
+
+  unsigned int add_res3[2] = { 15, 30 };
+  add_loop (x, 0x100, add_res3);
+  if (add_res3[0] != 22435087
+      || add_res3[1] != 22566686)
+    __builtin_abort ();
+
+  unsigned int add_res4[2] = { 100, 200 };
+  add_loop (x, 0x11f, add_res4);
+  if (add_res4[0] != 31602244
+      || add_res4[1] != 31767656)
+    __builtin_abort ();
+
+  unsigned int max_res1[2] = { 461, 500 };
+  max_loop (x, 11, max_res1);
+  if (max_res1[0] != 462
+      || max_res1[1] != 506)
+    __builtin_abort ();
+
+  unsigned int max_res2[2] = { 463, 507 };
+  max_loop (x, 11, max_res2);
+  if (max_res2[0] != 463
+      || max_res2[1] != 507)
+    __builtin_abort ();
+
+  unsigned int max_res3[2] = { 1000000, 1000000 };
+  max_loop (x, 0x200, max_res3);
+  if (max_res3[0] != 1047552
+      || max_res3[1] != 1045506)
+    __builtin_abort ();
+
+  unsigned int max_res4[2] = { 1047553, 1045507 };
+  max_loop (x, 0x200, max_res4);
+  if (max_res4[0] != 1047553
+      || max_res4[1] != 1045507)
+    __builtin_abort ();
+
+  unsigned int max_res5[2] = { 300000, 30000 };
+  max_loop (x, 0x11f, max_res5);
+  if (max_res5[0] != 328902
+      || max_res5[1] != 330050)
+    __builtin_abort ();
+
+  unsigned int max_res6[2] = { 328903, 330051 };
+  max_loop (x, 0x11f, max_res6);
+  if (max_res6[0] != 328903
+      || max_res6[1] != 330051)
+    __builtin_abort ();
+
+  unsigned int or_res1[2] = { 11, 22 };
+  or_loop (x, 0, or_res1);
+  if (or_res1[0] != 11
+      || or_res1[1] != 22)
+    __builtin_abort ();
+
+  unsigned int or_res2[2] = { 0x200000, 0xe00000 };
+  or_loop (x, 11, or_res2);
+  if (or_res2[0] != 0x2001fe
+      || or_res2[1] != 0xe001fe)
+    __builtin_abort ();
+
+  unsigned int or_res3[2] = { 0x800000, 0x700000 };
+  or_loop (x, 0x40, or_res3);
+  if (or_res3[0] != 0x803ffe
+      || or_res3[1] != 0x707ffe)
+    __builtin_abort ();
+
+  unsigned int or_res4[2] = { 0x100001, 0x300000 };
+  or_loop (x, 0x4f, or_res4);
+  if (or_res4[0] != 0x107fff
+      || or_res4[1] != 0x307ffe)
+    __builtin_abort ();
+
+  unsigned int eor_res1[2] = { 11, 22 };
+  eor_loop (x, 0, eor_res1);
+  if (eor_res1[0] != 11
+      || eor_res1[1] != 22)
+    __builtin_abort ();
+
+  unsigned int eor_res2[2] = { 0x2000ff, 0xe000ff };
+  eor_loop (x, 11, eor_res2);
+  if (eor_res2[0] != 0x2001cf
+      || eor_res2[1] != 0xe000b7)
+    __builtin_abort ();
+
+  unsigned int eor_res3[2] = { 0x805000, 0x70f000 };
+  eor_loop (x, 0x100, eor_res3);
+  if (eor_res3[0] != 0x824200
+      || eor_res3[1] != 0x77dc00)
+    __builtin_abort ();
+
+  unsigned int eor_res4[2] = { 0x101201, 0x300f00 };
+  eor_loop (x, 0x11f, eor_res4);
+  if (eor_res4[0] != 0x178801
+      || eor_res4[1] != 0x337240)
+    __builtin_abort ();
+
+  for (int i = 0; i < N; ++i)
+    x[i] = ~x[i] & 0xfffff;
+
+  unsigned int min_res1[2] = { 1048200, 1048100 };
+  min_loop (x, 11, min_res1);
+  if (min_res1[0] != 1048113
+      || min_res1[1] != 1048069)
+    __builtin_abort ();
+
+  unsigned int min_res2[2] = { 1048112, 1048068 };
+  min_loop (x, 11, min_res2);
+  if (min_res2[0] != 1048112
+      || min_res2[1] != 1048068)
+    __builtin_abort ();
+
+  unsigned int min_res3[2] = { 10000, 10000 };
+  min_loop (x, 0x200, min_res3);
+  if (min_res3[0] != 1023
+      || min_res3[1] != 3069)
+    __builtin_abort ();
+
+  unsigned int min_res4[2] = { 1022, 3068 };
+  min_loop (x, 0x200, min_res4);
+  if (min_res4[0] != 1022
+      || min_res4[1] != 3068)
+    __builtin_abort ();
+
+  unsigned int min_res5[2] = { 719680, 718530 };
+  min_loop (x, 0x11f, min_res5);
+  if (min_res5[0] != 719673
+      || min_res5[1] != 718525)
+    __builtin_abort ();
+
+  unsigned int min_res6[2] = { 719672, 718524 };
+  min_loop (x, 0x11f, min_res6);
+  if (min_res6[0] != 719672
+      || min_res6[1] != 718524)
+    __builtin_abort ();
+
+  unsigned int and_res1[2] = { 11, 22 };
+  and_loop (x, 0, and_res1);
+  if (and_res1[0] != 11
+      || and_res1[1] != 22)
+    __builtin_abort ();
+
+  unsigned int and_res2[2] = { 0xf5cff, 0xf78ff };
+  and_loop (x, 11, and_res2);
+  if (and_res2[0] != 0xf5c01
+      || and_res2[1] != 0xf7801)
+    __builtin_abort ();
+
+  unsigned int and_res3[2] = { 0x7efff, 0xecfff };
+  and_loop (x, 0x40, and_res3);
+  if (and_res3[0] != 0x7c001
+      || and_res3[1] != 0xe8001)
+    __builtin_abort ();
+
+  unsigned int and_res4[2] = { 0xffffff, 0xffffff };
+  and_loop (x, 0x4f, and_res4);
+  if (and_res4[0] != 0xf8001
+      || and_res4[1] != 0xf8001)
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c
new file mode 100644
index 00000000000..15b1ade30e2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c
@@ -0,0 +1,16 @@
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+int __attribute__((noipa))
+add_loop (int *x, int n, int res)
+{
+  for (int i = 0; i < n; ++i)
+    {
+      res += x[i * 2];
+      res += x[i * 2 + 1];
+    }
+  return res;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c
new file mode 100644
index 00000000000..3207fce5be3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c
@@ -0,0 +1,22 @@
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_15.c"
+
+int
+main (void)
+{
+  int x[N];
+  for (int i = 0; i < N; ++i)
+    x[i] = ((i + 1) * (i + 2)) & 0xfffff;
+
+  if (add_loop (x, 0, 33) != 33
+      || add_loop (x, 11, 30) != 4078
+      || add_loop (x, 0x100, 45) != 45001773
+      || add_loop (x, 0x11f, 300) != 63369900)
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c
new file mode 100644
index 00000000000..b839821d6bb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c
@@ -0,0 +1,77 @@
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+unsigned short __attribute__((noipa))
+add_loop (unsigned short *x)
+{
+  unsigned short res = 0;
+  for (int i = 0; i < 0xfff; ++i)
+    res += x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+min_loop (unsigned short *x)
+{
+  unsigned short res = ~0;
+  for (int i = 0; i < 0xfff; ++i)
+    res = res < x[i] ? res : x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+max_loop (unsigned short *x)
+{
+  unsigned short res = 0;
+  for (int i = 0; i < 0xfff; ++i)
+    res = res > x[i] ? res : x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+and_loop (unsigned short *x)
+{
+  unsigned short res = ~0;
+  for (int i = 0; i < 0xfff; ++i)
+    res &= x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+or_loop (unsigned short *x)
+{
+  unsigned short res = 0;
+  for (int i = 0; i < 0xfff; ++i)
+    res |= x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+eor_loop (unsigned short *x)
+{
+  unsigned short res = 0;
+  for (int i = 0; i < 0xfff; ++i)
+    res ^= x[i];
+  return res;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c
new file mode 100644
index 00000000000..aa248f53eaa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c
@@ -0,0 +1,29 @@
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_9.c"
+
+int
+main (void)
+{
+  unsigned short x[N];
+  for (int i = 0; i < N; ++i)
+    x[i] = (i + 1) * (i + 2);
+
+  if (add_loop (x) != 20480
+      || max_loop (x) != 65504
+      || or_loop (x) != 0xfffe
+      || eor_loop (x) != 0xa000)
+    __builtin_abort ();
+
+  for (int i = 0; i < N; ++i)
+    x[i] = ~x[i];
+
+  if (min_loop (x) != 31
+      || and_loop (x) != 1)
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index 2909e8a0fc3..b7b0523e3c8 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -2457,6 +2457,31 @@ vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
   return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
 }
 
+/* LOOP_VINFO is an epilogue loop and MAIN_LOOP_VALUE is available on exit
+   from the corresponding main loop.  Return a value that is available in
+   LOOP_VINFO's preheader, using SKIP_VALUE if the main loop is skipped.
+   Passing a null SKIP_VALUE is equivalent to passing zero.  */
+
+tree
+vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
+			   tree skip_value)
+{
+  if (!loop_vinfo->main_loop_edge)
+    return main_loop_value;
+
+  if (!skip_value)
+    skip_value = build_zero_cst (TREE_TYPE (main_loop_value));
+
+  tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
+  basic_block bb = loop_vinfo->main_loop_edge->dest;
+  gphi *new_phi = create_phi_node (phi_result, bb);
+  add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
+	       UNKNOWN_LOCATION);
+  add_phi_arg (new_phi, skip_value,
+	       loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
+  return phi_result;
+}
+
 /* Function vect_do_peeling.
 
    Input:
@@ -2986,6 +3011,8 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 					   skip_vector ? anchor : guard_bb,
 					   prob_epilog.invert (),
 					   irred_flag);
+	  if (vect_epilogues)
+	    epilogue_vinfo->skip_this_loop_edge = guard_e;
 	  slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
 					      single_exit (epilog));
 	  /* Only need to handle basic block before epilog loop if it's not
@@ -3057,6 +3084,8 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 	  add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
 		       UNKNOWN_LOCATION);
 	  niters = PHI_RESULT (new_phi);
+	  epilogue_vinfo->main_loop_edge = update_e;
+	  epilogue_vinfo->skip_main_loop_edge = skip_e;
 	}
 
       /* Set ADVANCE to the number of iterations performed by the previous
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index bc523d151c6..5e6c9b7c38a 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -19,6 +19,7 @@ You should have received a copy of the GNU General Public License
 along with GCC; see the file COPYING3.  If not see
 <http://www.gnu.org/licenses/>.  */
 
+#define INCLUDE_ALGORITHM
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
@@ -823,6 +824,10 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
     th (0),
     versioning_threshold (0),
     vectorization_factor (0),
+    main_loop_edge (nullptr),
+    skip_main_loop_edge (nullptr),
+    skip_this_loop_edge (nullptr),
+    reusable_accumulators (),
     max_vectorization_factor (0),
     mask_skip_niters (NULL_TREE),
     rgroup_compare_type (NULL_TREE),
@@ -3248,23 +3253,15 @@ reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
     }
 }
 
-/* If there is a neutral value X such that SLP reduction NODE would not
-   be affected by the introduction of additional X elements, return that X,
-   otherwise return null.  CODE is the code of the reduction and VECTOR_TYPE
-   is the vector type that would hold element X.  REDUC_CHAIN is true if
-   the SLP statements perform a single reduction, false if each statement
-   performs an independent reduction.  */
+/* If there is a neutral value X such that a reduction would not be affected
+   by the introduction of additional X elements, return that X, otherwise
+   return null.  CODE is the code of the reduction and SCALAR_TYPE is the
+   type of the scalar elements.  If the reduction has just a single initial
+   value then INITIAL_VALUE is that value, otherwise it is null.  */
 
 static tree
-neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
-			      tree_code code, bool reduc_chain)
+neutral_op_for_reduction (tree scalar_type, tree_code code, tree initial_value)
 {
-  vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
-  stmt_vec_info stmt_vinfo = stmts[0];
-  tree scalar_type = TREE_TYPE (vector_type);
-  class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
-  gcc_assert (loop);
-
   switch (code)
     {
     case WIDEN_SUM_EXPR:
@@ -3284,13 +3281,7 @@ neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
 
     case MAX_EXPR:
     case MIN_EXPR:
-      /* For MIN/MAX the initial values are neutral.  A reduction chain
-	 has only a single initial value, so that value is neutral for
-	 all statements.  */
-      if (reduc_chain)
-	return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
-				      loop_preheader_edge (loop));
-      return NULL_TREE;
+      return initial_value;
 
     default:
       return NULL_TREE;
@@ -4621,64 +4612,58 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
                  prologue_cost, epilogue_cost);
 }
 
+/* SEQ is a sequence of instructions that initialize the reduction
+   described by REDUC_INFO.  Emit them in the appropriate place.  */
 
+static void
+vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
+				stmt_vec_info reduc_info, gimple *seq)
+{
+  if (reduc_info->reused_accumulator)
+    {
+      /* When reusing an accumulator from the main loop, we only need
+	 initialization instructions if the main loop can be skipped.
+	 In that case, emit the initialization instructions at the end
+	 of the guard block that does the skip.  */
+      edge skip_edge = loop_vinfo->skip_main_loop_edge;
+      gcc_assert (skip_edge);
+      gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
+      gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
+    }
+  else
+    {
+      /* The normal case: emit the initialization instructions on the
+	 preheader edge.  */
+      class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+      gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
+    }
+}
 
 /* Function get_initial_def_for_reduction
 
    Input:
-   STMT_VINFO - a stmt that performs a reduction operation in the loop.
+   REDUC_INFO - the info_for_reduction
    INIT_VAL - the initial value of the reduction variable
+   NEUTRAL_OP - a value that has no effect on the reduction, as per
+		neutral_op_for_reduction
 
    Output:
-   ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
-        of the reduction (used for adjusting the epilog - see below).
    Return a vector variable, initialized according to the operation that
 	STMT_VINFO performs. This vector will be used as the initial value
 	of the vector of partial results.
 
-   Option1 (adjust in epilog): Initialize the vector as follows:
-     add/bit or/xor:    [0,0,...,0,0]
-     mult/bit and:      [1,1,...,1,1]
-     min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
-   and when necessary (e.g. add/mult case) let the caller know
-   that it needs to adjust the result by init_val.
-
-   Option2: Initialize the vector as follows:
-     add/bit or/xor:    [init_val,0,0,...,0]
-     mult/bit and:      [init_val,1,1,...,1]
-     min/max/cond_expr: [init_val,init_val,...,init_val]
-   and no adjustments are needed.
-
-   For example, for the following code:
-
-   s = init_val;
-   for (i=0;i<n;i++)
-     s = s + a[i];
-
-   STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
-   For a vector of 4 units, we want to return either [0,0,0,init_val],
-   or [0,0,0,0] and let the caller know that it needs to adjust
-   the result at the end by 'init_val'.
-
-   FORNOW, we are using the 'adjust in epilog' scheme, because this way the
-   initialization vector is simpler (same element in all entries), if
-   ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
-
-   A cost model should help decide between these two schemes.  */
+   The value we need is a vector in which element 0 has value INIT_VAL
+   and every other element has value NEUTRAL_OP.  */
 
 static tree
 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
-			       stmt_vec_info stmt_vinfo,
-			       enum tree_code code, tree init_val,
-                               tree *adjustment_def)
+			       stmt_vec_info reduc_info,
+			       tree init_val, tree neutral_op)
 {
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   tree scalar_type = TREE_TYPE (init_val);
   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
-  tree def_for_init;
   tree init_def;
-  REAL_VALUE_TYPE real_init_val = dconst0;
-  int int_init_val = 0;
   gimple_seq stmts = NULL;
 
   gcc_assert (vectype);
@@ -4686,115 +4671,64 @@ get_initial_def_for_reduction (loop_vec_info loop_vinfo,
   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
 	      || SCALAR_FLOAT_TYPE_P (scalar_type));
 
-  gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
-	      || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
+  gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
+	      || loop == (gimple_bb (reduc_info->stmt))->loop_father);
 
-  /* ADJUSTMENT_DEF is NULL when called from
-     vect_create_epilog_for_reduction to vectorize double reduction.  */
-  if (adjustment_def)
-    *adjustment_def = NULL;
-
-  switch (code)
+  if (operand_equal_p (init_val, neutral_op))
     {
-    case WIDEN_SUM_EXPR:
-    case DOT_PROD_EXPR:
-    case SAD_EXPR:
-    case PLUS_EXPR:
-    case MINUS_EXPR:
-    case BIT_IOR_EXPR:
-    case BIT_XOR_EXPR:
-    case MULT_EXPR:
-    case BIT_AND_EXPR:
-      {
-        if (code == MULT_EXPR)
-          {
-            real_init_val = dconst1;
-            int_init_val = 1;
-          }
-
-        if (code == BIT_AND_EXPR)
-          int_init_val = -1;
-
-        if (SCALAR_FLOAT_TYPE_P (scalar_type))
-          def_for_init = build_real (scalar_type, real_init_val);
-        else
-          def_for_init = build_int_cst (scalar_type, int_init_val);
-
-	if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
-	  {
-	    /* Option1: the first element is '0' or '1' as well.  */
-	    if (!operand_equal_p (def_for_init, init_val, 0))
-	      *adjustment_def = init_val;
-	    init_def = gimple_build_vector_from_val (&stmts, vectype,
-						     def_for_init);
-	  }
-	else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
-	  {
-	    /* Option2 (variable length): the first element is INIT_VAL.  */
-	    init_def = gimple_build_vector_from_val (&stmts, vectype,
-						     def_for_init);
-	    init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
-				     vectype, init_def, init_val);
-	  }
-	else
-	  {
-	    /* Option2: the first element is INIT_VAL.  */
-	    tree_vector_builder elts (vectype, 1, 2);
-	    elts.quick_push (init_val);
-	    elts.quick_push (def_for_init);
-	    init_def = gimple_build_vector (&stmts, &elts);
-	  }
-      }
-      break;
-
-    case MIN_EXPR:
-    case MAX_EXPR:
-    case COND_EXPR:
-      {
-	init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
-	init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
-      }
-      break;
-
-    default:
-      gcc_unreachable ();
+      /* If both elements are equal then the vector described above is
+	 just a splat.  */
+      neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
+      init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
+    }
+  else
+    {
+      neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
+      init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
+      if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
+	{
+	  /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
+	     element 0.  */
+	  init_def = gimple_build_vector_from_val (&stmts, vectype,
+						   neutral_op);
+	  init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
+				   vectype, init_def, init_val);
+	}
+      else
+	{
+	  /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}.  */
+	  tree_vector_builder elts (vectype, 1, 2);
+	  elts.quick_push (init_val);
+	  elts.quick_push (neutral_op);
+	  init_def = gimple_build_vector (&stmts, &elts);
+	}
     }
 
   if (stmts)
-    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
+    vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
   return init_def;
 }
 
-/* Get at the initial defs for the reduction PHIs in SLP_NODE.
-   NUMBER_OF_VECTORS is the number of vector defs to create.
-   If NEUTRAL_OP is nonnull, introducing extra elements of that
-   value will not change the result.  */
+/* Get at the initial defs for the reduction PHIs for REDUC_INFO,
+   which performs a reduction involving GROUP_SIZE scalar statements.
+   NUMBER_OF_VECTORS is the number of vector defs to create.  If NEUTRAL_OP
+   is nonnull, introducing extra elements of that value will not change the
+   result.  */
 
 static void
-get_initial_defs_for_reduction (vec_info *vinfo,
-				slp_tree slp_node,
+get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
+				stmt_vec_info reduc_info,
 				vec<tree> *vec_oprnds,
 				unsigned int number_of_vectors,
-				bool reduc_chain, tree neutral_op)
+				unsigned int group_size, tree neutral_op)
 {
-  vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
-  stmt_vec_info stmt_vinfo = stmts[0];
+  vec<tree> &initial_values = reduc_info->reduc_initial_values;
   unsigned HOST_WIDE_INT nunits;
   unsigned j, number_of_places_left_in_vector;
-  tree vector_type;
-  unsigned int group_size = stmts.length ();
+  tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
   unsigned int i;
-  class loop *loop;
-
-  vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
-
-  gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
 
-  loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
-  gcc_assert (loop);
-  edge pe = loop_preheader_edge (loop);
-
-  gcc_assert (!reduc_chain || neutral_op);
+  gcc_assert (group_size == initial_values.length () || neutral_op);
 
   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
      created vectors. It is greater than 1 if unrolling is performed.
@@ -4824,18 +4758,13 @@ get_initial_defs_for_reduction (vec_info *vinfo,
     {
       tree op;
       i = j % group_size;
-      stmt_vinfo = stmts[i];
 
       /* Get the def before the loop.  In reduction chain we have only
 	 one initial value.  Else we have as many as PHIs in the group.  */
-      if (reduc_chain)
-	op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
-      else if (((vec_oprnds->length () + 1) * nunits
-		- number_of_places_left_in_vector >= group_size)
-	       && neutral_op)
+      if (i >= initial_values.length () || (j > i && neutral_op))
 	op = neutral_op;
       else
-	op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
+	op = initial_values[i];
 
       /* Create 'vect_ = {op0,op1,...,opn}'.  */
       number_of_places_left_in_vector--;
@@ -4871,8 +4800,8 @@ get_initial_defs_for_reduction (vec_info *vinfo,
 	    {
 	      /* First time round, duplicate ELTS to fill the
 		 required number of vectors.  */
-	      duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
-					number_of_vectors, *vec_oprnds);
+	      duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
+					elts, number_of_vectors, *vec_oprnds);
 	      break;
 	    }
 	  vec_oprnds->quick_push (init);
@@ -4884,7 +4813,7 @@ get_initial_defs_for_reduction (vec_info *vinfo,
 	}
     }
   if (ctor_seq != NULL)
-    gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
+    vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
 }
 
 /* For a statement STMT_INFO taking part in a reduction operation return
@@ -4906,15 +4835,107 @@ info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
     }
   else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
     {
-      edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
-      stmt_vec_info info
-	  = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
+      stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
       if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
 	stmt_info = info;
     }
   return stmt_info;
 }
 
+/* REDUC_INFO describes a reduction in LOOP_VINFO that we are going to
+   vectorize.  See if LOOP_VINFO is an epilogue loop whose main loop had a
+   matching reduction that we can build on.  Adjust REDUC_INFO and return true
+   if so, otherwise return false.  */
+
+static bool
+vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
+				stmt_vec_info reduc_info)
+{
+  loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
+  if (!main_loop_vinfo)
+    return false;
+
+  if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
+    return false;
+
+  unsigned int num_phis = reduc_info->reduc_initial_values.length ();
+  auto_vec<tree, 16> main_loop_results (num_phis);
+  auto_vec<tree, 16> initial_values (num_phis);
+  if (edge main_loop_edge = loop_vinfo->main_loop_edge)
+    {
+      /* The epilogue loop can be entered either from the main loop or
+	 from an earlier guard block.  */
+      edge skip_edge = loop_vinfo->skip_main_loop_edge;
+      for (tree incoming_value : reduc_info->reduc_initial_values)
+	{
+	  /* Look for:
+
+	       INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
+				    INITIAL_VALUE(guard block)>.  */
+	  gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
+
+	  gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
+	  gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
+
+	  tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
+	  tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
+
+	  main_loop_results.quick_push (from_main_loop);
+	  initial_values.quick_push (from_skip);
+	}
+    }
+  else
+    /* The main loop dominates the epilogue loop.  */
+    main_loop_results.splice (reduc_info->reduc_initial_values);
+
+  /* See if the main loop has the kind of accumulator we need.  */
+  vect_reusable_accumulator *accumulator
+    = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
+  if (!accumulator
+      || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
+      || !std::equal (main_loop_results.begin (), main_loop_results.end (),
+		      accumulator->reduc_info->reduc_scalar_results.begin ()))
+    return false;
+
+  /* For now, only handle the case in which both loops are operating on the
+     same vector types.  In future we could reduce wider vectors to narrower
+     ones as well.  */
+  tree vectype = STMT_VINFO_VECTYPE (reduc_info);
+  tree old_vectype = TREE_TYPE (accumulator->reduc_input);
+  if (!useless_type_conversion_p (old_vectype, vectype))
+    return false;
+
+  /* Non-SLP reductions might apply an adjustment after the reduction
+     operation, in order to simplify the initialization of the accumulator.
+     If the epilogue loop carries on from where the main loop left off,
+     it should apply the same adjustment to the final reduction result.
+
+     If the epilogue loop can also be entered directly (rather than via
+     the main loop), we need to be able to handle that case in the same way,
+     with the same adjustment.  (In principle we could add a PHI node
+     to select the correct adjustment, but in practice that shouldn't be
+     necessary.)  */
+  tree main_adjustment
+    = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
+  if (loop_vinfo->main_loop_edge && main_adjustment)
+    {
+      gcc_assert (num_phis == 1);
+      tree initial_value = initial_values[0];
+      /* Check that we can use INITIAL_VALUE as the adjustment and
+	 initialize the accumulator with a neutral value instead.  */
+      if (!operand_equal_p (initial_value, main_adjustment))
+	return false;
+      tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
+      initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
+						    code, initial_value);
+    }
+  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
+  reduc_info->reduc_initial_values.truncate (0);
+  reduc_info->reduc_initial_values.splice (initial_values);
+  reduc_info->reused_accumulator = accumulator;
+  return true;
+}
+
 /* Function vect_create_epilog_for_reduction
 
    Create code at the loop-epilog to finalize the result of a reduction
@@ -5005,15 +5026,18 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
   imm_use_iterator imm_iter, phi_imm_iter;
   use_operand_p use_p, phi_use_p;
   gimple *use_stmt;
-  bool nested_in_vect_loop = false;
-  auto_vec<gimple *> new_phis;
+  auto_vec<tree> reduc_inputs;
   int j, i;
-  auto_vec<tree> scalar_results;
+  vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
   unsigned int group_size = 1, k;
   auto_vec<gimple *> phis;
-  bool slp_reduc = false;
+  /* SLP reduction without reduction chain, e.g.,
+     # a1 = phi <a2, a0>
+     # b1 = phi <b2, b0>
+     a2 = operation (a1)
+     b2 = operation (b1)  */
+  bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
   bool direct_slp_reduc;
-  tree new_phi_result;
   tree induction_index = NULL_TREE;
 
   if (slp_node)
@@ -5023,38 +5047,39 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
     {
       outer_loop = loop;
       loop = loop->inner;
-      nested_in_vect_loop = true;
-      gcc_assert (!slp_node);
+      gcc_assert (!slp_node && double_reduc);
     }
-  gcc_assert (!nested_in_vect_loop || double_reduc);
 
   vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
   gcc_assert (vectype);
   mode = TYPE_MODE (vectype);
 
-  tree initial_def = NULL;
   tree induc_val = NULL_TREE;
   tree adjustment_def = NULL;
   if (slp_node)
     ;
   else
     {
-      /* Get at the scalar def before the loop, that defines the initial value
-	 of the reduction variable.  */
-      initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
-					   loop_preheader_edge (loop));
       /* Optimize: for induction condition reduction, if we can't use zero
          for induc_val, use initial_def.  */
       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
 	induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
       else if (double_reduc)
 	;
-      else if (nested_in_vect_loop)
-	;
       else
 	adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
     }
 
+  stmt_vec_info single_live_out_stmt[] = { stmt_info };
+  array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
+  if (slp_reduc)
+    /* All statements produce live-out values.  */
+    live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
+  else if (slp_node)
+    /* The last statement in the reduction chain produces the live-out
+       value.  */
+    single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
+
   unsigned vec_num;
   int ncopies;
   if (slp_node)
@@ -5205,31 +5230,28 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
   if (double_reduc)
     loop = outer_loop;
   exit_bb = single_exit (loop)->dest;
-  new_phis.create (slp_node ? vec_num : ncopies);
+  exit_gsi = gsi_after_labels (exit_bb);
+  reduc_inputs.create (slp_node ? vec_num : ncopies);
   for (unsigned i = 0; i < vec_num; i++)
     {
+      gimple_seq stmts = NULL;
       if (slp_node)
 	def = vect_get_slp_vect_def (slp_node, i);
       else
 	def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
       for (j = 0; j < ncopies; j++)
-        {
+	{
 	  tree new_def = copy_ssa_name (def);
-          phi = create_phi_node (new_def, exit_bb);
-          if (j == 0)
-            new_phis.quick_push (phi);
-          else
-	    {
-	      def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
-	      new_phis.quick_push (phi);
-	    }
-
-          SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
-        }
+	  phi = create_phi_node (new_def, exit_bb);
+	  if (j)
+	    def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
+	  SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
+	  new_def = gimple_convert (&stmts, vectype, new_def);
+	  reduc_inputs.quick_push (new_def);
+	}
+      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
     }
 
-  exit_gsi = gsi_after_labels (exit_bb);
-
   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
          (i.e. when reduc_fn is not available) and in the final adjustment
 	 code (if needed).  Also get the original scalar reduction variable as
@@ -5253,13 +5275,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
   bitsize = TYPE_SIZE (scalar_type);
 
-  /* SLP reduction without reduction chain, e.g.,
-     # a1 = phi <a2, a0>
-     # b1 = phi <b2, b0>
-     a2 = operation (a1)
-     b2 = operation (b1)  */
-  slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
-
   /* True if we should implement SLP_REDUC using native reduction operations
      instead of scalar operations.  */
   direct_slp_reduc = (reduc_fn != IFN_LAST
@@ -5271,52 +5286,60 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
      a2 = operation (a1)
      a3 = operation (a2),
 
-     we may end up with more than one vector result.  Here we reduce them to
-     one vector.  */
-  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
+     we may end up with more than one vector result.  Here we reduce them
+     to one vector.
+
+     The same is true if we couldn't use a single defuse cycle.  */
+  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
+      || direct_slp_reduc
+      || ncopies > 1)
     {
       gimple_seq stmts = NULL;
-      tree first_vect = PHI_RESULT (new_phis[0]);
-      first_vect = gimple_convert (&stmts, vectype, first_vect);
-      for (k = 1; k < new_phis.length (); k++)
-        {
-	  gimple *next_phi = new_phis[k];
-          tree second_vect = PHI_RESULT (next_phi);
-	  second_vect = gimple_convert (&stmts, vectype, second_vect);
-          first_vect = gimple_build (&stmts, code, vectype,
-				     first_vect, second_vect);
-        }
+      tree single_input = reduc_inputs[0];
+      for (k = 1; k < reduc_inputs.length (); k++)
+	single_input = gimple_build (&stmts, code, vectype,
+				     single_input, reduc_inputs[k]);
       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
 
-      new_phi_result = first_vect;
-      new_phis.truncate (0);
-      new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
+      reduc_inputs.truncate (0);
+      reduc_inputs.safe_push (single_input);
     }
-  /* Likewise if we couldn't use a single defuse cycle.  */
-  else if (ncopies > 1)
+
+  tree orig_reduc_input = reduc_inputs[0];
+
+  /* If this loop is an epilogue loop that can be skipped after the
+     main loop, we can only share a reduction operation between the
+     main loop and the epilogue if we put it at the target of the
+     skip edge.
+
+     We can still reuse accumulators if this check fails.  Doing so has
+     the minor(?) benefit of making the epilogue loop's scalar result
+     independent of the main loop's scalar result.  */
+  bool unify_with_main_loop_p = false;
+  if (reduc_info->reused_accumulator
+      && loop_vinfo->skip_this_loop_edge
+      && single_succ_p (exit_bb)
+      && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
     {
-      gimple_seq stmts = NULL;
-      tree first_vect = PHI_RESULT (new_phis[0]);
-      first_vect = gimple_convert (&stmts, vectype, first_vect);
-      for (int k = 1; k < ncopies; ++k)
-	{
-	  tree second_vect = PHI_RESULT (new_phis[k]);
-	  second_vect = gimple_convert (&stmts, vectype, second_vect);
-	  first_vect = gimple_build (&stmts, code, vectype,
-				     first_vect, second_vect);
-	}
-      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
-      new_phi_result = first_vect;
-      new_phis.truncate (0);
-      new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
+      unify_with_main_loop_p = true;
+
+      basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
+      reduc_inputs[0] = make_ssa_name (vectype);
+      gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
+      add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
+		   UNKNOWN_LOCATION);
+      add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
+		   loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
+      exit_gsi = gsi_after_labels (reduc_block);
     }
-  else
-    new_phi_result = PHI_RESULT (new_phis[0]);
+
+  /* Shouldn't be used beyond this point.  */
+  exit_bb = nullptr;
 
   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
       && reduc_fn != IFN_LAST)
     {
-      /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
+      /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
 	 various data values where the condition matched and another vector
 	 (INDUCTION_INDEX) containing all the indexes of those matches.  We
 	 need to extract the last matching index (which will be the index with
@@ -5346,10 +5369,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       /* Vector of {0, 0, 0,...}.  */
       tree zero_vec = build_zero_cst (vectype);
 
-      gimple_seq stmts = NULL;
-      new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
-      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
-
       /* Find maximum value from the vector of found indexes.  */
       tree max_index = make_ssa_name (index_scalar_type);
       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
@@ -5367,7 +5386,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 
       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
 	 with the vector (INDUCTION_INDEX) of found indexes, choosing values
-	 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
+	 from the data vector (REDUC_INPUTS[0]) for matches, 0 (ZERO_VEC)
 	 otherwise.  Only one value should match, resulting in a vector
 	 (VEC_COND) with one data value and the rest zeros.
 	 In the case where the loop never made any matches, every index will
@@ -5386,7 +5405,8 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	 zero.  */
       tree vec_cond = make_ssa_name (vectype);
       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
-						   vec_compare, new_phi_result,
+						   vec_compare,
+						   reduc_inputs[0],
 						   zero_vec);
       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
 
@@ -5416,7 +5436,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 
       /* Convert the reduced value back to the result type and set as the
 	 result.  */
-      stmts = NULL;
+      gimple_seq stmts = NULL;
       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
 			       data_reduc);
       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
@@ -5434,7 +5454,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	     val = data_reduc[i], idx_val = induction_index[i];
 	 return val;  */
 
-      tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
+      tree data_eltype = TREE_TYPE (vectype);
       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
@@ -5458,7 +5478,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	  epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
 					     build3 (BIT_FIELD_REF,
 						     data_eltype,
-						     new_phi_result,
+						     reduc_inputs[0],
 						     bitsize_int (el_size),
 						     bitsize_int (off)));
 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
@@ -5510,10 +5530,9 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 			 "Reduce using direct vector reduction.\n");
 
       gimple_seq stmts = NULL;
-      new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
-      vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
+      vec_elem_type = TREE_TYPE (vectype);
       new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
-			       vec_elem_type, new_phi_result);
+			       vec_elem_type, reduc_inputs[0]);
       new_temp = gimple_convert (&stmts, scalar_type, new_temp);
       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
 
@@ -5526,6 +5545,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	     the same as initial_def already.  */
 	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
 				  induc_val);
+	  tree initial_def = reduc_info->reduc_initial_values[0];
 
 	  tmp = make_ssa_name (new_scalar_dest);
 	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
@@ -5543,12 +5563,9 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	 neutral value.  We can then do a normal reduction on each vector.  */
 
       /* Enforced by vectorizable_reduction.  */
-      gcc_assert (new_phis.length () == 1);
+      gcc_assert (reduc_inputs.length () == 1);
       gcc_assert (pow2p_hwi (group_size));
 
-      slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
-      vec<stmt_vec_info> orig_phis
-	= SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
       gimple_seq seq = NULL;
 
       /* Build a vector {0, 1, 2, ...}, with the same number of elements
@@ -5571,10 +5588,11 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       tree neutral_op = NULL_TREE;
       if (slp_node)
 	{
-	  stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
-	  neutral_op
-	    = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
-					    vectype, code, first != NULL);
+	  tree initial_value = NULL_TREE;
+	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
+	    initial_value = reduc_info->reduc_initial_values[0];
+	  neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
+						 initial_value);
 	}
       if (neutral_op)
 	vector_identity = gimple_build_vector_from_val (&seq, vectype,
@@ -5586,9 +5604,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	     for MIN and MAX reduction, for example.  */
 	  if (!neutral_op)
 	    {
-	      tree scalar_value
-		= PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
-					 loop_preheader_edge (loop));
+	      tree scalar_value = reduc_info->reduc_initial_values[i];
 	      scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
 					     scalar_value);
 	      vector_identity = gimple_build_vector_from_val (&seq, vectype,
@@ -5599,7 +5615,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 
 	     sel[j] = (index[j] == i);
 
-	     which selects the elements of NEW_PHI_RESULT that should
+	     which selects the elements of REDUC_INPUTS[0] that should
 	     be included in the result.  */
 	  tree compare_val = build_int_cst (index_elt_type, i);
 	  compare_val = build_vector_from_val (index_type, compare_val);
@@ -5608,11 +5624,11 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 
 	  /* Calculate the equivalent of:
 
-	     vec = seq ? new_phi_result : vector_identity;
+	     vec = seq ? reduc_inputs[0] : vector_identity;
 
 	     VEC is now suitable for a full vector reduction.  */
 	  tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
-				   sel, new_phi_result, vector_identity);
+				   sel, reduc_inputs[0], vector_identity);
 
 	  /* Do the reduction and convert it to the appropriate type.  */
 	  tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
@@ -5627,7 +5643,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       bool reduce_with_shift;
       tree vec_temp;
 
-      gcc_assert (slp_reduc || new_phis.length () == 1);
+      gcc_assert (slp_reduc || reduc_inputs.length () == 1);
 
       /* See if the target wants to do the final (shift) reduction
 	 in a vector mode of smaller size and first reduce upper/lower
@@ -5637,7 +5653,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
       unsigned nunits1 = nunits;
       if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
-	  && new_phis.length () == 1)
+	  && reduc_inputs.length () == 1)
 	{
 	  nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
 	  /* For SLP reductions we have to make sure lanes match up, but
@@ -5669,7 +5685,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 
       /* First reduce the vector to the desired vector size we should
 	 do shift reduction on by combining upper and lower halves.  */
-      new_temp = new_phi_result;
+      new_temp = reduc_inputs[0];
       while (nunits > nunits1)
 	{
 	  nunits /= 2;
@@ -5748,7 +5764,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	  new_temp = make_ssa_name (vectype1);
 	  epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
-	  new_phis[0] = epilog_stmt;
+	  reduc_inputs[0] = new_temp;
 	}
 
       if (reduce_with_shift && !slp_reduc)
@@ -5829,13 +5845,9 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	  int element_bitsize = tree_to_uhwi (bitsize);
 	  tree compute_type = TREE_TYPE (vectype);
 	  gimple_seq stmts = NULL;
-          FOR_EACH_VEC_ELT (new_phis, i, new_phi)
+	  FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
             {
               int bit_offset;
-              if (gimple_code (new_phi) == GIMPLE_PHI)
-                vec_temp = PHI_RESULT (new_phi);
-              else
-                vec_temp = gimple_assign_lhs (new_phi);
 	      new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
 				       vec_temp, bitsize, bitsize_zero_node);
 
@@ -5882,6 +5894,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 					  first_res, res);
                   scalar_results[j % group_size] = new_res;
                 }
+	      scalar_results.truncate (group_size);
 	      for (k = 0; k < group_size; k++)
 		scalar_results[k] = gimple_convert (&stmts, scalar_type,
 						    scalar_results[k]);
@@ -5905,6 +5918,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	     the same as initial_def already.  */
 	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
 				  induc_val);
+	  tree initial_def = reduc_info->reduc_initial_values[0];
 
 	  tree tmp = make_ssa_name (new_scalar_dest);
 	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
@@ -5923,13 +5937,12 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
     {
       gcc_assert (!slp_reduc);
       gimple_seq stmts = NULL;
-      if (nested_in_vect_loop)
+      if (double_reduc)
 	{
-          new_phi = new_phis[0];
 	  gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
 	  adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
 	  new_temp = gimple_build (&stmts, code, vectype,
-				   PHI_RESULT (new_phi), adjustment_def);
+				   reduc_inputs[0], adjustment_def);
 	}
       else
 	{
@@ -5942,21 +5955,17 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 
       epilog_stmt = gimple_seq_last_stmt (stmts);
       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
-      if (nested_in_vect_loop)
-        {
-          if (!double_reduc)
-            scalar_results.quick_push (new_temp);
-          else
-            scalar_results[0] = new_temp;
-        }
-      else
-        scalar_results[0] = new_temp;
-
-      new_phis[0] = epilog_stmt;
+      scalar_results[0] = new_temp;
     }
 
+  /* Record this operation if it could be reused by the epilogue loop.  */
+  if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
+      && !double_reduc)
+    loop_vinfo->reusable_accumulators.put (scalar_results[0],
+					   { orig_reduc_input, reduc_info });
+
   if (double_reduc)
-    loop = loop->inner;
+    loop = outer_loop;
 
   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
           phis with new adjusted scalar results, i.e., replace use <s_out0>
@@ -5983,47 +5992,11 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
           use <s_out4>  
           use <s_out4> */
 
-
-  /* In SLP reduction chain we reduce vector results into one vector if
-     necessary, hence we set here REDUC_GROUP_SIZE to 1.  SCALAR_DEST is the
-     LHS of the last stmt in the reduction chain, since we are looking for
-     the loop exit phi node.  */
-  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
+  gcc_assert (live_out_stmts.size () == scalar_results.length ());
+  for (k = 0; k < live_out_stmts.size (); k++)
     {
-      stmt_vec_info dest_stmt_info
-	= vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
-      scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
-      group_size = 1;
-    }
-
-  /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
-     case that REDUC_GROUP_SIZE is greater than vectorization factor).
-     Therefore, we need to match SCALAR_RESULTS with corresponding statements.
-     The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
-     correspond to the first vector stmt, etc.
-     (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)).  */
-  if (group_size > new_phis.length ())
-    gcc_assert (!(group_size % new_phis.length ()));
-
-  for (k = 0; k < group_size; k++)
-    {
-      if (slp_reduc)
-        {
-	  stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
-
-	  orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
-	  /* SLP statements can't participate in patterns.  */
-	  gcc_assert (!orig_stmt_info);
-	  scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
-        }
-
-      if (nested_in_vect_loop)
-        {
-          if (double_reduc)
-            loop = outer_loop;
-          else
-	    gcc_unreachable ();
-        }
+      stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
+      scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
 
       phis.create (3);
       /* Find the loop-closed-use at the loop exit of the original scalar
@@ -6058,6 +6031,17 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
         {
           /* Replace the uses:  */
           orig_name = PHI_RESULT (exit_phi);
+
+	  /* Look for a single use at the target of the skip edge.  */
+	  if (unify_with_main_loop_p)
+	    {
+	      use_operand_p use_p;
+	      gimple *user;
+	      if (!single_imm_use (orig_name, &use_p, &user))
+		gcc_unreachable ();
+	      orig_name = gimple_get_lhs (user);
+	    }
+
           scalar_result = scalar_results[k];
           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
 	    {
@@ -6830,10 +6814,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
       else if (cond_reduc_dt == vect_constant_def)
 	{
 	  enum vect_def_type cond_initial_dt;
-	  tree cond_initial_val
-	    = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
-
-	  gcc_assert (cond_reduc_val != NULL_TREE);
+	  tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
 	  vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
 	  if (cond_initial_dt == vect_constant_def
 	      && types_compatible_p (TREE_TYPE (cond_initial_val),
@@ -7026,9 +7007,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   /* For SLP reductions, see if there is a neutral value we can use.  */
   tree neutral_op = NULL_TREE;
   if (slp_node)
-    neutral_op = neutral_op_for_slp_reduction
-      (slp_node_instance->reduc_phis, vectype_out, orig_code,
-       REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
+    {
+      tree initial_value = NULL_TREE;
+      if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
+	initial_value = vect_phi_initial_value (reduc_def_phi);
+      neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
+					     orig_code, initial_value);
+    }
 
   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
     {
@@ -7578,7 +7563,7 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
 					       vectype_out);
 
   /* Get the loop-entry arguments.  */
-  tree vec_initial_def;
+  tree vec_initial_def = NULL_TREE;
   auto_vec<tree> vec_initial_defs;
   if (slp_node)
     {
@@ -7592,22 +7577,40 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
       else
 	{
 	  gcc_assert (slp_node == slp_node_instance->reduc_phis);
-	  stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
-	  tree neutral_op
-	      = neutral_op_for_slp_reduction (slp_node, vectype_out,
-					      STMT_VINFO_REDUC_CODE (reduc_info),
-					      first != NULL);
-	  get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
-					  &vec_initial_defs, vec_num,
-					  first != NULL, neutral_op);
+	  vec<tree> &initial_values = reduc_info->reduc_initial_values;
+	  vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
+
+	  unsigned int num_phis = stmts.length ();
+	  if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
+	    num_phis = 1;
+	  initial_values.reserve (num_phis);
+	  for (unsigned int i = 0; i < num_phis; ++i)
+	    {
+	      gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
+	      initial_values.quick_push (vect_phi_initial_value (this_phi));
+	    }
+	  if (vec_num == 1)
+	    vect_find_reusable_accumulator (loop_vinfo, reduc_info);
+	  if (!initial_values.is_empty ())
+	    {
+	      tree initial_value
+		= (num_phis == 1 ? initial_values[0] : NULL_TREE);
+	      tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
+	      tree neutral_op
+		= neutral_op_for_reduction (TREE_TYPE (vectype_out),
+					    code, initial_value);
+	      get_initial_defs_for_reduction (loop_vinfo, reduc_info,
+					      &vec_initial_defs, vec_num,
+					      stmts.length (), neutral_op);
+	    }
 	}
     }
   else
     {
       /* Get at the scalar def before the loop, that defines the initial
 	 value of the reduction variable.  */
-      tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
-						loop_preheader_edge (loop));
+      tree initial_def = vect_phi_initial_value (phi);
+      reduc_info->reduc_initial_values.safe_push (initial_def);
       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
 	 and we can't use zero for induc_val, use initial_def.  Similarly
 	 for REDUC_MIN and initial_def larger than the base.  */
@@ -7627,9 +7630,6 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
 	      STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
 	    }
 	  vec_initial_def = build_vector_from_val (vectype_out, induc_val);
-	  vec_initial_defs.create (ncopies);
-	  for (i = 0; i < ncopies; ++i)
-	    vec_initial_defs.quick_push (vec_initial_def);
 	}
       else if (nested_cycle)
 	{
@@ -7639,23 +7639,59 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
 					 ncopies, initial_def,
 					 &vec_initial_defs);
 	}
+      else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
+	       || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
+	/* Fill the initial vector with the initial scalar value.  */
+	vec_initial_def
+	  = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
+					   initial_def, initial_def);
       else
 	{
-	  tree adjustment_def = NULL_TREE;
-	  tree *adjustment_defp = &adjustment_def;
-	  enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
-	  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
-	    adjustment_defp = NULL;
-	  vec_initial_def
-	    = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
-					     initial_def, adjustment_defp);
-	  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
-	  vec_initial_defs.create (ncopies);
-	  for (i = 0; i < ncopies; ++i)
-	    vec_initial_defs.quick_push (vec_initial_def);
+	  if (ncopies == 1)
+	    vect_find_reusable_accumulator (loop_vinfo, reduc_info);
+	  if (!reduc_info->reduc_initial_values.is_empty ())
+	    {
+	      initial_def = reduc_info->reduc_initial_values[0];
+	      enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
+	      tree neutral_op
+		= neutral_op_for_reduction (TREE_TYPE (initial_def),
+					    code, initial_def);
+	      gcc_assert (neutral_op);
+	      /* Try to simplify the vector initialization by applying an
+		 adjustment after the reduction has been performed.  */
+	      if (!reduc_info->reused_accumulator
+		  && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
+		  && !operand_equal_p (neutral_op, initial_def))
+		{
+		  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
+		    = initial_def;
+		  initial_def = neutral_op;
+		}
+	      vec_initial_def
+		= get_initial_def_for_reduction (loop_vinfo, reduc_info,
+						 initial_def, neutral_op);
+	    }
 	}
     }
 
+  if (vec_initial_def)
+    {
+      vec_initial_defs.create (ncopies);
+      for (i = 0; i < ncopies; ++i)
+	vec_initial_defs.quick_push (vec_initial_def);
+    }
+
+  if (auto *accumulator = reduc_info->reused_accumulator)
+    {
+      if (loop_vinfo->main_loop_edge)
+	vec_initial_defs[0]
+	  = vect_get_main_loop_result (loop_vinfo, accumulator->reduc_input,
+				       vec_initial_defs[0]);
+      else
+	vec_initial_defs.safe_push (accumulator->reduc_input);
+      gcc_assert (vec_initial_defs.length () == 1);
+    }
+
   /* Generate the reduction PHIs upfront.  */
   for (i = 0; i < vec_num; i++)
     {
@@ -8253,8 +8289,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
       return true;
     }
 
-  init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
-				     loop_preheader_edge (iv_loop));
+  init_expr = vect_phi_initial_value (phi);
 
   gimple_seq stmts = NULL;
   if (!nested_in_vect_loop)
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 9748043f3ee..f1035a83826 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -694,6 +694,8 @@ vec_info::new_stmt_vec_info (gimple *stmt)
   STMT_VINFO_SLP_VECT_ONLY (res) = false;
   STMT_VINFO_SLP_VECT_ONLY_PATTERN (res) = false;
   STMT_VINFO_VEC_STMTS (res) = vNULL;
+  res->reduc_initial_values = vNULL;
+  res->reduc_scalar_results = vNULL;
 
   if (is_a <loop_vec_info> (this)
       && gimple_code (stmt) == GIMPLE_PHI
@@ -755,6 +757,8 @@ vec_info::free_stmt_vec_info (stmt_vec_info stmt_info)
 	release_ssa_name (lhs);
     }
 
+  stmt_info->reduc_initial_values.release ();
+  stmt_info->reduc_scalar_results.release ();
   STMT_VINFO_SIMD_CLONE_INFO (stmt_info).release ();
   STMT_VINFO_VEC_STMTS (stmt_info).release ();
   free (stmt_info);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index fa28336d429..ed7a7738880 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -27,7 +27,7 @@ typedef class _stmt_vec_info *stmt_vec_info;
 #include "tree-hash-traits.h"
 #include "target.h"
 #include "internal-fn.h"
-
+#include "tree-ssa-operands.h"
 
 /* Used for naming of new temporaries.  */
 enum vect_var_kind {
@@ -551,6 +551,18 @@ typedef auto_vec<rgroup_controls> vec_loop_lens;
 
 typedef auto_vec<std::pair<data_reference*, tree> > drs_init_vec;
 
+/* Information about a reduction accumulator from the main loop that could
+   conceivably be reused as the input to a reduction in an epilogue loop.  */
+struct vect_reusable_accumulator {
+  /* The final value of the accumulator, which forms the input to the
+     reduction operation.  */
+  tree reduc_input;
+
+  /* The stmt_vec_info that describes the reduction (i.e. the one for
+     which is_reduc_info is true).  */
+  stmt_vec_info reduc_info;
+};
+
 /*-----------------------------------------------------------------*/
 /* Info on vectorized loops.                                       */
 /*-----------------------------------------------------------------*/
@@ -588,6 +600,23 @@ public:
   /* Unrolling factor  */
   poly_uint64 vectorization_factor;
 
+  /* If this loop is an epilogue loop whose main loop can be skipped,
+     MAIN_LOOP_EDGE is the edge from the main loop to this loop's
+     preheader.  SKIP_MAIN_LOOP_EDGE is then the edge that skips the
+     main loop and goes straight to this loop's preheader.
+
+     Both fields are null otherwise.  */
+  edge main_loop_edge;
+  edge skip_main_loop_edge;
+
+  /* If this loop is an epilogue loop that might be skipped after executing
+     the main loop, this edge is the one that skips the epilogue.  */
+  edge skip_this_loop_edge;
+
+  /* After vectorization, maps live-out SSA names to information about
+     the reductions that generated them.  */
+  hash_map<tree, vect_reusable_accumulator> reusable_accumulators;
+
   /* Maximum runtime vectorization factor, or MAX_VECTORIZATION_FACTOR
      if there is no particular limit.  */
   unsigned HOST_WIDE_INT max_vectorization_factor;
@@ -1186,6 +1215,21 @@ public:
   /* The vector type for performing the actual reduction.  */
   tree reduc_vectype;
 
+  /* If IS_REDUC_INFO is true and if the reduction is operating on N
+     elements in parallel, this vector gives the initial values of these
+     N elements.  */
+  vec<tree> reduc_initial_values;
+
+  /* If IS_REDUC_INFO is true and if the reduction is operating on N
+     elements in parallel, this vector gives the scalar result of each
+     reduction.  */
+  vec<tree> reduc_scalar_results;
+
+  /* Only meaningful if IS_REDUC_INFO.  If non-null, the reduction is
+     being performed by an epilogue loop and we have decided to reuse
+     this accumulator from the main loop.  */
+  vect_reusable_accumulator *reused_accumulator;
+
   /* Whether we force a single cycle PHI during reduction vectorization.  */
   bool force_single_cycle;
 
@@ -1369,6 +1413,19 @@ nested_in_vect_loop_p (class loop *loop, stmt_vec_info stmt_info)
 	  && (loop->inner == (gimple_bb (stmt_info->stmt))->loop_father));
 }
 
+/* PHI is either a scalar reduction phi or a scalar induction phi.
+   Return the initial value of the variable on entry to the containing
+   loop.  */
+
+static inline tree
+vect_phi_initial_value (gphi *phi)
+{
+  basic_block bb = gimple_bb (phi);
+  edge pe = loop_preheader_edge (bb->loop_father);
+  gcc_assert (pe->dest == bb);
+  return PHI_ARG_DEF_FROM_EDGE (phi, pe);
+}
+
 /* Return true if STMT_INFO should produce a vector mask type rather than
    a normal nonmask type.  */
 
@@ -1799,6 +1856,7 @@ class loop *vect_loop_versioning (loop_vec_info, gimple *);
 extern class loop *vect_do_peeling (loop_vec_info, tree, tree,
 				    tree *, tree *, tree *, int, bool, bool,
 				    tree *);
+extern tree vect_get_main_loop_result (loop_vec_info, tree, tree = NULL_TREE);
 extern void vect_prepare_for_masked_peels (loop_vec_info);
 extern dump_user_location_t find_loop_location (class loop *);
 extern bool vect_can_advance_ivs_p (loop_vec_info);

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH 01/10] vect: Simplify epilogue reduction code
  2021-07-08 12:38 [PATCH 00/10] vect: Reuse reduction accumulators between loops Richard Sandiford
@ 2021-07-08 12:39 ` Richard Sandiford
  2021-07-08 12:58   ` Richard Biener
  2021-07-08 12:39 ` [PATCH 02/10] vect: Create array_slice of live-out stmts Richard Sandiford
                   ` (9 subsequent siblings)
  10 siblings, 1 reply; 30+ messages in thread
From: Richard Sandiford @ 2021-07-08 12:39 UTC (permalink / raw)
  To: gcc-patches

vect_create_epilog_for_reduction only handles two cases: single-loop
reductions and double reductions.  “nested cycles” (i.e. reductions
in the inner loop when vectorising an outer loop) are handled elsewhere
and don't need a vector->scalar reduction.

The function had variables called nested_in_vect_loop and double_reduc
and asserted that nested_in_vect_loop implied double_reduc, but it
still had code to handle nested_in_vect_loop && !double_reduc.
This patch removes that and uses double_reduc everywhere.

gcc/
	* tree-vect-loop.c (vect_create_epilog_for_reduction): Remove
	nested_in_vect_loop and use double_reduc everywhere.  Remove dead
	assignment to "loop".
---
 gcc/tree-vect-loop.c | 30 ++++--------------------------
 1 file changed, 4 insertions(+), 26 deletions(-)

diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index bc523d151c6..7c3e3352b43 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -5005,7 +5005,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
   imm_use_iterator imm_iter, phi_imm_iter;
   use_operand_p use_p, phi_use_p;
   gimple *use_stmt;
-  bool nested_in_vect_loop = false;
   auto_vec<gimple *> new_phis;
   int j, i;
   auto_vec<tree> scalar_results;
@@ -5023,10 +5022,8 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
     {
       outer_loop = loop;
       loop = loop->inner;
-      nested_in_vect_loop = true;
-      gcc_assert (!slp_node);
+      gcc_assert (!slp_node && double_reduc);
     }
-  gcc_assert (!nested_in_vect_loop || double_reduc);
 
   vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
   gcc_assert (vectype);
@@ -5049,8 +5046,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
       else if (double_reduc)
 	;
-      else if (nested_in_vect_loop)
-	;
       else
 	adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
     }
@@ -5923,7 +5918,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
     {
       gcc_assert (!slp_reduc);
       gimple_seq stmts = NULL;
-      if (nested_in_vect_loop)
+      if (double_reduc)
 	{
           new_phi = new_phis[0];
 	  gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
@@ -5942,21 +5937,12 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 
       epilog_stmt = gimple_seq_last_stmt (stmts);
       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
-      if (nested_in_vect_loop)
-        {
-          if (!double_reduc)
-            scalar_results.quick_push (new_temp);
-          else
-            scalar_results[0] = new_temp;
-        }
-      else
-        scalar_results[0] = new_temp;
-
+      scalar_results[0] = new_temp;
       new_phis[0] = epilog_stmt;
     }
 
   if (double_reduc)
-    loop = loop->inner;
+    loop = outer_loop;
 
   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
           phis with new adjusted scalar results, i.e., replace use <s_out0>
@@ -6017,14 +6003,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	  scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
         }
 
-      if (nested_in_vect_loop)
-        {
-          if (double_reduc)
-            loop = outer_loop;
-          else
-	    gcc_unreachable ();
-        }
-
       phis.create (3);
       /* Find the loop-closed-use at the loop exit of the original scalar
          result.  (The reduction result is expected to have two immediate uses,

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 01/10] vect: Simplify epilogue reduction code
  2021-07-08 12:39 ` [PATCH 01/10] vect: Simplify epilogue reduction code Richard Sandiford
@ 2021-07-08 12:58   ` Richard Biener
  0 siblings, 0 replies; 30+ messages in thread
From: Richard Biener @ 2021-07-08 12:58 UTC (permalink / raw)
  To: Richard Sandiford, GCC Patches

On Thu, Jul 8, 2021 at 2:41 PM Richard Sandiford via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> vect_create_epilog_for_reduction only handles two cases: single-loop
> reductions and double reductions.  “nested cycles” (i.e. reductions
> in the inner loop when vectorising an outer loop) are handled elsewhere
> and don't need a vector->scalar reduction.
>
> The function had variables called nested_in_vect_loop and double_reduc
> and asserted that nested_in_vect_loop implied double_reduc, but it
> still had code to handle nested_in_vect_loop && !double_reduc.
> This patch removes that and uses double_reduc everywhere.

OK.

(cleaning up after the GCC 10 time refactoring was still on my list :/)

> gcc/
>         * tree-vect-loop.c (vect_create_epilog_for_reduction): Remove
>         nested_in_vect_loop and use double_reduc everywhere.  Remove dead
>         assignment to "loop".
> ---
>  gcc/tree-vect-loop.c | 30 ++++--------------------------
>  1 file changed, 4 insertions(+), 26 deletions(-)
>
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index bc523d151c6..7c3e3352b43 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -5005,7 +5005,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>    imm_use_iterator imm_iter, phi_imm_iter;
>    use_operand_p use_p, phi_use_p;
>    gimple *use_stmt;
> -  bool nested_in_vect_loop = false;
>    auto_vec<gimple *> new_phis;
>    int j, i;
>    auto_vec<tree> scalar_results;
> @@ -5023,10 +5022,8 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>      {
>        outer_loop = loop;
>        loop = loop->inner;
> -      nested_in_vect_loop = true;
> -      gcc_assert (!slp_node);
> +      gcc_assert (!slp_node && double_reduc);
>      }
> -  gcc_assert (!nested_in_vect_loop || double_reduc);
>
>    vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
>    gcc_assert (vectype);
> @@ -5049,8 +5046,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>         induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
>        else if (double_reduc)
>         ;
> -      else if (nested_in_vect_loop)
> -       ;
>        else
>         adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
>      }
> @@ -5923,7 +5918,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>      {
>        gcc_assert (!slp_reduc);
>        gimple_seq stmts = NULL;
> -      if (nested_in_vect_loop)
> +      if (double_reduc)
>         {
>            new_phi = new_phis[0];
>           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
> @@ -5942,21 +5937,12 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>
>        epilog_stmt = gimple_seq_last_stmt (stmts);
>        gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
> -      if (nested_in_vect_loop)
> -        {
> -          if (!double_reduc)
> -            scalar_results.quick_push (new_temp);
> -          else
> -            scalar_results[0] = new_temp;
> -        }
> -      else
> -        scalar_results[0] = new_temp;
> -
> +      scalar_results[0] = new_temp;
>        new_phis[0] = epilog_stmt;
>      }
>
>    if (double_reduc)
> -    loop = loop->inner;
> +    loop = outer_loop;
>
>    /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
>            phis with new adjusted scalar results, i.e., replace use <s_out0>
> @@ -6017,14 +6003,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>           scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
>          }
>
> -      if (nested_in_vect_loop)
> -        {
> -          if (double_reduc)
> -            loop = outer_loop;
> -          else
> -           gcc_unreachable ();
> -        }
> -
>        phis.create (3);
>        /* Find the loop-closed-use at the loop exit of the original scalar
>           result.  (The reduction result is expected to have two immediate uses,

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH 02/10] vect: Create array_slice of live-out stmts
  2021-07-08 12:38 [PATCH 00/10] vect: Reuse reduction accumulators between loops Richard Sandiford
  2021-07-08 12:39 ` [PATCH 01/10] vect: Simplify epilogue reduction code Richard Sandiford
@ 2021-07-08 12:39 ` Richard Sandiford
  2021-07-08 12:58   ` Richard Biener
  2021-07-08 12:39 ` [PATCH 03/10] vect: Remove new_phis from Richard Sandiford
                   ` (8 subsequent siblings)
  10 siblings, 1 reply; 30+ messages in thread
From: Richard Sandiford @ 2021-07-08 12:39 UTC (permalink / raw)
  To: gcc-patches

This patch constructs an array_slice of the scalar statements that
produce live-out reduction results in the original unvectorised loop.
There are three cases:

- SLP reduction chains: the final SLP stmt is live-out
- full SLP reductions: all SLP stmts are live-out
- non-SLP reductions: the single scalar stmt is live-out

This is a slight simplification on its own, mostly because it means
“group_size” has a consistent meaning throughout the function.
The main justification though is that it helps with later patches.

gcc/
	* tree-vect-loop.c (vect_create_epilog_for_reduction): Truncate
	scalar_results to group_size elements after reducing down from
	N*group_size elements.  Construct an array_slice of the live-out
	stmts and assert that there is one stmt per scalar result.
---
 gcc/tree-vect-loop.c | 61 +++++++++++++++-----------------------------
 1 file changed, 21 insertions(+), 40 deletions(-)

diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 7c3e3352b43..8390ac80ca0 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -5010,7 +5010,12 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
   auto_vec<tree> scalar_results;
   unsigned int group_size = 1, k;
   auto_vec<gimple *> phis;
-  bool slp_reduc = false;
+  /* SLP reduction without reduction chain, e.g.,
+     # a1 = phi <a2, a0>
+     # b1 = phi <b2, b0>
+     a2 = operation (a1)
+     b2 = operation (b1)  */
+  bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
   bool direct_slp_reduc;
   tree new_phi_result;
   tree induction_index = NULL_TREE;
@@ -5050,6 +5055,16 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
     }
 
+  stmt_vec_info single_live_out_stmt[] = { stmt_info };
+  array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
+  if (slp_reduc)
+    /* All statements produce live-out values.  */
+    live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
+  else if (slp_node)
+    /* The last statement in the reduction chain produces the live-out
+       value.  */
+    single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
+
   unsigned vec_num;
   int ncopies;
   if (slp_node)
@@ -5248,13 +5263,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
   bitsize = TYPE_SIZE (scalar_type);
 
-  /* SLP reduction without reduction chain, e.g.,
-     # a1 = phi <a2, a0>
-     # b1 = phi <b2, b0>
-     a2 = operation (a1)
-     b2 = operation (b1)  */
-  slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
-
   /* True if we should implement SLP_REDUC using native reduction operations
      instead of scalar operations.  */
   direct_slp_reduc = (reduc_fn != IFN_LAST
@@ -5877,6 +5885,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 					  first_res, res);
                   scalar_results[j % group_size] = new_res;
                 }
+	      scalar_results.truncate (group_size);
 	      for (k = 0; k < group_size; k++)
 		scalar_results[k] = gimple_convert (&stmts, scalar_type,
 						    scalar_results[k]);
@@ -5969,39 +5978,11 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
           use <s_out4>  
           use <s_out4> */
 
-
-  /* In SLP reduction chain we reduce vector results into one vector if
-     necessary, hence we set here REDUC_GROUP_SIZE to 1.  SCALAR_DEST is the
-     LHS of the last stmt in the reduction chain, since we are looking for
-     the loop exit phi node.  */
-  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
-    {
-      stmt_vec_info dest_stmt_info
-	= vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
-      scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
-      group_size = 1;
-    }
-
-  /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
-     case that REDUC_GROUP_SIZE is greater than vectorization factor).
-     Therefore, we need to match SCALAR_RESULTS with corresponding statements.
-     The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
-     correspond to the first vector stmt, etc.
-     (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)).  */
-  if (group_size > new_phis.length ())
-    gcc_assert (!(group_size % new_phis.length ()));
-
-  for (k = 0; k < group_size; k++)
+  gcc_assert (live_out_stmts.size () == scalar_results.length ());
+  for (k = 0; k < live_out_stmts.size (); k++)
     {
-      if (slp_reduc)
-        {
-	  stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
-
-	  orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
-	  /* SLP statements can't participate in patterns.  */
-	  gcc_assert (!orig_stmt_info);
-	  scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
-        }
+      stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
+      scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
 
       phis.create (3);
       /* Find the loop-closed-use at the loop exit of the original scalar

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 02/10] vect: Create array_slice of live-out stmts
  2021-07-08 12:39 ` [PATCH 02/10] vect: Create array_slice of live-out stmts Richard Sandiford
@ 2021-07-08 12:58   ` Richard Biener
  0 siblings, 0 replies; 30+ messages in thread
From: Richard Biener @ 2021-07-08 12:58 UTC (permalink / raw)
  To: Richard Sandiford, GCC Patches

On Thu, Jul 8, 2021 at 2:42 PM Richard Sandiford via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> This patch constructs an array_slice of the scalar statements that
> produce live-out reduction results in the original unvectorised loop.
> There are three cases:
>
> - SLP reduction chains: the final SLP stmt is live-out
> - full SLP reductions: all SLP stmts are live-out
> - non-SLP reductions: the single scalar stmt is live-out
>
> This is a slight simplification on its own, mostly because it means
> “group_size” has a consistent meaning throughout the function.
> The main justification though is that it helps with later patches.

OK

> gcc/
>         * tree-vect-loop.c (vect_create_epilog_for_reduction): Truncate
>         scalar_results to group_size elements after reducing down from
>         N*group_size elements.  Construct an array_slice of the live-out
>         stmts and assert that there is one stmt per scalar result.
> ---
>  gcc/tree-vect-loop.c | 61 +++++++++++++++-----------------------------
>  1 file changed, 21 insertions(+), 40 deletions(-)
>
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index 7c3e3352b43..8390ac80ca0 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -5010,7 +5010,12 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>    auto_vec<tree> scalar_results;
>    unsigned int group_size = 1, k;
>    auto_vec<gimple *> phis;
> -  bool slp_reduc = false;
> +  /* SLP reduction without reduction chain, e.g.,
> +     # a1 = phi <a2, a0>
> +     # b1 = phi <b2, b0>
> +     a2 = operation (a1)
> +     b2 = operation (b1)  */
> +  bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
>    bool direct_slp_reduc;
>    tree new_phi_result;
>    tree induction_index = NULL_TREE;
> @@ -5050,6 +5055,16 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>         adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
>      }
>
> +  stmt_vec_info single_live_out_stmt[] = { stmt_info };
> +  array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
> +  if (slp_reduc)
> +    /* All statements produce live-out values.  */
> +    live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
> +  else if (slp_node)
> +    /* The last statement in the reduction chain produces the live-out
> +       value.  */
> +    single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
> +
>    unsigned vec_num;
>    int ncopies;
>    if (slp_node)
> @@ -5248,13 +5263,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>    new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
>    bitsize = TYPE_SIZE (scalar_type);
>
> -  /* SLP reduction without reduction chain, e.g.,
> -     # a1 = phi <a2, a0>
> -     # b1 = phi <b2, b0>
> -     a2 = operation (a1)
> -     b2 = operation (b1)  */
> -  slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
> -
>    /* True if we should implement SLP_REDUC using native reduction operations
>       instead of scalar operations.  */
>    direct_slp_reduc = (reduc_fn != IFN_LAST
> @@ -5877,6 +5885,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>                                           first_res, res);
>                    scalar_results[j % group_size] = new_res;
>                  }
> +             scalar_results.truncate (group_size);
>               for (k = 0; k < group_size; k++)
>                 scalar_results[k] = gimple_convert (&stmts, scalar_type,
>                                                     scalar_results[k]);
> @@ -5969,39 +5978,11 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>            use <s_out4>
>            use <s_out4> */
>
> -
> -  /* In SLP reduction chain we reduce vector results into one vector if
> -     necessary, hence we set here REDUC_GROUP_SIZE to 1.  SCALAR_DEST is the
> -     LHS of the last stmt in the reduction chain, since we are looking for
> -     the loop exit phi node.  */
> -  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
> -    {
> -      stmt_vec_info dest_stmt_info
> -       = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
> -      scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
> -      group_size = 1;
> -    }
> -
> -  /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
> -     case that REDUC_GROUP_SIZE is greater than vectorization factor).
> -     Therefore, we need to match SCALAR_RESULTS with corresponding statements.
> -     The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
> -     correspond to the first vector stmt, etc.
> -     (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)).  */
> -  if (group_size > new_phis.length ())
> -    gcc_assert (!(group_size % new_phis.length ()));
> -
> -  for (k = 0; k < group_size; k++)
> +  gcc_assert (live_out_stmts.size () == scalar_results.length ());
> +  for (k = 0; k < live_out_stmts.size (); k++)
>      {
> -      if (slp_reduc)
> -        {
> -         stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
> -
> -         orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
> -         /* SLP statements can't participate in patterns.  */
> -         gcc_assert (!orig_stmt_info);
> -         scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
> -        }
> +      stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
> +      scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
>
>        phis.create (3);
>        /* Find the loop-closed-use at the loop exit of the original scalar

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH 03/10] vect: Remove new_phis from
  2021-07-08 12:38 [PATCH 00/10] vect: Reuse reduction accumulators between loops Richard Sandiford
  2021-07-08 12:39 ` [PATCH 01/10] vect: Simplify epilogue reduction code Richard Sandiford
  2021-07-08 12:39 ` [PATCH 02/10] vect: Create array_slice of live-out stmts Richard Sandiford
@ 2021-07-08 12:39 ` Richard Sandiford
  2021-07-08 12:59   ` Richard Biener
  2021-07-08 12:40 ` [PATCH 04/10] vect: Ensure reduc_inputs always have vectype Richard Sandiford
                   ` (7 subsequent siblings)
  10 siblings, 1 reply; 30+ messages in thread
From: Richard Sandiford @ 2021-07-08 12:39 UTC (permalink / raw)
  To: gcc-patches

vect_create_epilog_for_reduction had a variable called new_phis.
It collected the statements that produce the exit block definitions
of the vector reduction accumulators.  Although those statements
are indeed phis initially, they are often replaced with normal
statements later, leading to puzzling code like:

          FOR_EACH_VEC_ELT (new_phis, i, new_phi)
            {
              int bit_offset;
              if (gimple_code (new_phi) == GIMPLE_PHI)
                vec_temp = PHI_RESULT (new_phi);
              else
                vec_temp = gimple_assign_lhs (new_phi);

Also, although the array collects statements, in practice all users want
the lhs instead.

This patch therefore replaces new_phis with a vector of gimple values
called “reduc_inputs”.

Also, reduction chains and ncopies>1 were handled with identical code
(and there was a comment saying so).  The patch unites them into
a single “if”.

gcc/
	* tree-vect-loop.c (vect_create_epilog_for_reduction): Replace
	the new_phis vector with a reduc_inputs vector.  Combine handling
	of reduction chains and ncopies > 1.
---
 gcc/tree-vect-loop.c | 113 ++++++++++++++++---------------------------
 1 file changed, 41 insertions(+), 72 deletions(-)

diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 8390ac80ca0..b7f73ca52c7 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -5005,7 +5005,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
   imm_use_iterator imm_iter, phi_imm_iter;
   use_operand_p use_p, phi_use_p;
   gimple *use_stmt;
-  auto_vec<gimple *> new_phis;
+  auto_vec<tree> reduc_inputs;
   int j, i;
   auto_vec<tree> scalar_results;
   unsigned int group_size = 1, k;
@@ -5017,7 +5017,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
      b2 = operation (b1)  */
   bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
   bool direct_slp_reduc;
-  tree new_phi_result;
   tree induction_index = NULL_TREE;
 
   if (slp_node)
@@ -5215,7 +5214,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
   if (double_reduc)
     loop = outer_loop;
   exit_bb = single_exit (loop)->dest;
-  new_phis.create (slp_node ? vec_num : ncopies);
+  reduc_inputs.create (slp_node ? vec_num : ncopies);
   for (unsigned i = 0; i < vec_num; i++)
     {
       if (slp_node)
@@ -5223,19 +5222,14 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       else
 	def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
       for (j = 0; j < ncopies; j++)
-        {
+	{
 	  tree new_def = copy_ssa_name (def);
-          phi = create_phi_node (new_def, exit_bb);
-          if (j == 0)
-            new_phis.quick_push (phi);
-          else
-	    {
-	      def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
-	      new_phis.quick_push (phi);
-	    }
-
-          SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
-        }
+	  phi = create_phi_node (new_def, exit_bb);
+	  if (j)
+	    def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
+	  SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
+	  reduc_inputs.quick_push (new_def);
+	}
     }
 
   exit_gsi = gsi_after_labels (exit_bb);
@@ -5274,52 +5268,32 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
      a2 = operation (a1)
      a3 = operation (a2),
 
-     we may end up with more than one vector result.  Here we reduce them to
-     one vector.  */
-  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
+     we may end up with more than one vector result.  Here we reduce them
+     to one vector.
+
+     The same is true if we couldn't use a single defuse cycle.  */
+  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
+      || direct_slp_reduc
+      || ncopies > 1)
     {
       gimple_seq stmts = NULL;
-      tree first_vect = PHI_RESULT (new_phis[0]);
-      first_vect = gimple_convert (&stmts, vectype, first_vect);
-      for (k = 1; k < new_phis.length (); k++)
+      tree first_vect = gimple_convert (&stmts, vectype, reduc_inputs[0]);
+      for (k = 1; k < reduc_inputs.length (); k++)
         {
-	  gimple *next_phi = new_phis[k];
-          tree second_vect = PHI_RESULT (next_phi);
-	  second_vect = gimple_convert (&stmts, vectype, second_vect);
+	  tree second_vect = gimple_convert (&stmts, vectype, reduc_inputs[k]);
           first_vect = gimple_build (&stmts, code, vectype,
 				     first_vect, second_vect);
         }
       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
 
-      new_phi_result = first_vect;
-      new_phis.truncate (0);
-      new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
+      reduc_inputs.truncate (0);
+      reduc_inputs.safe_push (first_vect);
     }
-  /* Likewise if we couldn't use a single defuse cycle.  */
-  else if (ncopies > 1)
-    {
-      gimple_seq stmts = NULL;
-      tree first_vect = PHI_RESULT (new_phis[0]);
-      first_vect = gimple_convert (&stmts, vectype, first_vect);
-      for (int k = 1; k < ncopies; ++k)
-	{
-	  tree second_vect = PHI_RESULT (new_phis[k]);
-	  second_vect = gimple_convert (&stmts, vectype, second_vect);
-	  first_vect = gimple_build (&stmts, code, vectype,
-				     first_vect, second_vect);
-	}
-      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
-      new_phi_result = first_vect;
-      new_phis.truncate (0);
-      new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
-    }
-  else
-    new_phi_result = PHI_RESULT (new_phis[0]);
 
   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
       && reduc_fn != IFN_LAST)
     {
-      /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
+      /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
 	 various data values where the condition matched and another vector
 	 (INDUCTION_INDEX) containing all the indexes of those matches.  We
 	 need to extract the last matching index (which will be the index with
@@ -5350,7 +5324,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       tree zero_vec = build_zero_cst (vectype);
 
       gimple_seq stmts = NULL;
-      new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
+      reduc_inputs[0] = gimple_convert (&stmts, vectype, reduc_inputs[0]);
       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
 
       /* Find maximum value from the vector of found indexes.  */
@@ -5370,7 +5344,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 
       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
 	 with the vector (INDUCTION_INDEX) of found indexes, choosing values
-	 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
+	 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
 	 otherwise.  Only one value should match, resulting in a vector
 	 (VEC_COND) with one data value and the rest zeros.
 	 In the case where the loop never made any matches, every index will
@@ -5389,7 +5363,8 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	 zero.  */
       tree vec_cond = make_ssa_name (vectype);
       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
-						   vec_compare, new_phi_result,
+						   vec_compare,
+						   reduc_inputs[0],
 						   zero_vec);
       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
 
@@ -5437,7 +5412,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	     val = data_reduc[i], idx_val = induction_index[i];
 	 return val;  */
 
-      tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
+      tree data_eltype = TREE_TYPE (TREE_TYPE (reduc_inputs[0]));
       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
@@ -5461,7 +5436,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	  epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
 					     build3 (BIT_FIELD_REF,
 						     data_eltype,
-						     new_phi_result,
+						     reduc_inputs[0],
 						     bitsize_int (el_size),
 						     bitsize_int (off)));
 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
@@ -5513,10 +5488,10 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 			 "Reduce using direct vector reduction.\n");
 
       gimple_seq stmts = NULL;
-      new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
-      vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
+      reduc_inputs[0] = gimple_convert (&stmts, vectype, reduc_inputs[0]);
+      vec_elem_type = TREE_TYPE (TREE_TYPE (reduc_inputs[0]));
       new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
-			       vec_elem_type, new_phi_result);
+			       vec_elem_type, reduc_inputs[0]);
       new_temp = gimple_convert (&stmts, scalar_type, new_temp);
       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
 
@@ -5546,7 +5521,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	 neutral value.  We can then do a normal reduction on each vector.  */
 
       /* Enforced by vectorizable_reduction.  */
-      gcc_assert (new_phis.length () == 1);
+      gcc_assert (reduc_inputs.length () == 1);
       gcc_assert (pow2p_hwi (group_size));
 
       slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
@@ -5602,7 +5577,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 
 	     sel[j] = (index[j] == i);
 
-	     which selects the elements of NEW_PHI_RESULT that should
+	     which selects the elements of REDUC_INPUTS[0] that should
 	     be included in the result.  */
 	  tree compare_val = build_int_cst (index_elt_type, i);
 	  compare_val = build_vector_from_val (index_type, compare_val);
@@ -5611,11 +5586,11 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 
 	  /* Calculate the equivalent of:
 
-	     vec = seq ? new_phi_result : vector_identity;
+	     vec = seq ? reduc_inputs[0] : vector_identity;
 
 	     VEC is now suitable for a full vector reduction.  */
 	  tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
-				   sel, new_phi_result, vector_identity);
+				   sel, reduc_inputs[0], vector_identity);
 
 	  /* Do the reduction and convert it to the appropriate type.  */
 	  tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
@@ -5630,7 +5605,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       bool reduce_with_shift;
       tree vec_temp;
 
-      gcc_assert (slp_reduc || new_phis.length () == 1);
+      gcc_assert (slp_reduc || reduc_inputs.length () == 1);
 
       /* See if the target wants to do the final (shift) reduction
 	 in a vector mode of smaller size and first reduce upper/lower
@@ -5640,7 +5615,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
       unsigned nunits1 = nunits;
       if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
-	  && new_phis.length () == 1)
+	  && reduc_inputs.length () == 1)
 	{
 	  nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
 	  /* For SLP reductions we have to make sure lanes match up, but
@@ -5672,7 +5647,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 
       /* First reduce the vector to the desired vector size we should
 	 do shift reduction on by combining upper and lower halves.  */
-      new_temp = new_phi_result;
+      new_temp = reduc_inputs[0];
       while (nunits > nunits1)
 	{
 	  nunits /= 2;
@@ -5751,7 +5726,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	  new_temp = make_ssa_name (vectype1);
 	  epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
-	  new_phis[0] = epilog_stmt;
+	  reduc_inputs[0] = new_temp;
 	}
 
       if (reduce_with_shift && !slp_reduc)
@@ -5832,13 +5807,9 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	  int element_bitsize = tree_to_uhwi (bitsize);
 	  tree compute_type = TREE_TYPE (vectype);
 	  gimple_seq stmts = NULL;
-          FOR_EACH_VEC_ELT (new_phis, i, new_phi)
+	  FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
             {
               int bit_offset;
-              if (gimple_code (new_phi) == GIMPLE_PHI)
-                vec_temp = PHI_RESULT (new_phi);
-              else
-                vec_temp = gimple_assign_lhs (new_phi);
 	      new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
 				       vec_temp, bitsize, bitsize_zero_node);
 
@@ -5929,11 +5900,10 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       gimple_seq stmts = NULL;
       if (double_reduc)
 	{
-          new_phi = new_phis[0];
 	  gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
 	  adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
 	  new_temp = gimple_build (&stmts, code, vectype,
-				   PHI_RESULT (new_phi), adjustment_def);
+				   reduc_inputs[0], adjustment_def);
 	}
       else
 	{
@@ -5947,7 +5917,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       epilog_stmt = gimple_seq_last_stmt (stmts);
       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
       scalar_results[0] = new_temp;
-      new_phis[0] = epilog_stmt;
     }
 
   if (double_reduc)

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 03/10] vect: Remove new_phis from
  2021-07-08 12:39 ` [PATCH 03/10] vect: Remove new_phis from Richard Sandiford
@ 2021-07-08 12:59   ` Richard Biener
  0 siblings, 0 replies; 30+ messages in thread
From: Richard Biener @ 2021-07-08 12:59 UTC (permalink / raw)
  To: Richard Sandiford, GCC Patches

On Thu, Jul 8, 2021 at 2:43 PM Richard Sandiford via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> vect_create_epilog_for_reduction had a variable called new_phis.
> It collected the statements that produce the exit block definitions
> of the vector reduction accumulators.  Although those statements
> are indeed phis initially, they are often replaced with normal
> statements later, leading to puzzling code like:
>
>           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
>             {
>               int bit_offset;
>               if (gimple_code (new_phi) == GIMPLE_PHI)
>                 vec_temp = PHI_RESULT (new_phi);
>               else
>                 vec_temp = gimple_assign_lhs (new_phi);
>
> Also, although the array collects statements, in practice all users want
> the lhs instead.
>
> This patch therefore replaces new_phis with a vector of gimple values
> called “reduc_inputs”.
>
> Also, reduction chains and ncopies>1 were handled with identical code
> (and there was a comment saying so).  The patch unites them into
> a single “if”.

OK.

Thanks,
Richard.

> gcc/
>         * tree-vect-loop.c (vect_create_epilog_for_reduction): Replace
>         the new_phis vector with a reduc_inputs vector.  Combine handling
>         of reduction chains and ncopies > 1.
> ---
>  gcc/tree-vect-loop.c | 113 ++++++++++++++++---------------------------
>  1 file changed, 41 insertions(+), 72 deletions(-)
>
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index 8390ac80ca0..b7f73ca52c7 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -5005,7 +5005,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>    imm_use_iterator imm_iter, phi_imm_iter;
>    use_operand_p use_p, phi_use_p;
>    gimple *use_stmt;
> -  auto_vec<gimple *> new_phis;
> +  auto_vec<tree> reduc_inputs;
>    int j, i;
>    auto_vec<tree> scalar_results;
>    unsigned int group_size = 1, k;
> @@ -5017,7 +5017,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>       b2 = operation (b1)  */
>    bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
>    bool direct_slp_reduc;
> -  tree new_phi_result;
>    tree induction_index = NULL_TREE;
>
>    if (slp_node)
> @@ -5215,7 +5214,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>    if (double_reduc)
>      loop = outer_loop;
>    exit_bb = single_exit (loop)->dest;
> -  new_phis.create (slp_node ? vec_num : ncopies);
> +  reduc_inputs.create (slp_node ? vec_num : ncopies);
>    for (unsigned i = 0; i < vec_num; i++)
>      {
>        if (slp_node)
> @@ -5223,19 +5222,14 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>        else
>         def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
>        for (j = 0; j < ncopies; j++)
> -        {
> +       {
>           tree new_def = copy_ssa_name (def);
> -          phi = create_phi_node (new_def, exit_bb);
> -          if (j == 0)
> -            new_phis.quick_push (phi);
> -          else
> -           {
> -             def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
> -             new_phis.quick_push (phi);
> -           }
> -
> -          SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
> -        }
> +         phi = create_phi_node (new_def, exit_bb);
> +         if (j)
> +           def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
> +         SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
> +         reduc_inputs.quick_push (new_def);
> +       }
>      }
>
>    exit_gsi = gsi_after_labels (exit_bb);
> @@ -5274,52 +5268,32 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>       a2 = operation (a1)
>       a3 = operation (a2),
>
> -     we may end up with more than one vector result.  Here we reduce them to
> -     one vector.  */
> -  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
> +     we may end up with more than one vector result.  Here we reduce them
> +     to one vector.
> +
> +     The same is true if we couldn't use a single defuse cycle.  */
> +  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
> +      || direct_slp_reduc
> +      || ncopies > 1)
>      {
>        gimple_seq stmts = NULL;
> -      tree first_vect = PHI_RESULT (new_phis[0]);
> -      first_vect = gimple_convert (&stmts, vectype, first_vect);
> -      for (k = 1; k < new_phis.length (); k++)
> +      tree first_vect = gimple_convert (&stmts, vectype, reduc_inputs[0]);
> +      for (k = 1; k < reduc_inputs.length (); k++)
>          {
> -         gimple *next_phi = new_phis[k];
> -          tree second_vect = PHI_RESULT (next_phi);
> -         second_vect = gimple_convert (&stmts, vectype, second_vect);
> +         tree second_vect = gimple_convert (&stmts, vectype, reduc_inputs[k]);
>            first_vect = gimple_build (&stmts, code, vectype,
>                                      first_vect, second_vect);
>          }
>        gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
>
> -      new_phi_result = first_vect;
> -      new_phis.truncate (0);
> -      new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
> +      reduc_inputs.truncate (0);
> +      reduc_inputs.safe_push (first_vect);
>      }
> -  /* Likewise if we couldn't use a single defuse cycle.  */
> -  else if (ncopies > 1)
> -    {
> -      gimple_seq stmts = NULL;
> -      tree first_vect = PHI_RESULT (new_phis[0]);
> -      first_vect = gimple_convert (&stmts, vectype, first_vect);
> -      for (int k = 1; k < ncopies; ++k)
> -       {
> -         tree second_vect = PHI_RESULT (new_phis[k]);
> -         second_vect = gimple_convert (&stmts, vectype, second_vect);
> -         first_vect = gimple_build (&stmts, code, vectype,
> -                                    first_vect, second_vect);
> -       }
> -      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
> -      new_phi_result = first_vect;
> -      new_phis.truncate (0);
> -      new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
> -    }
> -  else
> -    new_phi_result = PHI_RESULT (new_phis[0]);
>
>    if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
>        && reduc_fn != IFN_LAST)
>      {
> -      /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
> +      /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
>          various data values where the condition matched and another vector
>          (INDUCTION_INDEX) containing all the indexes of those matches.  We
>          need to extract the last matching index (which will be the index with
> @@ -5350,7 +5324,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>        tree zero_vec = build_zero_cst (vectype);
>
>        gimple_seq stmts = NULL;
> -      new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
> +      reduc_inputs[0] = gimple_convert (&stmts, vectype, reduc_inputs[0]);
>        gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
>
>        /* Find maximum value from the vector of found indexes.  */
> @@ -5370,7 +5344,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>
>        /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
>          with the vector (INDUCTION_INDEX) of found indexes, choosing values
> -        from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
> +        from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
>          otherwise.  Only one value should match, resulting in a vector
>          (VEC_COND) with one data value and the rest zeros.
>          In the case where the loop never made any matches, every index will
> @@ -5389,7 +5363,8 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>          zero.  */
>        tree vec_cond = make_ssa_name (vectype);
>        gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
> -                                                  vec_compare, new_phi_result,
> +                                                  vec_compare,
> +                                                  reduc_inputs[0],
>                                                    zero_vec);
>        gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
>
> @@ -5437,7 +5412,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>              val = data_reduc[i], idx_val = induction_index[i];
>          return val;  */
>
> -      tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
> +      tree data_eltype = TREE_TYPE (TREE_TYPE (reduc_inputs[0]));
>        tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
>        unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
>        poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
> @@ -5461,7 +5436,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
>                                              build3 (BIT_FIELD_REF,
>                                                      data_eltype,
> -                                                    new_phi_result,
> +                                                    reduc_inputs[0],
>                                                      bitsize_int (el_size),
>                                                      bitsize_int (off)));
>           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
> @@ -5513,10 +5488,10 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>                          "Reduce using direct vector reduction.\n");
>
>        gimple_seq stmts = NULL;
> -      new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
> -      vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
> +      reduc_inputs[0] = gimple_convert (&stmts, vectype, reduc_inputs[0]);
> +      vec_elem_type = TREE_TYPE (TREE_TYPE (reduc_inputs[0]));
>        new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
> -                              vec_elem_type, new_phi_result);
> +                              vec_elem_type, reduc_inputs[0]);
>        new_temp = gimple_convert (&stmts, scalar_type, new_temp);
>        gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
>
> @@ -5546,7 +5521,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>          neutral value.  We can then do a normal reduction on each vector.  */
>
>        /* Enforced by vectorizable_reduction.  */
> -      gcc_assert (new_phis.length () == 1);
> +      gcc_assert (reduc_inputs.length () == 1);
>        gcc_assert (pow2p_hwi (group_size));
>
>        slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
> @@ -5602,7 +5577,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>
>              sel[j] = (index[j] == i);
>
> -            which selects the elements of NEW_PHI_RESULT that should
> +            which selects the elements of REDUC_INPUTS[0] that should
>              be included in the result.  */
>           tree compare_val = build_int_cst (index_elt_type, i);
>           compare_val = build_vector_from_val (index_type, compare_val);
> @@ -5611,11 +5586,11 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>
>           /* Calculate the equivalent of:
>
> -            vec = seq ? new_phi_result : vector_identity;
> +            vec = seq ? reduc_inputs[0] : vector_identity;
>
>              VEC is now suitable for a full vector reduction.  */
>           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
> -                                  sel, new_phi_result, vector_identity);
> +                                  sel, reduc_inputs[0], vector_identity);
>
>           /* Do the reduction and convert it to the appropriate type.  */
>           tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
> @@ -5630,7 +5605,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>        bool reduce_with_shift;
>        tree vec_temp;
>
> -      gcc_assert (slp_reduc || new_phis.length () == 1);
> +      gcc_assert (slp_reduc || reduc_inputs.length () == 1);
>
>        /* See if the target wants to do the final (shift) reduction
>          in a vector mode of smaller size and first reduce upper/lower
> @@ -5640,7 +5615,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>        unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
>        unsigned nunits1 = nunits;
>        if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
> -         && new_phis.length () == 1)
> +         && reduc_inputs.length () == 1)
>         {
>           nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
>           /* For SLP reductions we have to make sure lanes match up, but
> @@ -5672,7 +5647,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>
>        /* First reduce the vector to the desired vector size we should
>          do shift reduction on by combining upper and lower halves.  */
> -      new_temp = new_phi_result;
> +      new_temp = reduc_inputs[0];
>        while (nunits > nunits1)
>         {
>           nunits /= 2;
> @@ -5751,7 +5726,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>           new_temp = make_ssa_name (vectype1);
>           epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
>           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
> -         new_phis[0] = epilog_stmt;
> +         reduc_inputs[0] = new_temp;
>         }
>
>        if (reduce_with_shift && !slp_reduc)
> @@ -5832,13 +5807,9 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>           int element_bitsize = tree_to_uhwi (bitsize);
>           tree compute_type = TREE_TYPE (vectype);
>           gimple_seq stmts = NULL;
> -          FOR_EACH_VEC_ELT (new_phis, i, new_phi)
> +         FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
>              {
>                int bit_offset;
> -              if (gimple_code (new_phi) == GIMPLE_PHI)
> -                vec_temp = PHI_RESULT (new_phi);
> -              else
> -                vec_temp = gimple_assign_lhs (new_phi);
>               new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
>                                        vec_temp, bitsize, bitsize_zero_node);
>
> @@ -5929,11 +5900,10 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>        gimple_seq stmts = NULL;
>        if (double_reduc)
>         {
> -          new_phi = new_phis[0];
>           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
>           adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
>           new_temp = gimple_build (&stmts, code, vectype,
> -                                  PHI_RESULT (new_phi), adjustment_def);
> +                                  reduc_inputs[0], adjustment_def);
>         }
>        else
>         {
> @@ -5947,7 +5917,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>        epilog_stmt = gimple_seq_last_stmt (stmts);
>        gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
>        scalar_results[0] = new_temp;
> -      new_phis[0] = epilog_stmt;
>      }
>
>    if (double_reduc)

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH 04/10] vect: Ensure reduc_inputs always have vectype
  2021-07-08 12:38 [PATCH 00/10] vect: Reuse reduction accumulators between loops Richard Sandiford
                   ` (2 preceding siblings ...)
  2021-07-08 12:39 ` [PATCH 03/10] vect: Remove new_phis from vect_create_epilog_for_reduction Richard Sandiford
@ 2021-07-08 12:40 ` Richard Sandiford
  2021-07-08 13:01   ` Richard Biener
  2021-07-08 12:40 ` [PATCH 05/10] vect: Add a vect_phi_initial_value helper function Richard Sandiford
                   ` (6 subsequent siblings)
  10 siblings, 1 reply; 30+ messages in thread
From: Richard Sandiford @ 2021-07-08 12:40 UTC (permalink / raw)
  To: gcc-patches

Vector reduction accumulators can differ in signedness from the
final scalar result.  The conversions to handle that case were
distributed through vect_create_epilog_for_reduction; this patch
does the conversion up-front instead.

gcc/
	* tree-vect-loop.c (vect_create_epilog_for_reduction): Convert
	the phi results to vectype after creating them.  Remove later
	conversion code that thus becomes redundant.
---
 gcc/tree-vect-loop.c | 28 +++++++++++-----------------
 1 file changed, 11 insertions(+), 17 deletions(-)

diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index b7f73ca52c7..1bd9a6ea52c 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -5214,9 +5214,11 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
   if (double_reduc)
     loop = outer_loop;
   exit_bb = single_exit (loop)->dest;
+  exit_gsi = gsi_after_labels (exit_bb);
   reduc_inputs.create (slp_node ? vec_num : ncopies);
   for (unsigned i = 0; i < vec_num; i++)
     {
+      gimple_seq stmts = NULL;
       if (slp_node)
 	def = vect_get_slp_vect_def (slp_node, i);
       else
@@ -5228,12 +5230,12 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	  if (j)
 	    def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
 	  SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
+	  new_def = gimple_convert (&stmts, vectype, new_def);
 	  reduc_inputs.quick_push (new_def);
 	}
+      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
     }
 
-  exit_gsi = gsi_after_labels (exit_bb);
-
   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
          (i.e. when reduc_fn is not available) and in the final adjustment
 	 code (if needed).  Also get the original scalar reduction variable as
@@ -5277,17 +5279,14 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       || ncopies > 1)
     {
       gimple_seq stmts = NULL;
-      tree first_vect = gimple_convert (&stmts, vectype, reduc_inputs[0]);
+      tree single_input = reduc_inputs[0];
       for (k = 1; k < reduc_inputs.length (); k++)
-        {
-	  tree second_vect = gimple_convert (&stmts, vectype, reduc_inputs[k]);
-          first_vect = gimple_build (&stmts, code, vectype,
-				     first_vect, second_vect);
-        }
+	single_input = gimple_build (&stmts, code, vectype,
+				     single_input, reduc_inputs[k]);
       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
 
       reduc_inputs.truncate (0);
-      reduc_inputs.safe_push (first_vect);
+      reduc_inputs.safe_push (single_input);
     }
 
   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
@@ -5323,10 +5322,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       /* Vector of {0, 0, 0,...}.  */
       tree zero_vec = build_zero_cst (vectype);
 
-      gimple_seq stmts = NULL;
-      reduc_inputs[0] = gimple_convert (&stmts, vectype, reduc_inputs[0]);
-      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
-
       /* Find maximum value from the vector of found indexes.  */
       tree max_index = make_ssa_name (index_scalar_type);
       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
@@ -5394,7 +5389,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 
       /* Convert the reduced value back to the result type and set as the
 	 result.  */
-      stmts = NULL;
+      gimple_seq stmts = NULL;
       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
 			       data_reduc);
       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
@@ -5412,7 +5407,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	     val = data_reduc[i], idx_val = induction_index[i];
 	 return val;  */
 
-      tree data_eltype = TREE_TYPE (TREE_TYPE (reduc_inputs[0]));
+      tree data_eltype = TREE_TYPE (vectype);
       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
@@ -5488,8 +5483,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 			 "Reduce using direct vector reduction.\n");
 
       gimple_seq stmts = NULL;
-      reduc_inputs[0] = gimple_convert (&stmts, vectype, reduc_inputs[0]);
-      vec_elem_type = TREE_TYPE (TREE_TYPE (reduc_inputs[0]));
+      vec_elem_type = TREE_TYPE (vectype);
       new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
 			       vec_elem_type, reduc_inputs[0]);
       new_temp = gimple_convert (&stmts, scalar_type, new_temp);

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 04/10] vect: Ensure reduc_inputs always have vectype
  2021-07-08 12:40 ` [PATCH 04/10] vect: Ensure reduc_inputs always have vectype Richard Sandiford
@ 2021-07-08 13:01   ` Richard Biener
  2021-07-13  9:26     ` Richard Sandiford
  0 siblings, 1 reply; 30+ messages in thread
From: Richard Biener @ 2021-07-08 13:01 UTC (permalink / raw)
  To: Richard Sandiford, GCC Patches

On Thu, Jul 8, 2021 at 2:44 PM Richard Sandiford via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Vector reduction accumulators can differ in signedness from the
> final scalar result.  The conversions to handle that case were
> distributed through vect_create_epilog_for_reduction; this patch
> does the conversion up-front instead.

But is that still correct?  The conversions should be unsigned -> signed:
we performed the reduction in an unsigned type because we reassociated
the original signed reduction, whose overflow is undefined.  But the final
reduction of the vector lanes in the epilogue still needs to be done
in the unsigned type.

So it's just not obvious that the patch preserves this - if it does then
the patch is OK.

Richard.

> gcc/
>         * tree-vect-loop.c (vect_create_epilog_for_reduction): Convert
>         the phi results to vectype after creating them.  Remove later
>         conversion code that thus becomes redundant.
> ---
>  gcc/tree-vect-loop.c | 28 +++++++++++-----------------
>  1 file changed, 11 insertions(+), 17 deletions(-)
>
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index b7f73ca52c7..1bd9a6ea52c 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -5214,9 +5214,11 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>    if (double_reduc)
>      loop = outer_loop;
>    exit_bb = single_exit (loop)->dest;
> +  exit_gsi = gsi_after_labels (exit_bb);
>    reduc_inputs.create (slp_node ? vec_num : ncopies);
>    for (unsigned i = 0; i < vec_num; i++)
>      {
> +      gimple_seq stmts = NULL;
>        if (slp_node)
>         def = vect_get_slp_vect_def (slp_node, i);
>        else
> @@ -5228,12 +5230,12 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>           if (j)
>             def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
>           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
> +         new_def = gimple_convert (&stmts, vectype, new_def);
>           reduc_inputs.quick_push (new_def);
>         }
> +      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
>      }
>
> -  exit_gsi = gsi_after_labels (exit_bb);
> -
>    /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
>           (i.e. when reduc_fn is not available) and in the final adjustment
>          code (if needed).  Also get the original scalar reduction variable as
> @@ -5277,17 +5279,14 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>        || ncopies > 1)
>      {
>        gimple_seq stmts = NULL;
> -      tree first_vect = gimple_convert (&stmts, vectype, reduc_inputs[0]);
> +      tree single_input = reduc_inputs[0];
>        for (k = 1; k < reduc_inputs.length (); k++)
> -        {
> -         tree second_vect = gimple_convert (&stmts, vectype, reduc_inputs[k]);
> -          first_vect = gimple_build (&stmts, code, vectype,
> -                                    first_vect, second_vect);
> -        }
> +       single_input = gimple_build (&stmts, code, vectype,
> +                                    single_input, reduc_inputs[k]);
>        gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
>
>        reduc_inputs.truncate (0);
> -      reduc_inputs.safe_push (first_vect);
> +      reduc_inputs.safe_push (single_input);
>      }
>
>    if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
> @@ -5323,10 +5322,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>        /* Vector of {0, 0, 0,...}.  */
>        tree zero_vec = build_zero_cst (vectype);
>
> -      gimple_seq stmts = NULL;
> -      reduc_inputs[0] = gimple_convert (&stmts, vectype, reduc_inputs[0]);
> -      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
> -
>        /* Find maximum value from the vector of found indexes.  */
>        tree max_index = make_ssa_name (index_scalar_type);
>        gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
> @@ -5394,7 +5389,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>
>        /* Convert the reduced value back to the result type and set as the
>          result.  */
> -      stmts = NULL;
> +      gimple_seq stmts = NULL;
>        new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
>                                data_reduc);
>        gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
> @@ -5412,7 +5407,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>              val = data_reduc[i], idx_val = induction_index[i];
>          return val;  */
>
> -      tree data_eltype = TREE_TYPE (TREE_TYPE (reduc_inputs[0]));
> +      tree data_eltype = TREE_TYPE (vectype);
>        tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
>        unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
>        poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
> @@ -5488,8 +5483,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>                          "Reduce using direct vector reduction.\n");
>
>        gimple_seq stmts = NULL;
> -      reduc_inputs[0] = gimple_convert (&stmts, vectype, reduc_inputs[0]);
> -      vec_elem_type = TREE_TYPE (TREE_TYPE (reduc_inputs[0]));
> +      vec_elem_type = TREE_TYPE (vectype);
>        new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
>                                vec_elem_type, reduc_inputs[0]);
>        new_temp = gimple_convert (&stmts, scalar_type, new_temp);

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 04/10] vect: Ensure reduc_inputs always have vectype
  2021-07-08 13:01   ` Richard Biener
@ 2021-07-13  9:26     ` Richard Sandiford
  0 siblings, 0 replies; 30+ messages in thread
From: Richard Sandiford @ 2021-07-13  9:26 UTC (permalink / raw)
  To: Richard Biener; +Cc: GCC Patches

Richard Biener <richard.guenther@gmail.com> writes:
> On Thu, Jul 8, 2021 at 2:44 PM Richard Sandiford via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>>
>> Vector reduction accumulators can differ in signedness from the
>> final scalar result.  The conversions to handle that case were
>> distributed through vect_create_epilog_for_reduction; this patch
>> does the conversion up-front instead.
>
> But is that still correct?  The conversions should be unsigned -> signed:
> we performed the reduction in an unsigned type because we reassociated
> the original signed reduction, whose overflow is undefined.  But the final
> reduction of the vector lanes in the epilogue still needs to be done
> in the unsigned type.
>
> So it's just not obvious that the patch preserves this - if it does then
> the patch is OK.

We ended up covering most of this in the later 6/10 thread, but just to
follow up here for the record, in case anyone looks at the list archives:

In that scenario, the phis are created with the signed type and then
(like you say) the reduction happens in the unsigned type.  These
conversions are from the signed type to the unsigned type ready for
the reduction.

All later code either performed the conversion itself or (in the
case of some of the cond reductions) required the phi and reduction
vectypes to be the same.

I've pushed the series now -- thanks for the reviews.

Richard

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH 05/10] vect: Add a vect_phi_initial_value helper function
  2021-07-08 12:38 [PATCH 00/10] vect: Reuse reduction accumulators between loops Richard Sandiford
                   ` (3 preceding siblings ...)
  2021-07-08 12:40 ` [PATCH 04/10] vect: Ensure reduc_inputs always have vectype Richard Sandiford
@ 2021-07-08 12:40 ` Richard Sandiford
  2021-07-08 13:05   ` Richard Biener
  2021-07-08 12:40 ` [PATCH 06/10] vect: Pass reduc_info to get_initial_defs_for_reduction Richard Sandiford
                   ` (5 subsequent siblings)
  10 siblings, 1 reply; 30+ messages in thread
From: Richard Sandiford @ 2021-07-08 12:40 UTC (permalink / raw)
  To: gcc-patches

This patch adds a helper function called vect_phi_initial_value
for returning the incoming value of a given loop phi.  The main
reason for adding it is to ensure that the right preheader edge
is used when vectorising nested loops.  (PHI_ARG_DEF_FROM_EDGE
itself doesn't assert that the given edge is for the right block,
although I guess that would be good to add separately.)

gcc/
	* tree-vectorizer.h: Include tree-ssa-operands.h.
	(vect_phi_initial_value): New function.
	* tree-vect-loop.c (neutral_op_for_slp_reduction): Use it.
	(get_initial_defs_for_reduction, info_for_reduction): Likewise.
	(vect_create_epilog_for_reduction, vectorizable_reduction): Likewise.
	(vect_transform_cycle_phi, vectorizable_induction): Likewise.
---
 gcc/tree-vect-loop.c  | 29 +++++++++--------------------
 gcc/tree-vectorizer.h | 21 ++++++++++++++++++++-
 2 files changed, 29 insertions(+), 21 deletions(-)

diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 1bd9a6ea52c..a31d7621c3b 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -3288,8 +3288,7 @@ neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
 	 has only a single initial value, so that value is neutral for
 	 all statements.  */
       if (reduc_chain)
-	return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
-				      loop_preheader_edge (loop));
+	return vect_phi_initial_value (stmt_vinfo);
       return NULL_TREE;
 
     default:
@@ -4829,13 +4828,13 @@ get_initial_defs_for_reduction (vec_info *vinfo,
       /* Get the def before the loop.  In reduction chain we have only
 	 one initial value.  Else we have as many as PHIs in the group.  */
       if (reduc_chain)
-	op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
+	op = j != 0 ? neutral_op : vect_phi_initial_value (stmt_vinfo);
       else if (((vec_oprnds->length () + 1) * nunits
 		- number_of_places_left_in_vector >= group_size)
 	       && neutral_op)
 	op = neutral_op;
       else
-	op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
+	op = vect_phi_initial_value (stmt_vinfo);
 
       /* Create 'vect_ = {op0,op1,...,opn}'.  */
       number_of_places_left_in_vector--;
@@ -4906,9 +4905,7 @@ info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
     }
   else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
     {
-      edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
-      stmt_vec_info info
-	  = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
+      stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
       if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
 	stmt_info = info;
     }
@@ -5042,8 +5039,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
     {
       /* Get at the scalar def before the loop, that defines the initial value
 	 of the reduction variable.  */
-      initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
-					   loop_preheader_edge (loop));
+      initial_def = vect_phi_initial_value (reduc_def_stmt);
       /* Optimize: for induction condition reduction, if we can't use zero
          for induc_val, use initial_def.  */
       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
@@ -5558,9 +5554,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	     for MIN and MAX reduction, for example.  */
 	  if (!neutral_op)
 	    {
-	      tree scalar_value
-		= PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
-					 loop_preheader_edge (loop));
+	      tree scalar_value = vect_phi_initial_value (orig_phis[i]);
 	      scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
 					     scalar_value);
 	      vector_identity = gimple_build_vector_from_val (&seq, vectype,
@@ -6752,10 +6746,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
       else if (cond_reduc_dt == vect_constant_def)
 	{
 	  enum vect_def_type cond_initial_dt;
-	  tree cond_initial_val
-	    = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
-
-	  gcc_assert (cond_reduc_val != NULL_TREE);
+	  tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
 	  vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
 	  if (cond_initial_dt == vect_constant_def
 	      && types_compatible_p (TREE_TYPE (cond_initial_val),
@@ -7528,8 +7519,7 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
     {
       /* Get at the scalar def before the loop, that defines the initial
 	 value of the reduction variable.  */
-      tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
-						loop_preheader_edge (loop));
+      tree initial_def = vect_phi_initial_value (phi);
       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
 	 and we can't use zero for induc_val, use initial_def.  Similarly
 	 for REDUC_MIN and initial_def larger than the base.  */
@@ -8175,8 +8165,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
       return true;
     }
 
-  init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
-				     loop_preheader_edge (iv_loop));
+  init_expr = vect_phi_initial_value (phi);
 
   gimple_seq stmts = NULL;
   if (!nested_in_vect_loop)
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index fa28336d429..e2fd3609fee 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -27,7 +27,7 @@ typedef class _stmt_vec_info *stmt_vec_info;
 #include "tree-hash-traits.h"
 #include "target.h"
 #include "internal-fn.h"
-
+#include "tree-ssa-operands.h"
 
 /* Used for naming of new temporaries.  */
 enum vect_var_kind {
@@ -1369,6 +1369,25 @@ nested_in_vect_loop_p (class loop *loop, stmt_vec_info stmt_info)
 	  && (loop->inner == (gimple_bb (stmt_info->stmt))->loop_father));
 }
 
+/* PHI is either a scalar reduction phi or a scalar induction phi.
+   Return the initial value of the variable on entry to the containing
+   loop.  */
+
+static inline tree
+vect_phi_initial_value (gphi *phi)
+{
+  basic_block bb = gimple_bb (phi);
+  edge pe = loop_preheader_edge (bb->loop_father);
+  gcc_assert (pe->dest == bb);
+  return PHI_ARG_DEF_FROM_EDGE (phi, pe);
+}
+
+static inline tree
+vect_phi_initial_value (stmt_vec_info stmt_info)
+{
+  return vect_phi_initial_value (as_a <gphi *> (stmt_info->stmt));
+}
+
 /* Return true if STMT_INFO should produce a vector mask type rather than
    a normal nonmask type.  */
 

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 05/10] vect: Add a vect_phi_initial_value helper function
  2021-07-08 12:40 ` [PATCH 05/10] vect: Add a vect_phi_initial_value helper function Richard Sandiford
@ 2021-07-08 13:05   ` Richard Biener
  2021-07-08 13:12     ` Richard Sandiford
  0 siblings, 1 reply; 30+ messages in thread
From: Richard Biener @ 2021-07-08 13:05 UTC (permalink / raw)
  To: Richard Sandiford, GCC Patches

On Thu, Jul 8, 2021 at 2:45 PM Richard Sandiford via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> This patch adds a helper function called vect_phi_initial_value
> for returning the incoming value of a given loop phi.  The main
> reason for adding it is to ensure that the right preheader edge
> is used when vectorising nested loops.  (PHI_ARG_DEF_FROM_EDGE
> itself doesn't assert that the given edge is for the right block,
> although I guess that would be good to add separately.)

We were sometimes (most of the time?) using an explicit
loop where you now get it from the PHI - that makes the
assert somewhat pointless, although of course it
makes sense on its own that the loop is the same as that
of the PHI def.  I just wonder if you think any of the existing
code might have been wrong?  If so, the new assert doesn't
catch all originally wrong cases.

Otherwise OK,
Richard.

> gcc/
>         * tree-vectorizer.h: Include tree-ssa-operands.h.
>         (vect_phi_initial_value): New function.
>         * tree-vect-loop.c (neutral_op_for_slp_reduction): Use it.
>         (get_initial_defs_for_reduction, info_for_reduction): Likewise.
>         (vect_create_epilog_for_reduction, vectorizable_reduction): Likewise.
>         (vect_transform_cycle_phi, vectorizable_induction): Likewise.
> ---
>  gcc/tree-vect-loop.c  | 29 +++++++++--------------------
>  gcc/tree-vectorizer.h | 21 ++++++++++++++++++++-
>  2 files changed, 29 insertions(+), 21 deletions(-)
>
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index 1bd9a6ea52c..a31d7621c3b 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -3288,8 +3288,7 @@ neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
>          has only a single initial value, so that value is neutral for
>          all statements.  */
>        if (reduc_chain)
> -       return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
> -                                     loop_preheader_edge (loop));
> +       return vect_phi_initial_value (stmt_vinfo);
>        return NULL_TREE;
>
>      default:
> @@ -4829,13 +4828,13 @@ get_initial_defs_for_reduction (vec_info *vinfo,
>        /* Get the def before the loop.  In reduction chain we have only
>          one initial value.  Else we have as many as PHIs in the group.  */
>        if (reduc_chain)
> -       op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
> +       op = j != 0 ? neutral_op : vect_phi_initial_value (stmt_vinfo);
>        else if (((vec_oprnds->length () + 1) * nunits
>                 - number_of_places_left_in_vector >= group_size)
>                && neutral_op)
>         op = neutral_op;
>        else
> -       op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
> +       op = vect_phi_initial_value (stmt_vinfo);
>
>        /* Create 'vect_ = {op0,op1,...,opn}'.  */
>        number_of_places_left_in_vector--;
> @@ -4906,9 +4905,7 @@ info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
>      }
>    else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
>      {
> -      edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
> -      stmt_vec_info info
> -         = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
> +      stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
>        if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
>         stmt_info = info;
>      }
> @@ -5042,8 +5039,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>      {
>        /* Get at the scalar def before the loop, that defines the initial value
>          of the reduction variable.  */
> -      initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
> -                                          loop_preheader_edge (loop));
> +      initial_def = vect_phi_initial_value (reduc_def_stmt);
>        /* Optimize: for induction condition reduction, if we can't use zero
>           for induc_val, use initial_def.  */
>        if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
> @@ -5558,9 +5554,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>              for MIN and MAX reduction, for example.  */
>           if (!neutral_op)
>             {
> -             tree scalar_value
> -               = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
> -                                        loop_preheader_edge (loop));
> +             tree scalar_value = vect_phi_initial_value (orig_phis[i]);
>               scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
>                                              scalar_value);
>               vector_identity = gimple_build_vector_from_val (&seq, vectype,
> @@ -6752,10 +6746,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>        else if (cond_reduc_dt == vect_constant_def)
>         {
>           enum vect_def_type cond_initial_dt;
> -         tree cond_initial_val
> -           = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
> -
> -         gcc_assert (cond_reduc_val != NULL_TREE);
> +         tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
>           vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
>           if (cond_initial_dt == vect_constant_def
>               && types_compatible_p (TREE_TYPE (cond_initial_val),
> @@ -7528,8 +7519,7 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
>      {
>        /* Get at the scalar def before the loop, that defines the initial
>          value of the reduction variable.  */
> -      tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
> -                                               loop_preheader_edge (loop));
> +      tree initial_def = vect_phi_initial_value (phi);
>        /* Optimize: if initial_def is for REDUC_MAX smaller than the base
>          and we can't use zero for induc_val, use initial_def.  Similarly
>          for REDUC_MIN and initial_def larger than the base.  */
> @@ -8175,8 +8165,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>        return true;
>      }
>
> -  init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
> -                                    loop_preheader_edge (iv_loop));
> +  init_expr = vect_phi_initial_value (phi);
>
>    gimple_seq stmts = NULL;
>    if (!nested_in_vect_loop)
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index fa28336d429..e2fd3609fee 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -27,7 +27,7 @@ typedef class _stmt_vec_info *stmt_vec_info;
>  #include "tree-hash-traits.h"
>  #include "target.h"
>  #include "internal-fn.h"
> -
> +#include "tree-ssa-operands.h"
>
>  /* Used for naming of new temporaries.  */
>  enum vect_var_kind {
> @@ -1369,6 +1369,25 @@ nested_in_vect_loop_p (class loop *loop, stmt_vec_info stmt_info)
>           && (loop->inner == (gimple_bb (stmt_info->stmt))->loop_father));
>  }
>
> +/* PHI is either a scalar reduction phi or a scalar induction phi.
> +   Return the initial value of the variable on entry to the containing
> +   loop.  */
> +
> +static inline tree
> +vect_phi_initial_value (gphi *phi)
> +{
> +  basic_block bb = gimple_bb (phi);
> +  edge pe = loop_preheader_edge (bb->loop_father);
> +  gcc_assert (pe->dest == bb);
> +  return PHI_ARG_DEF_FROM_EDGE (phi, pe);
> +}
> +
> +static inline tree
> +vect_phi_initial_value (stmt_vec_info stmt_info)
> +{
> +  return vect_phi_initial_value (as_a <gphi *> (stmt_info->stmt));
> +}
> +
>  /* Return true if STMT_INFO should produce a vector mask type rather than
>     a normal nonmask type.  */
>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 05/10] vect: Add a vect_phi_initial_value helper function
  2021-07-08 13:05   ` Richard Biener
@ 2021-07-08 13:12     ` Richard Sandiford
  0 siblings, 0 replies; 30+ messages in thread
From: Richard Sandiford @ 2021-07-08 13:12 UTC (permalink / raw)
  To: Richard Biener; +Cc: GCC Patches

Richard Biener <richard.guenther@gmail.com> writes:
> On Thu, Jul 8, 2021 at 2:45 PM Richard Sandiford via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>>
>> This patch adds a helper function called vect_phi_initial_value
>> for returning the incoming value of a given loop phi.  The main
>> reason for adding it is to ensure that the right preheader edge
>> is used when vectorising nested loops.  (PHI_ARG_DEF_FROM_EDGE
>> itself doesn't assert that the given edge is for the right block,
>> although I guess that would be good to add separately.)
>
> We were sometimes (most of the time?) using an explicit
> loop where you now get it from the PHI - that makes the
> assert somewhat pointless to some extent - of course it
> makes sense on its own that the loop is the same as that
> of the PHI def.  I just wonder if you think any of the existing
> code might have been wrong?  If so the new assert doesn't
> catch all originally wrong cases.

I don't remember seeing a case where the existing code got it wrong,
but I think one of the patches in the series did initially use the
wrong loop's preheader.

But yeah, the function and assert only help to avoid using
PHI_ARG_DEF_FROM_EDGE with the wrong edge.  If the problem was instead
passing the wrong phi then the patch doesn't help to catch that.

The edge mistake is more likely to be a silent failure though,
since the edge indices for both loops might happen to be the same
(but might not).

Thanks,
Richard

>
> Otherwise OK,
> Richard.
>
>> gcc/
>>         * tree-vectorizer.h: Include tree-ssa-operands.h.
>>         (vect_phi_initial_value): New function.
>>         * tree-vect-loop.c (neutral_op_for_slp_reduction): Use it.
>>         (get_initial_defs_for_reduction, info_for_reduction): Likewise.
>>         (vect_create_epilog_for_reduction, vectorizable_reduction): Likewise.
>>         (vect_transform_cycle_phi, vectorizable_induction): Likewise.
>> ---
>>  gcc/tree-vect-loop.c  | 29 +++++++++--------------------
>>  gcc/tree-vectorizer.h | 21 ++++++++++++++++++++-
>>  2 files changed, 29 insertions(+), 21 deletions(-)
>>
>> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
>> index 1bd9a6ea52c..a31d7621c3b 100644
>> --- a/gcc/tree-vect-loop.c
>> +++ b/gcc/tree-vect-loop.c
>> @@ -3288,8 +3288,7 @@ neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
>>          has only a single initial value, so that value is neutral for
>>          all statements.  */
>>        if (reduc_chain)
>> -       return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
>> -                                     loop_preheader_edge (loop));
>> +       return vect_phi_initial_value (stmt_vinfo);
>>        return NULL_TREE;
>>
>>      default:
>> @@ -4829,13 +4828,13 @@ get_initial_defs_for_reduction (vec_info *vinfo,
>>        /* Get the def before the loop.  In reduction chain we have only
>>          one initial value.  Else we have as many as PHIs in the group.  */
>>        if (reduc_chain)
>> -       op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
>> +       op = j != 0 ? neutral_op : vect_phi_initial_value (stmt_vinfo);
>>        else if (((vec_oprnds->length () + 1) * nunits
>>                 - number_of_places_left_in_vector >= group_size)
>>                && neutral_op)
>>         op = neutral_op;
>>        else
>> -       op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
>> +       op = vect_phi_initial_value (stmt_vinfo);
>>
>>        /* Create 'vect_ = {op0,op1,...,opn}'.  */
>>        number_of_places_left_in_vector--;
>> @@ -4906,9 +4905,7 @@ info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
>>      }
>>    else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
>>      {
>> -      edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
>> -      stmt_vec_info info
>> -         = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
>> +      stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
>>        if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
>>         stmt_info = info;
>>      }
>> @@ -5042,8 +5039,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>>      {
>>        /* Get at the scalar def before the loop, that defines the initial value
>>          of the reduction variable.  */
>> -      initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
>> -                                          loop_preheader_edge (loop));
>> +      initial_def = vect_phi_initial_value (reduc_def_stmt);
>>        /* Optimize: for induction condition reduction, if we can't use zero
>>           for induc_val, use initial_def.  */
>>        if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
>> @@ -5558,9 +5554,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>>              for MIN and MAX reduction, for example.  */
>>           if (!neutral_op)
>>             {
>> -             tree scalar_value
>> -               = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
>> -                                        loop_preheader_edge (loop));
>> +             tree scalar_value = vect_phi_initial_value (orig_phis[i]);
>>               scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
>>                                              scalar_value);
>>               vector_identity = gimple_build_vector_from_val (&seq, vectype,
>> @@ -6752,10 +6746,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>>        else if (cond_reduc_dt == vect_constant_def)
>>         {
>>           enum vect_def_type cond_initial_dt;
>> -         tree cond_initial_val
>> -           = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
>> -
>> -         gcc_assert (cond_reduc_val != NULL_TREE);
>> +         tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
>>           vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
>>           if (cond_initial_dt == vect_constant_def
>>               && types_compatible_p (TREE_TYPE (cond_initial_val),
>> @@ -7528,8 +7519,7 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
>>      {
>>        /* Get at the scalar def before the loop, that defines the initial
>>          value of the reduction variable.  */
>> -      tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
>> -                                               loop_preheader_edge (loop));
>> +      tree initial_def = vect_phi_initial_value (phi);
>>        /* Optimize: if initial_def is for REDUC_MAX smaller than the base
>>          and we can't use zero for induc_val, use initial_def.  Similarly
>>          for REDUC_MIN and initial_def larger than the base.  */
>> @@ -8175,8 +8165,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>>        return true;
>>      }
>>
>> -  init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
>> -                                    loop_preheader_edge (iv_loop));
>> +  init_expr = vect_phi_initial_value (phi);
>>
>>    gimple_seq stmts = NULL;
>>    if (!nested_in_vect_loop)
>> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
>> index fa28336d429..e2fd3609fee 100644
>> --- a/gcc/tree-vectorizer.h
>> +++ b/gcc/tree-vectorizer.h
>> @@ -27,7 +27,7 @@ typedef class _stmt_vec_info *stmt_vec_info;
>>  #include "tree-hash-traits.h"
>>  #include "target.h"
>>  #include "internal-fn.h"
>> -
>> +#include "tree-ssa-operands.h"
>>
>>  /* Used for naming of new temporaries.  */
>>  enum vect_var_kind {
>> @@ -1369,6 +1369,25 @@ nested_in_vect_loop_p (class loop *loop, stmt_vec_info stmt_info)
>>           && (loop->inner == (gimple_bb (stmt_info->stmt))->loop_father));
>>  }
>>
>> +/* PHI is either a scalar reduction phi or a scalar induction phi.
>> +   Return the initial value of the variable on entry to the containing
>> +   loop.  */
>> +
>> +static inline tree
>> +vect_phi_initial_value (gphi *phi)
>> +{
>> +  basic_block bb = gimple_bb (phi);
>> +  edge pe = loop_preheader_edge (bb->loop_father);
>> +  gcc_assert (pe->dest == bb);
>> +  return PHI_ARG_DEF_FROM_EDGE (phi, pe);
>> +}
>> +
>> +static inline tree
>> +vect_phi_initial_value (stmt_vec_info stmt_info)
>> +{
>> +  return vect_phi_initial_value (as_a <gphi *> (stmt_info->stmt));
>> +}
>> +
>>  /* Return true if STMT_INFO should produce a vector mask type rather than
>>     a normal nonmask type.  */
>>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH 06/10] vect: Pass reduc_info to get_initial_defs_for_reduction
  2021-07-08 12:38 [PATCH 00/10] vect: Reuse reduction accumulators between loops Richard Sandiford
                   ` (4 preceding siblings ...)
  2021-07-08 12:40 ` [PATCH 05/10] vect: Add a vect_phi_initial_value helper function Richard Sandiford
@ 2021-07-08 12:40 ` Richard Sandiford
  2021-07-08 13:10   ` Richard Biener
  2021-07-08 12:41 ` [PATCH 07/10] vect: Pass reduc_info to get_initial_def_for_reduction Richard Sandiford
                   ` (4 subsequent siblings)
  10 siblings, 1 reply; 30+ messages in thread
From: Richard Sandiford @ 2021-07-08 12:40 UTC (permalink / raw)
  To: gcc-patches

This patch passes the reduc_info to get_initial_defs_for_reduction,
so that the function can get general information from there rather
than from the first SLP statement.  This isn't a win on its own,
but it becomes important with later patches.

gcc/
	* tree-vect-loop.c (get_initial_defs_for_reduction): Take the
	reduc_info as an additional parameter.
	(vect_transform_cycle_phi): Update accordingly.
---
 gcc/tree-vect-loop.c | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index a31d7621c3b..565c2859477 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -4764,32 +4764,28 @@ get_initial_def_for_reduction (loop_vec_info loop_vinfo,
   return init_def;
 }
 
-/* Get at the initial defs for the reduction PHIs in SLP_NODE.
-   NUMBER_OF_VECTORS is the number of vector defs to create.
-   If NEUTRAL_OP is nonnull, introducing extra elements of that
-   value will not change the result.  */
+/* Get at the initial defs for the reduction PHIs for REDUC_INFO, whose
+   associated SLP node is SLP_NODE.  NUMBER_OF_VECTORS is the number of vector
+   defs to create.  If NEUTRAL_OP is nonnull, introducing extra elements of
+   that value will not change the result.  */
 
 static void
 get_initial_defs_for_reduction (vec_info *vinfo,
+				stmt_vec_info reduc_info,
 				slp_tree slp_node,
 				vec<tree> *vec_oprnds,
 				unsigned int number_of_vectors,
 				bool reduc_chain, tree neutral_op)
 {
   vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
-  stmt_vec_info stmt_vinfo = stmts[0];
   unsigned HOST_WIDE_INT nunits;
   unsigned j, number_of_places_left_in_vector;
-  tree vector_type;
+  tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
   unsigned int group_size = stmts.length ();
   unsigned int i;
   class loop *loop;
 
-  vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
-
-  gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
-
-  loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
+  loop = (gimple_bb (reduc_info->stmt))->loop_father;
   gcc_assert (loop);
   edge pe = loop_preheader_edge (loop);
 
@@ -4823,7 +4819,7 @@ get_initial_defs_for_reduction (vec_info *vinfo,
     {
       tree op;
       i = j % group_size;
-      stmt_vinfo = stmts[i];
+      stmt_vec_info stmt_vinfo = stmts[i];
 
       /* Get the def before the loop.  In reduction chain we have only
 	 one initial value.  Else we have as many as PHIs in the group.  */
@@ -7510,7 +7506,8 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
 	      = neutral_op_for_slp_reduction (slp_node, vectype_out,
 					      STMT_VINFO_REDUC_CODE (reduc_info),
 					      first != NULL);
-	  get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
+	  get_initial_defs_for_reduction (loop_vinfo, reduc_info,
+					  slp_node_instance->reduc_phis,
 					  &vec_initial_defs, vec_num,
 					  first != NULL, neutral_op);
 	}

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 06/10] vect: Pass reduc_info to get_initial_defs_for_reduction
  2021-07-08 12:40 ` [PATCH 06/10] vect: Pass reduc_info to get_initial_defs_for_reduction Richard Sandiford
@ 2021-07-08 13:10   ` Richard Biener
  2021-07-08 16:48     ` Richard Sandiford
  0 siblings, 1 reply; 30+ messages in thread
From: Richard Biener @ 2021-07-08 13:10 UTC (permalink / raw)
  To: Richard Sandiford, GCC Patches

On Thu, Jul 8, 2021 at 2:46 PM Richard Sandiford via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> This patch passes the reduc_info to get_initial_defs_for_reduction,
> so that the function can get general information from there rather
> than from the first SLP statement.  This isn't a win on its own,
> but it becomes important with later patches.

So the original code should have used SLP_TREE_REPRESENTATIVE
instead of SLP_TREE_SCALAR_STMTS ()[0] (there might have been
issues with doing that - my recollection is weak here).

I'm not sure if reduc_info is actually better - only the representative
will have STMT_VINFO_VECTYPE set; for the reduc_info
there's STMT_VINFO_REDUC_VECTYPE (and STMT_VINFO_REDUC_VECTYPE_IN).

So I think if you want to use reduc_info then you want to use
STMT_VINFO_REDUC_VECTYPE?

> gcc/
>         * tree-vect-loop.c (get_initial_defs_for_reduction): Take the
>         reduc_info as an additional parameter.
>         (vect_transform_cycle_phi): Update accordingly.
> ---
>  gcc/tree-vect-loop.c | 23 ++++++++++-------------
>  1 file changed, 10 insertions(+), 13 deletions(-)
>
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index a31d7621c3b..565c2859477 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -4764,32 +4764,28 @@ get_initial_def_for_reduction (loop_vec_info loop_vinfo,
>    return init_def;
>  }
>
> -/* Get at the initial defs for the reduction PHIs in SLP_NODE.
> -   NUMBER_OF_VECTORS is the number of vector defs to create.
> -   If NEUTRAL_OP is nonnull, introducing extra elements of that
> -   value will not change the result.  */
> +/* Get at the initial defs for the reduction PHIs for REDUC_INFO, whose
> +   associated SLP node is SLP_NODE.  NUMBER_OF_VECTORS is the number of vector
> +   defs to create.  If NEUTRAL_OP is nonnull, introducing extra elements of
> +   that value will not change the result.  */
>
>  static void
>  get_initial_defs_for_reduction (vec_info *vinfo,
> +                               stmt_vec_info reduc_info,
>                                 slp_tree slp_node,
>                                 vec<tree> *vec_oprnds,
>                                 unsigned int number_of_vectors,
>                                 bool reduc_chain, tree neutral_op)
>  {
>    vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
> -  stmt_vec_info stmt_vinfo = stmts[0];
>    unsigned HOST_WIDE_INT nunits;
>    unsigned j, number_of_places_left_in_vector;
> -  tree vector_type;
> +  tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
>    unsigned int group_size = stmts.length ();
>    unsigned int i;
>    class loop *loop;
>
> -  vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
> -
> -  gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
> -
> -  loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
> +  loop = (gimple_bb (reduc_info->stmt))->loop_father;
>    gcc_assert (loop);
>    edge pe = loop_preheader_edge (loop);
>
> @@ -4823,7 +4819,7 @@ get_initial_defs_for_reduction (vec_info *vinfo,
>      {
>        tree op;
>        i = j % group_size;
> -      stmt_vinfo = stmts[i];
> +      stmt_vec_info stmt_vinfo = stmts[i];
>
>        /* Get the def before the loop.  In reduction chain we have only
>          one initial value.  Else we have as many as PHIs in the group.  */
> @@ -7510,7 +7506,8 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
>               = neutral_op_for_slp_reduction (slp_node, vectype_out,
>                                               STMT_VINFO_REDUC_CODE (reduc_info),
>                                               first != NULL);
> -         get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
> +         get_initial_defs_for_reduction (loop_vinfo, reduc_info,
> +                                         slp_node_instance->reduc_phis,
>                                           &vec_initial_defs, vec_num,
>                                           first != NULL, neutral_op);
>         }

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 06/10] vect: Pass reduc_info to get_initial_defs_for_reduction
  2021-07-08 13:10   ` Richard Biener
@ 2021-07-08 16:48     ` Richard Sandiford
  2021-07-09 11:33       ` Richard Biener
  0 siblings, 1 reply; 30+ messages in thread
From: Richard Sandiford @ 2021-07-08 16:48 UTC (permalink / raw)
  To: Richard Biener; +Cc: GCC Patches

Richard Biener <richard.guenther@gmail.com> writes:
> On Thu, Jul 8, 2021 at 2:46 PM Richard Sandiford via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>>
>> This patch passes the reduc_info to get_initial_defs_for_reduction,
>> so that the function can get general information from there rather
>> than from the first SLP statement.  This isn't a win on its own,
>> but it becomes important with later patches.
>
> So the original code should have used SLP_TREE_REPRESENTATIVE
> instead of SLP_TREE_SCALAR_STMTS ()[0] (there might have been
> issues with doing that - my recollection is weak here).
>
> I'm not sure if reduc_info is actually better - only the representative
> will have STMT_VINFO_VECTYPE set, for the reduc_info
> there's STMT_VINFO_REDUC_VECTYPE (and STMT_VINFO_REDUC_VECTYPE_IN).
>
> So I think if you want to use reduc_info then you want to use
> STMT_VINFO_REDUC_VECTYPE?

I guess I'm a bit fuzzy on the details, but AIUI STMT_VINFO_REDUC_VECTYPE
is the type that we do the arithmetic in, which might be different from
the types of the phis.  Is that right?

In this context we want the types of the phis, since the routine is
providing the initial values.  Using STMT_VINFO_REDUC_VECTYPE gives
things like:

-----------------------------------------------------------------------
gcc.dg/torture/pr92345.c:8:1: error: incompatible types in 'PHI' argument 1
vector(4) int

vector(4) unsigned int

vect_fr_lsm.11_58 = PHI <vect__7.14_64(6), { 0, 0, 0, 0 }(10)>
-----------------------------------------------------------------------

Thanks,
Richard

>
>> gcc/
>>         * tree-vect-loop.c (get_initial_defs_for_reduction): Take the
>>         reduc_info as an additional parameter.
>>         (vect_transform_cycle_phi): Update accordingly.
>> ---
>>  gcc/tree-vect-loop.c | 23 ++++++++++-------------
>>  1 file changed, 10 insertions(+), 13 deletions(-)
>>
>> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
>> index a31d7621c3b..565c2859477 100644
>> --- a/gcc/tree-vect-loop.c
>> +++ b/gcc/tree-vect-loop.c
>> @@ -4764,32 +4764,28 @@ get_initial_def_for_reduction (loop_vec_info loop_vinfo,
>>    return init_def;
>>  }
>>
>> -/* Get at the initial defs for the reduction PHIs in SLP_NODE.
>> -   NUMBER_OF_VECTORS is the number of vector defs to create.
>> -   If NEUTRAL_OP is nonnull, introducing extra elements of that
>> -   value will not change the result.  */
>> +/* Get at the initial defs for the reduction PHIs for REDUC_INFO, whose
>> +   associated SLP node is SLP_NODE.  NUMBER_OF_VECTORS is the number of vector
>> +   defs to create.  If NEUTRAL_OP is nonnull, introducing extra elements of
>> +   that value will not change the result.  */
>>
>>  static void
>>  get_initial_defs_for_reduction (vec_info *vinfo,
>> +                               stmt_vec_info reduc_info,
>>                                 slp_tree slp_node,
>>                                 vec<tree> *vec_oprnds,
>>                                 unsigned int number_of_vectors,
>>                                 bool reduc_chain, tree neutral_op)
>>  {
>>    vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
>> -  stmt_vec_info stmt_vinfo = stmts[0];
>>    unsigned HOST_WIDE_INT nunits;
>>    unsigned j, number_of_places_left_in_vector;
>> -  tree vector_type;
>> +  tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
>>    unsigned int group_size = stmts.length ();
>>    unsigned int i;
>>    class loop *loop;
>>
>> -  vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
>> -
>> -  gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
>> -
>> -  loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
>> +  loop = (gimple_bb (reduc_info->stmt))->loop_father;
>>    gcc_assert (loop);
>>    edge pe = loop_preheader_edge (loop);
>>
>> @@ -4823,7 +4819,7 @@ get_initial_defs_for_reduction (vec_info *vinfo,
>>      {
>>        tree op;
>>        i = j % group_size;
>> -      stmt_vinfo = stmts[i];
>> +      stmt_vec_info stmt_vinfo = stmts[i];
>>
>>        /* Get the def before the loop.  In reduction chain we have only
>>          one initial value.  Else we have as many as PHIs in the group.  */
>> @@ -7510,7 +7506,8 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
>>               = neutral_op_for_slp_reduction (slp_node, vectype_out,
>>                                               STMT_VINFO_REDUC_CODE (reduc_info),
>>                                               first != NULL);
>> -         get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
>> +         get_initial_defs_for_reduction (loop_vinfo, reduc_info,
>> +                                         slp_node_instance->reduc_phis,
>>                                           &vec_initial_defs, vec_num,
>>                                           first != NULL, neutral_op);
>>         }

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 06/10] vect: Pass reduc_info to get_initial_defs_for_reduction
  2021-07-08 16:48     ` Richard Sandiford
@ 2021-07-09 11:33       ` Richard Biener
  0 siblings, 0 replies; 30+ messages in thread
From: Richard Biener @ 2021-07-09 11:33 UTC (permalink / raw)
  To: Richard Biener, GCC Patches, Richard Sandiford

On Thu, Jul 8, 2021 at 6:48 PM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Richard Biener <richard.guenther@gmail.com> writes:
> > On Thu, Jul 8, 2021 at 2:46 PM Richard Sandiford via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> >>
> >> This patch passes the reduc_info to get_initial_defs_for_reduction,
> >> so that the function can get general information from there rather
> >> than from the first SLP statement.  This isn't a win on its own,
> >> but it becomes important with later patches.
> >
> > So the original code should have used SLP_TREE_REPRESENTATIVE
> > instead of SLP_TREE_SCALAR_STMTS ()[0] (there might have been
> > issues with doing that - my recollection is weak here).
> >
> > I'm not sure if reduc_info is actually better - only the representative
> > will have STMT_VINFO_VECTYPE set, for the reduc_info
> > there's STMT_VINFO_REDUC_VECTYPE (and STMT_VINFO_REDUC_VECTYPE_IN).
> >
> > So I think if you want to use reduc_info then you want to use
> > STMT_VINFO_REDUC_VECTYPE?
>
> I guess I'm a bit fuzzy on the details, but AIUI STMT_VINFO_REDUC_VECTYPE
> is the type that we do the arithmetic in, which might be different from
> the types of the phis.  Is that right?

Hmm, yeah (my recollection is fuzzy as well here...).

> In this context we want the types of the phis, since the routine is
> providing the initial values.  Using STMT_VINFO_REDUC_VECTYPE gives
> things like:

OK, I see.  So there's the reduc_info vs. SLP_TREE_REPRESENTATIVE issue
left.  At least I don't see that we reliably set STMT_VINFO_VECTYPE on
all scalar PHIs of a SLP reduction.  The reduc_info happens to be one of the
PHI stmt_infos (but that's an implementation detail as well).

The reduction SLP instance has the reduc_phis member to get at the
PHIs' vector type (via SLP_TREE_VECTYPE).  I think we don't have
anything explicit that's good here, but I notice that
vect_create_epilog_for_reduction uses STMT_VINFO_VECTYPE (reduc_info)
as well.

So I guess the patch is OK as-is.

Thanks,
Richard.


> -----------------------------------------------------------------------
> gcc.dg/torture/pr92345.c:8:1: error: incompatible types in 'PHI' argument 1
> vector(4) int
>
> vector(4) unsigned int
>
> vect_fr_lsm.11_58 = PHI <vect__7.14_64(6), { 0, 0, 0, 0 }(10)>
> -----------------------------------------------------------------------
>
> Thanks,
> Richard
>
> >
> >> gcc/
> >>         * tree-vect-loop.c (get_initial_defs_for_reduction): Take the
> >>         reduc_info as an additional parameter.
> >>         (vect_transform_cycle_phi): Update accordingly.
> >> ---
> >>  gcc/tree-vect-loop.c | 23 ++++++++++-------------
> >>  1 file changed, 10 insertions(+), 13 deletions(-)
> >>
> >> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> >> index a31d7621c3b..565c2859477 100644
> >> --- a/gcc/tree-vect-loop.c
> >> +++ b/gcc/tree-vect-loop.c
> >> @@ -4764,32 +4764,28 @@ get_initial_def_for_reduction (loop_vec_info loop_vinfo,
> >>    return init_def;
> >>  }
> >>
> >> -/* Get at the initial defs for the reduction PHIs in SLP_NODE.
> >> -   NUMBER_OF_VECTORS is the number of vector defs to create.
> >> -   If NEUTRAL_OP is nonnull, introducing extra elements of that
> >> -   value will not change the result.  */
> >> +/* Get at the initial defs for the reduction PHIs for REDUC_INFO, whose
> >> +   associated SLP node is SLP_NODE.  NUMBER_OF_VECTORS is the number of vector
> >> +   defs to create.  If NEUTRAL_OP is nonnull, introducing extra elements of
> >> +   that value will not change the result.  */
> >>
> >>  static void
> >>  get_initial_defs_for_reduction (vec_info *vinfo,
> >> +                               stmt_vec_info reduc_info,
> >>                                 slp_tree slp_node,
> >>                                 vec<tree> *vec_oprnds,
> >>                                 unsigned int number_of_vectors,
> >>                                 bool reduc_chain, tree neutral_op)
> >>  {
> >>    vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
> >> -  stmt_vec_info stmt_vinfo = stmts[0];
> >>    unsigned HOST_WIDE_INT nunits;
> >>    unsigned j, number_of_places_left_in_vector;
> >> -  tree vector_type;
> >> +  tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
> >>    unsigned int group_size = stmts.length ();
> >>    unsigned int i;
> >>    class loop *loop;
> >>
> >> -  vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
> >> -
> >> -  gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
> >> -
> >> -  loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
> >> +  loop = (gimple_bb (reduc_info->stmt))->loop_father;
> >>    gcc_assert (loop);
> >>    edge pe = loop_preheader_edge (loop);
> >>
> >> @@ -4823,7 +4819,7 @@ get_initial_defs_for_reduction (vec_info *vinfo,
> >>      {
> >>        tree op;
> >>        i = j % group_size;
> >> -      stmt_vinfo = stmts[i];
> >> +      stmt_vec_info stmt_vinfo = stmts[i];
> >>
> >>        /* Get the def before the loop.  In reduction chain we have only
> >>          one initial value.  Else we have as many as PHIs in the group.  */
> >> @@ -7510,7 +7506,8 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
> >>               = neutral_op_for_slp_reduction (slp_node, vectype_out,
> >>                                               STMT_VINFO_REDUC_CODE (reduc_info),
> >>                                               first != NULL);
> >> -         get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
> >> +         get_initial_defs_for_reduction (loop_vinfo, reduc_info,
> >> +                                         slp_node_instance->reduc_phis,
> >>                                           &vec_initial_defs, vec_num,
> >>                                           first != NULL, neutral_op);
> >>         }

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH 07/10] vect: Pass reduc_info to get_initial_def_for_reduction
  2021-07-08 12:38 [PATCH 00/10] vect: Reuse reduction accumulators between loops Richard Sandiford
                   ` (5 preceding siblings ...)
  2021-07-08 12:40 ` [PATCH 06/10] vect: Pass reduc_info to get_initial_defs_for_reduction Richard Sandiford
@ 2021-07-08 12:41 ` Richard Sandiford
  2021-07-08 12:41 ` [PATCH 08/10] vect: Generalise neutral_op_for_slp_reduction Richard Sandiford
                   ` (3 subsequent siblings)
  10 siblings, 0 replies; 30+ messages in thread
From: Richard Sandiford @ 2021-07-08 12:41 UTC (permalink / raw)
  To: gcc-patches

Similarly to the previous patch, this one passes the reduc_info
to get_initial_def_for_reduction, rather than a stmt_vec_info that
lacks the metadata.  This again becomes useful later.

gcc/
	* tree-vect-loop.c (get_initial_def_for_reduction): Take the
	reduc_info instead of the original stmt_vec_info.
	(vect_transform_cycle_phi): Update accordingly.
---
 gcc/tree-vect-loop.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 565c2859477..a67036f92e0 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -4625,7 +4625,7 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
 /* Function get_initial_def_for_reduction
 
    Input:
-   STMT_VINFO - a stmt that performs a reduction operation in the loop.
+   REDUC_INFO - the info_for_reduction
    INIT_VAL - the initial value of the reduction variable
 
    Output:
@@ -4667,7 +4667,7 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
 
 static tree
 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
-			       stmt_vec_info stmt_vinfo,
+			       stmt_vec_info reduc_info,
 			       enum tree_code code, tree init_val,
                                tree *adjustment_def)
 {
@@ -4685,8 +4685,8 @@ get_initial_def_for_reduction (loop_vec_info loop_vinfo,
   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
 	      || SCALAR_FLOAT_TYPE_P (scalar_type));
 
-  gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
-	      || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
+  gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
+	      || loop == (gimple_bb (reduc_info->stmt))->loop_father);
 
   /* ADJUSTMENT_DEF is NULL when called from
      vect_create_epilog_for_reduction to vectorize double reduction.  */
@@ -7556,7 +7556,7 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
 	  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
 	    adjustment_defp = NULL;
 	  vec_initial_def
-	    = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
+	    = get_initial_def_for_reduction (loop_vinfo, reduc_info, code,
 					     initial_def, adjustment_defp);
 	  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
 	  vec_initial_defs.create (ncopies);

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH 08/10] vect: Generalise neutral_op_for_slp_reduction
  2021-07-08 12:38 [PATCH 00/10] vect: Reuse reduction accumulators between loops Richard Sandiford
                   ` (6 preceding siblings ...)
  2021-07-08 12:41 ` [PATCH 07/10] vect: Pass reduc_info to get_initial_def_for_reduction Richard Sandiford
@ 2021-07-08 12:41 ` Richard Sandiford
  2021-07-08 13:13   ` Richard Biener
  2021-07-08 12:41 ` [PATCH 09/10] vect: Simplify get_initial_def_for_reduction Richard Sandiford
                   ` (2 subsequent siblings)
  10 siblings, 1 reply; 30+ messages in thread
From: Richard Sandiford @ 2021-07-08 12:41 UTC (permalink / raw)
  To: gcc-patches

This patch generalises the interface to neutral_op_for_slp_reduction
so that it can be used for non-SLP reductions too.  This isn't much
of a win on its own, but it helps later patches.

gcc/
	* tree-vect-loop.c (neutral_op_for_slp_reduction): Replace with...
	(neutral_op_for_reduction): ...this, providing a more general
	interface.
	(vect_create_epilog_for_reduction): Update accordingly.
	(vectorizable_reduction): Likewise.
	(vect_transform_cycle_phi): Likewise.
---
 gcc/tree-vect-loop.c | 59 +++++++++++++++++++-------------------------
 1 file changed, 26 insertions(+), 33 deletions(-)

diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index a67036f92e0..744645d8bad 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -3248,23 +3248,15 @@ reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
     }
 }
 
-/* If there is a neutral value X such that SLP reduction NODE would not
-   be affected by the introduction of additional X elements, return that X,
-   otherwise return null.  CODE is the code of the reduction and VECTOR_TYPE
-   is the vector type that would hold element X.  REDUC_CHAIN is true if
-   the SLP statements perform a single reduction, false if each statement
-   performs an independent reduction.  */
+/* If there is a neutral value X such that a reduction would not be affected
+   by the introduction of additional X elements, return that X, otherwise
+   return null.  CODE is the code of the reduction and SCALAR_TYPE is the type
+   of the scalar elements.  If the reduction has just a single initial value
+   then INITIAL_VALUE is that value, otherwise it is null.  */
 
 static tree
-neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
-			      tree_code code, bool reduc_chain)
+neutral_op_for_reduction (tree scalar_type, tree_code code, tree initial_value)
 {
-  vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
-  stmt_vec_info stmt_vinfo = stmts[0];
-  tree scalar_type = TREE_TYPE (vector_type);
-  class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
-  gcc_assert (loop);
-
   switch (code)
     {
     case WIDEN_SUM_EXPR:
@@ -3284,12 +3276,7 @@ neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
 
     case MAX_EXPR:
     case MIN_EXPR:
-      /* For MIN/MAX the initial values are neutral.  A reduction chain
-	 has only a single initial value, so that value is neutral for
-	 all statements.  */
-      if (reduc_chain)
-	return vect_phi_initial_value (stmt_vinfo);
-      return NULL_TREE;
+      return initial_value;
 
     default:
       return NULL_TREE;
@@ -5535,10 +5522,11 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       tree neutral_op = NULL_TREE;
       if (slp_node)
 	{
-	  stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
-	  neutral_op
-	    = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
-					    vectype, code, first != NULL);
+	  tree initial_value = NULL_TREE;
+	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
+	    initial_value = vect_phi_initial_value (orig_phis[0]);
+	  neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
+						 initial_value);
 	}
       if (neutral_op)
 	vector_identity = gimple_build_vector_from_val (&seq, vectype,
@@ -6935,9 +6923,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   /* For SLP reductions, see if there is a neutral value we can use.  */
   tree neutral_op = NULL_TREE;
   if (slp_node)
-    neutral_op = neutral_op_for_slp_reduction
-      (slp_node_instance->reduc_phis, vectype_out, orig_code,
-       REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
+    {
+      tree initial_value = NULL_TREE;
+      if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
+	initial_value = vect_phi_initial_value (reduc_def_phi);
+      neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
+					     orig_code, initial_value);
+    }
 
   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
     {
@@ -7501,15 +7493,16 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
       else
 	{
 	  gcc_assert (slp_node == slp_node_instance->reduc_phis);
-	  stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
-	  tree neutral_op
-	      = neutral_op_for_slp_reduction (slp_node, vectype_out,
-					      STMT_VINFO_REDUC_CODE (reduc_info),
-					      first != NULL);
+	  tree initial_value = NULL_TREE;
+	  if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
+	    initial_value = vect_phi_initial_value (phi);
+	  tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
+	  tree neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
+						      code, initial_value);
 	  get_initial_defs_for_reduction (loop_vinfo, reduc_info,
 					  slp_node_instance->reduc_phis,
 					  &vec_initial_defs, vec_num,
-					  first != NULL, neutral_op);
+					  initial_value != NULL, neutral_op);
 	}
     }
   else

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 08/10] vect: Generalise neutral_op_for_slp_reduction
  2021-07-08 12:41 ` [PATCH 08/10] vect: Generalise neutral_op_for_slp_reduction Richard Sandiford
@ 2021-07-08 13:13   ` Richard Biener
  0 siblings, 0 replies; 30+ messages in thread
From: Richard Biener @ 2021-07-08 13:13 UTC (permalink / raw)
  To: Richard Sandiford, GCC Patches

On Thu, Jul 8, 2021 at 2:48 PM Richard Sandiford via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> This patch generalises the interface to neutral_op_for_slp_reduction
> so that it can be used for non-SLP reductions too.  This isn't much
> of a win on its own, but it helps later patches.

I guess that makes sense - OK.

Richard.

> gcc/
>         * tree-vect-loop.c (neutral_op_for_slp_reduction): Replace with...
>         (neutral_op_for_reduction): ...this, providing a more general
>         interface.
>         (vect_create_epilog_for_reduction): Update accordingly.
>         (vectorizable_reduction): Likewise.
>         (vect_transform_cycle_phi): Likewise.
> ---
>  gcc/tree-vect-loop.c | 59 +++++++++++++++++++-------------------------
>  1 file changed, 26 insertions(+), 33 deletions(-)
>
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index a67036f92e0..744645d8bad 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -3248,23 +3248,15 @@ reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
>      }
>  }
>
> -/* If there is a neutral value X such that SLP reduction NODE would not
> -   be affected by the introduction of additional X elements, return that X,
> -   otherwise return null.  CODE is the code of the reduction and VECTOR_TYPE
> -   is the vector type that would hold element X.  REDUC_CHAIN is true if
> -   the SLP statements perform a single reduction, false if each statement
> -   performs an independent reduction.  */
> +/* If there is a neutral value X such that a reduction would not be affected
> +   by the introduction of additional X elements, return that X, otherwise
> +   return null.  CODE is the code of the reduction and SCALAR_TYPE is the type
> +   of the scalar elements.  If the reduction has just a single initial value
> +   then INITIAL_VALUE is that value, otherwise it is null.  */
>
>  static tree
> -neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
> -                             tree_code code, bool reduc_chain)
> +neutral_op_for_reduction (tree scalar_type, tree_code code, tree initial_value)
>  {
> -  vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
> -  stmt_vec_info stmt_vinfo = stmts[0];
> -  tree scalar_type = TREE_TYPE (vector_type);
> -  class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
> -  gcc_assert (loop);
> -
>    switch (code)
>      {
>      case WIDEN_SUM_EXPR:
> @@ -3284,12 +3276,7 @@ neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
>
>      case MAX_EXPR:
>      case MIN_EXPR:
> -      /* For MIN/MAX the initial values are neutral.  A reduction chain
> -        has only a single initial value, so that value is neutral for
> -        all statements.  */
> -      if (reduc_chain)
> -       return vect_phi_initial_value (stmt_vinfo);
> -      return NULL_TREE;
> +      return initial_value;
>
>      default:
>        return NULL_TREE;
> @@ -5535,10 +5522,11 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>        tree neutral_op = NULL_TREE;
>        if (slp_node)
>         {
> -         stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
> -         neutral_op
> -           = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
> -                                           vectype, code, first != NULL);
> +         tree initial_value = NULL_TREE;
> +         if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
> +           initial_value = vect_phi_initial_value (orig_phis[0]);
> +         neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
> +                                                initial_value);
>         }
>        if (neutral_op)
>         vector_identity = gimple_build_vector_from_val (&seq, vectype,
> @@ -6935,9 +6923,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>    /* For SLP reductions, see if there is a neutral value we can use.  */
>    tree neutral_op = NULL_TREE;
>    if (slp_node)
> -    neutral_op = neutral_op_for_slp_reduction
> -      (slp_node_instance->reduc_phis, vectype_out, orig_code,
> -       REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
> +    {
> +      tree initial_value = NULL_TREE;
> +      if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
> +       initial_value = vect_phi_initial_value (reduc_def_phi);
> +      neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
> +                                            orig_code, initial_value);
> +    }
>
>    if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
>      {
> @@ -7501,15 +7493,16 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
>        else
>         {
>           gcc_assert (slp_node == slp_node_instance->reduc_phis);
> -         stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
> -         tree neutral_op
> -             = neutral_op_for_slp_reduction (slp_node, vectype_out,
> -                                             STMT_VINFO_REDUC_CODE (reduc_info),
> -                                             first != NULL);
> +         tree initial_value = NULL_TREE;
> +         if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
> +           initial_value = vect_phi_initial_value (phi);
> +         tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
> +         tree neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
> +                                                     code, initial_value);
>           get_initial_defs_for_reduction (loop_vinfo, reduc_info,
>                                           slp_node_instance->reduc_phis,
>                                           &vec_initial_defs, vec_num,
> -                                         first != NULL, neutral_op);
> +                                         initial_value != NULL, neutral_op);
>         }
>      }
>    else

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH 09/10] vect: Simplify get_initial_def_for_reduction
  2021-07-08 12:38 [PATCH 00/10] vect: Reuse reduction accumulators between loops Richard Sandiford
                   ` (7 preceding siblings ...)
  2021-07-08 12:41 ` [PATCH 08/10] vect: Generalise neutral_op_for_slp_reduction Richard Sandiford
@ 2021-07-08 12:41 ` Richard Sandiford
  2021-07-08 13:14   ` Richard Biener
  2021-07-08 12:43 ` [PATCH 10/10] vect: Reuse reduction accumulators between loops Richard Sandiford
  2021-07-10  2:11 ` [PATCH 00/10] " Kewen.Lin
  10 siblings, 1 reply; 30+ messages in thread
From: Richard Sandiford @ 2021-07-08 12:41 UTC (permalink / raw)
  To: gcc-patches

After previous patches, we can now easily provide the neutral op
as an argument to get_initial_def_for_reduction.  This in turn
allows the adjustment calculation to be moved outside of
get_initial_def_for_reduction, which is the main motivation
of the patch.

gcc/
	* tree-vect-loop.c (get_initial_def_for_reduction): Remove
	adjustment handling.  Take the neutral value as an argument,
	in place of the code argument.
	(vect_transform_cycle_phi): Update accordingly.  Handle the
	initial values of cond reductions separately from code reductions.
	Choose the adjustment here rather than in
	get_initial_def_for_reduction.  Sink the splat of vec_initial_def.
---
 gcc/tree-vect-loop.c | 177 +++++++++++++++----------------------------
 1 file changed, 59 insertions(+), 118 deletions(-)

diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 744645d8bad..fe7e73f655f 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -4614,57 +4614,26 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
    Input:
    REDUC_INFO - the info_for_reduction
    INIT_VAL - the initial value of the reduction variable
+   NEUTRAL_OP - a value that has no effect on the reduction, as per
+		neutral_op_for_reduction
 
    Output:
-   ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
-        of the reduction (used for adjusting the epilog - see below).
    Return a vector variable, initialized according to the operation that
 	STMT_VINFO performs. This vector will be used as the initial value
 	of the vector of partial results.
 
-   Option1 (adjust in epilog): Initialize the vector as follows:
-     add/bit or/xor:    [0,0,...,0,0]
-     mult/bit and:      [1,1,...,1,1]
-     min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
-   and when necessary (e.g. add/mult case) let the caller know
-   that it needs to adjust the result by init_val.
-
-   Option2: Initialize the vector as follows:
-     add/bit or/xor:    [init_val,0,0,...,0]
-     mult/bit and:      [init_val,1,1,...,1]
-     min/max/cond_expr: [init_val,init_val,...,init_val]
-   and no adjustments are needed.
-
-   For example, for the following code:
-
-   s = init_val;
-   for (i=0;i<n;i++)
-     s = s + a[i];
-
-   STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
-   For a vector of 4 units, we want to return either [0,0,0,init_val],
-   or [0,0,0,0] and let the caller know that it needs to adjust
-   the result at the end by 'init_val'.
-
-   FORNOW, we are using the 'adjust in epilog' scheme, because this way the
-   initialization vector is simpler (same element in all entries), if
-   ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
-
-   A cost model should help decide between these two schemes.  */
+   The value we need is a vector in which element 0 has value INIT_VAL
+   and every other element has value NEUTRAL_OP.  */
 
 static tree
 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
 			       stmt_vec_info reduc_info,
-			       enum tree_code code, tree init_val,
-                               tree *adjustment_def)
+			       tree init_val, tree neutral_op)
 {
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   tree scalar_type = TREE_TYPE (init_val);
   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
-  tree def_for_init;
   tree init_def;
-  REAL_VALUE_TYPE real_init_val = dconst0;
-  int int_init_val = 0;
   gimple_seq stmts = NULL;
 
   gcc_assert (vectype);
@@ -4675,75 +4644,34 @@ get_initial_def_for_reduction (loop_vec_info loop_vinfo,
   gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
 	      || loop == (gimple_bb (reduc_info->stmt))->loop_father);
 
-  /* ADJUSTMENT_DEF is NULL when called from
-     vect_create_epilog_for_reduction to vectorize double reduction.  */
-  if (adjustment_def)
-    *adjustment_def = NULL;
-
-  switch (code)
+  if (operand_equal_p (init_val, neutral_op))
     {
-    case WIDEN_SUM_EXPR:
-    case DOT_PROD_EXPR:
-    case SAD_EXPR:
-    case PLUS_EXPR:
-    case MINUS_EXPR:
-    case BIT_IOR_EXPR:
-    case BIT_XOR_EXPR:
-    case MULT_EXPR:
-    case BIT_AND_EXPR:
-      {
-        if (code == MULT_EXPR)
-          {
-            real_init_val = dconst1;
-            int_init_val = 1;
-          }
-
-        if (code == BIT_AND_EXPR)
-          int_init_val = -1;
-
-        if (SCALAR_FLOAT_TYPE_P (scalar_type))
-          def_for_init = build_real (scalar_type, real_init_val);
-        else
-          def_for_init = build_int_cst (scalar_type, int_init_val);
-
-	if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
-	  {
-	    /* Option1: the first element is '0' or '1' as well.  */
-	    if (!operand_equal_p (def_for_init, init_val, 0))
-	      *adjustment_def = init_val;
-	    init_def = gimple_build_vector_from_val (&stmts, vectype,
-						     def_for_init);
-	  }
-	else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
-	  {
-	    /* Option2 (variable length): the first element is INIT_VAL.  */
-	    init_def = gimple_build_vector_from_val (&stmts, vectype,
-						     def_for_init);
-	    init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
-				     vectype, init_def, init_val);
-	  }
-	else
-	  {
-	    /* Option2: the first element is INIT_VAL.  */
-	    tree_vector_builder elts (vectype, 1, 2);
-	    elts.quick_push (init_val);
-	    elts.quick_push (def_for_init);
-	    init_def = gimple_build_vector (&stmts, &elts);
-	  }
-      }
-      break;
-
-    case MIN_EXPR:
-    case MAX_EXPR:
-    case COND_EXPR:
-      {
-	init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
-	init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
-      }
-      break;
-
-    default:
-      gcc_unreachable ();
+      /* If both elements are equal then the vector described above is
+	 just a splat.  */
+      neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
+      init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
+    }
+  else
+    {
+      neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
+      init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
+      if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
+	{
+	  /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
+	     element 0.  */
+	  init_def = gimple_build_vector_from_val (&stmts, vectype,
+						   neutral_op);
+	  init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
+				   vectype, init_def, init_val);
+	}
+      else
+	{
+	  /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}.  */
+	  tree_vector_builder elts (vectype, 1, 2);
+	  elts.quick_push (init_val);
+	  elts.quick_push (neutral_op);
+	  init_def = gimple_build_vector (&stmts, &elts);
+	}
     }
 
   if (stmts)
@@ -7479,7 +7407,7 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
 					       vectype_out);
 
   /* Get the loop-entry arguments.  */
-  tree vec_initial_def;
+  tree vec_initial_def = NULL_TREE;
   auto_vec<tree> vec_initial_defs;
   if (slp_node)
     {
@@ -7529,9 +7457,6 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
 	      STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
 	    }
 	  vec_initial_def = build_vector_from_val (vectype_out, induc_val);
-	  vec_initial_defs.create (ncopies);
-	  for (i = 0; i < ncopies; ++i)
-	    vec_initial_defs.quick_push (vec_initial_def);
 	}
       else if (nested_cycle)
 	{
@@ -7541,23 +7466,39 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
 					 ncopies, initial_def,
 					 &vec_initial_defs);
 	}
+      else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
+	       || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
+	/* Fill the initial vector with the initial scalar value.  */
+	vec_initial_def
+	  = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
+					   initial_def, initial_def);
       else
 	{
-	  tree adjustment_def = NULL_TREE;
-	  tree *adjustment_defp = &adjustment_def;
 	  enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
-	  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
-	    adjustment_defp = NULL;
+	  tree neutral_op = neutral_op_for_reduction (TREE_TYPE (initial_def),
+						      code, initial_def);
+	  gcc_assert (neutral_op);
+	  /* Try to simplify the vector initialization by applying an
+	     adjustment after the reduction has been performed.  */
+	  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
+	      && !operand_equal_p (neutral_op, initial_def))
+	    {
+	      STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = initial_def;
+	      initial_def = neutral_op;
+	    }
 	  vec_initial_def
-	    = get_initial_def_for_reduction (loop_vinfo, reduc_info, code,
-					     initial_def, adjustment_defp);
-	  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
-	  vec_initial_defs.create (ncopies);
-	  for (i = 0; i < ncopies; ++i)
-	    vec_initial_defs.quick_push (vec_initial_def);
+	    = get_initial_def_for_reduction (loop_vinfo, reduc_info,
+					     initial_def, neutral_op);
 	}
     }
 
+  if (vec_initial_def)
+    {
+      vec_initial_defs.create (ncopies);
+      for (i = 0; i < ncopies; ++i)
+	vec_initial_defs.quick_push (vec_initial_def);
+    }
+
   /* Generate the reduction PHIs upfront.  */
   for (i = 0; i < vec_num; i++)
     {

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 09/10] vect: Simplify get_initial_def_for_reduction
  2021-07-08 12:41 ` [PATCH 09/10] vect: Simplify get_initial_def_for_reduction Richard Sandiford
@ 2021-07-08 13:14   ` Richard Biener
  0 siblings, 0 replies; 30+ messages in thread
From: Richard Biener @ 2021-07-08 13:14 UTC (permalink / raw)
  To: Richard Sandiford, GCC Patches

On Thu, Jul 8, 2021 at 2:49 PM Richard Sandiford via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> After previous patches, we can now easily provide the neutral op
> as an argument to get_initial_def_for_reduction.  This in turn
> allows the adjustment calculation to be moved outside of
> get_initial_def_for_reduction, which is the main motivation
> of the patch.

OK.

> gcc/
>         * tree-vect-loop.c (get_initial_def_for_reduction): Remove
>         adjustment handling.  Take the neutral value as an argument,
>         in place of the code argument.
>         (vect_transform_cycle_phi): Update accordingly.  Handle the
>         initial values of cond reductions separately from code reductions.
>         Choose the adjustment here rather than in
>         get_initial_def_for_reduction.  Sink the splat of vec_initial_def.
> ---
>  gcc/tree-vect-loop.c | 177 +++++++++++++++----------------------------
>  1 file changed, 59 insertions(+), 118 deletions(-)
>
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index 744645d8bad..fe7e73f655f 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -4614,57 +4614,26 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
>     Input:
>     REDUC_INFO - the info_for_reduction
>     INIT_VAL - the initial value of the reduction variable
> +   NEUTRAL_OP - a value that has no effect on the reduction, as per
> +               neutral_op_for_reduction
>
>     Output:
> -   ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
> -        of the reduction (used for adjusting the epilog - see below).
>     Return a vector variable, initialized according to the operation that
>         STMT_VINFO performs. This vector will be used as the initial value
>         of the vector of partial results.
>
> -   Option1 (adjust in epilog): Initialize the vector as follows:
> -     add/bit or/xor:    [0,0,...,0,0]
> -     mult/bit and:      [1,1,...,1,1]
> -     min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
> -   and when necessary (e.g. add/mult case) let the caller know
> -   that it needs to adjust the result by init_val.
> -
> -   Option2: Initialize the vector as follows:
> -     add/bit or/xor:    [init_val,0,0,...,0]
> -     mult/bit and:      [init_val,1,1,...,1]
> -     min/max/cond_expr: [init_val,init_val,...,init_val]
> -   and no adjustments are needed.
> -
> -   For example, for the following code:
> -
> -   s = init_val;
> -   for (i=0;i<n;i++)
> -     s = s + a[i];
> -
> -   STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
> -   For a vector of 4 units, we want to return either [0,0,0,init_val],
> -   or [0,0,0,0] and let the caller know that it needs to adjust
> -   the result at the end by 'init_val'.
> -
> -   FORNOW, we are using the 'adjust in epilog' scheme, because this way the
> -   initialization vector is simpler (same element in all entries), if
> -   ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
> -
> -   A cost model should help decide between these two schemes.  */
> +   The value we need is a vector in which element 0 has value INIT_VAL
> +   and every other element has value NEUTRAL_OP.  */
>
>  static tree
>  get_initial_def_for_reduction (loop_vec_info loop_vinfo,
>                                stmt_vec_info reduc_info,
> -                              enum tree_code code, tree init_val,
> -                               tree *adjustment_def)
> +                              tree init_val, tree neutral_op)
>  {
>    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
>    tree scalar_type = TREE_TYPE (init_val);
>    tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
> -  tree def_for_init;
>    tree init_def;
> -  REAL_VALUE_TYPE real_init_val = dconst0;
> -  int int_init_val = 0;
>    gimple_seq stmts = NULL;
>
>    gcc_assert (vectype);
> @@ -4675,75 +4644,34 @@ get_initial_def_for_reduction (loop_vec_info loop_vinfo,
>    gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
>               || loop == (gimple_bb (reduc_info->stmt))->loop_father);
>
> -  /* ADJUSTMENT_DEF is NULL when called from
> -     vect_create_epilog_for_reduction to vectorize double reduction.  */
> -  if (adjustment_def)
> -    *adjustment_def = NULL;
> -
> -  switch (code)
> +  if (operand_equal_p (init_val, neutral_op))
>      {
> -    case WIDEN_SUM_EXPR:
> -    case DOT_PROD_EXPR:
> -    case SAD_EXPR:
> -    case PLUS_EXPR:
> -    case MINUS_EXPR:
> -    case BIT_IOR_EXPR:
> -    case BIT_XOR_EXPR:
> -    case MULT_EXPR:
> -    case BIT_AND_EXPR:
> -      {
> -        if (code == MULT_EXPR)
> -          {
> -            real_init_val = dconst1;
> -            int_init_val = 1;
> -          }
> -
> -        if (code == BIT_AND_EXPR)
> -          int_init_val = -1;
> -
> -        if (SCALAR_FLOAT_TYPE_P (scalar_type))
> -          def_for_init = build_real (scalar_type, real_init_val);
> -        else
> -          def_for_init = build_int_cst (scalar_type, int_init_val);
> -
> -       if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
> -         {
> -           /* Option1: the first element is '0' or '1' as well.  */
> -           if (!operand_equal_p (def_for_init, init_val, 0))
> -             *adjustment_def = init_val;
> -           init_def = gimple_build_vector_from_val (&stmts, vectype,
> -                                                    def_for_init);
> -         }
> -       else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
> -         {
> -           /* Option2 (variable length): the first element is INIT_VAL.  */
> -           init_def = gimple_build_vector_from_val (&stmts, vectype,
> -                                                    def_for_init);
> -           init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
> -                                    vectype, init_def, init_val);
> -         }
> -       else
> -         {
> -           /* Option2: the first element is INIT_VAL.  */
> -           tree_vector_builder elts (vectype, 1, 2);
> -           elts.quick_push (init_val);
> -           elts.quick_push (def_for_init);
> -           init_def = gimple_build_vector (&stmts, &elts);
> -         }
> -      }
> -      break;
> -
> -    case MIN_EXPR:
> -    case MAX_EXPR:
> -    case COND_EXPR:
> -      {
> -       init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
> -       init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
> -      }
> -      break;
> -
> -    default:
> -      gcc_unreachable ();
> +      /* If both elements are equal then the vector described above is
> +        just a splat.  */
> +      neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
> +      init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
> +    }
> +  else
> +    {
> +      neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
> +      init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
> +      if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
> +       {
> +         /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
> +            element 0.  */
> +         init_def = gimple_build_vector_from_val (&stmts, vectype,
> +                                                  neutral_op);
> +         init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
> +                                  vectype, init_def, init_val);
> +       }
> +      else
> +       {
> +         /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}.  */
> +         tree_vector_builder elts (vectype, 1, 2);
> +         elts.quick_push (init_val);
> +         elts.quick_push (neutral_op);
> +         init_def = gimple_build_vector (&stmts, &elts);
> +       }
>      }
>
>    if (stmts)
> @@ -7479,7 +7407,7 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
>                                                vectype_out);
>
>    /* Get the loop-entry arguments.  */
> -  tree vec_initial_def;
> +  tree vec_initial_def = NULL_TREE;
>    auto_vec<tree> vec_initial_defs;
>    if (slp_node)
>      {
> @@ -7529,9 +7457,6 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
>               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
>             }
>           vec_initial_def = build_vector_from_val (vectype_out, induc_val);
> -         vec_initial_defs.create (ncopies);
> -         for (i = 0; i < ncopies; ++i)
> -           vec_initial_defs.quick_push (vec_initial_def);
>         }
>        else if (nested_cycle)
>         {
> @@ -7541,23 +7466,39 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
>                                          ncopies, initial_def,
>                                          &vec_initial_defs);
>         }
> +      else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
> +              || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
> +       /* Fill the initial vector with the initial scalar value.  */
> +       vec_initial_def
> +         = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
> +                                          initial_def, initial_def);
>        else
>         {
> -         tree adjustment_def = NULL_TREE;
> -         tree *adjustment_defp = &adjustment_def;
>           enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
> -         if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
> -           adjustment_defp = NULL;
> +         tree neutral_op = neutral_op_for_reduction (TREE_TYPE (initial_def),
> +                                                     code, initial_def);
> +         gcc_assert (neutral_op);
> +         /* Try to simplify the vector initialization by applying an
> +            adjustment after the reduction has been performed.  */
> +         if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
> +             && !operand_equal_p (neutral_op, initial_def))
> +           {
> +             STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = initial_def;
> +             initial_def = neutral_op;
> +           }
>           vec_initial_def
> -           = get_initial_def_for_reduction (loop_vinfo, reduc_info, code,
> -                                            initial_def, adjustment_defp);
> -         STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
> -         vec_initial_defs.create (ncopies);
> -         for (i = 0; i < ncopies; ++i)
> -           vec_initial_defs.quick_push (vec_initial_def);
> +           = get_initial_def_for_reduction (loop_vinfo, reduc_info,
> +                                            initial_def, neutral_op);
>         }
>      }
>
> +  if (vec_initial_def)
> +    {
> +      vec_initial_defs.create (ncopies);
> +      for (i = 0; i < ncopies; ++i)
> +       vec_initial_defs.quick_push (vec_initial_def);
> +    }
> +
>    /* Generate the reduction PHIs upfront.  */
>    for (i = 0; i < vec_num; i++)
>      {

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH 10/10] vect: Reuse reduction accumulators between loops
  2021-07-08 12:38 [PATCH 00/10] vect: Reuse reduction accumulators between loops Richard Sandiford
                   ` (8 preceding siblings ...)
  2021-07-08 12:41 ` [PATCH 09/10] vect: Simplify get_initial_def_for_reduction Richard Sandiford
@ 2021-07-08 12:43 ` Richard Sandiford
  2021-07-09 11:58   ` Richard Biener
  2021-07-10  2:11 ` [PATCH 00/10] " Kewen.Lin
  10 siblings, 1 reply; 30+ messages in thread
From: Richard Sandiford @ 2021-07-08 12:43 UTC (permalink / raw)
  To: gcc-patches

This patch adds support for reusing a main loop's reduction accumulator
in an epilogue loop.  This in turn lets the loops share a single piece
of vector->scalar reduction code.

The patch has the following restrictions:

(1) The epilogue reduction can only operate on a single vector
    (e.g. ncopies must be 1 for non-SLP reductions, and the group size
    must be <= the element count for SLP reductions).

(2) Both loops must use the same vector mode for their accumulators.
    This means that the patch is restricted to targets that support
    --param vect-partial-vector-usage=1.

(3) The reduction must be a standard “tree code” reduction.

However, these restrictions could be lifted in future.  For example,
if the main loop operates on 128-bit vectors and the epilogue loop
operates on 64-bit vectors, we could in future reduce the 128-bit
vector by one stage and use the 64-bit result as the starting point
for the epilogue result.

The patch tries to handle chained SLP reductions, unchained SLP
reductions and non-SLP reductions.  It also handles cases in which
the epilogue loop is entered directly (rather than via the main loop)
and cases in which the epilogue loop can be skipped.

vect_get_main_loop_result is a bit more general than the current
patch needs.

gcc/
	* tree-vectorizer.h (vect_reusable_accumulator): New structure.
	(_loop_vec_info::main_loop_edge): New field.
	(_loop_vec_info::skip_main_loop_edge): Likewise.
	(_loop_vec_info::skip_this_loop_edge): Likewise.
	(_loop_vec_info::reusable_accumulators): Likewise.
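	(_stmt_vec_info::reduc_initial_values): Likewise.
	(_stmt_vec_info::reduc_scalar_results): Likewise.
	(_stmt_vec_info::reused_accumulator): Likewise.
	(vect_phi_initial_value): Remove the stmt_vec_info overload.
	(vect_get_main_loop_result): Declare.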
	* tree-vectorizer.c (vec_info::new_stmt_vec_info): Initialize
	reduc_initial_values and reduc_scalar_results.
	(vec_info::free_stmt_vec_info): Free reduc_initial_values and
	reduc_scalar_results.
	* tree-vect-loop-manip.c (vect_get_main_loop_result): New function.
	(vect_do_peeling): Fill an epilogue loop's main_loop_edge,
	skip_main_loop_edge and skip_this_loop_edge fields.
	* tree-vect-loop.c (INCLUDE_ALGORITHM): Define.
	(vect_emit_reduction_init_stmts): New function.
	(get_initial_def_for_reduction): Use it.
	(get_initial_defs_for_reduction): Likewise.  Change the vinfo
	parameter to a loop_vec_info.
	(vect_find_reusable_accumulator): New function.
	(vect_create_epilog_for_reduction): Store the scalar results
	in the reduc_info.  If an epilogue loop is reusing an accumulator
	from the main loop, and if the epilogue loop can also be skipped,
	try to place the reduction code in the join block.  Record
	accumulators that could potentially be reused by epilogue loops.
	(vect_transform_cycle_phi): When vectorizing epilogue loops,
	try to reuse accumulators from the main loop.  Record the initial
	value in reduc_info for non-SLP reductions too.

gcc/testsuite/
	* gcc.target/aarch64/sve/reduc_9.c: New test.
	* gcc.target/aarch64/sve/reduc_9_run.c: Likewise.
	* gcc.target/aarch64/sve/reduc_10.c: Likewise.
	* gcc.target/aarch64/sve/reduc_10_run.c: Likewise.
	* gcc.target/aarch64/sve/reduc_11.c: Likewise.
	* gcc.target/aarch64/sve/reduc_11_run.c: Likewise.
	* gcc.target/aarch64/sve/reduc_12.c: Likewise.
	* gcc.target/aarch64/sve/reduc_12_run.c: Likewise.
	* gcc.target/aarch64/sve/reduc_13.c: Likewise.
	* gcc.target/aarch64/sve/reduc_13_run.c: Likewise.
	* gcc.target/aarch64/sve/reduc_14.c: Likewise.
	* gcc.target/aarch64/sve/reduc_14_run.c: Likewise.
	* gcc.target/aarch64/sve/reduc_15.c: Likewise.
	* gcc.target/aarch64/sve/reduc_15_run.c: Likewise.
---
 .../gcc.target/aarch64/sve/reduc_10.c         |  77 +++++
 .../gcc.target/aarch64/sve/reduc_10_run.c     |  49 +++
 .../gcc.target/aarch64/sve/reduc_11.c         |  71 ++++
 .../gcc.target/aarch64/sve/reduc_11_run.c     |  34 ++
 .../gcc.target/aarch64/sve/reduc_12.c         |  71 ++++
 .../gcc.target/aarch64/sve/reduc_12_run.c     |  66 ++++
 .../gcc.target/aarch64/sve/reduc_13.c         | 101 ++++++
 .../gcc.target/aarch64/sve/reduc_13_run.c     |  61 ++++
 .../gcc.target/aarch64/sve/reduc_14.c         | 107 ++++++
 .../gcc.target/aarch64/sve/reduc_14_run.c     | 187 +++++++++++
 .../gcc.target/aarch64/sve/reduc_15.c         |  16 +
 .../gcc.target/aarch64/sve/reduc_15_run.c     |  22 ++
 .../gcc.target/aarch64/sve/reduc_9.c          |  77 +++++
 .../gcc.target/aarch64/sve/reduc_9_run.c      |  29 ++
 gcc/tree-vect-loop-manip.c                    |  29 ++
 gcc/tree-vect-loop.c                          | 309 ++++++++++++++----
 gcc/tree-vectorizer.c                         |   4 +
 gcc/tree-vectorizer.h                         |  51 ++-
 18 files changed, 1297 insertions(+), 64 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c

diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index e2fd3609fee..ed7a7738880 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -551,6 +551,18 @@ typedef auto_vec<rgroup_controls> vec_loop_lens;
 
 typedef auto_vec<std::pair<data_reference*, tree> > drs_init_vec;
 
+/* Information about a reduction accumulator from the main loop that could
+   conceivably be reused as the input to a reduction in an epilogue loop.  */
+struct vect_reusable_accumulator {
+  /* The final value of the accumulator, which forms the input to the
+     reduction operation.  */
+  tree reduc_input;
+
+  /* The stmt_vec_info that describes the reduction (i.e. the one for
+     which is_reduc_info is true).  */
+  stmt_vec_info reduc_info;
+};
+
 /*-----------------------------------------------------------------*/
 /* Info on vectorized loops.                                       */
 /*-----------------------------------------------------------------*/
@@ -588,6 +600,23 @@ public:
   /* Unrolling factor  */
   poly_uint64 vectorization_factor;
 
+  /* If this loop is an epilogue loop whose main loop can be skipped,
+     MAIN_LOOP_EDGE is the edge from the main loop to this loop's
+     preheader.  SKIP_MAIN_LOOP_EDGE is then the edge that skips the
+     main loop and goes straight to this loop's preheader.
+
+     Both fields are null otherwise.  */
+  edge main_loop_edge;
+  edge skip_main_loop_edge;
+
+  /* If this loop is an epilogue loop that might be skipped after executing
+     the main loop, this edge is the one that skips the epilogue.  */
+  edge skip_this_loop_edge;
+
+  /* After vectorization, maps live-out SSA names to information about
+     the reductions that generated them.  */
+  hash_map<tree, vect_reusable_accumulator> reusable_accumulators;
+
   /* Maximum runtime vectorization factor, or MAX_VECTORIZATION_FACTOR
      if there is no particular limit.  */
   unsigned HOST_WIDE_INT max_vectorization_factor;
@@ -1186,6 +1215,21 @@ public:
   /* The vector type for performing the actual reduction.  */
   tree reduc_vectype;
 
+  /* If IS_REDUC_INFO is true and if the reduction is operating on N
+     elements in parallel, this vector gives the initial values of these
+     N elements.  */
+  vec<tree> reduc_initial_values;
+
+  /* If IS_REDUC_INFO is true and if the reduction is operating on N
+     elements in parallel, this vector gives the scalar result of each
+     reduction.  */
+  vec<tree> reduc_scalar_results;
+
+  /* Only meaningful if IS_REDUC_INFO.  If non-null, the reduction is
+     being performed by an epilogue loop and we have decided to reuse
+     this accumulator from the main loop.  */
+  vect_reusable_accumulator *reused_accumulator;
+
   /* Whether we force a single cycle PHI during reduction vectorization.  */
   bool force_single_cycle;
 
@@ -1382,12 +1426,6 @@ vect_phi_initial_value (gphi *phi)
   return PHI_ARG_DEF_FROM_EDGE (phi, pe);
 }
 
-static inline tree
-vect_phi_initial_value (stmt_vec_info stmt_info)
-{
-  return vect_phi_initial_value (as_a <gphi *> (stmt_info->stmt));
-}
-
 /* Return true if STMT_INFO should produce a vector mask type rather than
    a normal nonmask type.  */
 
@@ -1818,6 +1856,7 @@ class loop *vect_loop_versioning (loop_vec_info, gimple *);
 extern class loop *vect_do_peeling (loop_vec_info, tree, tree,
 				    tree *, tree *, tree *, int, bool, bool,
 				    tree *);
+extern tree vect_get_main_loop_result (loop_vec_info, tree, tree = NULL_TREE);
 extern void vect_prepare_for_masked_peels (loop_vec_info);
 extern dump_user_location_t find_loop_location (class loop *);
 extern bool vect_can_advance_ivs_p (loop_vec_info);
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 9748043f3ee..f1035a83826 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -694,6 +694,8 @@ vec_info::new_stmt_vec_info (gimple *stmt)
   STMT_VINFO_SLP_VECT_ONLY (res) = false;
   STMT_VINFO_SLP_VECT_ONLY_PATTERN (res) = false;
   STMT_VINFO_VEC_STMTS (res) = vNULL;
+  res->reduc_initial_values = vNULL;
+  res->reduc_scalar_results = vNULL;
 
   if (is_a <loop_vec_info> (this)
       && gimple_code (stmt) == GIMPLE_PHI
@@ -755,6 +757,8 @@ vec_info::free_stmt_vec_info (stmt_vec_info stmt_info)
 	release_ssa_name (lhs);
     }
 
+  stmt_info->reduc_initial_values.release ();
+  stmt_info->reduc_scalar_results.release ();
   STMT_VINFO_SIMD_CLONE_INFO (stmt_info).release ();
   STMT_VINFO_VEC_STMTS (stmt_info).release ();
   free (stmt_info);
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index 2909e8a0fc3..b7b0523e3c8 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -2457,6 +2457,31 @@ vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
   return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
 }
 
+/* LOOP_VINFO is an epilogue loop and MAIN_LOOP_VALUE is available on exit
+   from the corresponding main loop.  Return a value that is available in
+   LOOP_VINFO's preheader, using SKIP_VALUE if the main loop is skipped.
+   Passing a null SKIP_VALUE is equivalent to passing zero.  */
+
+tree
+vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
+			   tree skip_value)
+{
+  if (!loop_vinfo->main_loop_edge)
+    return main_loop_value;
+
+  if (!skip_value)
+    skip_value = build_zero_cst (TREE_TYPE (main_loop_value));
+
+  tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
+  basic_block bb = loop_vinfo->main_loop_edge->dest;
+  gphi *new_phi = create_phi_node (phi_result, bb);
+  add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
+	       UNKNOWN_LOCATION);
+  add_phi_arg (new_phi, skip_value,
+	       loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
+  return phi_result;
+}
+
 /* Function vect_do_peeling.
 
    Input:
@@ -2986,6 +3011,8 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 					   skip_vector ? anchor : guard_bb,
 					   prob_epilog.invert (),
 					   irred_flag);
+	  if (vect_epilogues)
+	    epilogue_vinfo->skip_this_loop_edge = guard_e;
 	  slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
 					      single_exit (epilog));
 	  /* Only need to handle basic block before epilog loop if it's not
@@ -3057,6 +3084,8 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 	  add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
 		       UNKNOWN_LOCATION);
 	  niters = PHI_RESULT (new_phi);
+	  epilogue_vinfo->main_loop_edge = update_e;
+	  epilogue_vinfo->skip_main_loop_edge = skip_e;
 	}
 
       /* Set ADVANCE to the number of iterations performed by the previous
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index fe7e73f655f..5e6c9b7c38a 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -19,6 +19,7 @@ You should have received a copy of the GNU General Public License
 along with GCC; see the file COPYING3.  If not see
 <http://www.gnu.org/licenses/>.  */
 
+#define INCLUDE_ALGORITHM
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
@@ -823,6 +824,10 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
     th (0),
     versioning_threshold (0),
     vectorization_factor (0),
+    main_loop_edge (nullptr),
+    skip_main_loop_edge (nullptr),
+    skip_this_loop_edge (nullptr),
+    reusable_accumulators (),
     max_vectorization_factor (0),
     mask_skip_niters (NULL_TREE),
     rgroup_compare_type (NULL_TREE),
@@ -4607,7 +4612,32 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
                  prologue_cost, epilogue_cost);
 }
 
+/* SEQ is a sequence of instructions that initialize the reduction
+   described by REDUC_INFO.  Emit them in the appropriate place.  */
 
+static void
+vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
+				stmt_vec_info reduc_info, gimple *seq)
+{
+  if (reduc_info->reused_accumulator)
+    {
+      /* When reusing an accumulator from the main loop, we only need
+	 initialization instructions if the main loop can be skipped.
+	 In that case, emit the initialization instructions at the end
+	 of the guard block that does the skip.  */
+      edge skip_edge = loop_vinfo->skip_main_loop_edge;
+      gcc_assert (skip_edge);
+      gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
+      gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
+    }
+  else
+    {
+      /* The normal case: emit the initialization instructions on the
+	 preheader edge.  */
+      class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+      gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
+    }
+}
 
 /* Function get_initial_def_for_reduction
 
@@ -4675,36 +4705,30 @@ get_initial_def_for_reduction (loop_vec_info loop_vinfo,
     }
 
   if (stmts)
-    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
+    vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
   return init_def;
 }
 
-/* Get at the initial defs for the reduction PHIs for REDUC_INFO, whose
-   associated SLP node is SLP_NODE.  NUMBER_OF_VECTORS is the number of vector
-   defs to create.  If NEUTRAL_OP is nonnull, introducing extra elements of
-   that value will not change the result.  */
+/* Get at the initial defs for the reduction PHIs for REDUC_INFO,
+   which performs a reduction involving GROUP_SIZE scalar statements.
+   NUMBER_OF_VECTORS is the number of vector defs to create.  If NEUTRAL_OP
+   is nonnull, introducing extra elements of that value will not change the
+   result.  */
 
 static void
-get_initial_defs_for_reduction (vec_info *vinfo,
+get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
 				stmt_vec_info reduc_info,
-				slp_tree slp_node,
 				vec<tree> *vec_oprnds,
 				unsigned int number_of_vectors,
-				bool reduc_chain, tree neutral_op)
+				unsigned int group_size, tree neutral_op)
 {
-  vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
+  vec<tree> &initial_values = reduc_info->reduc_initial_values;
   unsigned HOST_WIDE_INT nunits;
   unsigned j, number_of_places_left_in_vector;
   tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
-  unsigned int group_size = stmts.length ();
   unsigned int i;
-  class loop *loop;
-
-  loop = (gimple_bb (reduc_info->stmt))->loop_father;
-  gcc_assert (loop);
-  edge pe = loop_preheader_edge (loop);
 
-  gcc_assert (!reduc_chain || neutral_op);
+  gcc_assert (group_size == initial_values.length () || neutral_op);
 
   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
      created vectors. It is greater than 1 if unrolling is performed.
@@ -4734,18 +4758,13 @@ get_initial_defs_for_reduction (vec_info *vinfo,
     {
       tree op;
       i = j % group_size;
-      stmt_vec_info stmt_vinfo = stmts[i];
 
       /* Get the def before the loop.  In reduction chain we have only
 	 one initial value.  Else we have as many as PHIs in the group.  */
-      if (reduc_chain)
-	op = j != 0 ? neutral_op : vect_phi_initial_value (stmt_vinfo);
-      else if (((vec_oprnds->length () + 1) * nunits
-		- number_of_places_left_in_vector >= group_size)
-	       && neutral_op)
+      if (i >= initial_values.length () || (j > i && neutral_op))
 	op = neutral_op;
       else
-	op = vect_phi_initial_value (stmt_vinfo);
+	op = initial_values[i];
 
       /* Create 'vect_ = {op0,op1,...,opn}'.  */
       number_of_places_left_in_vector--;
@@ -4781,8 +4800,8 @@ get_initial_defs_for_reduction (vec_info *vinfo,
 	    {
 	      /* First time round, duplicate ELTS to fill the
 		 required number of vectors.  */
-	      duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
-					number_of_vectors, *vec_oprnds);
+	      duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
+					elts, number_of_vectors, *vec_oprnds);
 	      break;
 	    }
 	  vec_oprnds->quick_push (init);
@@ -4794,7 +4813,7 @@ get_initial_defs_for_reduction (vec_info *vinfo,
 	}
     }
   if (ctor_seq != NULL)
-    gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
+    vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
 }
 
 /* For a statement STMT_INFO taking part in a reduction operation return
@@ -4823,6 +4842,100 @@ info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
   return stmt_info;
 }
 
+/* REDUC_INFO describes a reduction in LOOP_VINFO that we are going to
+   vectorize.  See if LOOP_VINFO is an epilogue loop whose main loop had
+   a matching reduction that we can build on.  Adjust REDUC_INFO and
+   return true if so, otherwise return false.  */
+
+static bool
+vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
+				stmt_vec_info reduc_info)
+{
+  loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
+  if (!main_loop_vinfo)
+    return false;
+
+  if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
+    return false;
+
+  unsigned int num_phis = reduc_info->reduc_initial_values.length ();
+  auto_vec<tree, 16> main_loop_results (num_phis);
+  auto_vec<tree, 16> initial_values (num_phis);
+  if (edge main_loop_edge = loop_vinfo->main_loop_edge)
+    {
+      /* The epilogue loop can be entered either from the main loop or
+	 from an earlier guard block.  */
+      edge skip_edge = loop_vinfo->skip_main_loop_edge;
+      for (tree incoming_value : reduc_info->reduc_initial_values)
+	{
+	  /* Look for:
+
+	       INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
+				    INITIAL_VALUE(guard block)>.  */
+	  gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
+
+	  gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
+	  gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
+
+	  tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
+	  tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
+
+	  main_loop_results.quick_push (from_main_loop);
+	  initial_values.quick_push (from_skip);
+	}
+    }
+  else
+    /* The main loop dominates the epilogue loop.  */
+    main_loop_results.splice (reduc_info->reduc_initial_values);
+
+  /* See if the main loop has the kind of accumulator we need.  */
+  vect_reusable_accumulator *accumulator
+    = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
+  if (!accumulator
+      || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
+      || !std::equal (main_loop_results.begin (), main_loop_results.end (),
+		      accumulator->reduc_info->reduc_scalar_results.begin ()))
+    return false;
+
+  /* For now, only handle the case in which both loops are operating on the
+     same vector types.  In future we could reduce wider vectors to narrower
+     ones as well.  */
+  tree vectype = STMT_VINFO_VECTYPE (reduc_info);
+  tree old_vectype = TREE_TYPE (accumulator->reduc_input);
+  if (!useless_type_conversion_p (old_vectype, vectype))
+    return false;
+
+  /* Non-SLP reductions might apply an adjustment after the reduction
+     operation, in order to simplify the initialization of the accumulator.
+     If the epilogue loop carries on from where the main loop left off,
+     it should apply the same adjustment to the final reduction result.
+
+     If the epilogue loop can also be entered directly (rather than via
+     the main loop), we need to be able to handle that case in the same way,
+     with the same adjustment.  (In principle we could add a PHI node
+     to select the correct adjustment, but in practice that shouldn't be
+     necessary.)  */
+  tree main_adjustment
+    = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
+  if (loop_vinfo->main_loop_edge && main_adjustment)
+    {
+      gcc_assert (num_phis == 1);
+      tree initial_value = initial_values[0];
+      /* Check that we can use INITIAL_VALUE as the adjustment and
+	 initialize the accumulator with a neutral value instead.  */
+      if (!operand_equal_p (initial_value, main_adjustment))
+	return false;
+      tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
+      initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
+						    code, initial_value);
+    }
+  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
+  reduc_info->reduc_initial_values.truncate (0);
+  reduc_info->reduc_initial_values.splice (initial_values);
+  reduc_info->reused_accumulator = accumulator;
+  return true;
+}
+
 /* Function vect_create_epilog_for_reduction
 
    Create code at the loop-epilog to finalize the result of a reduction
@@ -4915,7 +5028,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
   gimple *use_stmt;
   auto_vec<tree> reduc_inputs;
   int j, i;
-  auto_vec<tree> scalar_results;
+  vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
   unsigned int group_size = 1, k;
   auto_vec<gimple *> phis;
   /* SLP reduction without reduction chain, e.g.,
@@ -4941,16 +5054,12 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
   gcc_assert (vectype);
   mode = TYPE_MODE (vectype);
 
-  tree initial_def = NULL;
   tree induc_val = NULL_TREE;
   tree adjustment_def = NULL;
   if (slp_node)
     ;
   else
     {
-      /* Get at the scalar def before the loop, that defines the initial value
-	 of the reduction variable.  */
-      initial_def = vect_phi_initial_value (reduc_def_stmt);
       /* Optimize: for induction condition reduction, if we can't use zero
          for induc_val, use initial_def.  */
       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
@@ -5196,6 +5305,37 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       reduc_inputs.safe_push (single_input);
     }
 
+  tree orig_reduc_input = reduc_inputs[0];
+
+  /* If this loop is an epilogue loop that can be skipped after the
+     main loop, we can only share a reduction operation between the
+     main loop and the epilogue if we put it at the target of the
+     skip edge.
+
+     We can still reuse accumulators if this check fails.  Doing so has
+     the minor(?) benefit of making the epilogue loop's scalar result
+     independent of the main loop's scalar result.  */
+  bool unify_with_main_loop_p = false;
+  if (reduc_info->reused_accumulator
+      && loop_vinfo->skip_this_loop_edge
+      && single_succ_p (exit_bb)
+      && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
+    {
+      unify_with_main_loop_p = true;
+
+      basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
+      reduc_inputs[0] = make_ssa_name (vectype);
+      gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
+      add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
+		   UNKNOWN_LOCATION);
+      add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
+		   loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
+      exit_gsi = gsi_after_labels (reduc_block);
+    }
+
+  /* Shouldn't be used beyond this point.  */
+  exit_bb = nullptr;
+
   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
       && reduc_fn != IFN_LAST)
     {
@@ -5405,6 +5545,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	     the same as initial_def already.  */
 	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
 				  induc_val);
+	  tree initial_def = reduc_info->reduc_initial_values[0];
 
 	  tmp = make_ssa_name (new_scalar_dest);
 	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
@@ -5425,9 +5566,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       gcc_assert (reduc_inputs.length () == 1);
       gcc_assert (pow2p_hwi (group_size));
 
-      slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
-      vec<stmt_vec_info> orig_phis
-	= SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
       gimple_seq seq = NULL;
 
       /* Build a vector {0, 1, 2, ...}, with the same number of elements
@@ -5452,7 +5590,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	{
 	  tree initial_value = NULL_TREE;
 	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
-	    initial_value = vect_phi_initial_value (orig_phis[0]);
+	    initial_value = reduc_info->reduc_initial_values[0];
 	  neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
 						 initial_value);
 	}
@@ -5466,7 +5604,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	     for MIN and MAX reduction, for example.  */
 	  if (!neutral_op)
 	    {
-	      tree scalar_value = vect_phi_initial_value (orig_phis[i]);
+	      tree scalar_value = reduc_info->reduc_initial_values[i];
 	      scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
 					     scalar_value);
 	      vector_identity = gimple_build_vector_from_val (&seq, vectype,
@@ -5780,6 +5918,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	     the same as initial_def already.  */
 	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
 				  induc_val);
+	  tree initial_def = reduc_info->reduc_initial_values[0];
 
 	  tree tmp = make_ssa_name (new_scalar_dest);
 	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
@@ -5819,6 +5958,12 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       scalar_results[0] = new_temp;
     }
 
+  /* Record this operation if it could be reused by the epilogue loop.  */
+  if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
+      && !double_reduc)
+    loop_vinfo->reusable_accumulators.put (scalar_results[0],
+					   { orig_reduc_input, reduc_info });
+
   if (double_reduc)
     loop = outer_loop;
 
@@ -5886,6 +6031,17 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
         {
           /* Replace the uses:  */
           orig_name = PHI_RESULT (exit_phi);
+
+	  /* Look for a single use at the target of the skip edge.  */
+	  if (unify_with_main_loop_p)
+	    {
+	      use_operand_p use_p;
+	      gimple *user;
+	      if (!single_imm_use (orig_name, &use_p, &user))
+		gcc_unreachable ();
+	      orig_name = gimple_get_lhs (user);
+	    }
+
           scalar_result = scalar_results[k];
           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
 	    {
@@ -7421,16 +7577,32 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
       else
 	{
 	  gcc_assert (slp_node == slp_node_instance->reduc_phis);
-	  tree initial_value = NULL_TREE;
+	  vec<tree> &initial_values = reduc_info->reduc_initial_values;
+	  vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
+
+	  unsigned int num_phis = stmts.length ();
 	  if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
-	    initial_value = vect_phi_initial_value (phi);
-	  tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
-	  tree neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
-						      code, initial_value);
-	  get_initial_defs_for_reduction (loop_vinfo, reduc_info,
-					  slp_node_instance->reduc_phis,
-					  &vec_initial_defs, vec_num,
-					  initial_value != NULL, neutral_op);
+	    num_phis = 1;
+	  initial_values.reserve (num_phis);
+	  for (unsigned int i = 0; i < num_phis; ++i)
+	    {
+	      gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
+	      initial_values.quick_push (vect_phi_initial_value (this_phi));
+	    }
+	  if (vec_num == 1)
+	    vect_find_reusable_accumulator (loop_vinfo, reduc_info);
+	  if (!initial_values.is_empty ())
+	    {
+	      tree initial_value
+		= (num_phis == 1 ? initial_values[0] : NULL_TREE);
+	      tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
+	      tree neutral_op
+		= neutral_op_for_reduction (TREE_TYPE (vectype_out),
+					    code, initial_value);
+	      get_initial_defs_for_reduction (loop_vinfo, reduc_info,
+					      &vec_initial_defs, vec_num,
+					      stmts.length (), neutral_op);
+	    }
 	}
     }
   else
@@ -7438,6 +7610,7 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
       /* Get at the scalar def before the loop, that defines the initial
 	 value of the reduction variable.  */
       tree initial_def = vect_phi_initial_value (phi);
+      reduc_info->reduc_initial_values.safe_push (initial_def);
       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
 	 and we can't use zero for induc_val, use initial_def.  Similarly
 	 for REDUC_MIN and initial_def larger than the base.  */
@@ -7474,21 +7647,30 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
 					   initial_def, initial_def);
       else
 	{
-	  enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
-	  tree neutral_op = neutral_op_for_reduction (TREE_TYPE (initial_def),
-						      code, initial_def);
-	  gcc_assert (neutral_op);
-	  /* Try to simplify the vector initialization by applying an
-	     adjustment after the reduction has been performed.  */
-	  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
-	      && !operand_equal_p (neutral_op, initial_def))
+	  if (ncopies == 1)
+	    vect_find_reusable_accumulator (loop_vinfo, reduc_info);
+	  if (!reduc_info->reduc_initial_values.is_empty ())
 	    {
-	      STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = initial_def;
-	      initial_def = neutral_op;
+	      initial_def = reduc_info->reduc_initial_values[0];
+	      enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
+	      tree neutral_op
+		= neutral_op_for_reduction (TREE_TYPE (initial_def),
+					    code, initial_def);
+	      gcc_assert (neutral_op);
+	      /* Try to simplify the vector initialization by applying an
+		 adjustment after the reduction has been performed.  */
+	      if (!reduc_info->reused_accumulator
+		  && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
+		  && !operand_equal_p (neutral_op, initial_def))
+		{
+		  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
+		    = initial_def;
+		  initial_def = neutral_op;
+		}
+	      vec_initial_def
+		= get_initial_def_for_reduction (loop_vinfo, reduc_info,
+						 initial_def, neutral_op);
 	    }
-	  vec_initial_def
-	    = get_initial_def_for_reduction (loop_vinfo, reduc_info,
-					     initial_def, neutral_op);
 	}
     }
 
@@ -7499,6 +7681,17 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
 	vec_initial_defs.quick_push (vec_initial_def);
     }
 
+  if (auto *accumulator = reduc_info->reused_accumulator)
+    {
+      if (loop_vinfo->main_loop_edge)
+	vec_initial_defs[0]
+	  = vect_get_main_loop_result (loop_vinfo, accumulator->reduc_input,
+				       vec_initial_defs[0]);
+      else
+	vec_initial_defs.safe_push (accumulator->reduc_input);
+      gcc_assert (vec_initial_defs.length () == 1);
+    }
+
   /* Generate the reduction PHIs upfront.  */
   for (i = 0; i < vec_num; i++)
     {
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c
new file mode 100644
index 00000000000..fb817b73d77
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c
@@ -0,0 +1,77 @@
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+unsigned short __attribute__((noipa))  /* Return x[0] + ... + x[n-1], truncated to unsigned short.  */
+add_loop (unsigned short *x, int n)
+{
+  unsigned short res = 0;  /* Identity for +; returned as-is when n <= 0.  */
+  for (int i = 0; i < n; ++i)
+    res += x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))  /* Return the minimum of x[0..n-1]; all-ones when n <= 0.  */
+min_loop (unsigned short *x, int n)
+{
+  unsigned short res = ~0;  /* Converts to the all-ones unsigned short.  */
+  for (int i = 0; i < n; ++i)
+    res = res < x[i] ? res : x[i];  /* Take x[i] unless RES is strictly smaller.  */
+  return res;
+}
+
+unsigned short __attribute__((noipa))  /* Return the maximum of x[0..n-1]; 0 when n <= 0.  */
+max_loop (unsigned short *x, int n)
+{
+  unsigned short res = 0;  /* Smallest unsigned value, so no element is below it.  */
+  for (int i = 0; i < n; ++i)
+    res = res > x[i] ? res : x[i];  /* Take x[i] unless RES is strictly larger.  */
+  return res;
+}
+
+unsigned short __attribute__((noipa))  /* Return the bitwise AND of x[0..n-1]; all-ones when n <= 0.  */
+and_loop (unsigned short *x, int n)
+{
+  unsigned short res = ~0;  /* All bits set: identity for &.  */
+  for (int i = 0; i < n; ++i)
+    res &= x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))  /* Return the bitwise OR of x[0..n-1]; 0 when n <= 0.  */
+or_loop (unsigned short *x, int n)
+{
+  unsigned short res = 0;  /* Identity for |.  */
+  for (int i = 0; i < n; ++i)
+    res |= x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))  /* Return the bitwise XOR of x[0..n-1]; 0 when n <= 0.  */
+eor_loop (unsigned short *x, int n)
+{
+  unsigned short res = 0;  /* Identity for ^.  */
+  for (int i = 0; i < n; ++i)
+    res ^= x[i];
+  return res;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c
new file mode 100644
index 00000000000..1dd579be701
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c
@@ -0,0 +1,49 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_10.c"
+
+int  /* Abort unless each loop from reduc_10.c returns the expected value.  */
+main (void)
+{
+  unsigned short x[N];
+  for (int i = 0; i < N; ++i)
+    x[i] = (i + 1) * (i + 2);  /* Product is truncated to unsigned short on store.  */
+
+  if (add_loop (x, 0) != 0  /* n == 0: the loop's own initial value comes back.  */
+      || add_loop (x, 11) != 572  /* 1*2 + 2*3 + ... + 11*12.  */
+      || add_loop (x, 0x100) != 22016
+      || add_loop (x, 0xfff) != 20480
+      || max_loop (x, 0) != 0
+      || max_loop (x, 11) != 132  /* x[10] = 11 * 12.  */
+      || max_loop (x, 0x100) != 65280
+      || max_loop (x, 0xfff) != 65504
+      || or_loop (x, 0) != 0
+      || or_loop (x, 11) != 0xfe
+      || or_loop (x, 0x80) != 0x7ffe
+      || or_loop (x, 0xb4) != 0x7ffe
+      || or_loop (x, 0xb5) != 0xfffe  /* x[0xb4] is the first element with bit 15 set.  */
+      || eor_loop (x, 0) != 0
+      || eor_loop (x, 11) != 0xe8
+      || eor_loop (x, 0x100) != 0xcf00
+      || eor_loop (x, 0xfff) != 0xa000)
+    __builtin_abort ();
+
+  for (int i = 0; i < N; ++i)
+    x[i] = ~x[i];  /* Complement the data before the MIN and AND checks.  */
+
+  if (min_loop (x, 0) != 65535  /* n == 0: the all-ones initial value comes back.  */
+      || min_loop (x, 11) != 65403  /* Complement of 132, the largest of the first 11.  */
+      || min_loop (x, 0x100) != 255
+      || min_loop (x, 0xfff) != 31
+      || and_loop (x, 0) != 0xffff
+      || and_loop (x, 11) != 0xff01
+      || and_loop (x, 0x80) != 0x8001
+      || and_loop (x, 0xb4) != 0x8001
+      || and_loop (x, 0xb5) != 1)  /* x[0xb4] now has bit 15 clear.  */
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c
new file mode 100644
index 00000000000..f99ef4aa865
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c
@@ -0,0 +1,71 @@
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+unsigned short __attribute__((noipa))  /* Return RES plus x[0..0xffe], truncated to unsigned short.  */
+add_loop (unsigned short *x, unsigned short res)
+{
+  for (int i = 0; i < 0xfff; ++i)  /* Constant trip count; RES is the caller's start value.  */
+    res += x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))  /* Return the minimum of RES and x[0..0xffe].  */
+min_loop (unsigned short *x, unsigned short res)
+{
+  for (int i = 0; i < 0xfff; ++i)  /* Constant trip count; RES is the caller's start value.  */
+    res = res < x[i] ? res : x[i];  /* Take x[i] unless RES is strictly smaller.  */
+  return res;
+}
+
+unsigned short __attribute__((noipa))  /* Return the maximum of RES and x[0..0xffe].  */
+max_loop (unsigned short *x, unsigned short res)
+{
+  for (int i = 0; i < 0xfff; ++i)  /* Constant trip count; RES is the caller's start value.  */
+    res = res > x[i] ? res : x[i];  /* Take x[i] unless RES is strictly larger.  */
+  return res;
+}
+
+unsigned short __attribute__((noipa))  /* Return RES ANDed with every element of x[0..0xffe].  */
+and_loop (unsigned short *x, unsigned short res)
+{
+  for (int i = 0; i < 0xfff; ++i)  /* Constant trip count; RES is the caller's start value.  */
+    res &= x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))  /* Return RES ORed with every element of x[0..0xffe].  */
+or_loop (unsigned short *x, unsigned short res)
+{
+  for (int i = 0; i < 0xfff; ++i)  /* Constant trip count; RES is the caller's start value.  */
+    res |= x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))  /* Return RES XORed with every element of x[0..0xffe].  */
+eor_loop (unsigned short *x, unsigned short res)
+{
+  for (int i = 0; i < 0xfff; ++i)  /* Constant trip count; RES is the caller's start value.  */
+    res ^= x[i];
+  return res;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c
new file mode 100644
index 00000000000..5b41560d2ef
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c
@@ -0,0 +1,34 @@
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_11.c"
+
+int  /* Abort unless each loop from reduc_11.c returns the expected value.  */
+main (void)
+{
+  unsigned short x[N];
+  for (int i = 0; i < N; ++i)
+    x[i] = (i + 1) * (i + 2);  /* Product is truncated to unsigned short on store.  */
+
+  if (add_loop (x, 42) != 20522
+      || max_loop (x, 65503) != 65504  /* Start value below the array maximum.  */
+      || max_loop (x, 65505) != 65505  /* Start value above it, so it must survive.  */
+      || or_loop (x, 0) != 0xfffe
+      || or_loop (x, 1) != 0xffff
+      || eor_loop (x, 0) != 0xa000
+      || eor_loop (x, 0xbfff) != 0x1fff)  /* 0xa000 ^ 0xbfff.  */
+    __builtin_abort ();
+
+  for (int i = 0; i < N; ++i)
+    x[i] = ~x[i];  /* Complement the data before the MIN and AND checks.  */
+
+  if (min_loop (x, 32) != 31  /* Start value above the array minimum.  */
+      || min_loop (x, 30) != 30  /* Start value below it, so it must survive.  */
+      || and_loop (x, 0xff) != 1
+      || and_loop (x, 0) != 0)
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c
new file mode 100644
index 00000000000..d32b81a61bc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c
@@ -0,0 +1,71 @@
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+unsigned short __attribute__((noipa))  /* Return RES plus x[0..n-1], truncated to unsigned short.  */
+add_loop (unsigned short *x, int n, unsigned short res)
+{
+  for (int i = 0; i < n; ++i)  /* No iterations when n <= 0, so RES is returned unchanged.  */
+    res += x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))  /* Return the minimum of RES and x[0..n-1].  */
+min_loop (unsigned short *x, int n, unsigned short res)
+{
+  for (int i = 0; i < n; ++i)  /* No iterations when n <= 0, so RES is returned unchanged.  */
+    res = res < x[i] ? res : x[i];  /* Take x[i] unless RES is strictly smaller.  */
+  return res;
+}
+
+unsigned short __attribute__((noipa))  /* Return the maximum of RES and x[0..n-1].  */
+max_loop (unsigned short *x, int n, unsigned short res)
+{
+  for (int i = 0; i < n; ++i)  /* No iterations when n <= 0, so RES is returned unchanged.  */
+    res = res > x[i] ? res : x[i];  /* Take x[i] unless RES is strictly larger.  */
+  return res;
+}
+
+unsigned short __attribute__((noipa))  /* Return RES ANDed with every element of x[0..n-1].  */
+and_loop (unsigned short *x, int n, unsigned short res)
+{
+  for (int i = 0; i < n; ++i)  /* No iterations when n <= 0, so RES is returned unchanged.  */
+    res &= x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))  /* Return RES ORed with every element of x[0..n-1].  */
+or_loop (unsigned short *x, int n, unsigned short res)
+{
+  for (int i = 0; i < n; ++i)  /* No iterations when n <= 0, so RES is returned unchanged.  */
+    res |= x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))  /* Return RES XORed with every element of x[0..n-1].  */
+eor_loop (unsigned short *x, int n, unsigned short res)
+{
+  for (int i = 0; i < n; ++i)  /* No iterations when n <= 0, so RES is returned unchanged.  */
+    res ^= x[i];
+  return res;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c
new file mode 100644
index 00000000000..929b81a9705
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c
@@ -0,0 +1,66 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_12.c"
+
+int  /* Abort unless each loop from reduc_12.c returns the expected value.  */
+main (void)
+{
+  unsigned short x[N];
+  for (int i = 0; i < N; ++i)
+    x[i] = (i + 1) * (i + 2);  /* Product is truncated to unsigned short on store.  */
+
+  if (add_loop (x, 0, 10) != 10  /* n == 0: the start value comes back unchanged.  */
+      || add_loop (x, 11, 42) != 614  /* 572 + 42.  */
+      || add_loop (x, 0x100, 84) != 22100
+      || add_loop (x, 0xfff, 20) != 20500
+      || max_loop (x, 0, 10) != 10
+      || max_loop (x, 11, 131) != 132  /* Start value just below the array maximum...  */
+      || max_loop (x, 11, 133) != 133  /* ...and just above it.  */
+      || max_loop (x, 0x100, 65279) != 65280
+      || max_loop (x, 0x100, 65281) != 65281
+      || max_loop (x, 0xfff, 65503) != 65504
+      || max_loop (x, 0xfff, 65505) != 65505
+      || or_loop (x, 0, 0x71) != 0x71
+      || or_loop (x, 11, 0) != 0xfe
+      || or_loop (x, 11, 0xb3c) != 0xbfe
+      || or_loop (x, 0x80, 0) != 0x7ffe
+      || or_loop (x, 0x80, 1) != 0x7fff
+      || or_loop (x, 0xb4, 0) != 0x7ffe
+      || or_loop (x, 0xb4, 1) != 0x7fff
+      || or_loop (x, 0xb5, 0) != 0xfffe  /* x[0xb4] is the first element with bit 15 set.  */
+      || or_loop (x, 0xb5, 1) != 0xffff
+      || eor_loop (x, 0, 0x3e) != 0x3e
+      || eor_loop (x, 11, 0) != 0xe8
+      || eor_loop (x, 11, 0x1ff) != 0x117  /* 0xe8 ^ 0x1ff.  */
+      || eor_loop (x, 0x100, 0) != 0xcf00
+      || eor_loop (x, 0x100, 0xeee) != 0xc1ee
+      || eor_loop (x, 0xfff, 0) != 0xa000
+      || eor_loop (x, 0xfff, 0x8888) != 0x2888)
+    __builtin_abort ();
+
+  for (int i = 0; i < N; ++i)
+    x[i] = ~x[i];  /* Complement the data before the MIN and AND checks.  */
+
+  if (min_loop (x, 0, 10000) != 10000  /* n == 0: the start value comes back unchanged.  */
+      || min_loop (x, 11, 65404) != 65403  /* Start value just above the array minimum...  */
+      || min_loop (x, 11, 65402) != 65402  /* ...and just below it.  */
+      || min_loop (x, 0x100, 256) != 255
+      || min_loop (x, 0x100, 254) != 254
+      || min_loop (x, 0xfff, 32) != 31
+      || min_loop (x, 0xfff, 30) != 30
+      || and_loop (x, 0, 0x1234) != 0x1234
+      || and_loop (x, 11, 0xffff) != 0xff01
+      || and_loop (x, 11, 0xcdef) != 0xcd01  /* 0xff01 & 0xcdef.  */
+      || and_loop (x, 0x80, 0xffff) != 0x8001
+      || and_loop (x, 0x80, 0xfffe) != 0x8000
+      || and_loop (x, 0xb4, 0xffff) != 0x8001
+      || and_loop (x, 0xb4, 0xfffe) != 0x8000
+      || and_loop (x, 0xb5, 0xffff) != 1
+      || and_loop (x, 0xb5, 0xfffe) != 0)
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c
new file mode 100644
index 00000000000..ce2b8f2fcdc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c
@@ -0,0 +1,101 @@
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+void __attribute__((noipa))  /* Add even-indexed elements of X to res[0] and odd-indexed ones to res[1].  */
+add_loop (unsigned int *x, unsigned int *res)
+{
+  unsigned int res0 = res[0];  /* Both accumulators start from the caller's values.  */
+  unsigned int res1 = res[1];
+  for (int i = 0; i < 0x7ff; ++i)  /* Constant trip count; reads x[0..0xffd].  */
+    {
+      res0 += x[i * 2];
+      res1 += x[i * 2 + 1];
+    }
+  res[0] = res0;  /* Results are written back through RES.  */
+  res[1] = res1;
+}
+
+void __attribute__((noipa))  /* Fold the minimum of the even elements of X into res[0], odd ones into res[1].  */
+min_loop (unsigned int *x, unsigned int *res)
+{
+  unsigned int res0 = res[0];  /* Both accumulators start from the caller's values.  */
+  unsigned int res1 = res[1];
+  for (int i = 0; i < 0x7ff; ++i)  /* Constant trip count; reads x[0..0xffd].  */
+    {
+      res0 = res0 < x[i * 2] ? res0 : x[i * 2];
+      res1 = res1 < x[i * 2 + 1] ? res1 : x[i * 2 + 1];
+    }
+  res[0] = res0;  /* Results are written back through RES.  */
+  res[1] = res1;
+}
+
+void __attribute__((noipa))  /* Fold the maximum of the even elements of X into res[0], odd ones into res[1].  */
+max_loop (unsigned int *x, unsigned int *res)
+{
+  unsigned int res0 = res[0];  /* Both accumulators start from the caller's values.  */
+  unsigned int res1 = res[1];
+  for (int i = 0; i < 0x7ff; ++i)  /* Constant trip count; reads x[0..0xffd].  */
+    {
+      res0 = res0 > x[i * 2] ? res0 : x[i * 2];
+      res1 = res1 > x[i * 2 + 1] ? res1 : x[i * 2 + 1];
+    }
+  res[0] = res0;  /* Results are written back through RES.  */
+  res[1] = res1;
+}
+
+void __attribute__((noipa))  /* AND the even-indexed elements of X into res[0] and odd-indexed ones into res[1].  */
+and_loop (unsigned int *x, unsigned int *res)
+{
+  unsigned int res0 = res[0];  /* Both accumulators start from the caller's values.  */
+  unsigned int res1 = res[1];
+  for (int i = 0; i < 0x7ff; ++i)  /* Constant trip count; reads x[0..0xffd].  */
+    {
+      res0 &= x[i * 2];
+      res1 &= x[i * 2 + 1];
+    }
+  res[0] = res0;  /* Results are written back through RES.  */
+  res[1] = res1;
+}
+
+void __attribute__((noipa))  /* OR the even-indexed elements of X into res[0] and odd-indexed ones into res[1].  */
+or_loop (unsigned int *x, unsigned int *res)
+{
+  unsigned int res0 = res[0];  /* Both accumulators start from the caller's values.  */
+  unsigned int res1 = res[1];
+  for (int i = 0; i < 0x7ff; ++i)  /* Constant trip count; reads x[0..0xffd].  */
+    {
+      res0 |= x[i * 2];
+      res1 |= x[i * 2 + 1];
+    }
+  res[0] = res0;  /* Results are written back through RES.  */
+  res[1] = res1;
+}
+
+void __attribute__((noipa))  /* XOR the even-indexed elements of X into res[0] and odd-indexed ones into res[1].  */
+eor_loop (unsigned int *x, unsigned int *res)
+{
+  unsigned int res0 = res[0];  /* Both accumulators start from the caller's values.  */
+  unsigned int res1 = res[1];
+  for (int i = 0; i < 0x7ff; ++i)  /* Constant trip count; reads x[0..0xffd].  */
+    {
+      res0 ^= x[i * 2];
+      res1 ^= x[i * 2 + 1];
+    }
+  res[0] = res0;  /* Results are written back through RES.  */
+  res[1] = res1;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c
new file mode 100644
index 00000000000..5514d8d6b3b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c
@@ -0,0 +1,61 @@
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_13.c"
+
+int  /* Abort unless each loop from reduc_13.c leaves the expected pair in its result array.  */
+main (void)
+{
+  unsigned int x[N];
+  for (int i = 0; i < N; ++i)
+    x[i] = ((i + 1) * (i + 2)) & 0xfffff;  /* Keep only the low 20 bits.  */
+
+  unsigned int add_res[2] = { 42, 1111 };  /* Distinct nonzero start values per accumulator.  */
+  add_loop (x, add_res);
+  if (add_res[0] != 968538154
+      || add_res[1] != 964340823)
+    __builtin_abort ();
+
+  unsigned int max_res1[2] = { 0, 0 };
+  max_loop (x, max_res1);
+  if (max_res1[0] != 1048150
+      || max_res1[1] != 1045506)
+    __builtin_abort ();
+
+  unsigned int max_res2[2] = { 1048151, 1045507 };  /* One above the maxima checked above.  */
+  max_loop (x, max_res2);
+  if (max_res2[0] != 1048151  /* The start values must come back unchanged.  */
+      || max_res2[1] != 1045507)
+    __builtin_abort ();
+
+  unsigned int or_res[2] = { 0x1000000, 0x2000000 };  /* Bits above the 20-bit data range.  */
+  or_loop (x, or_res);
+  if (or_res[0] != 0x10ffffe
+      || or_res[1] != 0x20ffffe)
+    __builtin_abort ();
+
+  unsigned int eor_res[2] = { 0x1000000, 0x2000000 };  /* Bits above the 20-bit data range.  */
+  eor_loop (x, eor_res);
+  if (eor_res[0] != 0x1010000
+      || eor_res[1] != 0x20b5000)
+    __builtin_abort ();
+
+  for (int i = 0; i < N; ++i)
+    x[i] = ~x[i] & 0xfffff;  /* Complement within the low 20 bits for the MIN checks.  */
+
+  unsigned int min_res1[2] = { 500, 4000 };
+  min_loop (x, min_res1);
+  if (min_res1[0] != 425
+      || min_res1[1] != 3069)
+    __builtin_abort ();
+
+  unsigned int min_res2[2] = { 424, 3068 };  /* One below the minima checked above.  */
+  min_loop (x, min_res2);
+  if (min_res2[0] != 424  /* The start values must come back unchanged.  */
+      || min_res2[1] != 3068)
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c
new file mode 100644
index 00000000000..3be611e4b37
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c
@@ -0,0 +1,107 @@
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+void __attribute__((noipa))  /* Add x[0], x[2], ... to res[0] and x[1], x[3], ... to res[1], N pairs in all.  */
+add_loop (unsigned int *x, int n, unsigned int *res)
+{
+  unsigned int res0 = res[0];  /* Both accumulators start from the caller's values.  */
+  unsigned int res1 = res[1];
+  for (int i = 0; i < n; ++i)  /* No iterations when n <= 0; reads x[0..2n-1] otherwise.  */
+    {
+      res0 += x[i * 2];
+      res1 += x[i * 2 + 1];
+    }
+  res[0] = res0;  /* Results are written back through RES.  */
+  res[1] = res1;
+}
+
+void __attribute__((noipa))  /* Fold the minimum of x[0], x[2], ... into res[0] and of x[1], x[3], ... into res[1].  */
+min_loop (unsigned int *x, int n, unsigned int *res)
+{
+  unsigned int res0 = res[0];  /* Both accumulators start from the caller's values.  */
+  unsigned int res1 = res[1];
+  for (int i = 0; i < n; ++i)  /* No iterations when n <= 0; reads x[0..2n-1] otherwise.  */
+    {
+      res0 = res0 < x[i * 2] ? res0 : x[i * 2];
+      res1 = res1 < x[i * 2 + 1] ? res1 : x[i * 2 + 1];
+    }
+  res[0] = res0;  /* Results are written back through RES.  */
+  res[1] = res1;
+}
+
+void __attribute__((noipa))  /* Fold the maximum of x[0], x[2], ... into res[0] and of x[1], x[3], ... into res[1].  */
+max_loop (unsigned int *x, int n, unsigned int *res)
+{
+  unsigned int res0 = res[0];  /* Both accumulators start from the caller's values.  */
+  unsigned int res1 = res[1];
+  for (int i = 0; i < n; ++i)  /* No iterations when n <= 0; reads x[0..2n-1] otherwise.  */
+    {
+      res0 = res0 > x[i * 2] ? res0 : x[i * 2];
+      res1 = res1 > x[i * 2 + 1] ? res1 : x[i * 2 + 1];
+    }
+  res[0] = res0;  /* Results are written back through RES.  */
+  res[1] = res1;
+}
+
+void __attribute__((noipa))  /* AND x[0], x[2], ... into res[0] and x[1], x[3], ... into res[1], N pairs in all.  */
+and_loop (unsigned int *x, int n, unsigned int *res)
+{
+  unsigned int res0 = res[0];  /* Both accumulators start from the caller's values.  */
+  unsigned int res1 = res[1];
+  for (int i = 0; i < n; ++i)  /* No iterations when n <= 0; reads x[0..2n-1] otherwise.  */
+    {
+      res0 &= x[i * 2];
+      res1 &= x[i * 2 + 1];
+    }
+  res[0] = res0;  /* Results are written back through RES.  */
+  res[1] = res1;
+}
+
+void __attribute__((noipa))  /* OR x[0], x[2], ... into res[0] and x[1], x[3], ... into res[1], N pairs in all.  */
+or_loop (unsigned int *x, int n, unsigned int *res)
+{
+  unsigned int res0 = res[0];  /* Both accumulators start from the caller's values.  */
+  unsigned int res1 = res[1];
+  for (int i = 0; i < n; ++i)  /* No iterations when n <= 0; reads x[0..2n-1] otherwise.  */
+    {
+      res0 |= x[i * 2];
+      res1 |= x[i * 2 + 1];
+    }
+  res[0] = res0;  /* Results are written back through RES.  */
+  res[1] = res1;
+}
+
+void __attribute__((noipa))  /* XOR x[0], x[2], ... into res[0] and x[1], x[3], ... into res[1], N pairs in all.  */
+eor_loop (unsigned int *x, int n, unsigned int *res)
+{
+  unsigned int res0 = res[0];  /* Both accumulators start from the caller's values.  */
+  unsigned int res1 = res[1];
+  for (int i = 0; i < n; ++i)  /* No iterations when n <= 0; reads x[0..2n-1] otherwise.  */
+    {
+      res0 ^= x[i * 2];
+      res1 ^= x[i * 2 + 1];
+    }
+  res[0] = res0;  /* Results are written back through RES.  */
+  res[1] = res1;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\t} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tuminv\t} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\t} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tandv\t} 2 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torv\t} 2 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teorv\t} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c
new file mode 100644
index 00000000000..ccaa770e9b2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c
@@ -0,0 +1,187 @@
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_14.c"
+
+int
+main (void)
+{
+  unsigned int x[N];
+  for (int i = 0; i < N; ++i)
+    x[i] = ((i + 1) * (i + 2)) & 0xfffff;
+
+  unsigned int add_res1[2] = { 11, 22 };
+  add_loop (x, 0, add_res1);
+  if (add_res1[0] != 11
+      || add_res1[1] != 22)
+    __builtin_abort ();
+
+  unsigned int add_res2[2] = { 10, 20 };
+  add_loop (x, 11, add_res2);
+  if (add_res2[0] != 1902
+      || add_res2[1] != 2176)
+    __builtin_abort ();
+
+  unsigned int add_res3[2] = { 15, 30 };
+  add_loop (x, 0x100, add_res3);
+  if (add_res3[0] != 22435087
+      || add_res3[1] != 22566686)
+    __builtin_abort ();
+
+  unsigned int add_res4[2] = { 100, 200 };
+  add_loop (x, 0x11f, add_res4);
+  if (add_res4[0] != 31602244
+      || add_res4[1] != 31767656)
+    __builtin_abort ();
+
+  unsigned int max_res1[2] = { 461, 500 };
+  max_loop (x, 11, max_res1);
+  if (max_res1[0] != 462
+      || max_res1[1] != 506)
+    __builtin_abort ();
+
+  unsigned int max_res2[2] = { 463, 507 };
+  max_loop (x, 11, max_res2);
+  if (max_res2[0] != 463
+      || max_res2[1] != 507)
+    __builtin_abort ();
+
+  unsigned int max_res3[2] = { 1000000, 1000000 };
+  max_loop (x, 0x200, max_res3);
+  if (max_res3[0] != 1047552
+      || max_res3[1] != 1045506)
+    __builtin_abort ();
+
+  unsigned int max_res4[2] = { 1047553, 1045507 };
+  max_loop (x, 0x200, max_res4);
+  if (max_res4[0] != 1047553
+      || max_res4[1] != 1045507)
+    __builtin_abort ();
+
+  unsigned int max_res5[2] = { 300000, 30000 };
+  max_loop (x, 0x11f, max_res5);
+  if (max_res5[0] != 328902
+      || max_res5[1] != 330050)
+    __builtin_abort ();
+
+  unsigned int max_res6[2] = { 328903, 330051 };
+  max_loop (x, 0x11f, max_res6);
+  if (max_res6[0] != 328903
+      || max_res6[1] != 330051)
+    __builtin_abort ();
+
+  unsigned int or_res1[2] = { 11, 22 };
+  or_loop (x, 0, or_res1);
+  if (or_res1[0] != 11
+      || or_res1[1] != 22)
+    __builtin_abort ();
+
+  unsigned int or_res2[2] = { 0x200000, 0xe00000 };
+  or_loop (x, 11, or_res2);
+  if (or_res2[0] != 0x2001fe
+      || or_res2[1] != 0xe001fe)
+    __builtin_abort ();
+
+  unsigned int or_res3[2] = { 0x800000, 0x700000 };
+  or_loop (x, 0x40, or_res3);
+  if (or_res3[0] != 0x803ffe
+      || or_res3[1] != 0x707ffe)
+    __builtin_abort ();
+
+  unsigned int or_res4[2] = { 0x100001, 0x300000 };
+  or_loop (x, 0x4f, or_res4);
+  if (or_res4[0] != 0x107fff
+      || or_res4[1] != 0x307ffe)
+    __builtin_abort ();
+
+  unsigned int eor_res1[2] = { 11, 22 };
+  eor_loop (x, 0, eor_res1);
+  if (eor_res1[0] != 11
+      || eor_res1[1] != 22)
+    __builtin_abort ();
+
+  unsigned int eor_res2[2] = { 0x2000ff, 0xe000ff };
+  eor_loop (x, 11, eor_res2);
+  if (eor_res2[0] != 0x2001cf
+      || eor_res2[1] != 0xe000b7)
+    __builtin_abort ();
+
+  unsigned int eor_res3[2] = { 0x805000, 0x70f000 };
+  eor_loop (x, 0x100, eor_res3);
+  if (eor_res3[0] != 0x824200
+      || eor_res3[1] != 0x77dc00)
+    __builtin_abort ();
+
+  unsigned int eor_res4[2] = { 0x101201, 0x300f00 };
+  eor_loop (x, 0x11f, eor_res4);
+  if (eor_res4[0] != 0x178801
+      || eor_res4[1] != 0x337240)
+    __builtin_abort ();
+
+  for (int i = 0; i < N; ++i)
+    x[i] = ~x[i] & 0xfffff;
+
+  unsigned int min_res1[2] = { 1048200, 1048100 };
+  min_loop (x, 11, min_res1);
+  if (min_res1[0] != 1048113
+      || min_res1[1] != 1048069)
+    __builtin_abort ();
+
+  unsigned int min_res2[2] = { 1048112, 1048068 };
+  min_loop (x, 11, min_res2);
+  if (min_res2[0] != 1048112
+      || min_res2[1] != 1048068)
+    __builtin_abort ();
+
+  unsigned int min_res3[2] = { 10000, 10000 };
+  min_loop (x, 0x200, min_res3);
+  if (min_res3[0] != 1023
+      || min_res3[1] != 3069)
+    __builtin_abort ();
+
+  unsigned int min_res4[2] = { 1022, 3068 };
+  min_loop (x, 0x200, min_res4);
+  if (min_res4[0] != 1022
+      || min_res4[1] != 3068)
+    __builtin_abort ();
+
+  unsigned int min_res5[2] = { 719680, 718530 };
+  min_loop (x, 0x11f, min_res5);
+  if (min_res5[0] != 719673
+      || min_res5[1] != 718525)
+    __builtin_abort ();
+
+  unsigned int min_res6[2] = { 719672, 718524 };
+  min_loop (x, 0x11f, min_res6);
+  if (min_res6[0] != 719672
+      || min_res6[1] != 718524)
+    __builtin_abort ();
+
+  unsigned int and_res1[2] = { 11, 22 };
+  and_loop (x, 0, and_res1);
+  if (and_res1[0] != 11
+      || and_res1[1] != 22)
+    __builtin_abort ();
+
+  unsigned int and_res2[2] = { 0xf5cff, 0xf78ff };
+  and_loop (x, 11, and_res2);
+  if (and_res2[0] != 0xf5c01
+      || and_res2[1] != 0xf7801)
+    __builtin_abort ();
+
+  unsigned int and_res3[2] = { 0x7efff, 0xecfff };
+  and_loop (x, 0x40, and_res3);
+  if (and_res3[0] != 0x7c001
+      || and_res3[1] != 0xe8001)
+    __builtin_abort ();
+
+  unsigned int and_res4[2] = { 0xffffff, 0xffffff };
+  and_loop (x, 0x4f, and_res4);
+  if (and_res4[0] != 0xf8001
+      || and_res4[1] != 0xf8001)
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c
new file mode 100644
index 00000000000..15b1ade30e2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c
@@ -0,0 +1,16 @@
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+int __attribute__((noipa))
+add_loop (int *x, int n, int res)
+{
+  for (int i = 0; i < n; ++i)
+    {
+      res += x[i * 2];
+      res += x[i * 2 + 1];
+    }
+  return res;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c
new file mode 100644
index 00000000000..3207fce5be3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c
@@ -0,0 +1,22 @@
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_15.c"
+
+int
+main (void)
+{
+  int x[N];
+  for (int i = 0; i < N; ++i)
+    x[i] = ((i + 1) * (i + 2)) & 0xfffff;
+
+  if (add_loop (x, 0, 33) != 33
+      || add_loop (x, 11, 30) != 4078
+      || add_loop (x, 0x100, 45) != 45001773
+      || add_loop (x, 0x11f, 300) != 63369900)
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c
new file mode 100644
index 00000000000..b839821d6bb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c
@@ -0,0 +1,77 @@
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+unsigned short __attribute__((noipa))
+add_loop (unsigned short *x)
+{
+  unsigned short res = 0;
+  for (int i = 0; i < 0xfff; ++i)
+    res += x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+min_loop (unsigned short *x)
+{
+  unsigned short res = ~0;
+  for (int i = 0; i < 0xfff; ++i)
+    res = res < x[i] ? res : x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+max_loop (unsigned short *x)
+{
+  unsigned short res = 0;
+  for (int i = 0; i < 0xfff; ++i)
+    res = res > x[i] ? res : x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+and_loop (unsigned short *x)
+{
+  unsigned short res = ~0;
+  for (int i = 0; i < 0xfff; ++i)
+    res &= x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+or_loop (unsigned short *x)
+{
+  unsigned short res = 0;
+  for (int i = 0; i < 0xfff; ++i)
+    res |= x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+eor_loop (unsigned short *x)
+{
+  unsigned short res = 0;
+  for (int i = 0; i < 0xfff; ++i)
+    res ^= x[i];
+  return res;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c
new file mode 100644
index 00000000000..aa248f53eaa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c
@@ -0,0 +1,29 @@
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_9.c"
+
+int
+main (void)
+{
+  unsigned short x[N];
+  for (int i = 0; i < N; ++i)
+    x[i] = (i + 1) * (i + 2);
+
+  if (add_loop (x) != 20480
+      || max_loop (x) != 65504
+      || or_loop (x) != 0xfffe
+      || eor_loop (x) != 0xa000)
+    __builtin_abort ();
+
+  for (int i = 0; i < N; ++i)
+    x[i] = ~x[i];
+
+  if (min_loop (x) != 31
+      || and_loop (x) != 1)
+    __builtin_abort ();
+
+  return 0;
+}

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 10/10] vect: Reuse reduction accumulators between loops
  2021-07-08 12:43 ` [PATCH 10/10] vect: Reuse reduction accumulators between loops Richard Sandiford
@ 2021-07-09 11:58   ` Richard Biener
  2021-07-09 13:12     ` Richard Sandiford
  0 siblings, 1 reply; 30+ messages in thread
From: Richard Biener @ 2021-07-09 11:58 UTC (permalink / raw)
  To: Richard Sandiford, GCC Patches

On Thu, Jul 8, 2021 at 2:50 PM Richard Sandiford via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> This patch adds support for reusing a main loop's reduction accumulator
> in an epilogue loop.  This in turn lets the loops share a single piece
> of vector->scalar reduction code.
>
> The patch has the following restrictions:
>
> (1) The epilogue reduction can only operate on a single vector
>     (e.g. ncopies must be 1 for non-SLP reductions, and the group size
>     must be <= the element count for SLP reductions).
>
> (2) Both loops must use the same vector mode for their accumulators.
>     This means that the patch is restricted to targets that support
>     --param vect-partial-vector-usage=1.
>
> (3) The reduction must be a standard “tree code” reduction.
>
> However, these restrictions could be lifted in future.  For example,
> if the main loop operates on 128-bit vectors and the epilogue loop
> operates on 64-bit vectors, we could in future reduce the 128-bit
> vector by one stage and use the 64-bit result as the starting point
> for the epilogue result.

Yeah, I hope that can be done quickly - it should make the
approach usable on x86_64.

> The patch tries to handle chained SLP reductions, unchained SLP
> reductions and non-SLP reductions.  It also handles cases in which
> the epilogue loop is entered directly (rather than via the main loop)
> and cases in which the epilogue loop can be skipped.
>
> vect_get_main_loop_result is a bit more general than the current
> patch needs.

I didn't see anything that would adjust the costing of the vectorization
(though I don't specifically remember how we cost vectorized epilogues
in general).

A few comments / questions inline below - I think the patch is OK
as-is though.

Thanks,
Richard.

> gcc/
>         * tree-vectorizer.h (vect_reusable_accumulator): New structure.
>         (_loop_vec_info::main_loop_edge): New field.
>         (_loop_vec_info::skip_main_loop_edge): Likewise.
>         (_loop_vec_info::skip_this_loop_edge): Likewise.
>         (_loop_vec_info::reusable_accumulators): Likewise.
>         (_stmt_vec_info::reduc_initial_values): Likewise.
>         (_stmt_vec_info::reduc_scalar_results): Likewise.
>         (_stmt_vec_info::reused_accumulator): Likewise.
>         (vect_get_main_loop_result): Declare.
>         * tree-vectorizer.c (vec_info::new_stmt_vec_info): Initialize
>         reduc_initial_values and reduc_scalar_results.
>         (vec_info::free_stmt_vec_info): Free reduc_initial_values and
>         reduc_scalar_results.
>         * tree-vect-loop-manip.c (vect_get_main_loop_result): New function.
>         (vect_do_peeling): Fill an epilogue loop's main_loop_edge,
>         skip_main_loop_edge and skip_this_loop_edge fields.
>         * tree-vect-loop.c (INCLUDE_ALGORITHM): Define.
>         (vect_emit_reduction_init_stmts): New function.
>         (get_initial_def_for_reduction): Use it.
>         (get_initial_defs_for_reduction): Likewise.  Change the vinfo
>         parameter to a loop_vec_info.
>         (vect_create_epilog_for_reduction): Store the scalar results
>         in the reduc_info.  If an epilogue loop is reusing an accumulator
>         from the main loop, and if the epilogue loop can also be skipped,
>         try to place the reduction code in the join block.  Record
>         accumulators that could potentially be reused by epilogue loops.
>         (vect_transform_cycle_phi): When vectorizing epilogue loops,
>         try to reuse accumulators from the main loop.  Record the initial
>         value in reduc_info for non-SLP reductions too.
>
> gcc/testsuite/
>         * gcc.target/aarch64/sve/reduc_9.c: New test.
>         * gcc.target/aarch64/sve/reduc_9_run.c: Likewise.
>         * gcc.target/aarch64/sve/reduc_10.c: Likewise.
>         * gcc.target/aarch64/sve/reduc_10_run.c: Likewise.
>         * gcc.target/aarch64/sve/reduc_11.c: Likewise.
>         * gcc.target/aarch64/sve/reduc_11_run.c: Likewise.
>         * gcc.target/aarch64/sve/reduc_12.c: Likewise.
>         * gcc.target/aarch64/sve/reduc_12_run.c: Likewise.
>         * gcc.target/aarch64/sve/reduc_13.c: Likewise.
>         * gcc.target/aarch64/sve/reduc_13_run.c: Likewise.
>         * gcc.target/aarch64/sve/reduc_14.c: Likewise.
>         * gcc.target/aarch64/sve/reduc_14_run.c: Likewise.
>         * gcc.target/aarch64/sve/reduc_15.c: Likewise.
>         * gcc.target/aarch64/sve/reduc_15_run.c: Likewise.
> ---
>  .../gcc.target/aarch64/sve/reduc_10.c         |  77 +++++
>  .../gcc.target/aarch64/sve/reduc_10_run.c     |  49 +++
>  .../gcc.target/aarch64/sve/reduc_11.c         |  71 ++++
>  .../gcc.target/aarch64/sve/reduc_11_run.c     |  34 ++
>  .../gcc.target/aarch64/sve/reduc_12.c         |  71 ++++
>  .../gcc.target/aarch64/sve/reduc_12_run.c     |  66 ++++
>  .../gcc.target/aarch64/sve/reduc_13.c         | 101 ++++++
>  .../gcc.target/aarch64/sve/reduc_13_run.c     |  61 ++++
>  .../gcc.target/aarch64/sve/reduc_14.c         | 107 ++++++
>  .../gcc.target/aarch64/sve/reduc_14_run.c     | 187 +++++++++++
>  .../gcc.target/aarch64/sve/reduc_15.c         |  16 +
>  .../gcc.target/aarch64/sve/reduc_15_run.c     |  22 ++
>  .../gcc.target/aarch64/sve/reduc_9.c          |  77 +++++
>  .../gcc.target/aarch64/sve/reduc_9_run.c      |  29 ++
>  gcc/tree-vect-loop-manip.c                    |  29 ++
>  gcc/tree-vect-loop.c                          | 309 ++++++++++++++----
>  gcc/tree-vectorizer.c                         |   4 +
>  gcc/tree-vectorizer.h                         |  51 ++-
>  18 files changed, 1297 insertions(+), 64 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c
>
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index e2fd3609fee..ed7a7738880 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -551,6 +551,18 @@ typedef auto_vec<rgroup_controls> vec_loop_lens;
>
>  typedef auto_vec<std::pair<data_reference*, tree> > drs_init_vec;
>
> +/* Information about a reduction accumulator from the main loop that could
> +   conceivably be reused as the input to a reduction in an epilogue loop.  */
> +struct vect_reusable_accumulator {
> +  /* The final value of the accumulator, which forms the input to the
> +     reduction operation.  */
> +  tree reduc_input;
> +
> +  /* The stmt_vec_info that describes the reduction (i.e. the one for
> +     which is_reduc_info is true).  */
> +  stmt_vec_info reduc_info;
> +};
> +
>  /*-----------------------------------------------------------------*/
>  /* Info on vectorized loops.                                       */
>  /*-----------------------------------------------------------------*/
> @@ -588,6 +600,23 @@ public:
>    /* Unrolling factor  */
>    poly_uint64 vectorization_factor;
>
> +  /* If this loop is an epilogue loop whose main loop can be skipped,
> +     MAIN_LOOP_EDGE is the edge from the main loop to this loop's
> +     preheader.  SKIP_MAIN_LOOP_EDGE is then the edge that skips the
> +     main loop and goes straight to this loop's preheader.
> +
> +     Both fields are null otherwise.  */
> +  edge main_loop_edge;
> +  edge skip_main_loop_edge;
> +
> +  /* If this loop is an epilogue loop that might be skipped after executing
> +     the main loop, this edge is the one that skips the epilogue.  */
> +  edge skip_this_loop_edge;
> +
> +  /* After vectorization, maps live-out SSA names to information about
> +     the reductions that generated them.  */
> +  hash_map<tree, vect_reusable_accumulator> reusable_accumulators;

Is that the LC PHI node defs or the definition inside of the loop?
If the latter we could attach the info directly to its stmt-info?

> +
>    /* Maximum runtime vectorization factor, or MAX_VECTORIZATION_FACTOR
>       if there is no particular limit.  */
>    unsigned HOST_WIDE_INT max_vectorization_factor;
> @@ -1186,6 +1215,21 @@ public:
>    /* The vector type for performing the actual reduction.  */
>    tree reduc_vectype;
>
> +  /* If IS_REDUC_INFO is true and if the reduction is operating on N
> +     elements in parallel, this vector gives the initial values of these
> +     N elements.  */

Is that N scalar elements or N vector elements?  I suppose it's for
SLP reductions (rather than SLP reduction chains) and never non-SLP
reductions?

> +  vec<tree> reduc_initial_values;
> +
> +  /* If IS_REDUC_INFO is true and if the reduction is operating on N
> +     elements in parallel, this vector gives the scalar result of each
> +     reduction.  */
> +  vec<tree> reduc_scalar_results;
> +
> +  /* Only meaningful if IS_REDUC_INFO.  If non-null, the reduction is
> +     being performed by an epilogue loop and we have decided to reuse
> +     this accumulator from the main loop.  */
> +  vect_reusable_accumulator *reused_accumulator;
> +
>    /* Whether we force a single cycle PHI during reduction vectorization.  */
>    bool force_single_cycle;
>
> @@ -1382,12 +1426,6 @@ vect_phi_initial_value (gphi *phi)
>    return PHI_ARG_DEF_FROM_EDGE (phi, pe);
>  }
>
> -static inline tree
> -vect_phi_initial_value (stmt_vec_info stmt_info)
> -{
> -  return vect_phi_initial_value (as_a <gphi *> (stmt_info->stmt));
> -}
> -
>  /* Return true if STMT_INFO should produce a vector mask type rather than
>     a normal nonmask type.  */
>
> @@ -1818,6 +1856,7 @@ class loop *vect_loop_versioning (loop_vec_info, gimple *);
>  extern class loop *vect_do_peeling (loop_vec_info, tree, tree,
>                                     tree *, tree *, tree *, int, bool, bool,
>                                     tree *);
> +extern tree vect_get_main_loop_result (loop_vec_info, tree, tree = NULL_TREE);
>  extern void vect_prepare_for_masked_peels (loop_vec_info);
>  extern dump_user_location_t find_loop_location (class loop *);
>  extern bool vect_can_advance_ivs_p (loop_vec_info);
> diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
> index 9748043f3ee..f1035a83826 100644
> --- a/gcc/tree-vectorizer.c
> +++ b/gcc/tree-vectorizer.c
> @@ -694,6 +694,8 @@ vec_info::new_stmt_vec_info (gimple *stmt)
>    STMT_VINFO_SLP_VECT_ONLY (res) = false;
>    STMT_VINFO_SLP_VECT_ONLY_PATTERN (res) = false;
>    STMT_VINFO_VEC_STMTS (res) = vNULL;
> +  res->reduc_initial_values = vNULL;
> +  res->reduc_scalar_results = vNULL;
>
>    if (is_a <loop_vec_info> (this)
>        && gimple_code (stmt) == GIMPLE_PHI
> @@ -755,6 +757,8 @@ vec_info::free_stmt_vec_info (stmt_vec_info stmt_info)
>         release_ssa_name (lhs);
>      }
>
> +  stmt_info->reduc_initial_values.release ();
> +  stmt_info->reduc_scalar_results.release ();
>    STMT_VINFO_SIMD_CLONE_INFO (stmt_info).release ();
>    STMT_VINFO_VEC_STMTS (stmt_info).release ();
>    free (stmt_info);
> diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
> index 2909e8a0fc3..b7b0523e3c8 100644
> --- a/gcc/tree-vect-loop-manip.c
> +++ b/gcc/tree-vect-loop-manip.c
> @@ -2457,6 +2457,31 @@ vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
>    return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
>  }
>
> +/* LOOP_VINFO is an epilogue loop and MAIN_LOOP_VALUE is available on exit
> +   from the corresponding main loop.  Return a value that is available in
> +   LOOP_VINFO's preheader, using SKIP_VALUE if the main loop is skipped.
> +   Passing a null SKIP_VALUE is equivalent to passing zero.  */
> +
> +tree
> +vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
> +                          tree skip_value)
> +{
> +  if (!loop_vinfo->main_loop_edge)
> +    return main_loop_value;
> +
> +  if (!skip_value)
> +    skip_value = build_zero_cst (TREE_TYPE (main_loop_value));

Shouldn't that be the initial value?

> +
> +  tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
> +  basic_block bb = loop_vinfo->main_loop_edge->dest;
> +  gphi *new_phi = create_phi_node (phi_result, bb);
> +  add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
> +              UNKNOWN_LOCATION);
> +  add_phi_arg (new_phi, skip_value,
> +              loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
> +  return phi_result;
> +}
> +
>  /* Function vect_do_peeling.
>
>     Input:
> @@ -2986,6 +3011,8 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
>                                            skip_vector ? anchor : guard_bb,
>                                            prob_epilog.invert (),
>                                            irred_flag);
> +         if (vect_epilogues)
> +           epilogue_vinfo->skip_this_loop_edge = guard_e;
>           slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
>                                               single_exit (epilog));
>           /* Only need to handle basic block before epilog loop if it's not
> @@ -3057,6 +3084,8 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
>           add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
>                        UNKNOWN_LOCATION);
>           niters = PHI_RESULT (new_phi);
> +         epilogue_vinfo->main_loop_edge = update_e;
> +         epilogue_vinfo->skip_main_loop_edge = skip_e;
>         }
>
>        /* Set ADVANCE to the number of iterations performed by the previous
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index fe7e73f655f..5e6c9b7c38a 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -19,6 +19,7 @@ You should have received a copy of the GNU General Public License
>  along with GCC; see the file COPYING3.  If not see
>  <http://www.gnu.org/licenses/>.  */
>
> +#define INCLUDE_ALGORITHM
>  #include "config.h"
>  #include "system.h"
>  #include "coretypes.h"
> @@ -823,6 +824,10 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
>      th (0),
>      versioning_threshold (0),
>      vectorization_factor (0),
> +    main_loop_edge (nullptr),
> +    skip_main_loop_edge (nullptr),
> +    skip_this_loop_edge (nullptr),
> +    reusable_accumulators (),
>      max_vectorization_factor (0),
>      mask_skip_niters (NULL_TREE),
>      rgroup_compare_type (NULL_TREE),
> @@ -4607,7 +4612,32 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
>                   prologue_cost, epilogue_cost);
>  }
>
> +/* SEQ is a sequence of instructions that initialize the reduction
> +   described by REDUC_INFO.  Emit them in the appropriate place.  */
>
> +static void
> +vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
> +                               stmt_vec_info reduc_info, gimple *seq)
> +{
> +  if (reduc_info->reused_accumulator)
> +    {
> +      /* When reusing an accumulator from the main loop, we only need
> +        initialization instructions if the main loop can be skipped.
> +        In that case, emit the initialization instructions at the end
> +        of the guard block that does the skip.  */
> +      edge skip_edge = loop_vinfo->skip_main_loop_edge;
> +      gcc_assert (skip_edge);
> +      gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
> +      gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
> +    }
> +  else
> +    {
> +      /* The normal case: emit the initialization instructions on the
> +        preheader edge.  */
> +      class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> +      gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
> +    }
> +}
>
>  /* Function get_initial_def_for_reduction
>
> @@ -4675,36 +4705,30 @@ get_initial_def_for_reduction (loop_vec_info loop_vinfo,
>      }
>
>    if (stmts)
> -    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
> +    vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
>    return init_def;
>  }
>
> -/* Get at the initial defs for the reduction PHIs for REDUC_INFO, whose
> -   associated SLP node is SLP_NODE.  NUMBER_OF_VECTORS is the number of vector
> -   defs to create.  If NEUTRAL_OP is nonnull, introducing extra elements of
> -   that value will not change the result.  */
> +/* Get at the initial defs for the reduction PHIs for REDUC_INFO,
> +   which performs a reduction involving GROUP_SIZE scalar statements.
> +   NUMBER_OF_VECTORS is the number of vector defs to create.  If NEUTRAL_OP
> +   is nonnull, introducing extra elements of that value will not change the
> +   result.  */
>
>  static void
> -get_initial_defs_for_reduction (vec_info *vinfo,
> +get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
>                                 stmt_vec_info reduc_info,
> -                               slp_tree slp_node,
>                                 vec<tree> *vec_oprnds,
>                                 unsigned int number_of_vectors,
> -                               bool reduc_chain, tree neutral_op)
> +                               unsigned int group_size, tree neutral_op)
>  {
> -  vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
> +  vec<tree> &initial_values = reduc_info->reduc_initial_values;
>    unsigned HOST_WIDE_INT nunits;
>    unsigned j, number_of_places_left_in_vector;
>    tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
> -  unsigned int group_size = stmts.length ();
>    unsigned int i;
> -  class loop *loop;
> -
> -  loop = (gimple_bb (reduc_info->stmt))->loop_father;
> -  gcc_assert (loop);
> -  edge pe = loop_preheader_edge (loop);
>
> -  gcc_assert (!reduc_chain || neutral_op);
> +  gcc_assert (group_size == initial_values.length () || neutral_op);
>
>    /* NUMBER_OF_COPIES is the number of times we need to use the same values in
>       created vectors. It is greater than 1 if unrolling is performed.
> @@ -4734,18 +4758,13 @@ get_initial_defs_for_reduction (vec_info *vinfo,
>      {
>        tree op;
>        i = j % group_size;
> -      stmt_vec_info stmt_vinfo = stmts[i];
>
>        /* Get the def before the loop.  In reduction chain we have only
>          one initial value.  Else we have as many as PHIs in the group.  */
> -      if (reduc_chain)
> -       op = j != 0 ? neutral_op : vect_phi_initial_value (stmt_vinfo);
> -      else if (((vec_oprnds->length () + 1) * nunits
> -               - number_of_places_left_in_vector >= group_size)
> -              && neutral_op)
> +      if (i >= initial_values.length () || (j > i && neutral_op))
>         op = neutral_op;
>        else
> -       op = vect_phi_initial_value (stmt_vinfo);
> +       op = initial_values[i];
>
>        /* Create 'vect_ = {op0,op1,...,opn}'.  */
>        number_of_places_left_in_vector--;
> @@ -4781,8 +4800,8 @@ get_initial_defs_for_reduction (vec_info *vinfo,
>             {
>               /* First time round, duplicate ELTS to fill the
>                  required number of vectors.  */
> -             duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
> -                                       number_of_vectors, *vec_oprnds);
> +             duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
> +                                       elts, number_of_vectors, *vec_oprnds);
>               break;
>             }
>           vec_oprnds->quick_push (init);
> @@ -4794,7 +4813,7 @@ get_initial_defs_for_reduction (vec_info *vinfo,
>         }
>      }
>    if (ctor_seq != NULL)
> -    gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
> +    vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
>  }
>
>  /* For a statement STMT_INFO taking part in a reduction operation return
> @@ -4823,6 +4842,100 @@ info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
>    return stmt_info;
>  }
>
> +/* REDUC_INFO describes a reduction in LOOP_VINFO that we are going to
> +   vectorize.  See if LOOP_VINFO is an epilogue loop whose main loop had a
> +   matching reduction that we can build on.  Adjust REDUC_INFO and return true
> +   if so, otherwise return false.  */
> +
> +static bool
> +vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
> +                               stmt_vec_info reduc_info)
> +{
> +  loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
> +  if (!main_loop_vinfo)
> +    return false;
> +
> +  if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
> +    return false;
> +
> +  unsigned int num_phis = reduc_info->reduc_initial_values.length ();
> +  auto_vec<tree, 16> main_loop_results (num_phis);
> +  auto_vec<tree, 16> initial_values (num_phis);
> +  if (edge main_loop_edge = loop_vinfo->main_loop_edge)
> +    {
> +      /* The epilogue loop can be entered either from the main loop or
> +        from an earlier guard block.  */
> +      edge skip_edge = loop_vinfo->skip_main_loop_edge;
> +      for (tree incoming_value : reduc_info->reduc_initial_values)
> +       {
> +         /* Look for:
> +
> +              INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
> +                                   INITIAL_VALUE(guard block)>.  */
> +         gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
> +
> +         gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
> +         gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
> +
> +         tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
> +         tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
> +
> +         main_loop_results.quick_push (from_main_loop);
> +         initial_values.quick_push (from_skip);
> +       }
> +    }
> +  else
> +    /* The main loop dominates the epilogue loop.  */
> +    main_loop_results.splice (reduc_info->reduc_initial_values);
> +
> +  /* See if the main loop has the kind of accumulator we need.  */
> +  vect_reusable_accumulator *accumulator
> +    = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
> +  if (!accumulator
> +      || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
> +      || !std::equal (main_loop_results.begin (), main_loop_results.end (),
> +                     accumulator->reduc_info->reduc_scalar_results.begin ()))
> +    return false;
> +
> +  /* For now, only handle the case in which both loops are operating on the
> +     same vector types.  In future we could reduce wider vectors to narrower
> +     ones as well.  */
> +  tree vectype = STMT_VINFO_VECTYPE (reduc_info);
> +  tree old_vectype = TREE_TYPE (accumulator->reduc_input);
> +  if (!useless_type_conversion_p (old_vectype, vectype))

It should indeed be quite trivial to handle, as should the case where we
have multiple PHIs - just reduce to a single input vector and fill the
possibly multiple input vectors in the epilogue with neutral elements.
I'll see if I can cook something up for this next week.

> +    return false;
> +
> +  /* Non-SLP reductions might apply an adjustment after the reduction
> +     operation, in order to simplify the initialization of the accumulator.
> +     If the epilogue loop carries on from where the main loop left off,
> +     it should apply the same adjustment to the final reduction result.
> +
> +     If the epilogue loop can also be entered directly (rather than via
> +     the main loop), we need to be able to handle that case in the same way,
> +     with the same adjustment.  (In principle we could add a PHI node
> +     to select the correct adjustment, but in practice that shouldn't be
> +     necessary.)  */
> +  tree main_adjustment
> +    = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
> +  if (loop_vinfo->main_loop_edge && main_adjustment)
> +    {
> +      gcc_assert (num_phis == 1);
> +      tree initial_value = initial_values[0];
> +      /* Check that we can use INITIAL_VALUE as the adjustment and
> +        initialize the accumulator with a neutral value instead.  */
> +      if (!operand_equal_p (initial_value, main_adjustment))
> +       return false;
> +      tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
> +      initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
> +                                                   code, initial_value);
> +    }
> +  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
> +  reduc_info->reduc_initial_values.truncate (0);
> +  reduc_info->reduc_initial_values.splice (initial_values);
> +  reduc_info->reused_accumulator = accumulator;
> +  return true;
> +}
> +
>  /* Function vect_create_epilog_for_reduction
>
>     Create code at the loop-epilog to finalize the result of a reduction
> @@ -4915,7 +5028,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>    gimple *use_stmt;
>    auto_vec<tree> reduc_inputs;
>    int j, i;
> -  auto_vec<tree> scalar_results;
> +  vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
>    unsigned int group_size = 1, k;
>    auto_vec<gimple *> phis;
>    /* SLP reduction without reduction chain, e.g.,
> @@ -4941,16 +5054,12 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>    gcc_assert (vectype);
>    mode = TYPE_MODE (vectype);
>
> -  tree initial_def = NULL;
>    tree induc_val = NULL_TREE;
>    tree adjustment_def = NULL;
>    if (slp_node)
>      ;
>    else
>      {
> -      /* Get at the scalar def before the loop, that defines the initial value
> -        of the reduction variable.  */
> -      initial_def = vect_phi_initial_value (reduc_def_stmt);
>        /* Optimize: for induction condition reduction, if we can't use zero
>           for induc_val, use initial_def.  */
>        if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
> @@ -5196,6 +5305,37 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>        reduc_inputs.safe_push (single_input);
>      }
>
> +  tree orig_reduc_input = reduc_inputs[0];
> +
> +  /* If this loop is an epilogue loop that can be skipped after the
> +     main loop, we can only share a reduction operation between the
> +     main loop and the epilogue if we put it at the target of the
> +     skip edge.

Do you have a testcase where we cannot do this?

> +     We can still reuse accumulators if this check fails.  Doing so has
> +     the minor(?) benefit of making the epilogue loop's scalar result
> +     independent of the main loop's scalar result.  */
> +  bool unify_with_main_loop_p = false;
> +  if (reduc_info->reused_accumulator
> +      && loop_vinfo->skip_this_loop_edge
> +      && single_succ_p (exit_bb)
> +      && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
> +    {
> +      unify_with_main_loop_p = true;
> +
> +      basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
> +      reduc_inputs[0] = make_ssa_name (vectype);
> +      gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
> +      add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
> +                  UNKNOWN_LOCATION);
> +      add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
> +                  loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
> +      exit_gsi = gsi_after_labels (reduc_block);
> +    }
> +
> +  /* Shouldn't be used beyond this point.  */
> +  exit_bb = nullptr;
> +
>    if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
>        && reduc_fn != IFN_LAST)
>      {
> @@ -5405,6 +5545,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>              the same as initial_def already.  */
>           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
>                                   induc_val);
> +         tree initial_def = reduc_info->reduc_initial_values[0];
>
>           tmp = make_ssa_name (new_scalar_dest);
>           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
> @@ -5425,9 +5566,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>        gcc_assert (reduc_inputs.length () == 1);
>        gcc_assert (pow2p_hwi (group_size));
>
> -      slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
> -      vec<stmt_vec_info> orig_phis
> -       = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
>        gimple_seq seq = NULL;
>
>        /* Build a vector {0, 1, 2, ...}, with the same number of elements
> @@ -5452,7 +5590,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>         {
>           tree initial_value = NULL_TREE;
>           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
> -           initial_value = vect_phi_initial_value (orig_phis[0]);
> +           initial_value = reduc_info->reduc_initial_values[0];
>           neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
>                                                  initial_value);
>         }
> @@ -5466,7 +5604,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>              for MIN and MAX reduction, for example.  */
>           if (!neutral_op)
>             {
> -             tree scalar_value = vect_phi_initial_value (orig_phis[i]);
> +             tree scalar_value = reduc_info->reduc_initial_values[i];
>               scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
>                                              scalar_value);
>               vector_identity = gimple_build_vector_from_val (&seq, vectype,
> @@ -5780,6 +5918,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>              the same as initial_def already.  */
>           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
>                                   induc_val);
> +         tree initial_def = reduc_info->reduc_initial_values[0];
>
>           tree tmp = make_ssa_name (new_scalar_dest);
>           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
> @@ -5819,6 +5958,12 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>        scalar_results[0] = new_temp;
>      }
>
> +  /* Record this operation if it could be reused by the epilogue loop.  */
> +  if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
> +      && !double_reduc)

What's the issue with double_reduc?

> +    loop_vinfo->reusable_accumulators.put (scalar_results[0],
> +                                          { orig_reduc_input, reduc_info });
> +
>    if (double_reduc)
>      loop = outer_loop;
>
> @@ -5886,6 +6031,17 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>          {
>            /* Replace the uses:  */
>            orig_name = PHI_RESULT (exit_phi);
> +
> +         /* Look for a single use at the target of the skip edge.  */
> +         if (unify_with_main_loop_p)
> +           {
> +             use_operand_p use_p;
> +             gimple *user;
> +             if (!single_imm_use (orig_name, &use_p, &user))
> +               gcc_unreachable ();
> +             orig_name = gimple_get_lhs (user);
> +           }
> +
>            scalar_result = scalar_results[k];
>            FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
>             {
> @@ -7421,16 +7577,32 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
>        else
>         {
>           gcc_assert (slp_node == slp_node_instance->reduc_phis);
> -         tree initial_value = NULL_TREE;
> +         vec<tree> &initial_values = reduc_info->reduc_initial_values;
> +         vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
> +
> +         unsigned int num_phis = stmts.length ();
>           if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
> -           initial_value = vect_phi_initial_value (phi);
> -         tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
> -         tree neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
> -                                                     code, initial_value);
> -         get_initial_defs_for_reduction (loop_vinfo, reduc_info,
> -                                         slp_node_instance->reduc_phis,
> -                                         &vec_initial_defs, vec_num,
> -                                         initial_value != NULL, neutral_op);
> +           num_phis = 1;
> +         initial_values.reserve (num_phis);
> +         for (unsigned int i = 0; i < num_phis; ++i)
> +           {
> +             gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
> +             initial_values.quick_push (vect_phi_initial_value (this_phi));
> +           }
> +         if (vec_num == 1)
> +           vect_find_reusable_accumulator (loop_vinfo, reduc_info);
> +         if (!initial_values.is_empty ())
> +           {
> +             tree initial_value
> +               = (num_phis == 1 ? initial_values[0] : NULL_TREE);
> +             tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
> +             tree neutral_op
> +               = neutral_op_for_reduction (TREE_TYPE (vectype_out),
> +                                           code, initial_value);
> +             get_initial_defs_for_reduction (loop_vinfo, reduc_info,
> +                                             &vec_initial_defs, vec_num,
> +                                             stmts.length (), neutral_op);
> +           }
>         }
>      }
>    else
> @@ -7438,6 +7610,7 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
>        /* Get at the scalar def before the loop, that defines the initial
>          value of the reduction variable.  */
>        tree initial_def = vect_phi_initial_value (phi);
> +      reduc_info->reduc_initial_values.safe_push (initial_def);
>        /* Optimize: if initial_def is for REDUC_MAX smaller than the base
>          and we can't use zero for induc_val, use initial_def.  Similarly
>          for REDUC_MIN and initial_def larger than the base.  */
> @@ -7474,21 +7647,30 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
>                                            initial_def, initial_def);
>        else
>         {
> -         enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
> -         tree neutral_op = neutral_op_for_reduction (TREE_TYPE (initial_def),
> -                                                     code, initial_def);
> -         gcc_assert (neutral_op);
> -         /* Try to simplify the vector initialization by applying an
> -            adjustment after the reduction has been performed.  */
> -         if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
> -             && !operand_equal_p (neutral_op, initial_def))
> +         if (ncopies == 1)
> +           vect_find_reusable_accumulator (loop_vinfo, reduc_info);
> +         if (!reduc_info->reduc_initial_values.is_empty ())
>             {
> -             STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = initial_def;
> -             initial_def = neutral_op;
> +             initial_def = reduc_info->reduc_initial_values[0];
> +             enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
> +             tree neutral_op
> +               = neutral_op_for_reduction (TREE_TYPE (initial_def),
> +                                           code, initial_def);
> +             gcc_assert (neutral_op);
> +             /* Try to simplify the vector initialization by applying an
> +                adjustment after the reduction has been performed.  */
> +             if (!reduc_info->reused_accumulator
> +                 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
> +                 && !operand_equal_p (neutral_op, initial_def))
> +               {
> +                 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
> +                   = initial_def;
> +                 initial_def = neutral_op;
> +               }
> +             vec_initial_def
> +               = get_initial_def_for_reduction (loop_vinfo, reduc_info,
> +                                                initial_def, neutral_op);
>             }
> -         vec_initial_def
> -           = get_initial_def_for_reduction (loop_vinfo, reduc_info,
> -                                            initial_def, neutral_op);
>         }
>      }
>
> @@ -7499,6 +7681,17 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
>         vec_initial_defs.quick_push (vec_initial_def);
>      }
>
> +  if (auto *accumulator = reduc_info->reused_accumulator)
> +    {
> +      if (loop_vinfo->main_loop_edge)
> +       vec_initial_defs[0]
> +         = vect_get_main_loop_result (loop_vinfo, accumulator->reduc_input,
> +                                      vec_initial_defs[0]);
> +      else
> +       vec_initial_defs.safe_push (accumulator->reduc_input);
> +      gcc_assert (vec_initial_defs.length () == 1);
> +    }
> +
>    /* Generate the reduction PHIs upfront.  */
>    for (i = 0; i < vec_num; i++)
>      {
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c
> new file mode 100644
> index 00000000000..fb817b73d77
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c
> @@ -0,0 +1,77 @@
> +/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
> +
> +unsigned short __attribute__((noipa))
> +add_loop (unsigned short *x, int n)
> +{
> +  unsigned short res = 0;
> +  for (int i = 0; i < n; ++i)
> +    res += x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +min_loop (unsigned short *x, int n)
> +{
> +  unsigned short res = ~0;
> +  for (int i = 0; i < n; ++i)
> +    res = res < x[i] ? res : x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +max_loop (unsigned short *x, int n)
> +{
> +  unsigned short res = 0;
> +  for (int i = 0; i < n; ++i)
> +    res = res > x[i] ? res : x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +and_loop (unsigned short *x, int n)
> +{
> +  unsigned short res = ~0;
> +  for (int i = 0; i < n; ++i)
> +    res &= x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +or_loop (unsigned short *x, int n)
> +{
> +  unsigned short res = 0;
> +  for (int i = 0; i < n; ++i)
> +    res |= x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +eor_loop (unsigned short *x, int n)
> +{
> +  unsigned short res = 0;
> +  for (int i = 0; i < n; ++i)
> +    res ^= x[i];
> +  return res;
> +}
> +
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\torv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c
> new file mode 100644
> index 00000000000..1dd579be701
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c
> @@ -0,0 +1,49 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
> +
> +#define N 0x1100
> +
> +#include "reduc_10.c"
> +
> +int
> +main (void)
> +{
> +  unsigned short x[N];
> +  for (int i = 0; i < N; ++i)
> +    x[i] = (i + 1) * (i + 2);
> +
> +  if (add_loop (x, 0) != 0
> +      || add_loop (x, 11) != 572
> +      || add_loop (x, 0x100) != 22016
> +      || add_loop (x, 0xfff) != 20480
> +      || max_loop (x, 0) != 0
> +      || max_loop (x, 11) != 132
> +      || max_loop (x, 0x100) != 65280
> +      || max_loop (x, 0xfff) != 65504
> +      || or_loop (x, 0) != 0
> +      || or_loop (x, 11) != 0xfe
> +      || or_loop (x, 0x80) != 0x7ffe
> +      || or_loop (x, 0xb4) != 0x7ffe
> +      || or_loop (x, 0xb5) != 0xfffe
> +      || eor_loop (x, 0) != 0
> +      || eor_loop (x, 11) != 0xe8
> +      || eor_loop (x, 0x100) != 0xcf00
> +      || eor_loop (x, 0xfff) != 0xa000)
> +    __builtin_abort ();
> +
> +  for (int i = 0; i < N; ++i)
> +    x[i] = ~x[i];
> +
> +  if (min_loop (x, 0) != 65535
> +      || min_loop (x, 11) != 65403
> +      || min_loop (x, 0x100) != 255
> +      || min_loop (x, 0xfff) != 31
> +      || and_loop (x, 0) != 0xffff
> +      || and_loop (x, 11) != 0xff01
> +      || and_loop (x, 0x80) != 0x8001
> +      || and_loop (x, 0xb4) != 0x8001
> +      || and_loop (x, 0xb5) != 1)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c
> new file mode 100644
> index 00000000000..f99ef4aa865
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c
> @@ -0,0 +1,71 @@
> +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
> +
> +unsigned short __attribute__((noipa))
> +add_loop (unsigned short *x, unsigned short res)
> +{
> +  for (int i = 0; i < 0xfff; ++i)
> +    res += x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +min_loop (unsigned short *x, unsigned short res)
> +{
> +  for (int i = 0; i < 0xfff; ++i)
> +    res = res < x[i] ? res : x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +max_loop (unsigned short *x, unsigned short res)
> +{
> +  for (int i = 0; i < 0xfff; ++i)
> +    res = res > x[i] ? res : x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +and_loop (unsigned short *x, unsigned short res)
> +{
> +  for (int i = 0; i < 0xfff; ++i)
> +    res &= x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +or_loop (unsigned short *x, unsigned short res)
> +{
> +  for (int i = 0; i < 0xfff; ++i)
> +    res |= x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +eor_loop (unsigned short *x, unsigned short res)
> +{
> +  for (int i = 0; i < 0xfff; ++i)
> +    res ^= x[i];
> +  return res;
> +}
> +
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\torv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c
> new file mode 100644
> index 00000000000..5b41560d2ef
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c
> @@ -0,0 +1,34 @@
> +/* { dg-do run { target aarch64_sve256_hw } } */
> +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
> +
> +#define N 0x1100
> +
> +#include "reduc_11.c"
> +
> +int
> +main (void)
> +{
> +  unsigned short x[N];
> +  for (int i = 0; i < N; ++i)
> +    x[i] = (i + 1) * (i + 2);
> +
> +  if (add_loop (x, 42) != 20522
> +      || max_loop (x, 65503) != 65504
> +      || max_loop (x, 65505) != 65505
> +      || or_loop (x, 0) != 0xfffe
> +      || or_loop (x, 1) != 0xffff
> +      || eor_loop (x, 0) != 0xa000
> +      || eor_loop (x, 0xbfff) != 0x1fff)
> +    __builtin_abort ();
> +
> +  for (int i = 0; i < N; ++i)
> +    x[i] = ~x[i];
> +
> +  if (min_loop (x, 32) != 31
> +      || min_loop (x, 30) != 30
> +      || and_loop (x, 0xff) != 1
> +      || and_loop (x, 0) != 0)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c
> new file mode 100644
> index 00000000000..d32b81a61bc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c
> @@ -0,0 +1,71 @@
> +/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
> +
> +unsigned short __attribute__((noipa))
> +add_loop (unsigned short *x, int n, unsigned short res)
> +{
> +  for (int i = 0; i < n; ++i)
> +    res += x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +min_loop (unsigned short *x, int n, unsigned short res)
> +{
> +  for (int i = 0; i < n; ++i)
> +    res = res < x[i] ? res : x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +max_loop (unsigned short *x, int n, unsigned short res)
> +{
> +  for (int i = 0; i < n; ++i)
> +    res = res > x[i] ? res : x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +and_loop (unsigned short *x, int n, unsigned short res)
> +{
> +  for (int i = 0; i < n; ++i)
> +    res &= x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +or_loop (unsigned short *x, int n, unsigned short res)
> +{
> +  for (int i = 0; i < n; ++i)
> +    res |= x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +eor_loop (unsigned short *x, int n, unsigned short res)
> +{
> +  for (int i = 0; i < n; ++i)
> +    res ^= x[i];
> +  return res;
> +}
> +
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\torv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c
> new file mode 100644
> index 00000000000..929b81a9705
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c
> @@ -0,0 +1,66 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
> +
> +#define N 0x1100
> +
> +#include "reduc_12.c"
> +
> +int
> +main (void)
> +{
> +  unsigned short x[N];
> +  for (int i = 0; i < N; ++i)
> +    x[i] = (i + 1) * (i + 2);
> +
> +  if (add_loop (x, 0, 10) != 10
> +      || add_loop (x, 11, 42) != 614
> +      || add_loop (x, 0x100, 84) != 22100
> +      || add_loop (x, 0xfff, 20) != 20500
> +      || max_loop (x, 0, 10) != 10
> +      || max_loop (x, 11, 131) != 132
> +      || max_loop (x, 11, 133) != 133
> +      || max_loop (x, 0x100, 65279) != 65280
> +      || max_loop (x, 0x100, 65281) != 65281
> +      || max_loop (x, 0xfff, 65503) != 65504
> +      || max_loop (x, 0xfff, 65505) != 65505
> +      || or_loop (x, 0, 0x71) != 0x71
> +      || or_loop (x, 11, 0) != 0xfe
> +      || or_loop (x, 11, 0xb3c) != 0xbfe
> +      || or_loop (x, 0x80, 0) != 0x7ffe
> +      || or_loop (x, 0x80, 1) != 0x7fff
> +      || or_loop (x, 0xb4, 0) != 0x7ffe
> +      || or_loop (x, 0xb4, 1) != 0x7fff
> +      || or_loop (x, 0xb5, 0) != 0xfffe
> +      || or_loop (x, 0xb5, 1) != 0xffff
> +      || eor_loop (x, 0, 0x3e) != 0x3e
> +      || eor_loop (x, 11, 0) != 0xe8
> +      || eor_loop (x, 11, 0x1ff) != 0x117
> +      || eor_loop (x, 0x100, 0) != 0xcf00
> +      || eor_loop (x, 0x100, 0xeee) != 0xc1ee
> +      || eor_loop (x, 0xfff, 0) != 0xa000
> +      || eor_loop (x, 0xfff, 0x8888) != 0x2888)
> +    __builtin_abort ();
> +
> +  for (int i = 0; i < N; ++i)
> +    x[i] = ~x[i];
> +
> +  if (min_loop (x, 0, 10000) != 10000
> +      || min_loop (x, 11, 65404) != 65403
> +      || min_loop (x, 11, 65402) != 65402
> +      || min_loop (x, 0x100, 256) != 255
> +      || min_loop (x, 0x100, 254) != 254
> +      || min_loop (x, 0xfff, 32) != 31
> +      || min_loop (x, 0xfff, 30) != 30
> +      || and_loop (x, 0, 0x1234) != 0x1234
> +      || and_loop (x, 11, 0xffff) != 0xff01
> +      || and_loop (x, 11, 0xcdef) != 0xcd01
> +      || and_loop (x, 0x80, 0xffff) != 0x8001
> +      || and_loop (x, 0x80, 0xfffe) != 0x8000
> +      || and_loop (x, 0xb4, 0xffff) != 0x8001
> +      || and_loop (x, 0xb4, 0xfffe) != 0x8000
> +      || and_loop (x, 0xb5, 0xffff) != 1
> +      || and_loop (x, 0xb5, 0xfffe) != 0)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c
> new file mode 100644
> index 00000000000..ce2b8f2fcdc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c
> @@ -0,0 +1,101 @@
> +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
> +
> +void __attribute__((noipa))
> +add_loop (unsigned int *x, unsigned int *res)
> +{
> +  unsigned int res0 = res[0];
> +  unsigned int res1 = res[1];
> +  for (int i = 0; i < 0x7ff; ++i)
> +    {
> +      res0 += x[i * 2];
> +      res1 += x[i * 2 + 1];
> +    }
> +  res[0] = res0;
> +  res[1] = res1;
> +}
> +
> +void __attribute__((noipa))
> +min_loop (unsigned int *x, unsigned int *res)
> +{
> +  unsigned int res0 = res[0];
> +  unsigned int res1 = res[1];
> +  for (int i = 0; i < 0x7ff; ++i)
> +    {
> +      res0 = res0 < x[i * 2] ? res0 : x[i * 2];
> +      res1 = res1 < x[i * 2 + 1] ? res1 : x[i * 2 + 1];
> +    }
> +  res[0] = res0;
> +  res[1] = res1;
> +}
> +
> +void __attribute__((noipa))
> +max_loop (unsigned int *x, unsigned int *res)
> +{
> +  unsigned int res0 = res[0];
> +  unsigned int res1 = res[1];
> +  for (int i = 0; i < 0x7ff; ++i)
> +    {
> +      res0 = res0 > x[i * 2] ? res0 : x[i * 2];
> +      res1 = res1 > x[i * 2 + 1] ? res1 : x[i * 2 + 1];
> +    }
> +  res[0] = res0;
> +  res[1] = res1;
> +}
> +
> +void __attribute__((noipa))
> +and_loop (unsigned int *x, unsigned int *res)
> +{
> +  unsigned int res0 = res[0];
> +  unsigned int res1 = res[1];
> +  for (int i = 0; i < 0x7ff; ++i)
> +    {
> +      res0 &= x[i * 2];
> +      res1 &= x[i * 2 + 1];
> +    }
> +  res[0] = res0;
> +  res[1] = res1;
> +}
> +
> +void __attribute__((noipa))
> +or_loop (unsigned int *x, unsigned int *res)
> +{
> +  unsigned int res0 = res[0];
> +  unsigned int res1 = res[1];
> +  for (int i = 0; i < 0x7ff; ++i)
> +    {
> +      res0 |= x[i * 2];
> +      res1 |= x[i * 2 + 1];
> +    }
> +  res[0] = res0;
> +  res[1] = res1;
> +}
> +
> +void __attribute__((noipa))
> +eor_loop (unsigned int *x, unsigned int *res)
> +{
> +  unsigned int res0 = res[0];
> +  unsigned int res1 = res[1];
> +  for (int i = 0; i < 0x7ff; ++i)
> +    {
> +      res0 ^= x[i * 2];
> +      res1 ^= x[i * 2 + 1];
> +    }
> +  res[0] = res0;
> +  res[1] = res1;
> +}
> +
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
> +
> +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
> +
> +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c
> new file mode 100644
> index 00000000000..5514d8d6b3b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c
> @@ -0,0 +1,61 @@
> +/* { dg-do run { target aarch64_sve256_hw } } */
> +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
> +
> +#define N 0x1100
> +
> +#include "reduc_13.c"
> +
> +int
> +main (void)
> +{
> +  unsigned int x[N];
> +  for (int i = 0; i < N; ++i)
> +    x[i] = ((i + 1) * (i + 2)) & 0xfffff;
> +
> +  unsigned int add_res[2] = { 42, 1111 };
> +  add_loop (x, add_res);
> +  if (add_res[0] != 968538154
> +      || add_res[1] != 964340823)
> +    __builtin_abort ();
> +
> +  unsigned int max_res1[2] = { 0, 0 };
> +  max_loop (x, max_res1);
> +  if (max_res1[0] != 1048150
> +      || max_res1[1] != 1045506)
> +    __builtin_abort ();
> +
> +  unsigned int max_res2[2] = { 1048151, 1045507 };
> +  max_loop (x, max_res2);
> +  if (max_res2[0] != 1048151
> +      || max_res2[1] != 1045507)
> +    __builtin_abort ();
> +
> +  unsigned int or_res[2] = { 0x1000000, 0x2000000 };
> +  or_loop (x, or_res);
> +  if (or_res[0] != 0x10ffffe
> +      || or_res[1] != 0x20ffffe)
> +    __builtin_abort ();
> +
> +  unsigned int eor_res[2] = { 0x1000000, 0x2000000 };
> +  eor_loop (x, eor_res);
> +  if (eor_res[0] != 0x1010000
> +      || eor_res[1] != 0x20b5000)
> +    __builtin_abort ();
> +
> +  for (int i = 0; i < N; ++i)
> +    x[i] = ~x[i] & 0xfffff;
> +
> +  unsigned int min_res1[2] = { 500, 4000 };
> +  min_loop (x, min_res1);
> +  if (min_res1[0] != 425
> +      || min_res1[1] != 3069)
> +    __builtin_abort ();
> +
> +  unsigned int min_res2[2] = { 424, 3068 };
> +  min_loop (x, min_res2);
> +  if (min_res2[0] != 424
> +      || min_res2[1] != 3068)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c
> new file mode 100644
> index 00000000000..3be611e4b37
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c
> @@ -0,0 +1,107 @@
> +/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
> +
> +void __attribute__((noipa))
> +add_loop (unsigned int *x, int n, unsigned int *res)
> +{
> +  unsigned int res0 = res[0];
> +  unsigned int res1 = res[1];
> +  for (int i = 0; i < n; ++i)
> +    {
> +      res0 += x[i * 2];
> +      res1 += x[i * 2 + 1];
> +    }
> +  res[0] = res0;
> +  res[1] = res1;
> +}
> +
> +void __attribute__((noipa))
> +min_loop (unsigned int *x, int n, unsigned int *res)
> +{
> +  unsigned int res0 = res[0];
> +  unsigned int res1 = res[1];
> +  for (int i = 0; i < n; ++i)
> +    {
> +      res0 = res0 < x[i * 2] ? res0 : x[i * 2];
> +      res1 = res1 < x[i * 2 + 1] ? res1 : x[i * 2 + 1];
> +    }
> +  res[0] = res0;
> +  res[1] = res1;
> +}
> +
> +void __attribute__((noipa))
> +max_loop (unsigned int *x, int n, unsigned int *res)
> +{
> +  unsigned int res0 = res[0];
> +  unsigned int res1 = res[1];
> +  for (int i = 0; i < n; ++i)
> +    {
> +      res0 = res0 > x[i * 2] ? res0 : x[i * 2];
> +      res1 = res1 > x[i * 2 + 1] ? res1 : x[i * 2 + 1];
> +    }
> +  res[0] = res0;
> +  res[1] = res1;
> +}
> +
> +void __attribute__((noipa))
> +and_loop (unsigned int *x, int n, unsigned int *res)
> +{
> +  unsigned int res0 = res[0];
> +  unsigned int res1 = res[1];
> +  for (int i = 0; i < n; ++i)
> +    {
> +      res0 &= x[i * 2];
> +      res1 &= x[i * 2 + 1];
> +    }
> +  res[0] = res0;
> +  res[1] = res1;
> +}
> +
> +void __attribute__((noipa))
> +or_loop (unsigned int *x, int n, unsigned int *res)
> +{
> +  unsigned int res0 = res[0];
> +  unsigned int res1 = res[1];
> +  for (int i = 0; i < n; ++i)
> +    {
> +      res0 |= x[i * 2];
> +      res1 |= x[i * 2 + 1];
> +    }
> +  res[0] = res0;
> +  res[1] = res1;
> +}
> +
> +void __attribute__((noipa))
> +eor_loop (unsigned int *x, int n, unsigned int *res)
> +{
> +  unsigned int res0 = res[0];
> +  unsigned int res1 = res[1];
> +  for (int i = 0; i < n; ++i)
> +    {
> +      res0 ^= x[i * 2];
> +      res1 ^= x[i * 2 + 1];
> +    }
> +  res[0] = res0;
> +  res[1] = res1;
> +}
> +
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tuaddv\t} 2 } } */
> +
> +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tuminv\t} 2 } } */
> +
> +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tumaxv\t} 2 } } */
> +
> +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tandv\t} 2 } } */
> +
> +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\torv\t} 2 } } */
> +
> +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\teorv\t} 2 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c
> new file mode 100644
> index 00000000000..ccaa770e9b2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c
> @@ -0,0 +1,187 @@
> +/* { dg-do run { target aarch64_sve256_hw } } */
> +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
> +
> +#define N 0x1100
> +
> +#include "reduc_14.c"
> +
> +int
> +main (void)
> +{
> +  unsigned int x[N];
> +  for (int i = 0; i < N; ++i)
> +    x[i] = ((i + 1) * (i + 2)) & 0xfffff;
> +
> +  unsigned int add_res1[2] = { 11, 22 };
> +  add_loop (x, 0, add_res1);
> +  if (add_res1[0] != 11
> +      || add_res1[1] != 22)
> +    __builtin_abort ();
> +
> +  unsigned int add_res2[2] = { 10, 20 };
> +  add_loop (x, 11, add_res2);
> +  if (add_res2[0] != 1902
> +      || add_res2[1] != 2176)
> +    __builtin_abort ();
> +
> +  unsigned int add_res3[2] = { 15, 30 };
> +  add_loop (x, 0x100, add_res3);
> +  if (add_res3[0] != 22435087
> +      || add_res3[1] != 22566686)
> +    __builtin_abort ();
> +
> +  unsigned int add_res4[2] = { 100, 200 };
> +  add_loop (x, 0x11f, add_res4);
> +  if (add_res4[0] != 31602244
> +      || add_res4[1] != 31767656)
> +    __builtin_abort ();
> +
> +  unsigned int max_res1[2] = { 461, 500 };
> +  max_loop (x, 11, max_res1);
> +  if (max_res1[0] != 462
> +      || max_res1[1] != 506)
> +    __builtin_abort ();
> +
> +  unsigned int max_res2[2] = { 463, 507 };
> +  max_loop (x, 11, max_res2);
> +  if (max_res2[0] != 463
> +      || max_res2[1] != 507)
> +    __builtin_abort ();
> +
> +  unsigned int max_res3[2] = { 1000000, 1000000 };
> +  max_loop (x, 0x200, max_res3);
> +  if (max_res3[0] != 1047552
> +      || max_res3[1] != 1045506)
> +    __builtin_abort ();
> +
> +  unsigned int max_res4[2] = { 1047553, 1045507 };
> +  max_loop (x, 0x200, max_res4);
> +  if (max_res4[0] != 1047553
> +      || max_res4[1] != 1045507)
> +    __builtin_abort ();
> +
> +  unsigned int max_res5[2] = { 300000, 30000 };
> +  max_loop (x, 0x11f, max_res5);
> +  if (max_res5[0] != 328902
> +      || max_res5[1] != 330050)
> +    __builtin_abort ();
> +
> +  unsigned int max_res6[2] = { 328903, 330051 };
> +  max_loop (x, 0x11f, max_res6);
> +  if (max_res6[0] != 328903
> +      || max_res6[1] != 330051)
> +    __builtin_abort ();
> +
> +  unsigned int or_res1[2] = { 11, 22 };
> +  or_loop (x, 0, or_res1);
> +  if (or_res1[0] != 11
> +      || or_res1[1] != 22)
> +    __builtin_abort ();
> +
> +  unsigned int or_res2[2] = { 0x200000, 0xe00000 };
> +  or_loop (x, 11, or_res2);
> +  if (or_res2[0] != 0x2001fe
> +      || or_res2[1] != 0xe001fe)
> +    __builtin_abort ();
> +
> +  unsigned int or_res3[2] = { 0x800000, 0x700000 };
> +  or_loop (x, 0x40, or_res3);
> +  if (or_res3[0] != 0x803ffe
> +      || or_res3[1] != 0x707ffe)
> +    __builtin_abort ();
> +
> +  unsigned int or_res4[2] = { 0x100001, 0x300000 };
> +  or_loop (x, 0x4f, or_res4);
> +  if (or_res4[0] != 0x107fff
> +      || or_res4[1] != 0x307ffe)
> +    __builtin_abort ();
> +
> +  unsigned int eor_res1[2] = { 11, 22 };
> +  eor_loop (x, 0, eor_res1);
> +  if (eor_res1[0] != 11
> +      || eor_res1[1] != 22)
> +    __builtin_abort ();
> +
> +  unsigned int eor_res2[2] = { 0x2000ff, 0xe000ff };
> +  eor_loop (x, 11, eor_res2);
> +  if (eor_res2[0] != 0x2001cf
> +      || eor_res2[1] != 0xe000b7)
> +    __builtin_abort ();
> +
> +  unsigned int eor_res3[2] = { 0x805000, 0x70f000 };
> +  eor_loop (x, 0x100, eor_res3);
> +  if (eor_res3[0] != 0x824200
> +      || eor_res3[1] != 0x77dc00)
> +    __builtin_abort ();
> +
> +  unsigned int eor_res4[2] = { 0x101201, 0x300f00 };
> +  eor_loop (x, 0x11f, eor_res4);
> +  if (eor_res4[0] != 0x178801
> +      || eor_res4[1] != 0x337240)
> +    __builtin_abort ();
> +
> +  for (int i = 0; i < N; ++i)
> +    x[i] = ~x[i] & 0xfffff;
> +
> +  unsigned int min_res1[2] = { 1048200, 1048100 };
> +  min_loop (x, 11, min_res1);
> +  if (min_res1[0] != 1048113
> +      || min_res1[1] != 1048069)
> +    __builtin_abort ();
> +
> +  unsigned int min_res2[2] = { 1048112, 1048068 };
> +  min_loop (x, 11, min_res2);
> +  if (min_res2[0] != 1048112
> +      || min_res2[1] != 1048068)
> +    __builtin_abort ();
> +
> +  unsigned int min_res3[2] = { 10000, 10000 };
> +  min_loop (x, 0x200, min_res3);
> +  if (min_res3[0] != 1023
> +      || min_res3[1] != 3069)
> +    __builtin_abort ();
> +
> +  unsigned int min_res4[2] = { 1022, 3068 };
> +  min_loop (x, 0x200, min_res4);
> +  if (min_res4[0] != 1022
> +      || min_res4[1] != 3068)
> +    __builtin_abort ();
> +
> +  unsigned int min_res5[2] = { 719680, 718530 };
> +  min_loop (x, 0x11f, min_res5);
> +  if (min_res5[0] != 719673
> +      || min_res5[1] != 718525)
> +    __builtin_abort ();
> +
> +  unsigned int min_res6[2] = { 719672, 718524 };
> +  min_loop (x, 0x11f, min_res6);
> +  if (min_res6[0] != 719672
> +      || min_res6[1] != 718524)
> +    __builtin_abort ();
> +
> +  unsigned int and_res1[2] = { 11, 22 };
> +  and_loop (x, 0, and_res1);
> +  if (and_res1[0] != 11
> +      || and_res1[1] != 22)
> +    __builtin_abort ();
> +
> +  unsigned int and_res2[2] = { 0xf5cff, 0xf78ff };
> +  and_loop (x, 11, and_res2);
> +  if (and_res2[0] != 0xf5c01
> +      || and_res2[1] != 0xf7801)
> +    __builtin_abort ();
> +
> +  unsigned int and_res3[2] = { 0x7efff, 0xecfff };
> +  and_loop (x, 0x40, and_res3);
> +  if (and_res3[0] != 0x7c001
> +      || and_res3[1] != 0xe8001)
> +    __builtin_abort ();
> +
> +  unsigned int and_res4[2] = { 0xffffff, 0xffffff };
> +  and_loop (x, 0x4f, and_res4);
> +  if (and_res4[0] != 0xf8001
> +      || and_res4[1] != 0xf8001)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c
> new file mode 100644
> index 00000000000..15b1ade30e2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c
> @@ -0,0 +1,16 @@
> +/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
> +
> +int __attribute__((noipa))
> +add_loop (int *x, int n, int res)
> +{
> +  for (int i = 0; i < n; ++i)
> +    {
> +      res += x[i * 2];
> +      res += x[i * 2 + 1];
> +    }
> +  return res;
> +}
> +
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c
> new file mode 100644
> index 00000000000..3207fce5be3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c
> @@ -0,0 +1,22 @@
> +/* { dg-do run { target aarch64_sve256_hw } } */
> +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
> +
> +#define N 0x1100
> +
> +#include "reduc_15.c"
> +
> +int
> +main (void)
> +{
> +  int x[N];
> +  for (int i = 0; i < N; ++i)
> +    x[i] = ((i + 1) * (i + 2)) & 0xfffff;
> +
> +  if (add_loop (x, 0, 33) != 33
> +      || add_loop (x, 11, 30) != 4078
> +      || add_loop (x, 0x100, 45) != 45001773
> +      || add_loop (x, 0x11f, 300) != 63369900)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c
> new file mode 100644
> index 00000000000..b839821d6bb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c
> @@ -0,0 +1,77 @@
> +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
> +
> +unsigned short __attribute__((noipa))
> +add_loop (unsigned short *x)
> +{
> +  unsigned short res = 0;
> +  for (int i = 0; i < 0xfff; ++i)
> +    res += x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +min_loop (unsigned short *x)
> +{
> +  unsigned short res = ~0;
> +  for (int i = 0; i < 0xfff; ++i)
> +    res = res < x[i] ? res : x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +max_loop (unsigned short *x)
> +{
> +  unsigned short res = 0;
> +  for (int i = 0; i < 0xfff; ++i)
> +    res = res > x[i] ? res : x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +and_loop (unsigned short *x)
> +{
> +  unsigned short res = ~0;
> +  for (int i = 0; i < 0xfff; ++i)
> +    res &= x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +or_loop (unsigned short *x)
> +{
> +  unsigned short res = 0;
> +  for (int i = 0; i < 0xfff; ++i)
> +    res |= x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +eor_loop (unsigned short *x)
> +{
> +  unsigned short res = 0;
> +  for (int i = 0; i < 0xfff; ++i)
> +    res ^= x[i];
> +  return res;
> +}
> +
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\torv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c
> new file mode 100644
> index 00000000000..aa248f53eaa
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c
> @@ -0,0 +1,29 @@
> +/* { dg-do run { target aarch64_sve256_hw } } */
> +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
> +
> +#define N 0x1100
> +
> +#include "reduc_9.c"
> +
> +int
> +main (void)
> +{
> +  unsigned short x[N];
> +  for (int i = 0; i < N; ++i)
> +    x[i] = (i + 1) * (i + 2);
> +
> +  if (add_loop (x) != 20480
> +      || max_loop (x) != 65504
> +      || or_loop (x) != 0xfffe
> +      || eor_loop (x) != 0xa000)
> +    __builtin_abort ();
> +
> +  for (int i = 0; i < N; ++i)
> +    x[i] = ~x[i];
> +
> +  if (min_loop (x) != 31
> +      || and_loop (x) != 1)
> +    __builtin_abort ();
> +
> +  return 0;
> +}

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 10/10] vect: Reuse reduction accumulators between loops
  2021-07-09 11:58   ` Richard Biener
@ 2021-07-09 13:12     ` Richard Sandiford
  2021-07-12  6:32       ` Richard Biener
  0 siblings, 1 reply; 30+ messages in thread
From: Richard Sandiford @ 2021-07-09 13:12 UTC (permalink / raw)
  To: Richard Biener; +Cc: GCC Patches

Thanks for the review.

Richard Biener <richard.guenther@gmail.com> writes:
>> @@ -588,6 +600,23 @@ public:
>>    /* Unrolling factor  */
>>    poly_uint64 vectorization_factor;
>>
>> +  /* If this loop is an epilogue loop whose main loop can be skipped,
>> +     MAIN_LOOP_EDGE is the edge from the main loop to this loop's
>> +     preheader.  SKIP_MAIN_LOOP_EDGE is then the edge that skips the
>> +     main loop and goes straight to this loop's preheader.
>> +
>> +     Both fields are null otherwise.  */
>> +  edge main_loop_edge;
>> +  edge skip_main_loop_edge;
>> +
>> +  /* If this loop is an epilogue loop that might be skipped after executing
>> +     the main loop, this edge is the one that skips the epilogue.  */
>> +  edge skip_this_loop_edge;
>> +
>> +  /* After vectorization, maps live-out SSA names to information about
>> +     the reductions that generated them.  */
>> +  hash_map<tree, vect_reusable_accumulator> reusable_accumulators;
>
> Is that the LC PHI node defs or the definition inside of the loop?
> If the latter we could attach the info directly to its stmt-info?

Ah, yeah, I should improve the comment there.  It's the vectoriser's
replacement for the original LC PHI node, i.e. the final scalar result
after the reduction has taken place.

>> @@ -1186,6 +1215,21 @@ public:
>>    /* The vector type for performing the actual reduction.  */
>>    tree reduc_vectype;
>>
>> +  /* If IS_REDUC_INFO is true and if the reduction is operating on N
>> +     elements in parallel, this vector gives the initial values of these
>> +     N elements.  */
>
> That's N scalar elements or N vector elements?  I suppose it's for
> SLP reductions (rather than SLP reduction chains) and never non-SLP
> reductions?

Yeah, poor wording again, sorry.  I meant something closer to:

  /* If IS_REDUC_INFO is true and if the vector code is performing
     N scalar reductions in parallel, this vector gives the initial
     scalar values of those N reductions.  */

>> +  vec<tree> reduc_initial_values;
>> +
>> +  /* If IS_REDUC_INFO is true and if the reduction is operating on N
>> +     elements in parallel, this vector gives the scalar result of each
>> +     reduction.  */
>> +  vec<tree> reduc_scalar_results;

Same change here.

>> […]
>> diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
>> index 2909e8a0fc3..b7b0523e3c8 100644
>> --- a/gcc/tree-vect-loop-manip.c
>> +++ b/gcc/tree-vect-loop-manip.c
>> @@ -2457,6 +2457,31 @@ vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
>>    return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
>>  }
>>
>> +/* LOOP_VINFO is an epilogue loop and MAIN_LOOP_VALUE is available on exit
>> +   from the corresponding main loop.  Return a value that is available in
>> +   LOOP_VINFO's preheader, using SKIP_VALUE if the main loop is skipped.
>> +   Passing a null SKIP_VALUE is equivalent to passing zero.  */
>> +
>> +tree
>> +vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
>> +                          tree skip_value)
>> +{
>> +  if (!loop_vinfo->main_loop_edge)
>> +    return main_loop_value;
>> +
>> +  if (!skip_value)
>> +    skip_value = build_zero_cst (TREE_TYPE (main_loop_value));
>
> shouldn't that be the initial value?

For the current use case, neither of the two conditions above
(!loop_vinfo->main_loop_edge and !skip_value) is ever true.
I wrote it like this because I had a follow-on patch (which might
not go anywhere) that needed this function for 0-based IVs.

Maybe that's a bad risk/reward trade-off though.  Not having to pass
zero makes things only slightly simpler for the follow-on patch,
and I guess could be dangerous in other cases.

Perhaps in that case though I should change the
!loop_vinfo->main_loop_edge check into a gcc_assert as well.

>> +  tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
>> +  basic_block bb = loop_vinfo->main_loop_edge->dest;
>> +  gphi *new_phi = create_phi_node (phi_result, bb);
>> +  add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
>> +              UNKNOWN_LOCATION);
>> +  add_phi_arg (new_phi, skip_value,
>> +              loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
>> +  return phi_result;
>> +}
>> +
>>  /* Function vect_do_peeling.
>>
>>     Input:
>> […]
>> @@ -4823,6 +4842,100 @@ info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
>>    return stmt_info;
>>  }
>>
>> +/* PHI is a reduction in LOOP_VINFO that we are going to vectorize using vector
>> +   type VECTYPE.  See if LOOP_VINFO is an epilogue loop whose main loop had a
>> +   matching reduction that we can build on.  Adjust REDUC_INFO and return true
>> +   if so, otherwise return false.  */
>> +
>> +static bool
>> +vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
>> +                               stmt_vec_info reduc_info)
>> +{
>> +  loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
>> +  if (!main_loop_vinfo)
>> +    return false;
>> +
>> +  if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
>> +    return false;
>> +
>> +  unsigned int num_phis = reduc_info->reduc_initial_values.length ();
>> +  auto_vec<tree, 16> main_loop_results (num_phis);
>> +  auto_vec<tree, 16> initial_values (num_phis);
>> +  if (edge main_loop_edge = loop_vinfo->main_loop_edge)
>> +    {
>> +      /* The epilogue loop can be entered either from the main loop or
>> +        from an earlier guard block.  */
>> +      edge skip_edge = loop_vinfo->skip_main_loop_edge;
>> +      for (tree incoming_value : reduc_info->reduc_initial_values)
>> +       {
>> +         /* Look for:
>> +
>> +              INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
>> +                                   INITIAL_VALUE(guard block)>.  */
>> +         gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
>> +
>> +         gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
>> +         gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
>> +
>> +         tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
>> +         tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
>> +
>> +         main_loop_results.quick_push (from_main_loop);
>> +         initial_values.quick_push (from_skip);
>> +       }
>> +    }
>> +  else
>> +    /* The main loop dominates the epilogue loop.  */
>> +    main_loop_results.splice (reduc_info->reduc_initial_values);
>> +
>> +  /* See if the main loop has the kind of accumulator we need.  */
>> +  vect_reusable_accumulator *accumulator
>> +    = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
>> +  if (!accumulator
>> +      || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
>> +      || !std::equal (main_loop_results.begin (), main_loop_results.end (),
>> +                     accumulator->reduc_info->reduc_scalar_results.begin ()))
>> +    return false;
>> +
>> +  /* For now, only handle the case in which both loops are operating on the
>> +     same vector types.  In future we could reduce wider vectors to narrower
>> +     ones as well.  */
>> +  tree vectype = STMT_VINFO_VECTYPE (reduc_info);
>> +  tree old_vectype = TREE_TYPE (accumulator->reduc_input);
>> +  if (!useless_type_conversion_p (old_vectype, vectype))
>
> It should be indeed quite trivial to handle, likewise the case where we
> have multiple PHIs - just reduce to a single input vector and have the
> possibly multiple input vectors in the epilogue filled with neutral
> elements.  I'll see if I can cook up stuff for this next week.

Yeah, agreed.  The multi-vector epilogue case should be especially easy
to handle, but it's not interesting for SVE as things stand, since:

(a) non-SLP reductions use a single cycle for ncopies>1 (a misfeature
    IMO -- on targets with wide pipelines we want exactly the opposite)

(b) SLP reductions are limited to single vectors for variable-length targets.

So it wasn't possible to trigger multiple epilogue vectors for the
motivating SVE use case.

>> […]
>> @@ -5196,6 +5305,37 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>>        reduc_inputs.safe_push (single_input);
>>      }
>>
>> +  tree orig_reduc_input = reduc_inputs[0];
>> +
>> +  /* If this loop is an epilogue loop that can be skipped after the
>> +     main loop, we can only share a reduction operation between the
>> +     main loop and the epilogue if we put it at the target of the
>> +     skip edge.
>
> Do you have a testcase where we cannot do this?

No, it's being defensive.  I wasn't sure how the epilogue code would
evolve in future.

>> +     We can still reuse accumulators if this check fails.  Doing so has
>> +     the minor(?) benefit of making the epilogue loop's scalar result
>> +     independent of the main loop's scalar result.  */
>> +  bool unify_with_main_loop_p = false;
>> +  if (reduc_info->reused_accumulator
>> +      && loop_vinfo->skip_this_loop_edge
>> +      && single_succ_p (exit_bb)
>> +      && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
>> +    {
>> +      unify_with_main_loop_p = true;
>> +
>> +      basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
>> +      reduc_inputs[0] = make_ssa_name (vectype);
>> +      gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
>> +      add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
>> +                  UNKNOWN_LOCATION);
>> +      add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
>> +                  loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
>> +      exit_gsi = gsi_after_labels (reduc_block);
>> +    }
>> +
>> +  /* Shouldn't be used beyond this point.  */
>> +  exit_bb = nullptr;
>> +
>>    if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
>>        && reduc_fn != IFN_LAST)
>>      {
>> @@ -5819,6 +5958,12 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>>        scalar_results[0] = new_temp;
>>      }
>>
>> +  /* Record this operation if it could be reused by the epilogue loop.  */
>> +  if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
>> +      && !double_reduc)
>
> what's the issue with double_reduc?

Probably nothing TBH.  I haven't been able to construct a case that
uses predicated double reductions with vect-partial-vector-usage=1,
but that's probably a missed optimisation.

There again, double reductions themselves seem to be hard to trigger
now that we have loop interchange.  Is there a good way of testing
them without -fno-loop-interchange?

Thanks,
Richard

>> +    loop_vinfo->reusable_accumulators.put (scalar_results[0],
>> +                                          { orig_reduc_input, reduc_info });
>> +
>>    if (double_reduc)
>>      loop = outer_loop;
>>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 10/10] vect: Reuse reduction accumulators between loops
  2021-07-09 13:12     ` Richard Sandiford
@ 2021-07-12  6:32       ` Richard Biener
  2021-07-12 17:55         ` Richard Sandiford
  0 siblings, 1 reply; 30+ messages in thread
From: Richard Biener @ 2021-07-12  6:32 UTC (permalink / raw)
  To: Richard Biener, GCC Patches, Richard Sandiford

On Fri, Jul 9, 2021 at 3:12 PM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Thanks for the review.
>
> Richard Biener <richard.guenther@gmail.com> writes:
> >> @@ -588,6 +600,23 @@ public:
> >>    /* Unrolling factor  */
> >>    poly_uint64 vectorization_factor;
> >>
> >> +  /* If this loop is an epilogue loop whose main loop can be skipped,
> >> +     MAIN_LOOP_EDGE is the edge from the main loop to this loop's
> >> +     preheader.  SKIP_MAIN_LOOP_EDGE is then the edge that skips the
> >> +     main loop and goes straight to this loop's preheader.
> >> +
> >> +     Both fields are null otherwise.  */
> >> +  edge main_loop_edge;
> >> +  edge skip_main_loop_edge;
> >> +
> >> +  /* If this loop is an epilogue loop that might be skipped after executing
> >> +     the main loop, this edge is the one that skips the epilogue.  */
> >> +  edge skip_this_loop_edge;
> >> +
> >> +  /* After vectorization, maps live-out SSA names to information about
> >> +     the reductions that generated them.  */
> >> +  hash_map<tree, vect_reusable_accumulator> reusable_accumulators;
> >
> > Is that the LC PHI node defs or the definition inside of the loop?
> > If the latter we could attach the info directly to its stmt-info?
>
> Ah, yeah, I should improve the comment there.  It's the vectoriser's
> replacement for the original LC PHI node, i.e. the final scalar result
> after the reduction has taken place.

OK

> >> @@ -1186,6 +1215,21 @@ public:
> >>    /* The vector type for performing the actual reduction.  */
> >>    tree reduc_vectype;
> >>
> >> +  /* If IS_REDUC_INFO is true and if the reduction is operating on N
> >> +     elements in parallel, this vector gives the initial values of these
> >> +     N elements.  */
> >
> > That's N scalar elements or N vector elements?  I suppose it's for
> > SLP reductions (rather than SLP reduction chains) and never non-SLP
> > reductions?
>
> Yeah, poor wording again, sorry.  I meant something closer to:
>
>   /* If IS_REDUC_INFO is true and if the vector code is performing
>      N scalar reductions in parallel, this vector gives the initial
>      scalar values of those N reductions.  */
>
> >> +  vec<tree> reduc_initial_values;
> >> +
> >> +  /* If IS_REDUC_INFO is true and if the reduction is operating on N
> >> +     elements in parallel, this vector gives the scalar result of each
> >> +     reduction.  */
> >> +  vec<tree> reduc_scalar_results;
>
> Same change here.
>
> >> […]
> >> diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
> >> index 2909e8a0fc3..b7b0523e3c8 100644
> >> --- a/gcc/tree-vect-loop-manip.c
> >> +++ b/gcc/tree-vect-loop-manip.c
> >> @@ -2457,6 +2457,31 @@ vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
> >>    return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
> >>  }
> >>
> >> +/* LOOP_VINFO is an epilogue loop and MAIN_LOOP_VALUE is available on exit
> >> +   from the corresponding main loop.  Return a value that is available in
> >> +   LOOP_VINFO's preheader, using SKIP_VALUE if the main loop is skipped.
> >> +   Passing a null SKIP_VALUE is equivalent to passing zero.  */
> >> +
> >> +tree
> >> +vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
> >> +                          tree skip_value)
> >> +{
> >> +  if (!loop_vinfo->main_loop_edge)
> >> +    return main_loop_value;
> >> +
> >> +  if (!skip_value)
> >> +    skip_value = build_zero_cst (TREE_TYPE (main_loop_value));
> >
> > shouldn't that be the initial value?
>
> For the current use case, the above two conditions are never true.
> I wrote it like this because I had a follow-on patch (which might
> not go anywhere) that needed this function for 0-based IVs.
>
> Maybe that's a bad risk/reward trade-off though.  Not having to pass
> zero makes things only slightly simpler for the follow-on patch,
> and I guess could be dangerous in other cases.
>
> Perhaps in that case though I should change loop_vinfo->main_loop_edge
> into a gcc_assert as well.

Yeah, I think asserts (and comments in case it's because we don't handle
some specific cases yet) are better than possibly wrong behavior.

> >> +  tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
> >> +  basic_block bb = loop_vinfo->main_loop_edge->dest;
> >> +  gphi *new_phi = create_phi_node (phi_result, bb);
> >> +  add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
> >> +              UNKNOWN_LOCATION);
> >> +  add_phi_arg (new_phi, skip_value,
> >> +              loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
> >> +  return phi_result;
> >> +}
> >> +
> >>  /* Function vect_do_peeling.
> >>
> >>     Input:
> >> […]
> >> @@ -4823,6 +4842,100 @@ info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
> >>    return stmt_info;
> >>  }
> >>
> >> +/* PHI is a reduction in LOOP_VINFO that we are going to vectorize using vector
> >> +   type VECTYPE.  See if LOOP_VINFO is an epilogue loop whose main loop had a
> >> +   matching reduction that we can build on.  Adjust REDUC_INFO and return true
> >> +   if so, otherwise return false.  */
> >> +
> >> +static bool
> >> +vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
> >> +                               stmt_vec_info reduc_info)
> >> +{
> >> +  loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
> >> +  if (!main_loop_vinfo)
> >> +    return false;
> >> +
> >> +  if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
> >> +    return false;
> >> +
> >> +  unsigned int num_phis = reduc_info->reduc_initial_values.length ();
> >> +  auto_vec<tree, 16> main_loop_results (num_phis);
> >> +  auto_vec<tree, 16> initial_values (num_phis);
> >> +  if (edge main_loop_edge = loop_vinfo->main_loop_edge)
> >> +    {
> >> +      /* The epilogue loop can be entered either from the main loop or
> >> +        from an earlier guard block.  */
> >> +      edge skip_edge = loop_vinfo->skip_main_loop_edge;
> >> +      for (tree incoming_value : reduc_info->reduc_initial_values)
> >> +       {
> >> +         /* Look for:
> >> +
> >> +              INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
> >> +                                   INITIAL_VALUE(guard block)>.  */
> >> +         gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
> >> +
> >> +         gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
> >> +         gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
> >> +
> >> +         tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
> >> +         tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
> >> +
> >> +         main_loop_results.quick_push (from_main_loop);
> >> +         initial_values.quick_push (from_skip);
> >> +       }
> >> +    }
> >> +  else
> >> +    /* The main loop dominates the epilogue loop.  */
> >> +    main_loop_results.splice (reduc_info->reduc_initial_values);
> >> +
> >> +  /* See if the main loop has the kind of accumulator we need.  */
> >> +  vect_reusable_accumulator *accumulator
> >> +    = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
> >> +  if (!accumulator
> >> +      || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
> >> +      || !std::equal (main_loop_results.begin (), main_loop_results.end (),
> >> +                     accumulator->reduc_info->reduc_scalar_results.begin ()))
> >> +    return false;
> >> +
> >> +  /* For now, only handle the case in which both loops are operating on the
> >> +     same vector types.  In future we could reduce wider vectors to narrower
> >> +     ones as well.  */
> >> +  tree vectype = STMT_VINFO_VECTYPE (reduc_info);
> >> +  tree old_vectype = TREE_TYPE (accumulator->reduc_input);
> >> +  if (!useless_type_conversion_p (old_vectype, vectype))
> >
> > It should be indeed quite trivial to handle, likewise the case where we
> > have multiple PHIs - just reduce to a single input vector and have the
> > possibly multiple input vectors in the epilogue filled with neutral
> > elements.  I'll see if I can cook up stuff for this next week.
>
> Yeah, agreed.  The multi-vector epilogue case should be especially easy
> to handle, but it's not interesting for SVE as things stand, since:
>
> (a) non-SLP reductions use a single cycle for ncopies>1 (a misfeature
>     IMO -- on targets with wide pipelines we want exactly the opposite)
>
> (b) SLP reductions are limited to single vectors for variable-length targets.
>
> So it wasn't possible to trigger multiple epilogue vectors for the
> motivating SVE use case.

OK, I see.  Once the series is in I'll try to create testcases for x86_64.

> >> […]
> >> @@ -5196,6 +5305,37 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
> >>        reduc_inputs.safe_push (single_input);
> >>      }
> >>
> >> +  tree orig_reduc_input = reduc_inputs[0];
> >> +
> >> +  /* If this loop is an epilogue loop that can be skipped after the
> >> +     main loop, we can only share a reduction operation between the
> >> +     main loop and the epilogue if we put it at the target of the
> >> +     skip edge.
> >
> > Do you have a testcase where we cannot do this?
>
> No, it's being defensive.  I wasn't sure how the epilogue code would
> evolve in future.
>
> >> +     We can still reuse accumulators if this check fails.  Doing so has
> >> +     the minor(?) benefit of making the epilogue loop's scalar result
> >> +     independent of the main loop's scalar result.  */
> >> +  bool unify_with_main_loop_p = false;
> >> +  if (reduc_info->reused_accumulator
> >> +      && loop_vinfo->skip_this_loop_edge
> >> +      && single_succ_p (exit_bb)
> >> +      && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
> >> +    {
> >> +      unify_with_main_loop_p = true;
> >> +
> >> +      basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
> >> +      reduc_inputs[0] = make_ssa_name (vectype);
> >> +      gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
> >> +      add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
> >> +                  UNKNOWN_LOCATION);
> >> +      add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
> >> +                  loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
> >> +      exit_gsi = gsi_after_labels (reduc_block);
> >> +    }
> >> +
> >> +  /* Shouldn't be used beyond this point.  */
> >> +  exit_bb = nullptr;
> >> +
> >>    if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
> >>        && reduc_fn != IFN_LAST)
> >>      {
> >> @@ -5819,6 +5958,12 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
> >>        scalar_results[0] = new_temp;
> >>      }
> >>
> >> +  /* Record this operation if it could be reused by the epilogue loop.  */
> >> +  if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
> >> +      && !double_reduc)
> >
> > what's the issue with double_reduc?
>
> Probably nothing TBH.  I haven't been able to construct a case that
> uses predicated double reductions with vect-partial-vector-usage=1,
> but that's probably a missed optimisation.
>
> There again, double reductions themselves seem to be hard to trigger
> now that we have loop interchange.  Is there a good way of testing
> them without -fno-loop-interchange?

there are a bunch of testcases in gcc.dg/vect/vect-double-reduc-?.c,
I don't see how interchange avoids the double reduction, in fact when
doing interchange we no longer can apply outer loop vectorization (but
it's still a double reduction, just only inner loop vectorized).
But possibly we don't do epilogue vectorization for outer loop
vectorizations with reductions.

Oh, and of course vect.exp runs with -O2 -ftree-vectorize, avoiding
any of the high-level loop opts ...

Richard.

> Thanks,
> Richard
>
> >> +    loop_vinfo->reusable_accumulators.put (scalar_results[0],
> >> +                                          { orig_reduc_input, reduc_info });
> >> +
> >>    if (double_reduc)
> >>      loop = outer_loop;
> >>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 10/10] vect: Reuse reduction accumulators between loops
  2021-07-12  6:32       ` Richard Biener
@ 2021-07-12 17:55         ` Richard Sandiford
  2021-07-13  6:09           ` Richard Biener
  0 siblings, 1 reply; 30+ messages in thread
From: Richard Sandiford @ 2021-07-12 17:55 UTC (permalink / raw)
  To: Richard Biener via Gcc-patches

Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> On Fri, Jul 9, 2021 at 3:12 PM Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>>
>> Thanks for the review.
>>
>> Richard Biener <richard.guenther@gmail.com> writes:
>> >> @@ -588,6 +600,23 @@ public:
>> >>    /* Unrolling factor  */
>> >>    poly_uint64 vectorization_factor;
>> >>
>> >> +  /* If this loop is an epilogue loop whose main loop can be skipped,
>> >> +     MAIN_LOOP_EDGE is the edge from the main loop to this loop's
>> >> +     preheader.  SKIP_MAIN_LOOP_EDGE is then the edge that skips the
>> >> +     main loop and goes straight to this loop's preheader.
>> >> +
>> >> +     Both fields are null otherwise.  */
>> >> +  edge main_loop_edge;
>> >> +  edge skip_main_loop_edge;
>> >> +
>> >> +  /* If this loop is an epilogue loop that might be skipped after executing
>> >> +     the main loop, this edge is the one that skips the epilogue.  */
>> >> +  edge skip_this_loop_edge;
>> >> +
>> >> +  /* After vectorization, maps live-out SSA names to information about
>> >> +     the reductions that generated them.  */
>> >> +  hash_map<tree, vect_reusable_accumulator> reusable_accumulators;
>> >
>> > Is that the LC PHI node defs or the definition inside of the loop?
>> > If the latter we could attach the info directly to its stmt-info?
>>
>> Ah, yeah, I should improve the comment there.  It's the vectoriser's
>> replacement for the original LC PHI node, i.e. the final scalar result
>> after the reduction has taken place.
>
> OK
>
>> >> @@ -1186,6 +1215,21 @@ public:
>> >>    /* The vector type for performing the actual reduction.  */
>> >>    tree reduc_vectype;
>> >>
>> >> +  /* If IS_REDUC_INFO is true and if the reduction is operating on N
>> >> +     elements in parallel, this vector gives the initial values of these
>> >> +     N elements.  */
>> >
>> > That's N scalar elements or N vector elements?  I suppose it's for
>> > SLP reductions (rather than SLP reduction chains) and never non-SLP
>> > reductions?
>>
>> Yeah, poor wording again, sorry.  I meant something closer to:
>>
>>   /* If IS_REDUC_INFO is true and if the vector code is performing
>>      N scalar reductions in parallel, this vector gives the initial
>>      scalar values of those N reductions.  */
>>
>> >> +  vec<tree> reduc_initial_values;
>> >> +
>> >> +  /* If IS_REDUC_INFO is true and if the reduction is operating on N
>> >> +     elements in parallel, this vector gives the scalar result of each
>> >> +     reduction.  */
>> >> +  vec<tree> reduc_scalar_results;
>>
>> Same change here.
>>
>> >> […]
>> >> diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
>> >> index 2909e8a0fc3..b7b0523e3c8 100644
>> >> --- a/gcc/tree-vect-loop-manip.c
>> >> +++ b/gcc/tree-vect-loop-manip.c
>> >> @@ -2457,6 +2457,31 @@ vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
>> >>    return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
>> >>  }
>> >>
>> >> +/* LOOP_VINFO is an epilogue loop and MAIN_LOOP_VALUE is available on exit
>> >> +   from the corresponding main loop.  Return a value that is available in
>> >> +   LOOP_VINFO's preheader, using SKIP_VALUE if the main loop is skipped.
>> >> +   Passing a null SKIP_VALUE is equivalent to passing zero.  */
>> >> +
>> >> +tree
>> >> +vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
>> >> +                          tree skip_value)
>> >> +{
>> >> +  if (!loop_vinfo->main_loop_edge)
>> >> +    return main_loop_value;
>> >> +
>> >> +  if (!skip_value)
>> >> +    skip_value = build_zero_cst (TREE_TYPE (main_loop_value));
>> >
>> > shouldn't that be the initial value?
>>
>> For the current use case, the above two conditions are never true.
>> I wrote it like this because I had a follow-on patch (which might
>> not go anywhere) that needed this function for 0-based IVs.
>>
>> Maybe that's a bad risk/reward trade-off though.  Not having to pass
>> zero makes things only slightly simpler for the follow-on patch,
>> and I guess could be dangerous in other cases.
>>
>> Perhaps in that case though I should change loop_vinfo->main_loop_edge
>> into a gcc_assert as well.
>
> Yeah, I think asserts (and comments in case it's because we don't handle
> some specific cases yet) are better than possibly wrong behavior.

OK.

>> >> +  tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
>> >> +  basic_block bb = loop_vinfo->main_loop_edge->dest;
>> >> +  gphi *new_phi = create_phi_node (phi_result, bb);
>> >> +  add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
>> >> +              UNKNOWN_LOCATION);
>> >> +  add_phi_arg (new_phi, skip_value,
>> >> +              loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
>> >> +  return phi_result;
>> >> +}
>> >> +
>> >>  /* Function vect_do_peeling.
>> >>
>> >>     Input:
>> >> […]
>> >> @@ -4823,6 +4842,100 @@ info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
>> >>    return stmt_info;
>> >>  }
>> >>
>> >> +/* PHI is a reduction in LOOP_VINFO that we are going to vectorize using vector
>> >> +   type VECTYPE.  See if LOOP_VINFO is an epilogue loop whose main loop had a
>> >> +   matching reduction that we can build on.  Adjust REDUC_INFO and return true
>> >> +   if so, otherwise return false.  */
>> >> +
>> >> +static bool
>> >> +vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
>> >> +                               stmt_vec_info reduc_info)
>> >> +{
>> >> +  loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
>> >> +  if (!main_loop_vinfo)
>> >> +    return false;
>> >> +
>> >> +  if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
>> >> +    return false;
>> >> +
>> >> +  unsigned int num_phis = reduc_info->reduc_initial_values.length ();
>> >> +  auto_vec<tree, 16> main_loop_results (num_phis);
>> >> +  auto_vec<tree, 16> initial_values (num_phis);
>> >> +  if (edge main_loop_edge = loop_vinfo->main_loop_edge)
>> >> +    {
>> >> +      /* The epilogue loop can be entered either from the main loop or
>> >> +        from an earlier guard block.  */
>> >> +      edge skip_edge = loop_vinfo->skip_main_loop_edge;
>> >> +      for (tree incoming_value : reduc_info->reduc_initial_values)
>> >> +       {
>> >> +         /* Look for:
>> >> +
>> >> +              INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
>> >> +                                   INITIAL_VALUE(guard block)>.  */
>> >> +         gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
>> >> +
>> >> +         gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
>> >> +         gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
>> >> +
>> >> +         tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
>> >> +         tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
>> >> +
>> >> +         main_loop_results.quick_push (from_main_loop);
>> >> +         initial_values.quick_push (from_skip);
>> >> +       }
>> >> +    }
>> >> +  else
>> >> +    /* The main loop dominates the epilogue loop.  */
>> >> +    main_loop_results.splice (reduc_info->reduc_initial_values);
>> >> +
>> >> +  /* See if the main loop has the kind of accumulator we need.  */
>> >> +  vect_reusable_accumulator *accumulator
>> >> +    = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
>> >> +  if (!accumulator
>> >> +      || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
>> >> +      || !std::equal (main_loop_results.begin (), main_loop_results.end (),
>> >> +                     accumulator->reduc_info->reduc_scalar_results.begin ()))
>> >> +    return false;
>> >> +
>> >> +  /* For now, only handle the case in which both loops are operating on the
>> >> +     same vector types.  In future we could reduce wider vectors to narrower
>> >> +     ones as well.  */
>> >> +  tree vectype = STMT_VINFO_VECTYPE (reduc_info);
>> >> +  tree old_vectype = TREE_TYPE (accumulator->reduc_input);
>> >> +  if (!useless_type_conversion_p (old_vectype, vectype))
>> >
>> > It should be indeed quite trivial to handle, likewise the case where we
>> > have multiple PHIs - just reduce to a single input vector and have the
>> > possibly multiple input vectors in the epilogue filled with neutral
>> > elements.  I'll see if I can cook up stuff for this next week.
>>
>> Yeah, agreed.  The multi-vector epilogue case should be especially easy
>> to handle, but it's not interesting for SVE as things stand, since:
>>
>> (a) non-SLP reductions use a single cycle for ncopies>1 (a misfeature
>>     IMO -- on targets with wide pipelines we want exactly the opposite)
>>
>> (b) SLP reductions are limited to single vectors for variable-length targets.
>>
>> So it wasn't possible to trigger multiple epilogue vectors for the
>> motivating SVE use case.
>
> OK, I see.  Once the series is in I'll try to create testcases for x86_64.
>
>> >> […]
>> >> @@ -5196,6 +5305,37 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>> >>        reduc_inputs.safe_push (single_input);
>> >>      }
>> >>
>> >> +  tree orig_reduc_input = reduc_inputs[0];
>> >> +
>> >> +  /* If this loop is an epilogue loop that can be skipped after the
>> >> +     main loop, we can only share a reduction operation between the
>> >> +     main loop and the epilogue if we put it at the target of the
>> >> +     skip edge.
>> >
>> > Do you have a testcase where we cannot do this?
>>
>> No, it's being defensive.  I wasn't sure how the epilogue code would
>> evolve in future.
>>
>> >> +     We can still reuse accumulators if this check fails.  Doing so has
>> >> +     the minor(?) benefit of making the epilogue loop's scalar result
>> >> +     independent of the main loop's scalar result.  */
>> >> +  bool unify_with_main_loop_p = false;
>> >> +  if (reduc_info->reused_accumulator
>> >> +      && loop_vinfo->skip_this_loop_edge
>> >> +      && single_succ_p (exit_bb)
>> >> +      && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
>> >> +    {
>> >> +      unify_with_main_loop_p = true;
>> >> +
>> >> +      basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
>> >> +      reduc_inputs[0] = make_ssa_name (vectype);
>> >> +      gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
>> >> +      add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
>> >> +                  UNKNOWN_LOCATION);
>> >> +      add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
>> >> +                  loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
>> >> +      exit_gsi = gsi_after_labels (reduc_block);
>> >> +    }
>> >> +
>> >> +  /* Shouldn't be used beyond this point.  */
>> >> +  exit_bb = nullptr;
>> >> +
>> >>    if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
>> >>        && reduc_fn != IFN_LAST)
>> >>      {
>> >> @@ -5819,6 +5958,12 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>> >>        scalar_results[0] = new_temp;
>> >>      }
>> >>
>> >> +  /* Record this operation if it could be reused by the epilogue loop.  */
>> >> +  if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
>> >> +      && !double_reduc)
>> >
>> > what's the issue with double_reduc?
>>
>> Probably nothing TBH.  I haven't been able to construct a case that
>> uses predicated double reductions with vect-partial-vector-usage=1,
>> but that's probably a missed optimisation.
>>
>> There again, double reductions themselves seem to be hard to trigger
>> now that we have loop interchange.  Is there a good way of testing
>> them without -fno-loop-interchange?
>
> there are a bunch of testcases in gcc.dg/vect/vect-double-reduc-?.c,
> I don't see how interchange avoids the double reduction, in fact when
> doing interchange we no longer can apply outer loop vectorization (but
> it's still a double reduction, just only inner loop vectorized).
> But possibly we don't do epilogue vectorization for outer loop
> vectorizations with reductions.

Well, to take the vect-double-reduc-1.c loop:

  for (k = 0; k < K; k++)
    {
      sum = 0;
      for (j = 0; j < K; j++) 
        for (i = 0; i < K; i++) 
          sum += in[i+k][j] * coeff[i][j];
 
      out[k] = sum;
    }

loop interchange converts this to:

  for (k = 0; k < K; k++)
    {
      sum = 0;
      for (i = 0; i < K; i++) 
        for (j = 0; j < K; j++) 
          sum += in[i+k][j] * coeff[i][j];
 
      out[k] = sum;
    }

and then we vectorise the inner loop.

I think in principle that's the right thing to do, since the double
reduction is more like:

      for (j1 = 0; j1 < K; j1 += vsize)
        for (i = 0; i < K; i++) 
          for (j = j1; j < MIN (K, j1 + vsize); j++)
            sum += in[i+k][j] * coeff[i][j];

which isn't as nice an access pattern.

It would be good if we could sink the vector->scalar reduction
in the interchanged form, but that feels like a separate optimisation,
and could potentially happen regardless of whether we can vectorise
any other code in the outer loop.  E.g. it could happen for:

      for (i = 0; i < K; i++) 
        {
          printf ("Hello, world!\n");
          for (j = 0; j < K; j++) 
            sum += in[i+k][j] * coeff[i][j];
        }

> Oh, and of course vect.exp runs with -O2 -ftree-vectorize, avoiding
> any of the high-level loop opts ...

Ah, yeah, of course.  I was trying to use the vect-double-reduc*.c
tests with -O3…

How does this version look?  Changes from v1:

- Fix comments in new tree-vectorizer.h fields.
- Fix an out-of-date comment above vect_find_reusable_accumulator.
- Remove !double_reduc condition.
- Make vect_get_main_loop_result specific to the case in which a phi
  node is needed.

Tested as above.

Thanks,
Richard

gcc/
	* tree-vectorizer.h (vect_reusable_accumulator): New structure.
	(_loop_vec_info::main_loop_edge): New field.
	(_loop_vec_info::skip_main_loop_edge): Likewise.
	(_loop_vec_info::skip_this_loop_edge): Likewise.
	(_loop_vec_info::reusable_accumulators): Likewise.
	(_stmt_vec_info::reduc_scalar_results): Likewise.
	(_stmt_vec_info::reused_accumulator): Likewise.
	(vect_get_main_loop_result): Declare.
	* tree-vectorizer.c (vec_info::new_stmt_vec_info): Initialize
	reduc_scalar_results.
	(vec_info::free_stmt_vec_info): Free reduc_scalar_results.
	* tree-vect-loop-manip.c (vect_get_main_loop_result): New function.
	(vect_do_peeling): Fill an epilogue loop's main_loop_edge,
	skip_main_loop_edge and skip_this_loop_edge fields.
	* tree-vect-loop.c (INCLUDE_ALGORITHM): Define.
	(vect_emit_reduction_init_stmts): New function.
	(get_initial_def_for_reduction): Use it.
	(get_initial_defs_for_reduction): Likewise.  Change the vinfo
	parameter to a loop_vec_info.
	(vect_create_epilog_for_reduction): Store the scalar results
	in the reduc_info.  If an epilogue loop is reusing an accumulator
	from the main loop, and if the epilogue loop can also be skipped,
	try to place the reduction code in the join block.  Record
	accumulators that could potentially be reused by epilogue loops.
	(vect_transform_cycle_phi): When vectorizing epilogue loops,
	try to reuse accumulators from the main loop.  Record the initial
	value in reduc_info for non-SLP reductions too.

gcc/testsuite/
	* gcc.target/aarch64/sve/reduc_9.c: New test.
	* gcc.target/aarch64/sve/reduc_9_run.c: Likewise.
	* gcc.target/aarch64/sve/reduc_10.c: Likewise.
	* gcc.target/aarch64/sve/reduc_10_run.c: Likewise.
	* gcc.target/aarch64/sve/reduc_11.c: Likewise.
	* gcc.target/aarch64/sve/reduc_11_run.c: Likewise.
	* gcc.target/aarch64/sve/reduc_12.c: Likewise.
	* gcc.target/aarch64/sve/reduc_12_run.c: Likewise.
	* gcc.target/aarch64/sve/reduc_13.c: Likewise.
	* gcc.target/aarch64/sve/reduc_13_run.c: Likewise.
	* gcc.target/aarch64/sve/reduc_14.c: Likewise.
	* gcc.target/aarch64/sve/reduc_14_run.c: Likewise.
	* gcc.target/aarch64/sve/reduc_15.c: Likewise.
	* gcc.target/aarch64/sve/reduc_15_run.c: Likewise.
---
 .../gcc.target/aarch64/sve/reduc_10.c         |  77 +++++
 .../gcc.target/aarch64/sve/reduc_10_run.c     |  49 +++
 .../gcc.target/aarch64/sve/reduc_11.c         |  71 ++++
 .../gcc.target/aarch64/sve/reduc_11_run.c     |  34 ++
 .../gcc.target/aarch64/sve/reduc_12.c         |  71 ++++
 .../gcc.target/aarch64/sve/reduc_12_run.c     |  66 ++++
 .../gcc.target/aarch64/sve/reduc_13.c         | 101 ++++++
 .../gcc.target/aarch64/sve/reduc_13_run.c     |  61 ++++
 .../gcc.target/aarch64/sve/reduc_14.c         | 107 ++++++
 .../gcc.target/aarch64/sve/reduc_14_run.c     | 187 +++++++++++
 .../gcc.target/aarch64/sve/reduc_15.c         |  16 +
 .../gcc.target/aarch64/sve/reduc_15_run.c     |  22 ++
 .../gcc.target/aarch64/sve/reduc_9.c          |  77 +++++
 .../gcc.target/aarch64/sve/reduc_9_run.c      |  29 ++
 gcc/tree-vect-loop-manip.c                    |  26 ++
 gcc/tree-vect-loop.c                          | 307 ++++++++++++++----
 gcc/tree-vectorizer.c                         |   4 +
 gcc/tree-vectorizer.h                         |  56 +++-
 18 files changed, 1297 insertions(+), 64 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c

diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index e2fd3609fee..d825b0c3723 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -551,6 +551,18 @@ typedef auto_vec<rgroup_controls> vec_loop_lens;
 
 typedef auto_vec<std::pair<data_reference*, tree> > drs_init_vec;
 
+/* Information about a reduction accumulator from the main loop that could
+   conceivably be reused as the input to a reduction in an epilogue loop.  */
+struct vect_reusable_accumulator {
+  /* The final value of the accumulator, which forms the input to the
+     reduction operation.  */
+  tree reduc_input;
+
+  /* The stmt_vec_info that describes the reduction (i.e. the one for
+     which is_reduc_info is true).  */
+  stmt_vec_info reduc_info;
+};
+
 /*-----------------------------------------------------------------*/
 /* Info on vectorized loops.                                       */
 /*-----------------------------------------------------------------*/
@@ -588,6 +600,26 @@ public:
   /* Unrolling factor  */
   poly_uint64 vectorization_factor;
 
+  /* If this loop is an epilogue loop whose main loop can be skipped,
+     MAIN_LOOP_EDGE is the edge from the main loop to this loop's
+     preheader.  SKIP_MAIN_LOOP_EDGE is then the edge that skips the
+     main loop and goes straight to this loop's preheader.
+
+     Both fields are null otherwise.  */
+  edge main_loop_edge;
+  edge skip_main_loop_edge;
+
+  /* If this loop is an epilogue loop that might be skipped after executing
+     the main loop, this is the edge that skips the epilogue, else null.  */
+  edge skip_this_loop_edge;
+
+  /* The vectorized form of a standard reduction replaces the original
+     scalar code's final result (a loop-closed SSA PHI) with the result
+     of a vector-to-scalar reduction operation.  After vectorization,
+     this variable maps these vector-to-scalar results to information
+     about the reductions that generated them.  */
+  hash_map<tree, vect_reusable_accumulator> reusable_accumulators;
+
   /* Maximum runtime vectorization factor, or MAX_VECTORIZATION_FACTOR
      if there is no particular limit.  */
   unsigned HOST_WIDE_INT max_vectorization_factor;
@@ -1186,6 +1218,23 @@ public:
   /* The vector type for performing the actual reduction.  */
   tree reduc_vectype;
 
+  /* If IS_REDUC_INFO is true and if the vector code is performing
+     N scalar reductions in parallel, this variable gives the initial
+     scalar values of those N reductions.  */
+  vec<tree> reduc_initial_values;
+
+  /* If IS_REDUC_INFO is true and if the vector code is performing
+     N scalar reductions in parallel, this variable gives the vectorized code's
+     final (scalar) result for each of those N reductions.  In other words,
+     REDUC_SCALAR_RESULTS[I] replaces the original scalar code's loop-closed
+     SSA PHI for reduction number I.  */
+  vec<tree> reduc_scalar_results;
+
+  /* Only meaningful if IS_REDUC_INFO.  If non-null, the reduction is
+     being performed by an epilogue loop and we have decided to reuse
+     this accumulator from the main loop.  */
+  vect_reusable_accumulator *reused_accumulator;
+
   /* Whether we force a single cycle PHI during reduction vectorization.  */
   bool force_single_cycle;
 
@@ -1382,12 +1431,6 @@ vect_phi_initial_value (gphi *phi)
   return PHI_ARG_DEF_FROM_EDGE (phi, pe);
 }
 
-static inline tree
-vect_phi_initial_value (stmt_vec_info stmt_info)
-{
-  return vect_phi_initial_value (as_a <gphi *> (stmt_info->stmt));
-}
-
 /* Return true if STMT_INFO should produce a vector mask type rather than
    a normal nonmask type.  */
 
@@ -1818,6 +1861,7 @@ class loop *vect_loop_versioning (loop_vec_info, gimple *);
 extern class loop *vect_do_peeling (loop_vec_info, tree, tree,
 				    tree *, tree *, tree *, int, bool, bool,
 				    tree *);
+extern tree vect_get_main_loop_result (loop_vec_info, tree, tree);
 extern void vect_prepare_for_masked_peels (loop_vec_info);
 extern dump_user_location_t find_loop_location (class loop *);
 extern bool vect_can_advance_ivs_p (loop_vec_info);
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 9748043f3ee..f1035a83826 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -694,6 +694,8 @@ vec_info::new_stmt_vec_info (gimple *stmt)
   STMT_VINFO_SLP_VECT_ONLY (res) = false;
   STMT_VINFO_SLP_VECT_ONLY_PATTERN (res) = false;
   STMT_VINFO_VEC_STMTS (res) = vNULL;
+  res->reduc_initial_values = vNULL;
+  res->reduc_scalar_results = vNULL;
 
   if (is_a <loop_vec_info> (this)
       && gimple_code (stmt) == GIMPLE_PHI
@@ -755,6 +757,8 @@ vec_info::free_stmt_vec_info (stmt_vec_info stmt_info)
 	release_ssa_name (lhs);
     }
 
+  stmt_info->reduc_initial_values.release ();
+  stmt_info->reduc_scalar_results.release ();
   STMT_VINFO_SIMD_CLONE_INFO (stmt_info).release ();
   STMT_VINFO_VEC_STMTS (stmt_info).release ();
   free (stmt_info);
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index 2909e8a0fc3..c29ffb3356c 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -2457,6 +2457,28 @@ vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
   return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
 }
 
+/* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
+   Create a PHI at the destination of MAIN_LOOP_EDGE and return its result:
+
+   - MAIN_LOOP_VALUE when LOOP_VINFO is entered from the main loop and
+   - SKIP_VALUE when the main loop is skipped.  */
+
+tree
+vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
+			   tree skip_value)
+{
+  gcc_assert (loop_vinfo->main_loop_edge);
+
+  tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
+  basic_block bb = loop_vinfo->main_loop_edge->dest;
+  gphi *new_phi = create_phi_node (phi_result, bb);
+  add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
+	       UNKNOWN_LOCATION);
+  add_phi_arg (new_phi, skip_value,
+	       loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
+  return phi_result;
+}
+
 /* Function vect_do_peeling.
 
    Input:
@@ -2986,6 +3008,8 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 					   skip_vector ? anchor : guard_bb,
 					   prob_epilog.invert (),
 					   irred_flag);
+	  if (vect_epilogues)
+	    epilogue_vinfo->skip_this_loop_edge = guard_e;
 	  slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
 					      single_exit (epilog));
 	  /* Only need to handle basic block before epilog loop if it's not
@@ -3057,6 +3081,8 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 	  add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
 		       UNKNOWN_LOCATION);
 	  niters = PHI_RESULT (new_phi);
+	  epilogue_vinfo->main_loop_edge = update_e;
+	  epilogue_vinfo->skip_main_loop_edge = skip_e;
 	}
 
       /* Set ADVANCE to the number of iterations performed by the previous
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index fe7e73f655f..8c27d75f889 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -19,6 +19,7 @@ You should have received a copy of the GNU General Public License
 along with GCC; see the file COPYING3.  If not see
 <http://www.gnu.org/licenses/>.  */
 
+#define INCLUDE_ALGORITHM
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
@@ -823,6 +824,10 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
     th (0),
     versioning_threshold (0),
     vectorization_factor (0),
+    main_loop_edge (nullptr),
+    skip_main_loop_edge (nullptr),
+    skip_this_loop_edge (nullptr),
+    reusable_accumulators (),
     max_vectorization_factor (0),
     mask_skip_niters (NULL_TREE),
     rgroup_compare_type (NULL_TREE),
@@ -4607,7 +4612,32 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
                  prologue_cost, epilogue_cost);
 }
 
+/* SEQ is a sequence of instructions that initialize the reduction
+   described by REDUC_INFO.  Emit them in the appropriate place.  */
 
+static void
+vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
+				stmt_vec_info reduc_info, gimple *seq)
+{
+  if (reduc_info->reused_accumulator)
+    {
+      /* When reusing an accumulator from the main loop, we only need
+	 initialization instructions if the main loop can be skipped.
+	 In that case, emit them in the guard block that does the skip,
+	 immediately before that block's final statement.  */
+      edge skip_edge = loop_vinfo->skip_main_loop_edge;
+      gcc_assert (skip_edge);
+      gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
+      gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
+    }
+  else
+    {
+      /* The normal case: emit the initialization instructions on the
+	 preheader edge of LOOP_VINFO's loop.  */
+      class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+      gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
+    }
+}
 
 /* Function get_initial_def_for_reduction
 
@@ -4675,36 +4705,30 @@ get_initial_def_for_reduction (loop_vec_info loop_vinfo,
     }
 
   if (stmts)
-    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
+    vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
   return init_def;
 }
 
-/* Get at the initial defs for the reduction PHIs for REDUC_INFO, whose
-   associated SLP node is SLP_NODE.  NUMBER_OF_VECTORS is the number of vector
-   defs to create.  If NEUTRAL_OP is nonnull, introducing extra elements of
-   that value will not change the result.  */
+/* Get at the initial defs for the reduction PHIs for REDUC_INFO,
+   which performs a reduction involving GROUP_SIZE scalar statements.
+   NUMBER_OF_VECTORS is the number of vector defs to create.  If NEUTRAL_OP
+   is nonnull, introducing extra elements of that value will not change the
+   result.  */
 
 static void
-get_initial_defs_for_reduction (vec_info *vinfo,
+get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
 				stmt_vec_info reduc_info,
-				slp_tree slp_node,
 				vec<tree> *vec_oprnds,
 				unsigned int number_of_vectors,
-				bool reduc_chain, tree neutral_op)
+				unsigned int group_size, tree neutral_op)
 {
-  vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
+  vec<tree> &initial_values = reduc_info->reduc_initial_values;
   unsigned HOST_WIDE_INT nunits;
   unsigned j, number_of_places_left_in_vector;
   tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
-  unsigned int group_size = stmts.length ();
   unsigned int i;
-  class loop *loop;
-
-  loop = (gimple_bb (reduc_info->stmt))->loop_father;
-  gcc_assert (loop);
-  edge pe = loop_preheader_edge (loop);
 
-  gcc_assert (!reduc_chain || neutral_op);
+  gcc_assert (group_size == initial_values.length () || neutral_op);
 
   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
      created vectors. It is greater than 1 if unrolling is performed.
@@ -4734,18 +4758,13 @@ get_initial_defs_for_reduction (vec_info *vinfo,
     {
       tree op;
       i = j % group_size;
-      stmt_vec_info stmt_vinfo = stmts[i];
 
       /* Get the def before the loop.  In reduction chain we have only
 	 one initial value.  Else we have as many as PHIs in the group.  */
-      if (reduc_chain)
-	op = j != 0 ? neutral_op : vect_phi_initial_value (stmt_vinfo);
-      else if (((vec_oprnds->length () + 1) * nunits
-		- number_of_places_left_in_vector >= group_size)
-	       && neutral_op)
+      if (i >= initial_values.length () || (j > i && neutral_op))
 	op = neutral_op;
       else
-	op = vect_phi_initial_value (stmt_vinfo);
+	op = initial_values[i];
 
       /* Create 'vect_ = {op0,op1,...,opn}'.  */
       number_of_places_left_in_vector--;
@@ -4781,8 +4800,8 @@ get_initial_defs_for_reduction (vec_info *vinfo,
 	    {
 	      /* First time round, duplicate ELTS to fill the
 		 required number of vectors.  */
-	      duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
-					number_of_vectors, *vec_oprnds);
+	      duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
+					elts, number_of_vectors, *vec_oprnds);
 	      break;
 	    }
 	  vec_oprnds->quick_push (init);
@@ -4794,7 +4813,7 @@ get_initial_defs_for_reduction (vec_info *vinfo,
 	}
     }
   if (ctor_seq != NULL)
-    gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
+    vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
 }
 
 /* For a statement STMT_INFO taking part in a reduction operation return
@@ -4823,6 +4842,99 @@ info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
   return stmt_info;
 }
 
+/* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
+   REDUC_INFO can build on.  If so, update REDUC_INFO's initial values,
+   epilogue adjustment and reused_accumulator, and return true; else false.  */
+
+static bool
+vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
+				stmt_vec_info reduc_info)
+{
+  loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
+  if (!main_loop_vinfo)
+    return false;
+
+  if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
+    return false;
+
+  unsigned int num_phis = reduc_info->reduc_initial_values.length ();
+  auto_vec<tree, 16> main_loop_results (num_phis);
+  auto_vec<tree, 16> initial_values (num_phis);
+  if (edge main_loop_edge = loop_vinfo->main_loop_edge)
+    {
+      /* The epilogue loop can be entered either from the main loop or
+	 from an earlier guard block.  */
+      edge skip_edge = loop_vinfo->skip_main_loop_edge;
+      for (tree incoming_value : reduc_info->reduc_initial_values)
+	{
+	  /* Look for:
+
+	       INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
+				    INITIAL_VALUE(guard block)>.  */
+	  gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
+
+	  gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
+	  gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
+
+	  tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
+	  tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
+
+	  main_loop_results.quick_push (from_main_loop);
+	  initial_values.quick_push (from_skip);
+	}
+    }
+  else
+    /* The main loop dominates the epilogue loop (no skip edge).  */
+    main_loop_results.splice (reduc_info->reduc_initial_values);
+
+  /* Look up the main loop's accumulator by its first scalar result.  */
+  vect_reusable_accumulator *accumulator
+    = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
+  if (!accumulator
+      || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
+      || !std::equal (main_loop_results.begin (), main_loop_results.end (),
+		      accumulator->reduc_info->reduc_scalar_results.begin ()))
+    return false;
+
+  /* For now, only handle the case in which both loops are operating on the
+     same vector types.  In future we could reduce wider vectors to narrower
+     ones as well.  */
+  tree vectype = STMT_VINFO_VECTYPE (reduc_info);
+  tree old_vectype = TREE_TYPE (accumulator->reduc_input);
+  if (!useless_type_conversion_p (old_vectype, vectype))
+    return false;
+
+  /* Non-SLP reductions might apply an adjustment after the reduction
+     operation, in order to simplify the initialization of the accumulator.
+     If the epilogue loop carries on from where the main loop left off,
+     it should apply the same adjustment to the final reduction result.
+
+     If the epilogue loop can also be entered directly (rather than via
+     the main loop), we need to be able to handle that case in the same way,
+     with the same adjustment.  (In principle we could add a PHI node
+     to select the correct adjustment, but in practice that shouldn't be
+     necessary.)  */
+  tree main_adjustment
+    = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
+  if (loop_vinfo->main_loop_edge && main_adjustment)
+    {
+      gcc_assert (num_phis == 1);
+      tree initial_value = initial_values[0];
+      /* Check that we can use INITIAL_VALUE as the adjustment and
+	 initialize the accumulator with a neutral value instead.  */
+      if (!operand_equal_p (initial_value, main_adjustment))
+	return false;
+      tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
+      initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
+						    code, initial_value);
+    }
+  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
+  reduc_info->reduc_initial_values.truncate (0);
+  reduc_info->reduc_initial_values.splice (initial_values);
+  reduc_info->reused_accumulator = accumulator;
+  return true;
+}
+
 /* Function vect_create_epilog_for_reduction
 
    Create code at the loop-epilog to finalize the result of a reduction
@@ -4915,7 +5027,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
   gimple *use_stmt;
   auto_vec<tree> reduc_inputs;
   int j, i;
-  auto_vec<tree> scalar_results;
+  vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
   unsigned int group_size = 1, k;
   auto_vec<gimple *> phis;
   /* SLP reduction without reduction chain, e.g.,
@@ -4941,16 +5053,12 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
   gcc_assert (vectype);
   mode = TYPE_MODE (vectype);
 
-  tree initial_def = NULL;
   tree induc_val = NULL_TREE;
   tree adjustment_def = NULL;
   if (slp_node)
     ;
   else
     {
-      /* Get at the scalar def before the loop, that defines the initial value
-	 of the reduction variable.  */
-      initial_def = vect_phi_initial_value (reduc_def_stmt);
       /* Optimize: for induction condition reduction, if we can't use zero
          for induc_val, use initial_def.  */
       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
@@ -5196,6 +5304,37 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       reduc_inputs.safe_push (single_input);
     }
 
+  tree orig_reduc_input = reduc_inputs[0];
+
+  /* If this loop is an epilogue loop that can be skipped after the
+     main loop, we can only share a reduction operation between the
+     main loop and the epilogue if we put it at the target of the
+     skip edge.
+
+     We can still reuse accumulators if this check fails.  Doing so has
+     the minor(?) benefit of making the epilogue loop's scalar result
+     independent of the main loop's scalar result.  */
+  bool unify_with_main_loop_p = false;
+  if (reduc_info->reused_accumulator
+      && loop_vinfo->skip_this_loop_edge
+      && single_succ_p (exit_bb)
+      && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
+    {
+      unify_with_main_loop_p = true;
+
+      basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
+      reduc_inputs[0] = make_ssa_name (vectype);
+      gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
+      add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
+		   UNKNOWN_LOCATION);
+      add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
+		   loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
+      exit_gsi = gsi_after_labels (reduc_block);
+    }
+
+  /* Shouldn't be used beyond this point.  */
+  exit_bb = nullptr;
+
   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
       && reduc_fn != IFN_LAST)
     {
@@ -5405,6 +5544,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	     the same as initial_def already.  */
 	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
 				  induc_val);
+	  tree initial_def = reduc_info->reduc_initial_values[0];
 
 	  tmp = make_ssa_name (new_scalar_dest);
 	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
@@ -5425,9 +5565,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       gcc_assert (reduc_inputs.length () == 1);
       gcc_assert (pow2p_hwi (group_size));
 
-      slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
-      vec<stmt_vec_info> orig_phis
-	= SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
       gimple_seq seq = NULL;
 
       /* Build a vector {0, 1, 2, ...}, with the same number of elements
@@ -5452,7 +5589,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	{
 	  tree initial_value = NULL_TREE;
 	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
-	    initial_value = vect_phi_initial_value (orig_phis[0]);
+	    initial_value = reduc_info->reduc_initial_values[0];
 	  neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
 						 initial_value);
 	}
@@ -5466,7 +5603,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	     for MIN and MAX reduction, for example.  */
 	  if (!neutral_op)
 	    {
-	      tree scalar_value = vect_phi_initial_value (orig_phis[i]);
+	      tree scalar_value = reduc_info->reduc_initial_values[i];
 	      scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
 					     scalar_value);
 	      vector_identity = gimple_build_vector_from_val (&seq, vectype,
@@ -5780,6 +5917,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	     the same as initial_def already.  */
 	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
 				  induc_val);
+	  tree initial_def = reduc_info->reduc_initial_values[0];
 
 	  tree tmp = make_ssa_name (new_scalar_dest);
 	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
@@ -5819,6 +5957,11 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       scalar_results[0] = new_temp;
     }
 
+  /* Record this operation if it could be reused by the epilogue loop.  */
+  if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION)
+    loop_vinfo->reusable_accumulators.put (scalar_results[0],
+					   { orig_reduc_input, reduc_info });
+
   if (double_reduc)
     loop = outer_loop;
 
@@ -5886,6 +6029,17 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
         {
           /* Replace the uses:  */
           orig_name = PHI_RESULT (exit_phi);
+
+	  /* Look for a single use at the target of the skip edge.  */
+	  if (unify_with_main_loop_p)
+	    {
+	      use_operand_p use_p;
+	      gimple *user;
+	      if (!single_imm_use (orig_name, &use_p, &user))
+		gcc_unreachable ();
+	      orig_name = gimple_get_lhs (user);
+	    }
+
           scalar_result = scalar_results[k];
           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
 	    {
@@ -7421,16 +7575,32 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
       else
 	{
 	  gcc_assert (slp_node == slp_node_instance->reduc_phis);
-	  tree initial_value = NULL_TREE;
+	  vec<tree> &initial_values = reduc_info->reduc_initial_values;
+	  vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
+
+	  unsigned int num_phis = stmts.length ();
 	  if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
-	    initial_value = vect_phi_initial_value (phi);
-	  tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
-	  tree neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
-						      code, initial_value);
-	  get_initial_defs_for_reduction (loop_vinfo, reduc_info,
-					  slp_node_instance->reduc_phis,
-					  &vec_initial_defs, vec_num,
-					  initial_value != NULL, neutral_op);
+	    num_phis = 1;
+	  initial_values.reserve (num_phis);
+	  for (unsigned int i = 0; i < num_phis; ++i)
+	    {
+	      gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
+	      initial_values.quick_push (vect_phi_initial_value (this_phi));
+	    }
+	  if (vec_num == 1)
+	    vect_find_reusable_accumulator (loop_vinfo, reduc_info);
+	  if (!initial_values.is_empty ())
+	    {
+	      tree initial_value
+		= (num_phis == 1 ? initial_values[0] : NULL_TREE);
+	      tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
+	      tree neutral_op
+		= neutral_op_for_reduction (TREE_TYPE (vectype_out),
+					    code, initial_value);
+	      get_initial_defs_for_reduction (loop_vinfo, reduc_info,
+					      &vec_initial_defs, vec_num,
+					      stmts.length (), neutral_op);
+	    }
 	}
     }
   else
@@ -7438,6 +7608,7 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
       /* Get at the scalar def before the loop, that defines the initial
 	 value of the reduction variable.  */
       tree initial_def = vect_phi_initial_value (phi);
+      reduc_info->reduc_initial_values.safe_push (initial_def);
       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
 	 and we can't use zero for induc_val, use initial_def.  Similarly
 	 for REDUC_MIN and initial_def larger than the base.  */
@@ -7474,21 +7645,30 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
 					   initial_def, initial_def);
       else
 	{
-	  enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
-	  tree neutral_op = neutral_op_for_reduction (TREE_TYPE (initial_def),
-						      code, initial_def);
-	  gcc_assert (neutral_op);
-	  /* Try to simplify the vector initialization by applying an
-	     adjustment after the reduction has been performed.  */
-	  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
-	      && !operand_equal_p (neutral_op, initial_def))
+	  if (ncopies == 1)
+	    vect_find_reusable_accumulator (loop_vinfo, reduc_info);
+	  if (!reduc_info->reduc_initial_values.is_empty ())
 	    {
-	      STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = initial_def;
-	      initial_def = neutral_op;
+	      initial_def = reduc_info->reduc_initial_values[0];
+	      enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
+	      tree neutral_op
+		= neutral_op_for_reduction (TREE_TYPE (initial_def),
+					    code, initial_def);
+	      gcc_assert (neutral_op);
+	      /* Try to simplify the vector initialization by applying an
+		 adjustment after the reduction has been performed.  */
+	      if (!reduc_info->reused_accumulator
+		  && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
+		  && !operand_equal_p (neutral_op, initial_def))
+		{
+		  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
+		    = initial_def;
+		  initial_def = neutral_op;
+		}
+	      vec_initial_def
+		= get_initial_def_for_reduction (loop_vinfo, reduc_info,
+						 initial_def, neutral_op);
 	    }
-	  vec_initial_def
-	    = get_initial_def_for_reduction (loop_vinfo, reduc_info,
-					     initial_def, neutral_op);
 	}
     }
 
@@ -7499,6 +7679,17 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
 	vec_initial_defs.quick_push (vec_initial_def);
     }
 
+  if (auto *accumulator = reduc_info->reused_accumulator)
+    {
+      if (loop_vinfo->main_loop_edge)
+	vec_initial_defs[0]
+	  = vect_get_main_loop_result (loop_vinfo, accumulator->reduc_input,
+				       vec_initial_defs[0]);
+      else
+	vec_initial_defs.safe_push (accumulator->reduc_input);
+      gcc_assert (vec_initial_defs.length () == 1);
+    }
+
   /* Generate the reduction PHIs upfront.  */
   for (i = 0; i < vec_num; i++)
     {
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c
new file mode 100644
index 00000000000..fb817b73d77
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c
@@ -0,0 +1,77 @@
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+unsigned short __attribute__((noipa))
+add_loop (unsigned short *x, int n)
+{
+  unsigned short res = 0;  /* Start value; returned as-is when N <= 0.  */
+  for (int i = 0; i < n; ++i)
+    res += x[i];  /* Running sum, converted back to unsigned short.  */
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+min_loop (unsigned short *x, int n)
+{
+  unsigned short res = ~0;  /* All bits set: no element can be larger.  */
+  for (int i = 0; i < n; ++i)
+    res = res < x[i] ? res : x[i];  /* Keep the smaller value.  */
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+max_loop (unsigned short *x, int n)
+{
+  unsigned short res = 0;  /* Zero: no unsigned element can be smaller.  */
+  for (int i = 0; i < n; ++i)
+    res = res > x[i] ? res : x[i];  /* Keep the larger value.  */
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+and_loop (unsigned short *x, int n)
+{
+  unsigned short res = ~0;  /* All bits set, so the first AND yields x[0].  */
+  for (int i = 0; i < n; ++i)
+    res &= x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+or_loop (unsigned short *x, int n)
+{
+  unsigned short res = 0;  /* No bits set, so the first OR yields x[0].  */
+  for (int i = 0; i < n; ++i)
+    res |= x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+eor_loop (unsigned short *x, int n)
+{
+  unsigned short res = 0;  /* No bits set, so the first XOR yields x[0].  */
+  for (int i = 0; i < n; ++i)
+    res ^= x[i];
+  return res;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c
new file mode 100644
index 00000000000..1dd579be701
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c
@@ -0,0 +1,49 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_10.c"
+
+int
+main (void)
+{
+  unsigned short x[N];
+  for (int i = 0; i < N; ++i)
+    x[i] = (i + 1) * (i + 2);  /* Product is converted to unsigned short.  */
+
+  if (add_loop (x, 0) != 0
+      || add_loop (x, 11) != 572
+      || add_loop (x, 0x100) != 22016
+      || add_loop (x, 0xfff) != 20480
+      || max_loop (x, 0) != 0
+      || max_loop (x, 11) != 132
+      || max_loop (x, 0x100) != 65280
+      || max_loop (x, 0xfff) != 65504
+      || or_loop (x, 0) != 0
+      || or_loop (x, 11) != 0xfe
+      || or_loop (x, 0x80) != 0x7ffe
+      || or_loop (x, 0xb4) != 0x7ffe
+      || or_loop (x, 0xb5) != 0xfffe
+      || eor_loop (x, 0) != 0
+      || eor_loop (x, 11) != 0xe8
+      || eor_loop (x, 0x100) != 0xcf00
+      || eor_loop (x, 0xfff) != 0xa000)
+    __builtin_abort ();
+
+  for (int i = 0; i < N; ++i)
+    x[i] = ~x[i];  /* Complement every element for the MIN and AND checks.  */
+
+  if (min_loop (x, 0) != 65535
+      || min_loop (x, 11) != 65403
+      || min_loop (x, 0x100) != 255
+      || min_loop (x, 0xfff) != 31
+      || and_loop (x, 0) != 0xffff
+      || and_loop (x, 11) != 0xff01
+      || and_loop (x, 0x80) != 0x8001
+      || and_loop (x, 0xb4) != 0x8001
+      || and_loop (x, 0xb5) != 1)
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c
new file mode 100644
index 00000000000..f99ef4aa865
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c
@@ -0,0 +1,71 @@
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+unsigned short __attribute__((noipa))
+add_loop (unsigned short *x, unsigned short res)
+{
+  for (int i = 0; i < 0xfff; ++i)  /* Fixed 0xfff iterations.  */
+    res += x[i];  /* Accumulates onto the caller-supplied RES.  */
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+min_loop (unsigned short *x, unsigned short res)
+{
+  for (int i = 0; i < 0xfff; ++i)  /* Fixed 0xfff iterations.  */
+    res = res < x[i] ? res : x[i];  /* Minimum of RES and the elements.  */
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+max_loop (unsigned short *x, unsigned short res)
+{
+  for (int i = 0; i < 0xfff; ++i)  /* Fixed 0xfff iterations.  */
+    res = res > x[i] ? res : x[i];  /* Maximum of RES and the elements.  */
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+and_loop (unsigned short *x, unsigned short res)
+{
+  for (int i = 0; i < 0xfff; ++i)  /* Fixed 0xfff iterations.  */
+    res &= x[i];  /* ANDs each element into the caller-supplied RES.  */
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+or_loop (unsigned short *x, unsigned short res)
+{
+  for (int i = 0; i < 0xfff; ++i)  /* Fixed 0xfff iterations.  */
+    res |= x[i];  /* ORs each element into the caller-supplied RES.  */
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+eor_loop (unsigned short *x, unsigned short res)
+{
+  for (int i = 0; i < 0xfff; ++i)  /* Fixed 0xfff iterations.  */
+    res ^= x[i];  /* XORs each element into the caller-supplied RES.  */
+  return res;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c
new file mode 100644
index 00000000000..5b41560d2ef
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c
@@ -0,0 +1,34 @@
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_11.c"
+
+int
+main (void)  /* Run the reduc_11.c loops on known data; abort on any wrong result.  */
+{
+  unsigned short x[N];
+  for (int i = 0; i < N; ++i)
+    x[i] = (i + 1) * (i + 2);  /* Product is truncated to unsigned short on assignment.  */
+
+  if (add_loop (x, 42) != 20522
+      || max_loop (x, 65503) != 65504  /* Seed just below the expected array maximum...  */
+      || max_loop (x, 65505) != 65505  /* ...and just above it, so the seed must win.  */
+      || or_loop (x, 0) != 0xfffe
+      || or_loop (x, 1) != 0xffff  /* Bit 0 can only come from the seed.  */
+      || eor_loop (x, 0) != 0xa000
+      || eor_loop (x, 0xbfff) != 0x1fff)
+    __builtin_abort ();
+
+  for (int i = 0; i < N; ++i)
+    x[i] = ~x[i];  /* Complement the data so the min/and checks mirror the max/or ones.  */
+
+  if (min_loop (x, 32) != 31  /* Seed just above the expected array minimum...  */
+      || min_loop (x, 30) != 30  /* ...and just below it, so the seed must win.  */
+      || and_loop (x, 0xff) != 1
+      || and_loop (x, 0) != 0)
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c
new file mode 100644
index 00000000000..d32b81a61bc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c
@@ -0,0 +1,71 @@
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+unsigned short __attribute__((noipa))  /* Return RES plus x[0] ... x[n-1], wrapped to unsigned short.  */
+add_loop (unsigned short *x, int n, unsigned short res)
+{
+  for (int i = 0; i < n; ++i)  /* Runtime trip count; no iterations when n <= 0.  */
+    res += x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))  /* Return the minimum of RES and x[0] ... x[n-1].  */
+min_loop (unsigned short *x, int n, unsigned short res)
+{
+  for (int i = 0; i < n; ++i)  /* Runtime trip count; no iterations when n <= 0.  */
+    res = res < x[i] ? res : x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))  /* Return the maximum of RES and x[0] ... x[n-1].  */
+max_loop (unsigned short *x, int n, unsigned short res)
+{
+  for (int i = 0; i < n; ++i)  /* Runtime trip count; no iterations when n <= 0.  */
+    res = res > x[i] ? res : x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))  /* Return RES ANDed with x[0] ... x[n-1].  */
+and_loop (unsigned short *x, int n, unsigned short res)
+{
+  for (int i = 0; i < n; ++i)  /* Runtime trip count; no iterations when n <= 0.  */
+    res &= x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))  /* Return RES ORed with x[0] ... x[n-1].  */
+or_loop (unsigned short *x, int n, unsigned short res)
+{
+  for (int i = 0; i < n; ++i)  /* Runtime trip count; no iterations when n <= 0.  */
+    res |= x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))  /* Return RES XORed with x[0] ... x[n-1].  */
+eor_loop (unsigned short *x, int n, unsigned short res)
+{
+  for (int i = 0; i < n; ++i)  /* Runtime trip count; no iterations when n <= 0.  */
+    res ^= x[i];
+  return res;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c
new file mode 100644
index 00000000000..929b81a9705
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c
@@ -0,0 +1,66 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_12.c"
+
+int
+main (void)  /* Run the reduc_12.c loops for several lengths; abort on any wrong result.  */
+{
+  unsigned short x[N];
+  for (int i = 0; i < N; ++i)
+    x[i] = (i + 1) * (i + 2);  /* Product is truncated to unsigned short on assignment.  */
+
+  if (add_loop (x, 0, 10) != 10  /* n == 0: the seed must come back unchanged.  */
+      || add_loop (x, 11, 42) != 614
+      || add_loop (x, 0x100, 84) != 22100
+      || add_loop (x, 0xfff, 20) != 20500
+      || max_loop (x, 0, 10) != 10
+      || max_loop (x, 11, 131) != 132  /* Seeds sit one below / one above the expected maximum.  */
+      || max_loop (x, 11, 133) != 133
+      || max_loop (x, 0x100, 65279) != 65280
+      || max_loop (x, 0x100, 65281) != 65281
+      || max_loop (x, 0xfff, 65503) != 65504
+      || max_loop (x, 0xfff, 65505) != 65505
+      || or_loop (x, 0, 0x71) != 0x71
+      || or_loop (x, 11, 0) != 0xfe
+      || or_loop (x, 11, 0xb3c) != 0xbfe
+      || or_loop (x, 0x80, 0) != 0x7ffe
+      || or_loop (x, 0x80, 1) != 0x7fff
+      || or_loop (x, 0xb4, 0) != 0x7ffe  /* x[0xb4] = 181 * 182 = 32942 is the first element with bit 15 set...  */
+      || or_loop (x, 0xb4, 1) != 0x7fff
+      || or_loop (x, 0xb5, 0) != 0xfffe  /* ...so only n >= 0xb5 picks that bit up.  */
+      || or_loop (x, 0xb5, 1) != 0xffff
+      || eor_loop (x, 0, 0x3e) != 0x3e
+      || eor_loop (x, 11, 0) != 0xe8
+      || eor_loop (x, 11, 0x1ff) != 0x117
+      || eor_loop (x, 0x100, 0) != 0xcf00
+      || eor_loop (x, 0x100, 0xeee) != 0xc1ee
+      || eor_loop (x, 0xfff, 0) != 0xa000
+      || eor_loop (x, 0xfff, 0x8888) != 0x2888)
+    __builtin_abort ();
+
+  for (int i = 0; i < N; ++i)
+    x[i] = ~x[i];  /* Complement the data so the min/and checks mirror the max/or ones.  */
+
+  if (min_loop (x, 0, 10000) != 10000  /* n == 0: the seed must come back unchanged.  */
+      || min_loop (x, 11, 65404) != 65403  /* Seeds sit one above / one below the expected minimum.  */
+      || min_loop (x, 11, 65402) != 65402
+      || min_loop (x, 0x100, 256) != 255
+      || min_loop (x, 0x100, 254) != 254
+      || min_loop (x, 0xfff, 32) != 31
+      || min_loop (x, 0xfff, 30) != 30
+      || and_loop (x, 0, 0x1234) != 0x1234
+      || and_loop (x, 11, 0xffff) != 0xff01
+      || and_loop (x, 11, 0xcdef) != 0xcd01
+      || and_loop (x, 0x80, 0xffff) != 0x8001
+      || and_loop (x, 0x80, 0xfffe) != 0x8000
+      || and_loop (x, 0xb4, 0xffff) != 0x8001  /* After complementing, x[0xb4] is the first element with bit 15 clear...  */
+      || and_loop (x, 0xb4, 0xfffe) != 0x8000
+      || and_loop (x, 0xb5, 0xffff) != 1  /* ...so only n >= 0xb5 drops that bit.  */
+      || and_loop (x, 0xb5, 0xfffe) != 0)
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c
new file mode 100644
index 00000000000..ce2b8f2fcdc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c
@@ -0,0 +1,101 @@
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+void __attribute__((noipa))  /* Two interleaved sums: res[0] += even-indexed x, res[1] += odd-indexed x.  */
+add_loop (unsigned int *x, unsigned int *res)
+{
+  unsigned int res0 = res[0];  /* Both accumulators start from the caller's values.  */
+  unsigned int res1 = res[1];
+  for (int i = 0; i < 0x7ff; ++i)  /* 0x7ff pairs, i.e. x[0] ... x[0xffd].  */
+    {
+      res0 += x[i * 2];
+      res1 += x[i * 2 + 1];
+    }
+  res[0] = res0;  /* Results are written back in place.  */
+  res[1] = res1;
+}
+
+void __attribute__((noipa))  /* Two interleaved minima: res[0] over even-indexed x, res[1] over odd-indexed x.  */
+min_loop (unsigned int *x, unsigned int *res)
+{
+  unsigned int res0 = res[0];  /* Both accumulators start from the caller's values.  */
+  unsigned int res1 = res[1];
+  for (int i = 0; i < 0x7ff; ++i)  /* 0x7ff pairs, i.e. x[0] ... x[0xffd].  */
+    {
+      res0 = res0 < x[i * 2] ? res0 : x[i * 2];
+      res1 = res1 < x[i * 2 + 1] ? res1 : x[i * 2 + 1];
+    }
+  res[0] = res0;  /* Results are written back in place.  */
+  res[1] = res1;
+}
+
+void __attribute__((noipa))  /* Two interleaved maxima: res[0] over even-indexed x, res[1] over odd-indexed x.  */
+max_loop (unsigned int *x, unsigned int *res)
+{
+  unsigned int res0 = res[0];  /* Both accumulators start from the caller's values.  */
+  unsigned int res1 = res[1];
+  for (int i = 0; i < 0x7ff; ++i)  /* 0x7ff pairs, i.e. x[0] ... x[0xffd].  */
+    {
+      res0 = res0 > x[i * 2] ? res0 : x[i * 2];
+      res1 = res1 > x[i * 2 + 1] ? res1 : x[i * 2 + 1];
+    }
+  res[0] = res0;  /* Results are written back in place.  */
+  res[1] = res1;
+}
+
+void __attribute__((noipa))  /* Two interleaved ANDs: res[0] &= even-indexed x, res[1] &= odd-indexed x.  */
+and_loop (unsigned int *x, unsigned int *res)
+{
+  unsigned int res0 = res[0];  /* Both accumulators start from the caller's values.  */
+  unsigned int res1 = res[1];
+  for (int i = 0; i < 0x7ff; ++i)  /* 0x7ff pairs, i.e. x[0] ... x[0xffd].  */
+    {
+      res0 &= x[i * 2];
+      res1 &= x[i * 2 + 1];
+    }
+  res[0] = res0;  /* Results are written back in place.  */
+  res[1] = res1;
+}
+
+void __attribute__((noipa))  /* Two interleaved ORs: res[0] |= even-indexed x, res[1] |= odd-indexed x.  */
+or_loop (unsigned int *x, unsigned int *res)
+{
+  unsigned int res0 = res[0];  /* Both accumulators start from the caller's values.  */
+  unsigned int res1 = res[1];
+  for (int i = 0; i < 0x7ff; ++i)  /* 0x7ff pairs, i.e. x[0] ... x[0xffd].  */
+    {
+      res0 |= x[i * 2];
+      res1 |= x[i * 2 + 1];
+    }
+  res[0] = res0;  /* Results are written back in place.  */
+  res[1] = res1;
+}
+
+void __attribute__((noipa))  /* Two interleaved XORs: res[0] ^= even-indexed x, res[1] ^= odd-indexed x.  */
+eor_loop (unsigned int *x, unsigned int *res)
+{
+  unsigned int res0 = res[0];  /* Both accumulators start from the caller's values.  */
+  unsigned int res1 = res[1];
+  for (int i = 0; i < 0x7ff; ++i)  /* 0x7ff pairs, i.e. x[0] ... x[0xffd].  */
+    {
+      res0 ^= x[i * 2];
+      res1 ^= x[i * 2 + 1];
+    }
+  res[0] = res0;  /* Results are written back in place.  */
+  res[1] = res1;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c
new file mode 100644
index 00000000000..5514d8d6b3b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c
@@ -0,0 +1,61 @@
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_13.c"
+
+int
+main (void)  /* Run the reduc_13.c two-accumulator loops on known data; abort on any wrong result.  */
+{
+  unsigned int x[N];
+  for (int i = 0; i < N; ++i)
+    x[i] = ((i + 1) * (i + 2)) & 0xfffff;  /* Keep only the low 20 bits of each product.  */
+
+  unsigned int add_res[2] = { 42, 1111 };  /* Distinct seeds, so a swapped lane would be detected.  */
+  add_loop (x, add_res);
+  if (add_res[0] != 968538154
+      || add_res[1] != 964340823)
+    __builtin_abort ();
+
+  unsigned int max_res1[2] = { 0, 0 };  /* Seeds below the data: results must come from the array.  */
+  max_loop (x, max_res1);
+  if (max_res1[0] != 1048150
+      || max_res1[1] != 1045506)
+    __builtin_abort ();
+
+  unsigned int max_res2[2] = { 1048151, 1045507 };  /* Seeds one above the expected maxima: seeds must win.  */
+  max_loop (x, max_res2);
+  if (max_res2[0] != 1048151
+      || max_res2[1] != 1045507)
+    __builtin_abort ();
+
+  unsigned int or_res[2] = { 0x1000000, 0x2000000 };  /* Seed bits lie above the 20-bit data and must survive.  */
+  or_loop (x, or_res);
+  if (or_res[0] != 0x10ffffe
+      || or_res[1] != 0x20ffffe)
+    __builtin_abort ();
+
+  unsigned int eor_res[2] = { 0x1000000, 0x2000000 };  /* Seed bits lie above the 20-bit data and must survive.  */
+  eor_loop (x, eor_res);
+  if (eor_res[0] != 0x1010000
+      || eor_res[1] != 0x20b5000)
+    __builtin_abort ();
+
+  for (int i = 0; i < N; ++i)
+    x[i] = ~x[i] & 0xfffff;  /* Complement within the same 20 bits for the min checks.  */
+
+  unsigned int min_res1[2] = { 500, 4000 };  /* Seeds above the expected minima: results must come from the array.  */
+  min_loop (x, min_res1);
+  if (min_res1[0] != 425
+      || min_res1[1] != 3069)
+    __builtin_abort ();
+
+  unsigned int min_res2[2] = { 424, 3068 };  /* Seeds one below the expected minima: seeds must win.  */
+  min_loop (x, min_res2);
+  if (min_res2[0] != 424
+      || min_res2[1] != 3068)
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c
new file mode 100644
index 00000000000..3be611e4b37
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c
@@ -0,0 +1,107 @@
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+void __attribute__((noipa))  /* Two interleaved sums over N pairs: res[0] += x[2i], res[1] += x[2i+1].  */
+add_loop (unsigned int *x, int n, unsigned int *res)
+{
+  unsigned int res0 = res[0];  /* Both accumulators start from the caller's values.  */
+  unsigned int res1 = res[1];
+  for (int i = 0; i < n; ++i)  /* Runtime trip count; reads x[0] ... x[2n-1], nothing when n <= 0.  */
+    {
+      res0 += x[i * 2];
+      res1 += x[i * 2 + 1];
+    }
+  res[0] = res0;  /* Results are written back in place.  */
+  res[1] = res1;
+}
+
+void __attribute__((noipa))  /* Two interleaved minima over N pairs: res[0] vs x[2i], res[1] vs x[2i+1].  */
+min_loop (unsigned int *x, int n, unsigned int *res)
+{
+  unsigned int res0 = res[0];  /* Both accumulators start from the caller's values.  */
+  unsigned int res1 = res[1];
+  for (int i = 0; i < n; ++i)  /* Runtime trip count; reads x[0] ... x[2n-1], nothing when n <= 0.  */
+    {
+      res0 = res0 < x[i * 2] ? res0 : x[i * 2];
+      res1 = res1 < x[i * 2 + 1] ? res1 : x[i * 2 + 1];
+    }
+  res[0] = res0;  /* Results are written back in place.  */
+  res[1] = res1;
+}
+
+void __attribute__((noipa))  /* Two interleaved maxima over N pairs: res[0] vs x[2i], res[1] vs x[2i+1].  */
+max_loop (unsigned int *x, int n, unsigned int *res)
+{
+  unsigned int res0 = res[0];  /* Both accumulators start from the caller's values.  */
+  unsigned int res1 = res[1];
+  for (int i = 0; i < n; ++i)  /* Runtime trip count; reads x[0] ... x[2n-1], nothing when n <= 0.  */
+    {
+      res0 = res0 > x[i * 2] ? res0 : x[i * 2];
+      res1 = res1 > x[i * 2 + 1] ? res1 : x[i * 2 + 1];
+    }
+  res[0] = res0;  /* Results are written back in place.  */
+  res[1] = res1;
+}
+
+void __attribute__((noipa))  /* Two interleaved ANDs over N pairs: res[0] &= x[2i], res[1] &= x[2i+1].  */
+and_loop (unsigned int *x, int n, unsigned int *res)
+{
+  unsigned int res0 = res[0];  /* Both accumulators start from the caller's values.  */
+  unsigned int res1 = res[1];
+  for (int i = 0; i < n; ++i)  /* Runtime trip count; reads x[0] ... x[2n-1], nothing when n <= 0.  */
+    {
+      res0 &= x[i * 2];
+      res1 &= x[i * 2 + 1];
+    }
+  res[0] = res0;  /* Results are written back in place.  */
+  res[1] = res1;
+}
+
+void __attribute__((noipa))  /* Two interleaved ORs over N pairs: res[0] |= x[2i], res[1] |= x[2i+1].  */
+or_loop (unsigned int *x, int n, unsigned int *res)
+{
+  unsigned int res0 = res[0];  /* Both accumulators start from the caller's values.  */
+  unsigned int res1 = res[1];
+  for (int i = 0; i < n; ++i)  /* Runtime trip count; reads x[0] ... x[2n-1], nothing when n <= 0.  */
+    {
+      res0 |= x[i * 2];
+      res1 |= x[i * 2 + 1];
+    }
+  res[0] = res0;  /* Results are written back in place.  */
+  res[1] = res1;
+}
+
+void __attribute__((noipa))  /* Two interleaved XORs over N pairs: res[0] ^= x[2i], res[1] ^= x[2i+1].  */
+eor_loop (unsigned int *x, int n, unsigned int *res)
+{
+  unsigned int res0 = res[0];  /* Both accumulators start from the caller's values.  */
+  unsigned int res1 = res[1];
+  for (int i = 0; i < n; ++i)  /* Runtime trip count; reads x[0] ... x[2n-1], nothing when n <= 0.  */
+    {
+      res0 ^= x[i * 2];
+      res1 ^= x[i * 2 + 1];
+    }
+  res[0] = res0;  /* Results are written back in place.  */
+  res[1] = res1;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\t} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tuminv\t} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\t} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tandv\t} 2 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torv\t} 2 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teorv\t} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c
new file mode 100644
index 00000000000..ccaa770e9b2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c
@@ -0,0 +1,187 @@
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_14.c"
+
+int
+main (void)  /* Run the reduc_14.c two-accumulator loops for several lengths; abort on any wrong result.  */
+{
+  unsigned int x[N];
+  for (int i = 0; i < N; ++i)
+    x[i] = ((i + 1) * (i + 2)) & 0xfffff;  /* Keep only the low 20 bits of each product.  */
+
+  unsigned int add_res1[2] = { 11, 22 };
+  add_loop (x, 0, add_res1);  /* n == 0: both seeds must come back unchanged.  */
+  if (add_res1[0] != 11
+      || add_res1[1] != 22)
+    __builtin_abort ();
+
+  unsigned int add_res2[2] = { 10, 20 };
+  add_loop (x, 11, add_res2);
+  if (add_res2[0] != 1902
+      || add_res2[1] != 2176)
+    __builtin_abort ();
+
+  unsigned int add_res3[2] = { 15, 30 };
+  add_loop (x, 0x100, add_res3);
+  if (add_res3[0] != 22435087
+      || add_res3[1] != 22566686)
+    __builtin_abort ();
+
+  unsigned int add_res4[2] = { 100, 200 };
+  add_loop (x, 0x11f, add_res4);
+  if (add_res4[0] != 31602244
+      || add_res4[1] != 31767656)
+    __builtin_abort ();
+
+  unsigned int max_res1[2] = { 461, 500 };  /* Seeds below the expected maxima: results come from the array.  */
+  max_loop (x, 11, max_res1);
+  if (max_res1[0] != 462
+      || max_res1[1] != 506)
+    __builtin_abort ();
+
+  unsigned int max_res2[2] = { 463, 507 };  /* Seeds one above the expected maxima: seeds must win.  */
+  max_loop (x, 11, max_res2);
+  if (max_res2[0] != 463
+      || max_res2[1] != 507)
+    __builtin_abort ();
+
+  unsigned int max_res3[2] = { 1000000, 1000000 };
+  max_loop (x, 0x200, max_res3);
+  if (max_res3[0] != 1047552
+      || max_res3[1] != 1045506)
+    __builtin_abort ();
+
+  unsigned int max_res4[2] = { 1047553, 1045507 };  /* Seeds one above the expected maxima: seeds must win.  */
+  max_loop (x, 0x200, max_res4);
+  if (max_res4[0] != 1047553
+      || max_res4[1] != 1045507)
+    __builtin_abort ();
+
+  unsigned int max_res5[2] = { 300000, 30000 };
+  max_loop (x, 0x11f, max_res5);
+  if (max_res5[0] != 328902
+      || max_res5[1] != 330050)
+    __builtin_abort ();
+
+  unsigned int max_res6[2] = { 328903, 330051 };  /* Seeds one above the expected maxima: seeds must win.  */
+  max_loop (x, 0x11f, max_res6);
+  if (max_res6[0] != 328903
+      || max_res6[1] != 330051)
+    __builtin_abort ();
+
+  unsigned int or_res1[2] = { 11, 22 };
+  or_loop (x, 0, or_res1);  /* n == 0: both seeds must come back unchanged.  */
+  if (or_res1[0] != 11
+      || or_res1[1] != 22)
+    __builtin_abort ();
+
+  unsigned int or_res2[2] = { 0x200000, 0xe00000 };  /* Seed bits lie above the 20-bit data and must survive.  */
+  or_loop (x, 11, or_res2);
+  if (or_res2[0] != 0x2001fe
+      || or_res2[1] != 0xe001fe)
+    __builtin_abort ();
+
+  unsigned int or_res3[2] = { 0x800000, 0x700000 };
+  or_loop (x, 0x40, or_res3);
+  if (or_res3[0] != 0x803ffe
+      || or_res3[1] != 0x707ffe)
+    __builtin_abort ();
+
+  unsigned int or_res4[2] = { 0x100001, 0x300000 };  /* Bit 0 of the first result can only come from the seed.  */
+  or_loop (x, 0x4f, or_res4);
+  if (or_res4[0] != 0x107fff
+      || or_res4[1] != 0x307ffe)
+    __builtin_abort ();
+
+  unsigned int eor_res1[2] = { 11, 22 };
+  eor_loop (x, 0, eor_res1);  /* n == 0: both seeds must come back unchanged.  */
+  if (eor_res1[0] != 11
+      || eor_res1[1] != 22)
+    __builtin_abort ();
+
+  unsigned int eor_res2[2] = { 0x2000ff, 0xe000ff };
+  eor_loop (x, 11, eor_res2);
+  if (eor_res2[0] != 0x2001cf
+      || eor_res2[1] != 0xe000b7)
+    __builtin_abort ();
+
+  unsigned int eor_res3[2] = { 0x805000, 0x70f000 };
+  eor_loop (x, 0x100, eor_res3);
+  if (eor_res3[0] != 0x824200
+      || eor_res3[1] != 0x77dc00)
+    __builtin_abort ();
+
+  unsigned int eor_res4[2] = { 0x101201, 0x300f00 };
+  eor_loop (x, 0x11f, eor_res4);
+  if (eor_res4[0] != 0x178801
+      || eor_res4[1] != 0x337240)
+    __builtin_abort ();
+
+  for (int i = 0; i < N; ++i)
+    x[i] = ~x[i] & 0xfffff;  /* Complement within the same 20 bits for the min/and checks.  */
+
+  unsigned int min_res1[2] = { 1048200, 1048100 };  /* Seeds above the expected minima: results come from the array.  */
+  min_loop (x, 11, min_res1);
+  if (min_res1[0] != 1048113
+      || min_res1[1] != 1048069)
+    __builtin_abort ();
+
+  unsigned int min_res2[2] = { 1048112, 1048068 };  /* Seeds one below the expected minima: seeds must win.  */
+  min_loop (x, 11, min_res2);
+  if (min_res2[0] != 1048112
+      || min_res2[1] != 1048068)
+    __builtin_abort ();
+
+  unsigned int min_res3[2] = { 10000, 10000 };
+  min_loop (x, 0x200, min_res3);
+  if (min_res3[0] != 1023
+      || min_res3[1] != 3069)
+    __builtin_abort ();
+
+  unsigned int min_res4[2] = { 1022, 3068 };  /* Seeds one below the expected minima: seeds must win.  */
+  min_loop (x, 0x200, min_res4);
+  if (min_res4[0] != 1022
+      || min_res4[1] != 3068)
+    __builtin_abort ();
+
+  unsigned int min_res5[2] = { 719680, 718530 };
+  min_loop (x, 0x11f, min_res5);
+  if (min_res5[0] != 719673
+      || min_res5[1] != 718525)
+    __builtin_abort ();
+
+  unsigned int min_res6[2] = { 719672, 718524 };  /* Seeds one below the expected minima: seeds must win.  */
+  min_loop (x, 0x11f, min_res6);
+  if (min_res6[0] != 719672
+      || min_res6[1] != 718524)
+    __builtin_abort ();
+
+  unsigned int and_res1[2] = { 11, 22 };
+  and_loop (x, 0, and_res1);  /* n == 0: both seeds must come back unchanged.  */
+  if (and_res1[0] != 11
+      || and_res1[1] != 22)
+    __builtin_abort ();
+
+  unsigned int and_res2[2] = { 0xf5cff, 0xf78ff };
+  and_loop (x, 11, and_res2);
+  if (and_res2[0] != 0xf5c01
+      || and_res2[1] != 0xf7801)
+    __builtin_abort ();
+
+  unsigned int and_res3[2] = { 0x7efff, 0xecfff };
+  and_loop (x, 0x40, and_res3);
+  if (and_res3[0] != 0x7c001
+      || and_res3[1] != 0xe8001)
+    __builtin_abort ();
+
+  unsigned int and_res4[2] = { 0xffffff, 0xffffff };  /* Seed bits above bit 19 must be cleared by the 20-bit data.  */
+  and_loop (x, 0x4f, and_res4);
+  if (and_res4[0] != 0xf8001
+      || and_res4[1] != 0xf8001)
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c
new file mode 100644
index 00000000000..15b1ade30e2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c
@@ -0,0 +1,16 @@
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+int __attribute__((noipa))  /* Return RES plus the sum of x[0] ... x[2n-1], using a single accumulator.  */
+add_loop (int *x, int n, int res)
+{
+  for (int i = 0; i < n; ++i)  /* Runtime trip count; no iterations when n <= 0.  */
+    {
+      res += x[i * 2];  /* Both adds feed the same RES; presumably the "chained" SLP reduction case -- TODO confirm.  */
+      res += x[i * 2 + 1];
+    }
+  return res;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c
new file mode 100644
index 00000000000..3207fce5be3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c
@@ -0,0 +1,22 @@
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_15.c"
+
+int
+main (void)  /* Run the reduc_15.c add_loop for several lengths; abort on any wrong result.  */
+{
+  int x[N];
+  for (int i = 0; i < N; ++i)
+    x[i] = ((i + 1) * (i + 2)) & 0xfffff;  /* Keep only the low 20 bits of each product.  */
+
+  if (add_loop (x, 0, 33) != 33  /* n == 0: the seed must come back unchanged.  */
+      || add_loop (x, 11, 30) != 4078
+      || add_loop (x, 0x100, 45) != 45001773
+      || add_loop (x, 0x11f, 300) != 63369900)
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c
new file mode 100644
index 00000000000..b839821d6bb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c
@@ -0,0 +1,77 @@
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+unsigned short __attribute__((noipa))  /* Return the sum of x[0] ... x[0xffe], wrapped to unsigned short.  */
+add_loop (unsigned short *x)
+{
+  unsigned short res = 0;  /* 0 is the neutral start value for addition.  */
+  for (int i = 0; i < 0xfff; ++i)  /* Compile-time trip count of 0xfff (4095).  */
+    res += x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))  /* Return the minimum of x[0] ... x[0xffe].  */
+min_loop (unsigned short *x)
+{
+  unsigned short res = ~0;  /* ~0 converts to the largest unsigned short, the neutral start for min.  */
+  for (int i = 0; i < 0xfff; ++i)  /* Compile-time trip count of 0xfff (4095).  */
+    res = res < x[i] ? res : x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))  /* Return the maximum of x[0] ... x[0xffe].  */
+max_loop (unsigned short *x)
+{
+  unsigned short res = 0;  /* 0 is the neutral start value for an unsigned max.  */
+  for (int i = 0; i < 0xfff; ++i)  /* Compile-time trip count of 0xfff (4095).  */
+    res = res > x[i] ? res : x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))  /* Return the bitwise AND of x[0] ... x[0xffe].  */
+and_loop (unsigned short *x)
+{
+  unsigned short res = ~0;  /* All bits set is the neutral start value for AND.  */
+  for (int i = 0; i < 0xfff; ++i)  /* Compile-time trip count of 0xfff (4095).  */
+    res &= x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))  /* Return the bitwise OR of x[0] ... x[0xffe].  */
+or_loop (unsigned short *x)
+{
+  unsigned short res = 0;  /* 0 is the neutral start value for OR.  */
+  for (int i = 0; i < 0xfff; ++i)  /* Compile-time trip count of 0xfff (4095).  */
+    res |= x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))  /* Return the bitwise XOR of x[0] ... x[0xffe].  */
+eor_loop (unsigned short *x)
+{
+  unsigned short res = 0;  /* 0 is the neutral start value for XOR.  */
+  for (int i = 0; i < 0xfff; ++i)  /* Compile-time trip count of 0xfff (4095).  */
+    res ^= x[i];
+  return res;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c
new file mode 100644
index 00000000000..aa248f53eaa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c
@@ -0,0 +1,29 @@
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_9.c"
+
+int
+main (void)  /* Run the reduc_9.c loops on known data; abort on any wrong result.  */
+{
+  unsigned short x[N];
+  for (int i = 0; i < N; ++i)
+    x[i] = (i + 1) * (i + 2);  /* Product is truncated to unsigned short on assignment.  */
+
+  if (add_loop (x) != 20480
+      || max_loop (x) != 65504
+      || or_loop (x) != 0xfffe
+      || eor_loop (x) != 0xa000)
+    __builtin_abort ();
+
+  for (int i = 0; i < N; ++i)
+    x[i] = ~x[i];  /* Complement the data so the min/and checks mirror the max/or ones.  */
+
+  if (min_loop (x) != 31
+      || and_loop (x) != 1)
+    __builtin_abort ();
+
+  return 0;
+}

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 10/10] vect: Reuse reduction accumulators between loops
  2021-07-12 17:55         ` Richard Sandiford
@ 2021-07-13  6:09           ` Richard Biener
  0 siblings, 0 replies; 30+ messages in thread
From: Richard Biener @ 2021-07-13  6:09 UTC (permalink / raw)
  To: Richard Biener via Gcc-patches, Richard Biener, Richard Sandiford

On Mon, Jul 12, 2021 at 7:55 PM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> > On Fri, Jul 9, 2021 at 3:12 PM Richard Sandiford
> > <richard.sandiford@arm.com> wrote:
> >>
> >> Thanks for the review.
> >>
> >> Richard Biener <richard.guenther@gmail.com> writes:
> >> >> @@ -588,6 +600,23 @@ public:
> >> >>    /* Unrolling factor  */
> >> >>    poly_uint64 vectorization_factor;
> >> >>
> >> >> +  /* If this loop is an epilogue loop whose main loop can be skipped,
> >> >> +     MAIN_LOOP_EDGE is the edge from the main loop to this loop's
> >> >> +     preheader.  SKIP_MAIN_LOOP_EDGE is then the edge that skips the
> >> >> +     main loop and goes straight to this loop's preheader.
> >> >> +
> >> >> +     Both fields are null otherwise.  */
> >> >> +  edge main_loop_edge;
> >> >> +  edge skip_main_loop_edge;
> >> >> +
> >> >> +  /* If this loop is an epilogue loop that might be skipped after executing
> >> >> +     the main loop, this edge is the one that skips the epilogue.  */
> >> >> +  edge skip_this_loop_edge;
> >> >> +
> >> >> +  /* After vectorization, maps live-out SSA names to information about
> >> >> +     the reductions that generated them.  */
> >> >> +  hash_map<tree, vect_reusable_accumulator> reusable_accumulators;
> >> >
> >> > Is that the LC PHI node defs or the definition inside of the loop?
> >> > If the latter we could attach the info directly to its stmt-info?
> >>
> >> Ah, yeah, I should improve the comment there.  It's the vectoriser's
> >> replacement for the original LC PHI node, i.e. the final scalar result
> >> after the reduction has taken place.
> >
> > OK
> >
> >> >> @@ -1186,6 +1215,21 @@ public:
> >> >>    /* The vector type for performing the actual reduction.  */
> >> >>    tree reduc_vectype;
> >> >>
> >> >> +  /* If IS_REDUC_INFO is true and if the reduction is operating on N
> >> >> +     elements in parallel, this vector gives the initial values of these
> >> >> +     N elements.  */
> >> >
> >> > That's N scalar elements or N vector elements?  I suppose it's for
> >> > SLP reductions (rather than SLP reduction chains) and never non-SLP
> >> > reductions?
> >>
> >> Yeah, poor wording again, sorry.  I meant something closer to:
> >>
> >>   /* If IS_REDUC_INFO is true and if the vector code is performing
> >>      N scalar reductions in parallel, this vector gives the initial
> >>      scalar values of those N reductions.  */
> >>
> >> >> +  vec<tree> reduc_initial_values;
> >> >> +
> >> >> +  /* If IS_REDUC_INFO is true and if the reduction is operating on N
> >> >> +     elements in parallel, this vector gives the scalar result of each
> >> >> +     reduction.  */
> >> >> +  vec<tree> reduc_scalar_results;
> >>
> >> Same change here.
> >>
> >> >> […]
> >> >> diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
> >> >> index 2909e8a0fc3..b7b0523e3c8 100644
> >> >> --- a/gcc/tree-vect-loop-manip.c
> >> >> +++ b/gcc/tree-vect-loop-manip.c
> >> >> @@ -2457,6 +2457,31 @@ vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
> >> >>    return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
> >> >>  }
> >> >>
> >> >> +/* LOOP_VINFO is an epilogue loop and MAIN_LOOP_VALUE is available on exit
> >> >> +   from the corresponding main loop.  Return a value that is available in
> >> >> +   LOOP_VINFO's preheader, using SKIP_VALUE if the main loop is skipped.
> >> >> +   Passing a null SKIP_VALUE is equivalent to passing zero.  */
> >> >> +
> >> >> +tree
> >> >> +vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
> >> >> +                          tree skip_value)
> >> >> +{
> >> >> +  if (!loop_vinfo->main_loop_edge)
> >> >> +    return main_loop_value;
> >> >> +
> >> >> +  if (!skip_value)
> >> >> +    skip_value = build_zero_cst (TREE_TYPE (main_loop_value));
> >> >
> >> > shouldn't that be the initial value?
> >>
> >> For the current use case, the above two conditions are never true.
> >> I wrote it like this because I had a follow-on patch (which might
> >> not go anywhere) that needed this function for 0-based IVs.
> >>
> >> Maybe that's a bad risk/reward trade-off though.  Not having to pass
> >> zero makes things only slightly simpler for the follow-on patch,
> >> and I guess could be dangerous in other cases.
> >>
> >> Perhaps in that case though I should change loop_vinfo->main_loop_edge
> >> into a gcc_assert as well.
> >
> > Yeah, I think asserts (and comments in case it's because we don't handle
> > some specific cases yet) are better than possibly wrong behavior.
>
> OK.
>
> >> >> +  tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
> >> >> +  basic_block bb = loop_vinfo->main_loop_edge->dest;
> >> >> +  gphi *new_phi = create_phi_node (phi_result, bb);
> >> >> +  add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
> >> >> +              UNKNOWN_LOCATION);
> >> >> +  add_phi_arg (new_phi, skip_value,
> >> >> +              loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
> >> >> +  return phi_result;
> >> >> +}
> >> >> +
> >> >>  /* Function vect_do_peeling.
> >> >>
> >> >>     Input:
> >> >> […]
> >> >> @@ -4823,6 +4842,100 @@ info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
> >> >>    return stmt_info;
> >> >>  }
> >> >>
> >> >> +/* PHI is a reduction in LOOP_VINFO that we are going to vectorize using vector
> >> >> +   type VECTYPE.  See if LOOP_VINFO is an epilogue loop whose main loop had a
> >> >> +   matching reduction that we can build on.  Adjust REDUC_INFO and return true
> >> >> +   if so, otherwise return false.  */
> >> >> +
> >> >> +static bool
> >> >> +vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
> >> >> +                               stmt_vec_info reduc_info)
> >> >> +{
> >> >> +  loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
> >> >> +  if (!main_loop_vinfo)
> >> >> +    return false;
> >> >> +
> >> >> +  if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
> >> >> +    return false;
> >> >> +
> >> >> +  unsigned int num_phis = reduc_info->reduc_initial_values.length ();
> >> >> +  auto_vec<tree, 16> main_loop_results (num_phis);
> >> >> +  auto_vec<tree, 16> initial_values (num_phis);
> >> >> +  if (edge main_loop_edge = loop_vinfo->main_loop_edge)
> >> >> +    {
> >> >> +      /* The epilogue loop can be entered either from the main loop or
> >> >> +        from an earlier guard block.  */
> >> >> +      edge skip_edge = loop_vinfo->skip_main_loop_edge;
> >> >> +      for (tree incoming_value : reduc_info->reduc_initial_values)
> >> >> +       {
> >> >> +         /* Look for:
> >> >> +
> >> >> +              INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
> >> >> +                                   INITIAL_VALUE(guard block)>.  */
> >> >> +         gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
> >> >> +
> >> >> +         gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
> >> >> +         gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
> >> >> +
> >> >> +         tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
> >> >> +         tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
> >> >> +
> >> >> +         main_loop_results.quick_push (from_main_loop);
> >> >> +         initial_values.quick_push (from_skip);
> >> >> +       }
> >> >> +    }
> >> >> +  else
> >> >> +    /* The main loop dominates the epilogue loop.  */
> >> >> +    main_loop_results.splice (reduc_info->reduc_initial_values);
> >> >> +
> >> >> +  /* See if the main loop has the kind of accumulator we need.  */
> >> >> +  vect_reusable_accumulator *accumulator
> >> >> +    = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
> >> >> +  if (!accumulator
> >> >> +      || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
> >> >> +      || !std::equal (main_loop_results.begin (), main_loop_results.end (),
> >> >> +                     accumulator->reduc_info->reduc_scalar_results.begin ()))
> >> >> +    return false;
> >> >> +
> >> >> +  /* For now, only handle the case in which both loops are operating on the
> >> >> +     same vector types.  In future we could reduce wider vectors to narrower
> >> >> +     ones as well.  */
> >> >> +  tree vectype = STMT_VINFO_VECTYPE (reduc_info);
> >> >> +  tree old_vectype = TREE_TYPE (accumulator->reduc_input);
> >> >> +  if (!useless_type_conversion_p (old_vectype, vectype))
> >> >
> >> > It should be indeed quite trivial to handle, likewise the case where we
> >> > have multiple PHIs - just reduce to a single input vector and have the
> >> > possibly multiple input vectors in the epilogue filled with neutral
> >> > elements.  I'll see if I can cook up stuff for this next week.
> >>
> >> Yeah, agreed.  The multi-vector epilogue case should be especially easy
> >> to handle, but it's not interesting for SVE as things stand, since:
> >>
> >> (a) non-SLP reductions use a single cycle for ncopies>1 (a misfeature
> >>     IMO -- on targets with wide pipelines we want exactly the opposite)
> >>
> >> (b) SLP reductions are limited to single vectors for variable-length targets.
> >>
> >> So it wasn't possible to trigger multiple epilogue vectors for the
> >> motivating SVE use case.
> >
> > OK, I see.  Once the series is in I'll look at creating testcases for x86_64.
> >
> >> >> […]
> >> >> @@ -5196,6 +5305,37 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
> >> >>        reduc_inputs.safe_push (single_input);
> >> >>      }
> >> >>
> >> >> +  tree orig_reduc_input = reduc_inputs[0];
> >> >> +
> >> >> +  /* If this loop is an epilogue loop that can be skipped after the
> >> >> +     main loop, we can only share a reduction operation between the
> >> >> +     main loop and the epilogue if we put it at the target of the
> >> >> +     skip edge.
> >> >
> >> > Do you have a testcase where we cannot do this?
> >>
> >> No, it's being defensive.  I wasn't sure how the epilogue code would
> >> evolve in future.
> >>
> >> >> +     We can still reuse accumulators if this check fails.  Doing so has
> >> >> +     the minor(?) benefit of making the epilogue loop's scalar result
> >> >> +     independent of the main loop's scalar result.  */
> >> >> +  bool unify_with_main_loop_p = false;
> >> >> +  if (reduc_info->reused_accumulator
> >> >> +      && loop_vinfo->skip_this_loop_edge
> >> >> +      && single_succ_p (exit_bb)
> >> >> +      && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
> >> >> +    {
> >> >> +      unify_with_main_loop_p = true;
> >> >> +
> >> >> +      basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
> >> >> +      reduc_inputs[0] = make_ssa_name (vectype);
> >> >> +      gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
> >> >> +      add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
> >> >> +                  UNKNOWN_LOCATION);
> >> >> +      add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
> >> >> +                  loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
> >> >> +      exit_gsi = gsi_after_labels (reduc_block);
> >> >> +    }
> >> >> +
> >> >> +  /* Shouldn't be used beyond this point.  */
> >> >> +  exit_bb = nullptr;
> >> >> +
> >> >>    if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
> >> >>        && reduc_fn != IFN_LAST)
> >> >>      {
> >> >> @@ -5819,6 +5958,12 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
> >> >>        scalar_results[0] = new_temp;
> >> >>      }
> >> >>
> >> >> +  /* Record this operation if it could be reused by the epilogue loop.  */
> >> >> +  if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
> >> >> +      && !double_reduc)
> >> >
> >> > what's the issue with double_reduc?
> >>
> >> Probably nothing TBH.  I haven't been able to construct a case that
> >> uses predicated double reductions with vect-partial-vector-usage=1,
> >> but that's probably a missed optimisation.
> >>
> >> There again, double reductions themselves seem to be hard to trigger
> >> now that we have loop interchange.  Is there a good way of testing
> >> them without -fno-loop-interchange?
> >
> > there are a bunch of testcases in gcc.dg/vect/vect-double-reduc-?.c,
> > I don't see how interchange avoids the double reduction, in fact when
> > doing interchange we no longer can apply outer loop vectorization (but
> > it's still a double reduction, just only inner loop vectorized).
> > But possibly we don't do epilogue vectorization for outer loop
> > vectorizations with reductions.
>
> Well, to take the vect-double-reduc-1.c loop:
>
>   for (k = 0; k < K; k++)
>     {
>       sum = 0;
>       for (j = 0; j < K; j++)
>         for (i = 0; i < K; i++)
>           sum += in[i+k][j] * coeff[i][j];
>
>       out[k] = sum;
>     }
>
> loop interchange converts this to:
>
>   for (k = 0; k < K; k++)
>     {
>       sum = 0;
>       for (i = 0; i < K; i++)
>         for (j = 0; j < K; j++)
>           sum += in[i+k][j] * coeff[i][j];
>
>       out[k] = sum;
>     }
>
> and then we vectorise the inner loop.
>
> I think in principle that's the right thing to do, since the double
> reduction is more like:
>
>       for (j1 = 0; j1 < K; j1 += vsize)
>         for (i = 0; i < K; i++)
>           for (j = j1; j < MIN (K, j1 + vsize); j++)
>             sum += in[i+k][j] * coeff[i][j];
>
> which isn't as nice an access pattern.

True, OTOH which version is faster probably depends on the inner
loop kernel and the number of memory streams involved.

> It would be good if we could sink the vector->scalar reduction
> in the interchanged form, but that feels like a separate optimisation,
> and could potentially happen regardless of whether we can vectorise
> any other code in the outer loop.  E.g. it could happen for:
>
>       for (i = 0; i < K; i++)
>         {
>           printf ("Hello, world!\n");
>           for (j = 0; j < K; j++)
>             sum += in[i+k][j] * coeff[i][j];
>         }

Indeed.

> > Oh, and of course vect.exp runs with -O2 -ftree-vectorize, avoiding
> > any of the high-level loop opts ...
>
> Ah, yeah, of course.  I was trying to use the vect-double-reduc*.c
> tests with -O3…
>
> How does this version look?  Changes from v1:
>
> - Fix comments in new tree-vectorizer.h fields.
> - Fix an out-of-date comment above vect_find_reusable_accumulator
> - Remove !double_reduc condition.
> - Make vect_get_main_loop_result specific to the case in which a phi
>   node is needed.
>
> Tested as above.

Looks good to me now, thus OK.

Thanks,
Richard.

> Thanks,
> Richard
>
> gcc/
>         * tree-vectorizer.h (vect_reusable_accumulator): New structure.
>         (_loop_vec_info::main_loop_edge): New field.
>         (_loop_vec_info::skip_main_loop_edge): Likewise.
>         (_loop_vec_info::skip_this_loop_edge): Likewise.
>         (_loop_vec_info::reusable_accumulators): Likewise.
>         (_stmt_vec_info::reduc_initial_values): Likewise.
>         (_stmt_vec_info::reduc_scalar_results): Likewise.
>         (_stmt_vec_info::reused_accumulator): Likewise.
>         (vect_get_main_loop_result): Declare.
>         * tree-vectorizer.c (vec_info::new_stmt_vec_info): Initialize
>         reduc_initial_values and reduc_scalar_results.
>         (vec_info::free_stmt_vec_info): Free reduc_initial_values and
>         reduc_scalar_results.
>         * tree-vect-loop-manip.c (vect_get_main_loop_result): New function.
>         (vect_do_peeling): Fill an epilogue loop's main_loop_edge,
>         skip_main_loop_edge and skip_this_loop_edge fields.
>         * tree-vect-loop.c (INCLUDE_ALGORITHM): Define.
>         (vect_emit_reduction_init_stmts): New function.
>         (get_initial_def_for_reduction): Use it.
>         (get_initial_defs_for_reduction): Likewise.  Change the vinfo
>         parameter to a loop_vec_info.
>         (vect_create_epilog_for_reduction): Store the scalar results
>         in the reduc_info.  If an epilogue loop is reusing an accumulator
>         from the main loop, and if the epilogue loop can also be skipped,
>         try to place the reduction code in the join block.  Record
>         accumulators that could potentially be reused by epilogue loops.
>         (vect_transform_cycle_phi): When vectorizing epilogue loops,
>         try to reuse accumulators from the main loop.  Record the initial
>         value in reduc_info for non-SLP reductions too.
>
> gcc/testsuite/
>         * gcc.target/aarch64/sve/reduc_9.c: New test.
>         * gcc.target/aarch64/sve/reduc_9_run.c: Likewise.
>         * gcc.target/aarch64/sve/reduc_10.c: Likewise.
>         * gcc.target/aarch64/sve/reduc_10_run.c: Likewise.
>         * gcc.target/aarch64/sve/reduc_11.c: Likewise.
>         * gcc.target/aarch64/sve/reduc_11_run.c: Likewise.
>         * gcc.target/aarch64/sve/reduc_12.c: Likewise.
>         * gcc.target/aarch64/sve/reduc_12_run.c: Likewise.
>         * gcc.target/aarch64/sve/reduc_13.c: Likewise.
>         * gcc.target/aarch64/sve/reduc_13_run.c: Likewise.
>         * gcc.target/aarch64/sve/reduc_14.c: Likewise.
>         * gcc.target/aarch64/sve/reduc_14_run.c: Likewise.
>         * gcc.target/aarch64/sve/reduc_15.c: Likewise.
>         * gcc.target/aarch64/sve/reduc_15_run.c: Likewise.
> ---
>  .../gcc.target/aarch64/sve/reduc_10.c         |  77 +++++
>  .../gcc.target/aarch64/sve/reduc_10_run.c     |  49 +++
>  .../gcc.target/aarch64/sve/reduc_11.c         |  71 ++++
>  .../gcc.target/aarch64/sve/reduc_11_run.c     |  34 ++
>  .../gcc.target/aarch64/sve/reduc_12.c         |  71 ++++
>  .../gcc.target/aarch64/sve/reduc_12_run.c     |  66 ++++
>  .../gcc.target/aarch64/sve/reduc_13.c         | 101 ++++++
>  .../gcc.target/aarch64/sve/reduc_13_run.c     |  61 ++++
>  .../gcc.target/aarch64/sve/reduc_14.c         | 107 ++++++
>  .../gcc.target/aarch64/sve/reduc_14_run.c     | 187 +++++++++++
>  .../gcc.target/aarch64/sve/reduc_15.c         |  16 +
>  .../gcc.target/aarch64/sve/reduc_15_run.c     |  22 ++
>  .../gcc.target/aarch64/sve/reduc_9.c          |  77 +++++
>  .../gcc.target/aarch64/sve/reduc_9_run.c      |  29 ++
>  gcc/tree-vect-loop-manip.c                    |  26 ++
>  gcc/tree-vect-loop.c                          | 307 ++++++++++++++----
>  gcc/tree-vectorizer.c                         |   4 +
>  gcc/tree-vectorizer.h                         |  56 +++-
>  18 files changed, 1297 insertions(+), 64 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c
>
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index e2fd3609fee..d825b0c3723 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -551,6 +551,18 @@ typedef auto_vec<rgroup_controls> vec_loop_lens;
>
>  typedef auto_vec<std::pair<data_reference*, tree> > drs_init_vec;
>
> +/* Information about a reduction accumulator from the main loop that could
> +   conceivably be reused as the input to a reduction in an epilogue loop.  */
> +struct vect_reusable_accumulator {
> +  /* The final value of the accumulator, which forms the input to the
> +     reduction operation.  */
> +  tree reduc_input;
> +
> +  /* The stmt_vec_info that describes the reduction (i.e. the one for
> +     which is_reduc_info is true).  */
> +  stmt_vec_info reduc_info;
> +};
> +
>  /*-----------------------------------------------------------------*/
>  /* Info on vectorized loops.                                       */
>  /*-----------------------------------------------------------------*/
> @@ -588,6 +600,26 @@ public:
>    /* Unrolling factor  */
>    poly_uint64 vectorization_factor;
>
> +  /* If this loop is an epilogue loop whose main loop can be skipped,
> +     MAIN_LOOP_EDGE is the edge from the main loop to this loop's
> +     preheader.  SKIP_MAIN_LOOP_EDGE is then the edge that skips the
> +     main loop and goes straight to this loop's preheader.
> +
> +     Both fields are null otherwise.  */
> +  edge main_loop_edge;
> +  edge skip_main_loop_edge;
> +
> +  /* If this loop is an epilogue loop that might be skipped after executing
> +     the main loop, this edge is the one that skips the epilogue.  */
> +  edge skip_this_loop_edge;
> +
> +  /* The vectorized form of a standard reduction replaces the original
> +     scalar code's final result (a loop-closed SSA PHI) with the result
> +     of a vector-to-scalar reduction operation.  After vectorization,
> +     this variable maps these vector-to-scalar results to information
> +     about the reductions that generated them.  */
> +  hash_map<tree, vect_reusable_accumulator> reusable_accumulators;
> +
>    /* Maximum runtime vectorization factor, or MAX_VECTORIZATION_FACTOR
>       if there is no particular limit.  */
>    unsigned HOST_WIDE_INT max_vectorization_factor;
> @@ -1186,6 +1218,23 @@ public:
>    /* The vector type for performing the actual reduction.  */
>    tree reduc_vectype;
>
> +  /* If IS_REDUC_INFO is true and if the vector code is performing
> +     N scalar reductions in parallel, this variable gives the initial
> +     scalar values of those N reductions.  */
> +  vec<tree> reduc_initial_values;
> +
> +  /* If IS_REDUC_INFO is true and if the vector code is performing
> +     N scalar reductions in parallel, this variable gives the vectorized code's
> +     final (scalar) result for each of those N reductions.  In other words,
> +     REDUC_SCALAR_RESULTS[I] replaces the original scalar code's loop-closed
> +     SSA PHI for reduction number I.  */
> +  vec<tree> reduc_scalar_results;
> +
> +  /* Only meaningful if IS_REDUC_INFO.  If non-null, the reduction is
> +     being performed by an epilogue loop and we have decided to reuse
> +     this accumulator from the main loop.  */
> +  vect_reusable_accumulator *reused_accumulator;
> +
>    /* Whether we force a single cycle PHI during reduction vectorization.  */
>    bool force_single_cycle;
>
> @@ -1382,12 +1431,6 @@ vect_phi_initial_value (gphi *phi)
>    return PHI_ARG_DEF_FROM_EDGE (phi, pe);
>  }
>
> -static inline tree
> -vect_phi_initial_value (stmt_vec_info stmt_info)
> -{
> -  return vect_phi_initial_value (as_a <gphi *> (stmt_info->stmt));
> -}
> -
>  /* Return true if STMT_INFO should produce a vector mask type rather than
>     a normal nonmask type.  */
>
> @@ -1818,6 +1861,7 @@ class loop *vect_loop_versioning (loop_vec_info, gimple *);
>  extern class loop *vect_do_peeling (loop_vec_info, tree, tree,
>                                     tree *, tree *, tree *, int, bool, bool,
>                                     tree *);
> +extern tree vect_get_main_loop_result (loop_vec_info, tree, tree);
>  extern void vect_prepare_for_masked_peels (loop_vec_info);
>  extern dump_user_location_t find_loop_location (class loop *);
>  extern bool vect_can_advance_ivs_p (loop_vec_info);
> diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
> index 9748043f3ee..f1035a83826 100644
> --- a/gcc/tree-vectorizer.c
> +++ b/gcc/tree-vectorizer.c
> @@ -694,6 +694,8 @@ vec_info::new_stmt_vec_info (gimple *stmt)
>    STMT_VINFO_SLP_VECT_ONLY (res) = false;
>    STMT_VINFO_SLP_VECT_ONLY_PATTERN (res) = false;
>    STMT_VINFO_VEC_STMTS (res) = vNULL;
> +  res->reduc_initial_values = vNULL;
> +  res->reduc_scalar_results = vNULL;
>
>    if (is_a <loop_vec_info> (this)
>        && gimple_code (stmt) == GIMPLE_PHI
> @@ -755,6 +757,8 @@ vec_info::free_stmt_vec_info (stmt_vec_info stmt_info)
>         release_ssa_name (lhs);
>      }
>
> +  stmt_info->reduc_initial_values.release ();
> +  stmt_info->reduc_scalar_results.release ();
>    STMT_VINFO_SIMD_CLONE_INFO (stmt_info).release ();
>    STMT_VINFO_VEC_STMTS (stmt_info).release ();
>    free (stmt_info);
> diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
> index 2909e8a0fc3..c29ffb3356c 100644
> --- a/gcc/tree-vect-loop-manip.c
> +++ b/gcc/tree-vect-loop-manip.c
> @@ -2457,6 +2457,28 @@ vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
>    return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
>  }
>
> +/* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
> +   Return a value that equals:
> +
> +   - MAIN_LOOP_VALUE when LOOP_VINFO is entered from the main loop and
> +   - SKIP_VALUE when the main loop is skipped.  */
> +
> +tree
> +vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
> +                          tree skip_value)
> +{
> +  gcc_assert (loop_vinfo->main_loop_edge);
> +
> +  tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
> +  basic_block bb = loop_vinfo->main_loop_edge->dest;
> +  gphi *new_phi = create_phi_node (phi_result, bb);
> +  add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
> +              UNKNOWN_LOCATION);
> +  add_phi_arg (new_phi, skip_value,
> +              loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
> +  return phi_result;
> +}
> +
>  /* Function vect_do_peeling.
>
>     Input:
> @@ -2986,6 +3008,8 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
>                                            skip_vector ? anchor : guard_bb,
>                                            prob_epilog.invert (),
>                                            irred_flag);
> +         if (vect_epilogues)
> +           epilogue_vinfo->skip_this_loop_edge = guard_e;
>           slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
>                                               single_exit (epilog));
>           /* Only need to handle basic block before epilog loop if it's not
> @@ -3057,6 +3081,8 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
>           add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
>                        UNKNOWN_LOCATION);
>           niters = PHI_RESULT (new_phi);
> +         epilogue_vinfo->main_loop_edge = update_e;
> +         epilogue_vinfo->skip_main_loop_edge = skip_e;
>         }
>
>        /* Set ADVANCE to the number of iterations performed by the previous
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index fe7e73f655f..8c27d75f889 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -19,6 +19,7 @@ You should have received a copy of the GNU General Public License
>  along with GCC; see the file COPYING3.  If not see
>  <http://www.gnu.org/licenses/>.  */
>
> +#define INCLUDE_ALGORITHM
>  #include "config.h"
>  #include "system.h"
>  #include "coretypes.h"
> @@ -823,6 +824,10 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
>      th (0),
>      versioning_threshold (0),
>      vectorization_factor (0),
> +    main_loop_edge (nullptr),
> +    skip_main_loop_edge (nullptr),
> +    skip_this_loop_edge (nullptr),
> +    reusable_accumulators (),
>      max_vectorization_factor (0),
>      mask_skip_niters (NULL_TREE),
>      rgroup_compare_type (NULL_TREE),
> @@ -4607,7 +4612,32 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
>                   prologue_cost, epilogue_cost);
>  }
>
> +/* SEQ is a sequence of instructions that initialize the reduction
> +   described by REDUC_INFO.  Emit them in the appropriate place.  */
>
> +static void
> +vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
> +                               stmt_vec_info reduc_info, gimple *seq)
> +{
> +  if (reduc_info->reused_accumulator)
> +    {
> +      /* When reusing an accumulator from the main loop, we only need
> +        initialization instructions if the main loop can be skipped.
> +        In that case, emit the initialization instructions at the end
> +        of the guard block that does the skip.  */
> +      edge skip_edge = loop_vinfo->skip_main_loop_edge;
> +      gcc_assert (skip_edge);
> +      gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
> +      gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
> +    }
> +  else
> +    {
> +      /* The normal case: emit the initialization instructions on the
> +        preheader edge.  */
> +      class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> +      gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
> +    }
> +}
>
>  /* Function get_initial_def_for_reduction
>
> @@ -4675,36 +4705,30 @@ get_initial_def_for_reduction (loop_vec_info loop_vinfo,
>      }
>
>    if (stmts)
> -    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
> +    vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
>    return init_def;
>  }
>
> -/* Get at the initial defs for the reduction PHIs for REDUC_INFO, whose
> -   associated SLP node is SLP_NODE.  NUMBER_OF_VECTORS is the number of vector
> -   defs to create.  If NEUTRAL_OP is nonnull, introducing extra elements of
> -   that value will not change the result.  */
> +/* Get at the initial defs for the reduction PHIs for REDUC_INFO,
> +   which performs a reduction involving GROUP_SIZE scalar statements.
> +   NUMBER_OF_VECTORS is the number of vector defs to create.  If NEUTRAL_OP
> +   is nonnull, introducing extra elements of that value will not change the
> +   result.  */
>
>  static void
> -get_initial_defs_for_reduction (vec_info *vinfo,
> +get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
>                                 stmt_vec_info reduc_info,
> -                               slp_tree slp_node,
>                                 vec<tree> *vec_oprnds,
>                                 unsigned int number_of_vectors,
> -                               bool reduc_chain, tree neutral_op)
> +                               unsigned int group_size, tree neutral_op)
>  {
> -  vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
> +  vec<tree> &initial_values = reduc_info->reduc_initial_values;
>    unsigned HOST_WIDE_INT nunits;
>    unsigned j, number_of_places_left_in_vector;
>    tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
> -  unsigned int group_size = stmts.length ();
>    unsigned int i;
> -  class loop *loop;
> -
> -  loop = (gimple_bb (reduc_info->stmt))->loop_father;
> -  gcc_assert (loop);
> -  edge pe = loop_preheader_edge (loop);
>
> -  gcc_assert (!reduc_chain || neutral_op);
> +  gcc_assert (group_size == initial_values.length () || neutral_op);
>
>    /* NUMBER_OF_COPIES is the number of times we need to use the same values in
>       created vectors. It is greater than 1 if unrolling is performed.
> @@ -4734,18 +4758,13 @@ get_initial_defs_for_reduction (vec_info *vinfo,
>      {
>        tree op;
>        i = j % group_size;
> -      stmt_vec_info stmt_vinfo = stmts[i];
>
>        /* Get the def before the loop.  In reduction chain we have only
>          one initial value.  Else we have as many as PHIs in the group.  */
> -      if (reduc_chain)
> -       op = j != 0 ? neutral_op : vect_phi_initial_value (stmt_vinfo);
> -      else if (((vec_oprnds->length () + 1) * nunits
> -               - number_of_places_left_in_vector >= group_size)
> -              && neutral_op)
> +      if (i >= initial_values.length () || (j > i && neutral_op))
>         op = neutral_op;
>        else
> -       op = vect_phi_initial_value (stmt_vinfo);
> +       op = initial_values[i];
>
>        /* Create 'vect_ = {op0,op1,...,opn}'.  */
>        number_of_places_left_in_vector--;
> @@ -4781,8 +4800,8 @@ get_initial_defs_for_reduction (vec_info *vinfo,
>             {
>               /* First time round, duplicate ELTS to fill the
>                  required number of vectors.  */
> -             duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
> -                                       number_of_vectors, *vec_oprnds);
> +             duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
> +                                       elts, number_of_vectors, *vec_oprnds);
>               break;
>             }
>           vec_oprnds->quick_push (init);
> @@ -4794,7 +4813,7 @@ get_initial_defs_for_reduction (vec_info *vinfo,
>         }
>      }
>    if (ctor_seq != NULL)
> -    gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
> +    vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
>  }
>
>  /* For a statement STMT_INFO taking part in a reduction operation return
> @@ -4823,6 +4842,99 @@ info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
>    return stmt_info;
>  }
>
> +/* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
> +   REDUC_INFO can build on.  Adjust REDUC_INFO and return true if so, otherwise
> +   return false.  */
> +
> +static bool
> +vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
> +                               stmt_vec_info reduc_info)
> +{
> +  loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
> +  if (!main_loop_vinfo)
> +    return false;
> +
> +  if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
> +    return false;
> +
> +  unsigned int num_phis = reduc_info->reduc_initial_values.length ();
> +  auto_vec<tree, 16> main_loop_results (num_phis);
> +  auto_vec<tree, 16> initial_values (num_phis);
> +  if (edge main_loop_edge = loop_vinfo->main_loop_edge)
> +    {
> +      /* The epilogue loop can be entered either from the main loop or
> +        from an earlier guard block.  */
> +      edge skip_edge = loop_vinfo->skip_main_loop_edge;
> +      for (tree incoming_value : reduc_info->reduc_initial_values)
> +       {
> +         /* Look for:
> +
> +              INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
> +                                   INITIAL_VALUE(guard block)>.  */
> +         gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
> +
> +         gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
> +         gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
> +
> +         tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
> +         tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
> +
> +         main_loop_results.quick_push (from_main_loop);
> +         initial_values.quick_push (from_skip);
> +       }
> +    }
> +  else
> +    /* The main loop dominates the epilogue loop.  */
> +    main_loop_results.splice (reduc_info->reduc_initial_values);
> +
> +  /* See if the main loop has the kind of accumulator we need.  */
> +  vect_reusable_accumulator *accumulator
> +    = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
> +  if (!accumulator
> +      || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
> +      || !std::equal (main_loop_results.begin (), main_loop_results.end (),
> +                     accumulator->reduc_info->reduc_scalar_results.begin ()))
> +    return false;
> +
> +  /* For now, only handle the case in which both loops are operating on the
> +     same vector types.  In future we could reduce wider vectors to narrower
> +     ones as well.  */
> +  tree vectype = STMT_VINFO_VECTYPE (reduc_info);
> +  tree old_vectype = TREE_TYPE (accumulator->reduc_input);
> +  if (!useless_type_conversion_p (old_vectype, vectype))
> +    return false;
> +
> +  /* Non-SLP reductions might apply an adjustment after the reduction
> +     operation, in order to simplify the initialization of the accumulator.
> +     If the epilogue loop carries on from where the main loop left off,
> +     it should apply the same adjustment to the final reduction result.
> +
> +     If the epilogue loop can also be entered directly (rather than via
> +     the main loop), we need to be able to handle that case in the same way,
> +     with the same adjustment.  (In principle we could add a PHI node
> +     to select the correct adjustment, but in practice that shouldn't be
> +     necessary.)  */
> +  tree main_adjustment
> +    = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
> +  if (loop_vinfo->main_loop_edge && main_adjustment)
> +    {
> +      gcc_assert (num_phis == 1);
> +      tree initial_value = initial_values[0];
> +      /* Check that we can use INITIAL_VALUE as the adjustment and
> +        initialize the accumulator with a neutral value instead.  */
> +      if (!operand_equal_p (initial_value, main_adjustment))
> +       return false;
> +      tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
> +      initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
> +                                                   code, initial_value);
> +    }
> +  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
> +  reduc_info->reduc_initial_values.truncate (0);
> +  reduc_info->reduc_initial_values.splice (initial_values);
> +  reduc_info->reused_accumulator = accumulator;
> +  return true;
> +}
> +
>  /* Function vect_create_epilog_for_reduction
>
>     Create code at the loop-epilog to finalize the result of a reduction
> @@ -4915,7 +5027,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>    gimple *use_stmt;
>    auto_vec<tree> reduc_inputs;
>    int j, i;
> -  auto_vec<tree> scalar_results;
> +  vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
>    unsigned int group_size = 1, k;
>    auto_vec<gimple *> phis;
>    /* SLP reduction without reduction chain, e.g.,
> @@ -4941,16 +5053,12 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>    gcc_assert (vectype);
>    mode = TYPE_MODE (vectype);
>
> -  tree initial_def = NULL;
>    tree induc_val = NULL_TREE;
>    tree adjustment_def = NULL;
>    if (slp_node)
>      ;
>    else
>      {
> -      /* Get at the scalar def before the loop, that defines the initial value
> -        of the reduction variable.  */
> -      initial_def = vect_phi_initial_value (reduc_def_stmt);
>        /* Optimize: for induction condition reduction, if we can't use zero
>           for induc_val, use initial_def.  */
>        if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
> @@ -5196,6 +5304,37 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>        reduc_inputs.safe_push (single_input);
>      }
>
> +  tree orig_reduc_input = reduc_inputs[0];
> +
> +  /* If this loop is an epilogue loop that can be skipped after the
> +     main loop, we can only share a reduction operation between the
> +     main loop and the epilogue if we put it at the target of the
> +     skip edge.
> +
> +     We can still reuse accumulators if this check fails.  Doing so has
> +     the minor(?) benefit of making the epilogue loop's scalar result
> +     independent of the main loop's scalar result.  */
> +  bool unify_with_main_loop_p = false;
> +  if (reduc_info->reused_accumulator
> +      && loop_vinfo->skip_this_loop_edge
> +      && single_succ_p (exit_bb)
> +      && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
> +    {
> +      unify_with_main_loop_p = true;
> +
> +      basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
> +      reduc_inputs[0] = make_ssa_name (vectype);
> +      gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
> +      add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
> +                  UNKNOWN_LOCATION);
> +      add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
> +                  loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
> +      exit_gsi = gsi_after_labels (reduc_block);
> +    }
> +
> +  /* Shouldn't be used beyond this point.  */
> +  exit_bb = nullptr;
> +
>    if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
>        && reduc_fn != IFN_LAST)
>      {
> @@ -5405,6 +5544,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>              the same as initial_def already.  */
>           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
>                                   induc_val);
> +         tree initial_def = reduc_info->reduc_initial_values[0];
>
>           tmp = make_ssa_name (new_scalar_dest);
>           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
> @@ -5425,9 +5565,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>        gcc_assert (reduc_inputs.length () == 1);
>        gcc_assert (pow2p_hwi (group_size));
>
> -      slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
> -      vec<stmt_vec_info> orig_phis
> -       = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
>        gimple_seq seq = NULL;
>
>        /* Build a vector {0, 1, 2, ...}, with the same number of elements
> @@ -5452,7 +5589,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>         {
>           tree initial_value = NULL_TREE;
>           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
> -           initial_value = vect_phi_initial_value (orig_phis[0]);
> +           initial_value = reduc_info->reduc_initial_values[0];
>           neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
>                                                  initial_value);
>         }
> @@ -5466,7 +5603,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>              for MIN and MAX reduction, for example.  */
>           if (!neutral_op)
>             {
> -             tree scalar_value = vect_phi_initial_value (orig_phis[i]);
> +             tree scalar_value = reduc_info->reduc_initial_values[i];
>               scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
>                                              scalar_value);
>               vector_identity = gimple_build_vector_from_val (&seq, vectype,
> @@ -5780,6 +5917,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>              the same as initial_def already.  */
>           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
>                                   induc_val);
> +         tree initial_def = reduc_info->reduc_initial_values[0];
>
>           tree tmp = make_ssa_name (new_scalar_dest);
>           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
> @@ -5819,6 +5957,11 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>        scalar_results[0] = new_temp;
>      }
>
> +  /* Record this operation if it could be reused by the epilogue loop.  */
> +  if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION)
> +    loop_vinfo->reusable_accumulators.put (scalar_results[0],
> +                                          { orig_reduc_input, reduc_info });
> +
>    if (double_reduc)
>      loop = outer_loop;
>
> @@ -5886,6 +6029,17 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>          {
>            /* Replace the uses:  */
>            orig_name = PHI_RESULT (exit_phi);
> +
> +         /* Look for a single use at the target of the skip edge.  */
> +         if (unify_with_main_loop_p)
> +           {
> +             use_operand_p use_p;
> +             gimple *user;
> +             if (!single_imm_use (orig_name, &use_p, &user))
> +               gcc_unreachable ();
> +             orig_name = gimple_get_lhs (user);
> +           }
> +
>            scalar_result = scalar_results[k];
>            FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
>             {
> @@ -7421,16 +7575,32 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
>        else
>         {
>           gcc_assert (slp_node == slp_node_instance->reduc_phis);
> -         tree initial_value = NULL_TREE;
> +         vec<tree> &initial_values = reduc_info->reduc_initial_values;
> +         vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
> +
> +         unsigned int num_phis = stmts.length ();
>           if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
> -           initial_value = vect_phi_initial_value (phi);
> -         tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
> -         tree neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
> -                                                     code, initial_value);
> -         get_initial_defs_for_reduction (loop_vinfo, reduc_info,
> -                                         slp_node_instance->reduc_phis,
> -                                         &vec_initial_defs, vec_num,
> -                                         initial_value != NULL, neutral_op);
> +           num_phis = 1;
> +         initial_values.reserve (num_phis);
> +         for (unsigned int i = 0; i < num_phis; ++i)
> +           {
> +             gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
> +             initial_values.quick_push (vect_phi_initial_value (this_phi));
> +           }
> +         if (vec_num == 1)
> +           vect_find_reusable_accumulator (loop_vinfo, reduc_info);
> +         if (!initial_values.is_empty ())
> +           {
> +             tree initial_value
> +               = (num_phis == 1 ? initial_values[0] : NULL_TREE);
> +             tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
> +             tree neutral_op
> +               = neutral_op_for_reduction (TREE_TYPE (vectype_out),
> +                                           code, initial_value);
> +             get_initial_defs_for_reduction (loop_vinfo, reduc_info,
> +                                             &vec_initial_defs, vec_num,
> +                                             stmts.length (), neutral_op);
> +           }
>         }
>      }
>    else
> @@ -7438,6 +7608,7 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
>        /* Get at the scalar def before the loop, that defines the initial
>          value of the reduction variable.  */
>        tree initial_def = vect_phi_initial_value (phi);
> +      reduc_info->reduc_initial_values.safe_push (initial_def);
>        /* Optimize: if initial_def is for REDUC_MAX smaller than the base
>          and we can't use zero for induc_val, use initial_def.  Similarly
>          for REDUC_MIN and initial_def larger than the base.  */
> @@ -7474,21 +7645,30 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
>                                            initial_def, initial_def);
>        else
>         {
> -         enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
> -         tree neutral_op = neutral_op_for_reduction (TREE_TYPE (initial_def),
> -                                                     code, initial_def);
> -         gcc_assert (neutral_op);
> -         /* Try to simplify the vector initialization by applying an
> -            adjustment after the reduction has been performed.  */
> -         if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
> -             && !operand_equal_p (neutral_op, initial_def))
> +         if (ncopies == 1)
> +           vect_find_reusable_accumulator (loop_vinfo, reduc_info);
> +         if (!reduc_info->reduc_initial_values.is_empty ())
>             {
> -             STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = initial_def;
> -             initial_def = neutral_op;
> +             initial_def = reduc_info->reduc_initial_values[0];
> +             enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
> +             tree neutral_op
> +               = neutral_op_for_reduction (TREE_TYPE (initial_def),
> +                                           code, initial_def);
> +             gcc_assert (neutral_op);
> +             /* Try to simplify the vector initialization by applying an
> +                adjustment after the reduction has been performed.  */
> +             if (!reduc_info->reused_accumulator
> +                 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
> +                 && !operand_equal_p (neutral_op, initial_def))
> +               {
> +                 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
> +                   = initial_def;
> +                 initial_def = neutral_op;
> +               }
> +             vec_initial_def
> +               = get_initial_def_for_reduction (loop_vinfo, reduc_info,
> +                                                initial_def, neutral_op);
>             }
> -         vec_initial_def
> -           = get_initial_def_for_reduction (loop_vinfo, reduc_info,
> -                                            initial_def, neutral_op);
>         }
>      }
>
> @@ -7499,6 +7679,17 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
>         vec_initial_defs.quick_push (vec_initial_def);
>      }
>
> +  if (auto *accumulator = reduc_info->reused_accumulator)
> +    {
> +      if (loop_vinfo->main_loop_edge)
> +       vec_initial_defs[0]
> +         = vect_get_main_loop_result (loop_vinfo, accumulator->reduc_input,
> +                                      vec_initial_defs[0]);
> +      else
> +       vec_initial_defs.safe_push (accumulator->reduc_input);
> +      gcc_assert (vec_initial_defs.length () == 1);
> +    }
> +
>    /* Generate the reduction PHIs upfront.  */
>    for (i = 0; i < vec_num; i++)
>      {
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c
> new file mode 100644
> index 00000000000..fb817b73d77
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c
> @@ -0,0 +1,77 @@
> +/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
> +
> +unsigned short __attribute__((noipa))
> +add_loop (unsigned short *x, int n)
> +{
> +  unsigned short res = 0;
> +  for (int i = 0; i < n; ++i)
> +    res += x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +min_loop (unsigned short *x, int n)
> +{
> +  unsigned short res = ~0;
> +  for (int i = 0; i < n; ++i)
> +    res = res < x[i] ? res : x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +max_loop (unsigned short *x, int n)
> +{
> +  unsigned short res = 0;
> +  for (int i = 0; i < n; ++i)
> +    res = res > x[i] ? res : x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +and_loop (unsigned short *x, int n)
> +{
> +  unsigned short res = ~0;
> +  for (int i = 0; i < n; ++i)
> +    res &= x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +or_loop (unsigned short *x, int n)
> +{
> +  unsigned short res = 0;
> +  for (int i = 0; i < n; ++i)
> +    res |= x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +eor_loop (unsigned short *x, int n)
> +{
> +  unsigned short res = 0;
> +  for (int i = 0; i < n; ++i)
> +    res ^= x[i];
> +  return res;
> +}
> +
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\torv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c
> new file mode 100644
> index 00000000000..1dd579be701
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c
> @@ -0,0 +1,49 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
> +
> +#define N 0x1100
> +
> +#include "reduc_10.c"
> +
> +int
> +main (void)
> +{
> +  unsigned short x[N];
> +  for (int i = 0; i < N; ++i)
> +    x[i] = (i + 1) * (i + 2);
> +
> +  if (add_loop (x, 0) != 0
> +      || add_loop (x, 11) != 572
> +      || add_loop (x, 0x100) != 22016
> +      || add_loop (x, 0xfff) != 20480
> +      || max_loop (x, 0) != 0
> +      || max_loop (x, 11) != 132
> +      || max_loop (x, 0x100) != 65280
> +      || max_loop (x, 0xfff) != 65504
> +      || or_loop (x, 0) != 0
> +      || or_loop (x, 11) != 0xfe
> +      || or_loop (x, 0x80) != 0x7ffe
> +      || or_loop (x, 0xb4) != 0x7ffe
> +      || or_loop (x, 0xb5) != 0xfffe
> +      || eor_loop (x, 0) != 0
> +      || eor_loop (x, 11) != 0xe8
> +      || eor_loop (x, 0x100) != 0xcf00
> +      || eor_loop (x, 0xfff) != 0xa000)
> +    __builtin_abort ();
> +
> +  for (int i = 0; i < N; ++i)
> +    x[i] = ~x[i];
> +
> +  if (min_loop (x, 0) != 65535
> +      || min_loop (x, 11) != 65403
> +      || min_loop (x, 0x100) != 255
> +      || min_loop (x, 0xfff) != 31
> +      || and_loop (x, 0) != 0xffff
> +      || and_loop (x, 11) != 0xff01
> +      || and_loop (x, 0x80) != 0x8001
> +      || and_loop (x, 0xb4) != 0x8001
> +      || and_loop (x, 0xb5) != 1)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c
> new file mode 100644
> index 00000000000..f99ef4aa865
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c
> @@ -0,0 +1,71 @@
> +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
> +
> +unsigned short __attribute__((noipa))
> +add_loop (unsigned short *x, unsigned short res)
> +{
> +  for (int i = 0; i < 0xfff; ++i)
> +    res += x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +min_loop (unsigned short *x, unsigned short res)
> +{
> +  for (int i = 0; i < 0xfff; ++i)
> +    res = res < x[i] ? res : x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +max_loop (unsigned short *x, unsigned short res)
> +{
> +  for (int i = 0; i < 0xfff; ++i)
> +    res = res > x[i] ? res : x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +and_loop (unsigned short *x, unsigned short res)
> +{
> +  for (int i = 0; i < 0xfff; ++i)
> +    res &= x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +or_loop (unsigned short *x, unsigned short res)
> +{
> +  for (int i = 0; i < 0xfff; ++i)
> +    res |= x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +eor_loop (unsigned short *x, unsigned short res)
> +{
> +  for (int i = 0; i < 0xfff; ++i)
> +    res ^= x[i];
> +  return res;
> +}
> +
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\torv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c
> new file mode 100644
> index 00000000000..5b41560d2ef
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c
> @@ -0,0 +1,34 @@
> +/* { dg-do run { target aarch64_sve256_hw } } */
> +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
> +
> +#define N 0x1100
> +
> +#include "reduc_11.c"
> +
> +int
> +main (void)
> +{
> +  unsigned short x[N];
> +  for (int i = 0; i < N; ++i)
> +    x[i] = (i + 1) * (i + 2);
> +
> +  if (add_loop (x, 42) != 20522
> +      || max_loop (x, 65503) != 65504
> +      || max_loop (x, 65505) != 65505
> +      || or_loop (x, 0) != 0xfffe
> +      || or_loop (x, 1) != 0xffff
> +      || eor_loop (x, 0) != 0xa000
> +      || eor_loop (x, 0xbfff) != 0x1fff)
> +    __builtin_abort ();
> +
> +  for (int i = 0; i < N; ++i)
> +    x[i] = ~x[i];
> +
> +  if (min_loop (x, 32) != 31
> +      || min_loop (x, 30) != 30
> +      || and_loop (x, 0xff) != 1
> +      || and_loop (x, 0) != 0)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c
> new file mode 100644
> index 00000000000..d32b81a61bc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c
> @@ -0,0 +1,71 @@
> +/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
> +
> +unsigned short __attribute__((noipa))
> +add_loop (unsigned short *x, int n, unsigned short res)
> +{
> +  for (int i = 0; i < n; ++i)
> +    res += x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +min_loop (unsigned short *x, int n, unsigned short res)
> +{
> +  for (int i = 0; i < n; ++i)
> +    res = res < x[i] ? res : x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +max_loop (unsigned short *x, int n, unsigned short res)
> +{
> +  for (int i = 0; i < n; ++i)
> +    res = res > x[i] ? res : x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +and_loop (unsigned short *x, int n, unsigned short res)
> +{
> +  for (int i = 0; i < n; ++i)
> +    res &= x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +or_loop (unsigned short *x, int n, unsigned short res)
> +{
> +  for (int i = 0; i < n; ++i)
> +    res |= x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +eor_loop (unsigned short *x, int n, unsigned short res)
> +{
> +  for (int i = 0; i < n; ++i)
> +    res ^= x[i];
> +  return res;
> +}
> +
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\torv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c
> new file mode 100644
> index 00000000000..929b81a9705
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c
> @@ -0,0 +1,66 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
> +
> +#define N 0x1100
> +
> +#include "reduc_12.c"
> +
> +int
> +main (void)
> +{
> +  unsigned short x[N];
> +  for (int i = 0; i < N; ++i)
> +    x[i] = (i + 1) * (i + 2);
> +
> +  if (add_loop (x, 0, 10) != 10
> +      || add_loop (x, 11, 42) != 614
> +      || add_loop (x, 0x100, 84) != 22100
> +      || add_loop (x, 0xfff, 20) != 20500
> +      || max_loop (x, 0, 10) != 10
> +      || max_loop (x, 11, 131) != 132
> +      || max_loop (x, 11, 133) != 133
> +      || max_loop (x, 0x100, 65279) != 65280
> +      || max_loop (x, 0x100, 65281) != 65281
> +      || max_loop (x, 0xfff, 65503) != 65504
> +      || max_loop (x, 0xfff, 65505) != 65505
> +      || or_loop (x, 0, 0x71) != 0x71
> +      || or_loop (x, 11, 0) != 0xfe
> +      || or_loop (x, 11, 0xb3c) != 0xbfe
> +      || or_loop (x, 0x80, 0) != 0x7ffe
> +      || or_loop (x, 0x80, 1) != 0x7fff
> +      || or_loop (x, 0xb4, 0) != 0x7ffe
> +      || or_loop (x, 0xb4, 1) != 0x7fff
> +      || or_loop (x, 0xb5, 0) != 0xfffe
> +      || or_loop (x, 0xb5, 1) != 0xffff
> +      || eor_loop (x, 0, 0x3e) != 0x3e
> +      || eor_loop (x, 11, 0) != 0xe8
> +      || eor_loop (x, 11, 0x1ff) != 0x117
> +      || eor_loop (x, 0x100, 0) != 0xcf00
> +      || eor_loop (x, 0x100, 0xeee) != 0xc1ee
> +      || eor_loop (x, 0xfff, 0) != 0xa000
> +      || eor_loop (x, 0xfff, 0x8888) != 0x2888)
> +    __builtin_abort ();
> +
> +  for (int i = 0; i < N; ++i)
> +    x[i] = ~x[i];
> +
> +  if (min_loop (x, 0, 10000) != 10000
> +      || min_loop (x, 11, 65404) != 65403
> +      || min_loop (x, 11, 65402) != 65402
> +      || min_loop (x, 0x100, 256) != 255
> +      || min_loop (x, 0x100, 254) != 254
> +      || min_loop (x, 0xfff, 32) != 31
> +      || min_loop (x, 0xfff, 30) != 30
> +      || and_loop (x, 0, 0x1234) != 0x1234
> +      || and_loop (x, 11, 0xffff) != 0xff01
> +      || and_loop (x, 11, 0xcdef) != 0xcd01
> +      || and_loop (x, 0x80, 0xffff) != 0x8001
> +      || and_loop (x, 0x80, 0xfffe) != 0x8000
> +      || and_loop (x, 0xb4, 0xffff) != 0x8001
> +      || and_loop (x, 0xb4, 0xfffe) != 0x8000
> +      || and_loop (x, 0xb5, 0xffff) != 1
> +      || and_loop (x, 0xb5, 0xfffe) != 0)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c
> new file mode 100644
> index 00000000000..ce2b8f2fcdc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c
> @@ -0,0 +1,101 @@
> +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
> +
> +void __attribute__((noipa))
> +add_loop (unsigned int *x, unsigned int *res)
> +{
> +  unsigned int res0 = res[0];
> +  unsigned int res1 = res[1];
> +  for (int i = 0; i < 0x7ff; ++i)
> +    {
> +      res0 += x[i * 2];
> +      res1 += x[i * 2 + 1];
> +    }
> +  res[0] = res0;
> +  res[1] = res1;
> +}
> +
> +void __attribute__((noipa))
> +min_loop (unsigned int *x, unsigned int *res)
> +{
> +  unsigned int res0 = res[0];
> +  unsigned int res1 = res[1];
> +  for (int i = 0; i < 0x7ff; ++i)
> +    {
> +      res0 = res0 < x[i * 2] ? res0 : x[i * 2];
> +      res1 = res1 < x[i * 2 + 1] ? res1 : x[i * 2 + 1];
> +    }
> +  res[0] = res0;
> +  res[1] = res1;
> +}
> +
> +void __attribute__((noipa))
> +max_loop (unsigned int *x, unsigned int *res)
> +{
> +  unsigned int res0 = res[0];
> +  unsigned int res1 = res[1];
> +  for (int i = 0; i < 0x7ff; ++i)
> +    {
> +      res0 = res0 > x[i * 2] ? res0 : x[i * 2];
> +      res1 = res1 > x[i * 2 + 1] ? res1 : x[i * 2 + 1];
> +    }
> +  res[0] = res0;
> +  res[1] = res1;
> +}
> +
> +void __attribute__((noipa))
> +and_loop (unsigned int *x, unsigned int *res)
> +{
> +  unsigned int res0 = res[0];
> +  unsigned int res1 = res[1];
> +  for (int i = 0; i < 0x7ff; ++i)
> +    {
> +      res0 &= x[i * 2];
> +      res1 &= x[i * 2 + 1];
> +    }
> +  res[0] = res0;
> +  res[1] = res1;
> +}
> +
> +void __attribute__((noipa))
> +or_loop (unsigned int *x, unsigned int *res)
> +{
> +  unsigned int res0 = res[0];
> +  unsigned int res1 = res[1];
> +  for (int i = 0; i < 0x7ff; ++i)
> +    {
> +      res0 |= x[i * 2];
> +      res1 |= x[i * 2 + 1];
> +    }
> +  res[0] = res0;
> +  res[1] = res1;
> +}
> +
> +void __attribute__((noipa))
> +eor_loop (unsigned int *x, unsigned int *res)
> +{
> +  unsigned int res0 = res[0];
> +  unsigned int res1 = res[1];
> +  for (int i = 0; i < 0x7ff; ++i)
> +    {
> +      res0 ^= x[i * 2];
> +      res1 ^= x[i * 2 + 1];
> +    }
> +  res[0] = res0;
> +  res[1] = res1;
> +}
> +
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
> +
> +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
> +
> +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c
> new file mode 100644
> index 00000000000..5514d8d6b3b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c
> @@ -0,0 +1,61 @@
> +/* { dg-do run { target aarch64_sve256_hw } } */
> +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
> +
> +#define N 0x1100
> +
> +#include "reduc_13.c"
> +
> +int
> +main (void)
> +{
> +  unsigned int x[N];
> +  for (int i = 0; i < N; ++i)
> +    x[i] = ((i + 1) * (i + 2)) & 0xfffff;
> +
> +  unsigned int add_res[2] = { 42, 1111 };
> +  add_loop (x, add_res);
> +  if (add_res[0] != 968538154
> +      || add_res[1] != 964340823)
> +    __builtin_abort ();
> +
> +  unsigned int max_res1[2] = { 0, 0 };
> +  max_loop (x, max_res1);
> +  if (max_res1[0] != 1048150
> +      || max_res1[1] != 1045506)
> +    __builtin_abort ();
> +
> +  unsigned int max_res2[2] = { 1048151, 1045507 };
> +  max_loop (x, max_res2);
> +  if (max_res2[0] != 1048151
> +      || max_res2[1] != 1045507)
> +    __builtin_abort ();
> +
> +  unsigned int or_res[2] = { 0x1000000, 0x2000000 };
> +  or_loop (x, or_res);
> +  if (or_res[0] != 0x10ffffe
> +      || or_res[1] != 0x20ffffe)
> +    __builtin_abort ();
> +
> +  unsigned int eor_res[2] = { 0x1000000, 0x2000000 };
> +  eor_loop (x, eor_res);
> +  if (eor_res[0] != 0x1010000
> +      || eor_res[1] != 0x20b5000)
> +    __builtin_abort ();
> +
> +  for (int i = 0; i < N; ++i)
> +    x[i] = ~x[i] & 0xfffff;
> +
> +  unsigned int min_res1[2] = { 500, 4000 };
> +  min_loop (x, min_res1);
> +  if (min_res1[0] != 425
> +      || min_res1[1] != 3069)
> +    __builtin_abort ();
> +
> +  unsigned int min_res2[2] = { 424, 3068 };
> +  min_loop (x, min_res2);
> +  if (min_res2[0] != 424
> +      || min_res2[1] != 3068)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c
> new file mode 100644
> index 00000000000..3be611e4b37
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c
> @@ -0,0 +1,107 @@
> +/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
> +
> +void __attribute__((noipa))
> +add_loop (unsigned int *x, int n, unsigned int *res)
> +{
> +  unsigned int res0 = res[0];
> +  unsigned int res1 = res[1];
> +  for (int i = 0; i < n; ++i)
> +    {
> +      res0 += x[i * 2];
> +      res1 += x[i * 2 + 1];
> +    }
> +  res[0] = res0;
> +  res[1] = res1;
> +}
> +
> +void __attribute__((noipa))
> +min_loop (unsigned int *x, int n, unsigned int *res)
> +{
> +  unsigned int res0 = res[0];
> +  unsigned int res1 = res[1];
> +  for (int i = 0; i < n; ++i)
> +    {
> +      res0 = res0 < x[i * 2] ? res0 : x[i * 2];
> +      res1 = res1 < x[i * 2 + 1] ? res1 : x[i * 2 + 1];
> +    }
> +  res[0] = res0;
> +  res[1] = res1;
> +}
> +
> +void __attribute__((noipa))
> +max_loop (unsigned int *x, int n, unsigned int *res)
> +{
> +  unsigned int res0 = res[0];
> +  unsigned int res1 = res[1];
> +  for (int i = 0; i < n; ++i)
> +    {
> +      res0 = res0 > x[i * 2] ? res0 : x[i * 2];
> +      res1 = res1 > x[i * 2 + 1] ? res1 : x[i * 2 + 1];
> +    }
> +  res[0] = res0;
> +  res[1] = res1;
> +}
> +
> +void __attribute__((noipa))
> +and_loop (unsigned int *x, int n, unsigned int *res)
> +{
> +  unsigned int res0 = res[0];
> +  unsigned int res1 = res[1];
> +  for (int i = 0; i < n; ++i)
> +    {
> +      res0 &= x[i * 2];
> +      res1 &= x[i * 2 + 1];
> +    }
> +  res[0] = res0;
> +  res[1] = res1;
> +}
> +
> +void __attribute__((noipa))
> +or_loop (unsigned int *x, int n, unsigned int *res)
> +{
> +  unsigned int res0 = res[0];
> +  unsigned int res1 = res[1];
> +  for (int i = 0; i < n; ++i)
> +    {
> +      res0 |= x[i * 2];
> +      res1 |= x[i * 2 + 1];
> +    }
> +  res[0] = res0;
> +  res[1] = res1;
> +}
> +
> +void __attribute__((noipa))
> +eor_loop (unsigned int *x, int n, unsigned int *res)
> +{
> +  unsigned int res0 = res[0];
> +  unsigned int res1 = res[1];
> +  for (int i = 0; i < n; ++i)
> +    {
> +      res0 ^= x[i * 2];
> +      res1 ^= x[i * 2 + 1];
> +    }
> +  res[0] = res0;
> +  res[1] = res1;
> +}
> +
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tuaddv\t} 2 } } */
> +
> +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tuminv\t} 2 } } */
> +
> +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tumaxv\t} 2 } } */
> +
> +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tandv\t} 2 } } */
> +
> +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\torv\t} 2 } } */
> +
> +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\teorv\t} 2 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c
> new file mode 100644
> index 00000000000..ccaa770e9b2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c
> @@ -0,0 +1,187 @@
> +/* { dg-do run { target aarch64_sve256_hw } } */
> +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
> +
> +#define N 0x1100
> +
> +#include "reduc_14.c"
> +
> +int
> +main (void)
> +{
> +  unsigned int x[N];
> +  for (int i = 0; i < N; ++i)
> +    x[i] = ((i + 1) * (i + 2)) & 0xfffff;
> +
> +  unsigned int add_res1[2] = { 11, 22 };
> +  add_loop (x, 0, add_res1);
> +  if (add_res1[0] != 11
> +      || add_res1[1] != 22)
> +    __builtin_abort ();
> +
> +  unsigned int add_res2[2] = { 10, 20 };
> +  add_loop (x, 11, add_res2);
> +  if (add_res2[0] != 1902
> +      || add_res2[1] != 2176)
> +    __builtin_abort ();
> +
> +  unsigned int add_res3[2] = { 15, 30 };
> +  add_loop (x, 0x100, add_res3);
> +  if (add_res3[0] != 22435087
> +      || add_res3[1] != 22566686)
> +    __builtin_abort ();
> +
> +  unsigned int add_res4[2] = { 100, 200 };
> +  add_loop (x, 0x11f, add_res4);
> +  if (add_res4[0] != 31602244
> +      || add_res4[1] != 31767656)
> +    __builtin_abort ();
> +
> +  unsigned int max_res1[2] = { 461, 500 };
> +  max_loop (x, 11, max_res1);
> +  if (max_res1[0] != 462
> +      || max_res1[1] != 506)
> +    __builtin_abort ();
> +
> +  unsigned int max_res2[2] = { 463, 507 };
> +  max_loop (x, 11, max_res2);
> +  if (max_res2[0] != 463
> +      || max_res2[1] != 507)
> +    __builtin_abort ();
> +
> +  unsigned int max_res3[2] = { 1000000, 1000000 };
> +  max_loop (x, 0x200, max_res3);
> +  if (max_res3[0] != 1047552
> +      || max_res3[1] != 1045506)
> +    __builtin_abort ();
> +
> +  unsigned int max_res4[2] = { 1047553, 1045507 };
> +  max_loop (x, 0x200, max_res4);
> +  if (max_res4[0] != 1047553
> +      || max_res4[1] != 1045507)
> +    __builtin_abort ();
> +
> +  unsigned int max_res5[2] = { 300000, 30000 };
> +  max_loop (x, 0x11f, max_res5);
> +  if (max_res5[0] != 328902
> +      || max_res5[1] != 330050)
> +    __builtin_abort ();
> +
> +  unsigned int max_res6[2] = { 328903, 330051 };
> +  max_loop (x, 0x11f, max_res6);
> +  if (max_res6[0] != 328903
> +      || max_res6[1] != 330051)
> +    __builtin_abort ();
> +
> +  unsigned int or_res1[2] = { 11, 22 };
> +  or_loop (x, 0, or_res1);
> +  if (or_res1[0] != 11
> +      || or_res1[1] != 22)
> +    __builtin_abort ();
> +
> +  unsigned int or_res2[2] = { 0x200000, 0xe00000 };
> +  or_loop (x, 11, or_res2);
> +  if (or_res2[0] != 0x2001fe
> +      || or_res2[1] != 0xe001fe)
> +    __builtin_abort ();
> +
> +  unsigned int or_res3[2] = { 0x800000, 0x700000 };
> +  or_loop (x, 0x40, or_res3);
> +  if (or_res3[0] != 0x803ffe
> +      || or_res3[1] != 0x707ffe)
> +    __builtin_abort ();
> +
> +  unsigned int or_res4[2] = { 0x100001, 0x300000 };
> +  or_loop (x, 0x4f, or_res4);
> +  if (or_res4[0] != 0x107fff
> +      || or_res4[1] != 0x307ffe)
> +    __builtin_abort ();
> +
> +  unsigned int eor_res1[2] = { 11, 22 };
> +  eor_loop (x, 0, eor_res1);
> +  if (eor_res1[0] != 11
> +      || eor_res1[1] != 22)
> +    __builtin_abort ();
> +
> +  unsigned int eor_res2[2] = { 0x2000ff, 0xe000ff };
> +  eor_loop (x, 11, eor_res2);
> +  if (eor_res2[0] != 0x2001cf
> +      || eor_res2[1] != 0xe000b7)
> +    __builtin_abort ();
> +
> +  unsigned int eor_res3[2] = { 0x805000, 0x70f000 };
> +  eor_loop (x, 0x100, eor_res3);
> +  if (eor_res3[0] != 0x824200
> +      || eor_res3[1] != 0x77dc00)
> +    __builtin_abort ();
> +
> +  unsigned int eor_res4[2] = { 0x101201, 0x300f00 };
> +  eor_loop (x, 0x11f, eor_res4);
> +  if (eor_res4[0] != 0x178801
> +      || eor_res4[1] != 0x337240)
> +    __builtin_abort ();
> +
> +  for (int i = 0; i < N; ++i)
> +    x[i] = ~x[i] & 0xfffff;
> +
> +  unsigned int min_res1[2] = { 1048200, 1048100 };
> +  min_loop (x, 11, min_res1);
> +  if (min_res1[0] != 1048113
> +      || min_res1[1] != 1048069)
> +    __builtin_abort ();
> +
> +  unsigned int min_res2[2] = { 1048112, 1048068 };
> +  min_loop (x, 11, min_res2);
> +  if (min_res2[0] != 1048112
> +      || min_res2[1] != 1048068)
> +    __builtin_abort ();
> +
> +  unsigned int min_res3[2] = { 10000, 10000 };
> +  min_loop (x, 0x200, min_res3);
> +  if (min_res3[0] != 1023
> +      || min_res3[1] != 3069)
> +    __builtin_abort ();
> +
> +  unsigned int min_res4[2] = { 1022, 3068 };
> +  min_loop (x, 0x200, min_res4);
> +  if (min_res4[0] != 1022
> +      || min_res4[1] != 3068)
> +    __builtin_abort ();
> +
> +  unsigned int min_res5[2] = { 719680, 718530 };
> +  min_loop (x, 0x11f, min_res5);
> +  if (min_res5[0] != 719673
> +      || min_res5[1] != 718525)
> +    __builtin_abort ();
> +
> +  unsigned int min_res6[2] = { 719672, 718524 };
> +  min_loop (x, 0x11f, min_res6);
> +  if (min_res6[0] != 719672
> +      || min_res6[1] != 718524)
> +    __builtin_abort ();
> +
> +  unsigned int and_res1[2] = { 11, 22 };
> +  and_loop (x, 0, and_res1);
> +  if (and_res1[0] != 11
> +      || and_res1[1] != 22)
> +    __builtin_abort ();
> +
> +  unsigned int and_res2[2] = { 0xf5cff, 0xf78ff };
> +  and_loop (x, 11, and_res2);
> +  if (and_res2[0] != 0xf5c01
> +      || and_res2[1] != 0xf7801)
> +    __builtin_abort ();
> +
> +  unsigned int and_res3[2] = { 0x7efff, 0xecfff };
> +  and_loop (x, 0x40, and_res3);
> +  if (and_res3[0] != 0x7c001
> +      || and_res3[1] != 0xe8001)
> +    __builtin_abort ();
> +
> +  unsigned int and_res4[2] = { 0xffffff, 0xffffff };
> +  and_loop (x, 0x4f, and_res4);
> +  if (and_res4[0] != 0xf8001
> +      || and_res4[1] != 0xf8001)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c
> new file mode 100644
> index 00000000000..15b1ade30e2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c
> @@ -0,0 +1,16 @@
> +/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
> +
> +int __attribute__((noipa))
> +add_loop (int *x, int n, int res)
> +{
> +  for (int i = 0; i < n; ++i)
> +    {
> +      res += x[i * 2];
> +      res += x[i * 2 + 1];
> +    }
> +  return res;
> +}
> +
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c
> new file mode 100644
> index 00000000000..3207fce5be3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c
> @@ -0,0 +1,22 @@
> +/* { dg-do run { target aarch64_sve256_hw } } */
> +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
> +
> +#define N 0x1100
> +
> +#include "reduc_15.c"
> +
> +int
> +main (void)
> +{
> +  int x[N];
> +  for (int i = 0; i < N; ++i)
> +    x[i] = ((i + 1) * (i + 2)) & 0xfffff;
> +
> +  if (add_loop (x, 0, 33) != 33
> +      || add_loop (x, 11, 30) != 4078
> +      || add_loop (x, 0x100, 45) != 45001773
> +      || add_loop (x, 0x11f, 300) != 63369900)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c
> new file mode 100644
> index 00000000000..b839821d6bb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c
> @@ -0,0 +1,77 @@
> +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
> +
> +unsigned short __attribute__((noipa))
> +add_loop (unsigned short *x)
> +{
> +  unsigned short res = 0;
> +  for (int i = 0; i < 0xfff; ++i)
> +    res += x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +min_loop (unsigned short *x)
> +{
> +  unsigned short res = ~0;
> +  for (int i = 0; i < 0xfff; ++i)
> +    res = res < x[i] ? res : x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +max_loop (unsigned short *x)
> +{
> +  unsigned short res = 0;
> +  for (int i = 0; i < 0xfff; ++i)
> +    res = res > x[i] ? res : x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +and_loop (unsigned short *x)
> +{
> +  unsigned short res = ~0;
> +  for (int i = 0; i < 0xfff; ++i)
> +    res &= x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +or_loop (unsigned short *x)
> +{
> +  unsigned short res = 0;
> +  for (int i = 0; i < 0xfff; ++i)
> +    res |= x[i];
> +  return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +eor_loop (unsigned short *x)
> +{
> +  unsigned short res = 0;
> +  for (int i = 0; i < 0xfff; ++i)
> +    res ^= x[i];
> +  return res;
> +}
> +
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\torv\t} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c
> new file mode 100644
> index 00000000000..aa248f53eaa
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c
> @@ -0,0 +1,29 @@
> +/* { dg-do run { target aarch64_sve256_hw } } */
> +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
> +
> +#define N 0x1100
> +
> +#include "reduc_9.c"
> +
> +int
> +main (void)
> +{
> +  unsigned short x[N];
> +  for (int i = 0; i < N; ++i)
> +    x[i] = (i + 1) * (i + 2);
> +
> +  if (add_loop (x) != 20480
> +      || max_loop (x) != 65504
> +      || or_loop (x) != 0xfffe
> +      || eor_loop (x) != 0xa000)
> +    __builtin_abort ();
> +
> +  for (int i = 0; i < N; ++i)
> +    x[i] = ~x[i];
> +
> +  if (min_loop (x) != 31
> +      || and_loop (x) != 1)
> +    __builtin_abort ();
> +
> +  return 0;
> +}

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 00/10] vect: Reuse reduction accumulators between loops
  2021-07-08 12:38 [PATCH 00/10] vect: Reuse reduction accumulators between loops Richard Sandiford
                   ` (9 preceding siblings ...)
  2021-07-08 12:43 ` [PATCH 10/10] vect: Reuse reduction accumulators between loops Richard Sandiford
@ 2021-07-10  2:11 ` Kewen.Lin
  2021-07-13  9:27   ` Richard Sandiford
  10 siblings, 1 reply; 30+ messages in thread
From: Kewen.Lin @ 2021-07-10  2:11 UTC (permalink / raw)
  To: richard.sandiford; +Cc: gcc-patches

Hi Richard,

on 2021/7/8 下午8:38, Richard Sandiford via Gcc-patches wrote:
> Quoting from the final patch in the series:
> 
> ------------------------------------------------------------------------
> This patch adds support for reusing a main loop's reduction accumulator
> in an epilogue loop.  This in turn lets the loops share a single piece
> of vector->scalar reduction code.
> 
> The patch has the following restrictions:
> 
> (1) The epilogue reduction can only operate on a single vector
>     (e.g. ncopies must be 1 for non-SLP reductions, and the group size
>     must be <= the element count for SLP reductions).
> 
> (2) Both loops must use the same vector mode for their accumulators.
>     This means that the patch is restricted to targets that support
>     --param vect-partial-vector-usage=1.
> 
> (3) The reduction must be a standard “tree code” reduction.
> 
> However, these restrictions could be lifted in future.  For example,
> if the main loop operates on 128-bit vectors and the epilogue loop
> operates on 64-bit vectors, we could in future reduce the 128-bit
> vector by one stage and use the 64-bit result as the starting point
> for the epilogue result.
> 
> The patch tries to handle chained SLP reductions, unchained SLP
> reductions and non-SLP reductions.  It also handles cases in which
> the epilogue loop is entered directly (rather than via the main loop)
> and cases in which the epilogue loop can be skipped.
> ------------------------------------------------------------------------
> 
> However, it ended up being difficult to do that without some preparatory
> clean-ups.  Some of them could probably stand on their own, but others
> are a bit “meh” without the final patch to justify them.
> 
> The diff below shows the effect of the patch when compiling:
> 
>   unsigned short __attribute__((noipa))
>   add_loop (unsigned short *x, int n)
>   {
>     unsigned short res = 0;
>     for (int i = 0; i < n; ++i)
>       res += x[i];
>     return res;
>   }
> 
> with -O3 --param vect-partial-vector-usage=1 on an SVE target:
> 
> add_loop:				add_loop:
> .LFB0:					.LFB0:
> 	.cfi_startproc				.cfi_startproc
> 	mov	x4, x0		      <
> 	cmp	w1, 0				cmp	w1, 0
> 	ble	.L7				ble	.L7
> 	cnth	x0		      |		cnth	x4
> 	sub	w2, w1, #1			sub	w2, w1, #1
> 	sub	w3, w0, #1	      |		sub	w3, w4, #1
> 	cmp	w2, w3				cmp	w2, w3
> 	bcc	.L8				bcc	.L8
> 	sub	w0, w1, w0	      |		sub	w4, w1, w4
> 	mov	x3, 0				mov	x3, 0
> 	cnth	x5				cnth	x5
> 	mov	z0.b, #0			mov	z0.b, #0
> 	ptrue	p0.b, all			ptrue	p0.b, all
> 	.p2align 3,,7				.p2align 3,,7
> .L4:					.L4:
> 	ld1h	z1.h, p0/z, [x4, x3,  |		ld1h	z1.h, p0/z, [x0, x3, 
> 	mov	x2, x3				mov	x2, x3
> 	add	x3, x3, x5			add	x3, x3, x5
> 	add	z0.h, z0.h, z1.h		add	z0.h, z0.h, z1.h
> 	cmp	w0, w3		      |		cmp	w4, w3
> 	bcs	.L4				bcs	.L4
> 	uaddv	d0, p0, z0.h	      <
> 	umov	w0, v0.h[0]	      <
> 	inch	x2				inch	x2
> 	and	w0, w0, 65535	      <
> 	cmp	w1, w2				cmp	w1, w2
> 	beq	.L2		      |		beq	.L6
> .L3:					.L3:
> 	sub	w1, w1, w2			sub	w1, w1, w2
> 	mov	z1.b, #0	      |		add	x2, x0, w2, uxtw 1
> 	whilelo	p0.h, wzr, w1			whilelo	p0.h, wzr, w1
> 	add	x2, x4, w2, uxtw 1    |		ld1h	z1.h, p0/z, [x2]
> 	ptrue	p1.b, all	      |		add	z0.h, p0/m, z0.h, z1.
> 	ld1h	z0.h, p0/z, [x2]      |	.L6:
> 	sel	z0.h, p0, z0.h, z1.h  |		ptrue	p0.b, all
> 	uaddv	d0, p1, z0.h	      |		uaddv	d0, p0, z0.h
> 	fmov	x1, d0		      |		umov	w0, v0.h[0]
> 	add	w0, w0, w1, uxth      <
> 	and	w0, w0, 65535			and	w0, w0, 65535
> .L2:				      <
> 	ret					ret
> 	.p2align 2,,3				.p2align 2,,3
> .L7:					.L7:
> 	mov	w0, 0				mov	w0, 0
> 	ret					ret
> .L8:					.L8:
> 	mov	w2, 0				mov	w2, 0
> 	mov	w0, 0		      |		mov	z0.b, #0
> 	b	.L3				b	.L3
> 	.cfi_endproc				.cfi_endproc
> 
> Kewen, could you give this a spin on Power 10 to see whether it
> works/helps there?  I've attached a combined diff.
> 

Thanks for the combined diff file.

I'm sorry, but the current length-based partial vector support doesn't
handle reductions yet.  There are no conditional operations for lengths,
so we have to preprocess the inactive lanes for the intermediate
operations or the final reduction operation, according to the operation
type, since the inactive lane values are supposed to be undefined.  This
seems to require an efficient way to turn a length into a mask vector;
Power10 doesn't have a corresponding instruction, so we have to do some
tricks.  It's still on my TODO list.  As an experiment, I hacked
vectorizable_operation to relax its check for operations involved in
reductions, and with that I can see this patch series take effect for
length-based partial vectors.  So I believe it will help length-based
partial vectors once we enable them for reductions later.
Thanks for improving this!

This patch series was bootstrapped and regression-tested on Power10, and
also benchmarked with SPEC2017 (based on r12-2179, at -Ofast with
unrolling); no notable regressions or improvements were observed.

BR,
Kewen

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 00/10] vect: Reuse reduction accumulators between loops
  2021-07-10  2:11 ` [PATCH 00/10] " Kewen.Lin
@ 2021-07-13  9:27   ` Richard Sandiford
  0 siblings, 0 replies; 30+ messages in thread
From: Richard Sandiford @ 2021-07-13  9:27 UTC (permalink / raw)
  To: Kewen.Lin; +Cc: gcc-patches

"Kewen.Lin" <linkw@linux.ibm.com> writes:
> Hi Richard,
>
> on 2021/7/8 下午8:38, Richard Sandiford via Gcc-patches wrote:
>> Quoting from the final patch in the series:
>> 
>> ------------------------------------------------------------------------
>> This patch adds support for reusing a main loop's reduction accumulator
>> in an epilogue loop.  This in turn lets the loops share a single piece
>> of vector->scalar reduction code.
>> 
>> The patch has the following restrictions:
>> 
>> (1) The epilogue reduction can only operate on a single vector
>>     (e.g. ncopies must be 1 for non-SLP reductions, and the group size
>>     must be <= the element count for SLP reductions).
>> 
>> (2) Both loops must use the same vector mode for their accumulators.
>>     This means that the patch is restricted to targets that support
>>     --param vect-partial-vector-usage=1.
>> 
>> (3) The reduction must be a standard “tree code” reduction.
>> 
>> However, these restrictions could be lifted in future.  For example,
>> if the main loop operates on 128-bit vectors and the epilogue loop
>> operates on 64-bit vectors, we could in future reduce the 128-bit
>> vector by one stage and use the 64-bit result as the starting point
>> for the epilogue result.
>> 
>> The patch tries to handle chained SLP reductions, unchained SLP
>> reductions and non-SLP reductions.  It also handles cases in which
>> the epilogue loop is entered directly (rather than via the main loop)
>> and cases in which the epilogue loop can be skipped.
>> ------------------------------------------------------------------------
>> 
>> However, it ended up being difficult to do that without some preparatory
>> clean-ups.  Some of them could probably stand on their own, but others
>> are a bit “meh” without the final patch to justify them.
>> 
>> The diff below shows the effect of the patch when compiling:
>> 
>>   unsigned short __attribute__((noipa))
>>   add_loop (unsigned short *x, int n)
>>   {
>>     unsigned short res = 0;
>>     for (int i = 0; i < n; ++i)
>>       res += x[i];
>>     return res;
>>   }
>> 
>> with -O3 --param vect-partial-vector-usage=1 on an SVE target:
>> 
>> add_loop:				add_loop:
>> .LFB0:					.LFB0:
>> 	.cfi_startproc				.cfi_startproc
>> 	mov	x4, x0		      <
>> 	cmp	w1, 0				cmp	w1, 0
>> 	ble	.L7				ble	.L7
>> 	cnth	x0		      |		cnth	x4
>> 	sub	w2, w1, #1			sub	w2, w1, #1
>> 	sub	w3, w0, #1	      |		sub	w3, w4, #1
>> 	cmp	w2, w3				cmp	w2, w3
>> 	bcc	.L8				bcc	.L8
>> 	sub	w0, w1, w0	      |		sub	w4, w1, w4
>> 	mov	x3, 0				mov	x3, 0
>> 	cnth	x5				cnth	x5
>> 	mov	z0.b, #0			mov	z0.b, #0
>> 	ptrue	p0.b, all			ptrue	p0.b, all
>> 	.p2align 3,,7				.p2align 3,,7
>> .L4:					.L4:
>> 	ld1h	z1.h, p0/z, [x4, x3,  |		ld1h	z1.h, p0/z, [x0, x3, 
>> 	mov	x2, x3				mov	x2, x3
>> 	add	x3, x3, x5			add	x3, x3, x5
>> 	add	z0.h, z0.h, z1.h		add	z0.h, z0.h, z1.h
>> 	cmp	w0, w3		      |		cmp	w4, w3
>> 	bcs	.L4				bcs	.L4
>> 	uaddv	d0, p0, z0.h	      <
>> 	umov	w0, v0.h[0]	      <
>> 	inch	x2				inch	x2
>> 	and	w0, w0, 65535	      <
>> 	cmp	w1, w2				cmp	w1, w2
>> 	beq	.L2		      |		beq	.L6
>> .L3:					.L3:
>> 	sub	w1, w1, w2			sub	w1, w1, w2
>> 	mov	z1.b, #0	      |		add	x2, x0, w2, uxtw 1
>> 	whilelo	p0.h, wzr, w1			whilelo	p0.h, wzr, w1
>> 	add	x2, x4, w2, uxtw 1    |		ld1h	z1.h, p0/z, [x2]
>> 	ptrue	p1.b, all	      |		add	z0.h, p0/m, z0.h, z1.h
>> 	ld1h	z0.h, p0/z, [x2]      |	.L6:
>> 	sel	z0.h, p0, z0.h, z1.h  |		ptrue	p0.b, all
>> 	uaddv	d0, p1, z0.h	      |		uaddv	d0, p0, z0.h
>> 	fmov	x1, d0		      |		umov	w0, v0.h[0]
>> 	add	w0, w0, w1, uxth      <
>> 	and	w0, w0, 65535			and	w0, w0, 65535
>> .L2:				      <
>> 	ret					ret
>> 	.p2align 2,,3				.p2align 2,,3
>> .L7:					.L7:
>> 	mov	w0, 0				mov	w0, 0
>> 	ret					ret
>> .L8:					.L8:
>> 	mov	w2, 0				mov	w2, 0
>> 	mov	w0, 0		      |		mov	z0.b, #0
>> 	b	.L3				b	.L3
>> 	.cfi_endproc				.cfi_endproc
>> 
>> Kewen, could you give this a spin on Power 10 to see whether it
>> works/helps there?  I've attached a combined diff.
>> 
>
> Thanks for the combined diff file.
>
> I'm sorry, but the current length-based partial vector support doesn't
> handle reductions.  There are no conditional operations for lengths, so
> we have to preprocess the inactive lanes for the intermediate operations
> or the final reduction operation, according to the operation type, since
> the inactive lane values are supposed to be undefined.  This seems to
> require an efficient way to turn a length into a mask vector.  Power10
> doesn't have a corresponding instruction, so we have to do some tricks;
> it's still on my TODO list.

Ah, yeah, I'd forgotten about that, sorry.

> I hacked vectorizable_operation to relax its check for operations
> involved in reductions, and I can see that this patch series takes
> effect for length-based partial vectors, so I believe it will help
> length-based partial vectors once we enable them for reductions later.
> Thanks for improving this!
>
> This patch series was bootstrapped and regression-tested on Power10, and
> also benchmarked with SPEC2017 based on r12-2179 at Ofast with
> unrolling; no remarkable regressions or improvements were observed.

Thanks for the testing.

Richard

^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2021-07-13  9:27 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-08 12:38 [PATCH 00/10] vect: Reuse reduction accumulators between loops Richard Sandiford
2021-07-08 12:39 ` [PATCH 01/10] vect: Simplify epilogue reduction code Richard Sandiford
2021-07-08 12:58   ` Richard Biener
2021-07-08 12:39 ` [PATCH 02/10] vect: Create array_slice of live-out stmts Richard Sandiford
2021-07-08 12:58   ` Richard Biener
2021-07-08 12:39 ` [PATCH 03/10] vect: Remove new_phis from Richard Sandiford
2021-07-08 12:59   ` Richard Biener
2021-07-08 12:40 ` [PATCH 04/10] vect: Ensure reduc_inputs always have vectype Richard Sandiford
2021-07-08 13:01   ` Richard Biener
2021-07-13  9:26     ` Richard Sandiford
2021-07-08 12:40 ` [PATCH 05/10] vect: Add a vect_phi_initial_value helper function Richard Sandiford
2021-07-08 13:05   ` Richard Biener
2021-07-08 13:12     ` Richard Sandiford
2021-07-08 12:40 ` [PATCH 06/10] vect: Pass reduc_info to get_initial_defs_for_reduction Richard Sandiford
2021-07-08 13:10   ` Richard Biener
2021-07-08 16:48     ` Richard Sandiford
2021-07-09 11:33       ` Richard Biener
2021-07-08 12:41 ` [PATCH 07/10] vect: Pass reduc_info to get_initial_def_for_reduction Richard Sandiford
2021-07-08 12:41 ` [PATCH 08/10] vect: Generalise neutral_op_for_slp_reduction Richard Sandiford
2021-07-08 13:13   ` Richard Biener
2021-07-08 12:41 ` [PATCH 09/10] vect: Simplify get_initial_def_for_reduction Richard Sandiford
2021-07-08 13:14   ` Richard Biener
2021-07-08 12:43 ` [PATCH 10/10] vect: Reuse reduction accumulators between loops Richard Sandiford
2021-07-09 11:58   ` Richard Biener
2021-07-09 13:12     ` Richard Sandiford
2021-07-12  6:32       ` Richard Biener
2021-07-12 17:55         ` Richard Sandiford
2021-07-13  6:09           ` Richard Biener
2021-07-10  2:11 ` [PATCH 00/10] " Kewen.Lin
2021-07-13  9:27   ` Richard Sandiford

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).