Updated the patch to support single-lane SLP nodes, and to determine the
input vectype for the reduction PHI during traversal of the reduction
statements.

Thanks,
Feng
---
 gcc/
	PR tree-optimization/114440
	* tree-vectorizer.h (vectorizable_lane_reducing): New function
	declaration.
	* tree-vect-stmts.cc (vect_analyze_stmt): Call new function
	vectorizable_lane_reducing to analyze lane-reducing operation.
	* tree-vect-loop.cc (vect_model_reduction_cost): Remove cost computation
	code related to emulated_mixed_dot_prod.
	(vect_reduction_update_partial_vector_usage): Compute ncopies in the
	original way for single-lane SLP nodes.
	(vectorizable_lane_reducing): New function.
	(vectorizable_reduction): Allow multiple lane-reducing operations in
	loop reduction.  Move some original lane-reducing related code to
	vectorizable_lane_reducing.
	(vect_transform_reduction): Extend transformation to support reduction
	statements with mixed input vectypes.

 gcc/testsuite/
	PR tree-optimization/114440
	* gcc.dg/vect/vect-reduc-chain-1.c: New test.
	* gcc.dg/vect/vect-reduc-chain-2.c: New test.
	* gcc.dg/vect/vect-reduc-chain-3.c: New test.
	* gcc.dg/vect/vect-reduc-chain-dot-slp-1.c: New test.
	* gcc.dg/vect/vect-reduc-chain-dot-slp-2.c: New test.
	* gcc.dg/vect/vect-reduc-chain-dot-slp-3.c: New test.
	* gcc.dg/vect/vect-reduc-chain-dot-slp-4.c: New test.
	* gcc.dg/vect/vect-reduc-dot-slp-1.c: New test.
---
 .../gcc.dg/vect/vect-reduc-chain-1.c          |  62 +++
 .../gcc.dg/vect/vect-reduc-chain-2.c          |  77 +++
 .../gcc.dg/vect/vect-reduc-chain-3.c          |  66 +++
 .../gcc.dg/vect/vect-reduc-chain-dot-slp-1.c  |  95 ++++
 .../gcc.dg/vect/vect-reduc-chain-dot-slp-2.c  |  67 +++
 .../gcc.dg/vect/vect-reduc-chain-dot-slp-3.c  |  79 +++
 .../gcc.dg/vect/vect-reduc-chain-dot-slp-4.c  |  63 +++
 .../gcc.dg/vect/vect-reduc-dot-slp-1.c        |  35 ++
 gcc/tree-vect-loop.cc                         | 501 ++++++++++++------
 gcc/tree-vect-stmts.cc                        |   2 +
 gcc/tree-vectorizer.h                         |   2 +
 11 files changed, 888 insertions(+), 161 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-3.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-4.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
new file mode 100644
index 00000000000..04bfc419dbd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
@@ -0,0 +1,62 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon } */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res,
+   SIGNEDNESS_2 char *restrict a,
+   SIGNEDNESS_2 char *restrict b,
+   SIGNEDNESS_2 char *restrict c,
+   SIGNEDNESS_2 char *restrict d,
+   SIGNEDNESS_1 int *restrict e)
+{
+  for (int i = 0; i < N; ++i)
+    {
+      res += a[i] * b[i];
+      res += c[i] * d[i];
+      res += e[i];
+    }
+  return res;
+}
+
+#define BASE ((SIGNEDNESS_2 int) -1 < 0 ?
-126 : 4) +#define OFFSET 20 + +int +main (void) +{ + check_vect (); + + SIGNEDNESS_2 char a[N], b[N]; + SIGNEDNESS_2 char c[N], d[N]; + SIGNEDNESS_1 int e[N]; + int expected = 0x12345; + for (int i = 0; i < N; ++i) + { + a[i] = BASE + i * 5; + b[i] = BASE + OFFSET + i * 4; + c[i] = BASE + i * 2; + d[i] = BASE + OFFSET + i * 3; + e[i] = i; + asm volatile ("" ::: "memory"); + expected += a[i] * b[i]; + expected += c[i] * d[i]; + expected += e[i]; + } + if (f (0x12345, a, b, c, d, e) != expected) + __builtin_abort (); +} + +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 2 "vect" { target vect_sdot_qi } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c new file mode 100644 index 00000000000..6c803b80120 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c @@ -0,0 +1,77 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_dotprod_neon } */ + +#include "tree-vect.h" + +#define N 50 + +#ifndef SIGNEDNESS_1 +#define SIGNEDNESS_1 signed +#define SIGNEDNESS_2 unsigned +#define SIGNEDNESS_3 signed +#define SIGNEDNESS_4 signed +#endif + +SIGNEDNESS_1 int __attribute__ ((noipa)) +fn (SIGNEDNESS_1 int res, + SIGNEDNESS_2 char *restrict a, + SIGNEDNESS_2 char *restrict b, + SIGNEDNESS_3 char *restrict c, + SIGNEDNESS_3 char *restrict d, + SIGNEDNESS_4 short *restrict e, + SIGNEDNESS_4 short *restrict f, + SIGNEDNESS_1 int *restrict g) +{ + for (int i = 0; i < N; ++i) + { + res += a[i] * b[i]; + res += i + 1; + res += c[i] * d[i]; + res += e[i] * f[i]; + res += g[i]; + } + return res; +} + +#define BASE2 ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4) +#define BASE3 ((SIGNEDNESS_3 int) -1 < 0 ? -126 : 4) +#define BASE4 ((SIGNEDNESS_4 int) -1 < 0 ? 
-1026 : 373) +#define OFFSET 20 + +int +main (void) +{ + check_vect (); + + SIGNEDNESS_2 char a[N], b[N]; + SIGNEDNESS_3 char c[N], d[N]; + SIGNEDNESS_4 short e[N], f[N]; + SIGNEDNESS_1 int g[N]; + int expected = 0x12345; + for (int i = 0; i < N; ++i) + { + a[i] = BASE2 + i * 5; + b[i] = BASE2 + OFFSET + i * 4; + c[i] = BASE3 + i * 2; + d[i] = BASE3 + OFFSET + i * 3; + e[i] = BASE4 + i * 6; + f[i] = BASE4 + OFFSET + i * 5; + g[i] = i; + asm volatile ("" ::: "memory"); + expected += a[i] * b[i]; + expected += i + 1; + expected += c[i] * d[i]; + expected += e[i] * f[i]; + expected += g[i]; + } + if (fn (0x12345, a, b, c, d, e, f, g) != expected) + __builtin_abort (); +} + +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_sdot_qi } } } } */ +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_udot_qi } } } } */ +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_sdot_hi } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c new file mode 100644 index 00000000000..a41e4b176c4 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c @@ -0,0 +1,66 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-require-effective-target vect_int } */ + +#include "tree-vect.h" + +#define N 50 + +#ifndef SIGNEDNESS_1 +#define SIGNEDNESS_1 signed +#define SIGNEDNESS_2 unsigned +#define SIGNEDNESS_3 signed +#endif + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res, + SIGNEDNESS_2 char *restrict a, + SIGNEDNESS_2 char *restrict b, + SIGNEDNESS_3 short *restrict c, + SIGNEDNESS_3 short *restrict d, + SIGNEDNESS_1 int *restrict e) +{ + for (int i = 0; i < N; ++i) + { + short diff = a[i] - b[i]; + SIGNEDNESS_2 short abs = diff < 0 ? -diff : diff; + res += abs; + res += c[i] * d[i]; + res += e[i]; + } + return res; +} + +#define BASE2 ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4) +#define BASE3 ((SIGNEDNESS_3 int) -1 < 0 ? -1236 : 373) +#define OFFSET 20 + +int +main (void) +{ + check_vect (); + + SIGNEDNESS_2 char a[N], b[N]; + SIGNEDNESS_3 short c[N], d[N]; + SIGNEDNESS_1 int e[N]; + int expected = 0x12345; + for (int i = 0; i < N; ++i) + { + a[i] = BASE2 + i * 5; + b[i] = BASE2 - i * 4; + c[i] = BASE3 + i * 2; + d[i] = BASE3 + OFFSET + i * 3; + e[i] = i; + asm volatile ("" ::: "memory"); + short diff = a[i] - b[i]; + SIGNEDNESS_2 short abs = diff < 0 ? -diff : diff; + expected += abs; + expected += c[i] * d[i]; + expected += e[i]; + } + if (f (0x12345, a, b, c, d, e) != expected) + __builtin_abort (); +} + +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = SAD_EXPR" "vect" { target vect_udot_qi } } } */ +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target vect_sdot_hi } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c new file mode 100644 index 00000000000..c2831fbcc8e --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c @@ -0,0 +1,95 @@ +/* Disabling epilogues until we find a better way to deal with scans. 
*/ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_dotprod_neon } */ + +#include "tree-vect.h" + +#ifndef SIGNEDNESS_1 +#define SIGNEDNESS_1 signed +#define SIGNEDNESS_2 signed +#endif + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res, + SIGNEDNESS_2 char *a, + SIGNEDNESS_2 char *b, + int step, int n) +{ + for (int i = 0; i < n; i++) + { + res += a[0] * b[0]; + res += a[1] * b[1]; + res += a[2] * b[2]; + res += a[3] * b[3]; + res += a[4] * b[4]; + res += a[5] * b[5]; + res += a[6] * b[6]; + res += a[7] * b[7]; + res += a[8] * b[8]; + res += a[9] * b[9]; + res += a[10] * b[10]; + res += a[11] * b[11]; + res += a[12] * b[12]; + res += a[13] * b[13]; + res += a[14] * b[14]; + res += a[15] * b[15]; + + a += step; + b += step; + } + + return res; +} + +#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4) +#define OFFSET 20 + +int +main (void) +{ + check_vect (); + + SIGNEDNESS_2 char a[100], b[100]; + int expected = 0x12345; + int step = 16; + int n = 2; + int t = 0; + + for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i) + { + a[i] = BASE + i * 5; + b[i] = BASE + OFFSET + i * 4; + asm volatile ("" ::: "memory"); + } + + for (int i = 0; i < n; i++) + { + asm volatile ("" ::: "memory"); + expected += a[t + 0] * b[t + 0]; + expected += a[t + 1] * b[t + 1]; + expected += a[t + 2] * b[t + 2]; + expected += a[t + 3] * b[t + 3]; + expected += a[t + 4] * b[t + 4]; + expected += a[t + 5] * b[t + 5]; + expected += a[t + 6] * b[t + 6]; + expected += a[t + 7] * b[t + 7]; + expected += a[t + 8] * b[t + 8]; + expected += a[t + 9] * b[t + 9]; + expected += a[t + 10] * b[t + 10]; + expected += a[t + 11] * b[t + 11]; + expected += a[t + 12] * b[t + 12]; + expected += a[t + 13] * b[t + 13]; + expected += a[t + 14] * b[t + 14]; + expected += a[t + 15] * b[t + 15]; + t += step; + } + + if (f (0x12345, a, b, step, n) != expected) + __builtin_abort (); +} + +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 16 "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c new file mode 100644 index 00000000000..4114264a364 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c @@ -0,0 +1,67 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_dotprod_neon } */ + +#include "tree-vect.h" + +#ifndef SIGNEDNESS_1 +#define SIGNEDNESS_1 signed +#define SIGNEDNESS_2 signed +#endif + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res, + SIGNEDNESS_2 char *a, + SIGNEDNESS_2 char *b, + int n) +{ + for (int i = 0; i < n; i++) + { + res += a[5 * i + 0] * b[5 * i + 0]; + res += a[5 * i + 1] * b[5 * i + 1]; + res += a[5 * i + 2] * b[5 * i + 2]; + res += a[5 * i + 3] * b[5 * i + 3]; + res += a[5 * i + 4] * b[5 * i + 4]; + } + + return res; +} + +#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? 
-126 : 4) +#define OFFSET 20 + +int +main (void) +{ + check_vect (); + + SIGNEDNESS_2 char a[100], b[100]; + int expected = 0x12345; + int n = 18; + + for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i) + { + a[i] = BASE + i * 5; + b[i] = BASE + OFFSET + i * 4; + asm volatile ("" ::: "memory"); + } + + for (int i = 0; i < n; i++) + { + asm volatile ("" ::: "memory"); + expected += a[5 * i + 0] * b[5 * i + 0]; + expected += a[5 * i + 1] * b[5 * i + 1]; + expected += a[5 * i + 2] * b[5 * i + 2]; + expected += a[5 * i + 3] * b[5 * i + 3]; + expected += a[5 * i + 4] * b[5 * i + 4]; + } + + if (f (0x12345, a, b, n) != expected) + __builtin_abort (); +} + +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 5 "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-3.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-3.c new file mode 100644 index 00000000000..2cdecc36d16 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-3.c @@ -0,0 +1,79 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_dotprod_neon } */ + +#include "tree-vect.h" + +#ifndef SIGNEDNESS_1 +#define SIGNEDNESS_1 signed +#define SIGNEDNESS_2 signed +#endif + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res, + SIGNEDNESS_2 short *a, + SIGNEDNESS_2 short *b, + int step, int n) +{ + for (int i = 0; i < n; i++) + { + res += a[0] * b[0]; + res += a[1] * b[1]; + res += a[2] * b[2]; + res += a[3] * b[3]; + res += a[4] * b[4]; + res += a[5] * b[5]; + res += a[6] * b[6]; + res += a[7] * b[7]; + + a += step; + b += step; + } + + return res; +} + +#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -1026 : 373) +#define OFFSET 20 + +int +main (void) +{ + check_vect (); + + SIGNEDNESS_2 short a[100], b[100]; + int expected = 0x12345; + int step = 8; + int n = 2; + int t = 0; + + for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i) + { + a[i] = BASE + i * 5; + b[i] = BASE + OFFSET + i * 4; + asm volatile ("" ::: "memory"); + } + + for (int i = 0; i < n; i++) + { + asm volatile ("" ::: "memory"); + expected += a[t + 0] * b[t + 0]; + expected += a[t + 1] * b[t + 1]; + expected += a[t + 2] * b[t + 2]; + expected += a[t + 3] * b[t + 3]; + expected += a[t + 4] * b[t + 4]; + expected += a[t + 5] * b[t + 5]; + expected += a[t + 6] * b[t + 6]; + expected += a[t + 7] * b[t + 7]; + t += step; + } + + if (f (0x12345, a, b, step, n) != expected) + __builtin_abort (); +} + +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 8 "vect" { target vect_sdot_hi } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-4.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-4.c new file mode 100644 index 00000000000..32c0f30c77b --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-4.c @@ -0,0 +1,63 @@ +/* Disabling epilogues until we find a better way to deal with scans. 
*/ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_dotprod_neon } */ + +#include "tree-vect.h" + +#ifndef SIGNEDNESS_1 +#define SIGNEDNESS_1 signed +#define SIGNEDNESS_2 signed +#endif + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res, + SIGNEDNESS_2 short *a, + SIGNEDNESS_2 short *b, + int n) +{ + for (int i = 0; i < n; i++) + { + res += a[3 * i + 0] * b[3 * i + 0]; + res += a[3 * i + 1] * b[3 * i + 1]; + res += a[3 * i + 2] * b[3 * i + 2]; + } + + return res; +} + +#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -1026 : 373) +#define OFFSET 20 + +int +main (void) +{ + check_vect (); + + SIGNEDNESS_2 short a[100], b[100]; + int expected = 0x12345; + int n = 18; + + for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i) + { + a[i] = BASE + i * 5; + b[i] = BASE + OFFSET + i * 4; + asm volatile ("" ::: "memory"); + } + + for (int i = 0; i < n; i++) + { + asm volatile ("" ::: "memory"); + expected += a[3 * i + 0] * b[3 * i + 0]; + expected += a[3 * i + 1] * b[3 * i + 1]; + expected += a[3 * i + 2] * b[3 * i + 2]; + } + + if (f (0x12345, a, b, n) != expected) + __builtin_abort (); +} + +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 3 "vect" { target vect_sdot_hi } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c new file mode 100644 index 00000000000..e17d6291f75 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c @@ -0,0 +1,35 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-do compile } */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_dotprod_neon } */ + +#include "tree-vect.h" + +#ifndef SIGNEDNESS_1 +#define SIGNEDNESS_1 signed +#define SIGNEDNESS_2 signed +#endif + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res0, + SIGNEDNESS_1 int res1, + SIGNEDNESS_1 int res2, + SIGNEDNESS_1 int res3, + SIGNEDNESS_2 short *a, + SIGNEDNESS_2 short *b) +{ + for (int i = 0; i < 64; i += 4) + { + res0 += a[i + 0] * b[i + 0]; + res1 += a[i + 1] * b[i + 1]; + res2 += a[i + 2] * b[i + 2]; + res3 += a[i + 3] * b[i + 3]; + } + + return res0 ^ res1 ^ res2 ^ res3; +} + +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump-not "vectorizing stmts using SLP" "vect" } } */ diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 35c50eb72cb..fb9259d115c 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -5324,8 +5324,6 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, if (!gimple_extract_op (orig_stmt_info->stmt, &op)) gcc_unreachable (); - bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info); - if (reduction_type == EXTRACT_LAST_REDUCTION) /* No extra instructions are needed in the prologue. The loop body operations are costed in vectorizable_condition. 
*/
@@ -5360,12 +5358,8 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
 	   initial result of the data reduction, initial value of the index
 	   reduction.  */
 	prologue_stmts = 4;
-      else if (emulated_mixed_dot_prod)
-	/* We need the initial reduction value and two invariants:
-	   one that contains the minimum signed value and one that
-	   contains half of its negative.  */
-	prologue_stmts = 3;
       else
+	/* We need the initial reduction value.  */
 	prologue_stmts = 1;
       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
 					 scalar_to_vec, stmt_info, 0,
@@ -7466,7 +7460,7 @@ vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo,
       vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
       unsigned nvectors;
 
-      if (slp_node)
+      if (slp_node && SLP_TREE_LANES (slp_node) > 1)
 	nvectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
       else
 	nvectors = vect_get_num_copies (loop_vinfo, vectype_in);
@@ -7478,6 +7472,150 @@ vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo,
     }
 }
 
+/* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
+   the context of LOOP_VINFO, and vector cost will be recorded in COST_VEC.
+   Now there are three such kinds of operations: dot-prod/widen-sum/sad
+   (sum-of-absolute-differences).
+
+   For a lane-reducing operation, the loop reduction path that it lies in
+   may contain a normal operation, or another lane-reducing operation of
+   different input type size, for example:
+
+     int sum = 0;
+     for (i)
+       {
+	 ...
+	 sum += d0[i] * d1[i];      // dot-prod
+	 sum += w[i];               // widen-sum
+	 sum += abs(s0[i] - s1[i]); // sad
+	 sum += n[i];               // normal
+	 ...
+       }
+
+   Vectorization factor is essentially determined by the operation whose input
+   vectype has the most lanes ("vector(16) char" in the example), while we
+   need to choose the input vectype with the least lanes ("vector(4) int" in
+   the example) for the reduction PHI statement.  */
+
+bool
+vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
+			    slp_tree slp_node, stmt_vector_for_cost *cost_vec)
+{
+  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
+  if (!stmt)
+    return false;
+
+  enum tree_code code = gimple_assign_rhs_code (stmt);
+
+  if (!lane_reducing_op_p (code))
+    return false;
+
+  tree type = TREE_TYPE (gimple_assign_lhs (stmt));
+
+  if (!INTEGRAL_TYPE_P (type) && !SCALAR_FLOAT_TYPE_P (type))
+    return false;
+
+  /* Do not try to vectorize bit-precision reductions.  */
+  if (!type_has_mode_precision_p (type))
+    return false;
+
+  for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
+    {
+      stmt_vec_info def_stmt_info;
+      slp_tree slp_op;
+      tree op;
+      tree vectype;
+      enum vect_def_type dt;
+
+      if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_node, i, &op,
+			       &slp_op, &dt, &vectype, &def_stmt_info))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "use not simple.\n");
+	  return false;
+	}
+
+      if (!vectype)
+	{
+	  vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
+						 slp_op);
+	  if (!vectype)
+	    return false;
+	}
+
+      if (slp_node && !vect_maybe_update_slp_op_vectype (slp_op, vectype))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "incompatible vector types for invariants\n");
+	  return false;
+	}
+
+      if (i == STMT_VINFO_REDUC_IDX (stmt_info))
+	continue;
+
+      /* There should be at most one cycle def in the stmt.  */
+      if (VECTORIZABLE_CYCLE_DEF (dt))
+	return false;
+    }
+
+  stmt_vec_info reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
+
+  /* TODO: Support lane-reducing operation that does not directly participate
+     in loop reduction.  */
+  if (!reduc_info || STMT_VINFO_REDUC_IDX (stmt_info) < 0)
+    return false;
+
+  /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not
+     recognized.  */
+  gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_reduction_def);
+  gcc_assert (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION);
+
+  tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info);
+  int ncopies_for_cost;
+
+  if (slp_node && SLP_TREE_LANES (slp_node) > 1)
+    {
+      /* Now lane-reducing operations in a non-single-lane SLP node should
+	 only come from the same loop reduction path.  */
+      gcc_assert (REDUC_GROUP_FIRST_ELEMENT (stmt_info));
+      ncopies_for_cost = 1;
+    }
+  else
+    {
+      ncopies_for_cost = vect_get_num_copies (loop_vinfo, vectype_in);
+      gcc_assert (ncopies_for_cost >= 1);
+    }
+
+  if (vect_is_emulated_mixed_dot_prod (stmt_info))
+    {
+      /* We need two extra invariants: one that contains the minimum signed
+	 value and one that contains half of its negative.  */
+      int prologue_stmts = 2;
+      unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
+					scalar_to_vec, stmt_info, 0,
+					vect_prologue);
+      if (dump_enabled_p ())
+	dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
+		     "extra prologue_cost = %d .\n", cost);
+
+      /* Three dot-products and a subtraction.  */
+      ncopies_for_cost *= 4;
+    }
+
+  record_stmt_cost (cost_vec, ncopies_for_cost, vector_stmt, stmt_info, 0,
+		    vect_body);
+
+  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+    vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
+						slp_node, code, type,
+						vectype_in);
+
+  STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
+  return true;
+}
+
 /* Function vectorizable_reduction.
 
    Check if STMT_INFO performs a reduction operation that can be vectorized.
@@ -7643,7 +7781,9 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
     {
       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
-      if (STMT_VINFO_REDUC_IDX (vdef) == -1)
+      int reduc_idx = STMT_VINFO_REDUC_IDX (vdef);
+
+      if (reduc_idx == -1)
 	{
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -7689,10 +7829,43 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	      return false;
 	    }
 	}
-      else if (!stmt_info)
-	/* First non-conversion stmt.  */
-	stmt_info = vdef;
-      reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
+      else
+	{
+	  /* First non-conversion stmt.  */
+	  if (!stmt_info)
+	    stmt_info = vdef;
+
+	  if (lane_reducing_op_p (op.code))
+	    {
+	      unsigned group_size = slp_node ? SLP_TREE_LANES (slp_node) : 0;
+	      tree op_type = TREE_TYPE (op.ops[0]);
+	      tree new_vectype_in = get_vectype_for_scalar_type (loop_vinfo,
+								 op_type,
+								 group_size);
+
+	      /* The last operand of a lane-reducing operation must be the
+		 addend for the reduction.  */
+	      gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);
+
+	      if (!new_vectype_in)
+		return false;
+
+	      STMT_VINFO_REDUC_VECTYPE_IN (vdef) = new_vectype_in;
+
+	      /* To accommodate lane-reducing operations of mixed input
+		 vectypes, choose input vectype with the least lanes for the
+		 reduction PHI statement, which would result in the most
+		 ncopies for vectorized reduction results.
*/ + if (!vectype_in + || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) + < GET_MODE_SIZE (SCALAR_TYPE_MODE (op_type)))) + vectype_in = new_vectype_in; + } + else + vectype_in = STMT_VINFO_VECTYPE (phi_info); + } + + reduc_def = op.ops[reduc_idx]; reduc_chain_length++; if (!stmt_info && slp_node) slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0]; @@ -7750,6 +7923,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo, tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out; + STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in; + gimple_match_op op; if (!gimple_extract_op (stmt_info->stmt, &op)) gcc_unreachable (); @@ -7763,18 +7938,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo, if (!type_has_mode_precision_p (op.type)) return false; - /* For lane-reducing ops we're reducing the number of reduction PHIs - which means the only use of that may be in the lane-reducing operation. */ - if (lane_reducing - && reduc_chain_length != 1 - && !only_slp_reduc_chain) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "lane-reducing reduction with extra stmts.\n"); - return false; - } - /* Lane-reducing ops also never can be used in a SLP reduction group since we'll mix lanes belonging to different reductions. But it's OK to use them in a reduction chain or when the reduction group @@ -7818,9 +7981,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo, "use not simple.\n"); return false; } - if (i == STMT_VINFO_REDUC_IDX (stmt_info)) - continue; - /* For an IFN_COND_OP we might hit the reduction definition operand twice (once as definition, once as else). */ if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)]) @@ -7836,16 +7996,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo, = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op.ops[i]), slp_op[i]); - /* To properly compute ncopies we are interested in the widest - non-reduction input type in case we're looking at a widening - accumulation that we later handle in vect_transform_reduction. */ - if (lane_reducing - && vectype_op[i] - && (!vectype_in - || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) - < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i])))))) - vectype_in = vectype_op[i]; - /* Record how the non-reduction-def value of COND_EXPR is defined. ??? For a chain of multiple CONDs we'd have to match them up all. */ if (op.code == COND_EXPR && reduc_chain_length == 1) @@ -7864,19 +8014,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo, } } } - if (!vectype_in) - vectype_in = STMT_VINFO_VECTYPE (phi_info); - STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in; - - /* Each lane-reducing operation has its own input vectype, while reduction - PHI records the input vectype with least lanes. */ - if (lane_reducing) - STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in; - enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info); - STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type; + enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info); + STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type; /* If we have a condition reduction, see if we can simplify it further. 
*/ - if (v_reduc_type == COND_REDUCTION) + if (reduction_type == COND_REDUCTION) { if (slp_node && SLP_TREE_LANES (slp_node) != 1) return false; @@ -8042,8 +8184,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo, } STMT_VINFO_REDUC_CODE (reduc_info) = orig_code; + reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); - vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); if (reduction_type == TREE_CODE_REDUCTION) { /* Check whether it's ok to change the order of the computation. @@ -8329,14 +8471,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo, && loop_vinfo->suggested_unroll_factor == 1) single_defuse_cycle = true; - if (single_defuse_cycle || lane_reducing) + if (single_defuse_cycle && !lane_reducing) { gcc_assert (op.code != COND_EXPR); - /* 4. Supportable by target? */ - bool ok = true; - - /* 4.1. check support for the operation in the loop + /* 4. check support for the operation in the loop This isn't necessary for the lane reduction codes, since they can only be produced by pattern matching, and it's up to the @@ -8345,14 +8484,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo, mixed-sign dot-products can be implemented using signed dot-products. */ machine_mode vec_mode = TYPE_MODE (vectype_in); - if (!lane_reducing - && !directly_supported_p (op.code, vectype_in, optab_vector)) + if (!directly_supported_p (op.code, vectype_in, optab_vector)) { if (dump_enabled_p ()) dump_printf (MSG_NOTE, "op not supported by target.\n"); if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD) || !vect_can_vectorize_without_simd_p (op.code)) - ok = false; + single_defuse_cycle = false; else if (dump_enabled_p ()) dump_printf (MSG_NOTE, "proceeding using word mode.\n"); @@ -8365,35 +8503,12 @@ vectorizable_reduction (loop_vec_info loop_vinfo, dump_printf (MSG_NOTE, "using word mode not possible.\n"); return false; } - - /* lane-reducing operations have to go through vect_transform_reduction. - For the other cases try without the single cycle optimization. */ - if (!ok) - { - if (lane_reducing) - return false; - else - single_defuse_cycle = false; - } } STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle; - /* If the reduction stmt is one of the patterns that have lane - reduction embedded we cannot handle the case of ! single_defuse_cycle. */ - if ((ncopies > 1 && ! single_defuse_cycle) - && lane_reducing) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "multi def-use cycle not possible for lane-reducing " - "reduction operation\n"); - return false; - } - - if (slp_node - && !(!single_defuse_cycle - && !lane_reducing - && reduction_type != FOLD_LEFT_REDUCTION)) + /* Reduction type of lane-reducing operation is TREE_CODE_REDUCTION, the + below processing will be done in its own vectorizable function. */ + if (slp_node && reduction_type == FOLD_LEFT_REDUCTION) for (i = 0; i < (int) op.num_ops; i++) if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i])) { @@ -8406,28 +8521,21 @@ vectorizable_reduction (loop_vec_info loop_vinfo, vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn, reduction_type, ncopies, cost_vec); /* Cost the reduction op inside the loop if transformed via - vect_transform_reduction. Otherwise this is costed by the - separate vectorizable_* routines. */ - if (single_defuse_cycle || lane_reducing) - { - int factor = 1; - if (vect_is_emulated_mixed_dot_prod (stmt_info)) - /* Three dot-products and a subtraction. 
*/ - factor = 4; - record_stmt_cost (cost_vec, ncopies * factor, vector_stmt, - stmt_info, 0, vect_body); - } + vect_transform_reduction for non-lane-reducing operation. Otherwise + this is costed by the separate vectorizable_* routines. */ + if (single_defuse_cycle && !lane_reducing) + record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body); if (dump_enabled_p () && reduction_type == FOLD_LEFT_REDUCTION) dump_printf_loc (MSG_NOTE, vect_location, "using an in-order (fold-left) reduction.\n"); STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type; - /* All but single defuse-cycle optimized, lane-reducing and fold-left - reductions go through their own vectorizable_* routines. */ - if (!single_defuse_cycle - && !lane_reducing - && reduction_type != FOLD_LEFT_REDUCTION) + + /* All but single defuse-cycle optimized and fold-left reductions go + through their own vectorizable_* routines. */ + if ((!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION) + || lane_reducing) { stmt_vec_info tem = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info)); @@ -8533,6 +8641,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo, class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); int i; int ncopies; + int stmt_ncopies; int vec_num; stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info); @@ -8556,15 +8665,28 @@ vect_transform_reduction (loop_vec_info loop_vinfo, gphi *reduc_def_phi = as_a (phi_info->stmt); int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info); tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info); + tree stmt_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info); - if (slp_node) + /* Get input vectypes from the reduction PHI and the statement to be + transformed, these two vectypes may have different lanes when + lane-reducing operation is present. */ + if (!vectype_in) + vectype_in = STMT_VINFO_REDUC_VECTYPE (reduc_info); + + if (!stmt_vectype_in) + stmt_vectype_in = STMT_VINFO_VECTYPE (stmt_info); + + if (slp_node && SLP_TREE_LANES (slp_node) > 1) { ncopies = 1; + stmt_ncopies = 1; vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); } else { ncopies = vect_get_num_copies (loop_vinfo, vectype_in); + stmt_ncopies = vect_get_num_copies (loop_vinfo, stmt_vectype_in); + gcc_assert (stmt_ncopies >= 1 && stmt_ncopies <= ncopies); vec_num = 1; } @@ -8573,14 +8695,10 @@ vect_transform_reduction (loop_vec_info loop_vinfo, vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo); - bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in); - + bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, + stmt_vectype_in); /* Transform. 
*/
-  tree new_temp = NULL_TREE;
-  auto_vec<tree> vec_oprnds0;
-  auto_vec<tree> vec_oprnds1;
-  auto_vec<tree> vec_oprnds2;
-  tree def0;
+  auto_vec<tree> vec_oprnds[3];
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
@@ -8604,8 +8722,6 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 		     == op.ops[internal_fn_else_index ((internal_fn) code)]));
     }
 
-  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
-
   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
   if (reduction_type == FOLD_LEFT_REDUCTION)
     {
@@ -8613,7 +8729,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
       gcc_assert (code.is_tree_code () || cond_fn_p);
       return vectorize_fold_left_reduction
 	  (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
-	   code, reduc_fn, op.ops, op.num_ops, vectype_in,
+	   code, reduc_fn, op.ops, op.num_ops, stmt_vectype_in,
 	   reduc_index, masks, lens);
     }
 
@@ -8624,55 +8740,124 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
   tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
 
-  /* Get NCOPIES vector definitions for all operands except the reduction
-     definition.  */
-  if (!cond_fn_p)
+  gcc_assert (reduc_index < 3);
+
+  if (slp_node && SLP_TREE_LANES (slp_node) > 1)
     {
-      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
-			 single_defuse_cycle && reduc_index == 0
-			 ? NULL_TREE : op.ops[0], &vec_oprnds0,
-			 single_defuse_cycle && reduc_index == 1
-			 ? NULL_TREE : op.ops[1], &vec_oprnds1,
-			 op.num_ops == 3
-			 && !(single_defuse_cycle && reduc_index == 2)
-			 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
+      gcc_assert (!single_defuse_cycle);
+
+      for (i = 0; i < MIN (3, (int) op.num_ops); i++)
+	vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds[i]);
     }
   else
     {
-      /* For a conditional operation pass the truth type as mask
-	 vectype.  */
-      gcc_assert (single_defuse_cycle
-		  && (reduc_index == 1 || reduc_index == 2));
-      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
-			 op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
-			 reduc_index == 1 ? NULL_TREE : op.ops[1],
-			 NULL_TREE, &vec_oprnds1,
-			 reduc_index == 2 ? NULL_TREE : op.ops[2],
-			 NULL_TREE, &vec_oprnds2);
-    }
+      /* The input vectype of the reduction PHI determines copies of
+	 vectorized def-use cycles, which might be more than effective copies
+	 of vectorized lane-reducing reduction statements.  This could be
+	 complemented by generating extra trivial pass-through copies.  For
+	 example:
+
+	   int sum = 0;
+	   for (i)
+	     {
+	       sum += d0[i] * d1[i];      // dot-prod
+	       sum += abs(s0[i] - s1[i]); // sad
+	       sum += n[i];               // normal
+	     }
+
+	 The vector size is 128-bit, vectorization factor is 16.  Reduction
+	 statements would be transformed as:
+
+	   vector<4> int sum_v0 = { 0, 0, 0, 0 };
+	   vector<4> int sum_v1 = { 0, 0, 0, 0 };
+	   vector<4> int sum_v2 = { 0, 0, 0, 0 };
+	   vector<4> int sum_v3 = { 0, 0, 0, 0 };
+
+	   for (i / 16)
+	     {
+	       sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
+	       sum_v1 = sum_v1;  // copy
+	       sum_v2 = sum_v2;  // copy
+	       sum_v3 = sum_v3;  // copy
+
+	       sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
+	       sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
+	       sum_v2 = sum_v2;  // copy
+	       sum_v3 = sum_v3;  // copy
+
+	       sum_v0 += n_v0[i: 0 ~ 3 ];
+	       sum_v1 += n_v1[i: 4 ~ 7 ];
+	       sum_v2 += n_v2[i: 8 ~ 11];
+	       sum_v3 += n_v3[i: 12 ~ 15];
+	     }
+       */
+
+      for (i = 0; i < MIN (3, (int) op.num_ops); i++)
+	{
+	  tree vectype = NULL_TREE;
+	  int used_ncopies = ncopies;
+
+	  if (cond_fn_p && i == 0)
+	    {
+	      /* For a conditional operation pass the truth type as mask
+		 vectype.  */
+	      gcc_assert (single_defuse_cycle && reduc_index > 0);
+	      vectype = truth_type_for (vectype_in);
+	    }
 
-  /* For single def-use cycles get one copy of the vectorized reduction
-     definition.  */
-  if (single_defuse_cycle)
-    {
-      gcc_assert (!slp_node);
-      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
-				     op.ops[reduc_index],
-				     reduc_index == 0 ? &vec_oprnds0
-				     : (reduc_index == 1 ? &vec_oprnds1
-					: &vec_oprnds2));
+
+	  if (i != reduc_index)
+	    {
+	      /* For non-reduction operand, deduce effective copies that are
+		 involved in vectorized def-use cycles based on the input
+		 vectype of the reduction statement.  */
+	      used_ncopies = stmt_ncopies;
+	    }
+	  else if (single_defuse_cycle)
+	    {
+	      /* For single def-use cycles get one copy of the vectorized
+		 reduction definition.  */
+	      used_ncopies = 1;
+	    }
+
+	  if (slp_node)
+	    vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds[i]);
+	  else
+	    vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, used_ncopies,
+					   op.ops[i], &vec_oprnds[i], vectype);
+
+	  if (used_ncopies < ncopies)
+	    vec_oprnds[i].safe_grow_cleared (ncopies);
+	}
+    }
 
+  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
   bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
+  tree def0;
 
-  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
+  FOR_EACH_VEC_ELT (vec_oprnds[0], i, def0)
     {
       gimple *new_stmt;
-      tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
-      if (masked_loop_p && !mask_by_cond_expr)
+      tree new_temp = NULL_TREE;
+      tree vop[3] = { def0, vec_oprnds[1][i], NULL_TREE };
+
+      if (!vop[0] || !vop[1])
+	{
+	  tree reduc_vop = vec_oprnds[reduc_index][i];
+
+	  /* Insert trivial copy if no need to generate vectorized
+	     statement.  */
+	  gcc_assert (reduc_vop && stmt_ncopies < ncopies);
+
+	  new_stmt = gimple_build_assign (vec_dest, reduc_vop);
+	  new_temp = make_ssa_name (vec_dest, new_stmt);
+	  gimple_set_lhs (new_stmt, new_temp);
+	  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
+	}
+      else if (masked_loop_p && !mask_by_cond_expr)
 	{
-	  /* No conditional ifns have been defined for dot-product yet.  */
-	  gcc_assert (code != DOT_PROD_EXPR);
+	  /* No conditional ifns have been defined for dot-product and sad
+	     yet.  */
+	  gcc_assert (code != DOT_PROD_EXPR && code != SAD_EXPR);
 
 	  /* Make sure that the reduction accumulator is vop[0].
*/ if (reduc_index == 1) @@ -8681,7 +8866,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo, std::swap (vop[0], vop[1]); } tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks, - vec_num * ncopies, vectype_in, i); + vec_num * stmt_ncopies, + stmt_vectype_in, i); gcall *call = gimple_build_call_internal (cond_fn, 4, mask, vop[0], vop[1], vop[0]); new_temp = make_ssa_name (vec_dest, call); @@ -8693,12 +8879,13 @@ vect_transform_reduction (loop_vec_info loop_vinfo, else { if (op.num_ops >= 3) - vop[2] = vec_oprnds2[i]; + vop[2] = vec_oprnds[2][i]; if (masked_loop_p && mask_by_cond_expr) { tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks, - vec_num * ncopies, vectype_in, i); + vec_num * stmt_ncopies, + stmt_vectype_in, i); build_vect_cond_expr (code, vop, mask, gsi); } @@ -8725,16 +8912,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo, if (slp_node) slp_node->push_vec_def (new_stmt); - else if (single_defuse_cycle - && i < ncopies - 1) - { - if (reduc_index == 0) - vec_oprnds0.safe_push (gimple_get_lhs (new_stmt)); - else if (reduc_index == 1) - vec_oprnds1.safe_push (gimple_get_lhs (new_stmt)); - else if (reduc_index == 2) - vec_oprnds2.safe_push (gimple_get_lhs (new_stmt)); - } + else if (single_defuse_cycle && i < ncopies - 1) + vec_oprnds[reduc_index][i + 1] = gimple_get_lhs (new_stmt); else STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt); } diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index dbdb59054e0..81036235a27 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -13357,6 +13357,8 @@ vect_analyze_stmt (vec_info *vinfo, NULL, NULL, node, cost_vec) || vectorizable_load (vinfo, stmt_info, NULL, NULL, node, cost_vec) || vectorizable_store (vinfo, stmt_info, NULL, NULL, node, cost_vec) + || vectorizable_lane_reducing (as_a (vinfo), + stmt_info, node, cost_vec) || vectorizable_reduction (as_a (vinfo), stmt_info, node, node_instance, cost_vec) || vectorizable_induction (as_a (vinfo), stmt_info, diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 6bb0f5c3a56..3f7db707d97 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -2443,6 +2443,8 @@ extern loop_vec_info vect_create_loop_vinfo (class loop *, vec_info_shared *, extern bool vectorizable_live_operation (vec_info *, stmt_vec_info, slp_tree, slp_instance, int, bool, stmt_vector_for_cost *); +extern bool vectorizable_lane_reducing (loop_vec_info, stmt_vec_info, + slp_tree, stmt_vector_for_cost *); extern bool vectorizable_reduction (loop_vec_info, stmt_vec_info, slp_tree, slp_instance, stmt_vector_for_cost *); -- 2.17.1 ________________________________________ From: Richard Biener Sent: Tuesday, June 4, 2024 9:17 PM To: Feng Xue OS Cc: Tamar Christina; gcc-patches@gcc.gnu.org Subject: Re: [PATCH 5/6] vect: Support multiple lane-reducing operations for loop reduction [PR114440] On Sun, Jun 2, 2024 at 4:13?PM Feng Xue OS wrote: > > Please see my comments below. > > Thanks, > Feng > > > On Thu, May 30, 2024 at 4:55?PM Feng Xue OS wrote: > >> > >> For lane-reducing operation(dot-prod/widen-sum/sad) in loop reduction, current > >> vectorizer could only handle the pattern if the reduction chain does not > >> contain other operation, no matter the other is normal or lane-reducing. > >> > >> Actually, to allow multiple arbitray lane-reducing operations, we need to > >> support vectorization of loop reduction chain with mixed input vectypes. 
Since > >> lanes of vectype may vary with operation, the effective ncopies of vectorized > >> statements for operation also may not be same to each other, this causes > >> mismatch on vectorized def-use cycles. A simple way is to align all operations > >> with the one that has the most ncopies, the gap could be complemented by > >> generating extra trival pass-through copies. For example: > >> > >> int sum = 0; > >> for (i) > >> { > >> sum += d0[i] * d1[i]; // dot-prod > >> sum += w[i]; // widen-sum > >> sum += abs(s0[i] - s1[i]); // sad > >> sum += n[i]; // normal > >> } > >> > >> The vector size is 128-bit?vectorization factor is 16. Reduction statements > >> would be transformed as: > >> > >> vector<4> int sum_v0 = { 0, 0, 0, 0 }; > >> vector<4> int sum_v1 = { 0, 0, 0, 0 }; > >> vector<4> int sum_v2 = { 0, 0, 0, 0 }; > >> vector<4> int sum_v3 = { 0, 0, 0, 0 }; > >> > >> for (i / 16) > >> { > >> sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0); > >> sum_v1 = sum_v1; // copy > >> sum_v2 = sum_v2; // copy > >> sum_v3 = sum_v3; // copy > >> > >> sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0); > >> sum_v1 = sum_v1; // copy > >> sum_v2 = sum_v2; // copy > >> sum_v3 = sum_v3; // copy > >> > >> sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0); > >> sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1); > >> sum_v2 = sum_v2; // copy > >> sum_v3 = sum_v3; // copy > >> > >> sum_v0 += n_v0[i: 0 ~ 3 ]; > >> sum_v1 += n_v1[i: 4 ~ 7 ]; > >> sum_v2 += n_v2[i: 8 ~ 11]; > >> sum_v3 += n_v3[i: 12 ~ 15]; > >> } > >> > >> Thanks, > >> Feng > >> > >> ... > >> > >> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > >> index 20c99f11e9a..b5849dbb08a 100644 > >> --- a/gcc/tree-vect-loop.cc > >> +++ b/gcc/tree-vect-loop.cc > >> @@ -5322,8 +5322,6 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, > >> if (!gimple_extract_op (orig_stmt_info->stmt, &op)) > >> gcc_unreachable (); > >> > >> - bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info); > >> - > >> if (reduction_type == EXTRACT_LAST_REDUCTION) > >> /* No extra instructions are needed in the prologue. The loop body > >> operations are costed in vectorizable_condition. */ > >> @@ -5358,12 +5356,8 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, > >> initial result of the data reduction, initial value of the index > >> reduction. */ > >> prologue_stmts = 4; > >> - else if (emulated_mixed_dot_prod) > >> - /* We need the initial reduction value and two invariants: > >> - one that contains the minimum signed value and one that > >> - contains half of its negative. */ > >> - prologue_stmts = 3; > >> else > >> + /* We need the initial reduction value. */ > >> prologue_stmts = 1; > >> prologue_cost += record_stmt_cost (cost_vec, prologue_stmts, > >> scalar_to_vec, stmt_info, 0, > >> @@ -7464,6 +7458,169 @@ vect_reduction_use_partial_vector (loop_vec_info loop_vinfo, > >> } > >> } > >> > >> +/* Check if STMT_INFO is a lane-reducing operation that can be vectorized in > >> + the context of LOOP_VINFO, and vector cost will be recorded in COST_VEC. > >> + Now there are three such kinds of operations: dot-prod/widen-sum/sad > >> + (sum-of-absolute-differences). > >> + > >> + For a lane-reducing operation, the loop reduction path that it lies in, > >> + may contain normal operation, or other lane-reducing operation of different > >> + input type size, an example as: > >> + > >> + int sum = 0; > >> + for (i) > >> + { > >> + ... 
> >> + sum += d0[i] * d1[i]; // dot-prod > >> + sum += w[i]; // widen-sum > >> + sum += abs(s0[i] - s1[i]); // sad > >> + sum += n[i]; // normal > >> + ... > >> + } > >> + > >> + Vectorization factor is essentially determined by operation whose input > >> + vectype has the most lanes ("vector(16) char" in the example), while we > >> + need to choose input vectype with the least lanes ("vector(4) int" in the > >> + example) for the reduction PHI statement. */ > >> + > >> +bool > >> +vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info, > >> + slp_tree slp_node, stmt_vector_for_cost *cost_vec) > >> +{ > >> + gassign *stmt = dyn_cast (stmt_info->stmt); > >> + if (!stmt) > >> + return false; > >> + > >> + enum tree_code code = gimple_assign_rhs_code (stmt); > >> + > >> + if (!lane_reducing_op_p (code)) > >> + return false; > > > > Can you make sure to return false if STMT_VINFO_REDUC_IDX == -1 > > thus the op is not part of a reduction chain/path? > > > > As I planed, in the 2nd stage patches WIP, this function will also handle > lane-reducing operation that does not directly participate reduction, like: > > temp = dot_prod1 + dot_prod2; > sum += temp; > > In this case, STMT_VINFO_REDUC_IDX of dot_prod1/2 == -1 > > For current work, the check is needed to filter out non-reduction statement, > but since it is expected to be removed later, so the check is placed at a late > point. > > >> + tree type = TREE_TYPE (gimple_assign_lhs (stmt)); > >> + > >> + if (!INTEGRAL_TYPE_P (type) && !SCALAR_FLOAT_TYPE_P (type)) > >> + return false; > >> + > >> + /* Do not try to vectorize bit-precision reductions. */ > >> + if (!type_has_mode_precision_p (type)) > >> + return false; > >> + > >> + tree vectype_in = NULL_TREE; > >> + > >> + for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++) > >> + { > >> + stmt_vec_info def_stmt_info; > >> + slp_tree slp_op; > >> + tree op; > >> + tree vectype; > >> + enum vect_def_type dt; > >> + > >> + if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_node, i, &op, > >> + &slp_op, &dt, &vectype, &def_stmt_info)) > >> + { > >> + if (dump_enabled_p ()) > >> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > >> + "use not simple.\n"); > >> + return false; > >> + } > >> + > >> + if (!vectype) > >> + { > >> + vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op), > >> + slp_op); > >> + if (!vectype) > >> + return false; > >> + } > >> + > >> + if (slp_node && !vect_maybe_update_slp_op_vectype (slp_op, vectype)) > > > > Please avoid this during transform. > > This function is only for analysis not transform. > > >> + { > >> + if (dump_enabled_p ()) > >> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > >> + "incompatible vector types for invariants\n"); > >> + return false; > >> + } > >> + > >> + if (i == STMT_VINFO_REDUC_IDX (stmt_info)) > >> + continue; > >> + > >> + /* There should be at most one cycle def in the stmt. */ > >> + if (VECTORIZABLE_CYCLE_DEF (dt)) > >> + return false; > >> + > >> + /* To properly compute ncopies we are interested in the widest > >> + non-reduction input type in case we're looking at a widening > >> + accumulation that we later handle in vect transformation. 
*/ > >> + if (!vectype_in > >> + || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) > >> + < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))))) > >> + vectype_in = vectype; > >> + } > >> + > >> + STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in; > > > > As said below I wonder where we would need STMT_VINFO_REDUC_VECTYPE_IN. > > At least you should avoid re-setting this when !cost_vec aka during transform, > > possibly instead asserting you re-compute the same type (or simply > > skip the above > > loop and set vectype_in from STMT_VINFO_REDUC_VECTYPE_IN which then > > gets a good use). > > Likewise. > > > > >> + stmt_vec_info reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)); > >> + > >> + /* TODO: Support lane-reducing operation that does not directly participate > >> + in loop reduction. */ > >> + if (!reduc_info || STMT_VINFO_REDUC_IDX (stmt_info) < 0) > >> + return false; > >> + > >> + /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not > >> + recoginized. */ > >> + gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_reduction_def); > >> + gcc_assert (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION); > >> + > >> + tree vphi_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info); > >> + > >> + /* To accommodate lane-reducing operations of mixed input vectypes, choose > >> + input vectype with the least lanes for the reduction PHI statement, which > >> + would result in the most ncopies for vectorized reduction results. */ > >> + if (!vphi_vectype_in > >> + || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) > >> + > GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vphi_vectype_in))))) > >> + STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in; > > > > Likewise. > > > >> + int ncopies_for_cost; > >> + > >> + if (slp_node) > >> + { > >> + /* Now lane-reducing operations in a slp node should only come from > >> + the same loop reduction path. */ > >> + gcc_assert (REDUC_GROUP_FIRST_ELEMENT (stmt_info)); > >> + ncopies_for_cost = 1; > >> + } > >> + else > >> + { > >> + ncopies_for_cost = vect_get_num_copies (loop_vinfo, vectype_in); > >> + gcc_assert (ncopies_for_cost >= 1); > >> + } > >> + > >> + if (vect_is_emulated_mixed_dot_prod (stmt_info)) > >> + { > >> + /* We need extra two invariants: one that contains the minimum signed > >> + value and one that contains half of its negative. */ > >> + int prologue_stmts = 2; > >> + unsigned cost = record_stmt_cost (cost_vec, prologue_stmts, > >> + scalar_to_vec, stmt_info, 0, > >> + vect_prologue); > >> + if (dump_enabled_p ()) > >> + dump_printf (MSG_NOTE, "vectorizable_lane_reducing: " > >> + "extra prologue_cost = %d .\n", cost); > >> + > >> + /* Three dot-products and a subtraction. */ > >> + ncopies_for_cost *= 4; > >> + } > >> + > >> + record_stmt_cost (cost_vec, ncopies_for_cost, vector_stmt, stmt_info, 0, > >> + vect_body); > >> + > >> + vect_reduction_use_partial_vector (loop_vinfo, reduc_info, slp_node, code, > >> + type, vectype_in); > >> + > >> + STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; > > > > Uh, so those all go through vect_transform_reduction. I see. > > > > I fail to see a check for whether the target supports the lane-reducing op. > > vectorizable_reduction only checks the last one. Currently the check > > might be redundant with what pattern recognition checks but it's still > > incomplete compared to the check in vectorizable_reduction. 
> > In the original vectorizable_reduction, the target support check is deliberately > skipped for lane-reducing operations. The reason is part as you said, moreover, > other check would always not be executed. > > if (single_defuse_cycle || lane_reduc_code_p) > { > gcc_assert (op.code != COND_EXPR); > > /* 4. Supportable by target? */ > bool ok = true; > > /* 4.1. check support for the operation in the loop > > This isn't necessary for the lane reduction codes, since they > can only be produced by pattern matching, and it's up to the > pattern matcher to test for support. The main reason for > specifically skipping this step is to avoid rechecking whether > mixed-sign dot-products can be implemented using signed > dot-products. */ > machine_mode vec_mode = TYPE_MODE (vectype_in); > if (!lane_reduc_code_p //<----------- skip > && !directly_supported_p (op.code, vectype_in, optab_vector)) > { > if (dump_enabled_p ()) > dump_printf (MSG_NOTE, "op not supported by target.\n"); > if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD) > || !vect_can_vectorize_without_simd_p (op.code)) > ok = false; > else > if (dump_enabled_p ()) > dump_printf (MSG_NOTE, "proceeding using word mode.\n"); > } > > // <----- always false for lane-reducing op > > if (vect_emulated_vector_p (vectype_in) > && !vect_can_vectorize_without_simd_p (op.code)) > { > if (dump_enabled_p ()) > dump_printf (MSG_NOTE, "using word mode not possible.\n"); > return false; > } > > > > >> + return true; > >> +} > >> + > >> /* Function vectorizable_reduction. > >> > >> Check if STMT_INFO performs a reduction operation that can be vectorized. > >> @@ -7609,6 +7766,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > >> (gimple_bb (reduc_def_phi)->loop_father)); > >> unsigned reduc_chain_length = 0; > >> bool only_slp_reduc_chain = true; > >> + bool only_lane_reducing = true; > >> stmt_info = NULL; > >> slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL; > >> while (reduc_def != PHI_RESULT (reduc_def_phi)) > >> @@ -7659,9 +7817,16 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > >> return false; > >> } > >> } > >> - else if (!stmt_info) > >> - /* First non-conversion stmt. */ > >> - stmt_info = vdef; > >> + else > >> + { > >> + /* First non-conversion stmt. */ > >> + if (!stmt_info) > >> + stmt_info = vdef; > >> + > >> + if (!lane_reducing_op_p (op.code)) > >> + only_lane_reducing = false; > >> + } > >> + > >> reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)]; > >> reduc_chain_length++; > >> if (!stmt_info && slp_node) > >> @@ -7733,18 +7898,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > >> if (!type_has_mode_precision_p (op.type)) > >> return false; > >> > >> - /* For lane-reducing ops we're reducing the number of reduction PHIs > >> - which means the only use of that may be in the lane-reducing operation. */ > >> - if (lane_reducing > >> - && reduc_chain_length != 1 > >> - && !only_slp_reduc_chain) > >> - { > >> - if (dump_enabled_p ()) > >> - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > >> - "lane-reducing reduction with extra stmts.\n"); > >> - return false; > >> - } > >> - > >> /* Lane-reducing ops also never can be used in a SLP reduction group > >> since we'll mix lanes belonging to different reductions. 
But it's
> >>       OK to use them in a reduction chain or when the reduction group
> >> @@ -7788,9 +7941,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>  			     "use not simple.\n");
> >>  	  return false;
> >>  	}
> >> -      if (i == STMT_VINFO_REDUC_IDX (stmt_info))
> >> -	continue;
> >> -
> >
> > So within this loop we analyze the "main" operation, and while I do not
> > exactly remember why we skip the op leading to the PHI, I don't understand
> > why you want to look at it for the multi lane-reducing case (the
> > accumulator always has the same type, no?).
> >
> > In any case this just looks at a single (the last) lane-reducing or even
> > not lane-reducing op.
> >
>
> This comparison is redundant, since it could be covered by the following
> comparison statement.  The change should have been placed in a separate
> patch, but for convenience I made it here.
>
>   /* For an IFN_COND_OP we might hit the reduction definition operand
>      twice (once as definition, once as else).  */
>   if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
>     continue;
>
>   /* There should be only one cycle def in the stmt, the one
>      leading to reduc_def.  */
>   if (VECTORIZABLE_CYCLE_DEF (dt))
>     return false;
>
> >>        /* For an IFN_COND_OP we might hit the reduction definition operand
> >>  	  twice (once as definition, once as else).  */
> >>        if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
> >> @@ -7836,17 +7986,21 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>      }
> >>    if (!vectype_in)
> >>      vectype_in = STMT_VINFO_VECTYPE (phi_info);
> >> -  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
> >>
> >> -  /* Each lane-reducing operation has its own input vectype, while reduction
> >> -     PHI records the input vectype with least lanes.  */
> >> -  if (lane_reducing)
> >> -    STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in;
> >> -
> >> -  enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
> >> -  STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
> >> +  /* If there is a normal (non-lane-reducing) operation in the loop reduction
> >> +     path, to ensure there will be enough copies to hold vectorized results
> >> +     of the operation, we need to set the input vectype of the reduction PHI
> >> +     to be the same as the reduction output vectype somewhere; here is a
> >> +     suitable place.  Otherwise the input vectype is set to the one with the
> >> +     least lanes, which can only be determined in the vectorizable analysis
> >> +     routine of a lane-reducing operation.  */
> >
> > But we are using vectype_in to compute ncopies which is used in cost analysis.
>
> The vectype_in only impacts the cost analysis for a lane-reducing op, since
> the function vect_is_emulated_mixed_dot_prod needs it, and that function is
> referenced by the cost analysis.  In the previous patch, we bound the
> vectype_in to each lane-reducing op and also adjusted the code of the
> function accordingly, so this would not be a problem.
>
> > You say this might not be the final ncopies?  Note the vectorization
> > factor is already fixed as well as (output) vector types of the
> > lane-reducing ops.  So
>
> The vectype_in is incrementally updated while analyzing the vectorizability
> of lane-reducing ops.  So before transform, the type should be determined.
>
> > shouldn't we simply pick that up in the loop walking the use-def chain via
> > REDUC_IDX at the start of this function?
>
> I thought about doing it that way.  OK, will consider it again.
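A rough sketch of that suggestion, folded into the existing use-def walk
(all names except get_stmt_input_vectype appear in the patch;
get_stmt_input_vectype is a hypothetical placeholder for however operand
analysis exposes a statement's input vector type):

    tree least_lane_vectype = NULL_TREE;

    /* Inside the while (reduc_def != PHI_RESULT (reduc_def_phi)) walk,
       track the least-lane input vectype seen on the reduction path.  */
    if (lane_reducing_op_p (op.code))
      {
	tree vt = get_stmt_input_vectype (vdef);   /* hypothetical helper */
	if (!least_lane_vectype
	    || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vt)))
		> GET_MODE_SIZE (SCALAR_TYPE_MODE
				   (TREE_TYPE (least_lane_vectype)))))
	  least_lane_vectype = vt;
      }
    else
      only_lane_reducing = false;

    /* After the walk, the PHI's input vectype is fully determined.  */
    STMT_VINFO_REDUC_VECTYPE_IN (reduc_info)
      = only_lane_reducing ? least_lane_vectype
			   : STMT_VINFO_VECTYPE (phi_info);

This would make the type available up front instead of updating it
incrementally from each vectorizable_lane_reducing call.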
>
> > I'm unsure as to why we need STMT_VINFO_REDUC_VECTYPE_IN at all (I don't
> > remember adding that); it should be readily available from operand
> > analysis.  The docs for that aren't very enlightening either (there's
> > also REDUC_VECTYPE, in addition to VECTYPE - huh).
>
> In the old code, in which only one lane-reducing op is allowed in a loop
> reduction, this type could be computed on demand.
>
> But for multiple lane-reducing ops, we need to know the vectype_in types
> of all ops in order to determine a proper vectype_in for the PHI statement,
> and traversing those ops to compute the types on demand would not be a
> good way.  Additionally, during transform, the original CFG flow is broken
> and could not be used.
>
> >> +  if (!only_lane_reducing)
> >> +    STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = STMT_VINFO_VECTYPE (phi_info);
> >> +
> >> +  enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info);
> >> +  STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type;
> >>    /* If we have a condition reduction, see if we can simplify it further.  */
> >> -  if (v_reduc_type == COND_REDUCTION)
> >> +  if (reduction_type == COND_REDUCTION)
> >>      {
> >>        if (slp_node)
> >>  	return false;
> >> @@ -8012,8 +8166,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>      }
> >>
> >>    STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
> >> +  reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
> >>
> >> -  vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
> >>    if (reduction_type == TREE_CODE_REDUCTION)
> >>      {
> >>        /* Check whether it's ok to change the order of the computation.
> >> @@ -8287,14 +8441,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>        && loop_vinfo->suggested_unroll_factor == 1)
> >>      single_defuse_cycle = true;
> >>
> >> -  if (single_defuse_cycle || lane_reducing)
> >> +  if (single_defuse_cycle && !lane_reducing)
> >>      {
> >>        gcc_assert (op.code != COND_EXPR);
> >>
> >> -      /* 4. Supportable by target?  */
> >> -      bool ok = true;
> >> -
> >> -      /* 4.1. check support for the operation in the loop
> >> +      /* 4. check support for the operation in the loop
> >>
> >>  	  This isn't necessary for the lane reduction codes, since they
> >>  	  can only be produced by pattern matching, and it's up to the
> >> @@ -8303,14 +8454,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>  	  mixed-sign dot-products can be implemented using signed
> >>  	  dot-products.  */
> >>        machine_mode vec_mode = TYPE_MODE (vectype_in);
> >> -      if (!lane_reducing
> >> -	  && !directly_supported_p (op.code, vectype_in, optab_vector))
> >> +      if (!directly_supported_p (op.code, vectype_in, optab_vector))
> >>  	{
> >>  	  if (dump_enabled_p ())
> >>  	    dump_printf (MSG_NOTE, "op not supported by target.\n");
> >>  	  if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
> >>  	      || !vect_can_vectorize_without_simd_p (op.code))
> >> -	    ok = false;
> >> +	    single_defuse_cycle = false;
> >>  	  else
> >>  	    if (dump_enabled_p ())
> >>  	      dump_printf (MSG_NOTE, "proceeding using word mode.\n");
> >> @@ -8323,35 +8473,12 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>  	  dump_printf (MSG_NOTE, "using word mode not possible.\n");
> >>  	  return false;
> >>  	}
> >> -
> >> -      /* lane-reducing operations have to go through vect_transform_reduction.
> >> -	  For the other cases try without the single cycle optimization.  */
> >> -      if (!ok)
> >> -	{
> >> -	  if (lane_reducing)
> >> -	    return false;
> >> -	  else
> >> -	    single_defuse_cycle = false;
> >> -	}
> >>      }
> >>    STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
> >>
> >> -  /* If the reduction stmt is one of the patterns that have lane
> >> -     reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
> >> -  if ((ncopies > 1 && ! single_defuse_cycle)
> >> -      && lane_reducing)
> >> -    {
> >> -      if (dump_enabled_p ())
> >> -	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> >> -			 "multi def-use cycle not possible for lane-reducing "
> >> -			 "reduction operation\n");
> >> -      return false;
> >> -    }
> >> -
> >> -  if (slp_node
> >> -      && !(!single_defuse_cycle
> >> -	   && !lane_reducing
> >> -	   && reduction_type != FOLD_LEFT_REDUCTION))
> >> +  /* The reduction type of a lane-reducing operation is TREE_CODE_REDUCTION;
> >> +     the processing below will be done in its own vectorizable function.  */
> >> +  if (slp_node && reduction_type == FOLD_LEFT_REDUCTION)
> >>      for (i = 0; i < (int) op.num_ops; i++)
> >>        if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
> >>  	{
> >> @@ -8364,28 +8491,21 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>    vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
> >>  			      reduction_type, ncopies, cost_vec);
> >>    /* Cost the reduction op inside the loop if transformed via
> >> -     vect_transform_reduction.  Otherwise this is costed by the
> >> -     separate vectorizable_* routines.  */
> >> -  if (single_defuse_cycle || lane_reducing)
> >> -    {
> >> -      int factor = 1;
> >> -      if (vect_is_emulated_mixed_dot_prod (stmt_info))
> >> -	/* Three dot-products and a subtraction.  */
> >> -	factor = 4;
> >> -      record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
> >> -			stmt_info, 0, vect_body);
> >> -    }
> >> +     vect_transform_reduction for a non-lane-reducing operation.  Otherwise
> >> +     this is costed by the separate vectorizable_* routines.  */
> >> +  if (single_defuse_cycle && !lane_reducing)
> >> +    record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
> >>
> >>    if (dump_enabled_p ()
> >>        && reduction_type == FOLD_LEFT_REDUCTION)
> >>      dump_printf_loc (MSG_NOTE, vect_location,
> >>  		     "using an in-order (fold-left) reduction.\n");
> >>    STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
> >> -  /* All but single defuse-cycle optimized, lane-reducing and fold-left
> >> -     reductions go through their own vectorizable_* routines.  */
> >> -  if (!single_defuse_cycle
> >> -      && !lane_reducing
> >> -      && reduction_type != FOLD_LEFT_REDUCTION)
> >> +
> >> +  /* All but single defuse-cycle optimized and fold-left reductions go
> >> +     through their own vectorizable_* routines.  */
> >> +  if ((!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
> >> +      || lane_reducing)
> >
> > So single-def-use-cycle but lane-reducing ops no longer need
> > to go through vect_transform_reduction?  How do you handle those
> > but fail to handle non-lane-reducing ops this way?
>
> Emm, all kinds of lane-reducing ops go into vectorizable_lane_reducing (),
> no matter whether single-def-use or not; in that function, STMT_VINFO_TYPE
> is set to reduc_vec_info_type, so the transform will be done inside
> vect_transform_reduction.
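(A simplified sketch of that dispatch, trimmed to the relevant case; the
switch shape follows vect_transform_stmt on trunk, so treat details as
approximate:)

    /* Analysis: vect_analyze_stmt tries vectorizable_lane_reducing before
       vectorizable_reduction; on success the statement is marked with
       reduc_vec_info_type.  Transform, in vect_transform_stmt:  */
    switch (STMT_VINFO_TYPE (stmt_info))
      {
      case reduc_vec_info_type:
	done = vect_transform_reduction (as_a <loop_vec_info> (vinfo),
					 stmt_info, gsi, &vec_stmt, slp_node);
	gcc_assert (done);
	break;
      /* ... other statement kinds ...  */
      }

So a lane-reducing statement reaches vect_transform_reduction whether or
not the reduction is a single def-use cycle.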
> >
>
> >>      {
> >>        stmt_vec_info tem
> >>  	 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
> >> @@ -8490,6 +8610,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> >>    int i;
> >>    int ncopies;
> >> +  int stmt_ncopies;
> >>    int vec_num;
> >>
> >>    stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
> >> @@ -8513,15 +8634,28 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>    gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
> >>    int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
> >>    tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
> >> +  tree stmt_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info);
> >> +
> >> +  /* Get input vectypes from the reduction PHI and the statement to be
> >> +     transformed; these two vectypes may have different lanes when a
> >> +     lane-reducing operation is present.  */
> >> +  if (!vectype_in)
> >> +    vectype_in = STMT_VINFO_REDUC_VECTYPE (reduc_info);
> >> +
> >> +  if (!stmt_vectype_in)
> >> +    stmt_vectype_in = STMT_VINFO_VECTYPE (stmt_info);
> >>
> >>    if (slp_node)
> >>      {
> >>        ncopies = 1;
> >> +      stmt_ncopies = 1;
> >>        vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
> >>      }
> >>    else
> >>      {
> >>        ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
> >> +      stmt_ncopies = vect_get_num_copies (loop_vinfo, stmt_vectype_in);
> >> +      gcc_assert (stmt_ncopies >= 1 && stmt_ncopies <= ncopies);
> >>        vec_num = 1;
> >>      }
> >>
> >> @@ -8530,14 +8664,10 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>
> >>    vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
> >>    vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
> >> -  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
> >> -
> >> +  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn,
> >> +						    stmt_vectype_in);
> >>    /* Transform.  */
> >> -  tree new_temp = NULL_TREE;
> >> -  auto_vec<tree> vec_oprnds0;
> >> -  auto_vec<tree> vec_oprnds1;
> >> -  auto_vec<tree> vec_oprnds2;
> >> -  tree def0;
> >> +  auto_vec<tree> vec_oprnds[3];
> >>
> >>    if (dump_enabled_p ())
> >>      dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
> >> @@ -8561,8 +8691,6 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>  		  == op.ops[internal_fn_else_index ((internal_fn) code)]));
> >>      }
> >>
> >> -  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> >> -
> >>    vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
> >>    if (reduction_type == FOLD_LEFT_REDUCTION)
> >>      {
> >> @@ -8570,7 +8698,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>        gcc_assert (code.is_tree_code () || cond_fn_p);
> >>        return vectorize_fold_left_reduction
> >>  	 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
> >> -	  code, reduc_fn, op.ops, op.num_ops, vectype_in,
> >> +	  code, reduc_fn, op.ops, op.num_ops, stmt_vectype_in,
> >>  	  reduc_index, masks, lens);
> >>      }
> >>
> >> @@ -8581,55 +8709,121 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>    tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
> >>    tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
> >>
> >> -  /* Get NCOPIES vector definitions for all operands except the reduction
> >> -     definition.  */
> >> -  if (!cond_fn_p)
> >> +  gcc_assert (reduc_index < 3);
> >> +
> >> +  if (slp_node)
> >>      {
> >> -      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
> >> -			 single_defuse_cycle && reduc_index == 0
> >> -			 ?
NULL_TREE : op.ops[0], &vec_oprnds0,
> >> -			 single_defuse_cycle && reduc_index == 1
> >> -			 ? NULL_TREE : op.ops[1], &vec_oprnds1,
> >> -			 op.num_ops == 3
> >> -			 && !(single_defuse_cycle && reduc_index == 2)
> >> -			 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
> >> +      gcc_assert (!single_defuse_cycle && op.num_ops <= 3);
> >
> > I think that's going to fail.  Mind v3 of the series I posted to enable
> > SLP discovery for single-lane reductions.  Basically everything is
> > going to be SLP for GCC 15.
> >
>
> Has the v3 already landed on trunk?  Then by default, any statement that
> has no isomorphic partner will become a single-lane SLP node?  And for such
> a node, can I just reuse the old non-SLP transformation code?

As of this morning, r15-1006-gd93353e6423eca, it is on trunk.  Note the
fallback is still non-SLP in case vectorizable_reduction FAILs with SLP.
I have a set of changes queued to allow some more kinds of reductions with
SLP but IIRC the lane-reducing variant is already supported.

Richard.

> >> +
> >> +      for (i = 0; i < (int) op.num_ops; i++)
> >> +	vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds[i]);
> >>      }
> >>    else
> >>      {
> >> -      /* For a conditional operation pass the truth type as mask
> >> -	  vectype.  */
> >> -      gcc_assert (single_defuse_cycle
> >> -		  && (reduc_index == 1 || reduc_index == 2));
> >> -      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
> >> -			 op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
> >> -			 reduc_index == 1 ? NULL_TREE : op.ops[1],
> >> -			 NULL_TREE, &vec_oprnds1,
> >> -			 reduc_index == 2 ? NULL_TREE : op.ops[2],
> >> -			 NULL_TREE, &vec_oprnds2);
> >> -    }
> >> +      /* The input vectype of the reduction PHI determines copies of
> >> +	  vectorized def-use cycles, which might be more than the effective
> >> +	  copies of vectorized lane-reducing reduction statements.  This could
> >> +	  be complemented by generating extra trivial pass-through copies.
> >> +	  For example:
> >> +
> >
> > That also means you need to handle SLP here, but you can assert there's
> > only a single lane.
> >
> > Btw, you can push the patches I approved if they independently test OK.
> >
> >> +	    int sum = 0;
> >> +	    for (i)
> >> +	      {
> >> +		sum += d0[i] * d1[i];      // dot-prod
> >> +		sum += abs(s0[i] - s1[i]); // sad
> >> +		sum += n[i];               // normal
> >> +	      }
> >> +
> >> +	  The vector size is 128-bit, and the vectorization factor is 16.
> >> +	  Reduction statements would be transformed as:
> >> +
> >> +	    vector<4> int sum_v0 = { 0, 0, 0, 0 };
> >> +	    vector<4> int sum_v1 = { 0, 0, 0, 0 };
> >> +	    vector<4> int sum_v2 = { 0, 0, 0, 0 };
> >> +	    vector<4> int sum_v3 = { 0, 0, 0, 0 };
> >> +
> >> +	    for (i / 16)
> >> +	      {
> >> +		sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
> >> +		sum_v1 = sum_v1;  // copy
> >> +		sum_v2 = sum_v2;  // copy
> >> +		sum_v3 = sum_v3;  // copy
> >> +
> >> +		sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
> >> +		sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
> >> +		sum_v2 = sum_v2;  // copy
> >> +		sum_v3 = sum_v3;  // copy
> >> +
> >> +		sum_v0 += n_v0[i: 0 ~ 3 ];
> >> +		sum_v1 += n_v1[i: 4 ~ 7 ];
> >> +		sum_v2 += n_v2[i: 8 ~ 11];
> >> +		sum_v3 += n_v3[i: 12 ~ 15];
> >> +	      }
> >> +	  */
> >> +
> >> +      for (i = 0; i < MIN (3, (int) op.num_ops); i++)
> >> +	{
> >> +	  tree vectype = NULL_TREE;
> >> +	  int used_ncopies = ncopies;
> >> +
> >> +	  if (cond_fn_p && i == 0)
> >> +	    {
> >> +	      /* For a conditional operation pass the truth type as mask
> >> +		 vectype.  */
> >> +	      gcc_assert (single_defuse_cycle && reduc_index > 0);
> >> +	      vectype = truth_type_for (vectype_in);
> >> +	    }
> >>
> >> -  /* For single def-use cycles get one copy of the vectorized reduction
> >> -     definition.  */
> >> -  if (single_defuse_cycle)
> >> -    {
> >> -      gcc_assert (!slp_node);
> >> -      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
> >> -				     op.ops[reduc_index],
> >> -				     reduc_index == 0 ? &vec_oprnds0
> >> -				     : (reduc_index == 1 ? &vec_oprnds1
> >> -					: &vec_oprnds2));
> >> +	  if (i != reduc_index)
> >> +	    {
> >> +	      /* For a non-reduction operand, deduce the effective copies that
> >> +		 are involved in vectorized def-use cycles based on the input
> >> +		 vectype of the reduction statement.  */
> >> +	      used_ncopies = stmt_ncopies;
> >> +	    }
> >> +	  else if (single_defuse_cycle)
> >> +	    {
> >> +	      /* For single def-use cycles get one copy of the vectorized
> >> +		 reduction definition.  */
> >> +	      used_ncopies = 1;
> >> +	    }
> >> +
> >> +	  vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, used_ncopies,
> >> +					 op.ops[i], &vec_oprnds[i], vectype);
> >> +
> >> +	  if (used_ncopies < ncopies)
> >> +	    vec_oprnds[i].safe_grow_cleared (ncopies);
> >> +	}
> >>      }
> >>
> >> +  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> >>    bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
> >> +  tree def0;
> >>
> >> -  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
> >> +  FOR_EACH_VEC_ELT (vec_oprnds[0], i, def0)
> >>      {
> >>        gimple *new_stmt;
> >> -      tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
> >> -      if (masked_loop_p && !mask_by_cond_expr)
> >> +      tree new_temp = NULL_TREE;
> >> +      tree vop[3] = { def0, vec_oprnds[1][i], NULL_TREE };
> >> +
> >> +      if (!vop[0] || !vop[1])
> >> +	{
> >> +	  tree reduc_vop = vec_oprnds[reduc_index][i];
> >> +
> >> +	  /* Insert a trivial copy when there is no need to generate a
> >> +	     vectorized statement.  */
> >> +	  gcc_assert (reduc_vop && stmt_ncopies < ncopies);
> >> +
> >> +	  new_stmt = gimple_build_assign (vec_dest, reduc_vop);
> >> +	  new_temp = make_ssa_name (vec_dest, new_stmt);
> >> +	  gimple_set_lhs (new_stmt, new_temp);
> >> +	  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
> >> +	}
> >> +      else if (masked_loop_p && !mask_by_cond_expr)
> >>  	{
> >> -	  /* No conditional ifns have been defined for dot-product yet.  */
> >> -	  gcc_assert (code != DOT_PROD_EXPR);
> >> +	  /* No conditional ifns have been defined for dot-product and sad
> >> +	     yet.  */
> >> +	  gcc_assert (code != DOT_PROD_EXPR && code != SAD_EXPR);
> >>
> >>  	  /* Make sure that the reduction accumulator is vop[0].  */
> >>  	  if (reduc_index == 1)
> >> @@ -8638,7 +8832,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>  	      std::swap (vop[0], vop[1]);
> >>  	    }
> >>  	  tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
> >> -					  vec_num * ncopies, vectype_in, i);
> >> +					  vec_num * stmt_ncopies,
> >> +					  stmt_vectype_in, i);
> >>  	  gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
> >>  						    vop[0], vop[1], vop[0]);
> >>  	  new_temp = make_ssa_name (vec_dest, call);
> >> @@ -8650,12 +8845,13 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>        else
> >>  	{
> >>  	  if (op.num_ops >= 3)
> >> -	    vop[2] = vec_oprnds2[i];
> >> +	    vop[2] = vec_oprnds[2][i];
> >>
> >>  	  if (masked_loop_p && mask_by_cond_expr)
> >>  	    {
> >>  	      tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
> >> -					      vec_num * ncopies, vectype_in, i);
> >> +					      vec_num * stmt_ncopies,
> >> +					      stmt_vectype_in, i);
> >>  	      build_vect_cond_expr (code, vop, mask, gsi);
> >>  	    }
> >>
> >> @@ -8682,16 +8878,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>
> >>        if (slp_node)
> >>  	slp_node->push_vec_def (new_stmt);
> >> -      else if (single_defuse_cycle
> >> -	       && i < ncopies - 1)
> >> -	{
> >> -	  if (reduc_index == 0)
> >> -	    vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
> >> -	  else if (reduc_index == 1)
> >> -	    vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
> >> -	  else if (reduc_index == 2)
> >> -	    vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
> >> -	}
> >> +      else if (single_defuse_cycle && i < ncopies - 1)
> >> +	vec_oprnds[reduc_index][i + 1] = gimple_get_lhs (new_stmt);
> >>        else
> >>  	STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
> >>      }
> >> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> >> index 2e0be763abb..cc0a832f71b 100644
> >> --- a/gcc/tree-vect-stmts.cc
> >> +++ b/gcc/tree-vect-stmts.cc
> >> @@ -13296,6 +13296,8 @@ vect_analyze_stmt (vec_info *vinfo,
> >>  					   NULL, NULL, node, cost_vec)
> >>  	  || vectorizable_load (vinfo, stmt_info, NULL, NULL, node, cost_vec)
> >>  	  || vectorizable_store (vinfo, stmt_info, NULL, NULL, node, cost_vec)
> >> +	  || vectorizable_lane_reducing (as_a <loop_vec_info> (vinfo),
> >> +					 stmt_info, node, cost_vec)
> >>  	  || vectorizable_reduction (as_a <loop_vec_info> (vinfo), stmt_info,
> >>  				     node, node_instance, cost_vec)
> >>  	  || vectorizable_induction (as_a <loop_vec_info> (vinfo), stmt_info,
> >> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> >> index 97ec9c341e7..ca810869592 100644
> >> --- a/gcc/tree-vectorizer.h
> >> +++ b/gcc/tree-vectorizer.h
> >> @@ -2443,6 +2443,8 @@ extern loop_vec_info vect_create_loop_vinfo (class loop *, vec_info_shared *,
> >>  extern bool vectorizable_live_operation (vec_info *, stmt_vec_info,
> >>  					 slp_tree, slp_instance, int,
> >>  					 bool, stmt_vector_for_cost *);
> >> +extern bool vectorizable_lane_reducing (loop_vec_info, stmt_vec_info,
> >> +					slp_tree, stmt_vector_for_cost *);
> >>  extern bool vectorizable_reduction (loop_vec_info, stmt_vec_info,
> >>  				    slp_tree, slp_instance,
> >>  				    stmt_vector_for_cost *);
> >> --
> >> 2.17.1