From: Robin Dapp <rdapp@linux.ibm.com>
To: GCC Patches <gcc-patches@gcc.gnu.org>
Subject: [PATCH] s390: Recognize reverse/element swap permute patterns.
Date: Fri, 12 Aug 2022 12:57:27 +0200 [thread overview]
Message-ID: <cf841c57-f25e-c544-66f5-679f2c6b0516@linux.ibm.com> (raw)
Hi,
this adds functions to recognize reverse/element swap permute patterns
for vler, vster as well as vpdi and rotate.
Bootstrapped and regtested, no regressions.
Is it OK?
Regards
Robin
gcc/ChangeLog:
* config/s390/s390.cc (expand_perm_with_vpdi): Recognize swap pattern.
(is_reverse_perm_mask): New function.
(expand_perm_with_rot): Recognize reverse pattern.
(expand_perm_with_vster): Use vler/vster for element reversal on z15.
(s390_vectorize_vec_perm_const): Add expand functions.
* config/s390/vx-builtins.md: Prefer vster over vler.

This adds functions to recognize
reverse/element swap permute patterns
for vler, vster as well as vpdi and rotate.
gcc/ChangeLog:
* config/s390/s390.cc (expand_perm_with_vpdi): Recognize swap pattern.
(is_reverse_perm_mask): New function.
(expand_perm_with_rot): Recognize reverse pattern.
(expand_perm_with_vster): Use vler/vster for element reversal on z15.
(s390_vectorize_vec_perm_const): Add expand functions.
* config/s390/vx-builtins.md: Prefer vster over vler.
* config/s390/s390.cc (expand_perm_with_vstbrq): New function.
(vectorize_vec_perm_const_1): Use.
gcc/testsuite/ChangeLog:
* gcc.target/s390/vector/vperm-rev-z14.c: New test.
* gcc.target/s390/vector/vperm-rev-z15.c: New test.
* gcc.target/s390/zvector/vec-reve-store-byte.c: Adjust test
expectation.
---
gcc/config/s390/s390.cc | 102 ++++++++++++++-
gcc/config/s390/vx-builtins.md | 21 ++++
.../gcc.target/s390/vector/vperm-rev-z14.c | 87 +++++++++++++
.../gcc.target/s390/vector/vperm-rev-z15.c | 118 ++++++++++++++++++
.../s390/zvector/vec-reve-store-byte.c | 6 +-
5 files changed, 329 insertions(+), 5 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/s390/vector/vperm-rev-z14.c
create mode 100644 gcc/testsuite/gcc.target/s390/vector/vperm-rev-z15.c
diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
index 528cd8c7f0f6..c86b26933d7a 100644
--- a/gcc/config/s390/s390.cc
+++ b/gcc/config/s390/s390.cc
@@ -17225,10 +17225,15 @@ expand_perm_with_vpdi (const struct
expand_vec_perm_d &d)
if (d.nelt != 2)
return false;
+ /* If both operands are the same we can swap the elements
+ i.e. reverse the vector. */
+ bool same = d.op0 == d.op1;
+
if (d.perm[0] == 0 && d.perm[1] == 3)
vpdi1_p = true;
- if (d.perm[0] == 1 && d.perm[1] == 2)
+ if ((d.perm[0] == 1 && d.perm[1] == 2)
+ || (same && d.perm[0] == 1 && d.perm[1] == 0))
vpdi4_p = true;
if (!vpdi1_p && !vpdi4_p)
@@ -17249,6 +17254,92 @@ expand_perm_with_vpdi (const struct
expand_vec_perm_d &d)
return true;
}
+/* Helper that checks if a vector permutation mask D
+ represents a reversal of the vector's elements. */
+static inline bool
+is_reverse_perm_mask (const struct expand_vec_perm_d &d)
+{
+ for (int i = 0; i < d.nelt; i++)
+ if (d.perm[i] != d.nelt - i - 1)
+ return false;
+ return true;
+}
+
+/* The case of reversing a four-element vector [0, 1, 2, 3]
+ can be handled by first permuting the doublewords
+ [2, 3, 0, 1] and subsequently rotating them by 32 bits. */
+static bool
+expand_perm_with_rot (const struct expand_vec_perm_d &d)
+{
+ if (d.nelt != 4)
+ return false;
+
+ if (d.op0 == d.op1 && is_reverse_perm_mask (d))
+ {
+ if (d.testing_p)
+ return true;
+
+ rtx tmp = gen_reg_rtx (d.vmode);
+ rtx op0_reg = force_reg (GET_MODE (d.op0), d.op0);
+
+ emit_insn (gen_vpdi4_2 (d.vmode, tmp, op0_reg, op0_reg));
+ if (d.vmode == V4SImode)
+ emit_insn (gen_rotlv4si3_di (d.target, tmp));
+ else if (d.vmode == V4SFmode)
+ emit_insn (gen_rotlv4sf3_di (d.target, tmp));
+
+ return true;
+ }
+
+ return false;
+}
+
+/* If we just reverse the elements, emit an eltswap if we have
+ vler/vster. */
+static bool
+expand_perm_with_vster (const struct expand_vec_perm_d &d)
+{
+ if (TARGET_VXE2 && d.op0 == d.op1 && is_reverse_perm_mask (d)
+ && (d.vmode == V2DImode || d.vmode == V2DFmode
+ || d.vmode == V4SImode || d.vmode == V4SFmode
+ || d.vmode == V8HImode))
+ {
+ if (d.testing_p)
+ return true;
+
+ if (d.vmode == V2DImode)
+ emit_insn (gen_eltswapv2di (d.target, d.op0));
+ else if (d.vmode == V2DFmode)
+ emit_insn (gen_eltswapv2df (d.target, d.op0));
+ else if (d.vmode == V4SImode)
+ emit_insn (gen_eltswapv4si (d.target, d.op0));
+ else if (d.vmode == V4SFmode)
+ emit_insn (gen_eltswapv4sf (d.target, d.op0));
+ else if (d.vmode == V8HImode)
+ emit_insn (gen_eltswapv8hi (d.target, d.op0));
+ return true;
+ }
+ return false;
+}
+
+/* If we reverse a byte-vector this is the same as
+ byte reversing it which can be done with vstbrq. */
+static bool
+expand_perm_with_vstbrq (const struct expand_vec_perm_d &d)
+{
+ if (TARGET_VXE2 && d.op0 == d.op1 && is_reverse_perm_mask (d)
+ && d.vmode == V16QImode)
+ {
+ if (d.testing_p)
+ return true;
+
+ emit_insn (gen_eltswapv16qi (d.target, d.op0));
+ return true;
+ }
+ return false;
+}
+
+
/* Try to find the best sequence for the vector permute operation
described by D. Return true if the operation could be
expanded. */
@@ -17258,9 +17349,18 @@ vectorize_vec_perm_const_1 (const struct
expand_vec_perm_d &d)
if (expand_perm_with_merge (d))
return true;
+ if (expand_perm_with_vster (d))
+ return true;
+
+ if (expand_perm_with_vstbrq (d))
+ return true;
+
if (expand_perm_with_vpdi (d))
return true;
+ if (expand_perm_with_rot (d))
+ return true;
+
return false;
}
diff --git a/gcc/config/s390/vx-builtins.md b/gcc/config/s390/vx-builtins.md
index 99c4c037b49a..22d0355ec219 100644
--- a/gcc/config/s390/vx-builtins.md
+++ b/gcc/config/s390/vx-builtins.md
@@ -2184,6 +2184,27 @@ (define_insn "*eltswap<mode>"
vster<bhfgq>\t%v1,%v0"
[(set_attr "op_type" "*,VRX,VRX")])
+; The emulation pattern below will also accept
+; vst (eltswap (vl))
+; i.e. both operands in memory, which reload needs to fix.
+; Split into
+; vl
+; vster (=vst (eltswap))
+; since we prefer vster over vler as long as the latter
+; does not support alignment hints.
+(define_split
+ [(set (match_operand:VEC_HW 0 "memory_operand" "")
+ (unspec:VEC_HW [(match_operand:VEC_HW 1 "memory_operand" "")]
+ UNSPEC_VEC_ELTSWAP))]
+ "TARGET_VXE2 && can_create_pseudo_p ()"
+ [(set (match_dup 2) (match_dup 1))
+ (set (match_dup 0)
+ (unspec:VEC_HW [(match_dup 2)] UNSPEC_VEC_ELTSWAP))]
+{
+ operands[2] = gen_reg_rtx (<MODE>mode);
+})
+
+
; Swapping v2df/v2di can be done via vpdi on z13 and z14.
(define_split
[(set (match_operand:V_HW_2 0 "register_operand" "")
diff --git a/gcc/testsuite/gcc.target/s390/vector/vperm-rev-z14.c
b/gcc/testsuite/gcc.target/s390/vector/vperm-rev-z14.c
new file mode 100644
index 000000000000..5c64fac4646c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/vector/vperm-rev-z14.c
@@ -0,0 +1,87 @@
+/* Make sure that the reverse permute patterns are optimized
+ correctly. */
+/* { dg-do run { target { s390*-*-* } } } */
+/* { dg-options "-O2 -march=z14 -mzarch -fno-unroll-loops" } */
+
+/* { dg-final { scan-assembler-times "vpdi\t" 4 } } */
+/* { dg-final { scan-assembler-times "verllg\t" 2 } } */
+
+#include <assert.h>
+
+__attribute__((noipa))
+void reversel (long long *restrict a, long long *restrict b, int n)
+{
+ for (int i = 0; i < n; i += 2)
+ {
+ a[i + 1] = b[i + 0];
+ a[i + 0] = b[i + 1];
+ }
+}
+
+__attribute__((noipa))
+void reversed (double *restrict a, double *restrict b, int n)
+{
+ for (int i = 0; i < n; i += 2)
+ {
+ a[i + 1] = b[i + 0];
+ a[i + 0] = b[i + 1];
+ }
+}
+
+__attribute__((noipa))
+void reversei (unsigned int *restrict a, unsigned int *restrict b, int n)
+{
+ for (int i = 0; i < n; i += 4)
+ {
+ a[i + 3] = b[i + 0];
+ a[i + 2] = b[i + 1];
+ a[i + 1] = b[i + 2];
+ a[i + 0] = b[i + 3];
+ }
+}
+
+__attribute__((noipa))
+void reversef (float *restrict a, float *restrict b, int n)
+{
+ for (int i = 0; i < n; i += 4)
+ {
+ a[i + 3] = b[i + 0];
+ a[i + 2] = b[i + 1];
+ a[i + 1] = b[i + 2];
+ a[i + 0] = b[i + 3];
+ }
+}
+
+int main()
+{
+ const int n = 1024;
+ unsigned int u[n], u2[n];
+ long long l[n], l2[n];
+ double d[n], d2[n];
+ float f[n], f2[n];
+
+ for (int i = 0; i < n; i++)
+ {
+ u[i] = i;
+ l[i] = i;
+ d[i] = i;
+ f[i] = i;
+ u2[i] = i;
+ l2[i] = i;
+ d2[i] = i;
+ f2[i] = i;
+ }
+
+ reversei (u2, u, n);
+ reversel (l2, l, n);
+ reversed (d2, d, n);
+ reversef (f2, f, n);
+
+ for (int i = 0; i < n - 16; i++)
+ {
+ assert (u[i] == u2[i / (16 / sizeof (u[0])) * (16 / sizeof
(u[0])) + 16 / sizeof (u[0]) - 1 - i % (16 / sizeof (u[0]))]);
+ assert (l[i] == l2[i / (16 / sizeof (l[0])) * (16 / sizeof
(l[0])) + 16 / sizeof (l[0]) - 1 - i % (16 / sizeof (l[0]))]);
+ assert (d[i] == d2[i / (16 / sizeof (d[0])) * (16 / sizeof
(d[0])) + 16 / sizeof (d[0]) - 1 - i % (16 / sizeof (d[0]))]);
+ assert (f[i] == f2[i / (16 / sizeof (f[0])) * (16 / sizeof
(f[0])) + 16 / sizeof (f[0]) - 1 - i % (16 / sizeof (f[0]))]);
+ }
+}
diff --git a/gcc/testsuite/gcc.target/s390/vector/vperm-rev-z15.c
b/gcc/testsuite/gcc.target/s390/vector/vperm-rev-z15.c
new file mode 100644
index 000000000000..bff52406fa9b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/vector/vperm-rev-z15.c
@@ -0,0 +1,118 @@
+/* Make sure that the reverse permute patterns are optimized
+ correctly. */
+/* { dg-do run { target { s390*-*-* } } } */
+/* { dg-options "-O2 -march=z15 -mzarch -fno-unroll-loops" } */
+
+/* { dg-final { scan-assembler-times "vsterg\t" 2 } } */
+/* { dg-final { scan-assembler-times "vsterf" 2 } } */
+/* { dg-final { scan-assembler-times "vstbrq\t" 1 } } */
+/* { dg-final { scan-assembler-times "vperm" 0 } } */
+
+#include <assert.h>
+
+__attribute__((noipa))
+void reversec (char *restrict a, char *restrict b, int n)
+{
+ for (int i = 0; i < n; i += 16)
+ {
+ a[i + 0] = b[i + 15];
+ a[i + 1] = b[i + 14];
+ a[i + 2] = b[i + 13];
+ a[i + 3] = b[i + 12];
+ a[i + 4] = b[i + 11];
+ a[i + 5] = b[i + 10];
+ a[i + 6] = b[i + 9];
+ a[i + 7] = b[i + 8];
+ a[i + 8] = b[i + 7];
+ a[i + 9] = b[i + 6];
+ a[i + 10] = b[i + 5];
+ a[i + 11] = b[i + 4];
+ a[i + 12] = b[i + 3];
+ a[i + 13] = b[i + 2];
+ a[i + 14] = b[i + 1];
+ a[i + 15] = b[i + 0];
+ }
+}
+
+__attribute__((noipa))
+void reversel (long long *restrict a, long long *restrict b, int n)
+{
+ for (int i = 0; i < n; i += 2)
+ {
+ a[i + 1] = b[i + 0];
+ a[i + 0] = b[i + 1];
+ }
+}
+
+__attribute__((noipa))
+void reversed (double *restrict a, double *restrict b, int n)
+{
+ for (int i = 0; i < n; i += 2)
+ {
+ a[i + 1] = b[i + 0];
+ a[i + 0] = b[i + 1];
+ }
+}
+
+__attribute__((noipa))
+void reversei (unsigned int *restrict a, unsigned int *restrict b, int n)
+{
+ for (int i = 0; i < n; i += 4)
+ {
+ a[i + 3] = b[i + 0];
+ a[i + 2] = b[i + 1];
+ a[i + 1] = b[i + 2];
+ a[i + 0] = b[i + 3];
+ }
+}
+
+__attribute__((noipa))
+void reversef (float *restrict a, float *restrict b, int n)
+{
+ for (int i = 0; i < n; i += 4)
+ {
+ a[i + 3] = b[i + 0];
+ a[i + 2] = b[i + 1];
+ a[i + 1] = b[i + 2];
+ a[i + 0] = b[i + 3];
+ }
+}
+
+int main()
+{
+ const int n = 1024;
+ char c[n], c2[n];
+ unsigned int u[n], u2[n];
+ long long l[n], l2[n];
+ double d[n], d2[n];
+ float f[n], f2[n];
+
+ for (int i = 0; i < n; i++)
+ {
+ c[i] = i;
+ u[i] = i;
+ l[i] = i;
+ d[i] = i;
+ f[i] = i;
+ c2[i] = i;
+ u2[i] = i;
+ l2[i] = i;
+ d2[i] = i;
+ f2[i] = i;
+ }
+
+ reversec (c2, c, n);
+ reversei (u2, u, n);
+ reversel (l2, l, n);
+ reversed (d2, d, n);
+ reversef (f2, f, n);
+
+ for (int i = 0; i < n - 16; i++)
+ {
+ assert (c[i] == c2[i / (16 / sizeof (c[0])) * (16 / sizeof
(c[0])) + 16 / sizeof (c[0]) - 1 - i % (16 / sizeof (c[0]))]);
+ assert (u[i] == u2[i / (16 / sizeof (u[0])) * (16 / sizeof
(u[0])) + 16 / sizeof (u[0]) - 1 - i % (16 / sizeof (u[0]))]);
+ assert (l[i] == l2[i / (16 / sizeof (l[0])) * (16 / sizeof
(l[0])) + 16 / sizeof (l[0]) - 1 - i % (16 / sizeof (l[0]))]);
+ assert (d[i] == d2[i / (16 / sizeof (d[0])) * (16 / sizeof
(d[0])) + 16 / sizeof (d[0]) - 1 - i % (16 / sizeof (d[0]))]);
+ assert (f[i] == f2[i / (16 / sizeof (f[0])) * (16 / sizeof
(f[0])) + 16 / sizeof (f[0]) - 1 - i % (16 / sizeof (f[0]))]);
+ }
+}
diff --git a/gcc/testsuite/gcc.target/s390/zvector/vec-reve-store-byte.c
b/gcc/testsuite/gcc.target/s390/zvector/vec-reve-store-byte.c
index db8284b1f8ff..6c061c69fea0 100644
--- a/gcc/testsuite/gcc.target/s390/zvector/vec-reve-store-byte.c
+++ b/gcc/testsuite/gcc.target/s390/zvector/vec-reve-store-byte.c
@@ -16,13 +16,11 @@ bar (signed char *target, vector signed char x)
vec_xst (vec_reve (x), 0, target);
}
-/* { dg-final { scan-assembler-times "vstbrq\t" 2 } } */
-
-/* mem -> mem: This becomes vlbrq + vst */
+/* mem -> mem: This becomes vl + vstbrq */
void
baz (vector signed char *target, vector signed char *x)
{
*target = vec_reve (*x);
}
-/* { dg-final { scan-assembler-times "vlbrq\t" 1 } } */
+/* { dg-final { scan-assembler-times "vstbrq\t" 3 } } */
--
2.31.1
vster over vler.
* config/s390/s390.cc (expand_perm_with_vstbrq): New function.
(vectorize_vec_perm_const_1): Use.
gcc/testsuite/ChangeLog:
* gcc.target/s390/vector/vperm-rev-z14.c: New test.
* gcc.target/s390/vector/vperm-rev-z15.c: New test.
* gcc.target/s390/zvector/vec-reve-store-byte.c: Adjust test
expectation.
---
gcc/config/s390/s390.cc | 102 ++++++++++++++-
gcc/config/s390/vx-builtins.md | 21 ++++
.../gcc.target/s390/vector/vperm-rev-z14.c | 87 +++++++++++++
.../gcc.target/s390/vector/vperm-rev-z15.c | 118 ++++++++++++++++++
.../s390/zvector/vec-reve-store-byte.c | 6 +-
5 files changed, 329 insertions(+), 5 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/s390/vector/vperm-rev-z14.c
create mode 100644 gcc/testsuite/gcc.target/s390/vector/vperm-rev-z15.c
diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
index 528cd8c7f0f6..c86b26933d7a 100644
--- a/gcc/config/s390/s390.cc
+++ b/gcc/config/s390/s390.cc
@@ -17225,10 +17225,15 @@ expand_perm_with_vpdi (const struct
expand_vec_perm_d &d)
if (d.nelt != 2)
return false;
+ /* If both operands are the same we can swap the elements
+ i.e. reverse the vector. */
+ bool same = d.op0 == d.op1;
+
if (d.perm[0] == 0 && d.perm[1] == 3)
vpdi1_p = true;
- if (d.perm[0] == 1 && d.perm[1] == 2)
+ if ((d.perm[0] == 1 && d.perm[1] == 2)
+ || (same && d.perm[0] == 1 && d.perm[1] == 0))
vpdi4_p = true;
if (!vpdi1_p && !vpdi4_p)
@@ -17249,6 +17254,92 @@ expand_perm_with_vpdi (const struct
expand_vec_perm_d &d)
return true;
}
+/* Helper that checks if a vector permutation mask D
+ represents a reversal of the vector's elements. */
+static inline bool
+is_reverse_perm_mask (const struct expand_vec_perm_d &d)
+{
+ for (int i = 0; i < d.nelt; i++)
+ if (d.perm[i] != d.nelt - i - 1)
+ return false;
+ return true;
+}
+
+/* The case of reversing a four-element vector [0, 1, 2, 3]
+ can be handled by first permuting the doublewords
+ [2, 3, 0, 1] and subsequently rotating them by 32 bits. */
+static bool
+expand_perm_with_rot (const struct expand_vec_perm_d &d)
+{
+ if (d.nelt != 4)
+ return false;
+
+ if (d.op0 == d.op1 && is_reverse_perm_mask (d))
+ {
+ if (d.testing_p)
+ return true;
+
+ rtx tmp = gen_reg_rtx (d.vmode);
+ rtx op0_reg = force_reg (GET_MODE (d.op0), d.op0);
+
+ emit_insn (gen_vpdi4_2 (d.vmode, tmp, op0_reg, op0_reg));
+ if (d.vmode == V4SImode)
+ emit_insn (gen_rotlv4si3_di (d.target, tmp));
+ else if (d.vmode == V4SFmode)
+ emit_insn (gen_rotlv4sf3_di (d.target, tmp));
+
+ return true;
+ }
+
+ return false;
+}
+
+/* If we just reverse the elements, emit an eltswap if we have
+ vler/vster. */
+static bool
+expand_perm_with_vster (const struct expand_vec_perm_d &d)
+{
+ if (TARGET_VXE2 && d.op0 == d.op1 && is_reverse_perm_mask (d)
+ && (d.vmode == V2DImode || d.vmode == V2DFmode
+ || d.vmode == V4SImode || d.vmode == V4SFmode
+ || d.vmode == V8HImode))
+ {
+ if (d.testing_p)
+ return true;
+
+ if (d.vmode == V2DImode)
+ emit_insn (gen_eltswapv2di (d.target, d.op0));
+ else if (d.vmode == V2DFmode)
+ emit_insn (gen_eltswapv2df (d.target, d.op0));
+ else if (d.vmode == V4SImode)
+ emit_insn (gen_eltswapv4si (d.target, d.op0));
+ else if (d.vmode == V4SFmode)
+ emit_insn (gen_eltswapv4sf (d.target, d.op0));
+ else if (d.vmode == V8HImode)
+ emit_insn (gen_eltswapv8hi (d.target, d.op0));
+ return true;
+ }
+ return false;
+}
+
+/* If we reverse a byte-vector this is the same as
+ byte reversing it which can be done with vstbrq. */
+static bool
+expand_perm_with_vstbrq (const struct expand_vec_perm_d &d)
+{
+ if (TARGET_VXE2 && d.op0 == d.op1 && is_reverse_perm_mask (d)
+ && d.vmode == V16QImode)
+ {
+ if (d.testing_p)
+ return true;
+
+ emit_insn (gen_eltswapv16qi (d.target, d.op0));
+ return true;
+ }
+ return false;
+}
+
+
/* Try to find the best sequence for the vector permute operation
described by D. Return true if the operation could be
expanded. */
@@ -17258,9 +17349,18 @@ vectorize_vec_perm_const_1 (const struct
expand_vec_perm_d &d)
if (expand_perm_with_merge (d))
return true;
+ if (expand_perm_with_vster (d))
+ return true;
+
+ if (expand_perm_with_vstbrq (d))
+ return true;
+
if (expand_perm_with_vpdi (d))
return true;
+ if (expand_perm_with_rot (d))
+ return true;
+
return false;
}
diff --git a/gcc/config/s390/vx-builtins.md b/gcc/config/s390/vx-builtins.md
index 99c4c037b49a..22d0355ec219 100644
--- a/gcc/config/s390/vx-builtins.md
+++ b/gcc/config/s390/vx-builtins.md
@@ -2184,6 +2184,27 @@ (define_insn "*eltswap<mode>"
vster<bhfgq>\t%v1,%v0"
[(set_attr "op_type" "*,VRX,VRX")])
+; The emulation pattern below will also accept
+; vst (eltswap (vl))
+; i.e. both operands in memory, which reload needs to fix.
+; Split into
+; vl
+; vster (=vst (eltswap))
+; since we prefer vster over vler as long as the latter
+; does not support alignment hints.
+(define_split
+ [(set (match_operand:VEC_HW 0 "memory_operand" "")
+ (unspec:VEC_HW [(match_operand:VEC_HW 1 "memory_operand" "")]
+ UNSPEC_VEC_ELTSWAP))]
+ "TARGET_VXE2 && can_create_pseudo_p ()"
+ [(set (match_dup 2) (match_dup 1))
+ (set (match_dup 0)
+ (unspec:VEC_HW [(match_dup 2)] UNSPEC_VEC_ELTSWAP))]
+{
+ operands[2] = gen_reg_rtx (<MODE>mode);
+})
+
+
; Swapping v2df/v2di can be done via vpdi on z13 and z14.
(define_split
[(set (match_operand:V_HW_2 0 "register_operand" "")
diff --git a/gcc/testsuite/gcc.target/s390/vector/vperm-rev-z14.c
b/gcc/testsuite/gcc.target/s390/vector/vperm-rev-z14.c
new file mode 100644
index 000000000000..5c64fac4646c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/vector/vperm-rev-z14.c
@@ -0,0 +1,87 @@
+/* Make sure that the reverse permute patterns are optimized
+ correctly. */
+/* { dg-do run { target { s390*-*-* } } } */
+/* { dg-options "-O2 -march=z14 -mzarch -fno-unroll-loops" } */
+
+/* { dg-final { scan-assembler-times "vpdi\t" 4 } } */
+/* { dg-final { scan-assembler-times "verllg\t" 2 } } */
+
+#include <assert.h>
+
+__attribute__((noipa))
+void reversel (long long *restrict a, long long *restrict b, int n)
+{
+ for (int i = 0; i < n; i += 2)
+ {
+ a[i + 1] = b[i + 0];
+ a[i + 0] = b[i + 1];
+ }
+}
+
+__attribute__((noipa))
+void reversed (double *restrict a, double *restrict b, int n)
+{
+ for (int i = 0; i < n; i += 2)
+ {
+ a[i + 1] = b[i + 0];
+ a[i + 0] = b[i + 1];
+ }
+}
+
+__attribute__((noipa))
+void reversei (unsigned int *restrict a, unsigned int *restrict b, int n)
+{
+ for (int i = 0; i < n; i += 4)
+ {
+ a[i + 3] = b[i + 0];
+ a[i + 2] = b[i + 1];
+ a[i + 1] = b[i + 2];
+ a[i + 0] = b[i + 3];
+ }
+}
+
+__attribute__((noipa))
+void reversef (float *restrict a, float *restrict b, int n)
+{
+ for (int i = 0; i < n; i += 4)
+ {
+ a[i + 3] = b[i + 0];
+ a[i + 2] = b[i + 1];
+ a[i + 1] = b[i + 2];
+ a[i + 0] = b[i + 3];
+ }
+}
+
+int main()
+{
+ const int n = 1024;
+ unsigned int u[n], u2[n];
+ long long l[n], l2[n];
+ double d[n], d2[n];
+ float f[n], f2[n];
+
+ for (int i = 0; i < n; i++)
+ {
+ u[i] = i;
+ l[i] = i;
+ d[i] = i;
+ f[i] = i;
+ u2[i] = i;
+ l2[i] = i;
+ d2[i] = i;
+ f2[i] = i;
+ }
+
+ reversei (u2, u, n);
+ reversel (l2, l, n);
+ reversed (d2, d, n);
+ reversef (f2, f, n);
+
+ for (int i = 0; i < n - 16; i++)
+ {
+ assert (u[i] == u2[i / (16 / sizeof (u[0])) * (16 / sizeof
(u[0])) + 16 / sizeof (u[0]) - 1 - i % (16 / sizeof (u[0]))]);
+ assert (l[i] == l2[i / (16 / sizeof (l[0])) * (16 / sizeof
(l[0])) + 16 / sizeof (l[0]) - 1 - i % (16 / sizeof (l[0]))]);
+ assert (d[i] == d2[i / (16 / sizeof (d[0])) * (16 / sizeof
(d[0])) + 16 / sizeof (d[0]) - 1 - i % (16 / sizeof (d[0]))]);
+ assert (f[i] == f2[i / (16 / sizeof (f[0])) * (16 / sizeof
(f[0])) + 16 / sizeof (f[0]) - 1 - i % (16 / sizeof (f[0]))]);
+ }
+}
diff --git a/gcc/testsuite/gcc.target/s390/vector/vperm-rev-z15.c
b/gcc/testsuite/gcc.target/s390/vector/vperm-rev-z15.c
new file mode 100644
index 000000000000..bff52406fa9b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/vector/vperm-rev-z15.c
@@ -0,0 +1,118 @@
+/* Make sure that the reverse permute patterns are optimized
+ correctly. */
+/* { dg-do run { target { s390*-*-* } } } */
+/* { dg-options "-O2 -march=z15 -mzarch -fno-unroll-loops" } */
+
+/* { dg-final { scan-assembler-times "vsterg\t" 2 } } */
+/* { dg-final { scan-assembler-times "vsterf" 2 } } */
+/* { dg-final { scan-assembler-times "vstbrq\t" 1 } } */
+/* { dg-final { scan-assembler-times "vperm" 0 } } */
+
+#include <assert.h>
+
+__attribute__((noipa))
+void reversec (char *restrict a, char *restrict b, int n)
+{
+ for (int i = 0; i < n; i += 16)
+ {
+ a[i + 0] = b[i + 15];
+ a[i + 1] = b[i + 14];
+ a[i + 2] = b[i + 13];
+ a[i + 3] = b[i + 12];
+ a[i + 4] = b[i + 11];
+ a[i + 5] = b[i + 10];
+ a[i + 6] = b[i + 9];
+ a[i + 7] = b[i + 8];
+ a[i + 8] = b[i + 7];
+ a[i + 9] = b[i + 6];
+ a[i + 10] = b[i + 5];
+ a[i + 11] = b[i + 4];
+ a[i + 12] = b[i + 3];
+ a[i + 13] = b[i + 2];
+ a[i + 14] = b[i + 1];
+ a[i + 15] = b[i + 0];
+ }
+}
+
+__attribute__((noipa))
+void reversel (long long *restrict a, long long *restrict b, int n)
+{
+ for (int i = 0; i < n; i += 2)
+ {
+ a[i + 1] = b[i + 0];
+ a[i + 0] = b[i + 1];
+ }
+}
+
+__attribute__((noipa))
+void reversed (double *restrict a, double *restrict b, int n)
+{
+ for (int i = 0; i < n; i += 2)
+ {
+ a[i + 1] = b[i + 0];
+ a[i + 0] = b[i + 1];
+ }
+}
+
+__attribute__((noipa))
+void reversei (unsigned int *restrict a, unsigned int *restrict b, int n)
+{
+ for (int i = 0; i < n; i += 4)
+ {
+ a[i + 3] = b[i + 0];
+ a[i + 2] = b[i + 1];
+ a[i + 1] = b[i + 2];
+ a[i + 0] = b[i + 3];
+ }
+}
+
+__attribute__((noipa))
+void reversef (float *restrict a, float *restrict b, int n)
+{
+ for (int i = 0; i < n; i += 4)
+ {
+ a[i + 3] = b[i + 0];
+ a[i + 2] = b[i + 1];
+ a[i + 1] = b[i + 2];
+ a[i + 0] = b[i + 3];
+ }
+}
+
+int main()
+{
+ const int n = 1024;
+ char c[n], c2[n];
+ unsigned int u[n], u2[n];
+ long long l[n], l2[n];
+ double d[n], d2[n];
+ float f[n], f2[n];
+
+ for (int i = 0; i < n; i++)
+ {
+ c[i] = i;
+ u[i] = i;
+ l[i] = i;
+ d[i] = i;
+ f[i] = i;
+ c2[i] = i;
+ u2[i] = i;
+ l2[i] = i;
+ d2[i] = i;
+ f2[i] = i;
+ }
+
+ reversec (c2, c, n);
+ reversei (u2, u, n);
+ reversel (l2, l, n);
+ reversed (d2, d, n);
+ reversef (f2, f, n);
+
+ for (int i = 0; i < n - 16; i++)
+ {
+ assert (c[i] == c2[i / (16 / sizeof (c[0])) * (16 / sizeof
(c[0])) + 16 / sizeof (c[0]) - 1 - i % (16 / sizeof (c[0]))]);
+ assert (u[i] == u2[i / (16 / sizeof (u[0])) * (16 / sizeof
(u[0])) + 16 / sizeof (u[0]) - 1 - i % (16 / sizeof (u[0]))]);
+ assert (l[i] == l2[i / (16 / sizeof (l[0])) * (16 / sizeof
(l[0])) + 16 / sizeof (l[0]) - 1 - i % (16 / sizeof (l[0]))]);
+ assert (d[i] == d2[i / (16 / sizeof (d[0])) * (16 / sizeof
(d[0])) + 16 / sizeof (d[0]) - 1 - i % (16 / sizeof (d[0]))]);
+ assert (f[i] == f2[i / (16 / sizeof (f[0])) * (16 / sizeof
(f[0])) + 16 / sizeof (f[0]) - 1 - i % (16 / sizeof (f[0]))]);
+ }
+}
diff --git a/gcc/testsuite/gcc.target/s390/zvector/vec-reve-store-byte.c
b/gcc/testsuite/gcc.target/s390/zvector/vec-reve-store-byte.c
index db8284b1f8ff..6c061c69fea0 100644
--- a/gcc/testsuite/gcc.target/s390/zvector/vec-reve-store-byte.c
+++ b/gcc/testsuite/gcc.target/s390/zvector/vec-reve-store-byte.c
@@ -16,13 +16,11 @@ bar (signed char *target, vector signed char x)
vec_xst (vec_reve (x), 0, target);
}
-/* { dg-final { scan-assembler-times "vstbrq\t" 2 } } */
-
-/* mem -> mem: This becomes vlbrq + vst */
+/* mem -> mem: This becomes vl + vstbrq */
void
baz (vector signed char *target, vector signed char *x)
{
*target = vec_reve (*x);
}
-/* { dg-final { scan-assembler-times "vlbrq\t" 1 } } */
+/* { dg-final { scan-assembler-times "vstbrq\t" 3 } } */
--
2.31.1
next reply other threads:[~2022-08-12 10:57 UTC|newest]
Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-08-12 10:57 Robin Dapp [this message]
2022-08-22 15:10 ` Robin Dapp
2022-08-22 15:24 ` Andreas Krebbel
2022-08-31 7:54 ` Robin Dapp
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=cf841c57-f25e-c544-66f5-679f2c6b0516@linux.ibm.com \
--to=rdapp@linux.ibm.com \
--cc=gcc-patches@gcc.gnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).