* [PATCH 1/2] [ARM] PR68532: Fix up vuzp for big endian
2016-02-03 18:59 [ARM, PATCH v2 0/2] PR68532: Fix VZIP/VUZP recognition for big endian charles.baylis
@ 2016-02-03 18:59 ` charles.baylis
2016-02-08 11:42 ` Kyrill Tkachov
2016-02-03 18:59 ` [PATCH 2/2] [ARM] PR68532 Fix up vzip recognition " charles.baylis
1 sibling, 1 reply; 10+ messages in thread
From: charles.baylis @ 2016-02-03 18:59 UTC (permalink / raw)
To: Ramana.Radhakrishnan, kyrylo.tkachov, richard.earnshaw
Cc: rearnsha, gcc-patches, michael.collison
From: Charles Baylis <charles.baylis@linaro.org>
gcc/ChangeLog:
2016-02-03 Charles Baylis <charles.baylis@linaro.org>
PR target/68532
* config/arm/arm.c (neon_endian_lane_map): New function.
(neon_vector_pair_endian_lane_map): New function.
(arm_evpc_neon_vuzp): Allow for big endian lane order.
* config/arm/arm_neon.h (vuzpq_s8): Adjust shuffle patterns for big
endian.
(vuzpq_s16): Likewise.
(vuzpq_s32): Likewise.
(vuzpq_f32): Likewise.
(vuzpq_u8): Likewise.
(vuzpq_u16): Likewise.
(vuzpq_u32): Likewise.
(vuzpq_p8): Likewise.
(vuzpq_p16): Likewise.
gcc/testsuite/ChangeLog:
2015-12-15 Charles Baylis <charles.baylis@linaro.org>
PR target/68532
* gcc.c-torture/execute/pr68532.c: New test.
Change-Id: Ifd35d79bd42825f05403a1b96d8f34ef0f21dac3
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index d8a2745..e9aa982 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -28208,6 +28208,35 @@ arm_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
arm_expand_vec_perm_1 (target, op0, op1, sel);
}
+/* map lane ordering between architectural lane order, and GCC lane order,
+ taking into account ABI. See comment above output_move_neon for details. */
+static int
+neon_endian_lane_map (machine_mode mode, int lane)
+{
+ if (BYTES_BIG_ENDIAN)
+ {
+ int nelems = GET_MODE_NUNITS (mode);
+ /* Reverse lane order. */
+ lane = (nelems - 1 - lane);
+ /* Reverse D register order, to match ABI. */
+ if (GET_MODE_SIZE (mode) == 16)
+ lane = lane ^ (nelems / 2);
+ }
+ return lane;
+}
+
+/* some permutations index into pairs of vectors, this is a helper function
+ to map indexes into those pairs of vectors. */
+static int
+neon_pair_endian_lane_map (machine_mode mode, int lane)
+{
+ int nelem = GET_MODE_NUNITS (mode);
+ if (BYTES_BIG_ENDIAN)
+ lane =
+ neon_endian_lane_map (mode, lane & (nelem - 1)) + (lane & nelem);
+ return lane;
+}
+
/* Generate or test for an insn that supports a constant permutation. */
/* Recognize patterns for the VUZP insns. */
@@ -28218,14 +28247,22 @@ arm_evpc_neon_vuzp (struct expand_vec_perm_d *d)
unsigned int i, odd, mask, nelt = d->nelt;
rtx out0, out1, in0, in1;
rtx (*gen)(rtx, rtx, rtx, rtx);
+ int first_elem;
+ int swap;
if (GET_MODE_UNIT_SIZE (d->vmode) >= 8)
return false;
- /* Note that these are little-endian tests. Adjust for big-endian later. */
- if (d->perm[0] == 0)
+ /* arm_expand_vec_perm_const_1 () helpfully swaps the operands for the
+ big endian pattern on 64 bit vectors, so we correct for that. */
+ swap = BYTES_BIG_ENDIAN && !d->one_vector_p
+ && GET_MODE_SIZE (d->vmode) == 8 ? d->nelt : 0;
+
+ first_elem = d->perm[neon_endian_lane_map (d->vmode, 0)] ^ swap;
+
+ if (first_elem == neon_endian_lane_map (d->vmode, 0))
odd = 0;
- else if (d->perm[0] == 1)
+ else if (first_elem == neon_endian_lane_map (d->vmode, 1))
odd = 1;
else
return false;
@@ -28233,8 +28270,9 @@ arm_evpc_neon_vuzp (struct expand_vec_perm_d *d)
for (i = 0; i < nelt; i++)
{
- unsigned elt = (i * 2 + odd) & mask;
- if (d->perm[i] != elt)
+ unsigned elt =
+ (neon_pair_endian_lane_map (d->vmode, i) * 2 + odd) & mask;
+ if ((d->perm[i] ^ swap) != neon_pair_endian_lane_map (d->vmode, elt))
return false;
}
@@ -28258,10 +28296,9 @@ arm_evpc_neon_vuzp (struct expand_vec_perm_d *d)
in0 = d->op0;
in1 = d->op1;
- if (BYTES_BIG_ENDIAN)
+ if (swap)
{
std::swap (in0, in1);
- odd = !odd;
}
out0 = d->target;
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 47816d5..2e014b6 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -8741,9 +8741,9 @@ vuzpq_s8 (int8x16_t __a, int8x16_t __b)
int8x16x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
__rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
- { 17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15 });
+ { 9, 11, 13, 15, 1, 3, 5, 7, 25, 27, 29, 31, 17, 19, 21, 23 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint8x16_t)
- { 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14 });
+ { 8, 10, 12, 14, 0, 2, 4, 6, 24, 26, 28, 30, 16, 18, 20, 22 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
{ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 });
@@ -8759,9 +8759,9 @@ vuzpq_s16 (int16x8_t __a, int16x8_t __b)
int16x8x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
__rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
- { 9, 11, 13, 15, 1, 3, 5, 7 });
+ { 5, 7, 1, 3, 13, 15, 9, 11 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t)
- { 8, 10, 12, 14, 0, 2, 4, 6 });
+ { 4, 6, 0, 2, 12, 14, 8, 10 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
{ 0, 2, 4, 6, 8, 10, 12, 14 });
@@ -8776,8 +8776,8 @@ vuzpq_s32 (int32x4_t __a, int32x4_t __b)
{
int32x4x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
- __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 5, 7, 1, 3 });
- __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 4, 6, 0, 2 });
+ __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 3, 1, 7, 5 });
+ __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 0, 6, 4 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 2, 4, 6 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 1, 3, 5, 7 });
@@ -8790,8 +8790,8 @@ vuzpq_f32 (float32x4_t __a, float32x4_t __b)
{
float32x4x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
- __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 5, 7, 1, 3 });
- __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 4, 6, 0, 2 });
+ __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 3, 1, 7, 5 });
+ __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 0, 6, 4 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 2, 4, 6 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 1, 3, 5, 7 });
@@ -8805,9 +8805,9 @@ vuzpq_u8 (uint8x16_t __a, uint8x16_t __b)
uint8x16x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
__rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
- { 17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15 });
+ { 9, 11, 13, 15, 1, 3, 5, 7, 25, 27, 29, 31, 17, 19, 21, 23 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint8x16_t)
- { 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14 });
+ { 8, 10, 12, 14, 0, 2, 4, 6, 24, 26, 28, 30, 16, 18, 20, 22 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
{ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 });
@@ -8823,9 +8823,9 @@ vuzpq_u16 (uint16x8_t __a, uint16x8_t __b)
uint16x8x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
__rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
- { 9, 11, 13, 15, 1, 3, 5, 7 });
+ { 5, 7, 1, 3, 13, 15, 9, 11 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t)
- { 8, 10, 12, 14, 0, 2, 4, 6 });
+ { 4, 6, 0, 2, 12, 14, 8, 10 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
{ 0, 2, 4, 6, 8, 10, 12, 14 });
@@ -8840,8 +8840,8 @@ vuzpq_u32 (uint32x4_t __a, uint32x4_t __b)
{
uint32x4x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
- __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 5, 7, 1, 3 });
- __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 4, 6, 0, 2 });
+ __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 3, 1, 7, 5 });
+ __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 0, 6, 4 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 2, 4, 6 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 1, 3, 5, 7 });
@@ -8855,9 +8855,9 @@ vuzpq_p8 (poly8x16_t __a, poly8x16_t __b)
poly8x16x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
__rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
- { 17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15 });
+ { 9, 11, 13, 15, 1, 3, 5, 7, 25, 27, 29, 31, 17, 19, 21, 23 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint8x16_t)
- { 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14 });
+ { 8, 10, 12, 14, 0, 2, 4, 6, 24, 26, 28, 30, 16, 18, 20, 22 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
{ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 });
@@ -8873,9 +8873,9 @@ vuzpq_p16 (poly16x8_t __a, poly16x8_t __b)
poly16x8x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
__rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
- { 9, 11, 13, 15, 1, 3, 5, 7 });
+ { 5, 7, 1, 3, 13, 15, 9, 11 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t)
- { 8, 10, 12, 14, 0, 2, 4, 6 });
+ { 4, 6, 0, 2, 12, 14, 8, 10 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
{ 0, 2, 4, 6, 8, 10, 12, 14 });
diff --git a/gcc/testsuite/gcc.c-torture/execute/pr68532.c b/gcc/testsuite/gcc.c-torture/execute/pr68532.c
new file mode 100644
index 0000000..3c40aa8
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/pr68532.c
@@ -0,0 +1,24 @@
+/* { dg-options "-O2 -ftree-vectorize -fno-vect-cost-model" } */
+
+#define SIZE 128
+unsigned short _Alignas (16) in[SIZE];
+
+extern void abort (void);
+
+__attribute__ ((noinline)) int
+test (unsigned short sum, unsigned short *in, int x)
+{
+ for (int j = 0; j < SIZE; j += 8)
+ sum += in[j] * x;
+ return sum;
+}
+
+int
+main ()
+{
+ for (int i = 0; i < SIZE; i++)
+ in[i] = i;
+ if (test (0, in, 1) != 960)
+ abort ();
+ return 0;
+}
--
1.9.1
^ permalink raw reply [flat|nested] 10+ messages in thread
* [ARM, PATCH v2 0/2] PR68532: Fix VZIP/VUZP recognition for big endian
@ 2016-02-03 18:59 charles.baylis
2016-02-03 18:59 ` [PATCH 1/2] [ARM] PR68532: Fix up vuzp " charles.baylis
2016-02-03 18:59 ` [PATCH 2/2] [ARM] PR68532 Fix up vzip recognition " charles.baylis
0 siblings, 2 replies; 10+ messages in thread
From: charles.baylis @ 2016-02-03 18:59 UTC (permalink / raw)
To: Ramana.Radhakrishnan, kyrylo.tkachov, richard.earnshaw
Cc: rearnsha, gcc-patches, michael.collison
From: Charles Baylis <charles.baylis@linaro.org>
This is an updated patch, which fixes the following issues:
. big endian ICE with vshuf-* tests
. style issues reported by check_GNU_style.sh
This has no regressions with -mfpu=neon, for arm-unknown-linux-gnueabihf and
armeb-unknown-linux-gnueabihf. The new test passes for both, and big endian has
new PASSes for the vshuf-* execution tests, which currently fail on trunk.
The comment about the failures due to failure to vectorize seems to have been
incorrect.
Link to previous thread:
https://gcc.gnu.org/ml/gcc-patches/2016-02/msg00060.html
Charles Baylis (2):
[ARM] PR68532: Fix up vuzp for big endian
[ARM] PR68532 Fix up vzip recognition for big endian
gcc/config/arm/arm.c | 77 +++++++++++++++++++++------
gcc/config/arm/arm_neon.h | 72 ++++++++++++-------------
gcc/testsuite/gcc.c-torture/execute/pr68532.c | 24 +++++++++
3 files changed, 122 insertions(+), 51 deletions(-)
create mode 100644 gcc/testsuite/gcc.c-torture/execute/pr68532.c
--
1.9.1
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH 2/2] [ARM] PR68532 Fix up vzip recognition for big endian
2016-02-03 18:59 [ARM, PATCH v2 0/2] PR68532: Fix VZIP/VUZP recognition for big endian charles.baylis
2016-02-03 18:59 ` [PATCH 1/2] [ARM] PR68532: Fix up vuzp " charles.baylis
@ 2016-02-03 18:59 ` charles.baylis
2016-02-08 11:42 ` Kyrill Tkachov
1 sibling, 1 reply; 10+ messages in thread
From: charles.baylis @ 2016-02-03 18:59 UTC (permalink / raw)
To: Ramana.Radhakrishnan, kyrylo.tkachov, richard.earnshaw
Cc: rearnsha, gcc-patches, michael.collison
From: Charles Baylis <charles.baylis@linaro.org>
gcc/ChangeLog:
2016-02-03 Charles Baylis <charles.baylis@linaro.org>
PR target/68532
* config/arm/arm.c (arm_evpc_neon_vzip): Allow for big endian lane
order.
* config/arm/arm_neon.h (vzipq_s8): Adjust shuffle patterns for big
endian.
(vzipq_s16): Likewise.
(vzipq_s32): Likewise.
(vzipq_f32): Likewise.
(vzipq_u8): Likewise.
(vzipq_u16): Likewise.
(vzipq_u32): Likewise.
(vzipq_p8): Likewise.
(vzipq_p16): Likewise.
Change-Id: I327678f5e73c1de2f413c1d22769ab42ce1d6c16
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index e9aa982..24239db 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -28318,15 +28318,21 @@ arm_evpc_neon_vzip (struct expand_vec_perm_d *d)
unsigned int i, high, mask, nelt = d->nelt;
rtx out0, out1, in0, in1;
rtx (*gen)(rtx, rtx, rtx, rtx);
+ int first_elem;
+ bool is_swapped;
if (GET_MODE_UNIT_SIZE (d->vmode) >= 8)
return false;
+ is_swapped = BYTES_BIG_ENDIAN ? true : false;
+
/* Note that these are little-endian tests. Adjust for big-endian later. */
+ first_elem = d->perm[neon_endian_lane_map (d->vmode, 0) ^ is_swapped];
+
high = nelt / 2;
- if (d->perm[0] == high)
+ if (first_elem == neon_endian_lane_map (d->vmode, high))
;
- else if (d->perm[0] == 0)
+ else if (first_elem == neon_endian_lane_map (d->vmode, 0))
high = 0;
else
return false;
@@ -28334,11 +28340,16 @@ arm_evpc_neon_vzip (struct expand_vec_perm_d *d)
for (i = 0; i < nelt / 2; i++)
{
- unsigned elt = (i + high) & mask;
- if (d->perm[i * 2] != elt)
+ unsigned elt =
+ neon_pair_endian_lane_map (d->vmode, i + high) & mask;
+ if (d->perm[neon_pair_endian_lane_map (d->vmode, 2 * i + is_swapped)]
+ != elt)
return false;
- elt = (elt + nelt) & mask;
- if (d->perm[i * 2 + 1] != elt)
+ elt =
+ neon_pair_endian_lane_map (d->vmode, i + nelt + high)
+ & mask;
+ if (d->perm[neon_pair_endian_lane_map (d->vmode, 2 * i + !is_swapped)]
+ != elt)
return false;
}
@@ -28362,10 +28373,9 @@ arm_evpc_neon_vzip (struct expand_vec_perm_d *d)
in0 = d->op0;
in1 = d->op1;
- if (BYTES_BIG_ENDIAN)
+ if (is_swapped)
{
std::swap (in0, in1);
- high = !high;
}
out0 = d->target;
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 2e014b6..aa17f49 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -8453,9 +8453,9 @@ vzipq_s8 (int8x16_t __a, int8x16_t __b)
int8x16x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
__rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
- { 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 });
+ { 20, 4, 21, 5, 22, 6, 23, 7, 16, 0, 17, 1, 18, 2, 19, 3 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint8x16_t)
- { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
+ { 28, 12, 29, 13, 30, 14, 31, 15, 24, 8, 25, 9, 26, 10, 27, 11 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
{ 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
@@ -8471,9 +8471,9 @@ vzipq_s16 (int16x8_t __a, int16x8_t __b)
int16x8x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
__rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
- { 12, 4, 13, 5, 14, 6, 15, 7 });
+ { 10, 2, 11, 3, 8, 0, 9, 1 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t)
- { 8, 0, 9, 1, 10, 2, 11, 3 });
+ { 14, 6, 15, 7, 12, 4, 13, 5 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
{ 0, 8, 1, 9, 2, 10, 3, 11 });
@@ -8488,8 +8488,8 @@ vzipq_s32 (int32x4_t __a, int32x4_t __b)
{
int32x4x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
- __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 6, 2, 7, 3 });
- __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 4, 0, 5, 1 });
+ __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 5, 1, 4, 0 });
+ __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 7, 3, 6, 2 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 4, 1, 5 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 6, 3, 7 });
@@ -8502,8 +8502,8 @@ vzipq_f32 (float32x4_t __a, float32x4_t __b)
{
float32x4x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
- __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 6, 2, 7, 3 });
- __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 4, 0, 5, 1 });
+ __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 5, 1, 4, 0 });
+ __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 7, 3, 6, 2 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 4, 1, 5 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 6, 3, 7 });
@@ -8517,9 +8517,9 @@ vzipq_u8 (uint8x16_t __a, uint8x16_t __b)
uint8x16x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
__rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
- { 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 });
+ { 20, 4, 21, 5, 22, 6, 23, 7, 16, 0, 17, 1, 18, 2, 19, 3 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint8x16_t)
- { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
+ { 28, 12, 29, 13, 30, 14, 31, 15, 24, 8, 25, 9, 26, 10, 27, 11 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
{ 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
@@ -8535,9 +8535,9 @@ vzipq_u16 (uint16x8_t __a, uint16x8_t __b)
uint16x8x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
__rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
- { 12, 4, 13, 5, 14, 6, 15, 7 });
+ { 10, 2, 11, 3, 8, 0, 9, 1 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t)
- { 8, 0, 9, 1, 10, 2, 11, 3 });
+ { 14, 6, 15, 7, 12, 4, 13, 5 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
{ 0, 8, 1, 9, 2, 10, 3, 11 });
@@ -8552,8 +8552,8 @@ vzipq_u32 (uint32x4_t __a, uint32x4_t __b)
{
uint32x4x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
- __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 6, 2, 7, 3 });
- __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 4, 0, 5, 1 });
+ __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 5, 1, 4, 0 });
+ __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 7, 3, 6, 2 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 4, 1, 5 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 6, 3, 7 });
@@ -8567,9 +8567,9 @@ vzipq_p8 (poly8x16_t __a, poly8x16_t __b)
poly8x16x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
__rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
- { 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 });
+ { 20, 4, 21, 5, 22, 6, 23, 7, 16, 0, 17, 1, 18, 2, 19, 3 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint8x16_t)
- { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
+ { 28, 12, 29, 13, 30, 14, 31, 15, 24, 8, 25, 9, 26, 10, 27, 11 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
{ 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
@@ -8585,9 +8585,9 @@ vzipq_p16 (poly16x8_t __a, poly16x8_t __b)
poly16x8x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
__rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
- { 12, 4, 13, 5, 14, 6, 15, 7 });
+ { 10, 2, 11, 3, 8, 0, 9, 1 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t)
- { 8, 0, 9, 1, 10, 2, 11, 3 });
+ { 14, 6, 15, 7, 12, 4, 13, 5 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
{ 0, 8, 1, 9, 2, 10, 3, 11 });
--
1.9.1
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 2/2] [ARM] PR68532 Fix up vzip recognition for big endian
2016-02-03 18:59 ` [PATCH 2/2] [ARM] PR68532 Fix up vzip recognition " charles.baylis
@ 2016-02-08 11:42 ` Kyrill Tkachov
2016-02-09 17:07 ` Charles Baylis
0 siblings, 1 reply; 10+ messages in thread
From: Kyrill Tkachov @ 2016-02-08 11:42 UTC (permalink / raw)
To: charles.baylis, Ramana.Radhakrishnan, richard.earnshaw
Cc: rearnsha, gcc-patches, michael.collison
Hi Charles,
On 03/02/16 18:59, charles.baylis@linaro.org wrote:
> From: Charles Baylis <charles.baylis@linaro.org>
>
> gcc/ChangeLog:
>
> 2016-02-03 Charles Baylis <charles.baylis@linaro.org>
>
> PR target/68532
> * config/arm/arm.c (arm_evpc_neon_vzip): Allow for big endian lane
> order.
> * config/arm/arm_neon.h (vzipq_s8): Adjust shuffle patterns for big
> endian.
> (vzipq_s16): Likewise.
> (vzipq_s32): Likewise.
> (vzipq_f32): Likewise.
> (vzipq_u8): Likewise.
> (vzipq_u16): Likewise.
> (vzipq_u32): Likewise.
> (vzipq_p8): Likewise.
> (vzipq_p16): Likewise.
>
> Change-Id: I327678f5e73c1de2f413c1d22769ab42ce1d6c16
>
> diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
> index e9aa982..24239db 100644
> --- a/gcc/config/arm/arm.c
> +++ b/gcc/config/arm/arm.c
> @@ -28318,15 +28318,21 @@ arm_evpc_neon_vzip (struct expand_vec_perm_d *d)
> unsigned int i, high, mask, nelt = d->nelt;
> rtx out0, out1, in0, in1;
> rtx (*gen)(rtx, rtx, rtx, rtx);
> + int first_elem;
> + bool is_swapped;
>
> if (GET_MODE_UNIT_SIZE (d->vmode) >= 8)
> return false;
>
> + is_swapped = BYTES_BIG_ENDIAN ? true : false;
This is just "is_swapped = BYTES_BIG_ENDIAN;"
> +
> /* Note that these are little-endian tests. Adjust for big-endian later. */
I think you can remove this comment now, like in patch 1/2
> + first_elem = d->perm[neon_endian_lane_map (d->vmode, 0) ^ is_swapped];
> +
> high = nelt / 2;
> - if (d->perm[0] == high)
> + if (first_elem == neon_endian_lane_map (d->vmode, high))
> ;
> - else if (d->perm[0] == 0)
> + else if (first_elem == neon_endian_lane_map (d->vmode, 0))
> high = 0;
> else
> return false;
> @@ -28334,11 +28340,16 @@ arm_evpc_neon_vzip (struct expand_vec_perm_d *d)
>
> for (i = 0; i < nelt / 2; i++)
> {
> - unsigned elt = (i + high) & mask;
> - if (d->perm[i * 2] != elt)
> + unsigned elt =
> + neon_pair_endian_lane_map (d->vmode, i + high) & mask;
> + if (d->perm[neon_pair_endian_lane_map (d->vmode, 2 * i + is_swapped)]
> + != elt)
> return false;
> - elt = (elt + nelt) & mask;
> - if (d->perm[i * 2 + 1] != elt)
> + elt =
> + neon_pair_endian_lane_map (d->vmode, i + nelt + high)
> + & mask;
The "& mask" can go on the previous line.
> + if (d->perm[neon_pair_endian_lane_map (d->vmode, 2 * i + !is_swapped)]
> + != elt)
> return false;
> }
>
> @@ -28362,10 +28373,9 @@ arm_evpc_neon_vzip (struct expand_vec_perm_d *d)
>
> in0 = d->op0;
> in1 = d->op1;
> - if (BYTES_BIG_ENDIAN)
> + if (is_swapped)
> {
> std::swap (in0, in1);
> - high = !high;
> }
remove the braces around the std::swap.
Ok with these changes.
I've tried out both patch and they do fix execution failures on big-endian
and don't break any NEON intrinsics tests that I threw at them.
>
> out0 = d->target;
> diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
> index 2e014b6..aa17f49 100644
> --- a/gcc/config/arm/arm_neon.h
> +++ b/gcc/config/arm/arm_neon.h
> @@ -8453,9 +8453,9 @@ vzipq_s8 (int8x16_t __a, int8x16_t __b)
> int8x16x2_t __rv;
> #ifdef __ARM_BIG_ENDIAN
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
> - { 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 });
> + { 20, 4, 21, 5, 22, 6, 23, 7, 16, 0, 17, 1, 18, 2, 19, 3 });
> __rv.val[1] = __builtin_shuffle (__a, __b, (uint8x16_t)
> - { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
> + { 28, 12, 29, 13, 30, 14, 31, 15, 24, 8, 25, 9, 26, 10, 27, 11 });
> #else
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
> { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
> @@ -8471,9 +8471,9 @@ vzipq_s16 (int16x8_t __a, int16x8_t __b)
> int16x8x2_t __rv;
> #ifdef __ARM_BIG_ENDIAN
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
> - { 12, 4, 13, 5, 14, 6, 15, 7 });
> + { 10, 2, 11, 3, 8, 0, 9, 1 });
> __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t)
> - { 8, 0, 9, 1, 10, 2, 11, 3 });
> + { 14, 6, 15, 7, 12, 4, 13, 5 });
> #else
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
> { 0, 8, 1, 9, 2, 10, 3, 11 });
> @@ -8488,8 +8488,8 @@ vzipq_s32 (int32x4_t __a, int32x4_t __b)
> {
> int32x4x2_t __rv;
> #ifdef __ARM_BIG_ENDIAN
> - __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 6, 2, 7, 3 });
> - __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 4, 0, 5, 1 });
> + __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 5, 1, 4, 0 });
> + __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 7, 3, 6, 2 });
> #else
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 4, 1, 5 });
> __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 6, 3, 7 });
> @@ -8502,8 +8502,8 @@ vzipq_f32 (float32x4_t __a, float32x4_t __b)
> {
> float32x4x2_t __rv;
> #ifdef __ARM_BIG_ENDIAN
> - __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 6, 2, 7, 3 });
> - __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 4, 0, 5, 1 });
> + __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 5, 1, 4, 0 });
> + __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 7, 3, 6, 2 });
> #else
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 4, 1, 5 });
> __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 6, 3, 7 });
> @@ -8517,9 +8517,9 @@ vzipq_u8 (uint8x16_t __a, uint8x16_t __b)
> uint8x16x2_t __rv;
> #ifdef __ARM_BIG_ENDIAN
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
> - { 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 });
> + { 20, 4, 21, 5, 22, 6, 23, 7, 16, 0, 17, 1, 18, 2, 19, 3 });
> __rv.val[1] = __builtin_shuffle (__a, __b, (uint8x16_t)
> - { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
> + { 28, 12, 29, 13, 30, 14, 31, 15, 24, 8, 25, 9, 26, 10, 27, 11 });
> #else
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
> { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
> @@ -8535,9 +8535,9 @@ vzipq_u16 (uint16x8_t __a, uint16x8_t __b)
> uint16x8x2_t __rv;
> #ifdef __ARM_BIG_ENDIAN
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
> - { 12, 4, 13, 5, 14, 6, 15, 7 });
> + { 10, 2, 11, 3, 8, 0, 9, 1 });
> __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t)
> - { 8, 0, 9, 1, 10, 2, 11, 3 });
> + { 14, 6, 15, 7, 12, 4, 13, 5 });
> #else
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
> { 0, 8, 1, 9, 2, 10, 3, 11 });
> @@ -8552,8 +8552,8 @@ vzipq_u32 (uint32x4_t __a, uint32x4_t __b)
> {
> uint32x4x2_t __rv;
> #ifdef __ARM_BIG_ENDIAN
> - __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 6, 2, 7, 3 });
> - __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 4, 0, 5, 1 });
> + __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 5, 1, 4, 0 });
> + __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 7, 3, 6, 2 });
> #else
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 4, 1, 5 });
> __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 6, 3, 7 });
> @@ -8567,9 +8567,9 @@ vzipq_p8 (poly8x16_t __a, poly8x16_t __b)
> poly8x16x2_t __rv;
> #ifdef __ARM_BIG_ENDIAN
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
> - { 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 });
> + { 20, 4, 21, 5, 22, 6, 23, 7, 16, 0, 17, 1, 18, 2, 19, 3 });
> __rv.val[1] = __builtin_shuffle (__a, __b, (uint8x16_t)
> - { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
> + { 28, 12, 29, 13, 30, 14, 31, 15, 24, 8, 25, 9, 26, 10, 27, 11 });
> #else
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
> { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
> @@ -8585,9 +8585,9 @@ vzipq_p16 (poly16x8_t __a, poly16x8_t __b)
> poly16x8x2_t __rv;
> #ifdef __ARM_BIG_ENDIAN
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
> - { 12, 4, 13, 5, 14, 6, 15, 7 });
> + { 10, 2, 11, 3, 8, 0, 9, 1 });
> __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t)
> - { 8, 0, 9, 1, 10, 2, 11, 3 });
> + { 14, 6, 15, 7, 12, 4, 13, 5 });
> #else
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
> { 0, 8, 1, 9, 2, 10, 3, 11 });
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 1/2] [ARM] PR68532: Fix up vuzp for big endian
2016-02-03 18:59 ` [PATCH 1/2] [ARM] PR68532: Fix up vuzp " charles.baylis
@ 2016-02-08 11:42 ` Kyrill Tkachov
2016-02-09 17:01 ` Charles Baylis
0 siblings, 1 reply; 10+ messages in thread
From: Kyrill Tkachov @ 2016-02-08 11:42 UTC (permalink / raw)
To: charles.baylis, Ramana.Radhakrishnan, richard.earnshaw
Cc: rearnsha, gcc-patches, michael.collison
Hi Charles,
On 03/02/16 18:59, charles.baylis@linaro.org wrote:
> From: Charles Baylis <charles.baylis@linaro.org>
>
> gcc/ChangeLog:
>
> 2016-02-03 Charles Baylis <charles.baylis@linaro.org>
>
> PR target/68532
> * config/arm/arm.c (neon_endian_lane_map): New function.
> (neon_vector_pair_endian_lane_map): New function.
> (arm_evpc_neon_vuzp): Allow for big endian lane order.
> * config/arm/arm_neon.h (vuzpq_s8): Adjust shuffle patterns for big
> endian.
> (vuzpq_s16): Likewise.
> (vuzpq_s32): Likewise.
> (vuzpq_f32): Likewise.
> (vuzpq_u8): Likewise.
> (vuzpq_u16): Likewise.
> (vuzpq_u32): Likewise.
> (vuzpq_p8): Likewise.
> (vuzpq_p16): Likewise.
>
> gcc/testsuite/ChangeLog:
>
> 2015-12-15 Charles Baylis <charles.baylis@linaro.org>
>
> PR target/68532
> * gcc.c-torture/execute/pr68532.c: New test.
>
> Change-Id: Ifd35d79bd42825f05403a1b96d8f34ef0f21dac3
>
> diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
> index d8a2745..e9aa982 100644
> --- a/gcc/config/arm/arm.c
> +++ b/gcc/config/arm/arm.c
> @@ -28208,6 +28208,35 @@ arm_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
> arm_expand_vec_perm_1 (target, op0, op1, sel);
> }
>
> +/* map lane ordering between architectural lane order, and GCC lane order,
> + taking into account ABI. See comment above output_move_neon for details. */
> +static int
> +neon_endian_lane_map (machine_mode mode, int lane)
s/map/Map/
New line between comment and function signature.
> +{
> + if (BYTES_BIG_ENDIAN)
> + {
> + int nelems = GET_MODE_NUNITS (mode);
> + /* Reverse lane order. */
> + lane = (nelems - 1 - lane);
> + /* Reverse D register order, to match ABI. */
> + if (GET_MODE_SIZE (mode) == 16)
> + lane = lane ^ (nelems / 2);
> + }
> + return lane;
> +}
> +
> +/* some permutations index into pairs of vectors, this is a helper function
> + to map indexes into those pairs of vectors. */
> +static int
> +neon_pair_endian_lane_map (machine_mode mode, int lane)
Similarly, s/some/Some/ and new line after comment.
> +{
> + int nelem = GET_MODE_NUNITS (mode);
> + if (BYTES_BIG_ENDIAN)
> + lane =
> + neon_endian_lane_map (mode, lane & (nelem - 1)) + (lane & nelem);
> + return lane;
> +}
> +
> /* Generate or test for an insn that supports a constant permutation. */
>
> /* Recognize patterns for the VUZP insns. */
> @@ -28218,14 +28247,22 @@ arm_evpc_neon_vuzp (struct expand_vec_perm_d *d)
> unsigned int i, odd, mask, nelt = d->nelt;
> rtx out0, out1, in0, in1;
> rtx (*gen)(rtx, rtx, rtx, rtx);
> + int first_elem;
> + int swap;
>
Just make this a bool.
> if (GET_MODE_UNIT_SIZE (d->vmode) >= 8)
> return false;
>
> - /* Note that these are little-endian tests. Adjust for big-endian later. */
> - if (d->perm[0] == 0)
> + /* arm_expand_vec_perm_const_1 () helpfully swaps the operands for the
> + big endian pattern on 64 bit vectors, so we correct for that. */
> + swap = BYTES_BIG_ENDIAN && !d->one_vector_p
> + && GET_MODE_SIZE (d->vmode) == 8 ? d->nelt : 0;
> +
> + first_elem = d->perm[neon_endian_lane_map (d->vmode, 0)] ^ swap;
> +
> + if (first_elem == neon_endian_lane_map (d->vmode, 0))
> odd = 0;
> - else if (d->perm[0] == 1)
> + else if (first_elem == neon_endian_lane_map (d->vmode, 1))
> odd = 1;
> else
> return false;
> @@ -28233,8 +28270,9 @@ arm_evpc_neon_vuzp (struct expand_vec_perm_d *d)
>
> for (i = 0; i < nelt; i++)
> {
> - unsigned elt = (i * 2 + odd) & mask;
> - if (d->perm[i] != elt)
> + unsigned elt =
> + (neon_pair_endian_lane_map (d->vmode, i) * 2 + odd) & mask;
> + if ((d->perm[i] ^ swap) != neon_pair_endian_lane_map (d->vmode, elt))
> return false;
> }
>
> @@ -28258,10 +28296,9 @@ arm_evpc_neon_vuzp (struct expand_vec_perm_d *d)
>
> in0 = d->op0;
> in1 = d->op1;
> - if (BYTES_BIG_ENDIAN)
> + if (swap)
> {
> std::swap (in0, in1);
> - odd = !odd;
> }
remove the braces around the std::swap
>
> out0 = d->target;
> diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
> index 47816d5..2e014b6 100644
> --- a/gcc/config/arm/arm_neon.h
> +++ b/gcc/config/arm/arm_neon.h
> @@ -8741,9 +8741,9 @@ vuzpq_s8 (int8x16_t __a, int8x16_t __b)
> int8x16x2_t __rv;
> #ifdef __ARM_BIG_ENDIAN
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
> - { 17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15 });
> + { 9, 11, 13, 15, 1, 3, 5, 7, 25, 27, 29, 31, 17, 19, 21, 23 });
> __rv.val[1] = __builtin_shuffle (__a, __b, (uint8x16_t)
> - { 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14 });
> + { 8, 10, 12, 14, 0, 2, 4, 6, 24, 26, 28, 30, 16, 18, 20, 22 });
> #else
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
> { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 });
> @@ -8759,9 +8759,9 @@ vuzpq_s16 (int16x8_t __a, int16x8_t __b)
> int16x8x2_t __rv;
> #ifdef __ARM_BIG_ENDIAN
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
> - { 9, 11, 13, 15, 1, 3, 5, 7 });
> + { 5, 7, 1, 3, 13, 15, 9, 11 });
> __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t)
> - { 8, 10, 12, 14, 0, 2, 4, 6 });
> + { 4, 6, 0, 2, 12, 14, 8, 10 });
> #else
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
> { 0, 2, 4, 6, 8, 10, 12, 14 });
> @@ -8776,8 +8776,8 @@ vuzpq_s32 (int32x4_t __a, int32x4_t __b)
> {
> int32x4x2_t __rv;
> #ifdef __ARM_BIG_ENDIAN
> - __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 5, 7, 1, 3 });
> - __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 4, 6, 0, 2 });
> + __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 3, 1, 7, 5 });
> + __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 0, 6, 4 });
> #else
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 2, 4, 6 });
> __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 1, 3, 5, 7 });
> @@ -8790,8 +8790,8 @@ vuzpq_f32 (float32x4_t __a, float32x4_t __b)
> {
> float32x4x2_t __rv;
> #ifdef __ARM_BIG_ENDIAN
> - __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 5, 7, 1, 3 });
> - __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 4, 6, 0, 2 });
> + __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 3, 1, 7, 5 });
> + __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 0, 6, 4 });
> #else
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 2, 4, 6 });
> __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 1, 3, 5, 7 });
> @@ -8805,9 +8805,9 @@ vuzpq_u8 (uint8x16_t __a, uint8x16_t __b)
> uint8x16x2_t __rv;
> #ifdef __ARM_BIG_ENDIAN
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
> - { 17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15 });
> + { 9, 11, 13, 15, 1, 3, 5, 7, 25, 27, 29, 31, 17, 19, 21, 23 });
> __rv.val[1] = __builtin_shuffle (__a, __b, (uint8x16_t)
> - { 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14 });
> + { 8, 10, 12, 14, 0, 2, 4, 6, 24, 26, 28, 30, 16, 18, 20, 22 });
> #else
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
> { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 });
> @@ -8823,9 +8823,9 @@ vuzpq_u16 (uint16x8_t __a, uint16x8_t __b)
> uint16x8x2_t __rv;
> #ifdef __ARM_BIG_ENDIAN
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
> - { 9, 11, 13, 15, 1, 3, 5, 7 });
> + { 5, 7, 1, 3, 13, 15, 9, 11 });
> __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t)
> - { 8, 10, 12, 14, 0, 2, 4, 6 });
> + { 4, 6, 0, 2, 12, 14, 8, 10 });
> #else
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
> { 0, 2, 4, 6, 8, 10, 12, 14 });
> @@ -8840,8 +8840,8 @@ vuzpq_u32 (uint32x4_t __a, uint32x4_t __b)
> {
> uint32x4x2_t __rv;
> #ifdef __ARM_BIG_ENDIAN
> - __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 5, 7, 1, 3 });
> - __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 4, 6, 0, 2 });
> + __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 3, 1, 7, 5 });
> + __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 0, 6, 4 });
> #else
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 2, 4, 6 });
> __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 1, 3, 5, 7 });
> @@ -8855,9 +8855,9 @@ vuzpq_p8 (poly8x16_t __a, poly8x16_t __b)
> poly8x16x2_t __rv;
> #ifdef __ARM_BIG_ENDIAN
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
> - { 17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15 });
> + { 9, 11, 13, 15, 1, 3, 5, 7, 25, 27, 29, 31, 17, 19, 21, 23 });
> __rv.val[1] = __builtin_shuffle (__a, __b, (uint8x16_t)
> - { 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14 });
> + { 8, 10, 12, 14, 0, 2, 4, 6, 24, 26, 28, 30, 16, 18, 20, 22 });
> #else
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
> { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 });
> @@ -8873,9 +8873,9 @@ vuzpq_p16 (poly16x8_t __a, poly16x8_t __b)
> poly16x8x2_t __rv;
> #ifdef __ARM_BIG_ENDIAN
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
> - { 9, 11, 13, 15, 1, 3, 5, 7 });
> + { 5, 7, 1, 3, 13, 15, 9, 11 });
> __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t)
> - { 8, 10, 12, 14, 0, 2, 4, 6 });
> + { 4, 6, 0, 2, 12, 14, 8, 10 });
> #else
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
> { 0, 2, 4, 6, 8, 10, 12, 14 });
> diff --git a/gcc/testsuite/gcc.c-torture/execute/pr68532.c b/gcc/testsuite/gcc.c-torture/execute/pr68532.c
> new file mode 100644
> index 0000000..3c40aa8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.c-torture/execute/pr68532.c
> @@ -0,0 +1,24 @@
> +/* { dg-options "-O2 -ftree-vectorize -fno-vect-cost-model" } */
> +
> +#define SIZE 128
> +unsigned short _Alignas (16) in[SIZE];
> +
> +extern void abort (void);
> +
> +__attribute__ ((noinline)) int
> +test (unsigned short sum, unsigned short *in, int x)
> +{
> + for (int j = 0; j < SIZE; j += 8)
> + sum += in[j] * x;
> + return sum;
> +}
> +
> +int
> +main ()
> +{
> + for (int i = 0; i < SIZE; i++)
> + in[i] = i;
> + if (test (0, in, 1) != 960)
> + abort ();
AFAIK tests here usually prefer __builtin_abort ();
That way you don't have to declare the abort prototype in the beginning.
Ok with those changes.
Thanks,
Kyrill
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 1/2] [ARM] PR68532: Fix up vuzp for big endian
2016-02-08 11:42 ` Kyrill Tkachov
@ 2016-02-09 17:01 ` Charles Baylis
2016-02-09 17:08 ` Kyrill Tkachov
0 siblings, 1 reply; 10+ messages in thread
From: Charles Baylis @ 2016-02-09 17:01 UTC (permalink / raw)
To: Kyrill Tkachov
Cc: Ramana Radhakrishnan, Richard Earnshaw, Richard Earnshaw,
GCC Patches, Michael Collison
[-- Attachment #1: Type: text/plain, Size: 3071 bytes --]
On 8 February 2016 at 11:42, Kyrill Tkachov <kyrylo.tkachov@foss.arm.com> wrote:
> Hi Charles,
>
>
> On 03/02/16 18:59, charles.baylis@linaro.org wrote:
>>
>> --- a/gcc/config/arm/arm.c
>> +++ b/gcc/config/arm/arm.c
>> @@ -28208,6 +28208,35 @@ arm_expand_vec_perm (rtx target, rtx op0, rtx
>> op1, rtx sel)
>> arm_expand_vec_perm_1 (target, op0, op1, sel);
>> }
>> +/* map lane ordering between architectural lane order, and GCC lane
>> order,
>> + taking into account ABI. See comment above output_move_neon for
>> details. */
>> +static int
>> +neon_endian_lane_map (machine_mode mode, int lane)
>
>
> s/map/Map/
> New line between comment and function signature.
Done.
>> +{
>> + if (BYTES_BIG_ENDIAN)
>> + {
>> + int nelems = GET_MODE_NUNITS (mode);
>> + /* Reverse lane order. */
>> + lane = (nelems - 1 - lane);
>> + /* Reverse D register order, to match ABI. */
>> + if (GET_MODE_SIZE (mode) == 16)
>> + lane = lane ^ (nelems / 2);
>> + }
>> + return lane;
>> +}
>> +
>> +/* some permutations index into pairs of vectors, this is a helper
>> function
>> + to map indexes into those pairs of vectors. */
>> +static int
>> +neon_pair_endian_lane_map (machine_mode mode, int lane)
>
>
> Similarly, s/some/Some/ and new line after comment.
Done.
>> +{
>> + int nelem = GET_MODE_NUNITS (mode);
>> + if (BYTES_BIG_ENDIAN)
>> + lane =
>> + neon_endian_lane_map (mode, lane & (nelem - 1)) + (lane & nelem);
>> + return lane;
>> +}
>> +
>> /* Generate or test for an insn that supports a constant permutation.
>> */
>> /* Recognize patterns for the VUZP insns. */
>> @@ -28218,14 +28247,22 @@ arm_evpc_neon_vuzp (struct expand_vec_perm_d *d)
>> unsigned int i, odd, mask, nelt = d->nelt;
>> rtx out0, out1, in0, in1;
>> rtx (*gen)(rtx, rtx, rtx, rtx);
>> + int first_elem;
>> + int swap;
>>
>
> Just make this a bool.
As discussed on IRC, this variable does contain an integer. I have
renamed it as swap_nelt, and changed the test on it below.
[snip]
>> @@ -28258,10 +28296,9 @@ arm_evpc_neon_vuzp (struct expand_vec_perm_d
>> *d)
>> in0 = d->op0;
>> in1 = d->op1;
>> - if (BYTES_BIG_ENDIAN)
>> + if (swap)
>> {
>> std::swap (in0, in1);
>> - odd = !odd;
>> }
>
> remove the braces around the std::swap
Done. Also changed if (swap) to if (swap_nelt != 0)
[snip]
>> @@ -0,0 +1,24 @@
>> +/* { dg-options "-O2 -ftree-vectorize -fno-vect-cost-model" } */
>> +
>> +#define SIZE 128
>> +unsigned short _Alignas (16) in[SIZE];
>> +
>> +extern void abort (void);
>> +
>> +__attribute__ ((noinline)) int
>> +test (unsigned short sum, unsigned short *in, int x)
>> +{
>> + for (int j = 0; j < SIZE; j += 8)
>> + sum += in[j] * x;
>> + return sum;
>> +}
>> +
>> +int
>> +main ()
>> +{
>> + for (int i = 0; i < SIZE; i++)
>> + in[i] = i;
>> + if (test (0, in, 1) != 960)
>> + abort ();
>
>
> AFAIK tests here usually prefer __builtin_abort ();
> That way you don't have to declare the abort prototype in the beginning.
Done.
Updated patch attached
[-- Attachment #2: 0001-ARM-PR68532-Fix-up-vuzp-for-big-endian.patch --]
[-- Type: text/x-diff, Size: 9836 bytes --]
From 99a536e2e10e3759a5de88422fadcabb22084b2f Mon Sep 17 00:00:00 2001
From: Charles Baylis <charles.baylis@linaro.org>
Date: Tue, 9 Feb 2016 15:18:43 +0000
Subject: [PATCH 1/2] [ARM] PR68532: Fix up vuzp for big endian
gcc/ChangeLog:
2016-02-09 Charles Baylis <charles.baylis@linaro.org>
PR target/68532
* config/arm/arm.c (neon_endian_lane_map): New function.
(neon_vector_pair_endian_lane_map): New function.
(arm_evpc_neon_vuzp): Allow for big endian lane order.
* config/arm/arm_neon.h (vuzpq_s8): Adjust shuffle patterns for big
endian.
(vuzpq_s16): Likewise.
(vuzpq_s32): Likewise.
(vuzpq_f32): Likewise.
(vuzpq_u8): Likewise.
(vuzpq_u16): Likewise.
(vuzpq_u32): Likewise.
(vuzpq_p8): Likewise.
(vuzpq_p16): Likewise.
gcc/testsuite/ChangeLog:
2016-02-09 Charles Baylis <charles.baylis@linaro.org>
PR target/68532
* gcc.c-torture/execute/pr68532.c: New test.
Change-Id: Ifd35d79bd42825f05403a1b96d8f34ef0f21dac3
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index d8a2745..95ee9a5 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -28208,6 +28208,37 @@ arm_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
arm_expand_vec_perm_1 (target, op0, op1, sel);
}
+/* Map lane ordering between architectural lane order, and GCC lane order,
+ taking into account ABI. See comment above output_move_neon for details. */
+
+static int
+neon_endian_lane_map (machine_mode mode, int lane)
+{
+ if (BYTES_BIG_ENDIAN)
+ {
+ int nelems = GET_MODE_NUNITS (mode);
+ /* Reverse lane order. */
+ lane = (nelems - 1 - lane);
+ /* Reverse D register order, to match ABI. */
+ if (GET_MODE_SIZE (mode) == 16)
+ lane = lane ^ (nelems / 2);
+ }
+ return lane;
+}
+
+/* Some permutations index into pairs of vectors, this is a helper function
+ to map indexes into those pairs of vectors. */
+
+static int
+neon_pair_endian_lane_map (machine_mode mode, int lane)
+{
+ int nelem = GET_MODE_NUNITS (mode);
+ if (BYTES_BIG_ENDIAN)
+ lane =
+ neon_endian_lane_map (mode, lane & (nelem - 1)) + (lane & nelem);
+ return lane;
+}
+
/* Generate or test for an insn that supports a constant permutation. */
/* Recognize patterns for the VUZP insns. */
@@ -28218,14 +28249,22 @@ arm_evpc_neon_vuzp (struct expand_vec_perm_d *d)
unsigned int i, odd, mask, nelt = d->nelt;
rtx out0, out1, in0, in1;
rtx (*gen)(rtx, rtx, rtx, rtx);
+ int first_elem;
+ int swap_nelt;
if (GET_MODE_UNIT_SIZE (d->vmode) >= 8)
return false;
- /* Note that these are little-endian tests. Adjust for big-endian later. */
- if (d->perm[0] == 0)
+ /* arm_expand_vec_perm_const_1 () helpfully swaps the operands for the
+ big endian pattern on 64 bit vectors, so we correct for that. */
+ swap_nelt = BYTES_BIG_ENDIAN && !d->one_vector_p
+ && GET_MODE_SIZE (d->vmode) == 8 ? d->nelt : 0;
+
+ first_elem = d->perm[neon_endian_lane_map (d->vmode, 0)] ^ swap_nelt;
+
+ if (first_elem == neon_endian_lane_map (d->vmode, 0))
odd = 0;
- else if (d->perm[0] == 1)
+ else if (first_elem == neon_endian_lane_map (d->vmode, 1))
odd = 1;
else
return false;
@@ -28233,8 +28272,9 @@ arm_evpc_neon_vuzp (struct expand_vec_perm_d *d)
for (i = 0; i < nelt; i++)
{
- unsigned elt = (i * 2 + odd) & mask;
- if (d->perm[i] != elt)
+ unsigned elt =
+ (neon_pair_endian_lane_map (d->vmode, i) * 2 + odd) & mask;
+ if ((d->perm[i] ^ swap_nelt) != neon_pair_endian_lane_map (d->vmode, elt))
return false;
}
@@ -28258,11 +28298,8 @@ arm_evpc_neon_vuzp (struct expand_vec_perm_d *d)
in0 = d->op0;
in1 = d->op1;
- if (BYTES_BIG_ENDIAN)
- {
- std::swap (in0, in1);
- odd = !odd;
- }
+ if (swap_nelt != 0)
+ std::swap (in0, in1);
out0 = d->target;
out1 = gen_reg_rtx (d->vmode);
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 47816d5..2e014b6 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -8741,9 +8741,9 @@ vuzpq_s8 (int8x16_t __a, int8x16_t __b)
int8x16x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
__rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
- { 17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15 });
+ { 9, 11, 13, 15, 1, 3, 5, 7, 25, 27, 29, 31, 17, 19, 21, 23 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint8x16_t)
- { 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14 });
+ { 8, 10, 12, 14, 0, 2, 4, 6, 24, 26, 28, 30, 16, 18, 20, 22 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
{ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 });
@@ -8759,9 +8759,9 @@ vuzpq_s16 (int16x8_t __a, int16x8_t __b)
int16x8x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
__rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
- { 9, 11, 13, 15, 1, 3, 5, 7 });
+ { 5, 7, 1, 3, 13, 15, 9, 11 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t)
- { 8, 10, 12, 14, 0, 2, 4, 6 });
+ { 4, 6, 0, 2, 12, 14, 8, 10 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
{ 0, 2, 4, 6, 8, 10, 12, 14 });
@@ -8776,8 +8776,8 @@ vuzpq_s32 (int32x4_t __a, int32x4_t __b)
{
int32x4x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
- __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 5, 7, 1, 3 });
- __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 4, 6, 0, 2 });
+ __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 3, 1, 7, 5 });
+ __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 0, 6, 4 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 2, 4, 6 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 1, 3, 5, 7 });
@@ -8790,8 +8790,8 @@ vuzpq_f32 (float32x4_t __a, float32x4_t __b)
{
float32x4x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
- __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 5, 7, 1, 3 });
- __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 4, 6, 0, 2 });
+ __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 3, 1, 7, 5 });
+ __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 0, 6, 4 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 2, 4, 6 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 1, 3, 5, 7 });
@@ -8805,9 +8805,9 @@ vuzpq_u8 (uint8x16_t __a, uint8x16_t __b)
uint8x16x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
__rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
- { 17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15 });
+ { 9, 11, 13, 15, 1, 3, 5, 7, 25, 27, 29, 31, 17, 19, 21, 23 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint8x16_t)
- { 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14 });
+ { 8, 10, 12, 14, 0, 2, 4, 6, 24, 26, 28, 30, 16, 18, 20, 22 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
{ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 });
@@ -8823,9 +8823,9 @@ vuzpq_u16 (uint16x8_t __a, uint16x8_t __b)
uint16x8x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
__rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
- { 9, 11, 13, 15, 1, 3, 5, 7 });
+ { 5, 7, 1, 3, 13, 15, 9, 11 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t)
- { 8, 10, 12, 14, 0, 2, 4, 6 });
+ { 4, 6, 0, 2, 12, 14, 8, 10 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
{ 0, 2, 4, 6, 8, 10, 12, 14 });
@@ -8840,8 +8840,8 @@ vuzpq_u32 (uint32x4_t __a, uint32x4_t __b)
{
uint32x4x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
- __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 5, 7, 1, 3 });
- __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 4, 6, 0, 2 });
+ __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 3, 1, 7, 5 });
+ __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 0, 6, 4 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 2, 4, 6 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 1, 3, 5, 7 });
@@ -8855,9 +8855,9 @@ vuzpq_p8 (poly8x16_t __a, poly8x16_t __b)
poly8x16x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
__rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
- { 17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15 });
+ { 9, 11, 13, 15, 1, 3, 5, 7, 25, 27, 29, 31, 17, 19, 21, 23 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint8x16_t)
- { 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14 });
+ { 8, 10, 12, 14, 0, 2, 4, 6, 24, 26, 28, 30, 16, 18, 20, 22 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
{ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 });
@@ -8873,9 +8873,9 @@ vuzpq_p16 (poly16x8_t __a, poly16x8_t __b)
poly16x8x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
__rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
- { 9, 11, 13, 15, 1, 3, 5, 7 });
+ { 5, 7, 1, 3, 13, 15, 9, 11 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t)
- { 8, 10, 12, 14, 0, 2, 4, 6 });
+ { 4, 6, 0, 2, 12, 14, 8, 10 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
{ 0, 2, 4, 6, 8, 10, 12, 14 });
diff --git a/gcc/testsuite/gcc.c-torture/execute/pr68532.c b/gcc/testsuite/gcc.c-torture/execute/pr68532.c
new file mode 100644
index 0000000..5d4bd8e
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/pr68532.c
@@ -0,0 +1,22 @@
+/* { dg-options "-O2 -ftree-vectorize -fno-vect-cost-model" } */
+
+#define SIZE 128
+unsigned short _Alignas (16) in[SIZE];
+
+__attribute__ ((noinline)) int
+test (unsigned short sum, unsigned short *in, int x)
+{
+ for (int j = 0; j < SIZE; j += 8)
+ sum += in[j] * x;
+ return sum;
+}
+
+int
+main ()
+{
+ for (int i = 0; i < SIZE; i++)
+ in[i] = i;
+ if (test (0, in, 1) != 960)
+ __builtin_abort ();
+ return 0;
+}
--
1.9.1
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 2/2] [ARM] PR68532 Fix up vzip recognition for big endian
2016-02-08 11:42 ` Kyrill Tkachov
@ 2016-02-09 17:07 ` Charles Baylis
2016-02-09 18:53 ` Charles Baylis
0 siblings, 1 reply; 10+ messages in thread
From: Charles Baylis @ 2016-02-09 17:07 UTC (permalink / raw)
To: Kyrill Tkachov
Cc: Ramana Radhakrishnan, Richard Earnshaw, Richard Earnshaw,
GCC Patches, Michael Collison
[-- Attachment #1: Type: text/plain, Size: 2504 bytes --]
On 8 February 2016 at 11:42, Kyrill Tkachov <kyrylo.tkachov@foss.arm.com> wrote:
> On 03/02/16 18:59, charles.baylis@linaro.org wrote:
>> --- a/gcc/config/arm/arm.c
>> +++ b/gcc/config/arm/arm.c
>> @@ -28318,15 +28318,21 @@ arm_evpc_neon_vzip (struct expand_vec_perm_d *d)
>> unsigned int i, high, mask, nelt = d->nelt;
>> rtx out0, out1, in0, in1;
>> rtx (*gen)(rtx, rtx, rtx, rtx);
>> + int first_elem;
>> + bool is_swapped;
>> if (GET_MODE_UNIT_SIZE (d->vmode) >= 8)
>> return false;
>> + is_swapped = BYTES_BIG_ENDIAN ? true : false;
>
>
> This is just "is_swapped = BYTES_BIG_ENDIAN;"
Done.
>> +
>> /* Note that these are little-endian tests. Adjust for big-endian
>> later. */
>
>
> I think you can remove this comment now, like in patch 1/2
Done.
>> + first_elem = d->perm[neon_endian_lane_map (d->vmode, 0) ^ is_swapped];
>> +
>> high = nelt / 2;
>> - if (d->perm[0] == high)
>> + if (first_elem == neon_endian_lane_map (d->vmode, high))
>> ;
>> - else if (d->perm[0] == 0)
>> + else if (first_elem == neon_endian_lane_map (d->vmode, 0))
>> high = 0;
>> else
>> return false;
>> @@ -28334,11 +28340,16 @@ arm_evpc_neon_vzip (struct expand_vec_perm_d *d)
>> for (i = 0; i < nelt / 2; i++)
>> {
>> - unsigned elt = (i + high) & mask;
>> - if (d->perm[i * 2] != elt)
>> + unsigned elt =
>> + neon_pair_endian_lane_map (d->vmode, i + high) & mask;
>> + if (d->perm[neon_pair_endian_lane_map (d->vmode, 2 * i +
>> is_swapped)]
>> + != elt)
>> return false;
>> - elt = (elt + nelt) & mask;
>> - if (d->perm[i * 2 + 1] != elt)
>> + elt =
>> + neon_pair_endian_lane_map (d->vmode, i + nelt + high)
>> + & mask;
>
>
> The "& mask" can go on the previous line.
Done
>> + if (d->perm[neon_pair_endian_lane_map (d->vmode, 2 * i +
>> !is_swapped)]
>> + != elt)
>> return false;
>> }
>> @@ -28362,10 +28373,9 @@ arm_evpc_neon_vzip (struct expand_vec_perm_d
>> *d)
>> in0 = d->op0;
>> in1 = d->op1;
>> - if (BYTES_BIG_ENDIAN)
>> + if (is_swapped)
>> {
>> std::swap (in0, in1);
>> - high = !high;
>> }
>
>
> remove the braces around the std::swap.
Done.
> Ok with these changes.
> I've tried out both patch and they do fix execution failures on big-endian
> and don't break any NEON intrinsics tests that I threw at them.
Attached for completeness, will commit once the VUZP patch is OKd.
[-- Attachment #2: 0002-ARM-PR68532-Fix-up-vzip-recognition-for-big-endian.patch --]
[-- Type: text/x-diff, Size: 7847 bytes --]
From 469f82610a4e70284bf23c373b8a73685cad0ec1 Mon Sep 17 00:00:00 2001
From: Charles Baylis <charles.baylis@linaro.org>
Date: Tue, 9 Feb 2016 15:18:44 +0000
Subject: [PATCH 2/2] [ARM] PR68532 Fix up vzip recognition for big endian
gcc/ChangeLog:
2016-02-09 Charles Baylis <charles.baylis@linaro.org>
PR target/68532
* config/arm/arm.c (arm_evpc_neon_vzip): Allow for big endian lane
order.
* config/arm/arm_neon.h (vzipq_s8): Adjust shuffle patterns for big
endian.
(vzipq_s16): Likewise.
(vzipq_s32): Likewise.
(vzipq_f32): Likewise.
(vzipq_u8): Likewise.
(vzipq_u16): Likewise.
(vzipq_u32): Likewise.
(vzipq_p8): Likewise.
(vzipq_p16): Likewise.
Change-Id: I327678f5e73c1de2f413c1d22769ab42ce1d6c16
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 95ee9a5..5562baa 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -28318,15 +28318,20 @@ arm_evpc_neon_vzip (struct expand_vec_perm_d *d)
unsigned int i, high, mask, nelt = d->nelt;
rtx out0, out1, in0, in1;
rtx (*gen)(rtx, rtx, rtx, rtx);
+ int first_elem;
+ bool is_swapped;
if (GET_MODE_UNIT_SIZE (d->vmode) >= 8)
return false;
- /* Note that these are little-endian tests. Adjust for big-endian later. */
+ is_swapped = BYTES_BIG_ENDIAN;
+
+ first_elem = d->perm[neon_endian_lane_map (d->vmode, 0) ^ is_swapped];
+
high = nelt / 2;
- if (d->perm[0] == high)
+ if (first_elem == neon_endian_lane_map (d->vmode, high))
;
- else if (d->perm[0] == 0)
+ else if (first_elem == neon_endian_lane_map (d->vmode, 0))
high = 0;
else
return false;
@@ -28334,11 +28339,15 @@ arm_evpc_neon_vzip (struct expand_vec_perm_d *d)
for (i = 0; i < nelt / 2; i++)
{
- unsigned elt = (i + high) & mask;
- if (d->perm[i * 2] != elt)
+ unsigned elt =
+ neon_pair_endian_lane_map (d->vmode, i + high) & mask;
+ if (d->perm[neon_pair_endian_lane_map (d->vmode, 2 * i + is_swapped)]
+ != elt)
return false;
- elt = (elt + nelt) & mask;
- if (d->perm[i * 2 + 1] != elt)
+ elt =
+ neon_pair_endian_lane_map (d->vmode, i + nelt + high) & mask;
+ if (d->perm[neon_pair_endian_lane_map (d->vmode, 2 * i + !is_swapped)]
+ != elt)
return false;
}
@@ -28362,11 +28371,8 @@ arm_evpc_neon_vzip (struct expand_vec_perm_d *d)
in0 = d->op0;
in1 = d->op1;
- if (BYTES_BIG_ENDIAN)
- {
- std::swap (in0, in1);
- high = !high;
- }
+ if (is_swapped)
+ std::swap (in0, in1);
out0 = d->target;
out1 = gen_reg_rtx (d->vmode);
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 2e014b6..aa17f49 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -8453,9 +8453,9 @@ vzipq_s8 (int8x16_t __a, int8x16_t __b)
int8x16x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
__rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
- { 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 });
+ { 20, 4, 21, 5, 22, 6, 23, 7, 16, 0, 17, 1, 18, 2, 19, 3 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint8x16_t)
- { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
+ { 28, 12, 29, 13, 30, 14, 31, 15, 24, 8, 25, 9, 26, 10, 27, 11 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
{ 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
@@ -8471,9 +8471,9 @@ vzipq_s16 (int16x8_t __a, int16x8_t __b)
int16x8x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
__rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
- { 12, 4, 13, 5, 14, 6, 15, 7 });
+ { 10, 2, 11, 3, 8, 0, 9, 1 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t)
- { 8, 0, 9, 1, 10, 2, 11, 3 });
+ { 14, 6, 15, 7, 12, 4, 13, 5 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
{ 0, 8, 1, 9, 2, 10, 3, 11 });
@@ -8488,8 +8488,8 @@ vzipq_s32 (int32x4_t __a, int32x4_t __b)
{
int32x4x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
- __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 6, 2, 7, 3 });
- __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 4, 0, 5, 1 });
+ __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 5, 1, 4, 0 });
+ __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 7, 3, 6, 2 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 4, 1, 5 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 6, 3, 7 });
@@ -8502,8 +8502,8 @@ vzipq_f32 (float32x4_t __a, float32x4_t __b)
{
float32x4x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
- __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 6, 2, 7, 3 });
- __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 4, 0, 5, 1 });
+ __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 5, 1, 4, 0 });
+ __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 7, 3, 6, 2 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 4, 1, 5 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 6, 3, 7 });
@@ -8517,9 +8517,9 @@ vzipq_u8 (uint8x16_t __a, uint8x16_t __b)
uint8x16x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
__rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
- { 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 });
+ { 20, 4, 21, 5, 22, 6, 23, 7, 16, 0, 17, 1, 18, 2, 19, 3 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint8x16_t)
- { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
+ { 28, 12, 29, 13, 30, 14, 31, 15, 24, 8, 25, 9, 26, 10, 27, 11 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
{ 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
@@ -8535,9 +8535,9 @@ vzipq_u16 (uint16x8_t __a, uint16x8_t __b)
uint16x8x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
__rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
- { 12, 4, 13, 5, 14, 6, 15, 7 });
+ { 10, 2, 11, 3, 8, 0, 9, 1 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t)
- { 8, 0, 9, 1, 10, 2, 11, 3 });
+ { 14, 6, 15, 7, 12, 4, 13, 5 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
{ 0, 8, 1, 9, 2, 10, 3, 11 });
@@ -8552,8 +8552,8 @@ vzipq_u32 (uint32x4_t __a, uint32x4_t __b)
{
uint32x4x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
- __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 6, 2, 7, 3 });
- __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 4, 0, 5, 1 });
+ __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 5, 1, 4, 0 });
+ __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 7, 3, 6, 2 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 4, 1, 5 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 6, 3, 7 });
@@ -8567,9 +8567,9 @@ vzipq_p8 (poly8x16_t __a, poly8x16_t __b)
poly8x16x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
__rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
- { 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 });
+ { 20, 4, 21, 5, 22, 6, 23, 7, 16, 0, 17, 1, 18, 2, 19, 3 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint8x16_t)
- { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
+ { 28, 12, 29, 13, 30, 14, 31, 15, 24, 8, 25, 9, 26, 10, 27, 11 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
{ 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
@@ -8585,9 +8585,9 @@ vzipq_p16 (poly16x8_t __a, poly16x8_t __b)
poly16x8x2_t __rv;
#ifdef __ARM_BIG_ENDIAN
__rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
- { 12, 4, 13, 5, 14, 6, 15, 7 });
+ { 10, 2, 11, 3, 8, 0, 9, 1 });
__rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t)
- { 8, 0, 9, 1, 10, 2, 11, 3 });
+ { 14, 6, 15, 7, 12, 4, 13, 5 });
#else
__rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
{ 0, 8, 1, 9, 2, 10, 3, 11 });
--
1.9.1
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 1/2] [ARM] PR68532: Fix up vuzp for big endian
2016-02-09 17:01 ` Charles Baylis
@ 2016-02-09 17:08 ` Kyrill Tkachov
2016-02-09 18:54 ` Charles Baylis
0 siblings, 1 reply; 10+ messages in thread
From: Kyrill Tkachov @ 2016-02-09 17:08 UTC (permalink / raw)
To: Charles Baylis
Cc: Ramana Radhakrishnan, Richard Earnshaw, Richard Earnshaw,
GCC Patches, Michael Collison
On 09/02/16 17:00, Charles Baylis wrote:
> On 8 February 2016 at 11:42, Kyrill Tkachov <kyrylo.tkachov@foss.arm.com> wrote:
>> Hi Charles,
>>
>>
>> On 03/02/16 18:59, charles.baylis@linaro.org wrote:
>>> --- a/gcc/config/arm/arm.c
>>> +++ b/gcc/config/arm/arm.c
>>> @@ -28208,6 +28208,35 @@ arm_expand_vec_perm (rtx target, rtx op0, rtx
>>> op1, rtx sel)
>>> arm_expand_vec_perm_1 (target, op0, op1, sel);
>>> }
>>> +/* map lane ordering between architectural lane order, and GCC lane
>>> order,
>>> + taking into account ABI. See comment above output_move_neon for
>>> details. */
>>> +static int
>>> +neon_endian_lane_map (machine_mode mode, int lane)
>>
>> s/map/Map/
>> New line between comment and function signature.
> Done.
>
>>> +{
>>> + if (BYTES_BIG_ENDIAN)
>>> + {
>>> + int nelems = GET_MODE_NUNITS (mode);
>>> + /* Reverse lane order. */
>>> + lane = (nelems - 1 - lane);
>>> + /* Reverse D register order, to match ABI. */
>>> + if (GET_MODE_SIZE (mode) == 16)
>>> + lane = lane ^ (nelems / 2);
>>> + }
>>> + return lane;
>>> +}
>>> +
>>> +/* some permutations index into pairs of vectors, this is a helper
>>> function
>>> + to map indexes into those pairs of vectors. */
>>> +static int
>>> +neon_pair_endian_lane_map (machine_mode mode, int lane)
>>
>> Similarly, s/some/Some/ and new line after comment.
> Done.
>
>>> +{
>>> + int nelem = GET_MODE_NUNITS (mode);
>>> + if (BYTES_BIG_ENDIAN)
>>> + lane =
>>> + neon_endian_lane_map (mode, lane & (nelem - 1)) + (lane & nelem);
>>> + return lane;
>>> +}
>>> +
>>> /* Generate or test for an insn that supports a constant permutation.
>>> */
>>> /* Recognize patterns for the VUZP insns. */
>>> @@ -28218,14 +28247,22 @@ arm_evpc_neon_vuzp (struct expand_vec_perm_d *d)
>>> unsigned int i, odd, mask, nelt = d->nelt;
>>> rtx out0, out1, in0, in1;
>>> rtx (*gen)(rtx, rtx, rtx, rtx);
>>> + int first_elem;
>>> + int swap;
>>>
>> Just make this a bool.
> As discussed on IRC, this variable does contain an integer. I have
> renamed it as swap_nelt, and changed the test on it below.
This is ok.
Thanks,
Kyrill
> [snip]
>
>>> @@ -28258,10 +28296,9 @@ arm_evpc_neon_vuzp (struct expand_vec_perm_d
>>> *d)
>>> in0 = d->op0;
>>> in1 = d->op1;
>>> - if (BYTES_BIG_ENDIAN)
>>> + if (swap)
>>> {
>>> std::swap (in0, in1);
>>> - odd = !odd;
>>> }
>> remove the braces around the std::swap
> Done. Also changed if (swap) to if (swap_nelt != 0)
>
> [snip]
>
>>> @@ -0,0 +1,24 @@
>>> +/* { dg-options "-O2 -ftree-vectorize -fno-vect-cost-model" } */
>>> +
>>> +#define SIZE 128
>>> +unsigned short _Alignas (16) in[SIZE];
>>> +
>>> +extern void abort (void);
>>> +
>>> +__attribute__ ((noinline)) int
>>> +test (unsigned short sum, unsigned short *in, int x)
>>> +{
>>> + for (int j = 0; j < SIZE; j += 8)
>>> + sum += in[j] * x;
>>> + return sum;
>>> +}
>>> +
>>> +int
>>> +main ()
>>> +{
>>> + for (int i = 0; i < SIZE; i++)
>>> + in[i] = i;
>>> + if (test (0, in, 1) != 960)
>>> + abort ();
>>
>> AFAIK tests here usually prefer __builtin_abort ();
>> That way you don't have to declare the abort prototype in the beginning.
> Done.
>
> Updated patch attached
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 2/2] [ARM] PR68532 Fix up vzip recognition for big endian
2016-02-09 17:07 ` Charles Baylis
@ 2016-02-09 18:53 ` Charles Baylis
0 siblings, 0 replies; 10+ messages in thread
From: Charles Baylis @ 2016-02-09 18:53 UTC (permalink / raw)
To: Kyrill Tkachov
Cc: Ramana Radhakrishnan, Richard Earnshaw, Richard Earnshaw,
GCC Patches, Michael Collison
Committed to trunk as r233252
On 9 February 2016 at 17:07, Charles Baylis <charles.baylis@linaro.org> wrote:
> On 8 February 2016 at 11:42, Kyrill Tkachov <kyrylo.tkachov@foss.arm.com> wrote:
>
>> On 03/02/16 18:59, charles.baylis@linaro.org wrote:
>>> --- a/gcc/config/arm/arm.c
>>> +++ b/gcc/config/arm/arm.c
>>> @@ -28318,15 +28318,21 @@ arm_evpc_neon_vzip (struct expand_vec_perm_d *d)
>>> unsigned int i, high, mask, nelt = d->nelt;
>>> rtx out0, out1, in0, in1;
>>> rtx (*gen)(rtx, rtx, rtx, rtx);
>>> + int first_elem;
>>> + bool is_swapped;
>>> if (GET_MODE_UNIT_SIZE (d->vmode) >= 8)
>>> return false;
>>> + is_swapped = BYTES_BIG_ENDIAN ? true : false;
>>
>>
>> This is just "is_swapped = BYTES_BIG_ENDIAN;"
>
> Done.
>
>>> +
>>> /* Note that these are little-endian tests. Adjust for big-endian
>>> later. */
>>
>>
>> I think you can remove this comment now, like in patch 1/2
>
> Done.
>
>>> + first_elem = d->perm[neon_endian_lane_map (d->vmode, 0) ^ is_swapped];
>>> +
>>> high = nelt / 2;
>>> - if (d->perm[0] == high)
>>> + if (first_elem == neon_endian_lane_map (d->vmode, high))
>>> ;
>>> - else if (d->perm[0] == 0)
>>> + else if (first_elem == neon_endian_lane_map (d->vmode, 0))
>>> high = 0;
>>> else
>>> return false;
>>> @@ -28334,11 +28340,16 @@ arm_evpc_neon_vzip (struct expand_vec_perm_d *d)
>>> for (i = 0; i < nelt / 2; i++)
>>> {
>>> - unsigned elt = (i + high) & mask;
>>> - if (d->perm[i * 2] != elt)
>>> + unsigned elt =
>>> + neon_pair_endian_lane_map (d->vmode, i + high) & mask;
>>> + if (d->perm[neon_pair_endian_lane_map (d->vmode, 2 * i +
>>> is_swapped)]
>>> + != elt)
>>> return false;
>>> - elt = (elt + nelt) & mask;
>>> - if (d->perm[i * 2 + 1] != elt)
>>> + elt =
>>> + neon_pair_endian_lane_map (d->vmode, i + nelt + high)
>>> + & mask;
>>
>>
>> The "& mask" can go on the previous line.
>
> Done
>
>>> + if (d->perm[neon_pair_endian_lane_map (d->vmode, 2 * i +
>>> !is_swapped)]
>>> + != elt)
>>> return false;
>>> }
>>> @@ -28362,10 +28373,9 @@ arm_evpc_neon_vzip (struct expand_vec_perm_d
>>> *d)
>>> in0 = d->op0;
>>> in1 = d->op1;
>>> - if (BYTES_BIG_ENDIAN)
>>> + if (is_swapped)
>>> {
>>> std::swap (in0, in1);
>>> - high = !high;
>>> }
>>
>>
>> remove the braces around the std::swap.
>
> Done.
>
>> Ok with these changes.
>> I've tried out both patch and they do fix execution failures on big-endian
>> and don't break any NEON intrinsics tests that I threw at them.
>
> Attached for completeness, will commit once the VUZP patch is OKd.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 1/2] [ARM] PR68532: Fix up vuzp for big endian
2016-02-09 17:08 ` Kyrill Tkachov
@ 2016-02-09 18:54 ` Charles Baylis
0 siblings, 0 replies; 10+ messages in thread
From: Charles Baylis @ 2016-02-09 18:54 UTC (permalink / raw)
To: Kyrill Tkachov
Cc: Ramana Radhakrishnan, Richard Earnshaw, Richard Earnshaw,
GCC Patches, Michael Collison
On 9 February 2016 at 17:08, Kyrill Tkachov <kyrylo.tkachov@foss.arm.com> wrote:
>
> On 09/02/16 17:00, Charles Baylis wrote:
>>
>> On 8 February 2016 at 11:42, Kyrill Tkachov <kyrylo.tkachov@foss.arm.com>
>> wrote:
>>>
>>> Hi Charles,
>>>
>>>
>>> On 03/02/16 18:59, charles.baylis@linaro.org wrote:
>>>>
>>>> --- a/gcc/config/arm/arm.c
>>>> +++ b/gcc/config/arm/arm.c
>>>> @@ -28208,6 +28208,35 @@ arm_expand_vec_perm (rtx target, rtx op0, rtx
>>>> op1, rtx sel)
>>>> arm_expand_vec_perm_1 (target, op0, op1, sel);
>>>> }
>>>> +/* map lane ordering between architectural lane order, and GCC lane
>>>> order,
>>>> + taking into account ABI. See comment above output_move_neon for
>>>> details. */
>>>> +static int
>>>> +neon_endian_lane_map (machine_mode mode, int lane)
>>>
>>>
>>> s/map/Map/
>>> New line between comment and function signature.
>>
>> Done.
>>
>>>> +{
>>>> + if (BYTES_BIG_ENDIAN)
>>>> + {
>>>> + int nelems = GET_MODE_NUNITS (mode);
>>>> + /* Reverse lane order. */
>>>> + lane = (nelems - 1 - lane);
>>>> + /* Reverse D register order, to match ABI. */
>>>> + if (GET_MODE_SIZE (mode) == 16)
>>>> + lane = lane ^ (nelems / 2);
>>>> + }
>>>> + return lane;
>>>> +}
>>>> +
>>>> +/* some permutations index into pairs of vectors, this is a helper
>>>> function
>>>> + to map indexes into those pairs of vectors. */
>>>> +static int
>>>> +neon_pair_endian_lane_map (machine_mode mode, int lane)
>>>
>>>
>>> Similarly, s/some/Some/ and new line after comment.
>>
>> Done.
>>
>>>> +{
>>>> + int nelem = GET_MODE_NUNITS (mode);
>>>> + if (BYTES_BIG_ENDIAN)
>>>> + lane =
>>>> + neon_endian_lane_map (mode, lane & (nelem - 1)) + (lane & nelem);
>>>> + return lane;
>>>> +}
>>>> +
>>>> /* Generate or test for an insn that supports a constant permutation.
>>>> */
>>>> /* Recognize patterns for the VUZP insns. */
>>>> @@ -28218,14 +28247,22 @@ arm_evpc_neon_vuzp (struct expand_vec_perm_d
>>>> *d)
>>>> unsigned int i, odd, mask, nelt = d->nelt;
>>>> rtx out0, out1, in0, in1;
>>>> rtx (*gen)(rtx, rtx, rtx, rtx);
>>>> + int first_elem;
>>>> + int swap;
>>>>
>>> Just make this a bool.
>>
>> As discussed on IRC, this variable does contain an integer. I have
>> renamed it as swap_nelt, and changed the test on it below.
>
>
> This is ok.
Thanks. Committed to trunk as r233251
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2016-02-09 18:54 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-02-03 18:59 [ARM, PATCH v2 0/2] PR68532: Fix VZIP/VUZP recognition for big endian charles.baylis
2016-02-03 18:59 ` [PATCH 1/2] [ARM] PR68532: Fix up vuzp " charles.baylis
2016-02-08 11:42 ` Kyrill Tkachov
2016-02-09 17:01 ` Charles Baylis
2016-02-09 17:08 ` Kyrill Tkachov
2016-02-09 18:54 ` Charles Baylis
2016-02-03 18:59 ` [PATCH 2/2] [ARM] PR68532 Fix up vzip recognition " charles.baylis
2016-02-08 11:42 ` Kyrill Tkachov
2016-02-09 17:07 ` Charles Baylis
2016-02-09 18:53 ` Charles Baylis
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).