* [PATCH 1/8] aarch64: Use memcpy to copy vector tables in vqtbl[234] intrinsics
@ 2021-07-23 8:21 Jonathan Wright
2021-07-23 9:15 ` Kyrylo Tkachov
2021-08-03 9:42 ` Christophe Lyon
0 siblings, 2 replies; 7+ messages in thread
From: Jonathan Wright @ 2021-07-23 8:21 UTC (permalink / raw)
To: gcc-patches; +Cc: Kyrylo Tkachov, Richard Sandiford
[-- Attachment #1: Type: text/plain, Size: 1542 bytes --]
Hi,
This patch uses __builtin_memcpy to copy vector structures instead of
building a new opaque structure one vector at a time in each of the
vqtbl[234] Neon intrinsics in arm_neon.h. This simplifies the header file
and also improves code generation - superfluous move instructions
were emitted for every register extraction/set in this additional
structure.
Add new code generation tests to verify that superfluous move
instructions are no longer generated for the vqtbl[234] intrinsics.
Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.
Ok for master?
Thanks,
Jonathan
---
gcc/ChangeLog:
2021-07-08 Jonathan Wright <jonathan.wright@arm.com>
* config/aarch64/arm_neon.h (vqtbl2_s8): Use __builtin_memcpy
instead of constructing __builtin_aarch64_simd_oi one vector
at a time.
(vqtbl2_u8): Likewise.
(vqtbl2_p8): Likewise.
(vqtbl2q_s8): Likewise.
(vqtbl2q_u8): Likewise.
(vqtbl2q_p8): Likewise.
(vqtbl3_s8): Use __builtin_memcpy instead of constructing
__builtin_aarch64_simd_ci one vector at a time.
(vqtbl3_u8): Likewise.
(vqtbl3_p8): Likewise.
(vqtbl3q_s8): Likewise.
(vqtbl3q_u8): Likewise.
(vqtbl3q_p8): Likewise.
(vqtbl4_s8): Use __builtin_memcpy instead of constructing
__builtin_aarch64_simd_xi one vector at a time.
(vqtbl4_u8): Likewise.
(vqtbl4_p8): Likewise.
(vqtbl4q_s8): Likewise.
(vqtbl4q_u8): Likewise.
(vqtbl4q_p8): Likewise.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/vector_structure_intrinsics.c: New test.
[-- Attachment #2: rb14639.patch --]
[-- Type: application/octet-stream, Size: 11961 bytes --]
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 1048d7c7eaac14554142eaa7544159a50929b7f1..31ae86e6d25239359045d649bf8d00d8c0fa9212 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -23321,8 +23321,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqtbl2_s8 (int8x16x2_t __tab, uint8x8_t __idx)
{
__builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[1], 1);
+ __builtin_memcpy (&__o, &__tab, sizeof (__tab));
return __builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx);
}
@@ -23331,8 +23330,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqtbl2_u8 (uint8x16x2_t __tab, uint8x8_t __idx)
{
__builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1);
+ __builtin_memcpy (&__o, &__tab, sizeof (__tab));
return (uint8x8_t)__builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx);
}
@@ -23341,8 +23339,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqtbl2_p8 (poly8x16x2_t __tab, uint8x8_t __idx)
{
__builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1);
+ __builtin_memcpy (&__o, &__tab, sizeof (__tab));
return (poly8x8_t)__builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx);
}
@@ -23351,8 +23348,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqtbl2q_s8 (int8x16x2_t __tab, uint8x16_t __idx)
{
__builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1);
+ __builtin_memcpy (&__o, &__tab, sizeof (__tab));
return __builtin_aarch64_qtbl2v16qi (__o, (int8x16_t)__idx);
}
@@ -23361,8 +23357,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqtbl2q_u8 (uint8x16x2_t __tab, uint8x16_t __idx)
{
__builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1);
+ __builtin_memcpy (&__o, &__tab, sizeof (__tab));
return (uint8x16_t)__builtin_aarch64_qtbl2v16qi (__o, (int8x16_t)__idx);
}
@@ -23371,8 +23366,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqtbl2q_p8 (poly8x16x2_t __tab, uint8x16_t __idx)
{
__builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1);
+ __builtin_memcpy (&__o, &__tab, sizeof (__tab));
return (poly8x16_t)__builtin_aarch64_qtbl2v16qi (__o, (int8x16_t)__idx);
}
@@ -23383,9 +23377,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqtbl3_s8 (int8x16x3_t __tab, uint8x8_t __idx)
{
__builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0);
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1);
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2);
+ __builtin_memcpy (&__o, &__tab, sizeof (__tab));
return __builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)__idx);
}
@@ -23394,9 +23386,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqtbl3_u8 (uint8x16x3_t __tab, uint8x8_t __idx)
{
__builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0);
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1);
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2);
+ __builtin_memcpy (&__o, &__tab, sizeof (__tab));
return (uint8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)__idx);
}
@@ -23405,9 +23395,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqtbl3_p8 (poly8x16x3_t __tab, uint8x8_t __idx)
{
__builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0);
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1);
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2);
+ __builtin_memcpy (&__o, &__tab, sizeof (__tab));
return (poly8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)__idx);
}
@@ -23416,9 +23404,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqtbl3q_s8 (int8x16x3_t __tab, uint8x16_t __idx)
{
__builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0);
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1);
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2);
+ __builtin_memcpy (&__o, &__tab, sizeof (__tab));
return __builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)__idx);
}
@@ -23427,9 +23413,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqtbl3q_u8 (uint8x16x3_t __tab, uint8x16_t __idx)
{
__builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0);
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1);
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2);
+ __builtin_memcpy (&__o, &__tab, sizeof (__tab));
return (uint8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)__idx);
}
@@ -23438,9 +23422,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqtbl3q_p8 (poly8x16x3_t __tab, uint8x16_t __idx)
{
__builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0);
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1);
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2);
+ __builtin_memcpy (&__o, &__tab, sizeof (__tab));
return (poly8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)__idx);
}
@@ -23451,10 +23433,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqtbl4_s8 (int8x16x4_t __tab, uint8x8_t __idx)
{
__builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3);
+ __builtin_memcpy (&__o, &__tab, sizeof (__tab));
return __builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)__idx);
}
@@ -23463,10 +23442,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqtbl4_u8 (uint8x16x4_t __tab, uint8x8_t __idx)
{
__builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3);
+ __builtin_memcpy (&__o, &__tab, sizeof (__tab));
return (uint8x8_t)__builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)__idx);
}
@@ -23475,10 +23451,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqtbl4_p8 (poly8x16x4_t __tab, uint8x8_t __idx)
{
__builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3);
+ __builtin_memcpy (&__o, &__tab, sizeof (__tab));
return (poly8x8_t)__builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)__idx);
}
@@ -23487,10 +23460,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqtbl4q_s8 (int8x16x4_t __tab, uint8x16_t __idx)
{
__builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3);
+ __builtin_memcpy (&__o, &__tab, sizeof (__tab));
return __builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)__idx);
}
@@ -23499,10 +23469,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqtbl4q_u8 (uint8x16x4_t __tab, uint8x16_t __idx)
{
__builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3);
+ __builtin_memcpy (&__o, &__tab, sizeof (__tab));
return (uint8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)__idx);
}
@@ -23511,10 +23478,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqtbl4q_p8 (poly8x16x4_t __tab, uint8x16_t __idx)
{
__builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3);
+ __builtin_memcpy (&__o, &__tab, sizeof (__tab));
return (poly8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)__idx);
}
diff --git a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
new file mode 100644
index 0000000000000000000000000000000000000000..2fab0f2947b7fa28e4e3a77bd365dcfdf30a9b28
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
@@ -0,0 +1,45 @@
+/* { dg-skip-if "" { arm*-*-* } } */
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+#define TEST_TBL(name, rettype, tbltype, idxtype, ts) \
+ rettype test_ ## name ## _ ## ts (tbltype a, idxtype b) \
+ { \
+ return name ## _ ## ts (a, b); \
+ }
+
+TEST_TBL (vqtbl2, int8x8_t, int8x16x2_t, uint8x8_t, s8)
+TEST_TBL (vqtbl2, uint8x8_t, uint8x16x2_t, uint8x8_t, u8)
+TEST_TBL (vqtbl2, poly8x8_t, poly8x16x2_t, uint8x8_t, p8)
+
+TEST_TBL (vqtbl2q, int8x16_t, int8x16x2_t, uint8x16_t, s8)
+TEST_TBL (vqtbl2q, uint8x16_t, uint8x16x2_t, uint8x16_t, u8)
+TEST_TBL (vqtbl2q, poly8x16_t, poly8x16x2_t, uint8x16_t, p8)
+
+TEST_TBL (vqtbl4, int8x8_t, int8x16x4_t, uint8x8_t, s8)
+TEST_TBL (vqtbl4, uint8x8_t, uint8x16x4_t, uint8x8_t, u8)
+TEST_TBL (vqtbl4, poly8x8_t, poly8x16x4_t, uint8x8_t, p8)
+
+TEST_TBL (vqtbl4q, int8x16_t, int8x16x4_t, uint8x16_t, s8)
+TEST_TBL (vqtbl4q, uint8x16_t, uint8x16x4_t, uint8x16_t, u8)
+TEST_TBL (vqtbl4q, poly8x16_t, poly8x16x4_t, uint8x16_t, p8)
+
+#define TEST_TBL3(name, rettype, tbltype, idxtype, ts) \
+ rettype test_ ## name ## _ ## ts (idxtype a, tbltype b) \
+ { \
+ return name ## _ ## ts (b, a); \
+ }
+
+TEST_TBL3 (vqtbl3, int8x8_t, int8x16x3_t, uint8x8_t, s8)
+TEST_TBL3 (vqtbl3, uint8x8_t, uint8x16x3_t, uint8x8_t, u8)
+TEST_TBL3 (vqtbl3, poly8x8_t, poly8x16x3_t, uint8x8_t, p8)
+
+TEST_TBL3 (vqtbl3q, int8x16_t, int8x16x3_t, uint8x16_t, s8)
+TEST_TBL3 (vqtbl3q, uint8x16_t, uint8x16x3_t, uint8x16_t, u8)
+TEST_TBL3 (vqtbl3q, poly8x16_t, poly8x16x3_t, uint8x16_t, p8)
+
+/* { dg-final { scan-assembler-not "mov\\t" } } */
+
+/* { dg-final { scan-assembler-times "tbl\\t" 18} } */
^ permalink raw reply [flat|nested] 7+ messages in thread
* RE: [PATCH 1/8] aarch64: Use memcpy to copy vector tables in vqtbl[234] intrinsics
2021-07-23 8:21 [PATCH 1/8] aarch64: Use memcpy to copy vector tables in vqtbl[234] intrinsics Jonathan Wright
@ 2021-07-23 9:15 ` Kyrylo Tkachov
2021-08-03 9:42 ` Christophe Lyon
1 sibling, 0 replies; 7+ messages in thread
From: Kyrylo Tkachov @ 2021-07-23 9:15 UTC (permalink / raw)
To: Jonathan Wright, gcc-patches; +Cc: Richard Sandiford
Hi Jonathan,
> -----Original Message-----
> From: Jonathan Wright <Jonathan.Wright@arm.com>
> Sent: 23 July 2021 09:22
> To: gcc-patches@gcc.gnu.org
> Cc: Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>; Richard Sandiford
> <Richard.Sandiford@arm.com>
> Subject: [PATCH 1/8] aarch64: Use memcpy to copy vector tables in
> vqtbl[234] intrinsics
>
> Hi,
>
> This patch uses __builtin_memcpy to copy vector structures instead of
> building a new opaque structure one vector at a time in each of the
> vqtbl[234] Neon intrinsics in arm_neon.h. This simplifies the header file
> and also improves code generation - superfluous move instructions
> were emitted for every register extraction/set in this additional
> structure.
>
> Add new code generation tests to verify that superfluous move
> instructions are no longer generated for the vqtbl[234] intrinsics.
>
> Regression tested and bootstrapped on aarch64-none-linux-gnu - no
> issues.
>
> Ok for master?
>
In the testcase:
diff --git a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
new file mode 100644
index 0000000000000000000000000000000000000000..2fab0f2947b7fa28e4e3a77bd365dcfdf30a9b28
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
@@ -0,0 +1,45 @@
+/* { dg-skip-if "" { arm*-*-* } } */
Files in gcc.target/aarch64 won't be attempted on arm* targets so the skip-if isn't needed (that's only for tests in gcc.target/aarch64/advsimd-intrinsics/).
Ok with that directive removed, thanks for doing this!
Kyrill
> Thanks,
> Jonathan
>
> ---
>
> gcc/ChangeLog:
>
> 2021-07-08 Jonathan Wright <jonathan.wright@arm.com>
>
> * config/aarch64/arm_neon.h (vqtbl2_s8): Use __builtin_memcpy
> instead of constructing __builtin_aarch64_simd_oi one vector
> at a time.
> (vqtbl2_u8): Likewise.
> (vqtbl2_p8): Likewise.
> (vqtbl2q_s8): Likewise.
> (vqtbl2q_u8): Likewise.
> (vqtbl2q_p8): Likewise.
> (vqtbl3_s8): Use __builtin_memcpy instead of constructing
> __builtin_aarch64_simd_ci one vector at a time.
> (vqtbl3_u8): Likewise.
> (vqtbl3_p8): Likewise.
> (vqtbl3q_s8): Likewise.
> (vqtbl3q_u8): Likewise.
> (vqtbl3q_p8): Likewise.
> (vqtbl4_s8): Use __builtin_memcpy instead of constructing
> __builtin_aarch64_simd_xi one vector at a time.
> (vqtbl4_u8): Likewise.
> (vqtbl4_p8): Likewise.
> (vqtbl4q_s8): Likewise.
> (vqtbl4q_u8): Likewise.
> (vqtbl4q_p8): Likewise.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/aarch64/vector_structure_intrinsics.c: New test.
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH 1/8] aarch64: Use memcpy to copy vector tables in vqtbl[234] intrinsics
2021-07-23 8:21 [PATCH 1/8] aarch64: Use memcpy to copy vector tables in vqtbl[234] intrinsics Jonathan Wright
2021-07-23 9:15 ` Kyrylo Tkachov
@ 2021-08-03 9:42 ` Christophe Lyon
2021-08-04 10:05 ` [PATCH] testsuite: aarch64: Fix failing vector structure tests on big-endian Jonathan Wright
1 sibling, 1 reply; 7+ messages in thread
From: Christophe Lyon @ 2021-08-03 9:42 UTC (permalink / raw)
To: Jonathan Wright; +Cc: gcc-patches, Richard Sandiford
On Fri, Jul 23, 2021 at 10:22 AM Jonathan Wright via Gcc-patches <
gcc-patches@gcc.gnu.org> wrote:
> Hi,
>
> This patch uses __builtin_memcpy to copy vector structures instead of
> building a new opaque structure one vector at a time in each of the
> vqtbl[234] Neon intrinsics in arm_neon.h. This simplifies the header file
> and also improves code generation - superfluous move instructions
> were emitted for every register extraction/set in this additional
> structure.
>
> Add new code generation tests to verify that superfluous move
> instructions are no longer generated for the vqtbl[234] intrinsics.
>
> Regression tested and bootstrapped on aarch64-none-linux-gnu - no
> issues.
>
> Ok for master?
>
> Thanks,
> Jonathan
>
> ---
>
> gcc/ChangeLog:
>
> 2021-07-08 Jonathan Wright <jonathan.wright@arm.com>
>
> * config/aarch64/arm_neon.h (vqtbl2_s8): Use __builtin_memcpy
> instead of constructing __builtin_aarch64_simd_oi one vector
> at a time.
> (vqtbl2_u8): Likewise.
> (vqtbl2_p8): Likewise.
> (vqtbl2q_s8): Likewise.
> (vqtbl2q_u8): Likewise.
> (vqtbl2q_p8): Likewise.
> (vqtbl3_s8): Use __builtin_memcpy instead of constructing
> __builtin_aarch64_simd_ci one vector at a time.
> (vqtbl3_u8): Likewise.
> (vqtbl3_p8): Likewise.
> (vqtbl3q_s8): Likewise.
> (vqtbl3q_u8): Likewise.
> (vqtbl3q_p8): Likewise.
> (vqtbl4_s8): Use __builtin_memcpy instead of constructing
> __builtin_aarch64_simd_xi one vector at a time.
> (vqtbl4_u8): Likewise.
> (vqtbl4_p8): Likewise.
> (vqtbl4q_s8): Likewise.
> (vqtbl4q_u8): Likewise.
> (vqtbl4q_p8): Likewise.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/aarch64/vector_structure_intrinsics.c: New test.
>
Hi,
This new test fails on aarch64_be:
FAIL: gcc.target/aarch64/vector_structure_intrinsics.c scan-assembler-not
mov\\t
Can you check?
Thanks
Christophe
^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH] testsuite: aarch64: Fix failing vector structure tests on big-endian
2021-08-03 9:42 ` Christophe Lyon
@ 2021-08-04 10:05 ` Jonathan Wright
2021-08-06 12:24 ` Richard Sandiford
0 siblings, 1 reply; 7+ messages in thread
From: Jonathan Wright @ 2021-08-04 10:05 UTC (permalink / raw)
To: gcc-patches; +Cc: Richard Sandiford, Christophe Lyon
[-- Attachment #1: Type: text/plain, Size: 2929 bytes --]
Hi,
Recent refactoring of the arm_neon.h header enabled better code
generation for intrinsics that manipulate vector structures. New
tests were also added to verify the benefit of these changes. It now
transpires that the code generation improvements are observed only on
little-endian systems. This patch restricts the code generation tests
to little-endian targets (for now.)
Ok for master?
Thanks,
Jonathan
---
gcc/testsuite/ChangeLog:
2021-08-04 Jonathan Wright <jonathan.wright@arm.com>
* gcc.target/aarch64/vector_structure_intrinsics.c: Restrict
tests to little-endian targets.
From: Christophe Lyon <christophe.lyon.oss@gmail.com>
Sent: 03 August 2021 10:42
To: Jonathan Wright <Jonathan.Wright@arm.com>
Cc: gcc-patches@gcc.gnu.org <gcc-patches@gcc.gnu.org>; Richard Sandiford <Richard.Sandiford@arm.com>
Subject: Re: [PATCH 1/8] aarch64: Use memcpy to copy vector tables in vqtbl[234] intrinsics
On Fri, Jul 23, 2021 at 10:22 AM Jonathan Wright via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
Hi,
This patch uses __builtin_memcpy to copy vector structures instead of
building a new opaque structure one vector at a time in each of the
vqtbl[234] Neon intrinsics in arm_neon.h. This simplifies the header file
and also improves code generation - superfluous move instructions
were emitted for every register extraction/set in this additional
structure.
Add new code generation tests to verify that superfluous move
instructions are no longer generated for the vqtbl[234] intrinsics.
Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.
Ok for master?
Thanks,
Jonathan
---
gcc/ChangeLog:
2021-07-08 Jonathan Wright <jonathan.wright@arm.com>
* config/aarch64/arm_neon.h (vqtbl2_s8): Use __builtin_memcpy
instead of constructing __builtin_aarch64_simd_oi one vector
at a time.
(vqtbl2_u8): Likewise.
(vqtbl2_p8): Likewise.
(vqtbl2q_s8): Likewise.
(vqtbl2q_u8): Likewise.
(vqtbl2q_p8): Likewise.
(vqtbl3_s8): Use __builtin_memcpy instead of constructing
__builtin_aarch64_simd_ci one vector at a time.
(vqtbl3_u8): Likewise.
(vqtbl3_p8): Likewise.
(vqtbl3q_s8): Likewise.
(vqtbl3q_u8): Likewise.
(vqtbl3q_p8): Likewise.
(vqtbl4_s8): Use __builtin_memcpy instead of constructing
__builtin_aarch64_simd_xi one vector at a time.
(vqtbl4_u8): Likewise.
(vqtbl4_p8): Likewise.
(vqtbl4q_s8): Likewise.
(vqtbl4q_u8): Likewise.
(vqtbl4q_p8): Likewise.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/vector_structure_intrinsics.c: New test.
Hi,
This new test fails on aarch64_be:
FAIL: gcc.target/aarch64/vector_structure_intrinsics.c scan-assembler-not mov\\t
Can you check?
Thanks
Christophe
[-- Attachment #2: rb14749.patch --]
[-- Type: application/octet-stream, Size: 826 bytes --]
diff --git a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
index 60c53bc27f8378c78b119576ed19fde0e5743894..a8e31ab85d6fd2a045c8efaf2cbc42b5f40d2411 100644
--- a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
+++ b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
@@ -197,7 +197,8 @@ TEST_ST1x3 (vst1q, uint64x2x3_t, uint64_t*, u64, x3);
TEST_ST1x3 (vst1q, poly64x2x3_t, poly64_t*, p64, x3);
TEST_ST1x3 (vst1q, float64x2x3_t, float64_t*, f64, x3);
-/* { dg-final { scan-assembler-not "mov\\t" } } */
+/* { dg-final { scan-assembler-not {"mov\\t"} {
+ target { aarch64_little_endian } } ) } */
/* { dg-final { scan-assembler-times "tbl\\t" 18} } */
/* { dg-final { scan-assembler-times "tbx\\t" 18} } */
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] testsuite: aarch64: Fix failing vector structure tests on big-endian
2021-08-04 10:05 ` [PATCH] testsuite: aarch64: Fix failing vector structure tests on big-endian Jonathan Wright
@ 2021-08-06 12:24 ` Richard Sandiford
2021-08-09 8:41 ` Jonathan Wright
0 siblings, 1 reply; 7+ messages in thread
From: Richard Sandiford @ 2021-08-06 12:24 UTC (permalink / raw)
To: Jonathan Wright; +Cc: gcc-patches, Christophe Lyon
Jonathan Wright <Jonathan.Wright@arm.com> writes:
> diff --git a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
> index 60c53bc27f8378c78b119576ed19fde0e5743894..a8e31ab85d6fd2a045c8efaf2cbc42b5f40d2411 100644
> --- a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
> +++ b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
> @@ -197,7 +197,8 @@ TEST_ST1x3 (vst1q, uint64x2x3_t, uint64_t*, u64, x3);
> TEST_ST1x3 (vst1q, poly64x2x3_t, poly64_t*, p64, x3);
> TEST_ST1x3 (vst1q, float64x2x3_t, float64_t*, f64, x3);
>
> -/* { dg-final { scan-assembler-not "mov\\t" } } */
> +/* { dg-final { scan-assembler-not {"mov\\t"} {
> + target { aarch64_little_endian } } ) } */
I think this needs to stay on line. We should also either keep the
original quoting on the regexp or use {mov\t}. Having both forms
of quote would turn it into a test for the characters:
"mov\t"
(including quotes and backslash).
Thanks,
Richard
>
> /* { dg-final { scan-assembler-times "tbl\\t" 18} } */
> /* { dg-final { scan-assembler-times "tbx\\t" 18} } */
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] testsuite: aarch64: Fix failing vector structure tests on big-endian
2021-08-06 12:24 ` Richard Sandiford
@ 2021-08-09 8:41 ` Jonathan Wright
2021-08-09 8:43 ` Richard Sandiford
0 siblings, 1 reply; 7+ messages in thread
From: Jonathan Wright @ 2021-08-09 8:41 UTC (permalink / raw)
To: gcc-patches; +Cc: Richard Sandiford
[-- Attachment #1: Type: text/plain, Size: 1835 bytes --]
Hi,
I've corrected the quoting and moved everything on to one line.
Ok for master?
Thanks,
Jonathan
---
gcc/testsuite/ChangeLog:
2021-08-04 Jonathan Wright <jonathan.wright@arm.com>
* gcc.target/aarch64/vector_structure_intrinsics.c: Restrict
tests to little-endian targets.
From: Richard Sandiford <richard.sandiford@arm.com>
Sent: 06 August 2021 13:24
To: Jonathan Wright <Jonathan.Wright@arm.com>
Cc: gcc-patches@gcc.gnu.org <gcc-patches@gcc.gnu.org>; Christophe Lyon <christophe.lyon.oss@gmail.com>
Subject: Re: [PATCH] testsuite: aarch64: Fix failing vector structure tests on big-endian
Jonathan Wright <Jonathan.Wright@arm.com> writes:
> diff --git a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
> index 60c53bc27f8378c78b119576ed19fde0e5743894..a8e31ab85d6fd2a045c8efaf2cbc42b5f40d2411 100644
> --- a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
> +++ b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
> @@ -197,7 +197,8 @@ TEST_ST1x3 (vst1q, uint64x2x3_t, uint64_t*, u64, x3);
> TEST_ST1x3 (vst1q, poly64x2x3_t, poly64_t*, p64, x3);
> TEST_ST1x3 (vst1q, float64x2x3_t, float64_t*, f64, x3);
>
> -/* { dg-final { scan-assembler-not "mov\\t" } } */
> +/* { dg-final { scan-assembler-not {"mov\\t"} {
> + target { aarch64_little_endian } } ) } */
I think this needs to stay on line. We should also either keep the
original quoting on the regexp or use {mov\t}. Having both forms
of quote would turn it into a test for the characters:
"mov\t"
(including quotes and backslash).
Thanks,
Richard
>
> /* { dg-final { scan-assembler-times "tbl\\t" 18} } */
> /* { dg-final { scan-assembler-times "tbx\\t" 18} } */
[-- Attachment #2: rb14749.patch --]
[-- Type: application/octet-stream, Size: 816 bytes --]
diff --git a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
index 3e7e572bf39659ecf2f17751d92a4a99a4f2bf8b..89e9de18a92dbc00e58261e4558b3cff38c7ca75 100644
--- a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
+++ b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
@@ -263,7 +263,7 @@ TEST_ST1x3 (vst1q, uint64x2x3_t, uint64_t*, u64, x3);
TEST_ST1x3 (vst1q, poly64x2x3_t, poly64_t*, p64, x3);
TEST_ST1x3 (vst1q, float64x2x3_t, float64_t*, f64, x3);
-/* { dg-final { scan-assembler-not "mov\\t" } } */
+/* { dg-final { scan-assembler-not "mov\\t" { target aarch64_little_endian } } } */
/* { dg-final { scan-assembler-times "tbl\\t" 18} } */
/* { dg-final { scan-assembler-times "tbx\\t" 18} } */
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] testsuite: aarch64: Fix failing vector structure tests on big-endian
2021-08-09 8:41 ` Jonathan Wright
@ 2021-08-09 8:43 ` Richard Sandiford
0 siblings, 0 replies; 7+ messages in thread
From: Richard Sandiford @ 2021-08-09 8:43 UTC (permalink / raw)
To: Jonathan Wright; +Cc: gcc-patches
Jonathan Wright <Jonathan.Wright@arm.com> writes:
> Hi,
>
> I've corrected the quoting and moved everything on to one line.
>
> Ok for master?
>
> Thanks,
> Jonathan
>
> ---
>
> gcc/testsuite/ChangeLog:
>
> 2021-08-04 Jonathan Wright <jonathan.wright@arm.com>
>
> * gcc.target/aarch64/vector_structure_intrinsics.c: Restrict
> tests to little-endian targets.
OK, thanks.
Richard
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: 06 August 2021 13:24
> To: Jonathan Wright <Jonathan.Wright@arm.com>
> Cc: gcc-patches@gcc.gnu.org <gcc-patches@gcc.gnu.org>; Christophe Lyon <christophe.lyon.oss@gmail.com>
> Subject: Re: [PATCH] testsuite: aarch64: Fix failing vector structure tests on big-endian
>
> Jonathan Wright <Jonathan.Wright@arm.com> writes:
>> diff --git a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
>> index 60c53bc27f8378c78b119576ed19fde0e5743894..a8e31ab85d6fd2a045c8efaf2cbc42b5f40d2411 100644
>> --- a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
>> +++ b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
>> @@ -197,7 +197,8 @@ TEST_ST1x3 (vst1q, uint64x2x3_t, uint64_t*, u64, x3);
>> TEST_ST1x3 (vst1q, poly64x2x3_t, poly64_t*, p64, x3);
>> TEST_ST1x3 (vst1q, float64x2x3_t, float64_t*, f64, x3);
>>
>> -/* { dg-final { scan-assembler-not "mov\\t" } } */
>> +/* { dg-final { scan-assembler-not {"mov\\t"} {
>> + target { aarch64_little_endian } } ) } */
>
> I think this needs to stay on line. We should also either keep the
> original quoting on the regexp or use {mov\t}. Having both forms
> of quote would turn it into a test for the characters:
>
> "mov\t"
>
> (including quotes and backslash).
>
> Thanks,
> Richard
>
>
>>
>> /* { dg-final { scan-assembler-times "tbl\\t" 18} } */
>> /* { dg-final { scan-assembler-times "tbx\\t" 18} } */
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
> index 3e7e572bf39659ecf2f17751d92a4a99a4f2bf8b..89e9de18a92dbc00e58261e4558b3cff38c7ca75 100644
> --- a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
> +++ b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
> @@ -263,7 +263,7 @@ TEST_ST1x3 (vst1q, uint64x2x3_t, uint64_t*, u64, x3);
> TEST_ST1x3 (vst1q, poly64x2x3_t, poly64_t*, p64, x3);
> TEST_ST1x3 (vst1q, float64x2x3_t, float64_t*, f64, x3);
>
> -/* { dg-final { scan-assembler-not "mov\\t" } } */
> +/* { dg-final { scan-assembler-not "mov\\t" { target aarch64_little_endian } } } */
>
> /* { dg-final { scan-assembler-times "tbl\\t" 18} } */
> /* { dg-final { scan-assembler-times "tbx\\t" 18} } */
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2021-08-09 8:43 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-23 8:21 [PATCH 1/8] aarch64: Use memcpy to copy vector tables in vqtbl[234] intrinsics Jonathan Wright
2021-07-23 9:15 ` Kyrylo Tkachov
2021-08-03 9:42 ` Christophe Lyon
2021-08-04 10:05 ` [PATCH] testsuite: aarch64: Fix failing vector structure tests on big-endian Jonathan Wright
2021-08-06 12:24 ` Richard Sandiford
2021-08-09 8:41 ` Jonathan Wright
2021-08-09 8:43 ` Richard Sandiford
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).