public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH 1/8] aarch64: Use memcpy to copy vector tables in vqtbl[234] intrinsics
@ 2021-07-23  8:21 Jonathan Wright
  2021-07-23  9:15 ` Kyrylo Tkachov
  2021-08-03  9:42 ` Christophe Lyon
  0 siblings, 2 replies; 7+ messages in thread
From: Jonathan Wright @ 2021-07-23  8:21 UTC (permalink / raw)
  To: gcc-patches; +Cc: Kyrylo Tkachov, Richard Sandiford

[-- Attachment #1: Type: text/plain, Size: 1542 bytes --]

Hi,

This patch uses __builtin_memcpy to copy vector structures instead of
building a new opaque structure one vector at a time in each of the
vqtbl[234] Neon intrinsics in arm_neon.h. This simplifies the header file
and also improves code generation - superfluous move instructions
were emitted for every register extraction/set in this additional
structure.

Add new code generation tests to verify that superfluous move
instructions are no longer generated for the vqtbl[234] intrinsics.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-08  Jonathan Wright  <jonathan.wright@arm.com>

	* config/aarch64/arm_neon.h (vqtbl2_s8): Use __builtin_memcpy
	instead of constructing __builtin_aarch64_simd_oi one vector
	at a time.
	(vqtbl2_u8): Likewise.
	(vqtbl2_p8): Likewise.
	(vqtbl2q_s8): Likewise.
	(vqtbl2q_u8): Likewise.
	(vqtbl2q_p8): Likewise.
	(vqtbl3_s8): Use __builtin_memcpy instead of constructing
	__builtin_aarch64_simd_ci one vector at a time.
	(vqtbl3_u8): Likewise.
	(vqtbl3_p8): Likewise.
	(vqtbl3q_s8): Likewise.
	(vqtbl3q_u8): Likewise.
	(vqtbl3q_p8): Likewise.
	(vqtbl4_s8): Use __builtin_memcpy instead of constructing
	__builtin_aarch64_simd_xi one vector at a time.
	(vqtbl4_u8): Likewise.
	(vqtbl4_p8): Likewise.
	(vqtbl4q_s8): Likewise.
	(vqtbl4q_u8): Likewise.
	(vqtbl4q_p8): Likewise.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/vector_structure_intrinsics.c: New test.

[-- Attachment #2: rb14639.patch --]
[-- Type: application/octet-stream, Size: 11961 bytes --]

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 1048d7c7eaac14554142eaa7544159a50929b7f1..31ae86e6d25239359045d649bf8d00d8c0fa9212 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -23321,8 +23321,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqtbl2_s8 (int8x16x2_t __tab, uint8x8_t __idx)
 {
   __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[1], 1);
+  __builtin_memcpy (&__o, &__tab, sizeof (__tab));
   return __builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx);
 }
 
@@ -23331,8 +23330,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqtbl2_u8 (uint8x16x2_t __tab, uint8x8_t __idx)
 {
   __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  __builtin_memcpy (&__o, &__tab, sizeof (__tab));
   return (uint8x8_t)__builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx);
 }
 
@@ -23341,8 +23339,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqtbl2_p8 (poly8x16x2_t __tab, uint8x8_t __idx)
 {
   __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  __builtin_memcpy (&__o, &__tab, sizeof (__tab));
   return (poly8x8_t)__builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx);
 }
 
@@ -23351,8 +23348,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqtbl2q_s8 (int8x16x2_t __tab, uint8x16_t __idx)
 {
   __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  __builtin_memcpy (&__o, &__tab, sizeof (__tab));
   return __builtin_aarch64_qtbl2v16qi (__o, (int8x16_t)__idx);
 }
 
@@ -23361,8 +23357,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqtbl2q_u8 (uint8x16x2_t __tab, uint8x16_t __idx)
 {
   __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  __builtin_memcpy (&__o, &__tab, sizeof (__tab));
   return (uint8x16_t)__builtin_aarch64_qtbl2v16qi (__o, (int8x16_t)__idx);
 }
 
@@ -23371,8 +23366,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqtbl2q_p8 (poly8x16x2_t __tab, uint8x16_t __idx)
 {
   __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  __builtin_memcpy (&__o, &__tab, sizeof (__tab));
   return (poly8x16_t)__builtin_aarch64_qtbl2v16qi (__o, (int8x16_t)__idx);
 }
 
@@ -23383,9 +23377,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqtbl3_s8 (int8x16x3_t __tab, uint8x8_t __idx)
 {
   __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2);
+  __builtin_memcpy (&__o, &__tab, sizeof (__tab));
   return __builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)__idx);
 }
 
@@ -23394,9 +23386,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqtbl3_u8 (uint8x16x3_t __tab, uint8x8_t __idx)
 {
   __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2);
+  __builtin_memcpy (&__o, &__tab, sizeof (__tab));
   return (uint8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)__idx);
 }
 
@@ -23405,9 +23395,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqtbl3_p8 (poly8x16x3_t __tab, uint8x8_t __idx)
 {
   __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2);
+  __builtin_memcpy (&__o, &__tab, sizeof (__tab));
   return (poly8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)__idx);
 }
 
@@ -23416,9 +23404,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqtbl3q_s8 (int8x16x3_t __tab, uint8x16_t __idx)
 {
   __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2);
+  __builtin_memcpy (&__o, &__tab, sizeof (__tab));
   return __builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)__idx);
 }
 
@@ -23427,9 +23413,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqtbl3q_u8 (uint8x16x3_t __tab, uint8x16_t __idx)
 {
   __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2);
+  __builtin_memcpy (&__o, &__tab, sizeof (__tab));
   return (uint8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)__idx);
 }
 
@@ -23438,9 +23422,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqtbl3q_p8 (poly8x16x3_t __tab, uint8x16_t __idx)
 {
   __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2);
+  __builtin_memcpy (&__o, &__tab, sizeof (__tab));
   return (poly8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)__idx);
 }
 
@@ -23451,10 +23433,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqtbl4_s8 (int8x16x4_t __tab, uint8x8_t __idx)
 {
   __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3);
+  __builtin_memcpy (&__o, &__tab, sizeof (__tab));
   return __builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)__idx);
 }
 
@@ -23463,10 +23442,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqtbl4_u8 (uint8x16x4_t __tab, uint8x8_t __idx)
 {
   __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3);
+  __builtin_memcpy (&__o, &__tab, sizeof (__tab));
   return (uint8x8_t)__builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)__idx);
 }
 
@@ -23475,10 +23451,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqtbl4_p8 (poly8x16x4_t __tab, uint8x8_t __idx)
 {
   __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3);
+  __builtin_memcpy (&__o, &__tab, sizeof (__tab));
   return (poly8x8_t)__builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)__idx);
 }
 
@@ -23487,10 +23460,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqtbl4q_s8 (int8x16x4_t __tab, uint8x16_t __idx)
 {
   __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3);
+  __builtin_memcpy (&__o, &__tab, sizeof (__tab));
   return __builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)__idx);
 }
 
@@ -23499,10 +23469,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqtbl4q_u8 (uint8x16x4_t __tab, uint8x16_t __idx)
 {
   __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3);
+  __builtin_memcpy (&__o, &__tab, sizeof (__tab));
   return (uint8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)__idx);
 }
 
@@ -23511,10 +23478,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqtbl4q_p8 (poly8x16x4_t __tab, uint8x16_t __idx)
 {
   __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3);
+  __builtin_memcpy (&__o, &__tab, sizeof (__tab));
   return (poly8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)__idx);
 }
 
diff --git a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
new file mode 100644
index 0000000000000000000000000000000000000000..2fab0f2947b7fa28e4e3a77bd365dcfdf30a9b28
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
@@ -0,0 +1,45 @@
+/* { dg-skip-if "" { arm*-*-* } } */
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+#define TEST_TBL(name, rettype, tbltype, idxtype, ts) \
+  rettype test_ ## name ## _ ## ts (tbltype a, idxtype b) \
+	{ \
+		return name ## _ ## ts (a, b); \
+	}
+
+TEST_TBL (vqtbl2, int8x8_t, int8x16x2_t, uint8x8_t, s8)
+TEST_TBL (vqtbl2, uint8x8_t, uint8x16x2_t, uint8x8_t, u8)
+TEST_TBL (vqtbl2, poly8x8_t, poly8x16x2_t, uint8x8_t, p8)
+
+TEST_TBL (vqtbl2q, int8x16_t, int8x16x2_t, uint8x16_t, s8)
+TEST_TBL (vqtbl2q, uint8x16_t, uint8x16x2_t, uint8x16_t, u8)
+TEST_TBL (vqtbl2q, poly8x16_t, poly8x16x2_t, uint8x16_t, p8)
+
+TEST_TBL (vqtbl4, int8x8_t, int8x16x4_t, uint8x8_t, s8)
+TEST_TBL (vqtbl4, uint8x8_t, uint8x16x4_t, uint8x8_t, u8)
+TEST_TBL (vqtbl4, poly8x8_t, poly8x16x4_t, uint8x8_t, p8)
+
+TEST_TBL (vqtbl4q, int8x16_t, int8x16x4_t, uint8x16_t, s8)
+TEST_TBL (vqtbl4q, uint8x16_t, uint8x16x4_t, uint8x16_t, u8)
+TEST_TBL (vqtbl4q, poly8x16_t, poly8x16x4_t, uint8x16_t, p8)
+
+#define TEST_TBL3(name, rettype, tbltype, idxtype, ts) \
+  rettype test_ ## name ## _ ## ts (idxtype a, tbltype b) \
+	{ \
+		return name ## _ ## ts (b, a); \
+	}
+
+TEST_TBL3 (vqtbl3, int8x8_t, int8x16x3_t, uint8x8_t, s8)
+TEST_TBL3 (vqtbl3, uint8x8_t, uint8x16x3_t, uint8x8_t, u8)
+TEST_TBL3 (vqtbl3, poly8x8_t, poly8x16x3_t, uint8x8_t, p8)
+
+TEST_TBL3 (vqtbl3q, int8x16_t, int8x16x3_t, uint8x16_t, s8)
+TEST_TBL3 (vqtbl3q, uint8x16_t, uint8x16x3_t, uint8x16_t, u8)
+TEST_TBL3 (vqtbl3q, poly8x16_t, poly8x16x3_t, uint8x16_t, p8)
+
+/* { dg-final { scan-assembler-not "mov\\t" } } */
+
+/* { dg-final { scan-assembler-times "tbl\\t" 18} }  */

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2021-08-09  8:43 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-23  8:21 [PATCH 1/8] aarch64: Use memcpy to copy vector tables in vqtbl[234] intrinsics Jonathan Wright
2021-07-23  9:15 ` Kyrylo Tkachov
2021-08-03  9:42 ` Christophe Lyon
2021-08-04 10:05   ` [PATCH] testsuite: aarch64: Fix failing vector structure tests on big-endian Jonathan Wright
2021-08-06 12:24     ` Richard Sandiford
2021-08-09  8:41       ` Jonathan Wright
2021-08-09  8:43         ` Richard Sandiford

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).