public inbox for gcc-patches@gcc.gnu.org
* [PATCH 2/4] [AARCH64,NEON] Convert arm_neon.h to use new builtins for vld[234](q?)_lane_*
  2014-09-18 19:40 [PATCH 0/4] [AARCH64,NEON] Improve various NEON load/store intrinsics Charles Baylis
@ 2014-09-18 19:40 ` Charles Baylis
  2014-09-19 11:21   ` Tejas Belagod
  2014-09-18 19:41 ` [PATCH 3/4] [AARCH64,NEON] Fix unnecessary moves in vld[234]q_* intrinsics Charles Baylis
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 12+ messages in thread
From: Charles Baylis @ 2014-09-18 19:40 UTC (permalink / raw)
  To: marcus.shawcroft, rearnsha, gcc-patches

This patch replaces the inline assembler implementations of the
vld[234](q?)_lane_* intrinsics with new versions which exploit the new builtin
functions added in patch 1.
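
For context, a minimal usage sketch of one of the affected intrinsics follows.
It is illustrative only and not part of the patch; the wrapper function name
and the lane index are invented for the example.

  #include <arm_neon.h>

  /* Hypothetical example: reload lane 2 of each vector in an int16x4x2_t
     from interleaved data at __p.  The lane index must be a compile-time
     constant.  */
  int16x4x2_t
  reload_lane2 (const int16_t *__p, int16x4x2_t __acc)
  {
    return vld2_lane_s16 (__p, __acc, 2);
  }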

Tested (with the rest of the patch series) with make check on aarch64-oe-linux
under qemu; it also causes no regressions in clyon's NEON intrinsics tests.

<DATE>  Charles Baylis  <charles.baylis@linaro.org>

	* config/aarch64/arm_neon.h (__LD2_LANE_FUNC): Rewrite using builtins,
	update uses to use new macro arguments.
	(__LD3_LANE_FUNC): Likewise.
	(__LD4_LANE_FUNC): Likewise.

Change-Id: I3bd5934b5c4f6127088193c1ab12848144d5540a
---
 gcc/config/aarch64/arm_neon.h | 359 ++++++++++++++++++++++++++++--------------
 1 file changed, 237 insertions(+), 122 deletions(-)

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index e62c783..c1fcb47 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -11805,47 +11805,79 @@ __LD2R_FUNC (uint16x8x2_t, uint16x2_t, uint16_t, 8h, u16, q)
 __LD2R_FUNC (uint32x4x2_t, uint32x2_t, uint32_t, 4s, u32, q)
 __LD2R_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, 2d, u64, q)
 
-#define __LD2_LANE_FUNC(rettype, ptrtype, regsuffix,			\
-			lnsuffix, funcsuffix, Q)			\
-  __extension__ static __inline rettype					\
-  __attribute__ ((__always_inline__))					\
-  vld2 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr,		\
-				     rettype b, const int c)		\
-  {									\
-    rettype result;							\
-    __asm__ ("ld1 {v16." #regsuffix ", v17." #regsuffix "}, %1\n\t"	\
-	     "ld2 {v16." #lnsuffix ", v17." #lnsuffix "}[%3], %2\n\t"	\
-	     "st1 {v16." #regsuffix ", v17." #regsuffix "}, %0\n\t"	\
-	     : "=Q"(result)						\
-	     : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c)		\
-	     : "memory", "v16", "v17");					\
-    return result;							\
-  }
-
-__LD2_LANE_FUNC (int8x8x2_t, uint8_t, 8b, b, s8,)
-__LD2_LANE_FUNC (float32x2x2_t, float32_t, 2s, s, f32,)
-__LD2_LANE_FUNC (float64x1x2_t, float64_t, 1d, d, f64,)
-__LD2_LANE_FUNC (poly8x8x2_t, poly8_t, 8b, b, p8,)
-__LD2_LANE_FUNC (poly16x4x2_t, poly16_t, 4h, h, p16,)
-__LD2_LANE_FUNC (int16x4x2_t, int16_t, 4h, h, s16,)
-__LD2_LANE_FUNC (int32x2x2_t, int32_t, 2s, s, s32,)
-__LD2_LANE_FUNC (int64x1x2_t, int64_t, 1d, d, s64,)
-__LD2_LANE_FUNC (uint8x8x2_t, uint8_t, 8b, b, u8,)
-__LD2_LANE_FUNC (uint16x4x2_t, uint16_t, 4h, h, u16,)
-__LD2_LANE_FUNC (uint32x2x2_t, uint32_t, 2s, s, u32,)
-__LD2_LANE_FUNC (uint64x1x2_t, uint64_t, 1d, d, u64,)
-__LD2_LANE_FUNC (float32x4x2_t, float32_t, 4s, s, f32, q)
-__LD2_LANE_FUNC (float64x2x2_t, float64_t, 2d, d, f64, q)
-__LD2_LANE_FUNC (poly8x16x2_t, poly8_t, 16b, b, p8, q)
-__LD2_LANE_FUNC (poly16x8x2_t, poly16_t, 8h, h, p16, q)
-__LD2_LANE_FUNC (int8x16x2_t, int8_t, 16b, b, s8, q)
-__LD2_LANE_FUNC (int16x8x2_t, int16_t, 8h, h, s16, q)
-__LD2_LANE_FUNC (int32x4x2_t, int32_t, 4s, s, s32, q)
-__LD2_LANE_FUNC (int64x2x2_t, int64_t, 2d, d, s64, q)
-__LD2_LANE_FUNC (uint8x16x2_t, uint8_t, 16b, b, u8, q)
-__LD2_LANE_FUNC (uint16x8x2_t, uint16_t, 8h, h, u16, q)
-__LD2_LANE_FUNC (uint32x4x2_t, uint32_t, 4s, s, u32, q)
-__LD2_LANE_FUNC (uint64x2x2_t, uint64_t, 2d, d, u64, q)
+#define __LD2_LANE_FUNC(intype, vectype, largetype, ptrtype,		   \
+			 mode, ptrmode, funcsuffix, signedtype)		   \
+__extension__ static __inline intype __attribute__ ((__always_inline__))   \
+vld2_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c)  \
+{									   \
+  __builtin_aarch64_simd_oi __o;					   \
+  largetype __temp;							   \
+  __temp.val[0] = 							   \
+    vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0));	   \
+  __temp.val[1] =							   \
+    vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0));	   \
+  __o = __builtin_aarch64_set_qregoi##mode (__o,			   \
+					   (signedtype) __temp.val[0],	   \
+					   0);				   \
+  __o = __builtin_aarch64_set_qregoi##mode (__o,			   \
+					   (signedtype) __temp.val[1],	   \
+					   1);				   \
+  __o =	__builtin_aarch64_ld2_lane##mode (				   \
+	  (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c);	   \
+  __b.val[0] = (vectype) __builtin_aarch64_get_dregoidi (__o, 0);	   \
+  __b.val[1] = (vectype) __builtin_aarch64_get_dregoidi (__o, 1);	   \
+  return __b;								   \
+}
+
+__LD2_LANE_FUNC (float32x2x2_t, float32x2_t, float32x4x2_t, float32_t, v4sf,
+		 sf, f32, float32x4_t)
+__LD2_LANE_FUNC (float64x1x2_t, float64x1_t, float64x2x2_t, float64_t, v2df,
+		 df, f64, float64x2_t)
+__LD2_LANE_FUNC (poly8x8x2_t, poly8x8_t, poly8x16x2_t, poly8_t, v16qi, qi, p8,
+		 int8x16_t)
+__LD2_LANE_FUNC (poly16x4x2_t, poly16x4_t, poly16x8x2_t, poly16_t, v8hi, hi,
+		 p16, int16x8_t)
+__LD2_LANE_FUNC (int8x8x2_t, int8x8_t, int8x16x2_t, int8_t, v16qi, qi, s8,
+		 int8x16_t)
+__LD2_LANE_FUNC (int16x4x2_t, int16x4_t, int16x8x2_t, int16_t, v8hi, hi, s16,
+		 int16x8_t)
+__LD2_LANE_FUNC (int32x2x2_t, int32x2_t, int32x4x2_t, int32_t, v4si, si, s32,
+		 int32x4_t)
+__LD2_LANE_FUNC (int64x1x2_t, int64x1_t, int64x2x2_t, int64_t, v2di, di, s64,
+		 int64x2_t)
+__LD2_LANE_FUNC (uint8x8x2_t, uint8x8_t, uint8x16x2_t, uint8_t, v16qi, qi, u8,
+		 int8x16_t)
+__LD2_LANE_FUNC (uint16x4x2_t, uint16x4_t, uint16x8x2_t, uint16_t, v8hi, hi,
+		 u16, int16x8_t)
+__LD2_LANE_FUNC (uint32x2x2_t, uint32x2_t, uint32x4x2_t, uint32_t, v4si, si,
+		 u32, int32x4_t)
+__LD2_LANE_FUNC (uint64x1x2_t, uint64x1_t, uint64x2x2_t, uint64_t, v2di, di,
+		 u64, int64x2_t)
+
+#undef __LD2_LANE_FUNC
+#define __LD2_LANE_FUNC(intype, ptrtype, mode, ptrmode, funcsuffix)	   \
+__extension__ static __inline intype __attribute__ ((__always_inline__))   \
+vld2q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{									   \
+  union { intype __i;							   \
+	  __builtin_aarch64_simd_oi __o; } __temp = { __b };		   \
+  __temp.__o = __builtin_aarch64_ld2_lane##mode (			   \
+	(__builtin_aarch64_simd_##ptrmode *) __ptr, __temp.__o, __c);	   \
+  return __temp.__i;							   \
+}
+
+__LD2_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32)
+__LD2_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64)
+__LD2_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8)
+__LD2_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16)
+__LD2_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8)
+__LD2_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16)
+__LD2_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32)
+__LD2_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64)
+__LD2_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8)
+__LD2_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16)
+__LD2_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32)
+__LD2_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64)
 
 #define __LD3R_FUNC(rettype, structtype, ptrtype,			\
 		    regsuffix, funcsuffix, Q)				\
@@ -11887,47 +11919,85 @@ __LD3R_FUNC (uint16x8x3_t, uint16x3_t, uint16_t, 8h, u16, q)
 __LD3R_FUNC (uint32x4x3_t, uint32x3_t, uint32_t, 4s, u32, q)
 __LD3R_FUNC (uint64x2x3_t, uint64x3_t, uint64_t, 2d, u64, q)
 
-#define __LD3_LANE_FUNC(rettype, ptrtype, regsuffix,			\
-			lnsuffix, funcsuffix, Q)			\
-  __extension__ static __inline rettype					\
-  __attribute__ ((__always_inline__))					\
-  vld3 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr,		\
-				     rettype b, const int c)		\
-  {									\
-    rettype result;							\
-    __asm__ ("ld1 {v16." #regsuffix " - v18." #regsuffix "}, %1\n\t"	\
-	     "ld3 {v16." #lnsuffix " - v18." #lnsuffix "}[%3], %2\n\t"	\
-	     "st1 {v16." #regsuffix " - v18." #regsuffix "}, %0\n\t"	\
-	     : "=Q"(result)						\
-	     : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c)		\
-	     : "memory", "v16", "v17", "v18");				\
-    return result;							\
-  }
-
-__LD3_LANE_FUNC (int8x8x3_t, uint8_t, 8b, b, s8,)
-__LD3_LANE_FUNC (float32x2x3_t, float32_t, 2s, s, f32,)
-__LD3_LANE_FUNC (float64x1x3_t, float64_t, 1d, d, f64,)
-__LD3_LANE_FUNC (poly8x8x3_t, poly8_t, 8b, b, p8,)
-__LD3_LANE_FUNC (poly16x4x3_t, poly16_t, 4h, h, p16,)
-__LD3_LANE_FUNC (int16x4x3_t, int16_t, 4h, h, s16,)
-__LD3_LANE_FUNC (int32x2x3_t, int32_t, 2s, s, s32,)
-__LD3_LANE_FUNC (int64x1x3_t, int64_t, 1d, d, s64,)
-__LD3_LANE_FUNC (uint8x8x3_t, uint8_t, 8b, b, u8,)
-__LD3_LANE_FUNC (uint16x4x3_t, uint16_t, 4h, h, u16,)
-__LD3_LANE_FUNC (uint32x2x3_t, uint32_t, 2s, s, u32,)
-__LD3_LANE_FUNC (uint64x1x3_t, uint64_t, 1d, d, u64,)
-__LD3_LANE_FUNC (float32x4x3_t, float32_t, 4s, s, f32, q)
-__LD3_LANE_FUNC (float64x2x3_t, float64_t, 2d, d, f64, q)
-__LD3_LANE_FUNC (poly8x16x3_t, poly8_t, 16b, b, p8, q)
-__LD3_LANE_FUNC (poly16x8x3_t, poly16_t, 8h, h, p16, q)
-__LD3_LANE_FUNC (int8x16x3_t, int8_t, 16b, b, s8, q)
-__LD3_LANE_FUNC (int16x8x3_t, int16_t, 8h, h, s16, q)
-__LD3_LANE_FUNC (int32x4x3_t, int32_t, 4s, s, s32, q)
-__LD3_LANE_FUNC (int64x2x3_t, int64_t, 2d, d, s64, q)
-__LD3_LANE_FUNC (uint8x16x3_t, uint8_t, 16b, b, u8, q)
-__LD3_LANE_FUNC (uint16x8x3_t, uint16_t, 8h, h, u16, q)
-__LD3_LANE_FUNC (uint32x4x3_t, uint32_t, 4s, s, u32, q)
-__LD3_LANE_FUNC (uint64x2x3_t, uint64_t, 2d, d, u64, q)
+#define __LD3_LANE_FUNC(intype, vectype, largetype, ptrtype,		   \
+			 mode, ptrmode, funcsuffix, signedtype)		   \
+__extension__ static __inline intype __attribute__ ((__always_inline__))   \
+vld3_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c)  \
+{									   \
+  __builtin_aarch64_simd_ci __o;					   \
+  largetype __temp;							   \
+  __temp.val[0] = 							   \
+    vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0));	   \
+  __temp.val[1] =							   \
+    vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0));	   \
+  __temp.val[2] =							   \
+    vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0));	   \
+  __o = __builtin_aarch64_set_qregci##mode (__o,			   \
+					   (signedtype) __temp.val[0],	   \
+					   0);				   \
+  __o = __builtin_aarch64_set_qregci##mode (__o,			   \
+					   (signedtype) __temp.val[1],	   \
+					   1);				   \
+  __o = __builtin_aarch64_set_qregci##mode (__o,			   \
+					   (signedtype) __temp.val[2],	   \
+					   2);				   \
+  __o =	__builtin_aarch64_ld3_lane##mode (				   \
+	  (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c);	   \
+  __b.val[0] = (vectype) __builtin_aarch64_get_dregcidi (__o, 0);	   \
+  __b.val[1] = (vectype) __builtin_aarch64_get_dregcidi (__o, 1);	   \
+  __b.val[2] = (vectype) __builtin_aarch64_get_dregcidi (__o, 2);	   \
+  return __b;								   \
+}
+
+__LD3_LANE_FUNC (float32x2x3_t, float32x2_t, float32x4x3_t, float32_t, v4sf,
+		 sf, f32, float32x4_t)
+__LD3_LANE_FUNC (float64x1x3_t, float64x1_t, float64x2x3_t, float64_t, v2df,
+		 df, f64, float64x2_t)
+__LD3_LANE_FUNC (poly8x8x3_t, poly8x8_t, poly8x16x3_t, poly8_t, v16qi, qi, p8,
+		 int8x16_t)
+__LD3_LANE_FUNC (poly16x4x3_t, poly16x4_t, poly16x8x3_t, poly16_t, v8hi, hi,
+		 p16, int16x8_t)
+__LD3_LANE_FUNC (int8x8x3_t, int8x8_t, int8x16x3_t, int8_t, v16qi, qi, s8,
+		 int8x16_t)
+__LD3_LANE_FUNC (int16x4x3_t, int16x4_t, int16x8x3_t, int16_t, v8hi, hi, s16,
+		 int16x8_t)
+__LD3_LANE_FUNC (int32x2x3_t, int32x2_t, int32x4x3_t, int32_t, v4si, si, s32,
+		 int32x4_t)
+__LD3_LANE_FUNC (int64x1x3_t, int64x1_t, int64x2x3_t, int64_t, v2di, di, s64,
+		 int64x2_t)
+__LD3_LANE_FUNC (uint8x8x3_t, uint8x8_t, uint8x16x3_t, uint8_t, v16qi, qi, u8,
+		 int8x16_t)
+__LD3_LANE_FUNC (uint16x4x3_t, uint16x4_t, uint16x8x3_t, uint16_t, v8hi, hi,
+		 u16, int16x8_t)
+__LD3_LANE_FUNC (uint32x2x3_t, uint32x2_t, uint32x4x3_t, uint32_t, v4si, si,
+		 u32, int32x4_t)
+__LD3_LANE_FUNC (uint64x1x3_t, uint64x1_t, uint64x2x3_t, uint64_t, v2di, di,
+		 u64, int64x2_t)
+
+#undef __LD3_LANE_FUNC
+#define __LD3_LANE_FUNC(intype, ptrtype, mode, ptrmode, funcsuffix)	   \
+__extension__ static __inline intype __attribute__ ((__always_inline__))   \
+vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{									   \
+  union { intype __i;							   \
+	  __builtin_aarch64_simd_ci __o; } __temp = { __b };		   \
+  __temp.__o = __builtin_aarch64_ld3_lane##mode (			   \
+	(__builtin_aarch64_simd_##ptrmode *) __ptr, __temp.__o, __c);	   \
+  return __temp.__i;							   \
+}
+
+__LD3_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32)
+__LD3_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64)
+__LD3_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8)
+__LD3_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16)
+__LD3_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8)
+__LD3_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16)
+__LD3_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32)
+__LD3_LANE_FUNC (int64x2x3_t, int64_t, v2di, di, s64)
+__LD3_LANE_FUNC (uint8x16x3_t, uint8_t, v16qi, qi, u8)
+__LD3_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16)
+__LD3_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32)
+__LD3_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64)
 
 #define __LD4R_FUNC(rettype, structtype, ptrtype,			\
 		    regsuffix, funcsuffix, Q)				\
@@ -11969,47 +12039,92 @@ __LD4R_FUNC (uint16x8x4_t, uint16x4_t, uint16_t, 8h, u16, q)
 __LD4R_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, 4s, u32, q)
 __LD4R_FUNC (uint64x2x4_t, uint64x4_t, uint64_t, 2d, u64, q)
 
-#define __LD4_LANE_FUNC(rettype, ptrtype, regsuffix,			\
-			lnsuffix, funcsuffix, Q)			\
-  __extension__ static __inline rettype					\
-  __attribute__ ((__always_inline__))					\
-  vld4 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr,		\
-				     rettype b, const int c)		\
-  {									\
-    rettype result;							\
-    __asm__ ("ld1 {v16." #regsuffix " - v19." #regsuffix "}, %1\n\t"	\
-	     "ld4 {v16." #lnsuffix " - v19." #lnsuffix "}[%3], %2\n\t"	\
-	     "st1 {v16." #regsuffix " - v19." #regsuffix "}, %0\n\t"	\
-	     : "=Q"(result)						\
-	     : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c)		\
-	     : "memory", "v16", "v17", "v18", "v19");			\
-    return result;							\
-  }
 
-__LD4_LANE_FUNC (int8x8x4_t, uint8_t, 8b, b, s8,)
-__LD4_LANE_FUNC (float32x2x4_t, float32_t, 2s, s, f32,)
-__LD4_LANE_FUNC (float64x1x4_t, float64_t, 1d, d, f64,)
-__LD4_LANE_FUNC (poly8x8x4_t, poly8_t, 8b, b, p8,)
-__LD4_LANE_FUNC (poly16x4x4_t, poly16_t, 4h, h, p16,)
-__LD4_LANE_FUNC (int16x4x4_t, int16_t, 4h, h, s16,)
-__LD4_LANE_FUNC (int32x2x4_t, int32_t, 2s, s, s32,)
-__LD4_LANE_FUNC (int64x1x4_t, int64_t, 1d, d, s64,)
-__LD4_LANE_FUNC (uint8x8x4_t, uint8_t, 8b, b, u8,)
-__LD4_LANE_FUNC (uint16x4x4_t, uint16_t, 4h, h, u16,)
-__LD4_LANE_FUNC (uint32x2x4_t, uint32_t, 2s, s, u32,)
-__LD4_LANE_FUNC (uint64x1x4_t, uint64_t, 1d, d, u64,)
-__LD4_LANE_FUNC (float32x4x4_t, float32_t, 4s, s, f32, q)
-__LD4_LANE_FUNC (float64x2x4_t, float64_t, 2d, d, f64, q)
-__LD4_LANE_FUNC (poly8x16x4_t, poly8_t, 16b, b, p8, q)
-__LD4_LANE_FUNC (poly16x8x4_t, poly16_t, 8h, h, p16, q)
-__LD4_LANE_FUNC (int8x16x4_t, int8_t, 16b, b, s8, q)
-__LD4_LANE_FUNC (int16x8x4_t, int16_t, 8h, h, s16, q)
-__LD4_LANE_FUNC (int32x4x4_t, int32_t, 4s, s, s32, q)
-__LD4_LANE_FUNC (int64x2x4_t, int64_t, 2d, d, s64, q)
-__LD4_LANE_FUNC (uint8x16x4_t, uint8_t, 16b, b, u8, q)
-__LD4_LANE_FUNC (uint16x8x4_t, uint16_t, 8h, h, u16, q)
-__LD4_LANE_FUNC (uint32x4x4_t, uint32_t, 4s, s, u32, q)
-__LD4_LANE_FUNC (uint64x2x4_t, uint64_t, 2d, d, u64, q)
+#define __LD4_LANE_FUNC(intype, vectype, largetype, ptrtype,		   \
+			 mode, ptrmode, funcsuffix, signedtype)		   \
+__extension__ static __inline intype __attribute__ ((__always_inline__))   \
+vld4_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c)  \
+{									   \
+  __builtin_aarch64_simd_xi __o;					   \
+  largetype __temp;							   \
+  __temp.val[0] = 							   \
+    vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0));	   \
+  __temp.val[1] =							   \
+    vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0));	   \
+  __temp.val[2] =							   \
+    vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0));	   \
+  __temp.val[3] =							   \
+    vcombine_##funcsuffix (__b.val[3], vcreate_##funcsuffix (0));	   \
+  __o = __builtin_aarch64_set_qregxi##mode (__o,			   \
+					   (signedtype) __temp.val[0],	   \
+					   0);				   \
+  __o = __builtin_aarch64_set_qregxi##mode (__o,			   \
+					   (signedtype) __temp.val[1],	   \
+					   1);				   \
+  __o = __builtin_aarch64_set_qregxi##mode (__o,			   \
+					   (signedtype) __temp.val[2],	   \
+					   2);				   \
+  __o = __builtin_aarch64_set_qregxi##mode (__o,			   \
+					   (signedtype) __temp.val[3],	   \
+					   3);				   \
+  __o =	__builtin_aarch64_ld4_lane##mode (				   \
+	  (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c);	   \
+  __b.val[0] = (vectype) __builtin_aarch64_get_dregxidi (__o, 0);	   \
+  __b.val[1] = (vectype) __builtin_aarch64_get_dregxidi (__o, 1);	   \
+  __b.val[2] = (vectype) __builtin_aarch64_get_dregxidi (__o, 2);	   \
+  __b.val[3] = (vectype) __builtin_aarch64_get_dregxidi (__o, 3);	   \
+  return __b;								   \
+}
+
+__LD4_LANE_FUNC (float32x2x4_t, float32x2_t, float32x4x4_t, float32_t, v4sf,
+		 sf, f32, float32x4_t)
+__LD4_LANE_FUNC (float64x1x4_t, float64x1_t, float64x2x4_t, float64_t, v2df,
+		 df, f64, float64x2_t)
+__LD4_LANE_FUNC (poly8x8x4_t, poly8x8_t, poly8x16x4_t, poly8_t, v16qi, qi, p8,
+		 int8x16_t)
+__LD4_LANE_FUNC (poly16x4x4_t, poly16x4_t, poly16x8x4_t, poly16_t, v8hi, hi,
+		 p16, int16x8_t)
+__LD4_LANE_FUNC (int8x8x4_t, int8x8_t, int8x16x4_t, int8_t, v16qi, qi, s8,
+		 int8x16_t)
+__LD4_LANE_FUNC (int16x4x4_t, int16x4_t, int16x8x4_t, int16_t, v8hi, hi, s16,
+		 int16x8_t)
+__LD4_LANE_FUNC (int32x2x4_t, int32x2_t, int32x4x4_t, int32_t, v4si, si, s32,
+		 int32x4_t)
+__LD4_LANE_FUNC (int64x1x4_t, int64x1_t, int64x2x4_t, int64_t, v2di, di, s64,
+		 int64x2_t)
+__LD4_LANE_FUNC (uint8x8x4_t, uint8x8_t, uint8x16x4_t, uint8_t, v16qi, qi, u8,
+		 int8x16_t)
+__LD4_LANE_FUNC (uint16x4x4_t, uint16x4_t, uint16x8x4_t, uint16_t, v8hi, hi,
+		 u16, int16x8_t)
+__LD4_LANE_FUNC (uint32x2x4_t, uint32x2_t, uint32x4x4_t, uint32_t, v4si, si,
+		 u32, int32x4_t)
+__LD4_LANE_FUNC (uint64x1x4_t, uint64x1_t, uint64x2x4_t, uint64_t, v2di, di,
+		 u64, int64x2_t)
+
+#undef __LD4_LANE_FUNC
+#define __LD4_LANE_FUNC(intype, ptrtype, mode, ptrmode, funcsuffix)	   \
+__extension__ static __inline intype __attribute__ ((__always_inline__))   \
+vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{									   \
+  union { intype __i;							   \
+	  __builtin_aarch64_simd_xi __o; } __temp = { __b };		   \
+  __temp.__o = __builtin_aarch64_ld4_lane##mode (			   \
+	(__builtin_aarch64_simd_##ptrmode *) __ptr, __temp.__o, __c);	   \
+  return __temp.__i;							   \
+}
+
+__LD4_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32)
+__LD4_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64)
+__LD4_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8)
+__LD4_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16)
+__LD4_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8)
+__LD4_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16)
+__LD4_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32)
+__LD4_LANE_FUNC (int64x2x4_t, int64_t, v2di, di, s64)
+__LD4_LANE_FUNC (uint8x16x4_t, uint8_t, v16qi, qi, u8)
+__LD4_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16)
+__LD4_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32)
+__LD4_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64)
 
 #define __ST2_LANE_FUNC(intype, largetype, ptrtype,			     \
 			mode, ptr_mode, funcsuffix, signedtype)		     \
-- 
1.9.1


* [PATCH 0/4] [AARCH64,NEON] Improve various NEON load/store intrinsics
@ 2014-09-18 19:40 Charles Baylis
  2014-09-18 19:40 ` [PATCH 2/4] [AARCH64,NEON] Convert arm_neon.h to use new builtins for vld[234](q?)_lane_* Charles Baylis
                   ` (3 more replies)
  0 siblings, 4 replies; 12+ messages in thread
From: Charles Baylis @ 2014-09-18 19:40 UTC (permalink / raw)
  To: marcus.shawcroft, rearnsha, gcc-patches

This patch series improves the code generation for NEON structure loads and
stores.

Tested with make check on aarch64-oe-linux under qemu; the series also passes
clyon's NEON intrinsics tests.

Charles Baylis (4):
  [AARCH64,NEON] Add patterns + builtins for vld[234](q?)_lane_*
    intrinsics
  [AARCH64,NEON] Convert arm_neon.h to use new builtins for
    vld[234](q?)_lane_*
  [AARCH64,NEON] Fix unnecessary moves in vld[234]q_* intrinsics
  [AARCH64,NEON] Fix unnecessary moves in vst[234]q_* intrinsics

 gcc/config/aarch64/aarch64-builtins.c        |    5 +
 gcc/config/aarch64/aarch64-simd-builtins.def |    4 +
 gcc/config/aarch64/aarch64-simd.md           |   95 +++
 gcc/config/aarch64/aarch64.md                |    3 +
 gcc/config/aarch64/arm_neon.h                | 1043 ++++++++++++--------------
 5 files changed, 596 insertions(+), 554 deletions(-)

-- 
1.9.1


* [PATCH 1/4] [AARCH64,NEON] Add patterns + builtins for vld[234](q?)_lane_* intrinsics
  2014-09-18 19:40 [PATCH 0/4] [AARCH64,NEON] Improve various NEON load/store intrinsics Charles Baylis
                   ` (2 preceding siblings ...)
  2014-09-18 19:41 ` [PATCH 4/4] [AARCH64,NEON] Fix unnecessary moves in vst[234]q_* intrinsics Charles Baylis
@ 2014-09-18 19:41 ` Charles Baylis
  2014-09-19  8:40   ` Kyrill Tkachov
  2014-09-19 10:46   ` Tejas Belagod
  3 siblings, 2 replies; 12+ messages in thread
From: Charles Baylis @ 2014-09-18 19:41 UTC (permalink / raw)
  To: marcus.shawcroft, rearnsha, gcc-patches

This patch adds new patterns and builtins to represent single-lane structure
load instructions, which will be used to implement the vld[234](q?)_lane_*
intrinsics.
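
As a rough sketch of how these builtins are expected to be used (patch 2 of
this series converts arm_neon.h to them; the wrapper function name and the
constant lane index below are invented for illustration):

  /* Load lane 3 of a pair of int16x8_t vectors from interleaved data at
     __ptr, replacing that lane in the register pair packed into __regs.  */
  __builtin_aarch64_simd_oi
  ld2_lane_example (const int16_t *__ptr, __builtin_aarch64_simd_oi __regs)
  {
    return __builtin_aarch64_ld2_lanev8hi ((__builtin_aarch64_simd_hi *) __ptr,
					    __regs, 3);
  }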

Tested (with the rest of the patch series) with make check on aarch64-oe-linux
under qemu; it also causes no regressions in clyon's NEON intrinsics tests.

<DATE>  Charles Baylis  <charles.baylis@linaro.org>

	* config/aarch64/aarch64-builtins.c
	(aarch64_types_loadstruct_lane_qualifiers): Define.
	* config/aarch64/aarch64-simd-builtins.def (ld2_lane, ld3_lane,
	ld4_lane): New builtins.
	* config/aarch64/aarch64-simd.md (vec_load_lanesoi_lane<mode>): New
	pattern.
	(vec_load_lanesci_lane<mode>): Likewise.
	(vec_load_lanesxi_lane<mode>): Likewise.
	(aarch64_ld2_lane<VQ:mode>): New expand.
	(aarch64_ld3_lane<VQ:mode>): Likewise.
	(aarch64_ld4_lane<VQ:mode>): Likewise.

Change-Id: I205ab46aa3f3f2486cc163b93e1da080a87c3419
---
 gcc/config/aarch64/aarch64-builtins.c        |  5 ++
 gcc/config/aarch64/aarch64-simd-builtins.def |  4 ++
 gcc/config/aarch64/aarch64-simd.md           | 95 ++++++++++++++++++++++++++++
 gcc/config/aarch64/aarch64.md                |  3 +
 4 files changed, 107 insertions(+)

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 395b4ec..818729c 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -201,6 +201,11 @@ aarch64_types_load1_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_none, qualifier_const_pointer_map_mode };
 #define TYPES_LOAD1 (aarch64_types_load1_qualifiers)
 #define TYPES_LOADSTRUCT (aarch64_types_load1_qualifiers)
+static enum aarch64_type_qualifiers
+aarch64_types_loadstruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+  = { qualifier_none, qualifier_const_pointer_map_mode,
+      qualifier_none, qualifier_none };
+#define TYPES_LOADSTRUCT_LANE (aarch64_types_loadstruct_lane_qualifiers)
 
 static enum aarch64_type_qualifiers
 aarch64_types_bsl_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index de264c4..5d3e122 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -83,6 +83,10 @@
   BUILTIN_VQ (LOADSTRUCT, ld2, 0)
   BUILTIN_VQ (LOADSTRUCT, ld3, 0)
   BUILTIN_VQ (LOADSTRUCT, ld4, 0)
+  /* Implemented by aarch64_ld<VSTRUCT:nregs>_lane<VQ:mode>.  */
+  BUILTIN_VQ (LOADSTRUCT_LANE, ld2_lane, 0)
+  BUILTIN_VQ (LOADSTRUCT_LANE, ld3_lane, 0)
+  BUILTIN_VQ (LOADSTRUCT_LANE, ld4_lane, 0)
   /* Implemented by aarch64_st<VSTRUCT:nregs><VDC:mode>.  */
   BUILTIN_VDC (STORESTRUCT, st2, 0)
   BUILTIN_VDC (STORESTRUCT, st3, 0)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 493e886..f6c4018 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4003,6 +4003,18 @@
   [(set_attr "type" "neon_load2_2reg<q>")]
 )
 
+(define_insn "vec_load_lanesoi_lane<mode>"
+  [(set (match_operand:OI 0 "register_operand" "=w")
+	(unspec:OI [(match_operand:<V_TWO_ELEM> 1 "aarch64_simd_struct_operand" "Utv")
+		    (match_operand:OI 2 "register_operand" "0")
+		    (match_operand:SI 3 "immediate_operand" "i")
+		    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY) ]
+		   UNSPEC_LD2_LANE))]
+  "TARGET_SIMD"
+  "ld2\\t{%S0.<Vetype> - %T0.<Vetype>}[%3], %1"
+  [(set_attr "type" "neon_load2_one_lane<q>")]
+)
+
 (define_insn "vec_store_lanesoi<mode>"
   [(set (match_operand:OI 0 "aarch64_simd_struct_operand" "=Utv")
 	(unspec:OI [(match_operand:OI 1 "register_operand" "w")
@@ -4034,6 +4046,18 @@
   [(set_attr "type" "neon_load3_3reg<q>")]
 )
 
+(define_insn "vec_load_lanesci_lane<mode>"
+  [(set (match_operand:CI 0 "register_operand" "=w")
+	(unspec:CI [(match_operand:<V_THREE_ELEM> 1 "aarch64_simd_struct_operand" "Utv")
+		    (match_operand:CI 2 "register_operand" "0")
+		    (match_operand:SI 3 "immediate_operand" "i")
+		    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+		   UNSPEC_LD3_LANE))]
+  "TARGET_SIMD"
+  "ld3\\t{%S0.<Vetype> - %U0.<Vetype>}[%3], %1"
+  [(set_attr "type" "neon_load3_one_lane<q>")]
+)
+
 (define_insn "vec_store_lanesci<mode>"
   [(set (match_operand:CI 0 "aarch64_simd_struct_operand" "=Utv")
 	(unspec:CI [(match_operand:CI 1 "register_operand" "w")
@@ -4065,6 +4089,18 @@
   [(set_attr "type" "neon_load4_4reg<q>")]
 )
 
+(define_insn "vec_load_lanesxi_lane<mode>"
+  [(set (match_operand:XI 0 "register_operand" "=w")
+	(unspec:XI [(match_operand:<V_FOUR_ELEM> 1 "aarch64_simd_struct_operand" "Utv")
+		    (match_operand:XI 2 "register_operand" "0")
+		    (match_operand:SI 3 "immediate_operand" "i")
+		    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+		   UNSPEC_LD4_LANE))]
+  "TARGET_SIMD"
+  "ld4\\t{%S0.<Vetype> - %V0.<Vetype>}[%3], %1"
+  [(set_attr "type" "neon_load4_one_lane<q>")]
+)
+
 (define_insn "vec_store_lanesxi<mode>"
   [(set (match_operand:XI 0 "aarch64_simd_struct_operand" "=Utv")
 	(unspec:XI [(match_operand:XI 1 "register_operand" "w")
@@ -4378,6 +4414,65 @@
   DONE;
 })
 
+(define_expand "aarch64_ld2_lane<VQ:mode>"
+  [(match_operand:OI 0 "register_operand" "=w")
+	(match_operand:DI 1 "register_operand" "w")
+	(match_operand:OI 2 "register_operand" "0")
+	(match_operand:SI 3 "immediate_operand" "i")
+	(unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  enum machine_mode mode = <V_TWO_ELEM>mode;
+  rtx mem = gen_rtx_MEM (mode, operands[1]);
+  operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3])));
+
+  emit_insn (gen_vec_load_lanesoi_lane<VQ:mode> (operands[0],
+						  mem,
+						  operands[2],
+						  operands[3]));
+  DONE;
+})
+
+(define_expand "aarch64_ld3_lane<VQ:mode>"
+  [(match_operand:CI 0 "register_operand" "=w")
+	(match_operand:DI 1 "register_operand" "w")
+	(match_operand:CI 2 "register_operand" "0")
+	(match_operand:SI 3 "immediate_operand" "i")
+	(unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  enum machine_mode mode = <V_THREE_ELEM>mode;
+  rtx mem = gen_rtx_MEM (mode, operands[1]);
+  operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3])));
+
+  emit_insn (gen_vec_load_lanesci_lane<VQ:mode> (operands[0],
+						  mem,
+						  operands[2],
+						  operands[3]));
+  DONE;
+})
+
+(define_expand "aarch64_ld4_lane<VQ:mode>"
+  [(match_operand:XI 0 "register_operand" "=w")
+	(match_operand:DI 1 "register_operand" "w")
+	(match_operand:XI 2 "register_operand" "0")
+	(match_operand:SI 3 "immediate_operand" "i")
+	(unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  enum machine_mode mode = <V_FOUR_ELEM>mode;
+  rtx mem = gen_rtx_MEM (mode, operands[1]);
+  operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3])));
+
+  emit_insn (gen_vec_load_lanesxi_lane<VQ:mode> (operands[0],
+						  mem,
+						  operands[2],
+						  operands[3]));
+  DONE;
+})
+
+
+
 ;; Expanders for builtins to extract vector registers from large
 ;; opaque integer modes.
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index c60038a..ea924ab 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -92,6 +92,9 @@
     UNSPEC_LD2
     UNSPEC_LD3
     UNSPEC_LD4
+    UNSPEC_LD2_LANE
+    UNSPEC_LD3_LANE
+    UNSPEC_LD4_LANE
     UNSPEC_MB
     UNSPEC_NOP
     UNSPEC_PRLG_STK
-- 
1.9.1


* [PATCH 4/4] [AARCH64,NEON] Fix unnecessary moves in vst[234]q_* intrinsics
  2014-09-18 19:40 [PATCH 0/4] [AARCH64,NEON] Improve various NEON load/store intrinsics Charles Baylis
  2014-09-18 19:40 ` [PATCH 2/4] [AARCH64,NEON] Convert arm_neon.h to use new builtins for vld[234](q?)_lane_* Charles Baylis
  2014-09-18 19:41 ` [PATCH 3/4] [AARCH64,NEON] Fix unnecessary moves in vld[234]q_* intrinsics Charles Baylis
@ 2014-09-18 19:41 ` Charles Baylis
  2014-09-18 19:41 ` [PATCH 1/4] [AARCH64,NEON] Add patterns + builtins for vld[234](q?)_lane_* intrinsics Charles Baylis
  3 siblings, 0 replies; 12+ messages in thread
From: Charles Baylis @ 2014-09-18 19:41 UTC (permalink / raw)
  To: marcus.shawcroft, rearnsha, gcc-patches

This patch improves code generation of vst[234]q_* intrinsics by avoiding use
of the __builtin_aarch64_set_qreg_* builtins to generate a temporary
__builtin_aarch64_simd_XX variable. Instead, a union is used for type-punning,
which avoids generation of some unnecessary move instructions. This idiom is
already used in several other intrinsics.
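
For clarity, here is one of the converted functions with the reasoning spelled
out in comments (the code is the same as in the hunk below; only the comments
are added here):

  __extension__ static __inline void __attribute__ ((__always_inline__))
  vst2q_s32 (int32_t * __a, int32x4x2_t val)
  {
    /* int32x4x2_t and __builtin_aarch64_simd_oi describe the same pair of
       Q registers, so reading the other union member reinterprets the
       incoming value in place...  */
    union { int32x4x2_t __i;
	    __builtin_aarch64_simd_oi __o; } __temp = { val };
    /* ...instead of rebuilding __o one vector at a time with
       __builtin_aarch64_set_qregoiv4si, which is what caused the extra
       register moves.  */
    __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __temp.__o);
  }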

This patch is independent of the previous patches in the series.

Tested (with the rest of the patch series) with make check on aarch64-oe-linux
under qemu; it also causes no regressions in clyon's NEON intrinsics tests.

<DATE>  Charles Baylis  <charles.baylis@linaro.org>

	* config/aarch64/arm_neon.h (vst2q_s8, vst2q_p8, vst2q_s16, vst2q_p16,
	vst2q_s32, vst2q_s64, vst2q_u8, vst2q_u16, vst2q_u32, vst2q_u64,
	vst2q_f32, vst2q_f64, vst3q_s8, vst3q_p8, vst3q_s16, vst3q_p16,
	vst3q_s32, vst3q_s64, vst3q_u8, vst3q_u16, vst3q_u32, vst3q_u64,
	vst3q_f32, vst3q_f64, vst4q_s8, vst4q_p8, vst4q_s16, vst4q_p16,
	vst4q_s32, vst4q_s64, vst4q_u8, vst4q_u16, vst4q_u32, vst4q_u64,
	vst4q_f32, vst4q_f64): Use type-punning to convert between NEON
	intrinsic types and __builtin_aarch64_simd* types.

Change-Id: I789c68fc8d9458638eb00a15ffa28073bdc969a8
---
 gcc/config/aarch64/arm_neon.h | 288 ++++++++++++++++--------------------------
 1 file changed, 108 insertions(+), 180 deletions(-)

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 87e3baf..3292ce0 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -22493,109 +22493,97 @@ vst2_f32 (float32_t * __a, float32x2x2_t val)
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst2q_s8 (int8_t * __a, int8x16x2_t val)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
-  __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+  union { int8x16x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp = { val };
+  __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst2q_p8 (poly8_t * __a, poly8x16x2_t val)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
-  __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+  union { poly8x16x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp = { val };
+  __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst2q_s16 (int16_t * __a, int16x8x2_t val)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
-  __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+  union { int16x8x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp = { val };
+  __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst2q_p16 (poly16_t * __a, poly16x8x2_t val)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
-  __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+  union { poly16x8x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp = { val };
+  __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst2q_s32 (int32_t * __a, int32x4x2_t val)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1);
-  __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o);
+  union { int32x4x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp = { val };
+  __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst2q_s64 (int64_t * __a, int64x2x2_t val)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1);
-  __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+  union { int64x2x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp = { val };
+  __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst2q_u8 (uint8_t * __a, uint8x16x2_t val)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
-  __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+  union { uint8x16x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp = { val };
+  __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst2q_u16 (uint16_t * __a, uint16x8x2_t val)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
-  __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+  union { uint16x8x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp = { val };
+  __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst2q_u32 (uint32_t * __a, uint32x4x2_t val)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1);
-  __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o);
+  union { uint32x4x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp = { val };
+  __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst2q_u64 (uint64_t * __a, uint64x2x2_t val)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1);
-  __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+  union { uint64x2x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp = { val };
+  __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst2q_f32 (float32_t * __a, float32x4x2_t val)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[1], 1);
-  __builtin_aarch64_st2v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
+  union { float32x4x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp = { val };
+  __builtin_aarch64_st2v4sf ((__builtin_aarch64_simd_sf *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst2q_f64 (float64_t * __a, float64x2x2_t val)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[1], 1);
-  __builtin_aarch64_st2v2df ((__builtin_aarch64_simd_df *) __a, __o);
+  union { float64x2x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp = { val };
+  __builtin_aarch64_st2v2df ((__builtin_aarch64_simd_df *) __a, __temp.__o);
 }
 
 __extension__ static __inline void
@@ -22769,121 +22757,97 @@ vst3_f32 (float32_t * __a, float32x2x3_t val)
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst3q_s8 (int8_t * __a, int8x16x3_t val)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
-  __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+  union { int8x16x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp = { val };
+  __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst3q_p8 (poly8_t * __a, poly8x16x3_t val)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
-  __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+  union { poly8x16x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp = { val };
+  __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst3q_s16 (int16_t * __a, int16x8x3_t val)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
-  __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+  union { int16x8x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp = { val };
+  __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst3q_p16 (poly16_t * __a, poly16x8x3_t val)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
-  __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+  union { poly16x8x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp = { val };
+  __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst3q_s32 (int32_t * __a, int32x4x3_t val)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2);
-  __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o);
+  union { int32x4x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp = { val };
+  __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst3q_s64 (int64_t * __a, int64x2x3_t val)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2);
-  __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o);
+  union { int64x2x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp = { val };
+  __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst3q_u8 (uint8_t * __a, uint8x16x3_t val)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
-  __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+  union { uint8x16x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp = { val };
+  __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst3q_u16 (uint16_t * __a, uint16x8x3_t val)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
-  __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+  union { uint16x8x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp = { val };
+  __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst3q_u32 (uint32_t * __a, uint32x4x3_t val)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2);
-  __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o);
+  union { uint32x4x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp = { val };
+  __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst3q_u64 (uint64_t * __a, uint64x2x3_t val)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2);
-  __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o);
+  union { uint64x2x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp = { val };
+  __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst3q_f32 (float32_t * __a, float32x4x3_t val)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[2], 2);
-  __builtin_aarch64_st3v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
+  union { float32x4x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp = { val };
+  __builtin_aarch64_st3v4sf ((__builtin_aarch64_simd_sf *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst3q_f64 (float64_t * __a, float64x2x3_t val)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[2], 2);
-  __builtin_aarch64_st3v2df ((__builtin_aarch64_simd_df *) __a, __o);
+  union { float64x2x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp = { val };
+  __builtin_aarch64_st3v2df ((__builtin_aarch64_simd_df *) __a, __temp.__o);
 }
 
 __extension__ static __inline void
@@ -23081,133 +23045,97 @@ vst4_f32 (float32_t * __a, float32x2x4_t val)
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst4q_s8 (int8_t * __a, int8x16x4_t val)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3);
-  __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+  union { int8x16x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp = { val };
+  __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst4q_p8 (poly8_t * __a, poly8x16x4_t val)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3);
-  __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+  union { poly8x16x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp = { val };
+  __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst4q_s16 (int16_t * __a, int16x8x4_t val)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3);
-  __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+  union { int16x8x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp = { val };
+  __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst4q_p16 (poly16_t * __a, poly16x8x4_t val)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3);
-  __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+  union { poly16x8x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp = { val };
+  __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst4q_s32 (int32_t * __a, int32x4x4_t val)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[3], 3);
-  __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o);
+  union { int32x4x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp = { val };
+  __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst4q_s64 (int64_t * __a, int64x2x4_t val)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[3], 3);
-  __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o);
+  union { int64x2x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp = { val };
+  __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst4q_u8 (uint8_t * __a, uint8x16x4_t val)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3);
-  __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+  union { uint8x16x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp = { val };
+  __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst4q_u16 (uint16_t * __a, uint16x8x4_t val)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3);
-  __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+  union { uint16x8x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp = { val };
+  __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst4q_u32 (uint32_t * __a, uint32x4x4_t val)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[3], 3);
-  __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o);
+  union { uint32x4x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp = { val };
+  __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst4q_u64 (uint64_t * __a, uint64x2x4_t val)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[3], 3);
-  __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o);
+  union { uint64x2x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp = { val };
+  __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst4q_f32 (float32_t * __a, float32x4x4_t val)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[3], 3);
-  __builtin_aarch64_st4v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
+  union { float32x4x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp = { val };
+  __builtin_aarch64_st4v4sf ((__builtin_aarch64_simd_sf *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst4q_f64 (float64_t * __a, float64x2x4_t val)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[3], 3);
-  __builtin_aarch64_st4v2df ((__builtin_aarch64_simd_df *) __a, __o);
+  union { float64x2x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp = { val };
+  __builtin_aarch64_st4v2df ((__builtin_aarch64_simd_df *) __a, __temp.__o);
 }
 
 /* vsub */
-- 
1.9.1


* [PATCH 3/4] [AARCH64,NEON] Fix unnecessary moves in vld[234]q_* intrinsics
  2014-09-18 19:40 [PATCH 0/4] [AARCH64,NEON] Improve various NEON load/store intrinsics Charles Baylis
  2014-09-18 19:40 ` [PATCH 2/4] [AARCH64,NEON] Convert arm_neon.h to use new builtins for vld[234](q?)_lane_* Charles Baylis
@ 2014-09-18 19:41 ` Charles Baylis
  2014-09-18 19:41 ` [PATCH 4/4] [AARCH64,NEON] Fix unnecessary moves in vst[234]q_* intrinsics Charles Baylis
  2014-09-18 19:41 ` [PATCH 1/4] [AARCH64,NEON] Add patterns + builtins for vld[234](q?)_lane_* intrinsics Charles Baylis
  3 siblings, 0 replies; 12+ messages in thread
From: Charles Baylis @ 2014-09-18 19:41 UTC (permalink / raw)
  To: marcus.shawcroft, rearnsha, gcc-patches

This patch improves code generation of vld[234]q_* intrinsics by avoiding use
of the __builtin_aarch64_get_qreg_* builtins to generate a temporary result
variable. Instead, a union is used for type-punning, which avoids generation of
some unnecessary move instructions. This idiom is already used in several other
intrinsics.
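
Again for clarity, one of the converted loads with the reasoning spelled out
in comments (the code matches the hunk below; only the comments are added
here):

  __extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
  vld2q_s32 (const int32_t * __a)
  {
    union { int32x4x2_t __i;
	    __builtin_aarch64_simd_oi __o; } __temp;
    /* The builtin returns both Q registers packed into an OImode value;
       reading the other union member reinterprets that value as
       int32x4x2_t, avoiding the per-vector
       __builtin_aarch64_get_qregoiv4si extracts and their moves.  */
    __temp.__o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a);
    return __temp.__i;
  }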

This patch is independent of the previous patches in the series.

Tested (with the rest of the patch series) with make check on aarch64-oe-linux
under qemu; it also causes no regressions in clyon's NEON intrinsics tests.

<DATE>  Charles Baylis  <charles.baylis@linaro.org>

	* config/aarch64/arm_neon.h (vld2q_s8, vld2q_p8, vld2q_s16, vld2q_p16,
	vld2q_s32, vld2q_s64, vld2q_u8, vld2q_u16, vld2q_u32, vld2q_u64,
	vld2q_f32, vld2q_f64, vld3q_s8, vld3q_p8, vld3q_s16, vld3q_p16,
	vld3q_s32, vld3q_s64, vld3q_u8, vld3q_u16, vld3q_u32, vld3q_u64,
	vld3q_f32, vld3q_f64, vld4q_s8, vld4q_p8, vld4q_s16, vld4q_p16,
	vld4q_s32, vld4q_s64, vld4q_u8, vld4q_u16, vld4q_u32, vld4q_u64,
	vld4q_f32, vld4q_f64): Use type-punning to convert between NEON
	intrinsic types and __builtin_aarch64_simd* types.

Change-Id: I61efa29138b13c7a83679885343211d604a73b15
---
 gcc/config/aarch64/arm_neon.h | 396 +++++++++++++++---------------------------
 1 file changed, 144 insertions(+), 252 deletions(-)

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index c1fcb47..87e3baf 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -16969,133 +16969,109 @@ vld2_f32 (const float32_t * __a)
 __extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
 vld2q_s8 (const int8_t * __a)
 {
-  int8x16x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
-  ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
-  return ret;
+  union { int8x16x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
 vld2q_p8 (const poly8_t * __a)
 {
-  poly8x16x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
-  ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
-  return ret;
+  union { poly8x16x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
 vld2q_s16 (const int16_t * __a)
 {
-  int16x8x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
-  ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
-  return ret;
+  union { int16x8x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
 vld2q_p16 (const poly16_t * __a)
 {
-  poly16x8x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
-  ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
-  return ret;
+  union { poly16x8x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
 vld2q_s32 (const int32_t * __a)
 {
-  int32x4x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0);
-  ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1);
-  return ret;
+  union { int32x4x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int64x2x2_t __attribute__ ((__always_inline__))
 vld2q_s64 (const int64_t * __a)
 {
-  int64x2x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0);
-  ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1);
-  return ret;
+  union { int64x2x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
 vld2q_u8 (const uint8_t * __a)
 {
-  uint8x16x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
-  ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
-  return ret;
+  union { uint8x16x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
 vld2q_u16 (const uint16_t * __a)
 {
-  uint16x8x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
-  ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
-  return ret;
+  union { uint16x8x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
 vld2q_u32 (const uint32_t * __a)
 {
-  uint32x4x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0);
-  ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1);
-  return ret;
+  union { uint32x4x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint64x2x2_t __attribute__ ((__always_inline__))
 vld2q_u64 (const uint64_t * __a)
 {
-  uint64x2x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0);
-  ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1);
-  return ret;
+  union { uint64x2x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
 vld2q_f32 (const float32_t * __a)
 {
-  float32x4x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v4sf ((const __builtin_aarch64_simd_sf *) __a);
-  ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 0);
-  ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 1);
-  return ret;
+  union { float32x4x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v4sf ((const __builtin_aarch64_simd_sf *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline float64x2x2_t __attribute__ ((__always_inline__))
 vld2q_f64 (const float64_t * __a)
 {
-  float64x2x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v2df ((const __builtin_aarch64_simd_df *) __a);
-  ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 0);
-  ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 1);
-  return ret;
+  union { float64x2x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v2df ((const __builtin_aarch64_simd_df *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int64x1x3_t __attribute__ ((__always_inline__))
@@ -17245,145 +17221,109 @@ vld3_f32 (const float32_t * __a)
 __extension__ static __inline int8x16x3_t __attribute__ ((__always_inline__))
 vld3q_s8 (const int8_t * __a)
 {
-  int8x16x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
-  ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
-  ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
-  return ret;
+  union { int8x16x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline poly8x16x3_t __attribute__ ((__always_inline__))
 vld3q_p8 (const poly8_t * __a)
 {
-  poly8x16x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
-  ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
-  ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
-  return ret;
+  union { poly8x16x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int16x8x3_t __attribute__ ((__always_inline__))
 vld3q_s16 (const int16_t * __a)
 {
-  int16x8x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
-  ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
-  ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
-  return ret;
+  union { int16x8x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline poly16x8x3_t __attribute__ ((__always_inline__))
 vld3q_p16 (const poly16_t * __a)
 {
-  poly16x8x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
-  ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
-  ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
-  return ret;
+  union { poly16x8x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int32x4x3_t __attribute__ ((__always_inline__))
 vld3q_s32 (const int32_t * __a)
 {
-  int32x4x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v4si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0);
-  ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1);
-  ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2);
-  return ret;
+  union { int32x4x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v4si ((const __builtin_aarch64_simd_si *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int64x2x3_t __attribute__ ((__always_inline__))
 vld3q_s64 (const int64_t * __a)
 {
-  int64x2x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0);
-  ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1);
-  ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2);
-  return ret;
+  union { int64x2x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint8x16x3_t __attribute__ ((__always_inline__))
 vld3q_u8 (const uint8_t * __a)
 {
-  uint8x16x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
-  ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
-  ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
-  return ret;
+  union { uint8x16x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint16x8x3_t __attribute__ ((__always_inline__))
 vld3q_u16 (const uint16_t * __a)
 {
-  uint16x8x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
-  ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
-  ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
-  return ret;
+  union { uint16x8x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint32x4x3_t __attribute__ ((__always_inline__))
 vld3q_u32 (const uint32_t * __a)
 {
-  uint32x4x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v4si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0);
-  ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1);
-  ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2);
-  return ret;
+  union { uint32x4x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v4si ((const __builtin_aarch64_simd_si *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint64x2x3_t __attribute__ ((__always_inline__))
 vld3q_u64 (const uint64_t * __a)
 {
-  uint64x2x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0);
-  ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1);
-  ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2);
-  return ret;
+  union { uint64x2x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
 vld3q_f32 (const float32_t * __a)
 {
-  float32x4x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v4sf ((const __builtin_aarch64_simd_sf *) __a);
-  ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 0);
-  ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 1);
-  ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 2);
-  return ret;
+  union { float32x4x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v4sf ((const __builtin_aarch64_simd_sf *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline float64x2x3_t __attribute__ ((__always_inline__))
 vld3q_f64 (const float64_t * __a)
 {
-  float64x2x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v2df ((const __builtin_aarch64_simd_df *) __a);
-  ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 0);
-  ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 1);
-  ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 2);
-  return ret;
+  union { float64x2x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v2df ((const __builtin_aarch64_simd_df *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int64x1x4_t __attribute__ ((__always_inline__))
@@ -17545,157 +17485,109 @@ vld4_f32 (const float32_t * __a)
 __extension__ static __inline int8x16x4_t __attribute__ ((__always_inline__))
 vld4q_s8 (const int8_t * __a)
 {
-  int8x16x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0);
-  ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1);
-  ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
-  ret.val[3] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
-  return ret;
+  union { int8x16x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline poly8x16x4_t __attribute__ ((__always_inline__))
 vld4q_p8 (const poly8_t * __a)
 {
-  poly8x16x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0);
-  ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1);
-  ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
-  ret.val[3] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
-  return ret;
+  union { poly8x16x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int16x8x4_t __attribute__ ((__always_inline__))
 vld4q_s16 (const int16_t * __a)
 {
-  int16x8x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0);
-  ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1);
-  ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
-  ret.val[3] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
-  return ret;
+  union { int16x8x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline poly16x8x4_t __attribute__ ((__always_inline__))
 vld4q_p16 (const poly16_t * __a)
 {
-  poly16x8x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0);
-  ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1);
-  ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
-  ret.val[3] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
-  return ret;
+  union { poly16x8x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int32x4x4_t __attribute__ ((__always_inline__))
 vld4q_s32 (const int32_t * __a)
 {
-  int32x4x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v4si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0);
-  ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1);
-  ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2);
-  ret.val[3] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3);
-  return ret;
+  union { int32x4x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v4si ((const __builtin_aarch64_simd_si *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int64x2x4_t __attribute__ ((__always_inline__))
 vld4q_s64 (const int64_t * __a)
 {
-  int64x2x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 0);
-  ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1);
-  ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2);
-  ret.val[3] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3);
-  return ret;
+  union { int64x2x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint8x16x4_t __attribute__ ((__always_inline__))
 vld4q_u8 (const uint8_t * __a)
 {
-  uint8x16x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0);
-  ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1);
-  ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
-  ret.val[3] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
-  return ret;
+  union { uint8x16x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint16x8x4_t __attribute__ ((__always_inline__))
 vld4q_u16 (const uint16_t * __a)
 {
-  uint16x8x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0);
-  ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1);
-  ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
-  ret.val[3] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
-  return ret;
+  union { uint16x8x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint32x4x4_t __attribute__ ((__always_inline__))
 vld4q_u32 (const uint32_t * __a)
 {
-  uint32x4x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v4si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0);
-  ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1);
-  ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2);
-  ret.val[3] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3);
-  return ret;
+  union { uint32x4x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v4si ((const __builtin_aarch64_simd_si *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint64x2x4_t __attribute__ ((__always_inline__))
 vld4q_u64 (const uint64_t * __a)
 {
-  uint64x2x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 0);
-  ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1);
-  ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2);
-  ret.val[3] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3);
-  return ret;
+  union { uint64x2x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
 vld4q_f32 (const float32_t * __a)
 {
-  float32x4x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v4sf ((const __builtin_aarch64_simd_sf *) __a);
-  ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 0);
-  ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 1);
-  ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 2);
-  ret.val[3] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 3);
-  return ret;
+  union { float32x4x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v4sf ((const __builtin_aarch64_simd_sf *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline float64x2x4_t __attribute__ ((__always_inline__))
 vld4q_f64 (const float64_t * __a)
 {
-  float64x2x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v2df ((const __builtin_aarch64_simd_df *) __a);
-  ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 0);
-  ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 1);
-  ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 2);
-  ret.val[3] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 3);
-  return ret;
+  union { float64x2x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v2df ((const __builtin_aarch64_simd_df *) __a);
+  return __temp.__i;
 }
 
 /* vmax */
-- 
1.9.1

* Re: [PATCH 1/4] [AARCH64,NEON] Add patterns + builtins for vld[234](q?)_lane_* intrinsics
  2014-09-18 19:41 ` [PATCH 1/4] [AARCH64,NEON] Add patterns + builtins for vld[234](q?)_lane_* intrinsics Charles Baylis
@ 2014-09-19  8:40   ` Kyrill Tkachov
  2014-09-19 10:46   ` Tejas Belagod
  1 sibling, 0 replies; 12+ messages in thread
From: Kyrill Tkachov @ 2014-09-19  8:40 UTC (permalink / raw)
  To: Charles Baylis, Marcus Shawcroft, Richard Earnshaw, gcc-patches

Hi Charles,

Good to see these intrinsics being brought into the modern world :)
Some style comments inline.

On 18/09/14 20:38, Charles Baylis wrote:
> This patch adds new patterns and builtins to represent single lane structure
> loads instructions, which will be used to implement the vld[234](q?)_lane_*
> intrinsics.
>
> Tested (with the rest of the patch series) with make check on aarch64-oe-linux
> with qemu, and also causes no regressions in clyon's NEON intrinsics tests.
>
> <DATE>  Charles Baylis  <charles.baylis@linaro.org>
> 	* config/aarch64/aarch64-builtins.c
> 	(aarch64_types_loadstruct_lane_qualifiers): Define.
> 	* config/aarch64/aarch64-simd-builtins.def (ld2_lane, ld3_lane,
> 	ld4_lane): New builtins.
> 	* config/aarch64/aarch64-simd.md (vec_load_lanesoi_lane<mode>): New
> 	pattern.
> 	(vec_load_lanesci_lane<mode>): Likewise.
> 	(vec_load_lanesxi_lane<mode>): Likewise.
> 	(aarch64_ld2_lane<VQ:mode>): New expand.
> 	(aarch64_ld3_lane<VQ:mode>): Likewise.
> 	(aarch64_ld4_lane<VQ:mode>): Likewise.

This is missing an entry for the config/aarch64/aarch64.md hunk.

> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 493e886..f6c4018 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -4003,6 +4003,18 @@
>     [(set_attr "type" "neon_load2_2reg<q>")]
>   )
>   
> +(define_insn "vec_load_lanesoi_lane<mode>"
> +  [(set (match_operand:OI 0 "register_operand" "=w")
> +	(unspec:OI [(match_operand:<V_TWO_ELEM> 1 "aarch64_simd_struct_operand" "Utv")
> +		    (match_operand:OI 2 "register_operand" "0")
> +		    (match_operand:SI 3 "immediate_operand" "i")
> +		    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY) ]
> +		   UNSPEC_LD2_LANE))]
> +  "TARGET_SIMD"
> +  "ld2\\t{%S0.<Vetype> - %T0.<Vetype>}[%3], %1"
> +  [(set_attr "type" "neon_load2_one_lane<q>")]
> +)

The VQ mode iterator covers only the 128-bit modes, so the "type" 
attribute here will always be neon_load2_one_lane_q. Using the <q> mode 
attribute is still correct, but personally I think it makes it just that 
little bit harder to figure out for a newbie who has to open iterators.md 
to see what it expands to, or for someone who's not sure whether the 'q' 
is added with an underscore or without. I would just use 
neon_load2_one_lane_q.
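
Concretely, that would just be:

   [(set_attr "type" "neon_load2_one_lane_q")]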

>   
> +(define_insn "vec_load_lanesci_lane<mode>"
> +  [(set (match_operand:CI 0 "register_operand" "=w")
> +	(unspec:CI [(match_operand:<V_THREE_ELEM> 1 "aarch64_simd_struct_operand" "Utv")
> +		    (match_operand:CI 2 "register_operand" "0")
> +		    (match_operand:SI 3 "immediate_operand" "i")
> +		    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> +		   UNSPEC_LD3_LANE))]
> +  "TARGET_SIMD"
> +  "ld3\\t{%S0.<Vetype> - %U0.<Vetype>}[%3], %1"
> +  [(set_attr "type" "neon_load3_one_lane<q>")]
> +)

Likewise.

>   
> +(define_insn "vec_load_lanesxi_lane<mode>"
> +  [(set (match_operand:XI 0 "register_operand" "=w")
> +	(unspec:XI [(match_operand:<V_FOUR_ELEM> 1 "aarch64_simd_struct_operand" "Utv")
> +		    (match_operand:XI 2 "register_operand" "0")
> +		    (match_operand:SI 3 "immediate_operand" "i")
> +		    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> +		   UNSPEC_LD4_LANE))]
> +  "TARGET_SIMD"
> +  "ld4\\t{%S0.<Vetype> - %V0.<Vetype>}[%3], %1"
> +  [(set_attr "type" "neon_load4_one_lane<q>")]
> +)

Same here.

>   
> +(define_expand "aarch64_ld2_lane<VQ:mode>"
> +  [(match_operand:OI 0 "register_operand" "=w")
> +	(match_operand:DI 1 "register_operand" "w")
> +	(match_operand:OI 2 "register_operand" "0")
> +	(match_operand:SI 3 "immediate_operand" "i")
> +	(unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> +  "TARGET_SIMD"
> +{
> +  enum machine_mode mode = <V_TWO_ELEM>mode;
> +  rtx mem = gen_rtx_MEM (mode, operands[1]);
> +  operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3])));
> +
> +  emit_insn (gen_vec_load_lanesoi_lane<VQ:mode> (operands[0],
> +						  mem,
> +						  operands[2],
> +						  operands[3]));
> +  DONE;
> +})

I think saying <VQ:mode> is redundant since VQ is the only mode iterator 
in the pattern.
Just <mode> should work, right?
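
That is, assuming VQ stays the sole iterator, simply:

  (define_expand "aarch64_ld2_lane<mode>"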

> +
> +(define_expand "aarch64_ld3_lane<VQ:mode>"
> +  [(match_operand:CI 0 "register_operand" "=w")
> +	(match_operand:DI 1 "register_operand" "w")
> +	(match_operand:CI 2 "register_operand" "0")
> +	(match_operand:SI 3 "immediate_operand" "i")
> +	(unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> +  "TARGET_SIMD"
> +{
> +  enum machine_mode mode = <V_THREE_ELEM>mode;
> +  rtx mem = gen_rtx_MEM (mode, operands[1]);
> +  operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3])));
> +
> +  emit_insn (gen_vec_load_lanesci_lane<VQ:mode> (operands[0],
> +						  mem,
> +						  operands[2],
> +						  operands[3]));
> +  DONE;
> +})
Likewise.

> +
> +(define_expand "aarch64_ld4_lane<VQ:mode>"
> +  [(match_operand:XI 0 "register_operand" "=w")
> +	(match_operand:DI 1 "register_operand" "w")
> +	(match_operand:XI 2 "register_operand" "0")
> +	(match_operand:SI 3 "immediate_operand" "i")
> +	(unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> +  "TARGET_SIMD"
> +{
> +  enum machine_mode mode = <V_FOUR_ELEM>mode;
> +  rtx mem = gen_rtx_MEM (mode, operands[1]);
> +  operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3])));
> +
> +  emit_insn (gen_vec_load_lanesxi_lane<VQ:mode> (operands[0],
> +						  mem,
> +						  operands[2],
> +						  operands[3]));
> +  DONE;
> +})
> +

Likewise.

> +
>   ;; Expanders for builtins to extract vector registers from large
>   ;; opaque integer modes.
>   
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index c60038a..ea924ab 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -92,6 +92,9 @@
>       UNSPEC_LD2
>       UNSPEC_LD3
>       UNSPEC_LD4
> +    UNSPEC_LD2_LANE
> +    UNSPEC_LD3_LANE
> +    UNSPEC_LD4_LANE
>       UNSPEC_MB
>       UNSPEC_NOP
>       UNSPEC_PRLG_STK


* Re: [PATCH 1/4] [AARCH64,NEON] Add patterns + builtins for vld[234](q?)_lane_* intrinsics
  2014-09-18 19:41 ` [PATCH 1/4] [AARCH64,NEON] Add patterns + builtins for vld[234](q?)_lane_* intrinsics Charles Baylis
  2014-09-19  8:40   ` Kyrill Tkachov
@ 2014-09-19 10:46   ` Tejas Belagod
  2014-09-24 16:36     ` Charles Baylis
  1 sibling, 1 reply; 12+ messages in thread
From: Tejas Belagod @ 2014-09-19 10:46 UTC (permalink / raw)
  To: Charles Baylis, Marcus Shawcroft, Richard Earnshaw, gcc-patches

>
> +(define_expand "aarch64_ld2_lane<VQ:mode>"
> +  [(match_operand:OI 0 "register_operand" "=w")
> +	(match_operand:DI 1 "register_operand" "w")
> +	(match_operand:OI 2 "register_operand" "0")
> +	(match_operand:SI 3 "immediate_operand" "i")
> +	(unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> +  "TARGET_SIMD"
> +{
> +  enum machine_mode mode = <V_TWO_ELEM>mode;
> +  rtx mem = gen_rtx_MEM (mode, operands[1]);
> +  operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3])));
> +

The endianness lane correction breaks this for BE.

You don't need the endianness lane correction here: NEON intrinsics are 
always called with the architectural lane number, irrespective of 
endianness. Unless, of course, you flip it somewhere to make it part of 
the RTL vec_select lane patterns, which you don't do here.

You could also do some lane-bounds checking here in the expander.
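
As a rough sketch (illustrative only, assuming the existing
aarch64_simd_lane_bounds helper is suitable here), the ld2 expander body
could drop the endianness flip and check the lane instead:

  enum machine_mode mode = <V_TWO_ELEM>mode;
  rtx mem = gen_rtx_MEM (mode, operands[1]);

  /* Reject out-of-range lane numbers at expand time; valid lanes are
     0 .. GET_MODE_NUNITS (<MODE>mode) - 1.  */
  aarch64_simd_lane_bounds (operands[3], 0, GET_MODE_NUNITS (<MODE>mode));

  emit_insn (gen_vec_load_lanesoi_lane<VQ:mode> (operands[0], mem,
						  operands[2], operands[3]));
  DONE;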

> +  emit_insn (gen_vec_load_lanesoi_lane<VQ:mode> (operands[0],
> +						  mem,
> +						  operands[2],
> +						  operands[3]));
> +  DONE;
> +})
> +
> +(define_expand "aarch64_ld3_lane<VQ:mode>"
> +  [(match_operand:CI 0 "register_operand" "=w")
> +	(match_operand:DI 1 "register_operand" "w")
> +	(match_operand:CI 2 "register_operand" "0")
> +	(match_operand:SI 3 "immediate_operand" "i")
> +	(unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> +  "TARGET_SIMD"
> +{
> +  enum machine_mode mode = <V_THREE_ELEM>mode;
> +  rtx mem = gen_rtx_MEM (mode, operands[1]);
> +  operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3])));
> +

No endianness correction for lanes necessary.

> +  emit_insn (gen_vec_load_lanesci_lane<VQ:mode> (operands[0],
> +						  mem,
> +						  operands[2],
> +						  operands[3]));
> +  DONE;
> +})
> +
> +(define_expand "aarch64_ld4_lane<VQ:mode>"
> +  [(match_operand:XI 0 "register_operand" "=w")
> +	(match_operand:DI 1 "register_operand" "w")
> +	(match_operand:XI 2 "register_operand" "0")
> +	(match_operand:SI 3 "immediate_operand" "i")
> +	(unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> +  "TARGET_SIMD"
> +{
> +  enum machine_mode mode = <V_FOUR_ELEM>mode;
> +  rtx mem = gen_rtx_MEM (mode, operands[1]);
> +  operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3])));
> +

Same.

> +  emit_insn (gen_vec_load_lanesxi_lane<VQ:mode> (operands[0],
> +						  mem,
> +						  operands[2],
> +						  operands[3]));
> +  DONE;
> +})
> +
> +
> +
>   ;; Expanders for builtins to extract vector registers from large
>   ;; opaque integer modes.
>
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index c60038a..ea924ab 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -92,6 +92,9 @@
>       UNSPEC_LD2
>       UNSPEC_LD3
>       UNSPEC_LD4
> +    UNSPEC_LD2_LANE
> +    UNSPEC_LD3_LANE
> +    UNSPEC_LD4_LANE
>       UNSPEC_MB
>       UNSPEC_NOP
>       UNSPEC_PRLG_STK
>


Thanks,
Tejas.

* Re: [PATCH 2/4] [AARCH64,NEON] Convert arm_neon.h to use new builtins for vld[234](q?)_lane_*
  2014-09-18 19:40 ` [PATCH 2/4] [AARCH64,NEON] Convert arm_neon.h to use new builtins for vld[234](q?)_lane_* Charles Baylis
@ 2014-09-19 11:21   ` Tejas Belagod
  2014-09-26  1:16     ` Charles Baylis
  0 siblings, 1 reply; 12+ messages in thread
From: Tejas Belagod @ 2014-09-19 11:21 UTC (permalink / raw)
  To: Charles Baylis, Marcus Shawcroft, Richard Earnshaw, gcc-patches

On 18/09/14 20:38, Charles Baylis wrote:
> This patch replaces the inline assembler implementations of the
> vld[234](q?)_lane_* intrinsics with new versions which exploit the new builtin
> functions added in patch 1.
>
> Tested (with the rest of the patch series) with make check on aarch64-oe-linux
> with qemu, and also causes no regressions in clyon's NEON intrinsics tests.
>
> <DATE>  Charles Baylis  <charles.baylis@linaro.org>
>
>          * config/aarch64/arm_neon.h (__LD2_LANE_FUNC): Rewrite using builtins,
>          update uses to use new macro arguments.
>          (__LD3_LANE_FUNC): Likewise.
>          (__LD4_LANE_FUNC): Likewise.
>
> Change-Id: I3bd5934b5c4f6127088193c1ab12848144d5540a
> ---
>   gcc/config/aarch64/arm_neon.h | 359 ++++++++++++++++++++++++++++--------------
>   1 file changed, 237 insertions(+), 122 deletions(-)
>
> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> index e62c783..c1fcb47 100644
> --- a/gcc/config/aarch64/arm_neon.h
> +++ b/gcc/config/aarch64/arm_neon.h
> @@ -11805,47 +11805,79 @@ __LD2R_FUNC (uint16x8x2_t, uint16x2_t, uint16_t, 8h, u16, q)
>   __LD2R_FUNC (uint32x4x2_t, uint32x2_t, uint32_t, 4s, u32, q)
>   __LD2R_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, 2d, u64, q)
>
> -#define __LD2_LANE_FUNC(rettype, ptrtype, regsuffix,                   \
> -                       lnsuffix, funcsuffix, Q)                        \
> -  __extension__ static __inline rettype                                        \
> -  __attribute__ ((__always_inline__))                                  \
> -  vld2 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr,               \
> -                                    rettype b, const int c)            \
> -  {                                                                    \
> -    rettype result;                                                    \
> -    __asm__ ("ld1 {v16." #regsuffix ", v17." #regsuffix "}, %1\n\t"    \
> -            "ld2 {v16." #lnsuffix ", v17." #lnsuffix "}[%3], %2\n\t"   \
> -            "st1 {v16." #regsuffix ", v17." #regsuffix "}, %0\n\t"     \
> -            : "=Q"(result)                                             \
> -            : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c)               \
> -            : "memory", "v16", "v17");                                 \
> -    return result;                                                     \
> -  }
> -
> -__LD2_LANE_FUNC (int8x8x2_t, uint8_t, 8b, b, s8,)
> -__LD2_LANE_FUNC (float32x2x2_t, float32_t, 2s, s, f32,)
> -__LD2_LANE_FUNC (float64x1x2_t, float64_t, 1d, d, f64,)
> -__LD2_LANE_FUNC (poly8x8x2_t, poly8_t, 8b, b, p8,)
> -__LD2_LANE_FUNC (poly16x4x2_t, poly16_t, 4h, h, p16,)
> -__LD2_LANE_FUNC (int16x4x2_t, int16_t, 4h, h, s16,)
> -__LD2_LANE_FUNC (int32x2x2_t, int32_t, 2s, s, s32,)
> -__LD2_LANE_FUNC (int64x1x2_t, int64_t, 1d, d, s64,)
> -__LD2_LANE_FUNC (uint8x8x2_t, uint8_t, 8b, b, u8,)
> -__LD2_LANE_FUNC (uint16x4x2_t, uint16_t, 4h, h, u16,)
> -__LD2_LANE_FUNC (uint32x2x2_t, uint32_t, 2s, s, u32,)
> -__LD2_LANE_FUNC (uint64x1x2_t, uint64_t, 1d, d, u64,)
> -__LD2_LANE_FUNC (float32x4x2_t, float32_t, 4s, s, f32, q)
> -__LD2_LANE_FUNC (float64x2x2_t, float64_t, 2d, d, f64, q)
> -__LD2_LANE_FUNC (poly8x16x2_t, poly8_t, 16b, b, p8, q)
> -__LD2_LANE_FUNC (poly16x8x2_t, poly16_t, 8h, h, p16, q)
> -__LD2_LANE_FUNC (int8x16x2_t, int8_t, 16b, b, s8, q)
> -__LD2_LANE_FUNC (int16x8x2_t, int16_t, 8h, h, s16, q)
> -__LD2_LANE_FUNC (int32x4x2_t, int32_t, 4s, s, s32, q)
> -__LD2_LANE_FUNC (int64x2x2_t, int64_t, 2d, d, s64, q)
> -__LD2_LANE_FUNC (uint8x16x2_t, uint8_t, 16b, b, u8, q)
> -__LD2_LANE_FUNC (uint16x8x2_t, uint16_t, 8h, h, u16, q)
> -__LD2_LANE_FUNC (uint32x4x2_t, uint32_t, 4s, s, u32, q)
> -__LD2_LANE_FUNC (uint64x2x2_t, uint64_t, 2d, d, u64, q)
> +#define __LD2_LANE_FUNC(intype, vectype, largetype, ptrtype,              \
> +                        mode, ptrmode, funcsuffix, signedtype)            \
> +__extension__ static __inline intype __attribute__ ((__always_inline__))   \
> +vld2_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c)  \
> +{                                                                         \
> +  __builtin_aarch64_simd_oi __o;                                          \
> +  largetype __temp;                                                       \
> +  __temp.val[0] =                                                         \
> +    vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0));         \
> +  __temp.val[1] =                                                         \
> +    vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0));         \
> +  __o = __builtin_aarch64_set_qregoi##mode (__o,                          \
> +                                          (signedtype) __temp.val[0],     \
> +                                          0);                             \
> +  __o = __builtin_aarch64_set_qregoi##mode (__o,                          \
> +                                          (signedtype) __temp.val[1],     \
> +                                          1);                             \
> +  __o =        __builtin_aarch64_ld2_lane##mode (                                 \
> +         (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c);           \
> +  __b.val[0] = (vectype) __builtin_aarch64_get_dregoidi (__o, 0);         \
> +  __b.val[1] = (vectype) __builtin_aarch64_get_dregoidi (__o, 1);         \
> +  return __b;                                                             \
> +}
> +
> +__LD2_LANE_FUNC (float32x2x2_t, float32x2_t, float32x4x2_t, float32_t, v4sf,
> +                sf, f32, float32x4_t)
> +__LD2_LANE_FUNC (float64x1x2_t, float64x1_t, float64x2x2_t, float64_t, v2df,
> +                df, f64, float64x2_t)
> +__LD2_LANE_FUNC (poly8x8x2_t, poly8x8_t, poly8x16x2_t, poly8_t, v16qi, qi, p8,
> +                int8x16_t)
> +__LD2_LANE_FUNC (poly16x4x2_t, poly16x4_t, poly16x8x2_t, poly16_t, v8hi, hi,
> +                p16, int16x8_t)
> +__LD2_LANE_FUNC (int8x8x2_t, int8x8_t, int8x16x2_t, int8_t, v16qi, qi, s8,
> +                int8x16_t)
> +__LD2_LANE_FUNC (int16x4x2_t, int16x4_t, int16x8x2_t, int16_t, v8hi, hi, s16,
> +                int16x8_t)
> +__LD2_LANE_FUNC (int32x2x2_t, int32x2_t, int32x4x2_t, int32_t, v4si, si, s32,
> +                int32x4_t)
> +__LD2_LANE_FUNC (int64x1x2_t, int64x1_t, int64x2x2_t, int64_t, v2di, di, s64,
> +                int64x2_t)
> +__LD2_LANE_FUNC (uint8x8x2_t, uint8x8_t, uint8x16x2_t, uint8_t, v16qi, qi, u8,
> +                int8x16_t)
> +__LD2_LANE_FUNC (uint16x4x2_t, uint16x4_t, uint16x8x2_t, uint16_t, v8hi, hi,
> +                u16, int16x8_t)
> +__LD2_LANE_FUNC (uint32x2x2_t, uint32x2_t, uint32x4x2_t, uint32_t, v4si, si,
> +                u32, int32x4_t)
> +__LD2_LANE_FUNC (uint64x1x2_t, uint64x1_t, uint64x2x2_t, uint64_t, v2di, di,
> +                u64, int64x2_t)
> +
> +#undef __LD2_LANE_FUNC
> +#define __LD2_LANE_FUNC(intype, ptrtype, mode, ptrmode, funcsuffix)       \
> +__extension__ static __inline intype __attribute__ ((__always_inline__))   \
> +vld2q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
> +{                                                                         \
> +  union { intype __i;                                                     \
> +         __builtin_aarch64_simd_oi __o; } __temp = { __b };               \
> +  __temp.__o = __builtin_aarch64_ld2_lane##mode (                         \
> +       (__builtin_aarch64_simd_##ptrmode *) __ptr, __temp.__o, __c);      \
> +  return __temp.__i;                                                      \
> +}
> +
> +__LD2_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32)
> +__LD2_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64)
> +__LD2_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8)
> +__LD2_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16)
> +__LD2_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8)
> +__LD2_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16)
> +__LD2_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32)
> +__LD2_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64)
> +__LD2_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8)
> +__LD2_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16)
> +__LD2_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32)
> +__LD2_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64)
>
>   #define __LD3R_FUNC(rettype, structtype, ptrtype,                      \
>                      regsuffix, funcsuffix, Q)                           \
> @@ -11887,47 +11919,85 @@ __LD3R_FUNC (uint16x8x3_t, uint16x3_t, uint16_t, 8h, u16, q)
>   __LD3R_FUNC (uint32x4x3_t, uint32x3_t, uint32_t, 4s, u32, q)
>   __LD3R_FUNC (uint64x2x3_t, uint64x3_t, uint64_t, 2d, u64, q)
>
> -#define __LD3_LANE_FUNC(rettype, ptrtype, regsuffix,                   \
> -                       lnsuffix, funcsuffix, Q)                        \
> -  __extension__ static __inline rettype                                        \
> -  __attribute__ ((__always_inline__))                                  \
> -  vld3 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr,               \
> -                                    rettype b, const int c)            \
> -  {                                                                    \
> -    rettype result;                                                    \
> -    __asm__ ("ld1 {v16." #regsuffix " - v18." #regsuffix "}, %1\n\t"   \
> -            "ld3 {v16." #lnsuffix " - v18." #lnsuffix "}[%3], %2\n\t"  \
> -            "st1 {v16." #regsuffix " - v18." #regsuffix "}, %0\n\t"    \
> -            : "=Q"(result)                                             \
> -            : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c)               \
> -            : "memory", "v16", "v17", "v18");                          \
> -    return result;                                                     \
> -  }
> -
> -__LD3_LANE_FUNC (int8x8x3_t, uint8_t, 8b, b, s8,)
> -__LD3_LANE_FUNC (float32x2x3_t, float32_t, 2s, s, f32,)
> -__LD3_LANE_FUNC (float64x1x3_t, float64_t, 1d, d, f64,)
> -__LD3_LANE_FUNC (poly8x8x3_t, poly8_t, 8b, b, p8,)
> -__LD3_LANE_FUNC (poly16x4x3_t, poly16_t, 4h, h, p16,)
> -__LD3_LANE_FUNC (int16x4x3_t, int16_t, 4h, h, s16,)
> -__LD3_LANE_FUNC (int32x2x3_t, int32_t, 2s, s, s32,)
> -__LD3_LANE_FUNC (int64x1x3_t, int64_t, 1d, d, s64,)
> -__LD3_LANE_FUNC (uint8x8x3_t, uint8_t, 8b, b, u8,)
> -__LD3_LANE_FUNC (uint16x4x3_t, uint16_t, 4h, h, u16,)
> -__LD3_LANE_FUNC (uint32x2x3_t, uint32_t, 2s, s, u32,)
> -__LD3_LANE_FUNC (uint64x1x3_t, uint64_t, 1d, d, u64,)
> -__LD3_LANE_FUNC (float32x4x3_t, float32_t, 4s, s, f32, q)
> -__LD3_LANE_FUNC (float64x2x3_t, float64_t, 2d, d, f64, q)
> -__LD3_LANE_FUNC (poly8x16x3_t, poly8_t, 16b, b, p8, q)
> -__LD3_LANE_FUNC (poly16x8x3_t, poly16_t, 8h, h, p16, q)
> -__LD3_LANE_FUNC (int8x16x3_t, int8_t, 16b, b, s8, q)
> -__LD3_LANE_FUNC (int16x8x3_t, int16_t, 8h, h, s16, q)
> -__LD3_LANE_FUNC (int32x4x3_t, int32_t, 4s, s, s32, q)
> -__LD3_LANE_FUNC (int64x2x3_t, int64_t, 2d, d, s64, q)
> -__LD3_LANE_FUNC (uint8x16x3_t, uint8_t, 16b, b, u8, q)
> -__LD3_LANE_FUNC (uint16x8x3_t, uint16_t, 8h, h, u16, q)
> -__LD3_LANE_FUNC (uint32x4x3_t, uint32_t, 4s, s, u32, q)
> -__LD3_LANE_FUNC (uint64x2x3_t, uint64_t, 2d, d, u64, q)
> +#define __LD3_LANE_FUNC(intype, vectype, largetype, ptrtype,              \
> +                        mode, ptrmode, funcsuffix, signedtype)            \
> +__extension__ static __inline intype __attribute__ ((__always_inline__))   \
> +vld3_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c)  \
> +{                                                                         \
> +  __builtin_aarch64_simd_ci __o;                                          \
> +  largetype __temp;                                                       \
> +  __temp.val[0] =                                                         \
> +    vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0));         \
> +  __temp.val[1] =                                                         \
> +    vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0));         \
> +  __temp.val[2] =                                                         \
> +    vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0));         \
> +  __o = __builtin_aarch64_set_qregci##mode (__o,                          \
> +                                          (signedtype) __temp.val[0],     \
> +                                          0);                             \
> +  __o = __builtin_aarch64_set_qregci##mode (__o,                          \
> +                                          (signedtype) __temp.val[1],     \
> +                                          1);                             \
> +  __o = __builtin_aarch64_set_qregci##mode (__o,                          \
> +                                          (signedtype) __temp.val[2],     \
> +                                          2);                             \
> +  __o =        __builtin_aarch64_ld3_lane##mode (                                 \
> +         (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c);           \
> +  __b.val[0] = (vectype) __builtin_aarch64_get_dregcidi (__o, 0);         \
> +  __b.val[1] = (vectype) __builtin_aarch64_get_dregcidi (__o, 1);         \
> +  __b.val[2] = (vectype) __builtin_aarch64_get_dregcidi (__o, 2);         \
> +  return __b;                                                             \
> +}
> +
> +__LD3_LANE_FUNC (float32x2x3_t, float32x2_t, float32x4x3_t, float32_t, v4sf,
> +                sf, f32, float32x4_t)
> +__LD3_LANE_FUNC (float64x1x3_t, float64x1_t, float64x2x3_t, float64_t, v2df,
> +                df, f64, float64x2_t)
> +__LD3_LANE_FUNC (poly8x8x3_t, poly8x8_t, poly8x16x3_t, poly8_t, v16qi, qi, p8,
> +                int8x16_t)
> +__LD3_LANE_FUNC (poly16x4x3_t, poly16x4_t, poly16x8x3_t, poly16_t, v8hi, hi,
> +                p16, int16x8_t)
> +__LD3_LANE_FUNC (int8x8x3_t, int8x8_t, int8x16x3_t, int8_t, v16qi, qi, s8,
> +                int8x16_t)
> +__LD3_LANE_FUNC (int16x4x3_t, int16x4_t, int16x8x3_t, int16_t, v8hi, hi, s16,
> +                int16x8_t)
> +__LD3_LANE_FUNC (int32x2x3_t, int32x2_t, int32x4x3_t, int32_t, v4si, si, s32,
> +                int32x4_t)
> +__LD3_LANE_FUNC (int64x1x3_t, int64x1_t, int64x2x3_t, int64_t, v2di, di, s64,
> +                int64x2_t)
> +__LD3_LANE_FUNC (uint8x8x3_t, uint8x8_t, uint8x16x3_t, uint8_t, v16qi, qi, u8,
> +                int8x16_t)
> +__LD3_LANE_FUNC (uint16x4x3_t, uint16x4_t, uint16x8x3_t, uint16_t, v8hi, hi,
> +                u16, int16x8_t)
> +__LD3_LANE_FUNC (uint32x2x3_t, uint32x2_t, uint32x4x3_t, uint32_t, v4si, si,
> +                u32, int32x4_t)
> +__LD3_LANE_FUNC (uint64x1x3_t, uint64x1_t, uint64x2x3_t, uint64_t, v2di, di,
> +                u64, int64x2_t)
> +
> +#undef __LD3_LANE_FUNC
> +#define __LD3_LANE_FUNC(intype, ptrtype, mode, ptrmode, funcsuffix)       \
> +__extension__ static __inline intype __attribute__ ((__always_inline__))   \
> +vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
> +{                                                                         \
> +  union { intype __i;                                                     \
> +         __builtin_aarch64_simd_xi __o; } __temp = { __b };               \
> +  __temp.__o = __builtin_aarch64_ld4_lane##mode (                         \
> +       (__builtin_aarch64_simd_##ptrmode *) __ptr, __temp.__o, __c);      \
> +  return __temp.__i;                                                      \
> +}
> +
> +__LD3_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32)
> +__LD3_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64)
> +__LD3_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8)
> +__LD3_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16)
> +__LD3_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8)
> +__LD3_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16)
> +__LD3_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32)
> +__LD3_LANE_FUNC (int64x2x3_t, int64_t, v2di, di, s64)
> +__LD3_LANE_FUNC (uint8x16x3_t, uint8_t, v16qi, qi, u8)
> +__LD3_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16)
> +__LD3_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32)
> +__LD3_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64)
>
>   #define __LD4R_FUNC(rettype, structtype, ptrtype,                      \
>                      regsuffix, funcsuffix, Q)                           \
> @@ -11969,47 +12039,92 @@ __LD4R_FUNC (uint16x8x4_t, uint16x4_t, uint16_t, 8h, u16, q)
>   __LD4R_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, 4s, u32, q)
>   __LD4R_FUNC (uint64x2x4_t, uint64x4_t, uint64_t, 2d, u64, q)
>
> -#define __LD4_LANE_FUNC(rettype, ptrtype, regsuffix,                   \
> -                       lnsuffix, funcsuffix, Q)                        \
> -  __extension__ static __inline rettype                                        \
> -  __attribute__ ((__always_inline__))                                  \
> -  vld4 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr,               \
> -                                    rettype b, const int c)            \
> -  {                                                                    \
> -    rettype result;                                                    \
> -    __asm__ ("ld1 {v16." #regsuffix " - v19." #regsuffix "}, %1\n\t"   \
> -            "ld4 {v16." #lnsuffix " - v19." #lnsuffix "}[%3], %2\n\t"  \
> -            "st1 {v16." #regsuffix " - v19." #regsuffix "}, %0\n\t"    \
> -            : "=Q"(result)                                             \
> -            : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c)               \
> -            : "memory", "v16", "v17", "v18", "v19");                   \
> -    return result;                                                     \
> -  }
>
> -__LD4_LANE_FUNC (int8x8x4_t, uint8_t, 8b, b, s8,)
> -__LD4_LANE_FUNC (float32x2x4_t, float32_t, 2s, s, f32,)
> -__LD4_LANE_FUNC (float64x1x4_t, float64_t, 1d, d, f64,)
> -__LD4_LANE_FUNC (poly8x8x4_t, poly8_t, 8b, b, p8,)
> -__LD4_LANE_FUNC (poly16x4x4_t, poly16_t, 4h, h, p16,)
> -__LD4_LANE_FUNC (int16x4x4_t, int16_t, 4h, h, s16,)
> -__LD4_LANE_FUNC (int32x2x4_t, int32_t, 2s, s, s32,)
> -__LD4_LANE_FUNC (int64x1x4_t, int64_t, 1d, d, s64,)
> -__LD4_LANE_FUNC (uint8x8x4_t, uint8_t, 8b, b, u8,)
> -__LD4_LANE_FUNC (uint16x4x4_t, uint16_t, 4h, h, u16,)
> -__LD4_LANE_FUNC (uint32x2x4_t, uint32_t, 2s, s, u32,)
> -__LD4_LANE_FUNC (uint64x1x4_t, uint64_t, 1d, d, u64,)
> -__LD4_LANE_FUNC (float32x4x4_t, float32_t, 4s, s, f32, q)
> -__LD4_LANE_FUNC (float64x2x4_t, float64_t, 2d, d, f64, q)
> -__LD4_LANE_FUNC (poly8x16x4_t, poly8_t, 16b, b, p8, q)
> -__LD4_LANE_FUNC (poly16x8x4_t, poly16_t, 8h, h, p16, q)
> -__LD4_LANE_FUNC (int8x16x4_t, int8_t, 16b, b, s8, q)
> -__LD4_LANE_FUNC (int16x8x4_t, int16_t, 8h, h, s16, q)
> -__LD4_LANE_FUNC (int32x4x4_t, int32_t, 4s, s, s32, q)
> -__LD4_LANE_FUNC (int64x2x4_t, int64_t, 2d, d, s64, q)
> -__LD4_LANE_FUNC (uint8x16x4_t, uint8_t, 16b, b, u8, q)
> -__LD4_LANE_FUNC (uint16x8x4_t, uint16_t, 8h, h, u16, q)
> -__LD4_LANE_FUNC (uint32x4x4_t, uint32_t, 4s, s, u32, q)
> -__LD4_LANE_FUNC (uint64x2x4_t, uint64_t, 2d, d, u64, q)
> +#define __LD4_LANE_FUNC(intype, vectype, largetype, ptrtype,              \
> +                        mode, ptrmode, funcsuffix, signedtype)            \
> +__extension__ static __inline intype __attribute__ ((__always_inline__))   \
> +vld4_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c)  \
> +{                                                                         \
> +  __builtin_aarch64_simd_xi __o;                                          \
> +  largetype __temp;                                                       \
> +  __temp.val[0] =                                                         \
> +    vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0));         \
> +  __temp.val[1] =                                                         \
> +    vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0));         \
> +  __temp.val[2] =                                                         \
> +    vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0));         \
> +  __temp.val[3] =                                                         \
> +    vcombine_##funcsuffix (__b.val[3], vcreate_##funcsuffix (0));         \
> +  __o = __builtin_aarch64_set_qregxi##mode (__o,                          \
> +                                          (signedtype) __temp.val[0],     \
> +                                          0);                             \
> +  __o = __builtin_aarch64_set_qregxi##mode (__o,                          \
> +                                          (signedtype) __temp.val[1],     \
> +                                          1);                             \
> +  __o = __builtin_aarch64_set_qregxi##mode (__o,                          \
> +                                          (signedtype) __temp.val[2],     \
> +                                          2);                             \
> +  __o = __builtin_aarch64_set_qregxi##mode (__o,                          \
> +                                          (signedtype) __temp.val[3],     \
> +                                          3);                             \
> +  __o =        __builtin_aarch64_ld4_lane##mode (                                 \
> +         (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c);           \
> +  __b.val[0] = (vectype) __builtin_aarch64_get_dregxidi (__o, 0);         \
> +  __b.val[1] = (vectype) __builtin_aarch64_get_dregxidi (__o, 1);         \
> +  __b.val[2] = (vectype) __builtin_aarch64_get_dregxidi (__o, 2);         \
> +  __b.val[3] = (vectype) __builtin_aarch64_get_dregxidi (__o, 3);         \
> +  return __b;                                                             \
> +}
> +
> +__LD4_LANE_FUNC (float32x2x4_t, float32x2_t, float32x4x4_t, float32_t, v4sf,
> +                sf, f32, float32x4_t)
> +__LD4_LANE_FUNC (float64x1x4_t, float64x1_t, float64x2x4_t, float64_t, v2df,
> +                df, f64, float64x2_t)
> +__LD4_LANE_FUNC (poly8x8x4_t, poly8x8_t, poly8x16x4_t, poly8_t, v16qi, qi, p8,
> +                int8x16_t)
> +__LD4_LANE_FUNC (poly16x4x4_t, poly16x4_t, poly16x8x4_t, poly16_t, v8hi, hi,
> +                p16, int16x8_t)
> +__LD4_LANE_FUNC (int8x8x4_t, int8x8_t, int8x16x4_t, int8_t, v16qi, qi, s8,
> +                int8x16_t)
> +__LD4_LANE_FUNC (int16x4x4_t, int16x4_t, int16x8x4_t, int16_t, v8hi, hi, s16,
> +                int16x8_t)
> +__LD4_LANE_FUNC (int32x2x4_t, int32x2_t, int32x4x4_t, int32_t, v4si, si, s32,
> +                int32x4_t)
> +__LD4_LANE_FUNC (int64x1x4_t, int64x1_t, int64x2x4_t, int64_t, v2di, di, s64,
> +                int64x2_t)
> +__LD4_LANE_FUNC (uint8x8x4_t, uint8x8_t, uint8x16x4_t, uint8_t, v16qi, qi, u8,
> +                int8x16_t)
> +__LD4_LANE_FUNC (uint16x4x4_t, uint16x4_t, uint16x8x4_t, uint16_t, v8hi, hi,
> +                u16, int16x8_t)
> +__LD4_LANE_FUNC (uint32x2x4_t, uint32x2_t, uint32x4x4_t, uint32_t, v4si, si,
> +                u32, int32x4_t)
> +__LD4_LANE_FUNC (uint64x1x4_t, uint64x1_t, uint64x2x4_t, uint64_t, v2di, di,
> +                u64, int64x2_t)
> +
> +#undef __LD4_LANE_FUNC
> +#define __LD4_LANE_FUNC(intype, ptrtype, mode, ptrmode, funcsuffix)       \
> +__extension__ static __inline intype __attribute__ ((__always_inline__))   \
> +vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
> +{                                                                         \
> +  union { intype __i;                                                     \
> +         __builtin_aarch64_simd_xi __o; } __temp = { __b };               \
> +  __temp.__o = __builtin_aarch64_ld4_lane##mode (                         \
> +       (__builtin_aarch64_simd_##ptrmode *) __ptr, __temp.__o, __c);      \
> +  return __temp.__i;                                                      \
> +}
> +

The reason we avoided type-punning using unions was that reload would
get confused by potential subreg(mem) that could be introduced because
of memory xfer caused by unions and large int modes. As a result, we
would get incorrect or sub-optimal code. But this seems to have fixed
itself. :-)

Because this involves xfers between large int modes and 
CANNOT_CHANGE_MODE_CLASS has some impact on it, it would be good to test 
what impact your patch has with C_C_M_C removed, so that it will be 
easier to fix the fallout once we remove C_C_M_C eventually. To test 
this you will need Richard's patch set 
https://gcc.gnu.org/ml/gcc-patches/2014-09/msg01440.html.

Same for your other 2 patches in this series (3, 4).

Thanks,
Tejas.

> +__LD4_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32)
> +__LD4_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64)
> +__LD4_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8)
> +__LD4_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16)
> +__LD4_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8)
> +__LD4_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16)
> +__LD4_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32)
> +__LD4_LANE_FUNC (int64x2x4_t, int64_t, v2di, di, s64)
> +__LD4_LANE_FUNC (uint8x16x4_t, uint8_t, v16qi, qi, u8)
> +__LD4_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16)
> +__LD4_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32)
> +__LD4_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64)
>
>   #define __ST2_LANE_FUNC(intype, largetype, ptrtype,                         \
>                          mode, ptr_mode, funcsuffix, signedtype)              \
> --
> 1.9.1
>
>


* Re: [PATCH 1/4] [AARCH64,NEON] Add patterns + builtins for vld[234](q?)_lane_* intrinsics
  2014-09-19 10:46   ` Tejas Belagod
@ 2014-09-24 16:36     ` Charles Baylis
  0 siblings, 0 replies; 12+ messages in thread
From: Charles Baylis @ 2014-09-24 16:36 UTC (permalink / raw)
  To: Tejas Belagod, Kyrylo Tkachov
  Cc: Marcus Shawcroft, Richard Earnshaw, gcc-patches

Kyril, Tejas,

Thanks for the review. I agree with all points and will respin v2 accordingly.

Charles

* Re: [PATCH 2/4] [AARCH64,NEON] Convert arm_neon.h to use new builtins for vld[234](q?)_lane_*
  2014-09-19 11:21   ` Tejas Belagod
@ 2014-09-26  1:16     ` Charles Baylis
  2014-09-26 12:47       ` Tejas Belagod
  0 siblings, 1 reply; 12+ messages in thread
From: Charles Baylis @ 2014-09-26  1:16 UTC (permalink / raw)
  To: Tejas Belagod; +Cc: Marcus Shawcroft, Richard Earnshaw, gcc-patches

On 19 September 2014 12:21, Tejas Belagod <tejas.belagod@arm.com> wrote:
> The reason we avoided type-punning using unions was that reload would
> get confused by potential subreg(mem) that could be introduced because of
> memory xfer caused by unions and large int modes. As a result, we would get
> incorrect or sub-optimal code. But this seems to have fixed itself. :-)
>
> Because this involves xfers between large int modes and
> CANNOT_CHANGE_MODE_CLASS has some impact on it, it would be good to test
> what impact your patch has with C_C_M_C removed, so that it will be easier
> to fix the fallout once we remove C_C_M_C eventually. To test this you will
> need Richard's patch set
> https://gcc.gnu.org/ml/gcc-patches/2014-09/msg01440.html.
>
> Same for your other 2 patches in this series (3, 4).

I tried those patches, and altered aarch64_cannot_change_mode_class to
return false for all cases.
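
For reference, the local hack was roughly the following (a sketch only,
not the exact diff, assuming the hook keeps its usual (from, to, rclass)
signature in gcc/config/aarch64/aarch64.c):

bool
aarch64_cannot_change_mode_class (enum machine_mode from ATTRIBUTE_UNUSED,
                                  enum machine_mode to ATTRIBUTE_UNUSED,
                                  enum reg_class rclass ATTRIBUTE_UNUSED)
{
  /* Permit every mode change for every register class, so that
     CANNOT_CHANGE_MODE_CLASS never rejects the subregs involved.  */
  return false;
}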

However, this does not avoid the unnecessary moves.

Taking a really simple test case:

#include <arm_neon.h>

int32x2x2_t xvld2_s32(int32_t *__a)
{
  int32x2x2_t ret;
  __builtin_aarch64_simd_oi __o;
  __o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a);
  ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0);
  ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1);
  return ret;
}

(disabling scheduling for clarity)
$ aarch64-oe-linux-gcc -O2 -S -o - simd.c -fno-schedule-insns
-fno-schedule-insns2
        ...
xvld2_s32:
        ld2     {v2.2s - v3.2s}, [x0]
        orr     v0.8b, v2.8b, v2.8b
        orr     v1.8b, v3.8b, v3.8b
        ret
        ...


The reason is apparent in the rtl dump from ira:
...
      Allocno a0r73 of FP_REGS(32) has 31 avail. regs  33-63, node:
33-63 (confl regs =  0-32 64 65)
...
(insn 2 4 3 2 (set (reg/v/f:DI 79 [ __a ])
        (reg:DI 0 x0 [ __a ])) simd.c:5 34 {*movdi_aarch64}
     (expr_list:REG_DEAD (reg:DI 0 x0 [ __a ])
        (nil)))
(note 3 2 6 2 NOTE_INSN_FUNCTION_BEG)
(insn 6 3 20 2 (set (reg/v:OI 73 [ __o ])
        (subreg:OI (vec_concat:V8SI (vec_concat:V4SI (unspec:V2SI [
                            (mem:TI (reg/v/f:DI 79 [ __a ]) [0  S16 A8])
                        ] UNSPEC_LD2)
                    (vec_duplicate:V2SI (const_int 0 [0])))
                (vec_concat:V4SI (unspec:V2SI [
                            (mem:TI (reg/v/f:DI 79 [ __a ]) [0  S16 A8])
                        ] UNSPEC_LD2)
                    (vec_duplicate:V2SI (const_int 0 [0])))) 0))
simd.c:8 2149 {aarch64_ld2v2si_dreg}
     (expr_list:REG_DEAD (reg/v/f:DI 79 [ __a ])
        (nil)))
(insn 20 6 21 2 (set (reg:V2SI 32 v0)
        (subreg:V2SI (reg/v:OI 73 [ __o ]) 0)) simd.c:12 778
{*aarch64_simd_movv2si}
     (nil))
(insn 21 20 22 2 (set (reg:V2SI 33 v1)
        (subreg:V2SI (reg/v:OI 73 [ __o ]) 16)) simd.c:12 778
{*aarch64_simd_movv2si}
     (expr_list:REG_DEAD (reg/v:OI 73 [ __o ])
        (nil)))
(insn 22 21 23 2 (use (reg:V2SI 32 v0)) simd.c:12 -1
     (nil))
(insn 23 22 0 2 (use (reg:V2SI 33 v1)) simd.c:12 -1
     (nil))

The register allocator considers r73 to conflict with v0, because they
are simultaneously live after insn 20. Without the 2nd use of r73 (e.g.
if the write to ret.val[1] is replaced with vdup_n_s32(0)) then the
allocator does do the right thing with the subreg and allocates r73 to
{v0,v1}.
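
That is, a variant like this sketch (the name xvld2_s32_1 is just
illustrative) leaves only one use of the OI pseudo, and the allocation
comes out as expected:

#include <arm_neon.h>

int32x2x2_t xvld2_s32_1 (int32_t *__a)
{
  int32x2x2_t ret;
  __builtin_aarch64_simd_oi __o;
  __o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a);
  ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0);
  ret.val[1] = vdup_n_s32 (0);  /* second element no longer read from __o */
  return ret;
}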

I haven't read all of the old threads relating to Richard's patches
yet, but I don't see why they would affect this issue.

I don't think the register allocator is able to resolve this unless
the conversion between the __builtin_simd type and the int32x2x2_t
type is done as a single operation.

However, type-punning is not possible with the arrays of 64 bit
vectors, as the arrays are not the same size as the corresponding
__builtin_simd types, and any solution for those would probably help
with the q variants too. Maybe the solution is to pass the NEON
intrinsic types directly to the builtins? Is there a reason that it
wasn't done that way before?
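
To illustrate the size mismatch for the non-q variants (a sketch only;
sizes assumed from the usual AArch64 layout, where int32x2x2_t is two D
registers and the OImode builtin type spans two Q registers):

#include <arm_neon.h>

union __ld2_pun_sketch             /* illustrative only, not proposed */
{
  int32x2x2_t __i;                 /* 2 x 8 bytes = 16 bytes */
  __builtin_aarch64_simd_oi __o;   /* OImode = 32 bytes */
};
/* Half of __o has no counterpart in __i, so the union trick used for
   the q variants does not carry over directly.  */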

Thanks
Charles

* Re: [PATCH 2/4] [AARCH64,NEON] Convert arm_neon.h to use new builtins for vld[234](q?)_lane_*
  2014-09-26  1:16     ` Charles Baylis
@ 2014-09-26 12:47       ` Tejas Belagod
  2014-10-08 18:47         ` Charles Baylis
  0 siblings, 1 reply; 12+ messages in thread
From: Tejas Belagod @ 2014-09-26 12:47 UTC (permalink / raw)
  To: Charles Baylis; +Cc: Marcus Shawcroft, Richard Earnshaw, gcc-patches

On 26/09/14 02:16, Charles Baylis wrote:
> On 19 September 2014 12:21, Tejas Belagod <tejas.belagod@arm.com> wrote:
>> The reason we avoided type-punning using unions was that reload would
>> get confused by potential subreg(mem) that could be introduced because of
>> memory xfer caused by unions and large int modes. As a result, we would get
>> incorrect or sub-optimal code. But this seems to have fixed itself. :-)
>>
>> Because this involves xfers between large int modes and
>> CANNOT_CHANGE_MODE_CLASS has some impact on it, it would be good to test
>> what impact your patch has with C_C_M_C removed, so that it will be easier
>> to fix the fallout once we remove C_C_M_C eventually. To test this you will
>> need Richard's patch set
>> https://gcc.gnu.org/ml/gcc-patches/2014-09/msg01440.html.
>>
>> Same for your other 2 patches in this series (3, 4).
>
> I tried those patches, and altered aarch64_cannot_change_mode_class to
> return false for all cases.
>
> However, this does not avoid the unnecessary moves.
>
> Taking a really simple test case:
>
> #include <arm_neon.h>
>
> int32x2x2_t xvld2_s32(int32_t *__a)
> {
>    int32x2x2_t ret;
>    __builtin_aarch64_simd_oi __o;
>    __o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a);
>    ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0);
>    ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1);
>    return ret;
> }
>
> (disabling scheduling for clarity)
> $ aarch64-oe-linux-gcc -O2 -S -o - simd.c -fno-schedule-insns
> -fno-schedule-insns2
>          ...
> xvld2_s32:
>          ld2     {v2.2s - v3.2s}, [x0]
>          orr     v0.8b, v2.8b, v2.8b
>          orr     v1.8b, v3.8b, v3.8b
>          ret
>          ...
>
>
> The reason is apparent in the rtl dump from ira:
> ...
>        Allocno a0r73 of FP_REGS(32) has 31 avail. regs  33-63, node:
> 33-63 (confl regs =  0-32 64 65)
> ...
> (insn 2 4 3 2 (set (reg/v/f:DI 79 [ __a ])
>          (reg:DI 0 x0 [ __a ])) simd.c:5 34 {*movdi_aarch64}
>       (expr_list:REG_DEAD (reg:DI 0 x0 [ __a ])
>          (nil)))
> (note 3 2 6 2 NOTE_INSN_FUNCTION_BEG)
> (insn 6 3 20 2 (set (reg/v:OI 73 [ __o ])
>          (subreg:OI (vec_concat:V8SI (vec_concat:V4SI (unspec:V2SI [
>                              (mem:TI (reg/v/f:DI 79 [ __a ]) [0  S16 A8])
>                          ] UNSPEC_LD2)
>                      (vec_duplicate:V2SI (const_int 0 [0])))
>                  (vec_concat:V4SI (unspec:V2SI [
>                              (mem:TI (reg/v/f:DI 79 [ __a ]) [0  S16 A8])
>                          ] UNSPEC_LD2)
>                      (vec_duplicate:V2SI (const_int 0 [0])))) 0))
> simd.c:8 2149 {aarch64_ld2v2si_dreg}
>       (expr_list:REG_DEAD (reg/v/f:DI 79 [ __a ])
>          (nil)))
> (insn 20 6 21 2 (set (reg:V2SI 32 v0)
>          (subreg:V2SI (reg/v:OI 73 [ __o ]) 0)) simd.c:12 778
> {*aarch64_simd_movv2si}
>       (nil))
> (insn 21 20 22 2 (set (reg:V2SI 33 v1)
>          (subreg:V2SI (reg/v:OI 73 [ __o ]) 16)) simd.c:12 778
> {*aarch64_simd_movv2si}
>       (expr_list:REG_DEAD (reg/v:OI 73 [ __o ])
>          (nil)))
> (insn 22 21 23 2 (use (reg:V2SI 32 v0)) simd.c:12 -1
>       (nil))
> (insn 23 22 0 2 (use (reg:V2SI 33 v1)) simd.c:12 -1
>       (nil))
>
> The register allocator considers r73 to conflict with v0, because they
> are simultaneously live after insn 20. Without the 2nd use of r73 (e.g.
> if the write to ret.val[1] is replaced with vdup_n_s32(0)) then the
> allocator does do the right thing with the subreg and allocates r73 to
> {v0,v1}.
>
> I haven't read all of the old threads relating to Richard's patches
> yet, but I don't see why they would affect this issue.
>
> I don't think the register allocator is able to resolve this unless
> the conversion between the __builtin_simd type and the int32x2x2_t
> type is done as a single operation.
>

For this piece of code,

#include "arm_neon.h"

int32x2x2_t xvld2_s32(int32_t *__a)
{
   union { int32x2x2_t __i;
          __builtin_aarch64_simd_oi __o; } __temp;
   __temp.__o = __builtin_aarch64_ld2v2si ((const 
__builtin_aarch64_simd_si *) __a);
   return __temp.__i;
}

int32x2x2_t yvld2_s32(int32_t *__a)
{
   int32x2x2_t ret;
   __builtin_aarch64_simd_oi __o;
   __o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) 
__a);
   ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0);
   ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1);
   return ret;
}

currently my gcc HEAD generates at -O3:

xvld2_s32:
	ld2	{v0.2s - v1.2s}, [x0]
	sub	sp, sp, #64
	st1	{v0.16b - v1.16b}, [sp]
	ldr	x1, [sp]
	ldr	x0, [sp, 8]
	add	sp, sp, 64
	ins	v0.d[0], x1
	ins	v1.d[0], x0
	ret
         ....
yvld2_s32:
	ld2	{v2.2s - v3.2s}, [x0]
	orr	v1.8b, v3.8b, v3.8b
	orr	v0.8b, v2.8b, v2.8b
	ret

If we use type-punning, unnecessary spills are generated, which is also
incorrect for BE because of the way we spill (st1 {v0.16b - v1.16b},
[sp]) and restore. The implementation without type-punning seems to
give a more optimal result. Did your patches improve on the spills for
the type-punning solution?

> However, type-punning is not possible with the arrays of 64 bit
> vectors, as the arrays are not the same size as the corresponding
> __builtin_simd types, and any solution for those would probably help
> with the q variants too.

That is because we fill a zero-extended D-reg value into a 128-bit reg
and pack those into a large int mode (e.g. OI). We don't have large int
modes made up purely of D-regs because we would run into ambiguities,
such as 4 D-regs being an OImode while 2 Q-regs is also an OImode.
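
That zero-extension is what the vcombine/vcreate (0) step in the patch's
__LD4_LANE_FUNC does; for a single element it amounts to something like
this sketch (illustrative helper, not part of the patch):

#include <arm_neon.h>

static inline int32x4_t
__widen_one_s32 (int32x2_t __d)
{
  /* Widen a D-reg vector to a Q-reg vector, zero-filling the high half,
     before it is packed into the large int mode.  */
  return vcombine_s32 (__d, vcreate_s32 (0));
}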

> Maybe the solution is to pass the NEON
> intrinsic types directly to the builtins? Is there a reason that it
> wasn't done that way before?
>

How do you mean? Do you mean pass a loaded int32x2x2_t value into a
__builtin? How will that work?

If you mean why we don't pass an int32x2x2_t into a builtin as a
structure, I don't think that would work: it is a struct type, which
would correspond to a BLK mode, but we need RTL patterns with reg-lists
to work with large int modes so that the regalloc can allocate
consecutive regs for the reg-lists.

Thanks,
Tejas.

* Re: [PATCH 2/4] [AARCH64,NEON] Convert arm_neon.h to use new builtins for vld[234](q?)_lane_*
  2014-09-26 12:47       ` Tejas Belagod
@ 2014-10-08 18:47         ` Charles Baylis
  0 siblings, 0 replies; 12+ messages in thread
From: Charles Baylis @ 2014-10-08 18:47 UTC (permalink / raw)
  To: Tejas Belagod; +Cc: Marcus Shawcroft, Richard Earnshaw, gcc-patches

On 26 September 2014 13:47, Tejas Belagod <tejas.belagod@arm.com> wrote:
> If we use type-punning, unnecessary spills are generated, which is also
> incorrect for BE because of the way we spill (st1 {v0.16b - v1.16b}, [sp])
> and restore. The implementation without type-punning seems to give a more
> optimal result. Did your patches improve on the spills for the type-punning
> solution?

OK, this part seems too contentious, so I've respun the vldN_lane
parts without the type punning and reposted them. This issue can be
resolved separately.

Trying an example like this gives good code with type punning, and
poor code without.

void t2(int32_t *p)
{
    int32x4x4_t va = vld4q_s32(p);
    va = vld4q_lane_s32(p + 500, va, 1);
    vst4q_s32(p+1000, va);
}


With type-punning, good code:
t2:
        ld4     {v0.4s - v3.4s}, [x0]
        add     x2, x0, 2000
        add     x1, x0, 4000
        ld4     {v0.s - v3.s}[1], [x2]
        st4     {v0.4s - v3.4s}, [x1]
        ret

Without type-punning, horrible code:
t2:
        ld4     {v0.4s - v3.4s}, [x0]
        sub     sp, sp, #64
        add     x14, x0, 2000
        add     x0, x0, 4000
        umov    x12, v0.d[0]
        umov    x13, v0.d[1]
        umov    x10, v1.d[0]
        umov    x11, v1.d[1]
        umov    x8, v2.d[0]
        str     x12, [sp]
        umov    x9, v2.d[1]
        str     x13, [sp, 8]
        str     q3, [sp, 48]
        str     x10, [sp, 16]
        str     x11, [sp, 24]
        str     x8, [sp, 32]
        str     x9, [sp, 40]
        ld1     {v0.16b - v3.16b}, [sp]
        ld4     {v0.s - v3.s}[1], [x14]
        umov    x10, v0.d[0]
        umov    x11, v0.d[1]
        umov    x8, v1.d[0]
        umov    x9, v1.d[1]
        umov    x6, v2.d[0]
        str     x10, [sp]
        umov    x7, v2.d[1]
        str     x11, [sp, 8]
        str     q3, [sp, 48]
        str     x8, [sp, 16]
        str     x9, [sp, 24]
        str     x6, [sp, 32]
        str     x7, [sp, 40]
        ld1     {v0.16b - v3.16b}, [sp]
        add     sp, sp, 64
        st4     {v0.4s - v3.4s}, [x0]
        ret

>> Maybe the solution is to pass the NEON
>> intrinsic types directly to the builtins? Is there a reason that it
>> wasn't done that way before?
>
> How do you mean? Do you mean pass a loaded int32x2x2_t value into a
> __builtin? How will that work?
>
> If you mean why we don't pass an int32x2x2_t into a builtin as a
> structure, I don't think that would work: it is a struct type, which would
> correspond to a BLK mode, but we need RTL patterns with reg-lists to work
> with large int modes so that the regalloc can allocate consecutive regs
> for the reg-lists.

OK, that makes sense. However, something needs to be done to create the
__builtin_aarch64_simd_* objects without register moves. The existing
mechanism causes problems because the lifetimes of the inputs overlap
with the lifetimes of the outputs, so I think there are these options:

1. represent the construction/deconstruction as a single operation, to
avoid overlapping variable liveness in the source.
2. add a pass or peephole which can combine the existing builtins into
a single operation, so that the lifetimes are normalised.
3. teach the register allocator how to handle overlapping liveness of
a register and a subreg of that register.

Option 1 would require a new builtin interface which somehow handled a
whole int32x2x2_t in one operation. Construction is easy
(__builtin_aarch64_simd_construct(v.val[0], v.val[1]) or similar).
Deconstruction is less obvious.

Option 2 sounds like a hack, but would probably be effective,
particularly if it can be done before inlining.

Option 3 would also help with poor code generation for ARM targets
with vget_low_*, vget_high_* and vcombine_*.
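
For example, a round trip such as this sketch is logically an identity,
and is the kind of vget_low/vget_high/vcombine pattern that currently
generates poor code:

#include <arm_neon.h>

int32x4_t __split_and_recombine (int32x4_t __v)   /* illustrative name */
{
  int32x2_t __lo = vget_low_s32 (__v);
  int32x2_t __hi = vget_high_s32 (__v);
  return vcombine_s32 (__lo, __hi);
}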

What do you think is the best approach?

Thanks
Charles

Thread overview: 12+ messages
2014-09-18 19:40 [PATCH 0/4] [AARCH64,NEON] Improve various NEON load/store intrinsics Charles Baylis
2014-09-18 19:40 ` [PATCH 2/4] [AARCH64,NEON] Convert arm_neon.h to use new builtins for vld[234](q?)_lane_* Charles Baylis
2014-09-19 11:21   ` Tejas Belagod
2014-09-26  1:16     ` Charles Baylis
2014-09-26 12:47       ` Tejas Belagod
2014-10-08 18:47         ` Charles Baylis
2014-09-18 19:41 ` [PATCH 3/4] [AARCH64,NEON] Fix unnecessary moves in vld[234]q_* intrinsics Charles Baylis
2014-09-18 19:41 ` [PATCH 4/4] [AARCH64,NEON] Fix unnecessary moves in vst[234]q_* intrinsics Charles Baylis
2014-09-18 19:41 ` [PATCH 1/4] [AARCH64,NEON] Add patterns + builtins for vld[234](q?)_lane_* intrinsics Charles Baylis
2014-09-19  8:40   ` Kyrill Tkachov
2014-09-19 10:46   ` Tejas Belagod
2014-09-24 16:36     ` Charles Baylis
