public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
From: Charles Baylis <charles.baylis@linaro.org>
To: marcus.shawcroft@arm.com,	rearnsha@arm.com,	gcc-patches@gcc.gnu.org
Subject: [PATCH 3/4] [AARCH64,NEON] Fix unnecessary moves in vld[234]q_* intrinsics
Date: Thu, 18 Sep 2014 19:41:00 -0000	[thread overview]
Message-ID: <1411069109-31425-4-git-send-email-charles.baylis@linaro.org> (raw)
In-Reply-To: <1411069109-31425-1-git-send-email-charles.baylis@linaro.org>

This patch improves code generation of vld[234]q_* intrinsics by avoiding use
of the __builtin_aarch64_get_qreg_* builtins to generate a temporary result
variable. Instead, a union is used for type-punning, which avoids generation of
some unnecessary move instructions. This idiom is already used in several other
intrinsics.

This patch is independent of the previous patches in the series.

Tested (with the rest of the patch series) with make check on aarch64-oe-linux
with qemu; it also causes no regressions in Christophe Lyon's NEON intrinsics tests.

<DATE>  Charles Baylis  <charles.baylis@linaro.org>

	* config/aarch64/arm_neon.h (vld2q_s8, vld2q_p8, vld2q_s16, vld2q_p16,
	vld2q_s32, vld2q_s64, vld2q_u8, vld2q_u16, vld2q_u32, vld2q_u64,
	vld2q_f32, vld2q_f64, vld3q_s8, vld3q_p8, vld3q_s16, vld3q_p16,
	vld3q_s32, vld3q_s64, vld3q_u8, vld3q_u16, vld3q_u32, vld3q_u64,
	vld3q_f32, vld3q_f64, vld4q_s8, vld4q_p8, vld4q_s16, vld4q_p16,
	vld4q_s32, vld4q_s64, vld4q_u8, vld4q_u16, vld4q_u32, vld4q_u64,
	vld4q_f32, vld4q_f64): Use type-punning to convert between NEON
	intrinsic types and __builtin_aarch64_simd* types.

Change-Id: I61efa29138b13c7a83679885343211d604a73b15
---
 gcc/config/aarch64/arm_neon.h | 396 +++++++++++++++---------------------------
 1 file changed, 144 insertions(+), 252 deletions(-)

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index c1fcb47..87e3baf 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -16969,133 +16969,109 @@ vld2_f32 (const float32_t * __a)
 __extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
 vld2q_s8 (const int8_t * __a)
 {
-  int8x16x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
-  ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
-  return ret;
+  union { int8x16x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
 vld2q_p8 (const poly8_t * __a)
 {
-  poly8x16x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
-  ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
-  return ret;
+  union { poly8x16x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
 vld2q_s16 (const int16_t * __a)
 {
-  int16x8x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
-  ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
-  return ret;
+  union { int16x8x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
 vld2q_p16 (const poly16_t * __a)
 {
-  poly16x8x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
-  ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
-  return ret;
+  union { poly16x8x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
 vld2q_s32 (const int32_t * __a)
 {
-  int32x4x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0);
-  ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1);
-  return ret;
+  union { int32x4x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int64x2x2_t __attribute__ ((__always_inline__))
 vld2q_s64 (const int64_t * __a)
 {
-  int64x2x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0);
-  ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1);
-  return ret;
+  union { int64x2x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
 vld2q_u8 (const uint8_t * __a)
 {
-  uint8x16x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
-  ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
-  return ret;
+  union { uint8x16x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
 vld2q_u16 (const uint16_t * __a)
 {
-  uint16x8x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
-  ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
-  return ret;
+  union { uint16x8x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
 vld2q_u32 (const uint32_t * __a)
 {
-  uint32x4x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0);
-  ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1);
-  return ret;
+  union { uint32x4x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint64x2x2_t __attribute__ ((__always_inline__))
 vld2q_u64 (const uint64_t * __a)
 {
-  uint64x2x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0);
-  ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1);
-  return ret;
+  union { uint64x2x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
 vld2q_f32 (const float32_t * __a)
 {
-  float32x4x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v4sf ((const __builtin_aarch64_simd_sf *) __a);
-  ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 0);
-  ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 1);
-  return ret;
+  union { float32x4x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v4sf ((const __builtin_aarch64_simd_sf *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline float64x2x2_t __attribute__ ((__always_inline__))
 vld2q_f64 (const float64_t * __a)
 {
-  float64x2x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v2df ((const __builtin_aarch64_simd_df *) __a);
-  ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 0);
-  ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 1);
-  return ret;
+  union { float64x2x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v2df ((const __builtin_aarch64_simd_df *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int64x1x3_t __attribute__ ((__always_inline__))
@@ -17245,145 +17221,109 @@ vld3_f32 (const float32_t * __a)
 __extension__ static __inline int8x16x3_t __attribute__ ((__always_inline__))
 vld3q_s8 (const int8_t * __a)
 {
-  int8x16x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
-  ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
-  ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
-  return ret;
+  union { int8x16x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline poly8x16x3_t __attribute__ ((__always_inline__))
 vld3q_p8 (const poly8_t * __a)
 {
-  poly8x16x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
-  ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
-  ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
-  return ret;
+  union { poly8x16x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int16x8x3_t __attribute__ ((__always_inline__))
 vld3q_s16 (const int16_t * __a)
 {
-  int16x8x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
-  ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
-  ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
-  return ret;
+  union { int16x8x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline poly16x8x3_t __attribute__ ((__always_inline__))
 vld3q_p16 (const poly16_t * __a)
 {
-  poly16x8x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
-  ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
-  ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
-  return ret;
+  union { poly16x8x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int32x4x3_t __attribute__ ((__always_inline__))
 vld3q_s32 (const int32_t * __a)
 {
-  int32x4x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v4si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0);
-  ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1);
-  ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2);
-  return ret;
+  union { int32x4x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v4si ((const __builtin_aarch64_simd_si *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int64x2x3_t __attribute__ ((__always_inline__))
 vld3q_s64 (const int64_t * __a)
 {
-  int64x2x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0);
-  ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1);
-  ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2);
-  return ret;
+  union { int64x2x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint8x16x3_t __attribute__ ((__always_inline__))
 vld3q_u8 (const uint8_t * __a)
 {
-  uint8x16x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
-  ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
-  ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
-  return ret;
+  union { uint8x16x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint16x8x3_t __attribute__ ((__always_inline__))
 vld3q_u16 (const uint16_t * __a)
 {
-  uint16x8x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
-  ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
-  ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
-  return ret;
+  union { uint16x8x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint32x4x3_t __attribute__ ((__always_inline__))
 vld3q_u32 (const uint32_t * __a)
 {
-  uint32x4x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v4si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0);
-  ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1);
-  ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2);
-  return ret;
+  union { uint32x4x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v4si ((const __builtin_aarch64_simd_si *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint64x2x3_t __attribute__ ((__always_inline__))
 vld3q_u64 (const uint64_t * __a)
 {
-  uint64x2x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0);
-  ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1);
-  ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2);
-  return ret;
+  union { uint64x2x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
 vld3q_f32 (const float32_t * __a)
 {
-  float32x4x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v4sf ((const __builtin_aarch64_simd_sf *) __a);
-  ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 0);
-  ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 1);
-  ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 2);
-  return ret;
+  union { float32x4x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v4sf ((const __builtin_aarch64_simd_sf *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline float64x2x3_t __attribute__ ((__always_inline__))
 vld3q_f64 (const float64_t * __a)
 {
-  float64x2x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v2df ((const __builtin_aarch64_simd_df *) __a);
-  ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 0);
-  ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 1);
-  ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 2);
-  return ret;
+  union { float64x2x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v2df ((const __builtin_aarch64_simd_df *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int64x1x4_t __attribute__ ((__always_inline__))
@@ -17545,157 +17485,109 @@ vld4_f32 (const float32_t * __a)
 __extension__ static __inline int8x16x4_t __attribute__ ((__always_inline__))
 vld4q_s8 (const int8_t * __a)
 {
-  int8x16x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0);
-  ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1);
-  ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
-  ret.val[3] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
-  return ret;
+  union { int8x16x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline poly8x16x4_t __attribute__ ((__always_inline__))
 vld4q_p8 (const poly8_t * __a)
 {
-  poly8x16x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0);
-  ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1);
-  ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
-  ret.val[3] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
-  return ret;
+  union { poly8x16x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int16x8x4_t __attribute__ ((__always_inline__))
 vld4q_s16 (const int16_t * __a)
 {
-  int16x8x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0);
-  ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1);
-  ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
-  ret.val[3] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
-  return ret;
+  union { int16x8x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline poly16x8x4_t __attribute__ ((__always_inline__))
 vld4q_p16 (const poly16_t * __a)
 {
-  poly16x8x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0);
-  ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1);
-  ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
-  ret.val[3] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
-  return ret;
+  union { poly16x8x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int32x4x4_t __attribute__ ((__always_inline__))
 vld4q_s32 (const int32_t * __a)
 {
-  int32x4x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v4si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0);
-  ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1);
-  ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2);
-  ret.val[3] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3);
-  return ret;
+  union { int32x4x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v4si ((const __builtin_aarch64_simd_si *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int64x2x4_t __attribute__ ((__always_inline__))
 vld4q_s64 (const int64_t * __a)
 {
-  int64x2x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 0);
-  ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1);
-  ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2);
-  ret.val[3] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3);
-  return ret;
+  union { int64x2x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint8x16x4_t __attribute__ ((__always_inline__))
 vld4q_u8 (const uint8_t * __a)
 {
-  uint8x16x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0);
-  ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1);
-  ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
-  ret.val[3] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
-  return ret;
+  union { uint8x16x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint16x8x4_t __attribute__ ((__always_inline__))
 vld4q_u16 (const uint16_t * __a)
 {
-  uint16x8x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0);
-  ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1);
-  ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
-  ret.val[3] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
-  return ret;
+  union { uint16x8x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint32x4x4_t __attribute__ ((__always_inline__))
 vld4q_u32 (const uint32_t * __a)
 {
-  uint32x4x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v4si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0);
-  ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1);
-  ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2);
-  ret.val[3] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3);
-  return ret;
+  union { uint32x4x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v4si ((const __builtin_aarch64_simd_si *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint64x2x4_t __attribute__ ((__always_inline__))
 vld4q_u64 (const uint64_t * __a)
 {
-  uint64x2x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 0);
-  ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1);
-  ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2);
-  ret.val[3] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3);
-  return ret;
+  union { uint64x2x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
 vld4q_f32 (const float32_t * __a)
 {
-  float32x4x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v4sf ((const __builtin_aarch64_simd_sf *) __a);
-  ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 0);
-  ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 1);
-  ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 2);
-  ret.val[3] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 3);
-  return ret;
+  union { float32x4x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v4sf ((const __builtin_aarch64_simd_sf *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline float64x2x4_t __attribute__ ((__always_inline__))
 vld4q_f64 (const float64_t * __a)
 {
-  float64x2x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v2df ((const __builtin_aarch64_simd_df *) __a);
-  ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 0);
-  ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 1);
-  ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 2);
-  ret.val[3] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 3);
-  return ret;
+  union { float64x2x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v2df ((const __builtin_aarch64_simd_df *) __a);
+  return __temp.__i;
 }
 
 /* vmax */
-- 
1.9.1

  parent reply	other threads:[~2014-09-18 19:40 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-09-18 19:40 [PATCH 0/4] [AARCH64,NEON] Improve various NEON load/store intrinsics Charles Baylis
2014-09-18 19:40 ` [PATCH 2/4] [AARCH64,NEON] Convert arm_neon.h to use new builtins for vld[234](q?)_lane_* Charles Baylis
2014-09-19 11:21   ` Tejas Belagod
2014-09-26  1:16     ` Charles Baylis
2014-09-26 12:47       ` Tejas Belagod
2014-10-08 18:47         ` Charles Baylis
2014-09-18 19:41 ` Charles Baylis [this message]
2014-09-18 19:41 ` [PATCH 1/4] [AARCH64,NEON] Add patterns + builtins for vld[234](q?)_lane_* intrinsics Charles Baylis
2014-09-19  8:40   ` Kyrill Tkachov
2014-09-19 10:46   ` Tejas Belagod
2014-09-24 16:36     ` Charles Baylis
2014-09-18 19:41 ` [PATCH 4/4] [AARCH64,NEON] Fix unnecessary moves in vst[234]q_* intrinsics Charles Baylis

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1411069109-31425-4-git-send-email-charles.baylis@linaro.org \
    --to=charles.baylis@linaro.org \
    --cc=gcc-patches@gcc.gnu.org \
    --cc=marcus.shawcroft@arm.com \
    --cc=rearnsha@arm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).