From: Tejas Belagod <tejas.belagod@arm.com>
To: "charles.baylis@linaro.org" <charles.baylis@linaro.org>,
Richard Earnshaw <Richard.Earnshaw@arm.com>,
"gcc-patches@gcc.gnu.org" <gcc-patches@gcc.gnu.org>,
Marcus Shawcroft <Marcus.Shawcroft@arm.com>,
Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
Subject: Re: [PATCH 2/2] [AARCH64,NEON] Convert arm_neon.h to use new builtins for vld[234](q?)_lane_*
Date: Thu, 09 Oct 2014 15:14:00 -0000 [thread overview]
Message-ID: <5436A51A.6050307@arm.com> (raw)
In-Reply-To: <1412789236-26461-3-git-send-email-charles.baylis@linaro.org>
On 08/10/14 18:27, charles.baylis@linaro.org wrote:
> From: Charles Baylis <charles.baylis@linaro.org>
>
> This patch replaces the inline assembler implementations of the
> vld[234](q?)_lane_* intrinsics with new versions which exploit the new builtin
> functions added in patch 1.
>
> Tested (with the rest of the patch series) with make check on aarch64-oe-linux
> with qemu, and also causes no regressions in clyon's NEON intrinsics tests.
>
> <DATE> Charles Baylis <charles.baylis@linaro.org>
>
> * config/aarch64/arm_neon.h (__LD2_LANE_FUNC): Rewrite using builtins,
> update uses to use new macro arguments.
> (__LD3_LANE_FUNC): Likewise.
> (__LD4_LANE_FUNC): Likewise.
>
> Change-Id: I3bd5934b5c4f6127088193c1ab12848144d5540a
> ---
> gcc/config/aarch64/arm_neon.h | 377 ++++++++++++++++++++++++++++--------------
> 1 file changed, 255 insertions(+), 122 deletions(-)
>
> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> index 9b1873f..19ce261 100644
> --- a/gcc/config/aarch64/arm_neon.h
> +++ b/gcc/config/aarch64/arm_neon.h
> @@ -11805,47 +11805,83 @@ __LD2R_FUNC (uint16x8x2_t, uint16x2_t, uint16_t, 8h, u16, q)
> __LD2R_FUNC (uint32x4x2_t, uint32x2_t, uint32_t, 4s, u32, q)
> __LD2R_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, 2d, u64, q)
>
> -#define __LD2_LANE_FUNC(rettype, ptrtype, regsuffix, \
> - lnsuffix, funcsuffix, Q) \
> - __extension__ static __inline rettype \
> - __attribute__ ((__always_inline__)) \
> - vld2 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \
> - rettype b, const int c) \
> - { \
> - rettype result; \
> - __asm__ ("ld1 {v16." #regsuffix ", v17." #regsuffix "}, %1\n\t" \
> - "ld2 {v16." #lnsuffix ", v17." #lnsuffix "}[%3], %2\n\t" \
> - "st1 {v16." #regsuffix ", v17." #regsuffix "}, %0\n\t" \
> - : "=Q"(result) \
> - : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c) \
> - : "memory", "v16", "v17"); \
> - return result; \
> - }
> -
> -__LD2_LANE_FUNC (int8x8x2_t, uint8_t, 8b, b, s8,)
> -__LD2_LANE_FUNC (float32x2x2_t, float32_t, 2s, s, f32,)
> -__LD2_LANE_FUNC (float64x1x2_t, float64_t, 1d, d, f64,)
> -__LD2_LANE_FUNC (poly8x8x2_t, poly8_t, 8b, b, p8,)
> -__LD2_LANE_FUNC (poly16x4x2_t, poly16_t, 4h, h, p16,)
> -__LD2_LANE_FUNC (int16x4x2_t, int16_t, 4h, h, s16,)
> -__LD2_LANE_FUNC (int32x2x2_t, int32_t, 2s, s, s32,)
> -__LD2_LANE_FUNC (int64x1x2_t, int64_t, 1d, d, s64,)
> -__LD2_LANE_FUNC (uint8x8x2_t, uint8_t, 8b, b, u8,)
> -__LD2_LANE_FUNC (uint16x4x2_t, uint16_t, 4h, h, u16,)
> -__LD2_LANE_FUNC (uint32x2x2_t, uint32_t, 2s, s, u32,)
> -__LD2_LANE_FUNC (uint64x1x2_t, uint64_t, 1d, d, u64,)
> -__LD2_LANE_FUNC (float32x4x2_t, float32_t, 4s, s, f32, q)
> -__LD2_LANE_FUNC (float64x2x2_t, float64_t, 2d, d, f64, q)
> -__LD2_LANE_FUNC (poly8x16x2_t, poly8_t, 16b, b, p8, q)
> -__LD2_LANE_FUNC (poly16x8x2_t, poly16_t, 8h, h, p16, q)
> -__LD2_LANE_FUNC (int8x16x2_t, int8_t, 16b, b, s8, q)
> -__LD2_LANE_FUNC (int16x8x2_t, int16_t, 8h, h, s16, q)
> -__LD2_LANE_FUNC (int32x4x2_t, int32_t, 4s, s, s32, q)
> -__LD2_LANE_FUNC (int64x2x2_t, int64_t, 2d, d, s64, q)
> -__LD2_LANE_FUNC (uint8x16x2_t, uint8_t, 16b, b, u8, q)
> -__LD2_LANE_FUNC (uint16x8x2_t, uint16_t, 8h, h, u16, q)
> -__LD2_LANE_FUNC (uint32x4x2_t, uint32_t, 4s, s, u32, q)
> -__LD2_LANE_FUNC (uint64x2x2_t, uint64_t, 2d, d, u64, q)
> +#define __LD2_LANE_FUNC(intype, vectype, largetype, ptrtype, \
> + mode, ptrmode, funcsuffix, signedtype) \
> +__extension__ static __inline intype __attribute__ ((__always_inline__)) \
> +vld2_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
> +{ \
> + __builtin_aarch64_simd_oi __o; \
> + largetype __temp; \
> + __temp.val[0] = \
> + vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \
> + __temp.val[1] = \
> + vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \
> + __o = __builtin_aarch64_set_qregoi##mode (__o, \
> + (signedtype) __temp.val[0], \
> + 0); \
> + __o = __builtin_aarch64_set_qregoi##mode (__o, \
> + (signedtype) __temp.val[1], \
> + 1); \
> + __o = __builtin_aarch64_ld2_lane##mode ( \
> + (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
> + __b.val[0] = (vectype) __builtin_aarch64_get_dregoidi (__o, 0); \
> + __b.val[1] = (vectype) __builtin_aarch64_get_dregoidi (__o, 1); \
> + return __b; \
> +}
> +
> +__LD2_LANE_FUNC (float32x2x2_t, float32x2_t, float32x4x2_t, float32_t, v4sf,
> + sf, f32, float32x4_t)
> +__LD2_LANE_FUNC (float64x1x2_t, float64x1_t, float64x2x2_t, float64_t, v2df,
> + df, f64, float64x2_t)
> +__LD2_LANE_FUNC (poly8x8x2_t, poly8x8_t, poly8x16x2_t, poly8_t, v16qi, qi, p8,
> + int8x16_t)
> +__LD2_LANE_FUNC (poly16x4x2_t, poly16x4_t, poly16x8x2_t, poly16_t, v8hi, hi,
> + p16, int16x8_t)
> +__LD2_LANE_FUNC (int8x8x2_t, int8x8_t, int8x16x2_t, int8_t, v16qi, qi, s8,
> + int8x16_t)
> +__LD2_LANE_FUNC (int16x4x2_t, int16x4_t, int16x8x2_t, int16_t, v8hi, hi, s16,
> + int16x8_t)
> +__LD2_LANE_FUNC (int32x2x2_t, int32x2_t, int32x4x2_t, int32_t, v4si, si, s32,
> + int32x4_t)
> +__LD2_LANE_FUNC (int64x1x2_t, int64x1_t, int64x2x2_t, int64_t, v2di, di, s64,
> + int64x2_t)
> +__LD2_LANE_FUNC (uint8x8x2_t, uint8x8_t, uint8x16x2_t, uint8_t, v16qi, qi, u8,
> + int8x16_t)
> +__LD2_LANE_FUNC (uint16x4x2_t, uint16x4_t, uint16x8x2_t, uint16_t, v8hi, hi,
> + u16, int16x8_t)
> +__LD2_LANE_FUNC (uint32x2x2_t, uint32x2_t, uint32x4x2_t, uint32_t, v4si, si,
> + u32, int32x4_t)
> +__LD2_LANE_FUNC (uint64x1x2_t, uint64x1_t, uint64x2x2_t, uint64_t, v2di, di,
> + u64, int64x2_t)
> +
> +#undef __LD2_LANE_FUNC
> +#define __LD2_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
> +__extension__ static __inline intype __attribute__ ((__always_inline__)) \
> +vld2q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
> +{ \
> + __builtin_aarch64_simd_oi __o; \
> + intype ret; \
> + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[0], 0); \
> + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[1], 1); \
> + __o = __builtin_aarch64_ld2_lane##mode ( \
> + (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
> + ret.val[0] = (vtype) __builtin_aarch64_get_qregoiv4si (__o, 0); \
> + ret.val[1] = (vtype) __builtin_aarch64_get_qregoiv4si (__o, 1); \
> + return ret; \
> +}
> +
> +__LD2_LANE_FUNC (float32x4x2_t, float32x4_t, float32_t, v4sf, sf, f32)
> +__LD2_LANE_FUNC (float64x2x2_t, float64x2_t, float64_t, v2df, df, f64)
> +__LD2_LANE_FUNC (poly8x16x2_t, poly8x16_t, poly8_t, v16qi, qi, p8)
> +__LD2_LANE_FUNC (poly16x8x2_t, poly16x8_t, poly16_t, v8hi, hi, p16)
> +__LD2_LANE_FUNC (int8x16x2_t, int8x16_t, int8_t, v16qi, qi, s8)
> +__LD2_LANE_FUNC (int16x8x2_t, int16x8_t, int16_t, v8hi, hi, s16)
> +__LD2_LANE_FUNC (int32x4x2_t, int32x4_t, int32_t, v4si, si, s32)
> +__LD2_LANE_FUNC (int64x2x2_t, int64x2_t, int64_t, v2di, di, s64)
> +__LD2_LANE_FUNC (uint8x16x2_t, uint8x16_t, uint8_t, v16qi, qi, u8)
> +__LD2_LANE_FUNC (uint16x8x2_t, uint16x8_t, uint16_t, v8hi, hi, u16)
> +__LD2_LANE_FUNC (uint32x4x2_t, uint32x4_t, uint32_t, v4si, si, u32)
> +__LD2_LANE_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, v2di, di, u64)
>
> #define __LD3R_FUNC(rettype, structtype, ptrtype, \
> regsuffix, funcsuffix, Q) \
> @@ -11887,47 +11923,91 @@ __LD3R_FUNC (uint16x8x3_t, uint16x3_t, uint16_t, 8h, u16, q)
> __LD3R_FUNC (uint32x4x3_t, uint32x3_t, uint32_t, 4s, u32, q)
> __LD3R_FUNC (uint64x2x3_t, uint64x3_t, uint64_t, 2d, u64, q)
>
> -#define __LD3_LANE_FUNC(rettype, ptrtype, regsuffix, \
> - lnsuffix, funcsuffix, Q) \
> - __extension__ static __inline rettype \
> - __attribute__ ((__always_inline__)) \
> - vld3 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \
> - rettype b, const int c) \
> - { \
> - rettype result; \
> - __asm__ ("ld1 {v16." #regsuffix " - v18." #regsuffix "}, %1\n\t" \
> - "ld3 {v16." #lnsuffix " - v18." #lnsuffix "}[%3], %2\n\t" \
> - "st1 {v16." #regsuffix " - v18." #regsuffix "}, %0\n\t" \
> - : "=Q"(result) \
> - : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c) \
> - : "memory", "v16", "v17", "v18"); \
> - return result; \
> - }
> -
> -__LD3_LANE_FUNC (int8x8x3_t, uint8_t, 8b, b, s8,)
> -__LD3_LANE_FUNC (float32x2x3_t, float32_t, 2s, s, f32,)
> -__LD3_LANE_FUNC (float64x1x3_t, float64_t, 1d, d, f64,)
> -__LD3_LANE_FUNC (poly8x8x3_t, poly8_t, 8b, b, p8,)
> -__LD3_LANE_FUNC (poly16x4x3_t, poly16_t, 4h, h, p16,)
> -__LD3_LANE_FUNC (int16x4x3_t, int16_t, 4h, h, s16,)
> -__LD3_LANE_FUNC (int32x2x3_t, int32_t, 2s, s, s32,)
> -__LD3_LANE_FUNC (int64x1x3_t, int64_t, 1d, d, s64,)
> -__LD3_LANE_FUNC (uint8x8x3_t, uint8_t, 8b, b, u8,)
> -__LD3_LANE_FUNC (uint16x4x3_t, uint16_t, 4h, h, u16,)
> -__LD3_LANE_FUNC (uint32x2x3_t, uint32_t, 2s, s, u32,)
> -__LD3_LANE_FUNC (uint64x1x3_t, uint64_t, 1d, d, u64,)
> -__LD3_LANE_FUNC (float32x4x3_t, float32_t, 4s, s, f32, q)
> -__LD3_LANE_FUNC (float64x2x3_t, float64_t, 2d, d, f64, q)
> -__LD3_LANE_FUNC (poly8x16x3_t, poly8_t, 16b, b, p8, q)
> -__LD3_LANE_FUNC (poly16x8x3_t, poly16_t, 8h, h, p16, q)
> -__LD3_LANE_FUNC (int8x16x3_t, int8_t, 16b, b, s8, q)
> -__LD3_LANE_FUNC (int16x8x3_t, int16_t, 8h, h, s16, q)
> -__LD3_LANE_FUNC (int32x4x3_t, int32_t, 4s, s, s32, q)
> -__LD3_LANE_FUNC (int64x2x3_t, int64_t, 2d, d, s64, q)
> -__LD3_LANE_FUNC (uint8x16x3_t, uint8_t, 16b, b, u8, q)
> -__LD3_LANE_FUNC (uint16x8x3_t, uint16_t, 8h, h, u16, q)
> -__LD3_LANE_FUNC (uint32x4x3_t, uint32_t, 4s, s, u32, q)
> -__LD3_LANE_FUNC (uint64x2x3_t, uint64_t, 2d, d, u64, q)
> +#define __LD3_LANE_FUNC(intype, vectype, largetype, ptrtype, \
> + mode, ptrmode, funcsuffix, signedtype) \
> +__extension__ static __inline intype __attribute__ ((__always_inline__)) \
> +vld3_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
> +{ \
> + __builtin_aarch64_simd_ci __o; \
> + largetype __temp; \
> + __temp.val[0] = \
> + vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \
> + __temp.val[1] = \
> + vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \
> + __temp.val[2] = \
> + vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \
> + __o = __builtin_aarch64_set_qregci##mode (__o, \
> + (signedtype) __temp.val[0], \
> + 0); \
> + __o = __builtin_aarch64_set_qregci##mode (__o, \
> + (signedtype) __temp.val[1], \
> + 1); \
> + __o = __builtin_aarch64_set_qregci##mode (__o, \
> + (signedtype) __temp.val[2], \
> + 2); \
> + __o = __builtin_aarch64_ld3_lane##mode ( \
> + (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
> + __b.val[0] = (vectype) __builtin_aarch64_get_dregcidi (__o, 0); \
> + __b.val[1] = (vectype) __builtin_aarch64_get_dregcidi (__o, 1); \
> + __b.val[2] = (vectype) __builtin_aarch64_get_dregcidi (__o, 2); \
> + return __b; \
> +}
> +
> +__LD3_LANE_FUNC (float32x2x3_t, float32x2_t, float32x4x3_t, float32_t, v4sf,
> + sf, f32, float32x4_t)
> +__LD3_LANE_FUNC (float64x1x3_t, float64x1_t, float64x2x3_t, float64_t, v2df,
> + df, f64, float64x2_t)
> +__LD3_LANE_FUNC (poly8x8x3_t, poly8x8_t, poly8x16x3_t, poly8_t, v16qi, qi, p8,
> + int8x16_t)
> +__LD3_LANE_FUNC (poly16x4x3_t, poly16x4_t, poly16x8x3_t, poly16_t, v8hi, hi,
> + p16, int16x8_t)
> +__LD3_LANE_FUNC (int8x8x3_t, int8x8_t, int8x16x3_t, int8_t, v16qi, qi, s8,
> + int8x16_t)
> +__LD3_LANE_FUNC (int16x4x3_t, int16x4_t, int16x8x3_t, int16_t, v8hi, hi, s16,
> + int16x8_t)
> +__LD3_LANE_FUNC (int32x2x3_t, int32x2_t, int32x4x3_t, int32_t, v4si, si, s32,
> + int32x4_t)
> +__LD3_LANE_FUNC (int64x1x3_t, int64x1_t, int64x2x3_t, int64_t, v2di, di, s64,
> + int64x2_t)
> +__LD3_LANE_FUNC (uint8x8x3_t, uint8x8_t, uint8x16x3_t, uint8_t, v16qi, qi, u8,
> + int8x16_t)
> +__LD3_LANE_FUNC (uint16x4x3_t, uint16x4_t, uint16x8x3_t, uint16_t, v8hi, hi,
> + u16, int16x8_t)
> +__LD3_LANE_FUNC (uint32x2x3_t, uint32x2_t, uint32x4x3_t, uint32_t, v4si, si,
> + u32, int32x4_t)
> +__LD3_LANE_FUNC (uint64x1x3_t, uint64x1_t, uint64x2x3_t, uint64_t, v2di, di,
> + u64, int64x2_t)
> +
> +#undef __LD3_LANE_FUNC
> +#define __LD3_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
> +__extension__ static __inline intype __attribute__ ((__always_inline__)) \
> +vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
> +{ \
> + __builtin_aarch64_simd_ci __o; \
> + intype ret; \
> + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0); \
> + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1); \
> + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2); \
> + __o = __builtin_aarch64_ld3_lane##mode ( \
> + (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
> + ret.val[0] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 0); \
> + ret.val[1] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 1); \
> + ret.val[2] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 2); \
> + return ret; \
> +}
> +
> +__LD3_LANE_FUNC (float32x4x3_t, float32x4_t, float32_t, v4sf, sf, f32)
> +__LD3_LANE_FUNC (float64x2x3_t, float64x2_t, float64_t, v2df, df, f64)
> +__LD3_LANE_FUNC (poly8x16x3_t, poly8x16_t, poly8_t, v16qi, qi, p8)
> +__LD3_LANE_FUNC (poly16x8x3_t, poly16x8_t, poly16_t, v8hi, hi, p16)
> +__LD3_LANE_FUNC (int8x16x3_t, int8x16_t, int8_t, v16qi, qi, s8)
> +__LD3_LANE_FUNC (int16x8x3_t, int16x8_t, int16_t, v8hi, hi, s16)
> +__LD3_LANE_FUNC (int32x4x3_t, int32x4_t, int32_t, v4si, si, s32)
> +__LD3_LANE_FUNC (int64x2x3_t, int64x2_t, int64_t, v2di, di, s64)
> +__LD3_LANE_FUNC (uint8x16x3_t, uint8x16_t, uint8_t, v16qi, qi, u8)
> +__LD3_LANE_FUNC (uint16x8x3_t, uint16x8_t, uint16_t, v8hi, hi, u16)
> +__LD3_LANE_FUNC (uint32x4x3_t, uint32x4_t, uint32_t, v4si, si, u32)
> +__LD3_LANE_FUNC (uint64x2x3_t, uint64x2_t, uint64_t, v2di, di, u64)
>
> #define __LD4R_FUNC(rettype, structtype, ptrtype, \
> regsuffix, funcsuffix, Q) \
> @@ -11969,47 +12049,100 @@ __LD4R_FUNC (uint16x8x4_t, uint16x4_t, uint16_t, 8h, u16, q)
> __LD4R_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, 4s, u32, q)
> __LD4R_FUNC (uint64x2x4_t, uint64x4_t, uint64_t, 2d, u64, q)
>
> -#define __LD4_LANE_FUNC(rettype, ptrtype, regsuffix, \
> - lnsuffix, funcsuffix, Q) \
> - __extension__ static __inline rettype \
> - __attribute__ ((__always_inline__)) \
> - vld4 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \
> - rettype b, const int c) \
> - { \
> - rettype result; \
> - __asm__ ("ld1 {v16." #regsuffix " - v19." #regsuffix "}, %1\n\t" \
> - "ld4 {v16." #lnsuffix " - v19." #lnsuffix "}[%3], %2\n\t" \
> - "st1 {v16." #regsuffix " - v19." #regsuffix "}, %0\n\t" \
> - : "=Q"(result) \
> - : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c) \
> - : "memory", "v16", "v17", "v18", "v19"); \
> - return result; \
> - }
>
> -__LD4_LANE_FUNC (int8x8x4_t, uint8_t, 8b, b, s8,)
> -__LD4_LANE_FUNC (float32x2x4_t, float32_t, 2s, s, f32,)
> -__LD4_LANE_FUNC (float64x1x4_t, float64_t, 1d, d, f64,)
> -__LD4_LANE_FUNC (poly8x8x4_t, poly8_t, 8b, b, p8,)
> -__LD4_LANE_FUNC (poly16x4x4_t, poly16_t, 4h, h, p16,)
> -__LD4_LANE_FUNC (int16x4x4_t, int16_t, 4h, h, s16,)
> -__LD4_LANE_FUNC (int32x2x4_t, int32_t, 2s, s, s32,)
> -__LD4_LANE_FUNC (int64x1x4_t, int64_t, 1d, d, s64,)
> -__LD4_LANE_FUNC (uint8x8x4_t, uint8_t, 8b, b, u8,)
> -__LD4_LANE_FUNC (uint16x4x4_t, uint16_t, 4h, h, u16,)
> -__LD4_LANE_FUNC (uint32x2x4_t, uint32_t, 2s, s, u32,)
> -__LD4_LANE_FUNC (uint64x1x4_t, uint64_t, 1d, d, u64,)
> -__LD4_LANE_FUNC (float32x4x4_t, float32_t, 4s, s, f32, q)
> -__LD4_LANE_FUNC (float64x2x4_t, float64_t, 2d, d, f64, q)
> -__LD4_LANE_FUNC (poly8x16x4_t, poly8_t, 16b, b, p8, q)
> -__LD4_LANE_FUNC (poly16x8x4_t, poly16_t, 8h, h, p16, q)
> -__LD4_LANE_FUNC (int8x16x4_t, int8_t, 16b, b, s8, q)
> -__LD4_LANE_FUNC (int16x8x4_t, int16_t, 8h, h, s16, q)
> -__LD4_LANE_FUNC (int32x4x4_t, int32_t, 4s, s, s32, q)
> -__LD4_LANE_FUNC (int64x2x4_t, int64_t, 2d, d, s64, q)
> -__LD4_LANE_FUNC (uint8x16x4_t, uint8_t, 16b, b, u8, q)
> -__LD4_LANE_FUNC (uint16x8x4_t, uint16_t, 8h, h, u16, q)
> -__LD4_LANE_FUNC (uint32x4x4_t, uint32_t, 4s, s, u32, q)
> -__LD4_LANE_FUNC (uint64x2x4_t, uint64_t, 2d, d, u64, q)
> +#define __LD4_LANE_FUNC(intype, vectype, largetype, ptrtype, \
> + mode, ptrmode, funcsuffix, signedtype) \
> +__extension__ static __inline intype __attribute__ ((__always_inline__)) \
> +vld4_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
> +{ \
> + __builtin_aarch64_simd_xi __o; \
> + largetype __temp; \
> + __temp.val[0] = \
> + vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \
> + __temp.val[1] = \
> + vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \
> + __temp.val[2] = \
> + vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \
> + __temp.val[3] = \
> + vcombine_##funcsuffix (__b.val[3], vcreate_##funcsuffix (0)); \
> + __o = __builtin_aarch64_set_qregxi##mode (__o, \
> + (signedtype) __temp.val[0], \
> + 0); \
> + __o = __builtin_aarch64_set_qregxi##mode (__o, \
> + (signedtype) __temp.val[1], \
> + 1); \
> + __o = __builtin_aarch64_set_qregxi##mode (__o, \
> + (signedtype) __temp.val[2], \
> + 2); \
> + __o = __builtin_aarch64_set_qregxi##mode (__o, \
> + (signedtype) __temp.val[3], \
> + 3); \
> + __o = __builtin_aarch64_ld4_lane##mode ( \
> + (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
> + __b.val[0] = (vectype) __builtin_aarch64_get_dregxidi (__o, 0); \
> + __b.val[1] = (vectype) __builtin_aarch64_get_dregxidi (__o, 1); \
> + __b.val[2] = (vectype) __builtin_aarch64_get_dregxidi (__o, 2); \
> + __b.val[3] = (vectype) __builtin_aarch64_get_dregxidi (__o, 3); \
> + return __b; \
> +}
> +
> +__LD4_LANE_FUNC (float32x2x4_t, float32x2_t, float32x4x4_t, float32_t, v4sf,
> + sf, f32, float32x4_t)
> +__LD4_LANE_FUNC (float64x1x4_t, float64x1_t, float64x2x4_t, float64_t, v2df,
> + df, f64, float64x2_t)
> +__LD4_LANE_FUNC (poly8x8x4_t, poly8x8_t, poly8x16x4_t, poly8_t, v16qi, qi, p8,
> + int8x16_t)
> +__LD4_LANE_FUNC (poly16x4x4_t, poly16x4_t, poly16x8x4_t, poly16_t, v8hi, hi,
> + p16, int16x8_t)
> +__LD4_LANE_FUNC (int8x8x4_t, int8x8_t, int8x16x4_t, int8_t, v16qi, qi, s8,
> + int8x16_t)
> +__LD4_LANE_FUNC (int16x4x4_t, int16x4_t, int16x8x4_t, int16_t, v8hi, hi, s16,
> + int16x8_t)
> +__LD4_LANE_FUNC (int32x2x4_t, int32x2_t, int32x4x4_t, int32_t, v4si, si, s32,
> + int32x4_t)
> +__LD4_LANE_FUNC (int64x1x4_t, int64x1_t, int64x2x4_t, int64_t, v2di, di, s64,
> + int64x2_t)
> +__LD4_LANE_FUNC (uint8x8x4_t, uint8x8_t, uint8x16x4_t, uint8_t, v16qi, qi, u8,
> + int8x16_t)
> +__LD4_LANE_FUNC (uint16x4x4_t, uint16x4_t, uint16x8x4_t, uint16_t, v8hi, hi,
> + u16, int16x8_t)
> +__LD4_LANE_FUNC (uint32x2x4_t, uint32x2_t, uint32x4x4_t, uint32_t, v4si, si,
> + u32, int32x4_t)
> +__LD4_LANE_FUNC (uint64x1x4_t, uint64x1_t, uint64x2x4_t, uint64_t, v2di, di,
> + u64, int64x2_t)
> +
> +#undef __LD4_LANE_FUNC
> +#define __LD4_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
> +__extension__ static __inline intype __attribute__ ((__always_inline__)) \
> +vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
> +{ \
> + __builtin_aarch64_simd_xi __o; \
> + intype ret; \
> + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); \
> + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); \
> + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); \
> + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); \
> + __o = __builtin_aarch64_ld4_lane##mode ( \
> + (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
> + ret.val[0] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 0); \
> + ret.val[1] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 1); \
> + ret.val[2] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 2); \
> + ret.val[3] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 3); \
> + return ret; \
> +}
> +
> +__LD4_LANE_FUNC (float32x4x4_t, float32x4_t, float32_t, v4sf, sf, f32)
> +__LD4_LANE_FUNC (float64x2x4_t, float64x2_t, float64_t, v2df, df, f64)
> +__LD4_LANE_FUNC (poly8x16x4_t, poly8x16_t, poly8_t, v16qi, qi, p8)
> +__LD4_LANE_FUNC (poly16x8x4_t, poly16x8_t, poly16_t, v8hi, hi, p16)
> +__LD4_LANE_FUNC (int8x16x4_t, int8x16_t, int8_t, v16qi, qi, s8)
> +__LD4_LANE_FUNC (int16x8x4_t, int16x8_t, int16_t, v8hi, hi, s16)
> +__LD4_LANE_FUNC (int32x4x4_t, int32x4_t, int32_t, v4si, si, s32)
> +__LD4_LANE_FUNC (int64x2x4_t, int64x2_t, int64_t, v2di, di, s64)
> +__LD4_LANE_FUNC (uint8x16x4_t, uint8x16_t, uint8_t, v16qi, qi, u8)
> +__LD4_LANE_FUNC (uint16x8x4_t, uint16x8_t, uint16_t, v8hi, hi, u16)
> +__LD4_LANE_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, v4si, si, u32)
> +__LD4_LANE_FUNC (uint64x2x4_t, uint64x2_t, uint64_t, v2di, di, u64)
>
> #define __ST2_LANE_FUNC(intype, largetype, ptrtype, \
> mode, ptr_mode, funcsuffix, signedtype) \
> --
> 1.9.1
>
>
LGTM (but I can't approve it). Thanks for this patch.
Tejas.
next prev parent reply other threads:[~2014-10-09 15:09 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2014-10-08 17:27 [PATCH 0/2] [AARCH64,NEON] Improve vld[234](q?)_lane intrinsics v2 charles.baylis
2014-10-08 17:27 ` [PATCH 2/2] [AARCH64,NEON] Convert arm_neon.h to use new builtins for vld[234](q?)_lane_* charles.baylis
2014-10-09 15:14 ` Tejas Belagod [this message]
2014-10-23 10:16 ` Marcus Shawcroft
2014-10-23 17:52 ` Charles Baylis
2014-10-24 10:37 ` Marcus Shawcroft
2014-10-24 15:18 ` Charles Baylis
2014-10-08 17:27 ` [PATCH 1/2] [AARCH64,NEON] Add patterns + builtins for vld[234](q?)_lane_* intrinsics charles.baylis
2014-10-09 15:07 ` Tejas Belagod
2014-10-15 14:38 ` Charles Baylis
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=5436A51A.6050307@arm.com \
--to=tejas.belagod@arm.com \
--cc=Kyrylo.Tkachov@arm.com \
--cc=Marcus.Shawcroft@arm.com \
--cc=Richard.Earnshaw@arm.com \
--cc=charles.baylis@linaro.org \
--cc=gcc-patches@gcc.gnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).