From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 23208 invoked by alias); 22 Apr 2015 16:59:24 -0000 Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Archive: List-Post: List-Help: Sender: gcc-patches-owner@gcc.gnu.org Received: (qmail 23167 invoked by uid 89); 22 Apr 2015 16:59:23 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=-1.9 required=5.0 tests=AWL,BAYES_00,SPF_PASS autolearn=ham version=3.3.2 X-HELO: eu-smtp-delivery-143.mimecast.com Received: from eu-smtp-delivery-143.mimecast.com (HELO eu-smtp-delivery-143.mimecast.com) (146.101.78.143) by sourceware.org (qpsmtpd/0.93/v0.84-503-g423c35a) with ESMTP; Wed, 22 Apr 2015 16:59:21 +0000 Received: from cam-owa1.Emea.Arm.com (fw-tnat.cambridge.arm.com [217.140.96.140]) by uk-mta-5.uk.mimecast.lan; Wed, 22 Apr 2015 17:59:17 +0100 Received: from [10.2.207.65] ([10.1.2.79]) by cam-owa1.Emea.Arm.com with Microsoft SMTPSVC(6.0.3790.3959); Wed, 22 Apr 2015 17:59:17 +0100 Message-ID: <5537D365.80005@arm.com> Date: Wed, 22 Apr 2015 16:59:00 -0000 From: Alan Lawrence User-Agent: Thunderbird 2.0.0.24 (X11/20101213) MIME-Version: 1.0 To: "gcc-patches@gcc.gnu.org" Subject: [PATCH 3/14][ARM] Add float16x8_t intrinsics In-Reply-To: <5537D241.1000606@arm.com> X-MC-Unique: pud8XYfiSOiiZ_rkZdcwpg-1 Content-Type: multipart/mixed; boundary="------------010501010600040900050402" X-IsSubscribed: yes X-SW-Source: 2015-04/txt/msg01337.txt.bz2 This is a multi-part message in MIME format. 
--------------010501010600040900050402 Content-Type: text/plain; charset=WINDOWS-1252; format=flowed Content-Transfer-Encoding: quoted-printable Content-length: 1122 This is a respin of https://gcc.gnu.org/ml/gcc-patches/2015-01/msg01439.htm= l ,=20 again fixing a wrong 'lane index out of bounds' error for vgetq_lane_f16 an= d=20 vsetq_lane_f16 at -O0, and dropping vdupq_n_f16 and vdupq_lane_f16 as these= are=20 not in the ACLE spec. The vld1, vldN, vldN_lane and corresponding intrinsics follow in patch 4/14. Bootstrapped + check-gcc on arm-none-linux-gnueabihf. gcc/ChangeLog: * config/arm/arm_neon.h (vgetq_lane_f16, vsetq_lane_f16, vld1q_lane_f16, vld1q_dup_f16, vreinterpretq_p8_f16, vreinterpretq_p16_f16, vreinterpretq_f16_p8, vreinterpretq_f16_p16, vreinterpretq_f16_f32, vreinterpretq_f16_p64, vreinterpretq_f16_p128, vreinterpretq_f16_s64, vreinterpretq_f16_u64, vreinterpretq_f16_s8, vreinterpretq_f16_s16, vreinterpretq_f16_s32, vreinterpretq_f16_u8, vreinterpretq_f16_u16, vreinterpretq_f16_u32, vreinterpretq_f32_f16, vreinterpretq_p64_f16, vreinterpretq_p128_f16, vreinterpretq_s64_f16, vreinterpretq_u64_f16, vreinterpretq_s8_f16, vreinterpretq_s16_f16, vreinterpretq_s32_f16, vreinterpretq_u8_f16, vreinterpretq_u16_f16, vreinterpretq_u32_f16): New. 
--------------010501010600040900050402 Content-Type: text/x-patch; name=03_arm_f16x8_intrinsics.patch Content-Transfer-Encoding: quoted-printable Content-Disposition: inline; filename="03_arm_f16x8_intrinsics.patch" Content-length: 10378 diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index a958f63ca3084bf7cfaf6420e535d69f50efa6b6..db73c70c6e4ca99db62ff4055a3= 3bfe00db29039 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -5282,6 +5282,15 @@ vgetq_lane_s32 (int32x4_t __a, const int __b) return (int32_t)__builtin_neon_vget_lanev4si (__a, __b); } =20 +#define vgetq_lane_f16(__v, __idx) \ + __extension__ \ + ({ \ + float16x8_t __vec =3D (__v); \ + __builtin_arm_lane_check (8, __idx); \ + float16_t __res =3D __vec[__idx]; \ + __res; \ + }) + __extension__ static __inline float32_t __attribute__ ((__always_inline__)) vgetq_lane_f32 (float32x4_t __a, const int __b) { @@ -5424,6 +5433,16 @@ vsetq_lane_s32 (int32_t __a, int32x4_t __b, const in= t __c) return (int32x4_t)__builtin_neon_vset_lanev4si ((__builtin_neon_si) __a,= __b, __c); } =20 +#define vsetq_lane_f16(__e, __v, __idx) \ + __extension__ \ + ({ \ + float16_t __elem =3D (__e); \ + float16x8_t __vec =3D (__v); \ + __builtin_arm_lane_check (8, __idx); \ + __vec[__idx] =3D __elem; \ + __vec; \ + }) + __extension__ static __inline float32x4_t __attribute__ ((__always_inline_= _)) vsetq_lane_f32 (float32_t __a, float32x4_t __b, const int __c) { @@ -8907,6 +8926,12 @@ vld1q_lane_s32 (const int32_t * __a, int32x4_t __b, = const int __c) return (int32x4_t)__builtin_neon_vld1_lanev4si ((const __builtin_neon_si= *) __a, __b, __c); } =20 +__extension__ static __inline float16x8_t __attribute__ ((__always_inline_= _)) +vld1q_lane_f16 (const float16_t * __a, float16x8_t __b, const int __c) +{ + return vsetq_lane_f16 (*__a, __b, __c); +} + __extension__ static __inline float32x4_t __attribute__ ((__always_inline_= _)) vld1q_lane_f32 (const float32_t * __a, float32x4_t __b, const int 
__c) { @@ -9062,6 +9087,13 @@ vld1q_dup_s32 (const int32_t * __a) return (int32x4_t)__builtin_neon_vld1_dupv4si ((const __builtin_neon_si = *) __a); } =20 +__extension__ static __inline float16x8_t __attribute__ ((__always_inline_= _)) +vld1q_dup_f16 (const float16_t * __a) +{ + float16_t __f =3D *__a; + return (float16x8_t) { __f, __f, __f, __f, __f, __f, __f, __f }; +} + __extension__ static __inline float32x4_t __attribute__ ((__always_inline_= _)) vld1q_dup_f32 (const float32_t * __a) { @@ -12856,6 +12888,12 @@ vreinterpretq_p8_p16 (poly16x8_t __a) } =20 __extension__ static __inline poly8x16_t __attribute__ ((__always_inline__= )) +vreinterpretq_p8_f16 (float16x8_t __a) +{ + return (poly8x16_t) __a; +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__= )) vreinterpretq_p8_f32 (float32x4_t __a) { return (poly8x16_t)__builtin_neon_vreinterpretv16qiv4sf (__a); @@ -12932,6 +12970,12 @@ vreinterpretq_p16_p8 (poly8x16_t __a) } =20 __extension__ static __inline poly16x8_t __attribute__ ((__always_inline__= )) +vreinterpretq_p16_f16 (float16x8_t __a) +{ + return (poly16x8_t) __a; +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__= )) vreinterpretq_p16_f32 (float32x4_t __a) { return (poly16x8_t)__builtin_neon_vreinterpretv8hiv4sf (__a); @@ -13001,6 +13045,88 @@ vreinterpretq_p16_u32 (uint32x4_t __a) return (poly16x8_t)__builtin_neon_vreinterpretv8hiv4si ((int32x4_t) __a); } =20 +__extension__ static __inline float16x8_t __attribute__ ((__always_inline_= _)) +vreinterpretq_f16_p8 (poly8x16_t __a) +{ + return (float16x8_t) __a; +} + +__extension__ static __inline float16x8_t __attribute__ ((__always_inline_= _)) +vreinterpretq_f16_p16 (poly16x8_t __a) +{ + return (float16x8_t) __a; +} + +__extension__ static __inline float16x8_t __attribute__ ((__always_inline_= _)) +vreinterpretq_f16_f32 (float32x4_t __a) +{ + return (float16x8_t) __a; +} + +#ifdef __ARM_FEATURE_CRYPTO +__extension__ static __inline float16x8_t 
__attribute__ ((__always_inline_= _)) +vreinterpretq_f16_p64 (poly64x2_t __a) +{ + return (float16x8_t) __a; +} + +#endif +#ifdef __ARM_FEATURE_CRYPTO +__extension__ static __inline float16x8_t __attribute__ ((__always_inline_= _)) +vreinterpretq_f16_p128 (poly128_t __a) +{ + return (float16x8_t) __a; +} + +#endif +__extension__ static __inline float16x8_t __attribute__ ((__always_inline_= _)) +vreinterpretq_f16_s64 (int64x2_t __a) +{ + return (float16x8_t) __a; +} + +__extension__ static __inline float16x8_t __attribute__ ((__always_inline_= _)) +vreinterpretq_f16_u64 (uint64x2_t __a) +{ + return (float16x8_t) __a; +} + +__extension__ static __inline float16x8_t __attribute__ ((__always_inline_= _)) +vreinterpretq_f16_s8 (int8x16_t __a) +{ + return (float16x8_t) __a; +} + +__extension__ static __inline float16x8_t __attribute__ ((__always_inline_= _)) +vreinterpretq_f16_s16 (int16x8_t __a) +{ + return (float16x8_t) __a; +} + +__extension__ static __inline float16x8_t __attribute__ ((__always_inline_= _)) +vreinterpretq_f16_s32 (int32x4_t __a) +{ + return (float16x8_t) __a; +} + +__extension__ static __inline float16x8_t __attribute__ ((__always_inline_= _)) +vreinterpretq_f16_u8 (uint8x16_t __a) +{ + return (float16x8_t) __a; +} + +__extension__ static __inline float16x8_t __attribute__ ((__always_inline_= _)) +vreinterpretq_f16_u16 (uint16x8_t __a) +{ + return (float16x8_t) __a; +} + +__extension__ static __inline float16x8_t __attribute__ ((__always_inline_= _)) +vreinterpretq_f16_u32 (uint32x4_t __a) +{ + return (float16x8_t) __a; +} + __extension__ static __inline float32x4_t __attribute__ ((__always_inline_= _)) vreinterpretq_f32_p8 (poly8x16_t __a) { @@ -13013,6 +13139,12 @@ vreinterpretq_f32_p16 (poly16x8_t __a) return (float32x4_t)__builtin_neon_vreinterpretv4sfv8hi ((int16x8_t) __a= ); } =20 +__extension__ static __inline float32x4_t __attribute__ ((__always_inline_= _)) +vreinterpretq_f32_f16 (float16x8_t __a) +{ + return (float32x4_t) __a; +} + #ifdef 
__ARM_FEATURE_CRYPTO __extension__ static __inline float32x4_t __attribute__ ((__always_inline_= _)) vreinterpretq_f32_p64 (poly64x2_t __a) @@ -13095,6 +13227,14 @@ vreinterpretq_p64_p16 (poly16x8_t __a) #endif #ifdef __ARM_FEATURE_CRYPTO __extension__ static __inline poly64x2_t __attribute__ ((__always_inline__= )) +vreinterpretq_p64_f16 (float16x8_t __a) +{ + return (poly64x2_t) __a; +} + +#endif +#ifdef __ARM_FEATURE_CRYPTO +__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__= )) vreinterpretq_p64_f32 (float32x4_t __a) { return (poly64x2_t)__builtin_neon_vreinterpretv2div4sf (__a); @@ -13191,6 +13331,14 @@ vreinterpretq_p128_p16 (poly16x8_t __a) #endif #ifdef __ARM_FEATURE_CRYPTO __extension__ static __inline poly128_t __attribute__ ((__always_inline__)) +vreinterpretq_p128_f16 (float16x8_t __a) +{ + return (poly128_t) __a; +} + +#endif +#ifdef __ARM_FEATURE_CRYPTO +__extension__ static __inline poly128_t __attribute__ ((__always_inline__)) vreinterpretq_p128_f32 (float32x4_t __a) { return (poly128_t)__builtin_neon_vreinterprettiv4sf (__a); @@ -13282,6 +13430,12 @@ vreinterpretq_s64_p16 (poly16x8_t __a) } =20 __extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_s64_f16 (float16x8_t __a) +{ + return (int64x2_t) __a; +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) vreinterpretq_s64_f32 (float32x4_t __a) { return (int64x2_t)__builtin_neon_vreinterpretv2div4sf (__a); @@ -13358,6 +13512,12 @@ vreinterpretq_u64_p16 (poly16x8_t __a) } =20 __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__= )) +vreinterpretq_u64_f16 (float16x8_t __a) +{ + return (uint64x2_t) __a; +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__= )) vreinterpretq_u64_f32 (float32x4_t __a) { return (uint64x2_t)__builtin_neon_vreinterpretv2div4sf (__a); @@ -13434,6 +13594,12 @@ vreinterpretq_s8_p16 (poly16x8_t __a) } =20 __extension__ static __inline 
int8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_s8_f16 (float16x8_t __a) +{ + return (int8x16_t) __a; +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) vreinterpretq_s8_f32 (float32x4_t __a) { return (int8x16_t)__builtin_neon_vreinterpretv16qiv4sf (__a); @@ -13510,6 +13676,12 @@ vreinterpretq_s16_p16 (poly16x8_t __a) } =20 __extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_s16_f16 (float16x8_t __a) +{ + return (int16x8_t) __a; +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) vreinterpretq_s16_f32 (float32x4_t __a) { return (int16x8_t)__builtin_neon_vreinterpretv8hiv4sf (__a); @@ -13586,6 +13758,12 @@ vreinterpretq_s32_p16 (poly16x8_t __a) } =20 __extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_s32_f16 (float16x8_t __a) +{ + return (int32x4_t)__builtin_neon_vreinterpretv4siv8hi ((int16x8_t) __a); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) vreinterpretq_s32_f32 (float32x4_t __a) { return (int32x4_t)__builtin_neon_vreinterpretv4siv4sf (__a); @@ -13662,6 +13840,12 @@ vreinterpretq_u8_p16 (poly16x8_t __a) } =20 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__= )) +vreinterpretq_u8_f16 (float16x8_t __a) +{ + return (uint8x16_t) __a; +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__= )) vreinterpretq_u8_f32 (float32x4_t __a) { return (uint8x16_t)__builtin_neon_vreinterpretv16qiv4sf (__a); @@ -13738,6 +13922,12 @@ vreinterpretq_u16_p16 (poly16x8_t __a) } =20 __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__= )) +vreinterpretq_u16_f16 (float16x8_t __a) +{ + return (uint16x8_t) __a; +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__= )) vreinterpretq_u16_f32 (float32x4_t __a) { return (uint16x8_t)__builtin_neon_vreinterpretv8hiv4sf (__a); @@ -13814,6 +14004,12 @@ 
vreinterpretq_u32_p16 (poly16x8_t __a) } =20 __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__= )) +vreinterpretq_u32_f16 (float16x8_t __a) +{ + return (uint32x4_t) __a; +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__= )) vreinterpretq_u32_f32 (float32x4_t __a) { return (uint32x4_t)__builtin_neon_vreinterpretv4siv4sf (__a); --------------010501010600040900050402--