public inbox for gcc-patches@gcc.gnu.org
* [PATCH 0/14][ARM/AArch64] __FP16 support, vectors, intrinsics, testsuite
@ 2015-04-22 16:54 Alan Lawrence
  2015-04-22 16:58 ` [PATCH 1/14][ARM] Add float16x4_t intrinsics Alan Lawrence
                   ` (14 more replies)
  0 siblings, 15 replies; 28+ messages in thread
From: Alan Lawrence @ 2015-04-22 16:54 UTC (permalink / raw)
  To: gcc-patches

This patch series adds support for ARM Neon float16x4_t and float16x8_t vector 
types and intrinsics, and the __fp16 type, on both ARM and AArch64, and extends 
the tests in Christophe Lyon's advsimd-intrinsics testsuite to cover these. (I 
chose to extend the existing tests rather than add new ones, as the majority of 
f16 intrinsics just move blocks of 16 bits around and do not depend on HW 
support; I added new files for the conversion intrinsics.)
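To make the "moving blocks of 16 bits" point concrete, here is a minimal 
illustrative sketch (my own, not from the testsuite, and assuming the patch 
1/14 intrinsics are applied): reinterpreting to and from float16x4_t is a 
pure lane move, so it needs no FP16 hardware:

    #include <arm_neon.h>

    /* Round-trips 16-bit lanes through float16x4_t.  Only bits move;
       no half-precision arithmetic is performed, so no HW support for
       FP16 is required.  */
    uint16x4_t
    roundtrip_f16 (uint16x4_t bits)
    {
      return vreinterpret_u16_f16 (vreinterpret_f16_u16 (bits));
    }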

The ARM parts were previously posted at 
https://gcc.gnu.org/ml/gcc-patches/2015-01/msg01434.html but have had some fixes 
following the testsuite additions. The ARM patches also depend upon my ARM 
lane-checking improvements at 
https://gcc.gnu.org/ml/gcc-patches/2015-01/msg01422.html , which I have just pinged.

I've cross-tested most patches individually on baremetal arm-none-eabi, 
aarch64-none-elf and aarch64_be-none-elf, and bootstrapped each patch in the 
series on (the relevant one of) arm-none-linux-gnueabihf and aarch64-none-linux-gnu.

OK for trunk?

Cheers, Alan

* [PATCH 1/14][ARM] Add float16x4_t intrinsics
  2015-04-22 16:54 [PATCH 0/14][ARM/AArch64] __FP16 support, vectors, intrinsics, testsuite Alan Lawrence
@ 2015-04-22 16:58 ` Alan Lawrence
  2015-04-22 16:59 ` [PATCH 2/14][ARM] Add float16x8_t type Alan Lawrence
                   ` (13 subsequent siblings)
  14 siblings, 0 replies; 28+ messages in thread
From: Alan Lawrence @ 2015-04-22 16:58 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1185 bytes --]

This is a respin of https://gcc.gnu.org/ml/gcc-patches/2015-01/msg01437.html , 
but fixes a spurious 'lane index out of bounds' error on vget_lane_f16 and 
vset_lane_f16, and drops vdup_n_f16 and vdup_lane_f16, as these are not in the 
ACLE spec. As previously, these use GCC vector extensions to maximise mid-end 
optimization, and do not attempt to support bigendian.

The vld1, vldN, vldN_lane and corresponding vst... intrinsics follow in patch 4/14.
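As a usage sketch (illustrative only, not part of this patch): since __fp16 
values cannot be passed to or returned from functions, the lane accessors are 
macros, but they still expand to plain vector subscripts:

    #include <arm_neon.h>

    /* Copies lane 1 of V into lane 3.  float16_t locals are fine; only
       __fp16 function parameters and return values are rejected, which
       is why vget_lane_f16/vset_lane_f16 are macros, not functions.  */
    float16x4_t
    copy_lane (float16x4_t v)
    {
      float16_t x = vget_lane_f16 (v, 1);
      return vset_lane_f16 (x, v, 3);
    }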

Bootstrapped + check-gcc on arm-none-linux-gnueabihf.

gcc/ChangeLog:

	* config/arm/arm_neon.h (float16_t, vget_lane_f16, vset_lane_f16,
	vcreate_f16, vld1_lane_f16, vld1_dup_f16, vreinterpret_p8_f16,
	vreinterpret_p16_f16, vreinterpret_f16_p8, vreinterpret_f16_p16,
	vreinterpret_f16_f32, vreinterpret_f16_p64, vreinterpret_f16_s64,
	vreinterpret_f16_u64, vreinterpret_f16_s8, vreinterpret_f16_s16,
	vreinterpret_f16_s32, vreinterpret_f16_u8, vreinterpret_f16_u16,
	vreinterpret_f16_u32, vreinterpret_f32_f16, vreinterpret_p64_f16,
	vreinterpret_s64_f16, vreinterpret_u64_f16, vreinterpret_s8_f16,
	vreinterpret_s16_f16, vreinterpret_s32_f16, vreinterpret_u8_f16,
	vreinterpret_u16_f16, vreinterpret_u32_f16): New.

[-- Attachment #2: 01_arm_f16x4_intrinsics.patch --]
[-- Type: text/x-patch; name=01_arm_f16x4_intrinsics.patch, Size: 10683 bytes --]

diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index c923e294cda2f8cb88e4b1ccca6fd4f13a3ed98d..b4100c88f83bc603377912b7aab085532178ef99 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -41,6 +41,7 @@ typedef __simd64_int8_t int8x8_t;
 typedef __simd64_int16_t int16x4_t;
 typedef __simd64_int32_t int32x2_t;
 typedef __builtin_neon_di int64x1_t;
+typedef __builtin_neon_hf float16_t;
 typedef __simd64_float16_t float16x4_t;
 typedef __simd64_float32_t float32x2_t;
 typedef __simd64_poly8_t poly8x8_t;
@@ -5201,6 +5202,19 @@ vget_lane_s32 (int32x2_t __a, const int __b)
   return (int32_t)__builtin_neon_vget_lanev2si (__a, __b);
 }
 
+/* Functions cannot accept or return __FP16 types.  Even if the function
+   were marked always-inline so there were no call sites, the declaration
+   would nonetheless raise an error.  Hence, we must use a macro instead.  */
+
+#define vget_lane_f16(__v, __idx)		\
+  __extension__					\
+    ({						\
+      float16x4_t __vec = (__v);		\
+      __builtin_arm_lane_check (4, __idx);	\
+      float16_t __res = __vec[__idx];		\
+      __res;					\
+    })
+
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vget_lane_f32 (float32x2_t __a, const int __b)
 {
@@ -5333,6 +5347,16 @@ vset_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
   return (int32x2_t)__builtin_neon_vset_lanev2si ((__builtin_neon_si) __a, __b, __c);
 }
 
+#define vset_lane_f16(__e, __v, __idx)		\
+  __extension__					\
+    ({						\
+      float16_t __elem = (__e);			\
+      float16x4_t __vec = (__v);		\
+      __builtin_arm_lane_check (4, __idx);	\
+      __vec[__idx] = __elem;			\
+      __vec;					\
+    })
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vset_lane_f32 (float32_t __a, float32x2_t __b, const int __c)
 {
@@ -5479,6 +5503,12 @@ vcreate_s64 (uint64_t __a)
   return (int64x1_t)__builtin_neon_vcreatedi ((__builtin_neon_di) __a);
 }
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vcreate_f16 (uint64_t __a)
+{
+  return (float16x4_t) __a;
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vcreate_f32 (uint64_t __a)
 {
@@ -8796,6 +8826,12 @@ vld1_lane_s32 (const int32_t * __a, int32x2_t __b, const int __c)
   return (int32x2_t)__builtin_neon_vld1_lanev2si ((const __builtin_neon_si *) __a, __b, __c);
 }
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vld1_lane_f16 (const float16_t * __a, float16x4_t __b, const int __c)
+{
+  return vset_lane_f16 (*__a, __b, __c);
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vld1_lane_f32 (const float32_t * __a, float32x2_t __b, const int __c)
 {
@@ -8944,6 +8980,13 @@ vld1_dup_s32 (const int32_t * __a)
   return (int32x2_t)__builtin_neon_vld1_dupv2si ((const __builtin_neon_si *) __a);
 }
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vld1_dup_f16 (const float16_t * __a)
+{
+  float16_t __f = *__a;
+  return (float16x4_t) { __f, __f, __f, __f };
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vld1_dup_f32 (const float32_t * __a)
 {
@@ -11828,6 +11871,12 @@ vreinterpret_p8_p16 (poly16x4_t __a)
 }
 
 __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vreinterpret_p8_f16 (float16x4_t __a)
+{
+  return (poly8x8_t) __a;
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
 vreinterpret_p8_f32 (float32x2_t __a)
 {
   return (poly8x8_t)__builtin_neon_vreinterpretv8qiv2sf (__a);
@@ -11896,6 +11945,12 @@ vreinterpret_p16_p8 (poly8x8_t __a)
 }
 
 __extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vreinterpret_p16_f16 (float16x4_t __a)
+{
+  return (poly16x4_t) __a;
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
 vreinterpret_p16_f32 (float32x2_t __a)
 {
   return (poly16x4_t)__builtin_neon_vreinterpretv4hiv2sf (__a);
@@ -11957,6 +12012,80 @@ vreinterpret_p16_u32 (uint32x2_t __a)
   return (poly16x4_t)__builtin_neon_vreinterpretv4hiv2si ((int32x2_t) __a);
 }
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_p8 (poly8x8_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_p16 (poly16x4_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_f32 (float32x2_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+#ifdef __ARM_FEATURE_CRYPTO
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_p64 (poly64x1_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+#endif
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_s64 (int64x1_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_u64 (uint64x1_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_s8 (int8x8_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_s16 (int16x4_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_s32 (int32x2_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_u8 (uint8x8_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_u16 (uint16x4_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_u32 (uint32x2_t __a)
+{
+  return (float16x4_t) __a;
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vreinterpret_f32_p8 (poly8x8_t __a)
 {
@@ -11969,6 +12098,12 @@ vreinterpret_f32_p16 (poly16x4_t __a)
   return (float32x2_t)__builtin_neon_vreinterpretv2sfv4hi ((int16x4_t) __a);
 }
 
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vreinterpret_f32_f16 (float16x4_t __a)
+{
+  return (float32x2_t) __a;
+}
+
 #ifdef __ARM_FEATURE_CRYPTO
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vreinterpret_f32_p64 (poly64x1_t __a)
@@ -12043,6 +12178,14 @@ vreinterpret_p64_p16 (poly16x4_t __a)
 #endif
 #ifdef __ARM_FEATURE_CRYPTO
 __extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+vreinterpret_p64_f16 (float16x4_t __a)
+{
+  return (poly64x1_t) __a;
+}
+
+#endif
+#ifdef __ARM_FEATURE_CRYPTO
+__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
 vreinterpret_p64_f32 (float32x2_t __a)
 {
   return (poly64x1_t)__builtin_neon_vreinterpretdiv2sf (__a);
@@ -12126,6 +12269,12 @@ vreinterpret_s64_p16 (poly16x4_t __a)
 }
 
 __extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vreinterpret_s64_f16 (float16x4_t __a)
+{
+  return (int64x1_t) __a;
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
 vreinterpret_s64_f32 (float32x2_t __a)
 {
   return (int64x1_t)__builtin_neon_vreinterpretdiv2sf (__a);
@@ -12194,6 +12343,12 @@ vreinterpret_u64_p16 (poly16x4_t __a)
 }
 
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vreinterpret_u64_f16 (float16x4_t __a)
+{
+  return (uint64x1_t) __a;
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vreinterpret_u64_f32 (float32x2_t __a)
 {
   return (uint64x1_t)__builtin_neon_vreinterpretdiv2sf (__a);
@@ -12262,6 +12417,12 @@ vreinterpret_s8_p16 (poly16x4_t __a)
 }
 
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vreinterpret_s8_f16 (float16x4_t __a)
+{
+  return (int8x8_t) __a;
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vreinterpret_s8_f32 (float32x2_t __a)
 {
   return (int8x8_t)__builtin_neon_vreinterpretv8qiv2sf (__a);
@@ -12330,6 +12491,12 @@ vreinterpret_s16_p16 (poly16x4_t __a)
 }
 
 __extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vreinterpret_s16_f16 (float16x4_t __a)
+{
+  return (int16x4_t) __a;
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
 vreinterpret_s16_f32 (float32x2_t __a)
 {
   return (int16x4_t)__builtin_neon_vreinterpretv4hiv2sf (__a);
@@ -12398,6 +12565,12 @@ vreinterpret_s32_p16 (poly16x4_t __a)
 }
 
 __extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vreinterpret_s32_f16 (float16x4_t __a)
+{
+  return (int32x2_t) __a;
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
 vreinterpret_s32_f32 (float32x2_t __a)
 {
   return (int32x2_t)__builtin_neon_vreinterpretv2siv2sf (__a);
@@ -12466,6 +12639,12 @@ vreinterpret_u8_p16 (poly16x4_t __a)
 }
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vreinterpret_u8_f16 (float16x4_t __a)
+{
+  return (uint8x8_t) __a;
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
 vreinterpret_u8_f32 (float32x2_t __a)
 {
   return (uint8x8_t)__builtin_neon_vreinterpretv8qiv2sf (__a);
@@ -12534,6 +12713,12 @@ vreinterpret_u16_p16 (poly16x4_t __a)
 }
 
 __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vreinterpret_u16_f16 (float16x4_t __a)
+{
+  return (uint16x4_t) __a;
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
 vreinterpret_u16_f32 (float32x2_t __a)
 {
   return (uint16x4_t)__builtin_neon_vreinterpretv4hiv2sf (__a);
@@ -12602,6 +12787,12 @@ vreinterpret_u32_p16 (poly16x4_t __a)
 }
 
 __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vreinterpret_u32_f16 (float16x4_t __a)
+{
+  return (uint32x2_t) __a;
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
 vreinterpret_u32_f32 (float32x2_t __a)
 {
   return (uint32x2_t)__builtin_neon_vreinterpretv2siv2sf (__a);

* [PATCH 3/14][ARM] Add float16x8_t intrinsics
  2015-04-22 16:54 [PATCH 0/14][ARM/AArch64] __FP16 support, vectors, intrinsics, testsuite Alan Lawrence
  2015-04-22 16:58 ` [PATCH 1/14][ARM] Add float16x4_t intrinsics Alan Lawrence
  2015-04-22 16:59 ` [PATCH 2/14][ARM] Add float16x8_t type Alan Lawrence
@ 2015-04-22 16:59 ` Alan Lawrence
  2015-04-22 16:59 ` [PATCH 4/14][ARM] Remaining float16 intrinsics: vld..., vst..., vget_low|high, vcombine Alan Lawrence
                   ` (11 subsequent siblings)
  14 siblings, 0 replies; 28+ messages in thread
From: Alan Lawrence @ 2015-04-22 16:59 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1132 bytes --]

This is a respin of https://gcc.gnu.org/ml/gcc-patches/2015-01/msg01439.html , 
again fixing a spurious 'lane index out of bounds' error for vgetq_lane_f16 and 
vsetq_lane_f16 at -O0, and dropping vdupq_n_f16 and vdupq_lane_f16 as these are 
not in the ACLE spec.

The vld1, vldN, vldN_lane and corresponding vst... intrinsics follow in patch 4/14.
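A corresponding Q-register sketch (illustrative only, not part of this 
patch): vld1q_lane_f16 is implemented on top of vsetq_lane_f16, so loading a 
single lane from memory looks like:

    #include <arm_neon.h>

    /* Loads one halfword from *p into lane 7 of v, leaving the other
       seven lanes unchanged.  */
    float16x8_t
    load_lane_7 (const float16_t *p, float16x8_t v)
    {
      return vld1q_lane_f16 (p, v, 7);
    }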

Bootstrapped + check-gcc on arm-none-linux-gnueabihf.

gcc/ChangeLog:

	* config/arm/arm_neon.h (vgetq_lane_f16, vsetq_lane_f16, vld1q_lane_f16,
	vld1q_dup_f16, vreinterpretq_p8_f16, vreinterpretq_p16_f16,
	vreinterpretq_f16_p8, vreinterpretq_f16_p16, vreinterpretq_f16_f32,
	vreinterpretq_f16_p64, vreinterpretq_f16_p128, vreinterpretq_f16_s64,
	vreinterpretq_f16_u64, vreinterpretq_f16_s8, vreinterpretq_f16_s16,
	vreinterpretq_f16_s32, vreinterpretq_f16_u8, vreinterpretq_f16_u16,
	vreinterpretq_f16_u32, vreinterpretq_f32_f16, vreinterpretq_p64_f16,
	vreinterpretq_p128_f16, vreinterpretq_s64_f16, vreinterpretq_u64_f16,
	vreinterpretq_s8_f16, vreinterpretq_s16_f16, vreinterpretq_s32_f16,
	vreinterpretq_u8_f16, vreinterpretq_u16_f16, vreinterpretq_u32_f16):
	New.

[-- Attachment #2: 03_arm_f16x8_intrinsics.patch --]
[-- Type: text/x-patch; name=03_arm_f16x8_intrinsics.patch, Size: 10576 bytes --]

diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index a958f63ca3084bf7cfaf6420e535d69f50efa6b6..db73c70c6e4ca99db62ff4055a33bfe00db29039 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -5282,6 +5282,15 @@ vgetq_lane_s32 (int32x4_t __a, const int __b)
   return (int32_t)__builtin_neon_vget_lanev4si (__a, __b);
 }
 
+#define vgetq_lane_f16(__v, __idx)		\
+  __extension__					\
+    ({						\
+      float16x8_t __vec = (__v);		\
+      __builtin_arm_lane_check (8, __idx);	\
+      float16_t __res = __vec[__idx];		\
+      __res;					\
+    })
+
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vgetq_lane_f32 (float32x4_t __a, const int __b)
 {
@@ -5424,6 +5433,16 @@ vsetq_lane_s32 (int32_t __a, int32x4_t __b, const int __c)
   return (int32x4_t)__builtin_neon_vset_lanev4si ((__builtin_neon_si) __a, __b, __c);
 }
 
+#define vsetq_lane_f16(__e, __v, __idx)		\
+  __extension__					\
+    ({						\
+      float16_t __elem = (__e);			\
+      float16x8_t __vec = (__v);		\
+      __builtin_arm_lane_check (8, __idx);	\
+      __vec[__idx] = __elem;			\
+      __vec;					\
+    })
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vsetq_lane_f32 (float32_t __a, float32x4_t __b, const int __c)
 {
@@ -8907,6 +8926,12 @@ vld1q_lane_s32 (const int32_t * __a, int32x4_t __b, const int __c)
   return (int32x4_t)__builtin_neon_vld1_lanev4si ((const __builtin_neon_si *) __a, __b, __c);
 }
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vld1q_lane_f16 (const float16_t * __a, float16x8_t __b, const int __c)
+{
+  return vsetq_lane_f16 (*__a, __b, __c);
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vld1q_lane_f32 (const float32_t * __a, float32x4_t __b, const int __c)
 {
@@ -9062,6 +9087,13 @@ vld1q_dup_s32 (const int32_t * __a)
   return (int32x4_t)__builtin_neon_vld1_dupv4si ((const __builtin_neon_si *) __a);
 }
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vld1q_dup_f16 (const float16_t * __a)
+{
+  float16_t __f = *__a;
+  return (float16x8_t) { __f, __f, __f, __f, __f, __f, __f, __f };
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vld1q_dup_f32 (const float32_t * __a)
 {
@@ -12856,6 +12888,12 @@ vreinterpretq_p8_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_p8_f16 (float16x8_t __a)
+{
+  return (poly8x16_t) __a;
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
 vreinterpretq_p8_f32 (float32x4_t __a)
 {
   return (poly8x16_t)__builtin_neon_vreinterpretv16qiv4sf (__a);
@@ -12932,6 +12970,12 @@ vreinterpretq_p16_p8 (poly8x16_t __a)
 }
 
 __extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_p16_f16 (float16x8_t __a)
+{
+  return (poly16x8_t) __a;
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
 vreinterpretq_p16_f32 (float32x4_t __a)
 {
   return (poly16x8_t)__builtin_neon_vreinterpretv8hiv4sf (__a);
@@ -13001,6 +13045,88 @@ vreinterpretq_p16_u32 (uint32x4_t __a)
   return (poly16x8_t)__builtin_neon_vreinterpretv8hiv4si ((int32x4_t) __a);
 }
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_p8 (poly8x16_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_p16 (poly16x8_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_f32 (float32x4_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+#ifdef __ARM_FEATURE_CRYPTO
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_p64 (poly64x2_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+#endif
+#ifdef __ARM_FEATURE_CRYPTO
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_p128 (poly128_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+#endif
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_s64 (int64x2_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_u64 (uint64x2_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_s8 (int8x16_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_s16 (int16x8_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_s32 (int32x4_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_u8 (uint8x16_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_u16 (uint16x8_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_u32 (uint32x4_t __a)
+{
+  return (float16x8_t) __a;
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vreinterpretq_f32_p8 (poly8x16_t __a)
 {
@@ -13013,6 +13139,12 @@ vreinterpretq_f32_p16 (poly16x8_t __a)
   return (float32x4_t)__builtin_neon_vreinterpretv4sfv8hi ((int16x8_t) __a);
 }
 
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_f32_f16 (float16x8_t __a)
+{
+  return (float32x4_t) __a;
+}
+
 #ifdef __ARM_FEATURE_CRYPTO
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vreinterpretq_f32_p64 (poly64x2_t __a)
@@ -13095,6 +13227,14 @@ vreinterpretq_p64_p16 (poly16x8_t __a)
 #endif
 #ifdef __ARM_FEATURE_CRYPTO
 __extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_p64_f16 (float16x8_t __a)
+{
+  return (poly64x2_t) __a;
+}
+
+#endif
+#ifdef __ARM_FEATURE_CRYPTO
+__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
 vreinterpretq_p64_f32 (float32x4_t __a)
 {
   return (poly64x2_t)__builtin_neon_vreinterpretv2div4sf (__a);
@@ -13191,6 +13331,14 @@ vreinterpretq_p128_p16 (poly16x8_t __a)
 #endif
 #ifdef __ARM_FEATURE_CRYPTO
 __extension__ static __inline poly128_t __attribute__ ((__always_inline__))
+vreinterpretq_p128_f16 (float16x8_t __a)
+{
+  return (poly128_t) __a;
+}
+
+#endif
+#ifdef __ARM_FEATURE_CRYPTO
+__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
 vreinterpretq_p128_f32 (float32x4_t __a)
 {
   return (poly128_t)__builtin_neon_vreinterprettiv4sf (__a);
@@ -13282,6 +13430,12 @@ vreinterpretq_s64_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_s64_f16 (float16x8_t __a)
+{
+  return (int64x2_t) __a;
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
 vreinterpretq_s64_f32 (float32x4_t __a)
 {
   return (int64x2_t)__builtin_neon_vreinterpretv2div4sf (__a);
@@ -13358,6 +13512,12 @@ vreinterpretq_u64_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_u64_f16 (float16x8_t __a)
+{
+  return (uint64x2_t) __a;
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
 vreinterpretq_u64_f32 (float32x4_t __a)
 {
   return (uint64x2_t)__builtin_neon_vreinterpretv2div4sf (__a);
@@ -13434,6 +13594,12 @@ vreinterpretq_s8_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_s8_f16 (float16x8_t __a)
+{
+  return (int8x16_t) __a;
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vreinterpretq_s8_f32 (float32x4_t __a)
 {
   return (int8x16_t)__builtin_neon_vreinterpretv16qiv4sf (__a);
@@ -13510,6 +13676,12 @@ vreinterpretq_s16_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_s16_f16 (float16x8_t __a)
+{
+  return (int16x8_t) __a;
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
 vreinterpretq_s16_f32 (float32x4_t __a)
 {
   return (int16x8_t)__builtin_neon_vreinterpretv8hiv4sf (__a);
@@ -13586,6 +13758,12 @@ vreinterpretq_s32_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_s32_f16 (float16x8_t __a)
+{
+  return (int32x4_t) __a;
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
 vreinterpretq_s32_f32 (float32x4_t __a)
 {
   return (int32x4_t)__builtin_neon_vreinterpretv4siv4sf (__a);
@@ -13662,6 +13840,12 @@ vreinterpretq_u8_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_u8_f16 (float16x8_t __a)
+{
+  return (uint8x16_t) __a;
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
 vreinterpretq_u8_f32 (float32x4_t __a)
 {
   return (uint8x16_t)__builtin_neon_vreinterpretv16qiv4sf (__a);
@@ -13738,6 +13922,12 @@ vreinterpretq_u16_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_u16_f16 (float16x8_t __a)
+{
+  return (uint16x8_t) __a;
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
 vreinterpretq_u16_f32 (float32x4_t __a)
 {
   return (uint16x8_t)__builtin_neon_vreinterpretv8hiv4sf (__a);
@@ -13814,6 +14004,12 @@ vreinterpretq_u32_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_u32_f16 (float16x8_t __a)
+{
+  return (uint32x4_t) __a;
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
 vreinterpretq_u32_f32 (float32x4_t __a)
 {
   return (uint32x4_t)__builtin_neon_vreinterpretv4siv4sf (__a);

* [PATCH 2/14][ARM] Add float16x8_t type
  2015-04-22 16:54 [PATCH 0/14][ARM/AArch64] __FP16 support, vectors, intrinsics, testsuite Alan Lawrence
  2015-04-22 16:58 ` [PATCH 1/14][ARM] Add float16x4_t intrinsics Alan Lawrence
@ 2015-04-22 16:59 ` Alan Lawrence
  2015-04-22 16:59 ` [PATCH 3/14][ARM] Add float16x8_t intrinsics Alan Lawrence
                   ` (12 subsequent siblings)
  14 siblings, 0 replies; 28+ messages in thread
From: Alan Lawrence @ 2015-04-22 16:59 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 118 bytes --]

Identical to https://gcc.gnu.org/ml/gcc-patches/2015-01/msg01438.html .

Bootstrapped on arm-none-linux-gnueabihf.
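For reference, a minimal sketch of what the new type gives us (illustrative 
only; it assumes vector-extension initializers work for V8HFmode, which is 
the same form vld1q_dup_f16 uses in patch 3/14):

    #include <arm_neon.h>

    /* A float16x8_t is eight half-precision lanes in one Q register
       (V8HFmode), available regardless of -mfp16-format.  */
    float16x8_t
    zero_v8hf (void)
    {
      return (float16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
    }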

[-- Attachment #2: 02_arm_float16x8_t.patch --]
[-- Type: text/x-patch; name=02_arm_float16x8_t.patch, Size: 3231 bytes --]

commit bc582bd6a0ed7c7c91fc834603fc573ed745b1a7
Author: Alan Lawrence <alan.lawrence@arm.com>
Date:   Mon Dec 8 18:40:24 2014 +0000

    Add float16x8_t + V8HFmode support (regardless of -mfp16-format)

diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c
index 3de2be7..2d97023 100644
--- a/gcc/config/arm/arm-builtins.c
+++ b/gcc/config/arm/arm-builtins.c
@@ -204,6 +204,7 @@ arm_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 #define di_UP    DImode
 #define v16qi_UP V16QImode
 #define v8hi_UP  V8HImode
+#define v8hf_UP  V8HFmode
 #define v4si_UP  V4SImode
 #define v4sf_UP  V4SFmode
 #define v2di_UP  V2DImode
@@ -839,6 +840,7 @@ arm_init_simd_builtin_types (void)
   /* Continue with standard types.  */
   arm_simd_types[Float16x4_t].eltype = arm_simd_floatHF_type_node;
   arm_simd_types[Float32x2_t].eltype = float_type_node;
+  arm_simd_types[Float16x8_t].eltype = arm_simd_floatHF_type_node;
   arm_simd_types[Float32x4_t].eltype = float_type_node;
 
   for (i = 0; i < nelts; i++)
diff --git a/gcc/config/arm/arm-simd-builtin-types.def b/gcc/config/arm/arm-simd-builtin-types.def
index bcbd20b..b178ae6 100644
--- a/gcc/config/arm/arm-simd-builtin-types.def
+++ b/gcc/config/arm/arm-simd-builtin-types.def
@@ -44,5 +44,7 @@
 
   ENTRY (Float16x4_t, V4HF, none, 64, float16, 18)
   ENTRY (Float32x2_t, V2SF, none, 64, float32, 18)
+
+  ENTRY (Float16x8_t, V8HF, none, 128, float16, 19)
   ENTRY (Float32x4_t, V4SF, none, 128, float32, 19)
 
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 4181f12..9de63fa 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -26367,7 +26367,8 @@ arm_vector_mode_supported_p (machine_mode mode)
 {
   /* Neon also supports V2SImode, etc. listed in the clause below.  */
   if (TARGET_NEON && (mode == V2SFmode || mode == V4SImode || mode == V8HImode
-      || mode == V4HFmode || mode == V16QImode || mode == V4SFmode || mode == V2DImode))
+      || mode == V4HFmode || mode == V16QImode || mode == V4SFmode
+      || mode == V2DImode || mode == V8HFmode))
     return true;
 
   if ((TARGET_NEON || TARGET_IWMMXT)
diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
index 8c10ea3..f0ef33f 100644
--- a/gcc/config/arm/arm.h
+++ b/gcc/config/arm/arm.h
@@ -1104,7 +1104,7 @@ extern int arm_arch_crc;
 /* Modes valid for Neon Q registers.  */
 #define VALID_NEON_QREG_MODE(MODE) \
   ((MODE) == V4SImode || (MODE) == V8HImode || (MODE) == V16QImode \
-   || (MODE) == V4SFmode || (MODE) == V2DImode)
+   || (MODE) == V8HFmode || (MODE) == V4SFmode || (MODE) == V2DImode)
 
 /* Structure modes valid for Neon registers.  */
 #define VALID_NEON_STRUCT_MODE(MODE) \
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index b4100c8..a958f63 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -58,6 +58,7 @@ typedef __simd128_int8_t int8x16_t;
 typedef __simd128_int16_t int16x8_t;
 typedef __simd128_int32_t int32x4_t;
 typedef __simd128_int64_t int64x2_t;
+typedef __simd128_float16_t float16x8_t;
 typedef __simd128_float32_t float32x4_t;
 typedef __simd128_poly8_t poly8x16_t;
 typedef __simd128_poly16_t poly16x8_t;

* [PATCH 4/14][ARM] Remaining float16 intrinsics: vld..., vst..., vget_low|high, vcombine
  2015-04-22 16:54 [PATCH 0/14][ARM/AArch64] __FP16 support, vectors, intrinsics, testsuite Alan Lawrence
                   ` (2 preceding siblings ...)
  2015-04-22 16:59 ` [PATCH 3/14][ARM] Add float16x8_t intrinsics Alan Lawrence
@ 2015-04-22 16:59 ` Alan Lawrence
  2015-04-22 17:06 ` [PATCH 5/14][AArch64] Add basic fp16 support Alan Lawrence
                   ` (10 subsequent siblings)
  14 siblings, 0 replies; 28+ messages in thread
From: Alan Lawrence @ 2015-04-22 16:59 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2732 bytes --]

This is a respin of https://gcc.gnu.org/ml/gcc-patches/2015-01/msg01440.html ; 
changes are to add several missing vst... intrinsics, and to add the missing 
V_uf_sclr iterator cases used by vec_extract.

These intrinsics are all implemented via patterns in neon.md, and are all tied 
together by iterators; I've tried to reduce coupling where I can.

Cross-tested check-gcc on arm-none-eabi;
Bootstrapped + check-gcc on arm-none-linux-gnueabihf.
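As a usage sketch of the new structure loads/stores (illustrative only, not 
from the testsuite):

    #include <arm_neon.h>

    /* De-interleaves eight halfwords from SRC into two float16x4_t
       vectors, then re-interleaves them into DST, via the new
       vld2_f16/vst2_f16.  */
    void
    copy_pairs (float16_t *dst, const float16_t *src)
    {
      float16x4x2_t pair = vld2_f16 (src);
      vst2_f16 (dst, pair);
    }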

gcc/ChangeLog:

	* config/arm/arm-builtins.c (VAR11, VAR12): New.
	* config/arm/arm_neon_builtins.def (vcombine, vld2_dup, vld3_dup,
	vld4_dup): Add v4hf variant.
	(vget_high, vget_low): Add v8hf variant.
	(vld1, vst1, vst1_lane, vld2, vld2_lane, vst2, vst2_lane, vld3,
	vld3_lane, vst3, vst3_lane, vld4, vld4_lane, vst4, vst4_lane): Add
	v4hf and v8hf variants.

	* config/arm/iterators.md (VD_LANE, VD_RE, VQ2, VQ_HS): New.
	(VDX): Add V4HF.
	(V_DOUBLE): Add case for V4HF.
	(VQX): Add V8HF.
	(V_HALF): Add case for V8HF.
	(VDQX): Add V4HF, V8HF.
	(V_elem, V_two_elem, V_three_elem, V_four_elem, V_cmp_result,
	V_uf_sclr, V_sz_elem, V_mode_nunits, q): Add cases for V4HF & V8HF.

	* config/arm/neon.md (vec_set<mode>_internal, vec_extract<mode>,
	neon_vget_lane<mode>_sext_internal, neon_vget_lane<mode>_zext_internal,
	vec_load_lanesoi<mode>, neon_vld2<mode>, vec_store_lanesoi<mode>,
	neon_vst2<mode>, vec_load_lanesci<mode>, neon_vld3<mode>,
	neon_vld3qa<mode>, neon_vld3qb<mode>, vec_store_lanesci<mode>,
	neon_vst3<mode>, neon_vst3qa<mode>, neon_vst3qb<mode>,
	vec_load_lanesxi<mode>, neon_vld4<mode>, neon_vld4qa<mode>,
	neon_vld4qb<mode>, vec_store_lanesxi<mode>, neon_vst4<mode>,
	neon_vst4qa<mode>, neon_vst4qb<mode>): Change VQ iterator to VQ2.

	(neon_vcreate, neon_vreinterpretv8qi<mode>,
	neon_vreinterpretv4hi<mode>, neon_vreinterpretv2si<mode>,
	neon_vreinterpretv2sf<mode>, neon_vreinterpretdi<mode>):
	Change VDX to VD_RE.

	(neon_vld2_lane<mode>, neon_vst2_lane<mode>, neon_vld3_lane<mode>,
	neon_vst3_lane<mode>, neon_vld4_lane<mode>, neon_vst4_lane<mode>):
	Change VD iterator to VD_LANE, and VMQ iterator to VQ_HS.

	* config/arm/arm_neon.h (float16x4x2_t, float16x8x2_t, float16x4x3_t,
	float16x8x3_t, float16x4x4_t, float16x8x4_t, vcombine_f16,
	vget_high_f16, vget_low_f16, vld1_f16, vld1q_f16, vst1_f16, vst1q_f16,
	vst1_lane_f16, vst1q_lane_f16, vld2_f16, vld2q_f16, vld2_lane_f16,
	vld2q_lane_f16, vld2_dup_f16, vst2_f16, vst2q_f16, vst2_lane_f16,
	vst2q_lane_f16, vld3_f16, vld3q_f16, vld3_lane_f16, vld3q_lane_f16,
	vld3_dup_f16, vst3_f16, vst3q_f16, vst3_lane_f16, vst3q_lane_f16,
	vld4_f16, vld4q_f16, vld4_lane_f16, vld4q_lane_f16, vld4_dup_f16,
	vst4_f16, vst4q_f16, vst4_lane_f16, vst4q_lane_f16): New.

[-- Attachment #2: 04_arm_vldst_etc.patch --]
[-- Type: text/x-patch; name=04_arm_vldst_etc.patch, Size: 48211 bytes --]

diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c
index 9855b86202e80816bada565786f35dd21fe68c91..4c3f0e888969f16ff6c84e2a1bf65321d73ec8b4 100644
--- a/gcc/config/arm/arm-builtins.c
+++ b/gcc/config/arm/arm-builtins.c
@@ -228,6 +228,12 @@ typedef struct {
 #define VAR10(T, N, A, B, C, D, E, F, G, H, I, J) \
   VAR9 (T, N, A, B, C, D, E, F, G, H, I) \
   VAR1 (T, N, J)
+#define VAR11(T, N, A, B, C, D, E, F, G, H, I, J, K) \
+  VAR10 (T, N, A, B, C, D, E, F, G, H, I, J) \
+  VAR1 (T, N, K)
+#define VAR12(T, N, A, B, C, D, E, F, G, H, I, J, K, L) \
+  VAR11 (T, N, A, B, C, D, E, F, G, H, I, J, K) \
+  VAR1 (T, N, L)
 
 /* The NEON builtin data can be found in arm_neon_builtins.def.
    The mode entries in the following table correspond to the "key" type of the
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index e308bd4b7a404d869a7d125e8c5ddc4fb16ec884..1892fac503582366105b7dae0ac5e4da16698648 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -162,6 +162,16 @@ typedef struct uint64x2x2_t
   uint64x2_t val[2];
 } uint64x2x2_t;
 
+typedef struct float16x4x2_t
+{
+  float16x4_t val[2];
+} float16x4x2_t;
+
+typedef struct float16x8x2_t
+{
+  float16x8_t val[2];
+} float16x8x2_t;
+
 typedef struct float32x2x2_t
 {
   float32x2_t val[2];
@@ -288,6 +298,16 @@ typedef struct uint64x2x3_t
   uint64x2_t val[3];
 } uint64x2x3_t;
 
+typedef struct float16x4x3_t
+{
+  float16x4_t val[3];
+} float16x4x3_t;
+
+typedef struct float16x8x3_t
+{
+  float16x8_t val[3];
+} float16x8x3_t;
+
 typedef struct float32x2x3_t
 {
   float32x2_t val[3];
@@ -414,6 +434,16 @@ typedef struct uint64x2x4_t
   uint64x2_t val[4];
 } uint64x2x4_t;
 
+typedef struct float16x4x4_t
+{
+  float16x4_t val[4];
+} float16x4x4_t;
+
+typedef struct float16x8x4_t
+{
+  float16x8_t val[4];
+} float16x8x4_t;
+
 typedef struct float32x2x4_t
 {
   float32x2_t val[4];
@@ -6063,6 +6093,12 @@ vcombine_s64 (int64x1_t __a, int64x1_t __b)
   return (int64x2_t)__builtin_neon_vcombinedi (__a, __b);
 }
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vcombine_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_neon_vcombinev4hf (__a, __b);
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vcombine_f32 (float32x2_t __a, float32x2_t __b)
 {
@@ -6137,6 +6173,12 @@ vget_high_s64 (int64x2_t __a)
   return (int64x1_t)__builtin_neon_vget_highv2di (__a);
 }
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vget_high_f16 (float16x8_t __a)
+{
+  return __builtin_neon_vget_highv8hf (__a);
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vget_high_f32 (float32x4_t __a)
 {
@@ -6197,6 +6239,12 @@ vget_low_s32 (int32x4_t __a)
   return (int32x2_t)__builtin_neon_vget_lowv4si (__a);
 }
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vget_low_f16 (float16x8_t __a)
+{
+  return __builtin_neon_vget_lowv8hf (__a);
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vget_low_f32 (float32x4_t __a)
 {
@@ -8744,6 +8792,12 @@ vld1_s64 (const int64_t * __a)
   return (int64x1_t)__builtin_neon_vld1di ((const __builtin_neon_di *) __a);
 }
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vld1_f16 (const float16_t * __a)
+{
+  return __builtin_neon_vld1v4hf ((const __builtin_neon_hf *) __a);
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vld1_f32 (const float32_t * __a)
 {
@@ -8818,6 +8872,12 @@ vld1q_s64 (const int64_t * __a)
   return (int64x2_t)__builtin_neon_vld1v2di ((const __builtin_neon_di *) __a);
 }
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vld1q_f16 (const float16_t * __a)
+{
+  return __builtin_neon_vld1v8hf ((const __builtin_neon_hf *) __a);
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vld1q_f32 (const float32_t * __a)
 {
@@ -9213,6 +9273,12 @@ vst1_s64 (int64_t * __a, int64x1_t __b)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_f16 (float16_t * __a, float16x4_t __b)
+{
+  __builtin_neon_vst1v4hf ((__builtin_neon_hf *) __a, __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst1_f32 (float32_t * __a, float32x2_t __b)
 {
   __builtin_neon_vst1v2sf ((__builtin_neon_sf *) __a, __b);
@@ -9287,6 +9353,12 @@ vst1q_s64 (int64_t * __a, int64x2_t __b)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_f16 (float16_t * __a, float16x8_t __b)
+{
+  __builtin_neon_vst1v8hf ((__builtin_neon_hf *) __a, __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst1q_f32 (float32_t * __a, float32x4_t __b)
 {
   __builtin_neon_vst1v4sf ((__builtin_neon_sf *) __a, __b);
@@ -9347,6 +9419,12 @@ vst1_lane_s32 (int32_t * __a, int32x2_t __b, const int __c)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_lane_f16 (float16_t * __a, float16x4_t __b, const int __c)
+{
+  __builtin_neon_vst1_lanev4hf ((__builtin_neon_hf *) __a, __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst1_lane_f32 (float32_t * __a, float32x2_t __b, const int __c)
 {
   __builtin_neon_vst1_lanev2sf ((__builtin_neon_sf *) __a, __b, __c);
@@ -9421,6 +9499,12 @@ vst1q_lane_s32 (int32_t * __a, int32x4_t __b, const int __c)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_lane_f16 (float16_t * __a, float16x8_t __b, const int __c)
+{
+  __builtin_neon_vst1_lanev8hf ((__builtin_neon_hf *) __a, __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst1q_lane_f32 (float32_t * __a, float32x4_t __b, const int __c)
 {
   __builtin_neon_vst1_lanev4sf ((__builtin_neon_sf *) __a, __b, __c);
@@ -9500,6 +9584,14 @@ vld2_s32 (const int32_t * __a)
   return __rv.__i;
 }
 
+__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
+vld2_f16 (const float16_t * __a)
+{
+  union { float16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2v4hf ((const __builtin_neon_hf *) __a);
+  return __rv.__i;
+}
+
 __extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
 vld2_f32 (const float32_t * __a)
 {
@@ -9598,6 +9690,14 @@ vld2q_s32 (const int32_t * __a)
   return __rv.__i;
 }
 
+__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__))
+vld2q_f16 (const float16_t * __a)
+{
+  union { float16x8x2_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld2v8hf ((const __builtin_neon_hf *) __a);
+  return __rv.__i;
+}
+
 __extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
 vld2q_f32 (const float32_t * __a)
 {
@@ -9673,6 +9773,16 @@ vld2_lane_s32 (const int32_t * __a, int32x2x2_t __b, const int __c)
   return __rv.__i;
 }
 
+__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
+vld2_lane_f16 (const float16_t * __a, float16x4x2_t __b, const int __c)
+{
+  union { float16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  union { float16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_lanev4hf ((const __builtin_neon_hf *) __a,
+					   __bu.__o, __c);
+  return __rv.__i;
+}
+
 __extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
 vld2_lane_f32 (const float32_t * __a, float32x2x2_t __b, const int __c)
 {
@@ -9745,6 +9855,16 @@ vld2q_lane_s32 (const int32_t * __a, int32x4x2_t __b, const int __c)
   return __rv.__i;
 }
 
+__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__))
+vld2q_lane_f16 (const float16_t * __a, float16x8x2_t __b, const int __c)
+{
+  union { float16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  union { float16x8x2_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_lanev8hf ((const __builtin_neon_hf *) __a,
+					   __bu.__o, __c);
+  return __rv.__i;
+}
+
 __extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
 vld2q_lane_f32 (const float32_t * __a, float32x4x2_t __b, const int __c)
 {
@@ -9805,6 +9925,13 @@ vld2_dup_s32 (const int32_t * __a)
   return __rv.__i;
 }
 
+__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
+vld2_dup_f16 (const float16_t * __a)
+{
+  union { float16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_dupv4hf ((const __builtin_neon_hf *) __a);
+  return __rv.__i;
+}
 __extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
 vld2_dup_f32 (const float32_t * __a)
 {
@@ -9901,6 +10028,13 @@ vst2_s32 (int32_t * __a, int32x2x2_t __b)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_f16 (float16_t * __a, float16x4x2_t __b)
+{
+  union { float16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  __builtin_neon_vst2v4hf ((__builtin_neon_hf *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst2_f32 (float32_t * __a, float32x2x2_t __b)
 {
   union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
@@ -9987,6 +10121,13 @@ vst2q_s32 (int32_t * __a, int32x4x2_t __b)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_f16 (float16_t * __a, float16x8x2_t __b)
+{
+  union { float16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst2v8hf ((__builtin_neon_hf *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst2q_f32 (float32_t * __a, float32x4x2_t __b)
 {
   union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
@@ -10050,6 +10191,13 @@ vst2_lane_s32 (int32_t * __a, int32x2x2_t __b, const int __c)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_lane_f16 (float16_t * __a, float16x4x2_t __b, const int __c)
+{
+  union { float16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  __builtin_neon_vst2_lanev4hf ((__builtin_neon_hf *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst2_lane_f32 (float32_t * __a, float32x2x2_t __b, const int __c)
 {
   union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
@@ -10106,6 +10254,13 @@ vst2q_lane_s32 (int32_t * __a, int32x4x2_t __b, const int __c)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_lane_f16 (float16_t * __a, float16x8x2_t __b, const int __c)
+{
+  union { float16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst2_lanev8hf ((__builtin_neon_hf *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst2q_lane_f32 (float32_t * __a, float32x4x2_t __b, const int __c)
 {
   union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
@@ -10157,6 +10312,14 @@ vld3_s32 (const int32_t * __a)
   return __rv.__i;
 }
 
+__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__))
+vld3_f16 (const float16_t * __a)
+{
+  union { float16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3v4hf ((const __builtin_neon_hf *) __a);
+  return __rv.__i;
+}
+
 __extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
 vld3_f32 (const float32_t * __a)
 {
@@ -10255,6 +10418,14 @@ vld3q_s32 (const int32_t * __a)
   return __rv.__i;
 }
 
+__extension__ static __inline float16x8x3_t __attribute__ ((__always_inline__))
+vld3q_f16 (const float16_t * __a)
+{
+  union { float16x8x3_t __i; __builtin_neon_ci __o; } __rv;
+  __rv.__o = __builtin_neon_vld3v8hf ((const __builtin_neon_hf *) __a);
+  return __rv.__i;
+}
+
 __extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
 vld3q_f32 (const float32_t * __a)
 {
@@ -10330,6 +10501,16 @@ vld3_lane_s32 (const int32_t * __a, int32x2x3_t __b, const int __c)
   return __rv.__i;
 }
 
+__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__))
+vld3_lane_f16 (const float16_t * __a, float16x4x3_t __b, const int __c)
+{
+  union { float16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  union { float16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_lanev4hf ((const __builtin_neon_hf *) __a,
+					   __bu.__o, __c);
+  return __rv.__i;
+}
+
 __extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
 vld3_lane_f32 (const float32_t * __a, float32x2x3_t __b, const int __c)
 {
@@ -10402,6 +10583,16 @@ vld3q_lane_s32 (const int32_t * __a, int32x4x3_t __b, const int __c)
   return __rv.__i;
 }
 
+__extension__ static __inline float16x8x3_t __attribute__ ((__always_inline__))
+vld3q_lane_f16 (const float16_t * __a, float16x8x3_t __b, const int __c)
+{
+  union { float16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  union { float16x8x3_t __i; __builtin_neon_ci __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_lanev8hf ((const __builtin_neon_hf *) __a,
+					   __bu.__o, __c);
+  return __rv.__i;
+}
+
 __extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
 vld3q_lane_f32 (const float32_t * __a, float32x4x3_t __b, const int __c)
 {
@@ -10462,6 +10653,14 @@ vld3_dup_s32 (const int32_t * __a)
   return __rv.__i;
 }
 
+__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__))
+vld3_dup_f16 (const float16_t * __a)
+{
+  union { float16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_dupv4hf ((const __builtin_neon_hf *) __a);
+  return __rv.__i;
+}
+
 __extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
 vld3_dup_f32 (const float32_t * __a)
 {
@@ -10558,6 +10757,13 @@ vst3_s32 (int32_t * __a, int32x2x3_t __b)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_f16 (float16_t * __a, float16x4x3_t __b)
+{
+  union { float16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  __builtin_neon_vst3v4hf ((__builtin_neon_hf *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst3_f32 (float32_t * __a, float32x2x3_t __b)
 {
   union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
@@ -10644,6 +10850,13 @@ vst3q_s32 (int32_t * __a, int32x4x3_t __b)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_f16 (float16_t * __a, float16x8x3_t __b)
+{
+  union { float16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  __builtin_neon_vst3v8hf ((__builtin_neon_hf *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst3q_f32 (float32_t * __a, float32x4x3_t __b)
 {
   union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
@@ -10707,6 +10920,13 @@ vst3_lane_s32 (int32_t * __a, int32x2x3_t __b, const int __c)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_lane_f16 (float16_t * __a, float16x4x3_t __b, const int __c)
+{
+  union { float16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  __builtin_neon_vst3_lanev4hf ((__builtin_neon_hf *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst3_lane_f32 (float32_t * __a, float32x2x3_t __b, const int __c)
 {
   union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
@@ -10763,6 +10983,13 @@ vst3q_lane_s32 (int32_t * __a, int32x4x3_t __b, const int __c)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_lane_f16 (float16_t * __a, float16x8x3_t __b, const int __c)
+{
+  union { float16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  __builtin_neon_vst3_lanev8hf ((__builtin_neon_hf *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst3q_lane_f32 (float32_t * __a, float32x4x3_t __b, const int __c)
 {
   union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
@@ -10814,6 +11041,14 @@ vld4_s32 (const int32_t * __a)
   return __rv.__i;
 }
 
+__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__))
+vld4_f16 (const float16_t * __a)
+{
+  union { float16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4v4hf ((const __builtin_neon_hf *) __a);
+  return __rv.__i;
+}
+
 __extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
 vld4_f32 (const float32_t * __a)
 {
@@ -10912,6 +11147,14 @@ vld4q_s32 (const int32_t * __a)
   return __rv.__i;
 }
 
+__extension__ static __inline float16x8x4_t __attribute__ ((__always_inline__))
+vld4q_f16 (const float16_t * __a)
+{
+  union { float16x8x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4v8hf ((const __builtin_neon_hf *) __a);
+  return __rv.__i;
+}
+
 __extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
 vld4q_f32 (const float32_t * __a)
 {
@@ -10987,6 +11230,16 @@ vld4_lane_s32 (const int32_t * __a, int32x2x4_t __b, const int __c)
   return __rv.__i;
 }
 
+__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__))
+vld4_lane_f16 (const float16_t * __a, float16x4x4_t __b, const int __c)
+{
+  union { float16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  union { float16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_lanev4hf ((const __builtin_neon_hf *) __a,
+					   __bu.__o, __c);
+  return __rv.__i;
+}
+
 __extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
 vld4_lane_f32 (const float32_t * __a, float32x2x4_t __b, const int __c)
 {
@@ -11059,6 +11312,16 @@ vld4q_lane_s32 (const int32_t * __a, int32x4x4_t __b, const int __c)
   return __rv.__i;
 }
 
+__extension__ static __inline float16x8x4_t __attribute__ ((__always_inline__))
+vld4q_lane_f16 (const float16_t * __a, float16x8x4_t __b, const int __c)
+{
+  union { float16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  union { float16x8x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_lanev8hf ((const __builtin_neon_hf *) __a,
+					   __bu.__o, __c);
+  return __rv.__i;
+}
+
 __extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
 vld4q_lane_f32 (const float32_t * __a, float32x4x4_t __b, const int __c)
 {
@@ -11119,6 +11382,14 @@ vld4_dup_s32 (const int32_t * __a)
   return __rv.__i;
 }
 
+__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__))
+vld4_dup_f16 (const float16_t * __a)
+{
+  union { float16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_dupv4hf ((const __builtin_neon_hf *) __a);
+  return __rv.__i;
+}
+
 __extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
 vld4_dup_f32 (const float32_t * __a)
 {
@@ -11215,6 +11486,13 @@ vst4_s32 (int32_t * __a, int32x2x4_t __b)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_f16 (float16_t * __a, float16x4x4_t __b)
+{
+  union { float16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst4v4hf ((__builtin_neon_hf *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst4_f32 (float32_t * __a, float32x2x4_t __b)
 {
   union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
@@ -11301,6 +11579,13 @@ vst4q_s32 (int32_t * __a, int32x4x4_t __b)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_f16 (float16_t * __a, float16x8x4_t __b)
+{
+  union { float16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  __builtin_neon_vst4v8hf ((__builtin_neon_hf *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst4q_f32 (float32_t * __a, float32x4x4_t __b)
 {
   union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
@@ -11364,6 +11649,13 @@ vst4_lane_s32 (int32_t * __a, int32x2x4_t __b, const int __c)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_lane_f16 (float16_t * __a, float16x4x4_t __b, const int __c)
+{
+  union { float16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst4_lanev4hf ((__builtin_neon_hf *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst4_lane_f32 (float32_t * __a, float32x2x4_t __b, const int __c)
 {
   union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
@@ -11420,6 +11712,13 @@ vst4q_lane_s32 (int32_t * __a, int32x4x4_t __b, const int __c)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_lane_f16 (float16_t * __a, float16x8x4_t __b, const int __c)
+{
+  union { float16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  __builtin_neon_vst4_lanev8hf ((__builtin_neon_hf *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst4q_lane_f32 (float32_t * __a, float32x4x4_t __b, const int __c)
 {
   union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def
index f150b98b8096e94c6b39bbe477e5052b15f0313f..0b719df760747af7642bd14ab14a9b2144d43359 100644
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -164,9 +164,9 @@ VAR10 (UNOP, vdup_n,
 	 v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
 VAR10 (GETLANE, vdup_lane,
 	 v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
-VAR5 (COMBINE, vcombine, v8qi, v4hi, v2si, v2sf, di)
-VAR5 (UNOP, vget_high, v16qi, v8hi, v4si, v4sf, v2di)
-VAR5 (UNOP, vget_low, v16qi, v8hi, v4si, v4sf, v2di)
+VAR6 (COMBINE, vcombine, v8qi, v4hi, v4hf, v2si, v2sf, di)
+VAR6 (UNOP, vget_high, v16qi, v8hi, v8hf, v4si, v4sf, v2di)
+VAR6 (UNOP, vget_low, v16qi, v8hi, v8hf, v4si, v4sf, v2di)
 VAR3 (UNOP, vmovn, v8hi, v4si, v2di)
 VAR3 (UNOP, vqmovns, v8hi, v4si, v2di)
 VAR3 (UNOP, vqmovnu, v8hi, v4si, v2di)
@@ -242,40 +242,40 @@ VAR6 (UNOP, vreinterpretv4si, v16qi, v8hi, v4si, v4sf, v2di, ti)
 VAR6 (UNOP, vreinterpretv4sf, v16qi, v8hi, v4si, v4sf, v2di, ti)
 VAR6 (UNOP, vreinterpretv2di, v16qi, v8hi, v4si, v4sf, v2di, ti)
 VAR6 (UNOP, vreinterpretti, v16qi, v8hi, v4si, v4sf, v2di, ti)
-VAR10 (LOAD1, vld1,
-        v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
+VAR12 (LOAD1, vld1,
+        v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v2di)
 VAR10 (LOAD1LANE, vld1_lane,
 	v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
 VAR10 (LOAD1, vld1_dup,
 	v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
-VAR10 (STORE1, vst1,
-	v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
-VAR10 (STORE1LANE, vst1_lane,
-	v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
-VAR9 (LOAD1, vld2,
-	v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf)
-VAR7 (LOAD1LANE, vld2_lane,
-	v8qi, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
-VAR5 (LOAD1, vld2_dup, v8qi, v4hi, v2si, v2sf, di)
-VAR9 (STORE1, vst2,
-	v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf)
-VAR7 (STORE1LANE, vst2_lane,
-	v8qi, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
-VAR9 (LOAD1, vld3,
-	v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf)
-VAR7 (LOAD1LANE, vld3_lane,
-	v8qi, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
-VAR5 (LOAD1, vld3_dup, v8qi, v4hi, v2si, v2sf, di)
-VAR9 (STORE1, vst3,
-	v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf)
-VAR7 (STORE1LANE, vst3_lane,
-	v8qi, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
-VAR9 (LOAD1, vld4,
-	v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf)
-VAR7 (LOAD1LANE, vld4_lane,
-	v8qi, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
-VAR5 (LOAD1, vld4_dup, v8qi, v4hi, v2si, v2sf, di)
-VAR9 (STORE1, vst4,
-	v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf)
-VAR7 (STORE1LANE, vst4_lane,
-	v8qi, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
+VAR12 (STORE1, vst1,
+	v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v2di)
+VAR12 (STORE1LANE, vst1_lane,
+	v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v2di)
+VAR11 (LOAD1, vld2,
+	v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf)
+VAR9 (LOAD1LANE, vld2_lane,
+	v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
+VAR6 (LOAD1, vld2_dup, v8qi, v4hi, v4hf, v2si, v2sf, di)
+VAR11 (STORE1, vst2,
+	v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf)
+VAR9 (STORE1LANE, vst2_lane,
+	v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
+VAR11 (LOAD1, vld3,
+	v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf)
+VAR9 (LOAD1LANE, vld3_lane,
+	v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
+VAR6 (LOAD1, vld3_dup, v8qi, v4hi, v4hf, v2si, v2sf, di)
+VAR11 (STORE1, vst3,
+	v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf)
+VAR9 (STORE1LANE, vst3_lane,
+	v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
+VAR11 (LOAD1, vld4,
+	v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf)
+VAR9 (LOAD1LANE, vld4_lane,
+	v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
+VAR6 (LOAD1, vld4_dup, v8qi, v4hi, v4hf, v2si, v2sf, di)
+VAR11 (STORE1, vst4,
+	v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf)
+VAR9 (STORE1LANE, vst4_lane,
+	v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index f7f8ab7f0d64489bef7dea94762437649327b8ef..b9e043bf2e3ff7b28e466be5ffbfd7ba9539dbdb 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -65,20 +65,32 @@
 ;; Integer modes supported by Neon and IWMMXT, except V2DI
 (define_mode_iterator VINTW [V2SI V4HI V8QI V4SI V8HI V16QI])
 
-;; Double-width vector modes.
+;; Double-width vector modes, on which we support arithmetic (no HF!)
 (define_mode_iterator VD [V8QI V4HI V2SI V2SF])
 
+;; Double-width vector modes plus 64-bit elements for vreinterpret + vcreate.
+(define_mode_iterator VD_RE [V8QI V4HI V2SI V2SF DI])
+
 ;; Double-width vector modes plus 64-bit elements.
-(define_mode_iterator VDX [V8QI V4HI V2SI V2SF DI])
+(define_mode_iterator VDX [V8QI V4HI V4HF V2SI V2SF DI])
+
+;; Double-width vector modes, with V4HF - for vldN_lane and vstN_lane.
+(define_mode_iterator VD_LANE [V8QI V4HI V4HF V2SI V2SF])
 
 ;; Double-width vector modes without floating-point elements.
 (define_mode_iterator VDI [V8QI V4HI V2SI])
 
-;; Quad-width vector modes.
+;; Quad-width vector modes supporting arithmetic (no HF!).
 (define_mode_iterator VQ [V16QI V8HI V4SI V4SF])
 
+;; Quad-width vector modes, including V8HF.
+(define_mode_iterator VQ2 [V16QI V8HI V8HF V4SI V4SF])
+
+;; Quad-width vector modes with 16- or 32-bit elements
+(define_mode_iterator VQ_HS [V8HI V8HF V4SI V4SF])
+
 ;; Quad-width vector modes plus 64-bit elements.
-(define_mode_iterator VQX [V16QI V8HI V4SI V4SF V2DI])
+(define_mode_iterator VQX [V16QI V8HI V8HF V4SI V4SF V2DI])
 
 ;; Quad-width vector modes without floating-point elements.
 (define_mode_iterator VQI [V16QI V8HI V4SI])
@@ -111,7 +123,8 @@
 (define_mode_iterator VDQI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI])
 
 ;; Vector modes, including 64-bit integer elements.
-(define_mode_iterator VDQX [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF DI V2DI])
+(define_mode_iterator VDQX [V8QI V16QI V4HI V8HI V2SI V4SI
+			    V4HF V8HF V2SF V4SF DI V2DI])
 
 ;; Vector modes including 64-bit integer elements, but no floats.
 (define_mode_iterator VDQIX [V8QI V16QI V4HI V8HI V2SI V4SI DI V2DI])
@@ -348,7 +361,8 @@
 
 ;; Define element mode for each vector mode.
 (define_mode_attr V_elem [(V8QI "QI") (V16QI "QI")
-              (V4HI "HI") (V8HI "HI")
+			  (V4HI "HI") (V8HI "HI")
+			  (V4HF "HF") (V8HF "HF")
                           (V2SI "SI") (V4SI "SI")
                           (V2SF "SF") (V4SF "SF")
                           (DI "DI")   (V2DI "DI")])
@@ -365,6 +379,7 @@
 ;; size for structure lane/dup loads and stores.
 (define_mode_attr V_two_elem [(V8QI "HI")   (V16QI "HI")
                               (V4HI "SI")   (V8HI "SI")
+                              (V4HF "SF")   (V8HF "SF")
                               (V2SI "V2SI") (V4SI "V2SI")
                               (V2SF "V2SF") (V4SF "V2SF")
                               (DI "V2DI")   (V2DI "V2DI")])
@@ -372,6 +387,7 @@
 ;; Similar, for three elements.
 (define_mode_attr V_three_elem [(V8QI "BLK") (V16QI "BLK")
                                 (V4HI "BLK") (V8HI "BLK")
+                                (V4HF "BLK") (V8HF "BLK")
                                 (V2SI "BLK") (V4SI "BLK")
                                 (V2SF "BLK") (V4SF "BLK")
                                 (DI "EI")    (V2DI "EI")])
@@ -379,6 +395,7 @@
 ;; Similar, for four elements.
 (define_mode_attr V_four_elem [(V8QI "SI")   (V16QI "SI")
                                (V4HI "V4HI") (V8HI "V4HI")
+                               (V4HF "V4HF") (V8HF "V4HF")
                                (V2SI "V4SI") (V4SI "V4SI")
                                (V2SF "V4SF") (V4SF "V4SF")
                                (DI "OI")     (V2DI "OI")])
@@ -403,7 +420,8 @@
 
 ;; Modes with half the number of equal-sized elements.
 (define_mode_attr V_HALF [(V16QI "V8QI") (V8HI "V4HI")
-              (V4SI  "V2SI") (V4SF "V2SF") (V2DF "DF")
+			  (V8HF "V4HF") (V4SI  "V2SI")
+			  (V4SF "V2SF") (V2DF "DF")
                           (V2DI "DI")])
 
 ;; Same, but lower-case.
@@ -413,8 +431,9 @@
 
 ;; Modes with twice the number of equal-sized elements.
 (define_mode_attr V_DOUBLE [(V8QI "V16QI") (V4HI "V8HI")
-                (V2SI "V4SI") (V2SF "V4SF") (DF "V2DF")
-                            (DI "V2DI")])
+			    (V2SI "V4SI") (V4HF "V8HF")
+			    (V2SF "V4SF") (DF "V2DF")
+			    (DI "V2DI")])
 
 ;; Same, but lower-case.
 (define_mode_attr V_double [(V8QI "v16qi") (V4HI "v8hi")
@@ -436,8 +455,9 @@
 
 ;; Mode of result of comparison operations (and bit-select operand 1).
 (define_mode_attr V_cmp_result [(V8QI "V8QI") (V16QI "V16QI")
-                    (V4HI "V4HI") (V8HI  "V8HI")
+				(V4HI "V4HI") (V8HI  "V8HI")
                                 (V2SI "V2SI") (V4SI  "V4SI")
+				(V4HF "V4HI") (V8HF  "V8HI")
                                 (V2SF "V2SI") (V4SF  "V4SI")
                                 (DI   "DI")   (V2DI  "V2DI")])
 
@@ -474,12 +494,14 @@
 (define_mode_attr V_uf_sclr [(V8QI "u8")  (V16QI "u8")
                  (V4HI "u16") (V8HI "u16")
                              (V2SI "32") (V4SI "32")
+                             (V4HF "u16") (V8HF "u16")
                              (V2SF "32") (V4SF "32")])
 
 (define_mode_attr V_sz_elem [(V8QI "8")  (V16QI "8")
                  (V4HI "16") (V8HI  "16")
                              (V2SI "32") (V4SI  "32")
                              (DI   "64") (V2DI  "64")
+			     (V4HF "16") (V8HF "16")
                  (V2SF "32") (V4SF  "32")])
 
 (define_mode_attr V_elem_ch [(V8QI "b")  (V16QI "b")
@@ -546,6 +568,7 @@
                             (DI   "true") (V2DI  "false")])
 
 (define_mode_attr V_mode_nunits [(V8QI "8") (V16QI "16")
+				 (V4HF "4") (V8HF "8")
                                  (V4HI "4") (V8HI "8")
                                  (V2SI "2") (V4SI "4")
                                  (V2SF "2") (V4SF "4")
@@ -589,6 +612,7 @@
 (define_mode_attr q [(V8QI "") (V16QI "_q")
                      (V4HI "") (V8HI "_q")
                      (V2SI "") (V4SI "_q")
+		     (V4HF "") (V8HF "_q")
                      (V2SF "") (V4SF "_q")
                      (DI "")   (V2DI "_q")
                      (DF "")   (V2DF "_q")])
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index bf620c4173a771f927b74fed216c0bd0b700370d..d585c9187ad2902e10687e52e235046dd944de3d 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -320,11 +320,11 @@
   [(set_attr "type" "neon_load1_all_lanes<q>,neon_from_gp<q>")])
 
 (define_insn "vec_set<mode>_internal"
-  [(set (match_operand:VQ 0 "s_register_operand" "=w,w")
-        (vec_merge:VQ
-          (vec_duplicate:VQ
+  [(set (match_operand:VQ2 0 "s_register_operand" "=w,w")
+        (vec_merge:VQ2
+          (vec_duplicate:VQ2
             (match_operand:<V_elem> 1 "nonimmediate_operand" "Um,r"))
-          (match_operand:VQ 3 "s_register_operand" "0,0")
+          (match_operand:VQ2 3 "s_register_operand" "0,0")
           (match_operand:SI 2 "immediate_operand" "i,i")))]
   "TARGET_NEON"
 {
@@ -407,7 +407,7 @@
 (define_insn "vec_extract<mode>"
   [(set (match_operand:<V_elem> 0 "nonimmediate_operand" "=Um,r")
 	(vec_select:<V_elem>
-          (match_operand:VQ 1 "s_register_operand" "w,w")
+          (match_operand:VQ2 1 "s_register_operand" "w,w")
           (parallel [(match_operand:SI 2 "immediate_operand" "i,i")])))]
   "TARGET_NEON"
 {
@@ -2666,7 +2666,7 @@
   [(set (match_operand:SI 0 "s_register_operand" "=r")
 	(sign_extend:SI
 	  (vec_select:<V_elem>
-	    (match_operand:VQ 1 "s_register_operand" "w")
+	    (match_operand:VQ2 1 "s_register_operand" "w")
 	    (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))]
   "TARGET_NEON"
 {
@@ -2693,7 +2693,7 @@
   [(set (match_operand:SI 0 "s_register_operand" "=r")
 	(zero_extend:SI
 	  (vec_select:<V_elem>
-	    (match_operand:VQ 1 "s_register_operand" "w")
+	    (match_operand:VQ2 1 "s_register_operand" "w")
 	    (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))]
   "TARGET_NEON"
 {
@@ -2833,7 +2833,7 @@
 })
 
 (define_expand "neon_vcreate<mode>"
-  [(match_operand:VDX 0 "s_register_operand" "")
+  [(match_operand:VD_RE 0 "s_register_operand" "")
    (match_operand:DI 1 "general_operand" "")]
   "TARGET_NEON"
 {
@@ -4184,7 +4184,7 @@
 
 (define_expand "neon_vreinterpretv8qi<mode>"
   [(match_operand:V8QI 0 "s_register_operand" "")
-   (match_operand:VDX 1 "s_register_operand" "")]
+   (match_operand:VD_RE 1 "s_register_operand" "")]
   "TARGET_NEON"
 {
   neon_reinterpret (operands[0], operands[1]);
@@ -4193,7 +4193,7 @@
 
 (define_expand "neon_vreinterpretv4hi<mode>"
   [(match_operand:V4HI 0 "s_register_operand" "")
-   (match_operand:VDX 1 "s_register_operand" "")]
+   (match_operand:VD_RE 1 "s_register_operand" "")]
   "TARGET_NEON"
 {
   neon_reinterpret (operands[0], operands[1]);
@@ -4202,7 +4202,7 @@
 
 (define_expand "neon_vreinterpretv2si<mode>"
   [(match_operand:V2SI 0 "s_register_operand" "")
-   (match_operand:VDX 1 "s_register_operand" "")]
+   (match_operand:VD_RE 1 "s_register_operand" "")]
   "TARGET_NEON"
 {
   neon_reinterpret (operands[0], operands[1]);
@@ -4211,7 +4211,7 @@
 
 (define_expand "neon_vreinterpretv2sf<mode>"
   [(match_operand:V2SF 0 "s_register_operand" "")
-   (match_operand:VDX 1 "s_register_operand" "")]
+   (match_operand:VD_RE 1 "s_register_operand" "")]
   "TARGET_NEON"
 {
   neon_reinterpret (operands[0], operands[1]);
@@ -4220,7 +4220,7 @@
 
 (define_expand "neon_vreinterpretdi<mode>"
   [(match_operand:DI 0 "s_register_operand" "")
-   (match_operand:VDX 1 "s_register_operand" "")]
+   (match_operand:VD_RE 1 "s_register_operand" "")]
   "TARGET_NEON"
 {
   neon_reinterpret (operands[0], operands[1]);
@@ -4479,14 +4479,14 @@
 (define_expand "vec_load_lanesoi<mode>"
   [(set (match_operand:OI 0 "s_register_operand")
         (unspec:OI [(match_operand:OI 1 "neon_struct_operand")
-                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
 		   UNSPEC_VLD2))]
   "TARGET_NEON")
 
 (define_insn "neon_vld2<mode>"
   [(set (match_operand:OI 0 "s_register_operand" "=w")
         (unspec:OI [(match_operand:OI 1 "neon_struct_operand" "Um")
-                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VLD2))]
   "TARGET_NEON"
   "vld2.<V_sz_elem>\t%h0, %A1"
@@ -4497,7 +4497,7 @@
         (unspec:TI [(match_operand:<V_two_elem> 1 "neon_struct_operand" "Um")
                     (match_operand:TI 2 "s_register_operand" "0")
                     (match_operand:SI 3 "immediate_operand" "i")
-                    (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VD_LANE [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VLD2_LANE))]
   "TARGET_NEON"
 {
@@ -4522,7 +4522,7 @@
         (unspec:OI [(match_operand:<V_two_elem> 1 "neon_struct_operand" "Um")
                     (match_operand:OI 2 "s_register_operand" "0")
                     (match_operand:SI 3 "immediate_operand" "i")
-                    (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ_HS [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VLD2_LANE))]
   "TARGET_NEON"
 {
@@ -4593,14 +4593,14 @@
 (define_expand "vec_store_lanesoi<mode>"
   [(set (match_operand:OI 0 "neon_struct_operand")
 	(unspec:OI [(match_operand:OI 1 "s_register_operand")
-                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VST2))]
   "TARGET_NEON")
 
 (define_insn "neon_vst2<mode>"
   [(set (match_operand:OI 0 "neon_struct_operand" "=Um")
 	(unspec:OI [(match_operand:OI 1 "s_register_operand" "w")
-		    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+		    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
 		   UNSPEC_VST2))]
   "TARGET_NEON"
   "vst2.<V_sz_elem>\t%h1, %A0"
@@ -4612,7 +4612,7 @@
 	(unspec:<V_two_elem>
 	  [(match_operand:TI 1 "s_register_operand" "w")
 	   (match_operand:SI 2 "immediate_operand" "i")
-	   (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+	   (unspec:VD_LANE [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
 	  UNSPEC_VST2_LANE))]
   "TARGET_NEON"
 {
@@ -4637,7 +4637,7 @@
         (unspec:<V_two_elem>
            [(match_operand:OI 1 "s_register_operand" "w")
             (match_operand:SI 2 "immediate_operand" "i")
-            (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+            (unspec:VQ_HS [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
            UNSPEC_VST2_LANE))]
   "TARGET_NEON"
 {
@@ -4690,7 +4690,7 @@
 (define_expand "vec_load_lanesci<mode>"
   [(match_operand:CI 0 "s_register_operand")
    (match_operand:CI 1 "neon_struct_operand")
-   (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
   "TARGET_NEON"
 {
   emit_insn (gen_neon_vld3<mode> (operands[0], operands[1]));
@@ -4700,7 +4700,7 @@
 (define_expand "neon_vld3<mode>"
   [(match_operand:CI 0 "s_register_operand")
    (match_operand:CI 1 "neon_struct_operand")
-   (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
   "TARGET_NEON"
 {
   rtx mem;
@@ -4715,7 +4715,7 @@
 (define_insn "neon_vld3qa<mode>"
   [(set (match_operand:CI 0 "s_register_operand" "=w")
         (unspec:CI [(match_operand:EI 1 "neon_struct_operand" "Um")
-                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VLD3A))]
   "TARGET_NEON"
 {
@@ -4735,7 +4735,7 @@
   [(set (match_operand:CI 0 "s_register_operand" "=w")
         (unspec:CI [(match_operand:EI 1 "neon_struct_operand" "Um")
                     (match_operand:CI 2 "s_register_operand" "0")
-                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VLD3B))]
   "TARGET_NEON"
 {
@@ -4756,7 +4756,7 @@
         (unspec:EI [(match_operand:<V_three_elem> 1 "neon_struct_operand" "Um")
                     (match_operand:EI 2 "s_register_operand" "0")
                     (match_operand:SI 3 "immediate_operand" "i")
-                    (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VD_LANE [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VLD3_LANE))]
   "TARGET_NEON"
 {
@@ -4783,7 +4783,7 @@
         (unspec:CI [(match_operand:<V_three_elem> 1 "neon_struct_operand" "Um")
                     (match_operand:CI 2 "s_register_operand" "0")
                     (match_operand:SI 3 "immediate_operand" "i")
-                    (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ_HS [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VLD3_LANE))]
   "TARGET_NEON"
 {
@@ -4863,7 +4863,7 @@
 (define_expand "vec_store_lanesci<mode>"
   [(match_operand:CI 0 "neon_struct_operand")
    (match_operand:CI 1 "s_register_operand")
-   (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
   "TARGET_NEON"
 {
   emit_insn (gen_neon_vst3<mode> (operands[0], operands[1]));
@@ -4873,7 +4873,7 @@
 (define_expand "neon_vst3<mode>"
   [(match_operand:CI 0 "neon_struct_operand")
    (match_operand:CI 1 "s_register_operand")
-   (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
   "TARGET_NEON"
 {
   rtx mem;
@@ -4888,7 +4888,7 @@
 (define_insn "neon_vst3qa<mode>"
   [(set (match_operand:EI 0 "neon_struct_operand" "=Um")
         (unspec:EI [(match_operand:CI 1 "s_register_operand" "w")
-                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VST3A))]
   "TARGET_NEON"
 {
@@ -4907,7 +4907,7 @@
 (define_insn "neon_vst3qb<mode>"
   [(set (match_operand:EI 0 "neon_struct_operand" "=Um")
         (unspec:EI [(match_operand:CI 1 "s_register_operand" "w")
-                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VST3B))]
   "TARGET_NEON"
 {
@@ -4928,7 +4928,7 @@
         (unspec:<V_three_elem>
            [(match_operand:EI 1 "s_register_operand" "w")
             (match_operand:SI 2 "immediate_operand" "i")
-            (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+            (unspec:VD_LANE [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
            UNSPEC_VST3_LANE))]
   "TARGET_NEON"
 {
@@ -4955,7 +4955,7 @@
         (unspec:<V_three_elem>
            [(match_operand:CI 1 "s_register_operand" "w")
             (match_operand:SI 2 "immediate_operand" "i")
-            (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+            (unspec:VQ_HS [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
            UNSPEC_VST3_LANE))]
   "TARGET_NEON"
 {
@@ -5010,7 +5010,7 @@
 (define_expand "vec_load_lanesxi<mode>"
   [(match_operand:XI 0 "s_register_operand")
    (match_operand:XI 1 "neon_struct_operand")
-   (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
   "TARGET_NEON"
 {
   emit_insn (gen_neon_vld4<mode> (operands[0], operands[1]));
@@ -5020,7 +5020,7 @@
 (define_expand "neon_vld4<mode>"
   [(match_operand:XI 0 "s_register_operand")
    (match_operand:XI 1 "neon_struct_operand")
-   (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
   "TARGET_NEON"
 {
   rtx mem;
@@ -5035,7 +5035,7 @@
 (define_insn "neon_vld4qa<mode>"
   [(set (match_operand:XI 0 "s_register_operand" "=w")
         (unspec:XI [(match_operand:OI 1 "neon_struct_operand" "Um")
-                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VLD4A))]
   "TARGET_NEON"
 {
@@ -5056,7 +5056,7 @@
   [(set (match_operand:XI 0 "s_register_operand" "=w")
         (unspec:XI [(match_operand:OI 1 "neon_struct_operand" "Um")
                     (match_operand:XI 2 "s_register_operand" "0")
-                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VLD4B))]
   "TARGET_NEON"
 {
@@ -5078,7 +5078,7 @@
         (unspec:OI [(match_operand:<V_four_elem> 1 "neon_struct_operand" "Um")
                     (match_operand:OI 2 "s_register_operand" "0")
                     (match_operand:SI 3 "immediate_operand" "i")
-                    (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VD_LANE [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VLD4_LANE))]
   "TARGET_NEON"
 {
@@ -5106,7 +5106,7 @@
         (unspec:XI [(match_operand:<V_four_elem> 1 "neon_struct_operand" "Um")
                     (match_operand:XI 2 "s_register_operand" "0")
                     (match_operand:SI 3 "immediate_operand" "i")
-                    (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ_HS [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VLD4_LANE))]
   "TARGET_NEON"
 {
@@ -5191,7 +5191,7 @@
 (define_expand "vec_store_lanesxi<mode>"
   [(match_operand:XI 0 "neon_struct_operand")
    (match_operand:XI 1 "s_register_operand")
-   (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
   "TARGET_NEON"
 {
   emit_insn (gen_neon_vst4<mode> (operands[0], operands[1]));
@@ -5201,7 +5201,7 @@
 (define_expand "neon_vst4<mode>"
   [(match_operand:XI 0 "neon_struct_operand")
    (match_operand:XI 1 "s_register_operand")
-   (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
   "TARGET_NEON"
 {
   rtx mem;
@@ -5216,7 +5216,7 @@
 (define_insn "neon_vst4qa<mode>"
   [(set (match_operand:OI 0 "neon_struct_operand" "=Um")
         (unspec:OI [(match_operand:XI 1 "s_register_operand" "w")
-                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VST4A))]
   "TARGET_NEON"
 {
@@ -5236,7 +5236,7 @@
 (define_insn "neon_vst4qb<mode>"
   [(set (match_operand:OI 0 "neon_struct_operand" "=Um")
         (unspec:OI [(match_operand:XI 1 "s_register_operand" "w")
-                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VST4B))]
   "TARGET_NEON"
 {
@@ -5258,7 +5258,7 @@
         (unspec:<V_four_elem>
            [(match_operand:OI 1 "s_register_operand" "w")
             (match_operand:SI 2 "immediate_operand" "i")
-            (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+            (unspec:VD_LANE [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
            UNSPEC_VST4_LANE))]
   "TARGET_NEON"
 {
@@ -5286,7 +5286,7 @@
         (unspec:<V_four_elem>
            [(match_operand:XI 1 "s_register_operand" "w")
             (match_operand:SI 2 "immediate_operand" "i")
-            (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+            (unspec:VQ_HS [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
            UNSPEC_VST4_LANE))]
   "TARGET_NEON"
 {

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 5/14][AArch64] Add basic fp16 support
  2015-04-22 16:54 [PATCH 0/14][ARM/AArch64] __FP16 support, vectors, intrinsics, testsuite Alan Lawrence
                   ` (3 preceding siblings ...)
  2015-04-22 16:59 ` [PATCH 4/14][ARM] Remaining float16 intrinsics: vld..., vst..., vget_low|high, vcombine Alan Lawrence
@ 2015-04-22 17:06 ` Alan Lawrence
  2015-04-29 21:22   ` Joseph Myers
  2015-04-22 17:11 ` [PATCH 6/14][AArch64] Add support for float16x{4,8}_t vectors/builtins Alan Lawrence
                   ` (9 subsequent siblings)
  14 siblings, 1 reply; 28+ messages in thread
From: Alan Lawrence @ 2015-04-22 17:06 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2755 bytes --]

[Resending with correct in-reply-to header]

This adds basic support for moving __fp16 values around, passing and returning 
them, and operating on them by promoting to 32-bit floats. It also adds a few 
scalar testcases.
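
As a rough illustration (my own sketch, not part of the patch) of what
promotion means for user code - arithmetic on __fp16 operands is carried out
in single precision:

__fp16 a, b;

float
scale (void)
{
  /* Both operands are promoted to float and the multiply happens in
     SFmode; the result is only truncated back to half precision if it
     is assigned to an __fp16 object.  */
  return a * b;
}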

Note I've not added an fmov (immediate) variant, because there is no 'fmov 
h<n>, ...' instruction: the only way to load a 16-bit immediate is to 
reinterpret the bit pattern from some other type. Vector MOVs are turned off 
for the same reason. If an immediate variant proves practical, it can follow 
in a separate patch.
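
For illustration only, a hedged sketch of the workaround this implies -
materialising an __fp16 constant by building its bit pattern in an integer
and reinterpreting it (the function name and constant here are mine, not
from the patch):

#include <string.h>

__fp16
fp16_one (void)
{
  /* IEEE half-precision 1.0 has the bit pattern 0x3c00; with no
     'fmov h<n>, #imm' available, move the bits through another type.  */
  unsigned short bits = 0x3c00;
  __fp16 result;
  memcpy (&result, &bits, sizeof result);
  return result;
}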


My reading of ACLE suggests the type name to use is __fp16, rather than 
__builtin_aarch64_simd_hf. I can use the latter if that's preferable?


int<->f16 conversions are a little odd; the generated assembly is:

int_to_f16:
	scvtf	d0, w0
	fcvt	h0, d0
	ret

int_from_f16:
	fcvt	s0, h0
	fcvtzs	w0, s0
	ret

The spec is silent on the absence or existence of intermediate rounding steps; 
however, I don't think this matters: float32_t offers so many more bits than 
__fp16 that any integer which fits into the range of an __fp16 (i.e. is not 
infinite) can be expressed exactly as a float32_t without any loss of 
precision (the largest finite __fp16 value is 65504, comfortably within 
float32_t's 24-bit significand). So I think the above are OK. (If they can be 
optimized, that can follow in a later patch.)
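
For reference, a minimal C sketch of functions that should produce the two
sequences above (the choice of intermediate mode is the compiler's):

__fp16
int_to_f16 (int x)
{
  return x;  /* scvtf then fcvt: int -> wider float -> __fp16.  */
}

int
int_from_f16 (__fp16 x)
{
  return x;  /* fcvt then fcvtzs: __fp16 -> float32 -> int.  */
}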


Note that unlike ARM, where we support both IEEE and Alternative formats (and, 
somewhat awkwardly, format-agnostic code too), here we settle on IEEE format 
always. Technically we should output an EABI attribute saying which format we 
are using, but aarch64 asm does not yet support the .eabi_attribute directive, 
so it seems reasonable to leave this out while there is only one possible 
format.


Bootstrapped + check-gcc on aarch64-none-linux-gnu.

gcc/ChangeLog:

	* config/aarch64/aarch64-builtins.c (aarch64_fp16_type_node): New.
	(aarch64_init_builtins): Make aarch64_fp16_type_node, use for __fp16.

	* config/aarch64/aarch64-modes.def: Add HFmode.

	* config/aarch64/aarch64.h (TARGET_CPU_CPP_BUILTINS): Define
	__ARM_FP16_FORMAT_IEEE and __ARM_FP16_ARGS. Set bit 1 of __ARM_FP.

	* config/aarch64/aarch64.c (aarch64_init_libfuncs,
	aarch64_promoted_type): New.

	(aarch64_float_const_representable_p): Disable HFmode.
	(aarch64_mangle_type): Mangle half-precision floats to "Dh".
	(TARGET_PROMOTED_TYPE): Define to aarch64_promoted_type.
	(TARGET_INIT_LIBFUNCS): Define to aarch64_init_libfuncs.

	* config/aarch64/aarch64.md (mov<mode>): Include HFmode using GPF_F16.
	(movhf_aarch64, extendhfsf2, extendhfdf2, truncsfhf2, truncdfhf2): New.

	* config/aarch64/iterators.md (GPF_F16): New.


gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/f16_convs_1.c: New test.
	* gcc.target/aarch64/f16_convs_2.c: New test.
	* gcc.target/aarch64/f16_movs_1.c: New test.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 05_aarch64_basic_fp16.patch --]
[-- Type: text/x-patch; name=05_aarch64_basic_fp16.patch, Size: 12772 bytes --]

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 87f1ac2ec1e3c774782c567b20c673802ae90d99..5a7b112bd1fe77826bfb84383c86dceb6b1521e3 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -453,6 +453,9 @@ static struct aarch64_simd_type_info aarch64_simd_types [] = {
 };
 #undef ENTRY
 
+/* This type is not SIMD-specific; it is the user-visible __fp16.  */
+static tree aarch64_fp16_type_node = NULL_TREE;
+
 static tree aarch64_simd_intOI_type_node = NULL_TREE;
 static tree aarch64_simd_intEI_type_node = NULL_TREE;
 static tree aarch64_simd_intCI_type_node = NULL_TREE;
@@ -862,6 +865,12 @@ aarch64_init_builtins (void)
     = add_builtin_function ("__builtin_aarch64_set_fpsr", ftype_set_fpr,
 			    AARCH64_BUILTIN_SET_FPSR, BUILT_IN_MD, NULL, NULL_TREE);
 
+  aarch64_fp16_type_node = make_node (REAL_TYPE);
+  TYPE_PRECISION (aarch64_fp16_type_node) = 16;
+  layout_type (aarch64_fp16_type_node);
+
+  (*lang_hooks.types.register_builtin_type) (aarch64_fp16_type_node, "__fp16");
+
   if (TARGET_SIMD)
     aarch64_init_simd_builtins ();
   if (TARGET_CRC32)
diff --git a/gcc/config/aarch64/aarch64-modes.def b/gcc/config/aarch64/aarch64-modes.def
index b17b90d90601ae0a631a78560da743720c4638ce..c30059b632fa8cb7fd9071917d3f581f0966a86d 100644
--- a/gcc/config/aarch64/aarch64-modes.def
+++ b/gcc/config/aarch64/aarch64-modes.def
@@ -36,6 +36,10 @@ CC_MODE (CC_DLTU);
 CC_MODE (CC_DGEU);
 CC_MODE (CC_DGTU);
 
+/* Half-precision floating point for arm_neon.h float16_t.  */
+FLOAT_MODE (HF, 2, 0);
+ADJUST_FLOAT_FORMAT (HF, &ieee_half_format);
+
 /* Vector modes.  */
 VECTOR_MODES (INT, 8);        /*       V8QI V4HI V2SI.  */
 VECTOR_MODES (INT, 16);       /* V16QI V8HI V4SI V2DI.  */
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index bf59e40a64459f6daddef47a5f5214adfd92d9b6..67c37ebc0e06d22e524322e5a82b6bcde550bd93 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -57,7 +57,9 @@
       if (TARGET_FLOAT)                                         \
         {                                                       \
           builtin_define ("__ARM_FEATURE_FMA");                 \
-          builtin_define_with_int_value ("__ARM_FP", 0x0C);     \
+	  builtin_define_with_int_value ("__ARM_FP", 0x0E);     \
+	  builtin_define ("__ARM_FP16_FORMAT_IEEE");		\
+	  builtin_define ("__ARM_FP16_ARGS");			\
         }                                                       \
       if (TARGET_SIMD)                                          \
         {                                                       \
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index b923fdb08a8e653570e51cf516dc551955961704..44956cf0276ed7b1369d1816f472bad61ac421b1 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -8058,6 +8058,10 @@ aarch64_mangle_type (const_tree type)
   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
     return "St9__va_list";
 
+  /* Half-precision float.  */
+  if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
+    return "Dh";
+
   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
      builtin types.  */
   if (TYPE_NAME (type) != NULL)
@@ -9251,6 +9255,33 @@ aarch64_start_file (void)
   default_file_start();
 }
 
+static void
+aarch64_init_libfuncs (void)
+{
+   /* Half-precision float operations.  The compiler handles all operations
+     with NULL libfuncs by converting to SFmode.  */
+
+  /* Conversions.  */
+  set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
+  set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
+
+  /* Arithmetic.  */
+  set_optab_libfunc (add_optab, HFmode, NULL);
+  set_optab_libfunc (sdiv_optab, HFmode, NULL);
+  set_optab_libfunc (smul_optab, HFmode, NULL);
+  set_optab_libfunc (neg_optab, HFmode, NULL);
+  set_optab_libfunc (sub_optab, HFmode, NULL);
+
+  /* Comparisons.  */
+  set_optab_libfunc (eq_optab, HFmode, NULL);
+  set_optab_libfunc (ne_optab, HFmode, NULL);
+  set_optab_libfunc (lt_optab, HFmode, NULL);
+  set_optab_libfunc (le_optab, HFmode, NULL);
+  set_optab_libfunc (ge_optab, HFmode, NULL);
+  set_optab_libfunc (gt_optab, HFmode, NULL);
+  set_optab_libfunc (unord_optab, HFmode, NULL);
+}
+
 /* Target hook for c_mode_for_suffix.  */
 static machine_mode
 aarch64_c_mode_for_suffix (char suffix)
@@ -9289,7 +9320,8 @@ aarch64_float_const_representable_p (rtx x)
   if (!CONST_DOUBLE_P (x))
     return false;
 
-  if (GET_MODE (x) == VOIDmode)
+  /* We don't support HFmode constants yet.  */
+  if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
     return false;
 
   REAL_VALUE_FROM_CONST_DOUBLE (r, x);
@@ -11230,6 +11262,14 @@ aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
   return true;
 }
 
+/* Implement TARGET_PROMOTED_TYPE to promote float16 to 32 bits.  */
+static tree
+aarch64_promoted_type (const_tree t)
+{
+  if (SCALAR_FLOAT_TYPE_P (t) && TYPE_PRECISION (t) == 16)
+    return float_type_node;
+  return NULL_TREE;
+}
 #undef TARGET_ADDRESS_COST
 #define TARGET_ADDRESS_COST aarch64_address_cost
 
@@ -11384,6 +11424,9 @@ aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
 #undef TARGET_SCHED_REASSOCIATION_WIDTH
 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
 
+#undef TARGET_PROMOTED_TYPE
+#define TARGET_PROMOTED_TYPE aarch64_promoted_type
+
 #undef TARGET_SECONDARY_RELOAD
 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
 
@@ -11476,6 +11519,8 @@ aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
   aarch64_vectorize_vec_perm_const_ok
 
+#undef TARGET_INIT_LIBFUNCS
+#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
 
 #undef TARGET_FIXED_CONDITION_CODE_REGS
 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 1f4169ee76e7f3f321e5ed7a4d0f08b54ee3bf17..0851f6949adb69bf23221e811230fae08749887c 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -976,8 +976,8 @@
 })
 
 (define_expand "mov<mode>"
-  [(set (match_operand:GPF 0 "nonimmediate_operand" "")
-	(match_operand:GPF 1 "general_operand" ""))]
+  [(set (match_operand:GPF_F16 0 "nonimmediate_operand" "")
+	(match_operand:GPF_F16 1 "general_operand" ""))]
   ""
   "
     if (!TARGET_FLOAT)
@@ -991,6 +991,26 @@
   "
 )
 
+(define_insn "*movhf_aarch64"
+  [(set (match_operand:HF 0 "nonimmediate_operand" "=w, ?r,w,w,m,r,m ,r")
+	(match_operand:HF 1 "general_operand"      "?rY, w,w,m,w,m,rY,r"))]
+  "TARGET_FLOAT && (register_operand (operands[0], HFmode)
+    || register_operand (operands[1], HFmode))"
+  "@
+   mov\\t%0.h[0], %w1
+   umov\\t%w0, %1.h[0]
+   mov\\t%0.h[0], %1.h[0]
+   ldr\\t%h0, %1
+   str\\t%h1, %0
+   ldrh\\t%w0, %1
+   strh\\t%w1, %0
+   mov\\t%w0, %w1"
+  [(set_attr "type" "neon_from_gp,neon_to_gp,fmov,\
+                     f_loads,f_stores,load1,store1,mov_reg")
+   (set_attr "simd" "yes,yes,yes,*,*,*,*,*")
+   (set_attr "fp"   "*,*,*,yes,yes,*,*,*")]
+)
+
 (define_insn "*movsf_aarch64"
   [(set (match_operand:SF 0 "nonimmediate_operand" "=w, ?r,w,w  ,w,m,r,m ,r")
 	(match_operand:SF 1 "general_operand"      "?rY, w,w,Ufc,m,w,m,rY,r"))]
@@ -3882,6 +3902,22 @@
   [(set_attr "type" "f_cvt")]
 )
 
+(define_insn "extendhfsf2"
+  [(set (match_operand:SF 0 "register_operand" "=w")
+        (float_extend:SF (match_operand:HF 1 "register_operand" "w")))]
+  "TARGET_FLOAT"
+  "fcvt\\t%s0, %h1"
+  [(set_attr "type" "f_cvt")]
+)
+
+(define_insn "extendhfdf2"
+  [(set (match_operand:DF 0 "register_operand" "=w")
+        (float_extend:DF (match_operand:HF 1 "register_operand" "w")))]
+  "TARGET_FLOAT"
+  "fcvt\\t%d0, %h1"
+  [(set_attr "type" "f_cvt")]
+)
+
 (define_insn "truncdfsf2"
   [(set (match_operand:SF 0 "register_operand" "=w")
         (float_truncate:SF (match_operand:DF 1 "register_operand" "w")))]
@@ -3890,6 +3926,22 @@
   [(set_attr "type" "f_cvt")]
 )
 
+(define_insn "truncsfhf2"
+  [(set (match_operand:HF 0 "register_operand" "=w")
+        (float_truncate:HF (match_operand:SF 1 "register_operand" "w")))]
+  "TARGET_FLOAT"
+  "fcvt\\t%h0, %s1"
+  [(set_attr "type" "f_cvt")]
+)
+
+(define_insn "truncdfhf2"
+  [(set (match_operand:HF 0 "register_operand" "=w")
+        (float_truncate:HF (match_operand:DF 1 "register_operand" "w")))]
+  "TARGET_FLOAT"
+  "fcvt\\t%h0, %d1"
+  [(set_attr "type" "f_cvt")]
+)
+
 (define_insn "fix_trunc<GPF:mode><GPI:mode>2"
   [(set (match_operand:GPI 0 "register_operand" "=r")
         (fix:GPI (match_operand:GPF 1 "register_operand" "w")))]
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 65a2849155c9b331dc6179853501f0a6207d1773..a8b782b887ee914bd2399807d2ccfdf4a8e6433b 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -38,6 +38,9 @@
 ;; Iterator for General Purpose Floating-point registers (32- and 64-bit modes)
 (define_mode_iterator GPF [SF DF])
 
+;; Iterator for General Purpose Float regs, inc float16_t.
+(define_mode_iterator GPF_F16 [HF SF DF])
+
 ;; Integer vector modes.
 (define_mode_iterator VDQ_I [V8QI V16QI V4HI V8HI V2SI V4SI V2DI])
 
diff --git a/gcc/testsuite/gcc.target/aarch64/f16_convs_1.c b/gcc/testsuite/gcc.target/aarch64/f16_convs_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..d4e7c02db5e99068c9ddba1b5635e8904bf19e2d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/f16_convs_1.c
@@ -0,0 +1,39 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fno-inline" } */
+
+#include <arm_neon.h>
+
+extern void abort (void);
+
+#define EPSILON 0.0001
+
+__fp16
+convert_f32_to_f16 (float in)
+{
+  return in;
+}
+
+float
+convert_f16_to_f32 (__fp16 in)
+{
+  return in;
+}
+
+int
+main (int argc, char **argv)
+{
+  __fp16 in1 = convert_f32_to_f16 (3.14159f);
+  __fp16 in2 = convert_f32_to_f16 (2.718f);
+
+  /* Do the addition on __fp16's (implicitly converts both operands to
+     float32, adds, converts back to f16, then we convert back to f32).  */
+  float32_t result1 = convert_f16_to_f32 (in1 + in2);
+
+  /* Do the addition on float32's (we convert both operands to f32, and add,
+     as above, but skip the final conversion f32 -> f16 -> f32).  */
+  float32_t result2 = convert_f16_to_f32 (in1) + convert_f16_to_f32 (in2);
+
+  if (__builtin_fabs (result2 - result1) > EPSILON)
+    abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/f16_convs_2.c b/gcc/testsuite/gcc.target/aarch64/f16_convs_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..3421daef13ff1992775e8c4299623be9779ac45c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/f16_convs_2.c
@@ -0,0 +1,39 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fno-inline" } */
+
+#include <arm_neon.h>
+
+extern void abort (void);
+
+#define EPSILON 0.0001
+
+__fp16
+convert_to_f16 (int in)
+{
+  return in;
+}
+
+int
+convert_from_f16 (__fp16 in)
+{
+  return in;
+}
+
+int
+main (int argc, char **argv)
+{
+  __fp16 in1 = convert_to_f16 (3);
+  __fp16 in2 = convert_to_f16 (2);
+
+  /* Do the addition on __fp16's (implicitly converts both operands to
+     float32, adds, converts back to f16, then we convert to int).  */
+  int result1 = convert_from_f16 (in1 + in2);
+
+  /* Do the addition on int's (we convert both operands directly to int, add,
+     and we're done).  */
+  int result2 = convert_from_f16 (in1) + convert_from_f16 (in2);
+
+  if (__builtin_abs (result2 - result1) > EPSILON)
+    abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/f16_movs_1.c b/gcc/testsuite/gcc.target/aarch64/f16_movs_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..6cb80866790c5c40a59d22f2bbbfce41ae5f07d0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/f16_movs_1.c
@@ -0,0 +1,26 @@
+/* { dg-do run } */
+/* { dg-options "-fno-inline -O2" } */
+
+#include <arm_neon.h>
+
+__fp16
+func2 (__fp16 a, __fp16 b)
+{
+  return b;
+}
+
+int
+main (int argc, char **argv)
+{
+  __fp16 array[16];
+  int i;
+
+  for (i = 0; i < sizeof (array) / sizeof (array[0]); i++)
+    array[i] = i;
+
+  array[0] = func2 (array[1], array[2]);
+
+  __builtin_printf ("%f\n", array[0]); /* { dg-output "2.0" } */
+
+  return 0;
+}

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 6/14][AArch64] Add support for float16x{4,8}_t vectors/builtins
  2015-04-22 16:54 [PATCH 0/14][ARM/AArch64] __FP16 support, vectors, intrinsics, testsuite Alan Lawrence
                   ` (4 preceding siblings ...)
  2015-04-22 17:06 ` [PATCH 5/14][AArch64] Add basic fp16 support Alan Lawrence
@ 2015-04-22 17:11 ` Alan Lawrence
  2015-04-22 17:14 ` [PATCH 7/14][AArch64] vld{2,3,4}{,_lane,_dup},vcombine,vcreate Alan Lawrence
                   ` (8 subsequent siblings)
  14 siblings, 0 replies; 28+ messages in thread
From: Alan Lawrence @ 2015-04-22 17:11 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1925 bytes --]

This adds some basic intrinsics - vget_lane, vset_lane, vld1_lane, vld1, vst1 - 
for float16 types, and the necessary support in the builtin generator, basic 
patterns for moving values around, etc. Other intrinsics will follow in later 
patches.
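
As a usage sketch of the new intrinsics (my own example, not part of the
patch):

#include <arm_neon.h>

/* Load four half-precision values, overwrite lane 2, store back.  */
void
set_lane2 (float16_t *p, float16_t v)
{
  float16x4_t vec = vld1_f16 (p);
  vec = vset_lane_f16 (v, vec, 2);
  vst1_f16 (p, vec);
}

/* Read lane 1; the lane index must be a compile-time constant.  */
float16_t
get_lane1 (const float16_t *p)
{
  return vget_lane_f16 (vld1_f16 (p), 1);
}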

I've extended the existing testcases in aarch64/, but advsimd-intrinsics tests 
follow later in the series.

gcc/ChangeLog:

	* config/aarch64/aarch64.c (aarch64_vector_mode_supported_p): Support
	V4HFmode and V8HFmode.
	(aarch64_split_simd_move): Add case for V8HFmode.
	* config/aarch64/aarch64-builtins.c (v4hf_UP, v8hf_UP): Define.
	(aarch64_simd_builtin_std_type): Handle HFmode.
	(aarch64_init_simd_builtin_types): Include Float16x4_t and Float16x8_t.

	* config/aarch64/aarch64-simd.md (mov<mode>, aarch64_get_lane<mode>,
	aarch64_ld1<VALL:mode>, aarch64_st1<VALL:mode>): Use VALL_F16 iterator.
	(aarch64_be_ld1<mode>, aarch64_be_st1<mode>): Use VALLDI_F16 iterator.

	* config/aarch64/aarch64-simd-builtin-types.def: Add Float16x4_t,
	Float16x8_t.

	* config/aarch64/aarch64-simd-builtins.def (ld1, st1): Use VALL_F16.
	* config/aarch64/arm_neon.h (float16x4_t, float16x8_t, float16_t):
	New typedefs.
	(vget_lane_f16, vgetq_lane_f16, vset_lane_f16, vsetq_lane_f16,
	vld1_f16, vld1q_f16, vst1_f16, vst1q_f16, vst1_lane_f16,
	vst1q_lane_f16): New.
	* config/aarch64/iterators.md (VD, VQ, VQ_NO2E): Add vectors of HFmode.
	(VALLDI_F16, VALL_F16): New.
	(Vmtype, VEL, VCONQ, VHALF, VRL3, VRL4, V_TWO_ELEM, V_THREE_ELEM,
	V_FOUR_ELEM, q): Add cases for V4HF and V8HF.
	(VDBL, VRL2): Add V4HF case.

gcc/testsuite/ChangeLog:

	* g++.dg/abi/mangle-neon-aarch64.C: Add cases for float16x4_t and
	float16x8_t.
	* gcc.target/aarch64/vset_lane_1.c: Likewise.
	* gcc.target/aarch64/vld1-vst1_1.c: Likewise, also missing float32x4_t.
	* gcc.target/aarch64/vld1_lane.c: Remove unused constants; add cases
	for float16x4_t and float16x8_t.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 06_aarch64_f16_vectors_builtins.patch --]
[-- Type: text/x-patch; name=06_aarch64_f16_vectors_builtins.patch, Size: 26198 bytes --]

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index d554735ab480f9e9b1f49fd3510555197bb7b5f4..6544643a3cd1dd46b440eca0e1a05bad4c499262 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -63,6 +63,7 @@
 
 #define v8qi_UP  V8QImode
 #define v4hi_UP  V4HImode
+#define v4hf_UP  V4HFmode
 #define v2si_UP  V2SImode
 #define v2sf_UP  V2SFmode
 #define v1df_UP  V1DFmode
@@ -70,6 +71,7 @@
 #define df_UP    DFmode
 #define v16qi_UP V16QImode
 #define v8hi_UP  V8HImode
+#define v8hf_UP  V8HFmode
 #define v4si_UP  V4SImode
 #define v4sf_UP  V4SFmode
 #define v2di_UP  V2DImode
@@ -522,6 +524,8 @@ aarch64_simd_builtin_std_type (enum machine_mode mode,
       return aarch64_simd_intCI_type_node;
     case XImode:
       return aarch64_simd_intXI_type_node;
+    case HFmode:
+      return aarch64_fp16_type_node;
     case SFmode:
       return float_type_node;
     case DFmode:
@@ -606,6 +610,8 @@ aarch64_init_simd_builtin_types (void)
   aarch64_simd_types[Poly64x2_t].eltype = aarch64_simd_types[Poly64_t].itype;
 
   /* Continue with standard types.  */
+  aarch64_simd_types[Float16x4_t].eltype = aarch64_fp16_type_node;
+  aarch64_simd_types[Float16x8_t].eltype = aarch64_fp16_type_node;
   aarch64_simd_types[Float32x2_t].eltype = float_type_node;
   aarch64_simd_types[Float32x4_t].eltype = float_type_node;
   aarch64_simd_types[Float64x1_t].eltype = double_type_node;
diff --git a/gcc/config/aarch64/aarch64-simd-builtin-types.def b/gcc/config/aarch64/aarch64-simd-builtin-types.def
index b85a23109efae6301931f12c6b665015af570fb7..ef8f20574c52170facfe67fc9fa433dc64926bca 100644
--- a/gcc/config/aarch64/aarch64-simd-builtin-types.def
+++ b/gcc/config/aarch64/aarch64-simd-builtin-types.def
@@ -44,6 +44,8 @@
   ENTRY (Poly16x8_t, V8HI, poly, 12)
   ENTRY (Poly64x1_t, DI, poly, 12)
   ENTRY (Poly64x2_t, V2DI, poly, 12)
+  ENTRY (Float16x4_t, V4HF, none, 13)
+  ENTRY (Float16x8_t, V8HF, none, 13)
   ENTRY (Float32x2_t, V2SF, none, 13)
   ENTRY (Float32x4_t, V4SF, none, 13)
   ENTRY (Float64x1_t, V1DF, none, 13)
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index af39d9c2b42eea0bc45ea5bc3d4fc576849cfd65..07f8ba961c1546ccac7ecaa5756c631afeae4b3e 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -345,11 +345,11 @@
   VAR1 (UNOP, float_extend_lo_, 0, v2df)
   VAR1 (UNOP, float_truncate_lo_, 0, v2sf)
 
-  /* Implemented by aarch64_ld1<VALL:mode>.  */
-  BUILTIN_VALL (LOAD1, ld1, 0)
+  /* Implemented by aarch64_ld1<VALL_F16:mode>.  */
+  BUILTIN_VALL_F16 (LOAD1, ld1, 0)
 
-  /* Implemented by aarch64_st1<VALL:mode>.  */
-  BUILTIN_VALL (STORE1, st1, 0)
+  /* Implemented by aarch64_st1<VALL_F16:mode>.  */
+  BUILTIN_VALL_F16 (STORE1, st1, 0)
 
   /* Implemented by fma<mode>4.  */
   BUILTIN_VDQF (TERNOP, fma, 4)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 8602b9ca0db5f38df8eedcc2c5e7aacf430dffb4..0deb799cbce1284d57e39a0b3fb719ef12f54d39 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -19,8 +19,8 @@
 ;; <http://www.gnu.org/licenses/>.
 
 (define_expand "mov<mode>"
-  [(set (match_operand:VALL 0 "nonimmediate_operand" "")
-	(match_operand:VALL 1 "general_operand" ""))]
+  [(set (match_operand:VALL_F16 0 "nonimmediate_operand" "")
+	(match_operand:VALL_F16 1 "general_operand" ""))]
   "TARGET_SIMD"
   "
     if (GET_CODE (operands[0]) == MEM)
@@ -2397,7 +2397,7 @@
 (define_insn "aarch64_get_lane<mode>"
   [(set (match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand" "=r, w, Utv")
 	(vec_select:<VEL>
-	  (match_operand:VALL 1 "register_operand" "w, w, w")
+	  (match_operand:VALL_F16 1 "register_operand" "w, w, w")
 	  (parallel [(match_operand:SI 2 "immediate_operand" "i, i, i")])))]
   "TARGET_SIMD"
   {
@@ -4036,8 +4036,9 @@
 )
 
 (define_insn "aarch64_be_ld1<mode>"
-  [(set (match_operand:VALLDI 0	"register_operand" "=w")
-	(unspec:VALLDI [(match_operand:VALLDI 1 "aarch64_simd_struct_operand" "Utv")]
+  [(set (match_operand:VALLDI_F16 0	"register_operand" "=w")
+	(unspec:VALLDI_F16 [(match_operand:VALLDI_F16 1
+			     "aarch64_simd_struct_operand" "Utv")]
 	UNSPEC_LD1))]
   "TARGET_SIMD"
   "ld1\\t{%0<Vmtype>}, %1"
@@ -4045,8 +4046,8 @@
 )
 
 (define_insn "aarch64_be_st1<mode>"
-  [(set (match_operand:VALLDI 0 "aarch64_simd_struct_operand" "=Utv")
-	(unspec:VALLDI [(match_operand:VALLDI 1 "register_operand" "w")]
+  [(set (match_operand:VALLDI_F16 0 "aarch64_simd_struct_operand" "=Utv")
+	(unspec:VALLDI_F16 [(match_operand:VALLDI_F16 1 "register_operand" "w")]
 	UNSPEC_ST1))]
   "TARGET_SIMD"
   "st1\\t{%1<Vmtype>}, %0"
@@ -4303,16 +4304,16 @@
   DONE;
 })
 
-(define_expand "aarch64_ld1<VALL:mode>"
- [(match_operand:VALL 0 "register_operand")
+(define_expand "aarch64_ld1<VALL_F16:mode>"
+ [(match_operand:VALL_F16 0 "register_operand")
   (match_operand:DI 1 "register_operand")]
   "TARGET_SIMD"
 {
-  machine_mode mode = <VALL:MODE>mode;
+  machine_mode mode = <VALL_F16:MODE>mode;
   rtx mem = gen_rtx_MEM (mode, operands[1]);
 
   if (BYTES_BIG_ENDIAN)
-    emit_insn (gen_aarch64_be_ld1<VALL:mode> (operands[0], mem));
+    emit_insn (gen_aarch64_be_ld1<VALL_F16:mode> (operands[0], mem));
   else
     emit_move_insn (operands[0], mem);
   DONE;
@@ -4671,16 +4672,16 @@
   DONE;
 })
 
-(define_expand "aarch64_st1<VALL:mode>"
+(define_expand "aarch64_st1<VALL_F16:mode>"
  [(match_operand:DI 0 "register_operand")
-  (match_operand:VALL 1 "register_operand")]
+  (match_operand:VALL_F16 1 "register_operand")]
   "TARGET_SIMD"
 {
-  machine_mode mode = <VALL:MODE>mode;
+  machine_mode mode = <VALL_F16:MODE>mode;
   rtx mem = gen_rtx_MEM (mode, operands[0]);
 
   if (BYTES_BIG_ENDIAN)
-    emit_insn (gen_aarch64_be_st1<VALL:mode> (mem, operands[1]));
+    emit_insn (gen_aarch64_be_st1<VALL_F16:mode> (mem, operands[1]));
   else
     emit_move_insn (mem, operands[1]);
   DONE;
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 7e3865af8b0207595c93393c16da18e8f798cd1d..ae7d44b3503698f16c8ead6b21a28aa5fd4a4543 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1003,6 +1003,9 @@ aarch64_split_simd_move (rtx dst, rtx src)
 	case V2DImode:
 	  gen = gen_aarch64_split_simd_movv2di;
 	  break;
+	case V8HFmode:
+	  gen = gen_aarch64_split_simd_movv8hf;
+	  break;
 	case V4SFmode:
 	  gen = gen_aarch64_split_simd_movv4sf;
 	  break;
@@ -7792,6 +7795,7 @@ aarch64_vector_mode_supported_p (machine_mode mode)
 	  || mode == V2SImode  || mode == V4HImode
 	  || mode == V8QImode || mode == V2SFmode
 	  || mode == V4SFmode || mode == V2DFmode
+	  || mode == V4HFmode || mode == V8HFmode
 	  || mode == V1DFmode))
     return true;
 
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 7e53374693b024bcfd9a93f966db63f0d72d0bce..4e8f38458ba9d97c368f6a521644c5fdabbdb2fc 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -36,6 +36,7 @@ typedef __Int8x8_t int8x8_t;
 typedef __Int16x4_t int16x4_t;
 typedef __Int32x2_t int32x2_t;
 typedef __Int64x1_t int64x1_t;
+typedef __Float16x4_t float16x4_t;
 typedef __Float32x2_t float32x2_t;
 typedef __Poly8x8_t poly8x8_t;
 typedef __Poly16x4_t poly16x4_t;
@@ -48,6 +49,7 @@ typedef __Int8x16_t int8x16_t;
 typedef __Int16x8_t int16x8_t;
 typedef __Int32x4_t int32x4_t;
 typedef __Int64x2_t int64x2_t;
+typedef __Float16x8_t float16x8_t;
 typedef __Float32x4_t float32x4_t;
 typedef __Float64x2_t float64x2_t;
 typedef __Poly8x16_t poly8x16_t;
@@ -63,6 +65,7 @@ typedef __Poly16_t poly16_t;
 typedef __Poly64_t poly64_t;
 typedef __Poly128_t poly128_t;
 
+typedef __fp16 float16_t;
 typedef float float32_t;
 typedef double float64_t;
 
@@ -2447,6 +2450,12 @@ vcreate_p16 (uint64_t __a)
 
 /* vget_lane  */
 
+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
+vget_lane_f16 (float16x4_t __a, const int __b)
+{
+  return __aarch64_vget_lane_any (__a, __b);
+}
+
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vget_lane_f32 (float32x2_t __a, const int __b)
 {
@@ -2521,6 +2530,12 @@ vget_lane_u64 (uint64x1_t __a, const int __b)
 
 /* vgetq_lane  */
 
+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
+vgetq_lane_f16 (float16x8_t __a, const int __b)
+{
+  return __aarch64_vget_lane_any (__a, __b);
+}
+
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vgetq_lane_f32 (float32x4_t __a, const int __b)
 {
@@ -4181,6 +4196,12 @@ vreinterpretq_u32_p16 (poly16x8_t __a)
 
 /* vset_lane  */
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vset_lane_f16 (float16_t __elem, float16x4_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vset_lane_f32 (float32_t __elem, float32x2_t __vec, const int __index)
 {
@@ -4255,6 +4276,12 @@ vset_lane_u64 (uint64_t __elem, uint64x1_t __vec, const int __index)
 
 /* vsetq_lane  */
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vsetq_lane_f16 (float16_t __elem, float16x8_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vsetq_lane_f32 (float32_t __elem, float32x4_t __vec, const int __index)
 {
@@ -15372,6 +15399,12 @@ vfmsq_laneq_f64 (float64x2_t __a, float64x2_t __b,
 
 /* vld1 */
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vld1_f16 (const float16_t *__a)
+{
+  return __builtin_aarch64_ld1v4hf (__a);
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vld1_f32 (const float32_t *a)
 {
@@ -15451,6 +15484,12 @@ vld1_u64 (const uint64_t *a)
 
 /* vld1q */
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vld1q_f16 (const float16_t *__a)
+{
+  return __builtin_aarch64_ld1v8hf (__a);
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vld1q_f32 (const float32_t *a)
 {
@@ -15679,6 +15718,12 @@ vld1q_dup_u64 (const uint64_t* __a)
 
 /* vld1_lane  */
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vld1_lane_f16 (const float16_t *__src, float16x4_t __vec, const int __lane)
+{
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vld1_lane_f32 (const float32_t *__src, float32x2_t __vec, const int __lane)
 {
@@ -15753,6 +15798,12 @@ vld1_lane_u64 (const uint64_t *__src, uint64x1_t __vec, const int __lane)
 
 /* vld1q_lane  */
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vld1q_lane_f16 (const float16_t *__src, float16x8_t __vec, const int __lane)
+{
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vld1q_lane_f32 (const float32_t *__src, float32x4_t __vec, const int __lane)
 {
@@ -22416,6 +22467,12 @@ vsrid_n_u64 (uint64_t __a, uint64_t __b, const int __c)
 /* vst1 */
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_f16 (float16_t *__a, float16x4_t __b)
+{
+  __builtin_aarch64_st1v4hf (__a, __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst1_f32 (float32_t *a, float32x2_t b)
 {
   __builtin_aarch64_st1v2sf ((__builtin_aarch64_simd_sf *) a, b);
@@ -22495,6 +22552,12 @@ vst1_u64 (uint64_t *a, uint64x1_t b)
 /* vst1q */
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_f16 (float16_t *__a, float16x8_t __b)
+{
+  __builtin_aarch64_st1v8hf (__a, __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst1q_f32 (float32_t *a, float32x4_t b)
 {
   __builtin_aarch64_st1v4sf ((__builtin_aarch64_simd_sf *) a, b);
@@ -22575,6 +22638,12 @@ vst1q_u64 (uint64_t *a, uint64x2_t b)
 /* vst1_lane */
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_lane_f16 (float16_t *__a, float16x4_t __b, const int __lane)
+{
+  *__a = __aarch64_vget_lane_any (__b, __lane);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst1_lane_f32 (float32_t *__a, float32x2_t __b, const int __lane)
 {
   *__a = __aarch64_vget_lane_any (__b, __lane);
@@ -22649,6 +22718,12 @@ vst1_lane_u64 (uint64_t *__a, uint64x1_t __b, const int __lane)
 /* vst1q_lane */
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_lane_f16 (float16_t *__a, float16x8_t __b, const int __lane)
+{
+  *__a = __aarch64_vget_lane_any (__b, __lane);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst1q_lane_f32 (float32_t *__a, float32x4_t __b, const int __lane)
 {
   *__a = __aarch64_vget_lane_any (__b, __lane);
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 4056878735081317e6c29204bb491a576179afed..65f87c23430d6fe60addba5a8f7c831f897e9f17 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -58,7 +58,7 @@
 (define_mode_iterator VSDQ_I_DI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI DI])
 
 ;; Double vector modes.
-(define_mode_iterator VD [V8QI V4HI V2SI V2SF])
+(define_mode_iterator VD [V8QI V4HI V4HF V2SI V2SF])
 
 ;; vector, 64-bit container, all integer modes
 (define_mode_iterator VD_BHSI [V8QI V4HI V2SI])
@@ -67,10 +67,10 @@
 (define_mode_iterator VDQ_BHSI [V8QI V16QI V4HI V8HI V2SI V4SI])
 
 ;; Quad vector modes.
-(define_mode_iterator VQ [V16QI V8HI V4SI V2DI V4SF V2DF])
+(define_mode_iterator VQ [V16QI V8HI V4SI V2DI V8HF V4SF V2DF])
 
 ;; VQ without 2 element modes.
-(define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V4SF])
+(define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V8HF V4SF])
 
 ;; Quad vector with only 2 element modes.
 (define_mode_iterator VQ_2E [V2DI V2DF])
@@ -113,12 +113,20 @@
 ;; Vector Float modes with 2 elements.
 (define_mode_iterator V2F [V2SF V2DF])
 
-;; All modes.
+;; All vector modes on which we support any arithmetic operations.
 (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
 
-;; All vector modes and DI.
+;; All vector modes, including HF modes on which we cannot operate
+(define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
+				V4HF V8HF V2SF V4SF V2DF])
+
+;; All vector modes barring F16, plus DI.
 (define_mode_iterator VALLDI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF DI])
 
+;; All vector modes and DI.
+(define_mode_iterator VALLDI_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
+				  V4HF V8HF V2SF V4SF V2DF DI])
+
 ;; All vector modes and DI and DF.
 (define_mode_iterator VALLDIF [V8QI V16QI V4HI V8HI V2SI V4SI
 			       V2DI V2SF V4SF V2DF DI DF])
@@ -380,7 +388,8 @@
 (define_mode_attr Vmtype [(V8QI ".8b") (V16QI ".16b")
 			 (V4HI ".4h") (V8HI  ".8h")
 			 (V2SI ".2s") (V4SI  ".4s")
-			 (V2DI ".2d") (V2SF ".2s")
+			 (V2DI ".2d") (V4HF ".4h")
+			 (V8HF ".8h") (V2SF ".2s")
 			 (V4SF ".4s") (V2DF ".2d")
 			 (DI   "")    (SI   "")
 			 (HI   "")    (QI   "")
@@ -416,6 +425,7 @@
 			(V4HI "HI") (V8HI "HI")
                         (V2SI "SI") (V4SI "SI")
                         (DI "DI")   (V2DI "DI")
+                        (V4HF "HF") (V8HF "HF")
                         (V2SF "SF") (V4SF "SF")
                         (V2DF "DF") (DF "DF")
 			(SI   "SI") (HI   "HI")
@@ -434,6 +444,7 @@
 			 (V4HI "V8HI") (V8HI "V8HI")
 			 (V2SI "V4SI") (V4SI "V4SI")
 			 (DI   "V2DI") (V2DI "V2DI")
+			 (V4HF "V8HF") (V8HF "V8HF")
 			 (V2SF "V2SF") (V4SF "V4SF")
 			 (V2DF "V2DF") (SI   "V4SI")
 			 (HI   "V8HI") (QI   "V16QI")])
@@ -443,10 +454,12 @@
 			 (V4HI "V2HI")  (V8HI  "V4HI")
 			 (V2SI "SI")    (V4SI  "V2SI")
 			 (V2DI "DI")    (V2SF  "SF")
-			 (V4SF "V2SF")  (V2DF  "DF")])
+			 (V4SF "V2SF")  (V4HF "V2HF")
+			 (V8HF "V4HF")  (V2DF  "DF")])
 
 ;; Double modes of vector modes.
 (define_mode_attr VDBL [(V8QI "V16QI") (V4HI "V8HI")
+			(V4HF "V8HF")
 			(V2SI "V4SI")  (V2SF "V4SF")
 			(SI   "V2SI")  (DI   "V2DI")
 			(DF   "V2DF")])
@@ -557,6 +570,7 @@
 (define_mode_attr nregs [(OI "2") (CI "3") (XI "4")])
 
 (define_mode_attr VRL2 [(V8QI "V32QI") (V4HI "V16HI")
+			(V4HF "V16HF")
 			(V2SI "V8SI")  (V2SF "V8SF")
 			(DI   "V4DI")  (DF   "V4DF")
 			(V16QI "V32QI") (V8HI "V16HI")
@@ -564,16 +578,20 @@
 			(V2DI "V4DI")  (V2DF "V4DF")])
 
 (define_mode_attr VRL3 [(V8QI "V48QI") (V4HI "V24HI")
+			(V4HF "V24HF")
 			(V2SI "V12SI")  (V2SF "V12SF")
 			(DI   "V6DI")  (DF   "V6DF")
 			(V16QI "V48QI") (V8HI "V24HI")
+			(V8HF "V48HF")
 			(V4SI "V12SI")  (V4SF "V12SF")
 			(V2DI "V6DI")  (V2DF "V6DF")])
 
 (define_mode_attr VRL4 [(V8QI "V64QI") (V4HI "V32HI")
+			(V4HF "V32HF")
 			(V2SI "V16SI")  (V2SF "V16SF")
 			(DI   "V8DI")  (DF   "V8DF")
 			(V16QI "V64QI") (V8HI "V32HI")
+			(V8HF "V32HF")
 			(V4SI "V16SI")  (V4SF "V16SF")
 			(V2DI "V8DI")  (V2DF "V8DF")])
 
@@ -586,6 +604,7 @@
                               (V2SI "V2SI") (V4SI "V2SI")
                               (DI "V2DI")   (V2DI "V2DI")
                               (V2SF "V2SF") (V4SF "V2SF")
+                              (V4HF "SF") (V8HF "SF")
                               (DF "V2DI")   (V2DF "V2DI")])
 
 ;; Similar, for three elements.
@@ -594,6 +613,7 @@
                                 (V2SI "BLK") (V4SI "BLK")
                                 (DI "EI")    (V2DI "EI")
                                 (V2SF "BLK") (V4SF "BLK")
+                                (V4HF "BLK") (V8HF "BLK")
                                 (DF "EI")    (V2DF "EI")])
 
 ;; Similar, for four elements.
@@ -602,6 +622,7 @@
                                (V2SI "V4SI") (V4SI "V4SI")
                                (DI "OI")     (V2DI "OI")
                                (V2SF "V4SF") (V4SF "V4SF")
+                               (V4HF "V4HF") (V8HF "V4HF")
                                (DF "OI")     (V2DF "OI")])
 
 
@@ -660,6 +681,7 @@
 		     (V4HI "") (V8HI  "_q")
 		     (V2SI "") (V4SI  "_q")
 		     (DI   "") (V2DI  "_q")
+		     (V4HF "") (V8HF "_q")
 		     (V2SF "") (V4SF  "_q")
 			       (V2DF  "_q")
 		     (QI "") (HI "") (SI "") (DI "") (SF "") (DF "")])
diff --git a/gcc/testsuite/g++.dg/abi/mangle-neon-aarch64.C b/gcc/testsuite/g++.dg/abi/mangle-neon-aarch64.C
index 09a20dc985ef04314e3435b5eb899035429400c4..5740c0281b2fdf8bbc11d9428ca2f6ba8f1760a0 100644
--- a/gcc/testsuite/g++.dg/abi/mangle-neon-aarch64.C
+++ b/gcc/testsuite/g++.dg/abi/mangle-neon-aarch64.C
@@ -13,6 +13,7 @@ void f3 (uint8x8_t a) {}
 void f4 (uint16x4_t a) {}
 void f5 (uint32x2_t a) {}
 void f23 (uint64x1_t a) {}
+void f61 (float16x4_t a) {}
 void f6 (float32x2_t a) {}
 void f7 (poly8x8_t a) {}
 void f8 (poly16x4_t a) {}
@@ -25,6 +26,7 @@ void f13 (uint8x16_t a) {}
 void f14 (uint16x8_t a) {}
 void f15 (uint32x4_t a) {}
 void f16 (uint64x2_t a) {}
+void f171 (float16x8_t a) {}
 void f17 (float32x4_t a) {}
 void f18 (float64x2_t a) {}
 void f19 (poly8x16_t a) {}
@@ -42,6 +44,7 @@ void g1 (int8x16_t, int8x16_t) {}
 // { dg-final { scan-assembler "_Z2f412__Uint16x4_t:" } }
 // { dg-final { scan-assembler "_Z2f512__Uint32x2_t:" } }
 // { dg-final { scan-assembler "_Z3f2312__Uint64x1_t:" } }
+// { dg-final { scan-assembler "_Z3f6113__Float16x4_t:" } }
 // { dg-final { scan-assembler "_Z2f613__Float32x2_t:" } }
 // { dg-final { scan-assembler "_Z2f711__Poly8x8_t:" } }
 // { dg-final { scan-assembler "_Z2f812__Poly16x4_t:" } }
@@ -53,6 +56,7 @@ void g1 (int8x16_t, int8x16_t) {}
 // { dg-final { scan-assembler "_Z3f1412__Uint16x8_t:" } }
 // { dg-final { scan-assembler "_Z3f1512__Uint32x4_t:" } }
 // { dg-final { scan-assembler "_Z3f1612__Uint64x2_t:" } }
+// { dg-final { scan-assembler "_Z4f17113__Float16x8_t:" } }
 // { dg-final { scan-assembler "_Z3f1713__Float32x4_t:" } }
 // { dg-final { scan-assembler "_Z3f1813__Float64x2_t:" } }
 // { dg-final { scan-assembler "_Z3f1912__Poly8x16_t:" } }
diff --git a/gcc/testsuite/gcc.target/aarch64/vld1-vst1_1.c b/gcc/testsuite/gcc.target/aarch64/vld1-vst1_1.c
index 290444e357f933ad2fe8160936c0d3aea3452fac..fa9ef0f4e438b45cd7f316b18ba462573fe0e035 100644
--- a/gcc/testsuite/gcc.target/aarch64/vld1-vst1_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/vld1-vst1_1.c
@@ -31,6 +31,7 @@ THING (int8x8_t, 8, int8_t, _s8)	\
 THING (uint8x8_t, 8, uint8_t, _u8)	\
 THING (int16x4_t, 4, int16_t, _s16)	\
 THING (uint16x4_t, 4, uint16_t, _u16)	\
+THING (float16x4_t, 4, float16_t, _f16)	\
 THING (int32x2_t, 2, int32_t, _s32)	\
 THING (uint32x2_t, 2, uint32_t, _u32)	\
 THING (float32x2_t, 2, float32_t, _f32) \
@@ -38,8 +39,10 @@ THING (int8x16_t, 16, int8_t, q_s8)	\
 THING (uint8x16_t, 16, uint8_t, q_u8)	\
 THING (int16x8_t, 8, int16_t, q_s16)	\
 THING (uint16x8_t, 8, uint16_t, q_u16)	\
+THING (float16x8_t, 8, float16_t, q_f16)\
 THING (int32x4_t, 4, int32_t, q_s32)	\
 THING (uint32x4_t, 4, uint32_t, q_u32)	\
+THING (float32x4_t, 4, float32_t, q_f32)\
 THING (int64x2_t, 2, int64_t, q_s64)	\
 THING (uint64x2_t, 2, uint64_t, q_u64)	\
 THING (float64x2_t, 2, float64_t, q_f64)
diff --git a/gcc/testsuite/gcc.target/aarch64/vld1_lane.c b/gcc/testsuite/gcc.target/aarch64/vld1_lane.c
index c2445f8df53034027051722155a40161b86574bb..c70df7135c1f32714d87f0c21cae41650354ffb6 100644
--- a/gcc/testsuite/gcc.target/aarch64/vld1_lane.c
+++ b/gcc/testsuite/gcc.target/aarch64/vld1_lane.c
@@ -16,6 +16,7 @@ VARIANT (int32, , 2, _s32, 0)	\
 VARIANT (int64, , 1, _s64, 0)	\
 VARIANT (poly8, , 8, _p8, 7)	\
 VARIANT (poly16, , 4, _p16, 2)	\
+VARIANT (float16, , 4, _f16, 3)	\
 VARIANT (float32, , 2, _f32, 1)	\
 VARIANT (float64, , 1, _f64, 0)	\
 VARIANT (uint8, q, 16, _u8, 13)	\
@@ -28,6 +29,7 @@ VARIANT (int32, q, 4, _s32, 1)	\
 VARIANT (int64, q, 2, _s64, 1)	\
 VARIANT (poly8, q, 16, _p8, 7)	\
 VARIANT (poly16, q, 8, _p16, 4)	\
+VARIANT (float16, q, 8, _f16, 3)\
 VARIANT (float32, q, 4, _f32, 2)\
 VARIANT (float64, q, 2, _f64, 1)
 
@@ -56,7 +58,7 @@ VARIANTS (TESTMETH)
 
 #define CHECK(BASE, Q, ELTS, SUFFIX, LANE)			\
   if (test_vld1##Q##_lane##SUFFIX ((const BASE##_t *)orig_data,	\
-				   BASE##_data) != 0)	\
+				   & BASE##_data) != 0)	\
     abort ();
 
 int
@@ -65,20 +67,20 @@ main (int argc, char **argv)
   /* Original data for all vector formats.  */
   uint64_t orig_data[2] = {0x1234567890abcdefULL, 0x13579bdf02468aceULL};
 
-  /* Data with which vldN_lane will overwrite some of previous.  */
-  uint8_t uint8_data[4] = { 7, 11, 13, 17 };
-  uint16_t uint16_data[4] = { 257, 263, 269, 271 };
-  uint32_t uint32_data[4] = { 65537, 65539, 65543, 65551 };
-  uint64_t uint64_data[4] = { 0xdeadbeefcafebabeULL, 0x0123456789abcdefULL,
-			      0xfedcba9876543210LL, 0xdeadbabecafebeefLL };
-  int8_t int8_data[4] = { -1, 3, -5, 7 };
-  int16_t int16_data[4] = { 257, -259, 261, -263 };
-  int32_t int32_data[4] = { 123456789, -987654321, -135792468, 975318642 };
-  int64_t *int64_data = (int64_t *)uint64_data;
-  poly8_t poly8_data[4] = { 0, 7, 13, 18, };
-  poly16_t poly16_data[4] = { 11111, 2222, 333, 44 };
-  float32_t float32_data[4] = { 3.14159, 2.718, 1.414, 100.0 };
-  float64_t float64_data[4] = { 1.010010001, 12345.6789, -9876.54321, 1.618 };
+  /* Data with which vld1_lane will overwrite one element of previous.  */
+  uint8_t uint8_data = 7;
+  uint16_t uint16_data = 257;
+  uint32_t uint32_data = 65537;
+  uint64_t uint64_data = 0xdeadbeefcafebabeULL;
+  int8_t int8_data = -1;
+  int16_t int16_data = -259;
+  int32_t int32_data = -987654321;
+  int64_t int64_data = 0x1234567890abcdefLL;
+  poly8_t poly8_data = 13;
+  poly16_t poly16_data = 11111;
+  float16_t float16_data = 8.75;
+  float32_t float32_data = 3.14159;
+  float64_t float64_data = 1.010010001;
 
   VARIANTS (CHECK);
   return 0;
diff --git a/gcc/testsuite/gcc.target/aarch64/vset_lane_1.c b/gcc/testsuite/gcc.target/aarch64/vset_lane_1.c
index 5fb11399f202df7bc9a67c3d8ffb78f71c87e5c6..bc0132c20a7b8150b81491eaaf9b76ce448b2410 100644
--- a/gcc/testsuite/gcc.target/aarch64/vset_lane_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/vset_lane_1.c
@@ -16,6 +16,7 @@ VARIANT (int32_t, , 2, int32x2_t, _s32, 0)	\
 VARIANT (int64_t, , 1, int64x1_t, _s64, 0)	\
 VARIANT (poly8_t, , 8, poly8x8_t, _p8, 6)	\
 VARIANT (poly16_t, , 4, poly16x4_t, _p16, 2)	\
+VARIANT (float16_t, , 4, float16x4_t, _f16, 3)	\
 VARIANT (float32_t, , 2, float32x2_t, _f32, 1)	\
 VARIANT (float64_t, , 1, float64x1_t, _f64, 0)	\
 VARIANT (uint8_t, q, 16, uint8x16_t, _u8, 11)	\
@@ -28,6 +29,7 @@ VARIANT (int32_t, q, 4, int32x4_t, _s32, 3)	\
 VARIANT (int64_t, q, 2, int64x2_t, _s64, 0)	\
 VARIANT (poly8_t, q, 16, poly8x16_t, _p8, 14)	\
 VARIANT (poly16_t, q, 8, poly16x8_t, _p16, 6)	\
+VARIANT (float16_t, q, 8, float16x8_t, _f16, 6)	\
 VARIANT (float32_t, q, 4, float32x4_t, _f32, 2) \
 VARIANT (float64_t, q, 2, float64x2_t, _f64, 1)
 
@@ -76,6 +78,9 @@ main (int argc, char **argv)
   poly8_t poly8_t_data[16] =
       { 0, 7, 13, 18, 22, 25, 27, 28, 29, 31, 34, 38, 43, 49, 56, 64 };
   poly16_t poly16_t_data[8] = { 11111, 2222, 333, 44, 5, 65432, 54321, 43210 };
+  float16_t float16_t_data[8] = { 1.25, 4.5, 7.875, 2.3125, 5.675, 8.875,
+      3.6875, 6.75};
+
   float32_t float32_t_data[4] = { 3.14159, 2.718, 1.414, 100.0 };
   float64_t float64_t_data[2] = { 1.01001000100001, 12345.6789 };
 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 7/14][AArch64] vld{2,3,4}{,_lane,_dup},vcombine,vcreate
  2015-04-22 16:54 [PATCH 0/14][ARM/AArch64] __FP16 support, vectors, intrinsics, testsuite Alan Lawrence
                   ` (5 preceding siblings ...)
  2015-04-22 17:11 ` [PATCH 6/14][AArch64] Add support for float16x{4,8}_t vectors/builtins Alan Lawrence
@ 2015-04-22 17:14 ` Alan Lawrence
  2015-04-22 17:40   ` Alan Lawrence
  2015-04-22 17:16 ` [PATCH 8/14][AArch64]Add vreinterpret, float_truncate_lo/hi, vget_low/high Alan Lawrence
                   ` (7 subsequent siblings)
  14 siblings, 1 reply; 28+ messages in thread
From: Alan Lawrence @ 2015-04-22 17:14 UTC (permalink / raw)
  To: gcc-patches

gcc/ChangeLog:

	* config/aarch64/aarch64.c (aarch64_split_simd_combine): Add V4HFmode.
	* config/aarch64/aarch64-builtins.c (VAR13, VAR14): New.
	(aarch64_scalar_builtin_types, aarch64_init_simd_builtin_scalar_types):
	Add __builtin_aarch64_simd_hf.
	* config/aarch64/arm_neon.h (float16x4x2_t, float16x8x2_t,
	float16x4x3_t, float16x8x3_t, float16x4x4_t, float16x8x4_t,
	vcombine_f16, vst2_lane_f16, vst2q_lane_f16, vst3_lane_f16,
	vst3q_lane_f16, vst4_lane_f16, vst4q_lane_f16, vld2_f16, vld2q_f16,
	vld3_f16, vld3q_f16, vld4_f16, vld4q_f16, vld2_dup_f16, vld2q_dup_f16,
	vld3_dup_f16, vld3q_dup_f16, vld4_dup_f16, vld4q_dup_f16,
	vld2_lane_f16, vld2q_lane_f16, vld3_lane_f16, vld3q_lane_f16,
	vld4_lane_f16, vld4q_lane_f16, vst2_f16, vst2q_f16, vst3_f16,
	vst3q_f16, vst4_f16, vst4q_f16, vcreate_f16): New.

	* config/aarch64/iterators.md (VALLDIF, Vtype, Vetype, Vbtype,
	V_cmp_result, v_cmp_result): Add cases for V4HF and V8HF.
	(VDC, Vdbl): Add V4HF.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/vldN_1.c: Add float16x4_t and float16x8_t cases.
	* gcc.target/aarch64/vldN_dup_1.c: Likewise.
	* gcc.target/aarch64/vldN_lane_1.c: Likewise.

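As an illustrative aside (not part of the patch; the function name is mine,
the intrinsic signature is per the ACLE spec), the new structure loads
follow the usual NEON pattern:

#include <arm_neon.h>

/* Load twelve consecutive float16_t values, de-interleaving them into
   three float16x4_t vectors (an LD3 operation).  */
float16x4x3_t
load3 (const float16_t *__p)
{
  return vld3_f16 (__p);
}
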
^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 8/14][AArch64]Add vreinterpret, float_truncate_lo/hi, vget_low/high
  2015-04-22 16:54 [PATCH 0/14][ARM/AArch64] __FP16 support, vectors, intrinsics, testsuite Alan Lawrence
                   ` (6 preceding siblings ...)
  2015-04-22 17:14 ` [PATCH 7/14][AArch64] vld{2,3,4}{,_lane,_dup},vcombine,vcreate Alan Lawrence
@ 2015-04-22 17:16 ` Alan Lawrence
  2015-04-22 17:19 ` [PATCH 9/14][AArch64] vld1(q?)_dup, missing vreinterpretq intrinsics Alan Lawrence
                   ` (6 subsequent siblings)
  14 siblings, 0 replies; 28+ messages in thread
From: Alan Lawrence @ 2015-04-22 17:16 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1642 bytes --]

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (aarch64_float_truncate_lo_v2sf):
	Reparameterize to...
	(aarch64_float_truncate_lo_<mode>): ...this, for both V2SF and V4HF.
	(aarch64_float_truncate_hi_v4sf): Reparameterize to...
	(aarch64_float_truncate_hi_<Vdbl>): ...this, for both V4SF and V8HF.

	* config/aarch64/aarch64-simd-builtins.def (float_truncate_hi_): Add
	v8hf variant.
	(float_truncate_lo_): Use BUILTIN_VDF iterator.

	* config/aarch64/arm_neon.h (vreinterpret_p8_f16, vreinterpret_p16_f16,
	vreinterpret_f16_f64, vreinterpret_f16_s8, vreinterpret_f16_s16,
	vreinterpret_f16_s32, vreinterpret_f16_s64, vreinterpret_f16_f32,
	vreinterpret_f16_u8, vreinterpret_f16_u16, vreinterpret_f16_u32,
	vreinterpret_f16_u64, vreinterpret_f16_p8, vreinterpret_f16_p16,
	vreinterpretq_f16_f64, vreinterpretq_f16_s8, vreinterpretq_f16_s16,
	vreinterpretq_f16_s32, vreinterpretq_f16_s64, vreinterpretq_f16_f32,
	vreinterpretq_f16_u8, vreinterpretq_f16_u16, vreinterpretq_f16_u32,
	vreinterpretq_f16_u64, vreinterpretq_f16_p8, vreinterpretq_f16_p16,
	vreinterpret_f32_f16, vreinterpret_f64_f16, vreinterpret_s64_f16,
	vreinterpret_u64_f16, vreinterpretq_u64_f16, vreinterpret_s8_f16,
	vreinterpret_s16_f16, vreinterpret_s32_f16, vreinterpret_u8_f16,
	vreinterpret_u16_f16, vreinterpret_u32_f16, vget_low_f16, vget_high_f16,
	vcvt_f16_f32, vcvt_high_f16_f32): New.

	* config/aarch64/iterators.md (VDF, Vdtype): New.
	(VWIDE, Vmwtype): Add cases for V4HF and V2SF.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/vget_high_1.c: Add float16x8->float16x4 case.
	* gcc.target/aarch64/vget_low_1.c: Likewise.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 08_aarch64_vreinterpret_lowhigh.patch --]
[-- Type: text/x-patch; name=08_aarch64_vreinterpret_lowhigh.patch, Size: 3605 bytes --]

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 6298063d13c444d9b7c6cb0c14cfabce611f0d56..ea84055476c9e56e78d1b843e0b028e85a672ee6 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -4857,6 +4857,12 @@ vsetq_lane_u64 (uint64_t __elem, uint64x2_t __vec, const int __index)
   uint64x1_t lo = vcreate_u64 (vgetq_lane_u64 (tmp, 0));  \
   return vreinterpret_##__TYPE##_u64 (lo);
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vget_low_f16 (float16x8_t __a)
+{
+  __GET_LOW (f16);
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vget_low_f32 (float32x4_t __a)
 {
@@ -4936,6 +4942,12 @@ vget_low_u64 (uint64x2_t __a)
   uint64x1_t hi = vcreate_u64 (vgetq_lane_u64 (tmp, 1));	\
   return vreinterpret_##__TYPE##_u64 (hi);
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vget_high_f16 (float16x8_t __a)
+{
+  __GET_HIGH (f16);
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vget_high_f32 (float32x4_t __a)
 {
diff --git a/gcc/testsuite/gcc.target/aarch64/vget_high_1.c b/gcc/testsuite/gcc.target/aarch64/vget_high_1.c
index 4cb872da2cd269df5290a6af928ed958c4fecd09..b6b57e0c5468dbf571ec9e9196ac2d0fa3754d7a 100644
--- a/gcc/testsuite/gcc.target/aarch64/vget_high_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/vget_high_1.c
@@ -14,6 +14,7 @@ VARIANT (int8_t, 8, int8x8_t, int8x16_t, s8)		\
 VARIANT (int16_t, 4, int16x4_t, int16x8_t, s16)		\
 VARIANT (int32_t, 2, int32x2_t, int32x4_t, s32)		\
 VARIANT (int64_t, 1, int64x1_t, int64x2_t, s64)		\
+VARIANT (float16_t, 4, float16x4_t, float16x8_t, f16)	\
 VARIANT (float32_t, 2, float32x2_t, float32x4_t, f32)	\
 VARIANT (float64_t, 1, float64x1_t, float64x2_t, f64)
 
@@ -51,6 +52,8 @@ main (int argc, char **argv)
   int16_t int16_t_data[8] = { -17, 19, 3, -999, 44048, 505, 9999, 1000};
   int32_t int32_t_data[4] = { 123456789, -987654321, -135792468, 975318642 };
   int64_t int64_t_data[2] = {0xfedcba9876543210LL, 0xdeadbabecafebeefLL };
+  float16_t float16_t_data[8] = { 1.25, 4.5, 7.875, 2.3125, 5.675, 8.875,
+      3.6875, 6.75};
   float32_t float32_t_data[4] = { 3.14159, 2.718, 1.414, 100.0 };
   float64_t float64_t_data[2] = { 1.01001000100001, 12345.6789 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/vget_low_1.c b/gcc/testsuite/gcc.target/aarch64/vget_low_1.c
index f8016ef73124981f7042957521f42754566e9518..2223676521c4c10b2d839746873eb559559d76ba 100644
--- a/gcc/testsuite/gcc.target/aarch64/vget_low_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/vget_low_1.c
@@ -14,6 +14,7 @@ VARIANT (int8_t, 8, int8x8_t, int8x16_t, s8)		\
 VARIANT (int16_t, 4, int16x4_t, int16x8_t, s16)		\
 VARIANT (int32_t, 2, int32x2_t, int32x4_t, s32)		\
 VARIANT (int64_t, 1, int64x1_t, int64x2_t, s64)		\
+VARIANT (float16_t, 4, float16x4_t, float16x8_t, f16)	\
 VARIANT (float32_t, 2, float32x2_t, float32x4_t, f32)	\
 VARIANT (float64_t, 1, float64x1_t, float64x2_t, f64)
 
@@ -51,6 +52,8 @@ main (int argc, char **argv)
   int16_t int16_t_data[8] = { -17, 19, 3, -999, 44048, 505, 9999, 1000};
   int32_t int32_t_data[4] = { 123456789, -987654321, -135792468, 975318642 };
   int64_t int64_t_data[2] = {0xfedcba9876543210LL, 0xdeadbabecafebeefLL };
+  float16_t float16_t_data[8] = { 1.25, 4.5, 7.875, 2.3125, 5.675, 8.875,
+      3.6875, 6.75};
   float32_t float32_t_data[4] = { 3.14159, 2.718, 1.414, 100.0 };
   float64_t float64_t_data[2] = { 1.01001000100001, 12345.6789 };
 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 9/14][AArch64] vld1(q?)_dup, missing vreinterpretq intrinsics
  2015-04-22 16:54 [PATCH 0/14][ARM/AArch64] __FP16 support, vectors, intrinsics, testsuite Alan Lawrence
                   ` (7 preceding siblings ...)
  2015-04-22 17:16 ` [PATCH 8/14][AArch64]Add vreinterpret, float_truncate_lo/hi, vget_low/high Alan Lawrence
@ 2015-04-22 17:19 ` Alan Lawrence
  2015-04-22 17:21 ` [PATCH 10/14][AArch64] Add vcvt(_high)?_f32_f16 intrinsics Alan Lawrence
                   ` (5 subsequent siblings)
  14 siblings, 0 replies; 28+ messages in thread
From: Alan Lawrence @ 2015-04-22 17:19 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 342 bytes --]

gcc/ChangeLog:

	* config/aarch64/arm_neon.h (vreinterpretq_p8_f16,
	vreinterpretq_p16_f16, vreinterpretq_f32_f16, vreinterpretq_f64_f16,
	vreinterpretq_s64_f16, vreinterpretq_s8_f16, vreinterpretq_s16_f16,
	vreinterpretq_s32_f16, vreinterpretq_u8_f16, vreinterpretq_u16_f16,
	vreinterpretq_u32_f16, vld1_dup_f16, vld1q_dup_f16): New.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 09_aarch64_vreinterpretq_vld1_dup.patch --]
[-- Type: text/x-patch; name=09_aarch64_vreinterpretq_vld1_dup.patch, Size: 5292 bytes --]

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 88723231e5c32faf3bc68eccdf4e3a2b104b57b9..6d98b2e08221c3e25c4f66e6058b4f228d90a094 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -2993,6 +2993,12 @@ vreinterpretq_p8_s64 (int64x2_t __a)
 }
 
 __extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_p8_f16 (float16x8_t __a)
+{
+  return (poly8x16_t) __a;
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
 vreinterpretq_p8_f32 (float32x4_t __a)
 {
   return (poly8x16_t) __a;
@@ -3131,6 +3137,12 @@ vreinterpretq_p16_s64 (int64x2_t __a)
 }
 
 __extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_p16_f16 (float16x8_t __a)
+{
+  return (poly16x8_t) __a;
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
 vreinterpretq_p16_f32 (float32x4_t __a)
 {
   return (poly16x8_t) __a;
@@ -3383,6 +3395,12 @@ vreinterpret_f32_p16 (poly16x4_t __a)
 }
 
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_f32_f16 (float16x8_t __a)
+{
+  return (float32x4_t) __a;
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vreinterpretq_f32_f64 (float64x2_t __a)
 {
   return (float32x4_t) __a;
@@ -3521,6 +3539,12 @@ vreinterpret_f64_u64 (uint64x1_t __a)
 }
 
 __extension__ static __inline float64x2_t __attribute__((__always_inline__))
+vreinterpretq_f64_f16 (float16x8_t __a)
+{
+  return (float64x2_t) __a;
+}
+
+__extension__ static __inline float64x2_t __attribute__((__always_inline__))
 vreinterpretq_f64_f32 (float32x4_t __a)
 {
   return (float64x2_t) __a;
@@ -3683,6 +3707,12 @@ vreinterpretq_s64_s32 (int32x4_t __a)
 }
 
 __extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_s64_f16 (float16x8_t __a)
+{
+  return (int64x2_t) __a;
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
 vreinterpretq_s64_f32 (float32x4_t __a)
 {
   return (int64x2_t) __a;
@@ -3965,6 +3995,12 @@ vreinterpretq_s8_s64 (int64x2_t __a)
 }
 
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_s8_f16 (float16x8_t __a)
+{
+  return (int8x16_t) __a;
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vreinterpretq_s8_f32 (float32x4_t __a)
 {
   return (int8x16_t) __a;
@@ -4103,6 +4139,12 @@ vreinterpretq_s16_s64 (int64x2_t __a)
 }
 
 __extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_s16_f16 (float16x8_t __a)
+{
+  return (int16x8_t) __a;
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
 vreinterpretq_s16_f32 (float32x4_t __a)
 {
   return (int16x8_t) __a;
@@ -4241,6 +4283,12 @@ vreinterpretq_s32_s64 (int64x2_t __a)
 }
 
 __extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_s32_f16 (float16x8_t __a)
+{
+  return (int32x4_t) __a;
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
 vreinterpretq_s32_f32 (float32x4_t __a)
 {
   return (int32x4_t) __a;
@@ -4385,6 +4433,12 @@ vreinterpretq_u8_s64 (int64x2_t __a)
 }
 
 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_u8_f16 (float16x8_t __a)
+{
+  return (uint8x16_t) __a;
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
 vreinterpretq_u8_f32 (float32x4_t __a)
 {
   return (uint8x16_t) __a;
@@ -4523,6 +4577,12 @@ vreinterpretq_u16_s64 (int64x2_t __a)
 }
 
 __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_u16_f16 (float16x8_t __a)
+{
+  return (uint16x8_t) __a;
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
 vreinterpretq_u16_f32 (float32x4_t __a)
 {
   return (uint16x8_t) __a;
@@ -4661,6 +4721,12 @@ vreinterpretq_u32_s64 (int64x2_t __a)
 }
 
 __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_u32_f16 (float16x8_t __a)
+{
+  return (uint32x4_t) __a;
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
 vreinterpretq_u32_f32 (float32x4_t __a)
 {
   return (uint32x4_t) __a;
@@ -15107,6 +15173,13 @@ vld1q_u64 (const uint64_t *a)
 
 /* vld1_dup  */
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vld1_dup_f16 (const float16_t* __a)
+{
+  float16_t __f = *__a;
+  return (float16x4_t) { __f, __f, __f, __f };
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vld1_dup_f32 (const float32_t* __a)
 {
@@ -15181,6 +15254,13 @@ vld1_dup_u64 (const uint64_t* __a)
 
 /* vld1q_dup  */
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vld1q_dup_f16 (const float16_t* __a)
+{
+  float16_t __f = *__a;
+  return (float16x8_t) { __f, __f, __f, __f, __f, __f, __f, __f };
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vld1q_dup_f32 (const float32_t* __a)
 {

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 10/14][AArch64] Add vcvt(_high)?_f32_f16 intrinsics
  2015-04-22 16:54 [PATCH 0/14][ARM/AArch64] __FP16 support, vectors, intrinsics, testsuite Alan Lawrence
                   ` (8 preceding siblings ...)
  2015-04-22 17:19 ` [PATCH 9/14][AArch64] vld1(q?)_dup, missing vreinterpretq intrinsics Alan Lawrence
@ 2015-04-22 17:21 ` Alan Lawrence
  2015-04-22 17:31 ` [PATCH 11/14][fold-const.c] Fix bigendian HFmode in native_interpret_real Alan Lawrence
                   ` (4 subsequent siblings)
  14 siblings, 0 replies; 28+ messages in thread
From: Alan Lawrence @ 2015-04-22 17:21 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 3389 bytes --]

This adds the two remaining widening intrinsics, first adding patterns in 
aarch64-simd.md, then entries in aarch64-simd-builtins.def, and finally 
intrinsics in arm_neon.h.

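For reference, here is a minimal usage sketch of the two new intrinsics (not
part of the patch; the function name is mine, the intrinsic signatures are
per the ACLE spec, and vget_low_f16 comes from patch 8/14):

#include <arm_neon.h>

/* Widen the low four, then the high four, half-precision lanes of __a
   to single precision (fcvtl / fcvtl2).  */
void
widen (float16x8_t __a, float32x4_t *__lo, float32x4_t *__hi)
{
  *__lo = vcvt_f32_f16 (vget_low_f16 (__a));
  *__hi = vcvt_high_f32_f16 (__a);
}
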
Note this changes the vector indices present in the RTL for float vec_unpacks 
on bigendian, making them the same as for integer vec_unpacks. This appears 
consistent with the usage of VEC_UNPACK_(FLOAT_)?EXPR in tree-vect-stmts.c, 
which uses a different EXPR for the same half of the vector depending on 
endianness. I was not able to construct a testcase where the RTL here 
mattered (i.e. where the RTL was constant-folded, but the tree had not been), 
but the correctness can be seen from a testcase:

double d[4];
void
bar (float *f)
{
  for (int i = 0; i < 4; i++)
    d[i] = f[i];
}

which used to produce as final RTL (-O3)

(insn:TI 8 10 12 (set (reg:V2DF 33 v1 [orig:78 vect__9.19 ] [78])
        (float_extend:V2DF (vec_select:V2SF (reg:V4SF 32 v0 [orig:77 MEM[(float *)f_6(D)] ] [77])
                (parallel [
                        (const_int 2 [0x2])
                        (const_int 3 [0x3])
                    ])))) test.c:40 1274 {vec_unpacks_hi_v4sf}
     (expr_list:REG_EQUIV (mem/c:V2DF (reg/f:DI 0 x0 [79]) [2 MEM[(double *)&d]+0 S16 A64])
        (nil)))
(insn:TI 12 8 11 (set (reg:V2DF 32 v0 [orig:81 vect__9.19 ] [81])
        (float_extend:V2DF (vec_select:V2SF (reg:V4SF 32 v0 [orig:77 MEM[(float *)f_6(D)] ] [77])
                (parallel [
                        (const_int 0 [0])
                        (const_int 1 [0x1])
                    ])))) test.c:40 1272 {vec_unpacks_lo_v4sf}
     (expr_list:REG_EQUIV (mem/c:V2DF (plus:DI (reg/f:DI 0 x0 [79])
                (const_int 16 [0x10])) [2 MEM[(double *)&d + 16B]+0 S16 A64])
        (nil)))
(insn:TI 11 12 15 (set (mem/c:V2DF (reg/f:DI 0 x0 [79]) [2 MEM[(double *)&d]+0 S16 A64])
        (reg:V2DF 33 v1 [orig:78 vect__9.19 ] [78])) test.c:40 808 {*aarch64_simd_movv2df}
     (expr_list:REG_DEAD (reg:V2DF 33 v1 [orig:78 vect__9.19 ] [78])
        (nil)))
(insn:TI 15 11 22 (set (mem/c:V2DF (plus:DI (reg/f:DI 0 x0 [79])
                (const_int 16 [0x10])) [2 MEM[(double *)&d + 16B]+0 S16 A64])
        (reg:V2DF 32 v0 [orig:81 vect__9.19 ] [81])) test.c:40 808 {*aarch64_simd_movv2df}
     (expr_list:REG_DEAD (reg:V2DF 32 v0 [orig:81 vect__9.19 ] [81])

i.e. apparently storing vector elements 2 and 3 to the address of d, and elems 
0+1 to address (d+16). Of course this was flipped back again to be correct at 
assembly time, but following this patch the RTL indices are also correct (elems 
0+1 to address d, elems 2+3 to address d+16).

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (aarch64_simd_vec_unpacks_lo_<mode>,
	aarch64_simd_vec_unpacks_hi_<mode>): New insn.
	(vec_unpacks_lo_v4sf, vec_unpacks_hi_v4sf): Delete insn.
	(vec_unpacks_lo_<mode>, vec_unpacks_hi_<mode>): New expand.
	(aarch64_float_extend_lo_v2df): Rename to...
	(aarch64_float_extend_lo_<Vwide>): this, using VDF and so adding V4SF.

	* config/aarch64/aarch64-simd-builtins.def (vec_unpacks_hi): Add v8hf.
	(float_extend_lo): Add v4sf.

	* config/aarch64/arm_neon.h (vcvt_f32_f16, vcvt_high_f32_f16): New.
	* config/aarch64/iterators.md (VQ_HSF): New iterator.
	(VWIDE, Vwtype, Vhalftype): Add V8HF, V4SF.
	(Vwide): New mode_attr.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 10_aarch64_vcvt_high_f32_f16.patch --]
[-- Type: text/x-patch; name=10_aarch64_vcvt_high_f32_f16.patch, Size: 7064 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 604bfa20bf838ee04ef0e1dda0b47b55dbdd82a6..1eefb37c2eba37aecee6ccae100274c5a8cc5ae3 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -360,11 +360,11 @@
      only ever used for the int64x1_t intrinsic, there is no scalar version.  */
   BUILTIN_VALLDI (UNOP, abs, 2)
 
-  VAR1 (UNOP, vec_unpacks_hi_, 10, v4sf)
+  VAR2 (UNOP, vec_unpacks_hi_, 10, v4sf, v8hf)
   VAR1 (BINOP, float_truncate_hi_, 0, v4sf)
   VAR1 (BINOP, float_truncate_hi_, 0, v8hf)
   
-  VAR1 (UNOP, float_extend_lo_, 0, v2df)
+  VAR2 (UNOP, float_extend_lo_, 0, v2df, v4sf)
   BUILTIN_VDF (UNOP, float_truncate_lo_, 0)
   
   /* Implemented by aarch64_ld1<VALL_F16:mode>.  */
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 161396a331cab777bb2108f86c39b74557be4abc..17a5d5f8c757833a7ed387083f8076af2c8cad66 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1677,36 +1677,57 @@
 
 ;; Float widening operations.
 
-(define_insn "vec_unpacks_lo_v4sf"
-  [(set (match_operand:V2DF 0 "register_operand" "=w")
-	(float_extend:V2DF
-	  (vec_select:V2SF
-	    (match_operand:V4SF 1 "register_operand" "w")
-	    (parallel [(const_int 0) (const_int 1)])
-	  )))]
+(define_insn "aarch64_simd_vec_unpacks_lo_<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (float_extend:<VWIDE> (vec_select:<VHALF>
+			       (match_operand:VQ_HSF 1 "register_operand" "w")
+			       (match_operand:VQ_HSF 2 "vect_par_cnst_lo_half" "")
+			    )))]
   "TARGET_SIMD"
-  "fcvtl\\t%0.2d, %1.2s"
+  "fcvtl\\t%0.<Vwtype>, %1.<Vhalftype>"
   [(set_attr "type" "neon_fp_cvt_widen_s")]
 )
 
-(define_insn "aarch64_float_extend_lo_v2df"
-  [(set (match_operand:V2DF 0 "register_operand" "=w")
-	(float_extend:V2DF
-	  (match_operand:V2SF 1 "register_operand" "w")))]
+(define_expand "vec_unpacks_lo_<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "")
+   (match_operand:VQ_HSF 1 "register_operand" "")]
   "TARGET_SIMD"
-  "fcvtl\\t%0.2d, %1.2s"
+  {
+    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, false);
+    emit_insn (gen_aarch64_simd_vec_unpacks_lo_<mode> (operands[0],
+						       operands[1], p));
+    DONE;
+  }
+)
+
+(define_insn "aarch64_simd_vec_unpacks_hi_<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (float_extend:<VWIDE> (vec_select:<VHALF>
+			       (match_operand:VQ_HSF 1 "register_operand" "w")
+			       (match_operand:VQ_HSF 2 "vect_par_cnst_hi_half" "")
+			    )))]
+  "TARGET_SIMD"
+  "fcvtl2\\t%0.<Vwtype>, %1.<Vtype>"
   [(set_attr "type" "neon_fp_cvt_widen_s")]
 )
 
-(define_insn "vec_unpacks_hi_v4sf"
-  [(set (match_operand:V2DF 0 "register_operand" "=w")
-	(float_extend:V2DF
-	  (vec_select:V2SF
-	    (match_operand:V4SF 1 "register_operand" "w")
-	    (parallel [(const_int 2) (const_int 3)])
-	  )))]
+(define_expand "vec_unpacks_hi_<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "")
+   (match_operand:VQ_HSF 1 "register_operand" "")]
+  "TARGET_SIMD"
+  {
+    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+    emit_insn (gen_aarch64_simd_vec_unpacks_hi_<mode> (operands[0],
+						       operands[1], p));
+    DONE;
+  }
+)
+
+(define_insn "aarch64_float_extend_lo_<Vwide>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+	(float_extend:<VWIDE>
+	  (match_operand:VDF 1 "register_operand" "w")))]
   "TARGET_SIMD"
-  "fcvtl2\\t%0.2d, %1.4s"
+  "fcvtl\\t%0<Vmwtype>, %1<Vmtype>"
   [(set_attr "type" "neon_fp_cvt_widen_s")]
 )
 
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 424e2807ca80ea152df94cccba0bf36c7f443c2c..ecfabd2b1282f48150ef5193b55c6ba2b9cbfce3 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -6026,10 +6026,6 @@ vaddlvq_u32 (uint32x4_t a)
        result;                                                          \
      })
 
-/* vcvt_f32_f16 not supported */
-
-/* vcvt_high_f32_f16 not supported */
-
 #define vcvt_n_f32_s32(a, b)                                            \
   __extension__                                                         \
     ({                                                                  \
@@ -13420,6 +13416,12 @@ vcvt_high_f32_f64 (float32x2_t __a, float64x2_t __b)
 
 /* vcvt (float -> double).  */
 
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vcvt_f32_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_float_extend_lo_v4sf (__a);
+}
+
 __extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
 vcvt_f64_f32 (float32x2_t __a)
 {
@@ -13427,6 +13429,12 @@ vcvt_f64_f32 (float32x2_t __a)
   return __builtin_aarch64_float_extend_lo_v2df (__a);
 }
 
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vcvt_high_f32_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_vec_unpacks_hi_v8hf (__a);
+}
+
 __extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
 vcvt_high_f64_f32 (float32x4_t __a)
 {
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index ad043931705d10cccaaeb2a74ad8320b0305b435..d3207d3a41d00aba4e67b4319c0d5a0edbf602b6 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -91,6 +91,9 @@
 ;; Vector single Float modes.
 (define_mode_iterator VDQSF [V2SF V4SF])
 
+;; Quad vector Float modes with half/single elements.
+(define_mode_iterator VQ_HSF [V8HF V4SF])
+
 ;; Modes suitable to use as the return type of a vcond expression.
 (define_mode_iterator VDQF_COND [V2SF V2SI V4SF V4SI V2DF V2DI])
 
@@ -490,14 +493,18 @@
 			 (V2SI "V2DI") (V16QI "V8HI") 
 			 (V8HI "V4SI") (V4SI "V2DI")
 			 (HI "SI")     (SI "DI")
+			 (V8HF "V4SF") (V4SF "V2DF")
 			 (V4HF "V4SF") (V2SF "V2DF")]
-
 )
 
-;; Widened mode register suffixes for VD_BHSI/VQW.
+;; Widened modes of vector modes, lowercase
+(define_mode_attr Vwide [(V2SF "v2df") (V4HF "v4sf")])
+
+;; Widened mode register suffixes for VD_BHSI/VQW/VQ_HSF.
 (define_mode_attr Vwtype [(V8QI "8h") (V4HI "4s")
 			  (V2SI "2d") (V16QI "8h") 
-			  (V8HI "4s") (V4SI "2d")])
+			  (V8HI "4s") (V4SI "2d")
+			  (V8HF "4s") (V4SF "2d")])
 
 ;; Widened mode register suffixes for VDW/VQW.
 (define_mode_attr Vmwtype [(V8QI ".8h") (V4HI ".4s")
@@ -506,9 +513,10 @@
 			   (V4HF ".4s") (V2SF ".2d")
 			   (SI   "")    (HI   "")])
 
-;; Lower part register suffixes for VQW.
+;; Lower part register suffixes for VQW/VQ_HSF.
 (define_mode_attr Vhalftype [(V16QI "8b") (V8HI "4h")
-			     (V4SI "2s")])
+			     (V4SI "2s") (V8HF "4h")
+			     (V4SF "2s")])
 
 ;; Define corresponding core/FP element mode for each vector mode.
 (define_mode_attr vw   [(V8QI "w") (V16QI "w")

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 12/14][ARM/AArch64 Testsuite] Update advsimd-intrinsics tests to add float16 vectors
  2015-04-22 16:54 [PATCH 0/14][ARM/AArch64] __FP16 support, vectors, intrinsics, testsuite Alan Lawrence
                   ` (10 preceding siblings ...)
  2015-04-22 17:31 ` [PATCH 11/14][fold-const.c] Fix bigendian HFmode in native_interpret_real Alan Lawrence
@ 2015-04-22 17:31 ` Alan Lawrence
  2015-05-25 12:37   ` Christophe Lyon
  2015-04-22 17:36 ` [PATCH 13/14][ARM/AArch64 testsuite] Use gcc-dg-runtest in advsimd-intrinsics.exp Alan Lawrence
                   ` (2 subsequent siblings)
  14 siblings, 1 reply; 28+ messages in thread
From: Alan Lawrence @ 2015-04-22 17:31 UTC (permalink / raw)
  To: gcc-patches; +Cc: Christophe Lyon

[-- Attachment #1: Type: text/plain, Size: 2623 bytes --]

This is a fairly straightforward addition of a new type: I've added it on 
equal status to the other types, because the various 
vector-load/store/element-manipulating intrinsics are *not* conditional on HW 
support. (They just involve moving 16-bit chunks around, just like 
s16/u16/p16.)

Thus, for many tests, this just involves adding default "expected" values of 
{ 0x3333 ...}. While there are indeed more such "default" values for 
float16x4/8 than for any other type (because there are fewer intrinsics), 
there are plenty of others, so this seems consistent. However, there is no 
vdup_n_f16 intrinsic, so I worked around this using a macro (yes, a bit ugh); 
a standalone sketch of the idea is shown just below.

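For clarity, here is that workaround as a self-contained sketch (the function
name is hypothetical; the patch spells it as a macro named vdup_n_f16 in
arm-neon-ref.h):

#include <arm_neon.h>

/* Emulate the missing vdup_n_f16: bounce the scalar through memory and
   let vld1_dup_f16 splat it across all four lanes.  */
static float16x4_t
my_vdup_n_f16 (float16_t __x)
{
  return vld1_dup_f16 (&__x);
}
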
There are many check_GNU_style.sh violations here but I tried to be consistent 
with the existing code.

Passing on arm-none-linux-gnueabihf, aarch64-none-linux-gnu and 
aarch64_be-none-elf (following the previous patch).

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h (hfloat16_t,
	vdup_n_f16): New.
	(result, expected, CHECK_RESULTS, CHECK_RESULTS_NAMED, clean_results):
	Add float16x4 and float16x8 cases.

	(DECL_VARIABLE_64BITS_VARIANTS): Add float16x4 case.
	(DECL_VARIABLE_128BITS_VARIANTS): Add float16x8 case.

	* gcc.target/aarch64/advsimd-intrinsics/compute-ref-data.h (buffer,
	buffer_pad, buffer_dup, buffer_dup_pad): Add float16x4 and float16x8.

	* gcc.target/aarch64/advsimd-intrinsics/vaba.c: Add expected results
	for float16x4 and float16x8.
	* gcc.target/aarch64/advsimd-intrinsics/vabal.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vabd.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vabdl.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vabs.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vadd.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vaddl.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vaddw.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vand.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vbic.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vbsl.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vcls.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vclz.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vcnt.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c: Likewise.

	* gcc.target/aarch64/advsimd-intrinsics/vcombine.c: Add expected
	results for float16x4 and float16x8.
	(main): Add test of float16x4 -> float16x8 case.
	* gcc.target/aarch64/advsimd-intrinsics/vcreate.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vdup_lane.c: Likewise.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 12_advsimd_float16.patch --]
[-- Type: text/x-patch; name=12_advsimd_float16.patch, Size: 118145 bytes --]

diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
index 80d6b5893cc33eaff4178a2f26aa53ccf1c48dda..3d1b36fd111e906f6e940ccf89900b44e79a68e9 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
@@ -7,6 +7,7 @@
 #include <inttypes.h>
 
 /* helper type, to help write floating point results in integer form.  */
+typedef uint16_t hfloat16_t;
 typedef uint32_t hfloat32_t;
 typedef uint64_t hfloat64_t;
 
@@ -132,6 +133,7 @@ static ARRAY(result, uint, 32, 2);
 static ARRAY(result, uint, 64, 1);
 static ARRAY(result, poly, 8, 8);
 static ARRAY(result, poly, 16, 4);
+static ARRAY(result, float, 16, 4);
 static ARRAY(result, float, 32, 2);
 static ARRAY(result, int, 8, 16);
 static ARRAY(result, int, 16, 8);
@@ -143,6 +145,7 @@ static ARRAY(result, uint, 32, 4);
 static ARRAY(result, uint, 64, 2);
 static ARRAY(result, poly, 8, 16);
 static ARRAY(result, poly, 16, 8);
+static ARRAY(result, float, 16, 8);
 static ARRAY(result, float, 32, 4);
 #ifdef __aarch64__
 static ARRAY(result, float, 64, 2);
@@ -160,6 +163,7 @@ extern ARRAY(expected, uint, 32, 2);
 extern ARRAY(expected, uint, 64, 1);
 extern ARRAY(expected, poly, 8, 8);
 extern ARRAY(expected, poly, 16, 4);
+extern ARRAY(expected, hfloat, 16, 4);
 extern ARRAY(expected, hfloat, 32, 2);
 extern ARRAY(expected, int, 8, 16);
 extern ARRAY(expected, int, 16, 8);
@@ -171,6 +175,7 @@ extern ARRAY(expected, uint, 32, 4);
 extern ARRAY(expected, uint, 64, 2);
 extern ARRAY(expected, poly, 8, 16);
 extern ARRAY(expected, poly, 16, 8);
+extern ARRAY(expected, hfloat, 16, 8);
 extern ARRAY(expected, hfloat, 32, 4);
 extern ARRAY(expected, hfloat, 64, 2);
 
@@ -187,6 +192,7 @@ extern ARRAY(expected, hfloat, 64, 2);
     CHECK(test_name, uint, 64, 1, PRIx64, expected, comment);		\
     CHECK(test_name, poly, 8, 8, PRIx8, expected, comment);		\
     CHECK(test_name, poly, 16, 4, PRIx16, expected, comment);		\
+    CHECK_FP(test_name, float, 16, 4, PRIx16, expected, comment);	\
     CHECK_FP(test_name, float, 32, 2, PRIx32, expected, comment);	\
 									\
     CHECK(test_name, int, 8, 16, PRIx8, expected, comment);		\
@@ -199,6 +205,7 @@ extern ARRAY(expected, hfloat, 64, 2);
     CHECK(test_name, uint, 64, 2, PRIx64, expected, comment);		\
     CHECK(test_name, poly, 8, 16, PRIx8, expected, comment);		\
     CHECK(test_name, poly, 16, 8, PRIx16, expected, comment);		\
+    CHECK_FP(test_name, float, 16, 8, PRIx16, expected, comment);	\
     CHECK_FP(test_name, float, 32, 4, PRIx32, expected, comment);	\
   }									\
 
@@ -214,6 +221,7 @@ extern ARRAY(expected, hfloat, 64, 2);
     CHECK(test_name, uint, 64, 1, PRIx64, EXPECTED, comment);		\
     CHECK(test_name, poly, 8, 8, PRIx8, EXPECTED, comment);		\
     CHECK(test_name, poly, 16, 4, PRIx16, EXPECTED, comment);		\
+    CHECK_FP(test_name, float, 16, 4, PRIx16, EXPECTED, comment);	\
     CHECK_FP(test_name, float, 32, 2, PRIx32, EXPECTED, comment);	\
 									\
     CHECK(test_name, int, 8, 16, PRIx8, EXPECTED, comment);		\
@@ -226,6 +234,7 @@ extern ARRAY(expected, hfloat, 64, 2);
     CHECK(test_name, uint, 64, 2, PRIx64, EXPECTED, comment);		\
     CHECK(test_name, poly, 8, 16, PRIx8, EXPECTED, comment);		\
     CHECK(test_name, poly, 16, 8, PRIx16, EXPECTED, comment);		\
+    CHECK_FP(test_name, float, 16, 8, PRIx16, EXPECTED, comment);	\
     CHECK_FP(test_name, float, 32, 4, PRIx32, EXPECTED, comment);	\
   }									\
 
@@ -374,6 +383,7 @@ static void clean_results (void)
   CLEAN(result, uint, 64, 1);
   CLEAN(result, poly, 8, 8);
   CLEAN(result, poly, 16, 4);
+  CLEAN(result, float, 16, 4);
   CLEAN(result, float, 32, 2);
 
   CLEAN(result, int, 8, 16);
@@ -386,6 +396,7 @@ static void clean_results (void)
   CLEAN(result, uint, 64, 2);
   CLEAN(result, poly, 8, 16);
   CLEAN(result, poly, 16, 8);
+  CLEAN(result, float, 16, 8);
   CLEAN(result, float, 32, 4);
 }
 
@@ -428,6 +439,7 @@ static void clean_results (void)
   DECL_VARIABLE_64BITS_UNSIGNED_VARIANTS(VAR);	\
   DECL_VARIABLE(VAR, poly, 8, 8);		\
   DECL_VARIABLE(VAR, poly, 16, 4);		\
+  DECL_VARIABLE(VAR, float, 16, 4);		\
   DECL_VARIABLE(VAR, float, 32, 2)
 
 /* Declare all 128 bits variants.  */
@@ -436,6 +448,7 @@ static void clean_results (void)
   DECL_VARIABLE_128BITS_UNSIGNED_VARIANTS(VAR);	\
   DECL_VARIABLE(VAR, poly, 8, 16);		\
   DECL_VARIABLE(VAR, poly, 16, 8);		\
+  DECL_VARIABLE(VAR, float, 16, 8);		\
   DECL_VARIABLE(VAR, float, 32, 4)
 
 /* Declare all variants.  */
@@ -456,6 +469,13 @@ static void clean_results (void)
 /* Helpers to initialize vectors.  */
 #define VDUP(VAR, Q, T1, T2, W, N, V)			\
   VECT_VAR(VAR, T1, W, N) = vdup##Q##_n_##T2##W(V)
+/* Work around that there is no vdup_n_f16 intrinsic.  */
+#define vdup_n_f16(VAL)		\
+  __extension__			\
+    ({				\
+      float16_t f = VAL;	\
+      vld1_dup_f16(&f);		\
+    })
 
 #define VSET_LANE(VAR, Q, T1, T2, W, N, L, V)				\
   VECT_VAR(VAR, T1, W, N) = vset##Q##_lane_##T2##W(V,			\
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/compute-ref-data.h b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/compute-ref-data.h
index 26203cc0a69372e6a5d826f3c2b6663a65686a37..6365579389b659864d189132632fade41e026a18 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/compute-ref-data.h
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/compute-ref-data.h
@@ -118,6 +118,8 @@ VECT_VAR_DECL_INIT(buffer, uint, 32, 2);
 PAD(buffer_pad, uint, 32, 2);
 VECT_VAR_DECL_INIT(buffer, uint, 64, 1);
 PAD(buffer_pad, uint, 64, 1);
+VECT_VAR_DECL_INIT(buffer, float, 16, 4);
+PAD(buffer_pad, float, 16, 4);
 VECT_VAR_DECL_INIT(buffer, float, 32, 2);
 PAD(buffer_pad, float, 32, 2);
 VECT_VAR_DECL_INIT(buffer, int, 8, 16);
@@ -140,6 +142,8 @@ VECT_VAR_DECL_INIT(buffer, poly, 8, 16);
 PAD(buffer_pad, poly, 8, 16);
 VECT_VAR_DECL_INIT(buffer, poly, 16, 8);
 PAD(buffer_pad, poly, 16, 8);
+VECT_VAR_DECL_INIT(buffer, float, 16, 8);
+PAD(buffer_pad, float, 16, 8);
 VECT_VAR_DECL_INIT(buffer, float, 32, 4);
 PAD(buffer_pad, float, 32, 4);
 #ifdef __aarch64__
@@ -170,6 +174,8 @@ VECT_VAR_DECL_INIT(buffer_dup, poly, 8, 8);
 VECT_VAR_DECL(buffer_dup_pad, poly, 8, 8);
 VECT_VAR_DECL_INIT(buffer_dup, poly, 16, 4);
 VECT_VAR_DECL(buffer_dup_pad, poly, 16, 4);
+VECT_VAR_DECL_INIT4(buffer_dup, float, 16, 4);
+VECT_VAR_DECL(buffer_dup_pad, float, 16, 4);
 VECT_VAR_DECL_INIT4(buffer_dup, float, 32, 2);
 VECT_VAR_DECL(buffer_dup_pad, float, 32, 2);
 
@@ -193,5 +199,7 @@ VECT_VAR_DECL_INIT(buffer_dup, poly, 8, 16);
 VECT_VAR_DECL(buffer_dup_pad, poly, 8, 16);
 VECT_VAR_DECL_INIT(buffer_dup, poly, 16, 8);
 VECT_VAR_DECL(buffer_dup_pad, poly, 16, 8);
+VECT_VAR_DECL_INIT(buffer_dup, float, 16, 8);
+VECT_VAR_DECL(buffer_dup_pad, float, 16, 8);
 VECT_VAR_DECL_INIT(buffer_dup, float, 32, 4);
 VECT_VAR_DECL(buffer_dup_pad, float, 32, 4);
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vaba.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vaba.c
index 2465cd24416c4c4bc6efa0497648ced2aef41041..d4a356683c42872771cabe16bdf3fa0d43fc65e7 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vaba.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vaba.c
@@ -16,6 +16,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0x3333333333333333 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x33333333, 0x33333333 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0x5e, 0x5f, 0x60, 0x61,
 					0x62, 0x63, 0x64, 0x65,
@@ -41,6 +42,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					 0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 					 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 					   0x33333333, 0x33333333 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vabal.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vabal.c
index cd310623463fd41f51ec8df739698ba61374f0cf..b4719166019c24f6aa53d8452f5792e4f926021a 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vabal.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vabal.c
@@ -16,6 +16,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0x3333333333333333 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x33333333, 0x33333333 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33,
@@ -40,6 +41,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					 0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 					 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 					   0x33333333, 0x33333333 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vabd.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vabd.c
index e95404f67892e25af3632908a7b50cccd0650bef..43839f64dab366341f8252b705ea5c3093189361 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vabd.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vabd.c
@@ -17,6 +17,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0x3333333333333333 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x41c26666, 0x41ba6666 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0x1a, 0x19, 0x18, 0x17,
 					0x16, 0x15, 0x14, 0x13,
@@ -43,6 +44,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					 0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 					 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x42407ae1, 0x423c7ae1,
 					   0x42387ae1, 0x42347ae1 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vabdl.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vabdl.c
index 28018ab0aed3eba3c46f4c0ac5c0043974bf1355..2d216e325f304169d7d0761f6e49f29459752a1c 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vabdl.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vabdl.c
@@ -16,6 +16,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0x3333333333333333 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x33333333, 0x33333333 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33,
@@ -40,6 +41,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					 0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 					 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 					   0x33333333, 0x33333333 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vabs.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vabs.c
index ca3901abb137c2d8f747d3a4bf73bccdbc282094..e1c65e1b64f156ec27f357005462644d18612366 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vabs.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vabs.c
@@ -21,6 +21,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0x3333333333333333 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x33333333, 0x33333333 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0x10, 0xf, 0xe, 0xd, 0xc, 0xb, 0xa, 0x9,
 					0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1 };
@@ -45,6 +46,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					 0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 					 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 					  0x33333333, 0x33333333 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vadd.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vadd.c
index f08c620c82c228680d76f58bd6d3c67a4a653bf1..7617ee769a844d131fd7f42706273e8794768342 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vadd.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vadd.c
@@ -21,6 +21,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff2 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x33333333, 0x33333333 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0xe6, 0xe7, 0xe8, 0xe9,
 					0xea, 0xeb, 0xec, 0xed,
@@ -46,6 +47,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					 0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 					 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 					   0x33333333, 0x33333333 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vaddl.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vaddl.c
index 020d9f84a1b081c3cd1a3aca977328cdfb5d4004..4bbaecdfe2a8c69decd5d2b28efd3d54a051bd0a 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vaddl.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vaddl.c
@@ -19,6 +19,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0x3333333333333333 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x33333333, 0x33333333 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33,
@@ -45,6 +46,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					 0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 					 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 					   0x33333333, 0x33333333 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vaddw.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vaddw.c
index 27f54f635940f6da4112527a1869add13fe611d6..734947ceb3e1edc4daf381e0bd8b54a0810ff5e1 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vaddw.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vaddw.c
@@ -19,6 +19,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0x3333333333333333 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x33333333, 0x33333333 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33,
@@ -45,6 +46,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					 0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 					 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 					   0x33333333, 0x33333333 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vand.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vand.c
index e7e65ddde65cce9b062bd0fda6c05891309b847d..ffddd179efc85ed58dea6d36a63b39b195ffff75 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vand.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vand.c
@@ -17,6 +17,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0x0 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x33333333, 0x33333333 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0xf0, 0xf0, 0xf2, 0xf2,
 					0xf4, 0xf4, 0xf6, 0xf6,
@@ -41,5 +42,7 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					 0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 					 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 					   0x33333333, 0x33333333 };
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbic.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbic.c
index 83e57ff2d610697bc41ffcc98209927a24bb6422..5d7a8128c9e4586737360ce8257e27517731cfea 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbic.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbic.c
@@ -17,6 +17,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x33333333, 0x33333333 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0x0, 0x1, 0x0, 0x1,
 					0x0, 0x1, 0x0, 0x1,
@@ -42,5 +43,7 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					 0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 					 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 					   0x33333333, 0x33333333 };
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbsl.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbsl.c
index bb17f0a9649beedaf0c054679b3efc1a86463c35..6027f3791df3cb540980ca92bff540fd1c927bf4 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbsl.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbsl.c
@@ -16,6 +16,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffff1 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf3, 0xf3, 0xf3, 0xf3,
 					0xf7, 0xf7, 0xf7, 0xf7 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff0, 0xfff0, 0xfff2, 0xfff2 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1800004, 0xc1700004 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0xf2, 0xf2, 0xf2, 0xf2,
 					0xf6, 0xf6, 0xf6, 0xf6,
@@ -43,6 +44,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0xf3, 0xf3, 0xf3, 0xf3,
 					 0xf7, 0xf7, 0xf7, 0xf7 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff0, 0xfff0, 0xfff2, 0xfff2,
 					 0xfff4, 0xfff4, 0xfff6, 0xfff6 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1800001, 0xc1700001,
 					   0xc1600001, 0xc1500001 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcls.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcls.c
index 1d56c126f2205df765acd9462443daf746514507..f5d3978167fc9001fa7eecc851d761bce27c1b00 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcls.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcls.c
@@ -15,6 +15,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0x3333333333333333 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x33333333, 0x33333333 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0x7, 0x7, 0x7, 0x7,
 					0x7, 0x7, 0x7, 0x7,
@@ -41,6 +42,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					 0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 					 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 					   0x33333333, 0x33333333 };
 
@@ -60,6 +63,8 @@ VECT_VAR_DECL(expected_with_negative,poly,8,8) [] = { 0x33, 0x33, 0x33, 0x33,
 						      0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected_with_negative,poly,16,4) [] = { 0x3333, 0x3333,
 						       0x3333, 0x3333 };
+VECT_VAR_DECL(expected_with_negative,hfloat,16,4) [] = { 0x3333, 0x3333,
+						         0x3333, 0x3333 };
 VECT_VAR_DECL(expected_with_negative,hfloat,32,2) [] = { 0x33333333,
 							 0x33333333 };
 VECT_VAR_DECL(expected_with_negative,int,8,16) [] = { 0x0, 0x0, 0x0, 0x0,
@@ -91,6 +96,10 @@ VECT_VAR_DECL(expected_with_negative,poly,16,8) [] = { 0x3333, 0x3333,
 						       0x3333, 0x3333,
 						       0x3333, 0x3333,
 						       0x3333, 0x3333 };
+VECT_VAR_DECL(expected_with_negative,hfloat,16,8) [] = { 0x3333, 0x3333,
+						         0x3333, 0x3333,
+						         0x3333, 0x3333,
+						         0x3333, 0x3333 };
 VECT_VAR_DECL(expected_with_negative,hfloat,32,4) [] = { 0x33333333,
 							 0x33333333,
 							 0x33333333,
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vclz.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vclz.c
index ad28d2dd8219756a0202aff570eccc3cacc16003..0c496570da7126ebdc7740ac18b06ac7db36da56 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vclz.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vclz.c
@@ -14,6 +14,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0x3333333333333333 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x33333333, 0x33333333 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
 					0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2 };
@@ -34,6 +35,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					 0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 					 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 					   0x33333333, 0x33333333 };
 
@@ -52,6 +55,7 @@ VECT_VAR_DECL(expected_with_0,uint,64,1) [] = { 0x3333333333333333 };
 VECT_VAR_DECL(expected_with_0,poly,8,8) [] = { 0x33, 0x33, 0x33, 0x33,
 					       0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected_with_0,poly,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected_with_0,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected_with_0,hfloat,32,2) [] = { 0x33333333, 0x33333333 };
 VECT_VAR_DECL(expected_with_0,int,8,16) [] = { 0x8, 0x8, 0x8, 0x8,
 					       0x8, 0x8, 0x8, 0x8,
@@ -77,6 +81,8 @@ VECT_VAR_DECL(expected_with_0,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 						0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected_with_0,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 						0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected_with_0,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+						  0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected_with_0,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 						  0x33333333, 0x33333333 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcnt.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcnt.c
index 4d9f0ced9ac5523ef1a327a417df999302a4433b..69750ed70aab792f8a638b8499cc95ece755bce0 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcnt.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcnt.c
@@ -13,6 +13,7 @@ VECT_VAR_DECL(expected,uint,32,2) [] = { 0x33333333, 0x33333333 };
 VECT_VAR_DECL(expected,uint,64,1) [] = { 0x3333333333333333 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x33333333, 0x33333333 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
 					0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 };
@@ -33,6 +34,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6,
 					 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 					 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 					   0x33333333, 0x33333333 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcombine.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcombine.c
index 6d5a854937d7fea5c8a587c2b8c6f1aba6b3faa9..d3602907357ebf390653c979754078c8ac6058f8 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcombine.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcombine.c
@@ -16,6 +16,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0x3333333333333333 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x33333333, 0x33333333 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					0xf4, 0xf5, 0xf6, 0xf7,
@@ -41,7 +42,9 @@ VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
 					 0x66, 0x66, 0x66, 0x66 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80,
+					   0x4080, 0x4080, 0x4080, 0x4080 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
 					   0x40533333, 0x40533333 };
 
 #define TEST_MSG "VCOMBINE"
 void exec_vcombine (void)
 {
@@ -58,6 +60,7 @@ void exec_vcombine (void)
 
   /* Initialize input "vector64_a" from "buffer".  */
   TEST_MACRO_64BITS_VARIANTS_2_5(VLOAD, vector64_a, buffer);
+  VLOAD(vector64_a, buffer, , float, f, 16, 4);
   VLOAD(vector64_a, buffer, , float, f, 32, 2);
 
   /* Choose init value arbitrarily.  */
@@ -71,8 +74,9 @@ void exec_vcombine (void)
   VDUP(vector64_b, , uint, u, 64, 1, 0x88);
   VDUP(vector64_b, , poly, p, 8, 8, 0x55);
   VDUP(vector64_b, , poly, p, 16, 4, 0x66);
+  VDUP(vector64_b, , float, f, 16, 4, 2.25);
   VDUP(vector64_b, , float, f, 32, 2, 3.3f);
 
   clean_results ();
 
   /* Execute the tests.  */
@@ -86,6 +89,7 @@ void exec_vcombine (void)
   TEST_VCOMBINE(uint, u, 64, 1, 2);
   TEST_VCOMBINE(poly, p, 8, 8, 16);
   TEST_VCOMBINE(poly, p, 16, 4, 8);
+  TEST_VCOMBINE(float, f, 16, 4, 8);
   TEST_VCOMBINE(float, f, 32, 2, 4);
 
   CHECK_RESULTS (TEST_MSG, "");
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcreate.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcreate.c
index 686358df455cf7f4bdd3431202146eaf56a766b3..17e84b460af9c5c6b2a6ba20b1f82fca1f134ceb 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcreate.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcreate.c
@@ -16,6 +16,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0x123456789abcdef0 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf0, 0xde, 0xbc, 0x9a,
 					0x78, 0x56, 0x34, 0x12 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0xdef0, 0x9abc, 0x5678, 0x1234 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0xdef0, 0x9abc, 0x5678, 0x1234 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x9abcdef0, 0x12345678 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33,
@@ -43,6 +44,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					 0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 					 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 					   0x33333333, 0x33333333 };
 
@@ -66,6 +69,7 @@ FNNAME (INSN_NAME)
   DECL_VAL(val, int, 16, 4);
   DECL_VAL(val, int, 32, 2);
   DECL_VAL(val, int, 64, 1);
+  DECL_VAL(val, float, 16, 4);
   DECL_VAL(val, float, 32, 2);
   DECL_VAL(val, uint, 8, 8);
   DECL_VAL(val, uint, 16, 4);
@@ -78,6 +82,7 @@ FNNAME (INSN_NAME)
   DECL_VARIABLE(vector_res, int, 16, 4);
   DECL_VARIABLE(vector_res, int, 32, 2);
   DECL_VARIABLE(vector_res, int, 64, 1);
+  DECL_VARIABLE(vector_res, float, 16, 4);
   DECL_VARIABLE(vector_res, float, 32, 2);
   DECL_VARIABLE(vector_res, uint, 8, 8);
   DECL_VARIABLE(vector_res, uint, 16, 4);
@@ -93,6 +98,7 @@ FNNAME (INSN_NAME)
   VECT_VAR(val, int, 16, 4) = 0x123456789abcdef0LL;
   VECT_VAR(val, int, 32, 2) = 0x123456789abcdef0LL;
   VECT_VAR(val, int, 64, 1) = 0x123456789abcdef0LL;
+  VECT_VAR(val, float, 16, 4) = 0x123456789abcdef0LL;
   VECT_VAR(val, float, 32, 2) = 0x123456789abcdef0LL;
   VECT_VAR(val, uint, 8, 8) = 0x123456789abcdef0ULL;
   VECT_VAR(val, uint, 16, 4) = 0x123456789abcdef0ULL;
@@ -104,6 +110,7 @@ FNNAME (INSN_NAME)
   TEST_VCREATE(int, s, 8, 8);
   TEST_VCREATE(int, s, 16, 4);
   TEST_VCREATE(int, s, 32, 2);
+  TEST_VCREATE(float, f, 16, 4);
   TEST_VCREATE(float, f, 32, 2);
   TEST_VCREATE(int, s, 64, 1);
   TEST_VCREATE(uint, u, 8, 8);
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c
index b5132f41ac424fa1c9c59805efb4a43f2c3dab12..90a7e452489fda14354217266d6cda7165d279f2 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c
@@ -19,6 +19,7 @@ VECT_VAR_DECL(expected0,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected0,poly,8,8) [] = { 0xf0, 0xf0, 0xf0, 0xf0,
 					 0xf0, 0xf0, 0xf0, 0xf0 };
 VECT_VAR_DECL(expected0,poly,16,4) [] = { 0xfff0, 0xfff0, 0xfff0, 0xfff0 };
+VECT_VAR_DECL(expected0,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected0,hfloat,32,2) [] = { 0xc1800000, 0xc1800000 };
 VECT_VAR_DECL(expected0,int,8,16) [] = { 0xf0, 0xf0, 0xf0, 0xf0,
 					 0xf0, 0xf0, 0xf0, 0xf0,
@@ -46,6 +47,8 @@ VECT_VAR_DECL(expected0,poly,8,16) [] = { 0xf0, 0xf0, 0xf0, 0xf0,
 					  0xf0, 0xf0, 0xf0, 0xf0 };
 VECT_VAR_DECL(expected0,poly,16,8) [] = { 0xfff0, 0xfff0, 0xfff0, 0xfff0,
 					  0xfff0, 0xfff0, 0xfff0, 0xfff0 };
+VECT_VAR_DECL(expected0,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					    0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected0,hfloat,32,4) [] = { 0xc1800000, 0xc1800000,
 					    0xc1800000, 0xc1800000 };
 
@@ -63,6 +66,7 @@ VECT_VAR_DECL(expected1,uint,64,1) [] = { 0xfffffffffffffff1 };
 VECT_VAR_DECL(expected1,poly,8,8) [] = { 0xf1, 0xf1, 0xf1, 0xf1,
 					 0xf1, 0xf1, 0xf1, 0xf1 };
 VECT_VAR_DECL(expected1,poly,16,4) [] = { 0xfff1, 0xfff1, 0xfff1, 0xfff1 };
+VECT_VAR_DECL(expected1,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected1,hfloat,32,2) [] = { 0xc1700000, 0xc1700000 };
 VECT_VAR_DECL(expected1,int,8,16) [] = { 0xf1, 0xf1, 0xf1, 0xf1,
 					 0xf1, 0xf1, 0xf1, 0xf1,
@@ -90,6 +94,8 @@ VECT_VAR_DECL(expected1,poly,8,16) [] = { 0xf1, 0xf1, 0xf1, 0xf1,
 					  0xf1, 0xf1, 0xf1, 0xf1 };
 VECT_VAR_DECL(expected1,poly,16,8) [] = { 0xfff1, 0xfff1, 0xfff1, 0xfff1,
 					  0xfff1, 0xfff1, 0xfff1, 0xfff1 };
+VECT_VAR_DECL(expected1,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					    0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected1,hfloat,32,4) [] = { 0xc1700000, 0xc1700000,
 					    0xc1700000, 0xc1700000 };
 
@@ -107,6 +113,7 @@ VECT_VAR_DECL(expected2,uint,64,1) [] = { 0xfffffffffffffff2 };
 VECT_VAR_DECL(expected2,poly,8,8) [] = { 0xf2, 0xf2, 0xf2, 0xf2,
 					 0xf2, 0xf2, 0xf2, 0xf2 };
 VECT_VAR_DECL(expected2,poly,16,4) [] = { 0xfff2, 0xfff2, 0xfff2, 0xfff2 };
+VECT_VAR_DECL(expected2,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected2,hfloat,32,2) [] = { 0xc1600000, 0xc1600000 };
 VECT_VAR_DECL(expected2,int,8,16) [] = { 0xf2, 0xf2, 0xf2, 0xf2,
 					 0xf2, 0xf2, 0xf2, 0xf2,
@@ -134,6 +141,8 @@ VECT_VAR_DECL(expected2,poly,8,16) [] = { 0xf2, 0xf2, 0xf2, 0xf2,
 					  0xf2, 0xf2, 0xf2, 0xf2 };
 VECT_VAR_DECL(expected2,poly,16,8) [] = { 0xfff2, 0xfff2, 0xfff2, 0xfff2,
 					  0xfff2, 0xfff2, 0xfff2, 0xfff2 };
+VECT_VAR_DECL(expected2,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					    0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected2,hfloat,32,4) [] = { 0xc1600000, 0xc1600000,
 					    0xc1600000, 0xc1600000 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup_lane.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup_lane.c
index c1ff6dd3007100966a488a33bc714802b987a084..a74d550a1cb1d37ca0d4981f8ec0fd130eaa4b10 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup_lane.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup_lane.c
@@ -16,6 +16,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf7, 0xf7, 0xf7, 0xf7,
 					0xf7, 0xf7, 0xf7, 0xf7 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff3, 0xfff3, 0xfff3, 0xfff3 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1700000, 0xc1700000 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0xf2, 0xf2, 0xf2, 0xf2,
 					0xf2, 0xf2, 0xf2, 0xf2,
@@ -43,6 +44,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0xf5, 0xf5, 0xf5, 0xf5,
 					 0xf5, 0xf5, 0xf5, 0xf5 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff1, 0xfff1, 0xfff1, 0xfff1,
 					 0xfff1, 0xfff1, 0xfff1, 0xfff1 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1700000, 0xc1700000,
 					   0xc1700000, 0xc1700000 };
 
@@ -63,6 +66,7 @@ void exec_vdup_lane (void)
   clean_results ();
 
   TEST_MACRO_64BITS_VARIANTS_2_5(VLOAD, vector, buffer);
+  VLOAD(vector, buffer, , float, f, 16, 4);
   VLOAD(vector, buffer, , float, f, 32, 2);
 
   /* Choose lane arbitrarily.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/veor.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/veor.c
index 474b225c7087a73ff771421ae1cced1bb556940d..f6127fa1e28c10f25e59114bc63a8137467c5bf3 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/veor.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/veor.c
@@ -17,6 +17,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff2 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x33333333, 0x33333333 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0x6, 0x7, 0x4, 0x5,
 					0x2, 0x3, 0x0, 0x1,
@@ -43,5 +44,7 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					 0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 					 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 					   0x33333333, 0x33333333 };
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vext.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vext.c
index 0b014ebda87e1486ea6005d597f6dc30b5dfc470..6dcc796111bc66678086d54c910b0036ffd3d7a5 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vext.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vext.c
@@ -16,6 +16,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf6, 0xf7, 0x55, 0x55,
 					0x55, 0x55, 0x55, 0x55 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff2, 0xfff3, 0x66, 0x66 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1700000, 0x42066666 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0xfe, 0xff, 0x11, 0x11,
 					0x11, 0x11, 0x11, 0x11,
@@ -39,6 +40,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0xfc, 0xfd, 0xfe, 0xff,
 					 0x55, 0x55, 0x55, 0x55 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff6, 0xfff7, 0x66, 0x66,
 					 0x66, 0x66, 0x66, 0x66 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1500000, 0x4204cccd,
 					   0x4204cccd, 0x4204cccd };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_high.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_high.c
index 69ad90b7e2e9acab0cb0723c02d94a108fe7a518..19482af7fb0f806336f6828d98a9393b2ebbf174 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_high.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_high.c
@@ -16,6 +16,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff1 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf8, 0xf9, 0xfa, 0xfb,
 					0xfc, 0xfd, 0xfe, 0xff };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff4, 0xfff5, 0xfff6, 0xfff7 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0xca00, 0xc980, 0xc900, 0xc880 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33,
@@ -43,6 +44,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					 0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 					 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 					   0x33333333, 0x33333333 };
 
@@ -59,6 +62,7 @@ void exec_vget_high (void)
   DECL_VARIABLE_128BITS_VARIANTS(vector128);
 
   TEST_MACRO_128BITS_VARIANTS_2_5(VLOAD, vector128, buffer);
+  VLOAD(vector128, buffer, q, float, f, 16, 8);
   VLOAD(vector128, buffer, q, float, f, 32, 4);
 
   clean_results ();
@@ -74,6 +78,7 @@ void exec_vget_high (void)
   TEST_VGET_HIGH(uint, u, 64, 1, 2);
   TEST_VGET_HIGH(poly, p, 8, 8, 16);
   TEST_VGET_HIGH(poly, p, 16, 4, 8);
+  TEST_VGET_HIGH(float, f, 16, 4, 8);
   TEST_VGET_HIGH(float, f, 32, 2, 4);
 
   CHECK_RESULTS (TEST_MSG, "");
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_low.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_low.c
index db4a0019506107155614ec317b6a9265ef5c6a92..d8dc5ae23d97d1f47d39bedd82878e863ed22444 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_low.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_low.c
@@ -16,6 +16,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					0xf4, 0xf5, 0xf6, 0xf7 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33,
@@ -43,6 +44,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					 0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 					 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 					   0x33333333, 0x33333333 };
 
@@ -59,6 +62,7 @@ void exec_vget_low (void)
   DECL_VARIABLE_128BITS_VARIANTS(vector128);
 
   TEST_MACRO_128BITS_VARIANTS_2_5(VLOAD, vector128, buffer);
+  VLOAD(vector128, buffer, q, float, f, 16, 8);
   VLOAD(vector128, buffer, q, float, f, 32, 4);
 
   clean_results ();
@@ -74,6 +78,7 @@ void exec_vget_low (void)
   TEST_VGET_LOW(uint, u, 64, 1, 2);
   TEST_VGET_LOW(poly, p, 8, 8, 16);
   TEST_VGET_LOW(poly, p, 16, 4, 8);
+  TEST_VGET_LOW(float, f, 16, 4, 8);
   TEST_VGET_LOW(float, f, 32, 2, 4);
 
   CHECK_RESULTS (TEST_MSG, "");
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1.c
index ced9d736d6d22cbd93352c1972fd9bab81fa747e..68641b0e3ef5a5586c2513fd3f96227eb383a3f3 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1.c
@@ -16,6 +16,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					0xf4, 0xf5, 0xf6, 0xf7 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					0xf4, 0xf5, 0xf6, 0xf7,
@@ -44,6 +45,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					 0xfc, 0xfd, 0xfe, 0xff };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
 					 0xfff4, 0xfff5, 0xfff6, 0xfff7 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80,
+					   0xca00, 0xc980, 0xc900, 0xc880 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
 					   0xc1600000, 0xc1500000 };
 
@@ -62,7 +65,9 @@ void exec_vld1 (void)
 
   TEST_MACRO_ALL_VARIANTS_2_5(TEST_VLD1, vector, buffer);
 
+  TEST_VLD1(vector, buffer, , float, f, 16, 4);
   TEST_VLD1(vector, buffer, , float, f, 32, 2);
+  TEST_VLD1(vector, buffer, q, float, f, 16, 8);
   TEST_VLD1(vector, buffer, q, float, f, 32, 4);
 
   CHECK_RESULTS (TEST_MSG, "");
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1_dup.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1_dup.c
index 0e052743926ca7cdc3108a4e6ef96b54e815de1d..22c512bcf6a690e07eba94a6d7c204ea6f9bf489 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1_dup.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1_dup.c
@@ -17,6 +17,7 @@ VECT_VAR_DECL(expected0,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected0,poly,8,8) [] = { 0xf0, 0xf0, 0xf0, 0xf0,
 					 0xf0, 0xf0, 0xf0, 0xf0 };
 VECT_VAR_DECL(expected0,poly,16,4) [] = { 0xfff0, 0xfff0, 0xfff0, 0xfff0 };
+VECT_VAR_DECL(expected0,hfloat,16,4) [] = { 0xcc00, 0xcc00, 0xcc00, 0xcc00 };
 VECT_VAR_DECL(expected0,hfloat,32,2) [] = { 0xc1800000, 0xc1800000 };
 VECT_VAR_DECL(expected0,int,8,16) [] = { 0xf0, 0xf0, 0xf0, 0xf0,
 					 0xf0, 0xf0, 0xf0, 0xf0,
@@ -44,6 +45,8 @@ VECT_VAR_DECL(expected0,poly,8,16) [] = { 0xf0, 0xf0, 0xf0, 0xf0,
 					  0xf0, 0xf0, 0xf0, 0xf0 };
 VECT_VAR_DECL(expected0,poly,16,8) [] = { 0xfff0, 0xfff0, 0xfff0, 0xfff0,
 					  0xfff0, 0xfff0, 0xfff0, 0xfff0 };
+VECT_VAR_DECL(expected0,hfloat,16,8) [] = { 0xcc00, 0xcc00, 0xcc00, 0xcc00,
+					    0xcc00, 0xcc00, 0xcc00, 0xcc00 };
 VECT_VAR_DECL(expected0,hfloat,32,4) [] = { 0xc1800000, 0xc1800000,
 					    0xc1800000, 0xc1800000 };
 
@@ -61,6 +64,7 @@ VECT_VAR_DECL(expected1,uint,64,1) [] = { 0xfffffffffffffff1 };
 VECT_VAR_DECL(expected1,poly,8,8) [] = { 0xf1, 0xf1, 0xf1, 0xf1,
 					 0xf1, 0xf1, 0xf1, 0xf1 };
 VECT_VAR_DECL(expected1,poly,16,4) [] = { 0xfff1, 0xfff1, 0xfff1, 0xfff1 };
+VECT_VAR_DECL(expected1,hfloat,16,4) [] = { 0xcb80, 0xcb80, 0xcb80, 0xcb80 };
 VECT_VAR_DECL(expected1,hfloat,32,2) [] = { 0xc1700000, 0xc1700000 };
 VECT_VAR_DECL(expected1,int,8,16) [] = { 0xf1, 0xf1, 0xf1, 0xf1,
 					 0xf1, 0xf1, 0xf1, 0xf1,
@@ -88,6 +92,8 @@ VECT_VAR_DECL(expected1,poly,8,16) [] = { 0xf1, 0xf1, 0xf1, 0xf1,
 					  0xf1, 0xf1, 0xf1, 0xf1 };
 VECT_VAR_DECL(expected1,poly,16,8) [] = { 0xfff1, 0xfff1, 0xfff1, 0xfff1,
 					  0xfff1, 0xfff1, 0xfff1, 0xfff1 };
+VECT_VAR_DECL(expected1,hfloat,16,8) [] = { 0xcb80, 0xcb80, 0xcb80, 0xcb80,
+					    0xcb80, 0xcb80, 0xcb80, 0xcb80 };
 VECT_VAR_DECL(expected1,hfloat,32,4) [] = { 0xc1700000, 0xc1700000,
 					    0xc1700000, 0xc1700000 };
 
@@ -105,6 +111,7 @@ VECT_VAR_DECL(expected2,uint,64,1) [] = { 0xfffffffffffffff2 };
 VECT_VAR_DECL(expected2,poly,8,8) [] = { 0xf2, 0xf2, 0xf2, 0xf2,
 					 0xf2, 0xf2, 0xf2, 0xf2 };
 VECT_VAR_DECL(expected2,poly,16,4) [] = { 0xfff2, 0xfff2, 0xfff2, 0xfff2 };
+VECT_VAR_DECL(expected2,hfloat,16,4) [] = { 0xcb00, 0xcb00, 0xcb00, 0xcb00 };
 VECT_VAR_DECL(expected2,hfloat,32,2) [] = { 0xc1600000, 0xc1600000 };
 VECT_VAR_DECL(expected2,int,8,16) [] = { 0xf2, 0xf2, 0xf2, 0xf2,
 					 0xf2, 0xf2, 0xf2, 0xf2,
@@ -132,6 +139,8 @@ VECT_VAR_DECL(expected2,poly,8,16) [] = { 0xf2, 0xf2, 0xf2, 0xf2,
 					  0xf2, 0xf2, 0xf2, 0xf2 };
 VECT_VAR_DECL(expected2,poly,16,8) [] = { 0xfff2, 0xfff2, 0xfff2, 0xfff2,
 					  0xfff2, 0xfff2, 0xfff2, 0xfff2 };
+VECT_VAR_DECL(expected2,hfloat,16,8) [] = { 0xcb00, 0xcb00, 0xcb00, 0xcb00,
+					    0xcb00, 0xcb00, 0xcb00, 0xcb00 };
 VECT_VAR_DECL(expected2,hfloat,32,4) [] = { 0xc1600000, 0xc1600000,
 					    0xc1600000, 0xc1600000 };
 
@@ -154,9 +163,11 @@ void exec_vld1_dup (void)
 
     TEST_MACRO_ALL_VARIANTS_2_5(TEST_VLD1_DUP, vector, buffer_dup);
 
+    TEST_VLD1_DUP(vector, buffer_dup, , float, f, 16, 4);
     TEST_VLD1_DUP(vector, buffer_dup, , float, f, 32, 2);
+    TEST_VLD1_DUP(vector, buffer_dup, q, float, f, 16, 8);
     TEST_VLD1_DUP(vector, buffer_dup, q, float, f, 32, 4);
 
     switch (i) {
     case 0:
       CHECK_RESULTS_NAMED (TEST_MSG, expected0, "");
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1_lane.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1_lane.c
index d5c5d22a8ce80cd7e4e6e46750b6c5297845f366..3b521f7c34296d055eff89060bf3135361ad10a5 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1_lane.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1_lane.c
@@ -16,6 +16,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
 					0xaa, 0xaa, 0xaa, 0xf0 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xfff0 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xcc00, 0xaaaa };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xaaaaaaaa, 0xc1800000 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
 					0xaa, 0xaa, 0xaa, 0xaa,
@@ -43,6 +44,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
 					 0xf0, 0xaa, 0xaa, 0xaa };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
 					 0xaaaa, 0xaaaa, 0xfff0, 0xaaaa };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
+					   0xaaaa, 0xcc00, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
 					   0xc1800000, 0xaaaaaaaa };
 
@@ -72,6 +75,7 @@ void exec_vld1_lane (void)
   ARRAY(buffer_src, uint, 64, 1);
   ARRAY(buffer_src, poly, 8, 8);
   ARRAY(buffer_src, poly, 16, 4);
+  ARRAY(buffer_src, float, 16, 4);
   ARRAY(buffer_src, float, 32, 2);
 
   ARRAY(buffer_src, int, 8, 16);
@@ -84,6 +88,7 @@ void exec_vld1_lane (void)
   ARRAY(buffer_src, uint, 64, 2);
   ARRAY(buffer_src, poly, 8, 16);
   ARRAY(buffer_src, poly, 16, 8);
+  ARRAY(buffer_src, float, 16, 8);
   ARRAY(buffer_src, float, 32, 4);
 
   clean_results ();
@@ -99,6 +104,7 @@ void exec_vld1_lane (void)
   TEST_VLD1_LANE(, uint, u, 64, 1, 0);
   TEST_VLD1_LANE(, poly, p, 8, 8, 7);
   TEST_VLD1_LANE(, poly, p, 16, 4, 3);
+  TEST_VLD1_LANE(, float, f, 16, 4, 2);
   TEST_VLD1_LANE(, float, f, 32, 2, 1);
 
   TEST_VLD1_LANE(q, int, s, 8, 16, 15);
@@ -111,6 +117,7 @@ void exec_vld1_lane (void)
   TEST_VLD1_LANE(q, uint, u, 64, 2, 0);
   TEST_VLD1_LANE(q, poly, p, 8, 16, 12);
   TEST_VLD1_LANE(q, poly, p, 16, 8, 6);
+  TEST_VLD1_LANE(q, float, f, 16, 8, 5);
   TEST_VLD1_LANE(q, float, f, 32, 4, 2);
 
   CHECK_RESULTS (TEST_MSG, "");
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX.c
index fe00640fd7646a350cdb67148cdee55dc16b2078..9e0bb87ae524207d6133bb2dd3dc7dcf49074e79 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX.c
@@ -18,6 +18,7 @@ VECT_VAR_DECL(expected_vld2_0,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected_vld2_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					       0xf4, 0xf5, 0xf6, 0xf7 };
 VECT_VAR_DECL(expected_vld2_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+VECT_VAR_DECL(expected_vld2_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 VECT_VAR_DECL(expected_vld2_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected_vld2_0,int,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					       0xf4, 0xf5, 0xf6, 0xf7,
@@ -45,6 +46,8 @@ VECT_VAR_DECL(expected_vld2_0,poly,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 						0xfc, 0xfd, 0xfe, 0xff };
 VECT_VAR_DECL(expected_vld2_0,poly,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
 						0xfff4, 0xfff5, 0xfff6, 0xfff7 };
+VECT_VAR_DECL(expected_vld2_0,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80,
+						  0xca00, 0xc980, 0xc900, 0xc880 };
 VECT_VAR_DECL(expected_vld2_0,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
 						  0xc1600000, 0xc1500000 };
 
@@ -62,6 +65,7 @@ VECT_VAR_DECL(expected_vld2_1,uint,64,1) [] = { 0xfffffffffffffff1 };
 VECT_VAR_DECL(expected_vld2_1,poly,8,8) [] = { 0xf8, 0xf9, 0xfa, 0xfb,
 					       0xfc, 0xfd, 0xfe, 0xff };
 VECT_VAR_DECL(expected_vld2_1,poly,16,4) [] = { 0xfff4, 0xfff5, 0xfff6, 0xfff7 };
+VECT_VAR_DECL(expected_vld2_1,hfloat,16,4) [] = { 0xca00, 0xc980, 0xc900, 0xc880 };
 VECT_VAR_DECL(expected_vld2_1,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 };
 VECT_VAR_DECL(expected_vld2_1,int,8,16) [] = { 0x0, 0x1, 0x2, 0x3,
 					       0x4, 0x5, 0x6, 0x7,
@@ -89,6 +93,8 @@ VECT_VAR_DECL(expected_vld2_1,poly,8,16) [] = { 0x0, 0x1, 0x2, 0x3,
 						0xc, 0xd, 0xe, 0xf };
 VECT_VAR_DECL(expected_vld2_1,poly,16,8) [] = { 0xfff8, 0xfff9, 0xfffa, 0xfffb,
 						0xfffc, 0xfffd, 0xfffe, 0xffff };
+VECT_VAR_DECL(expected_vld2_1,hfloat,16,8) [] = { 0xc800, 0xc700, 0xc600, 0xc500,
+						  0xc400, 0xc200, 0xc000, 0xbc00 };
 VECT_VAR_DECL(expected_vld2_1,hfloat,32,4) [] = { 0xc1400000, 0xc1300000,
 						  0xc1200000, 0xc1100000 };
 
@@ -106,6 +112,7 @@ VECT_VAR_DECL(expected_vld3_0,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected_vld3_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					       0xf4, 0xf5, 0xf6, 0xf7 };
 VECT_VAR_DECL(expected_vld3_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+VECT_VAR_DECL(expected_vld3_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 VECT_VAR_DECL(expected_vld3_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected_vld3_0,int,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					       0xf4, 0xf5, 0xf6, 0xf7,
@@ -133,6 +140,8 @@ VECT_VAR_DECL(expected_vld3_0,poly,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 						0xfc, 0xfd, 0xfe, 0xff };
 VECT_VAR_DECL(expected_vld3_0,poly,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
 						0xfff4, 0xfff5, 0xfff6, 0xfff7 };
+VECT_VAR_DECL(expected_vld3_0,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80,
+						  0xca00, 0xc980, 0xc900, 0xc880 };
 VECT_VAR_DECL(expected_vld3_0,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
 						  0xc1600000, 0xc1500000 };
 
@@ -150,6 +159,7 @@ VECT_VAR_DECL(expected_vld3_1,uint,64,1) [] = { 0xfffffffffffffff1 };
 VECT_VAR_DECL(expected_vld3_1,poly,8,8) [] = { 0xf8, 0xf9, 0xfa, 0xfb,
 					       0xfc, 0xfd, 0xfe, 0xff };
 VECT_VAR_DECL(expected_vld3_1,poly,16,4) [] = { 0xfff4, 0xfff5, 0xfff6, 0xfff7 };
+VECT_VAR_DECL(expected_vld3_1,hfloat,16,4) [] = { 0xca00, 0xc980, 0xc900, 0xc880 };
 VECT_VAR_DECL(expected_vld3_1,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 };
 VECT_VAR_DECL(expected_vld3_1,int,8,16) [] = { 0x0, 0x1, 0x2, 0x3,
 					       0x4, 0x5, 0x6, 0x7,
@@ -177,6 +187,8 @@ VECT_VAR_DECL(expected_vld3_1,poly,8,16) [] = { 0x0, 0x1, 0x2, 0x3,
 						0xc, 0xd, 0xe, 0xf };
 VECT_VAR_DECL(expected_vld3_1,poly,16,8) [] = { 0xfff8, 0xfff9, 0xfffa, 0xfffb,
 						0xfffc, 0xfffd, 0xfffe, 0xffff };
+VECT_VAR_DECL(expected_vld3_1,hfloat,16,8) [] = { 0xc800, 0xc700, 0xc600, 0xc500,
+						  0xc400, 0xc200, 0xc000, 0xbc00 };
 VECT_VAR_DECL(expected_vld3_1,hfloat,32,4) [] = { 0xc1400000, 0xc1300000,
 						  0xc1200000, 0xc1100000 };
 
@@ -197,6 +209,7 @@ VECT_VAR_DECL(expected_vld3_2,poly,8,8) [] = { 0x0, 0x1, 0x2, 0x3,
 					       0x4, 0x5, 0x6, 0x7 };
 VECT_VAR_DECL(expected_vld3_2,poly,16,4) [] = { 0xfff8, 0xfff9,
 						0xfffa, 0xfffb };
+VECT_VAR_DECL(expected_vld3_2,hfloat,16,4) [] = { 0xc800, 0xc700, 0xc600, 0xc500 };
 VECT_VAR_DECL(expected_vld3_2,hfloat,32,2) [] = { 0xc1400000, 0xc1300000 };
 VECT_VAR_DECL(expected_vld3_2,int,8,16) [] = { 0x10, 0x11, 0x12, 0x13,
 					       0x14, 0x15, 0x16, 0x17,
@@ -224,6 +237,8 @@ VECT_VAR_DECL(expected_vld3_2,poly,8,16) [] = { 0x10, 0x11, 0x12, 0x13,
 						0x1c, 0x1d, 0x1e, 0x1f };
 VECT_VAR_DECL(expected_vld3_2,poly,16,8) [] = { 0x0, 0x1, 0x2, 0x3,
 						0x4, 0x5, 0x6, 0x7 };
+VECT_VAR_DECL(expected_vld3_2,hfloat,16,8) [] = { 0x0000, 0x3c00, 0x4000, 0x4200,
+						  0x4400, 0x4500, 0x4600, 0x4700 };
 VECT_VAR_DECL(expected_vld3_2,hfloat,32,4) [] = { 0xc1000000, 0xc0e00000,
 						  0xc0c00000, 0xc0a00000 };
 
@@ -243,6 +258,7 @@ VECT_VAR_DECL(expected_vld4_0,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected_vld4_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					       0xf4, 0xf5, 0xf6, 0xf7 };
 VECT_VAR_DECL(expected_vld4_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+VECT_VAR_DECL(expected_vld4_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 VECT_VAR_DECL(expected_vld4_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected_vld4_0,int,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					       0xf4, 0xf5, 0xf6, 0xf7,
@@ -270,6 +286,8 @@ VECT_VAR_DECL(expected_vld4_0,poly,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 						0xfc, 0xfd, 0xfe, 0xff };
 VECT_VAR_DECL(expected_vld4_0,poly,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
 						0xfff4, 0xfff5, 0xfff6, 0xfff7 };
+VECT_VAR_DECL(expected_vld4_0,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80,
+						  0xca00, 0xc980, 0xc900, 0xc880 };
 VECT_VAR_DECL(expected_vld4_0,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
 						  0xc1600000, 0xc1500000 };
 
@@ -287,6 +305,7 @@ VECT_VAR_DECL(expected_vld4_1,uint,64,1) [] = { 0xfffffffffffffff1 };
 VECT_VAR_DECL(expected_vld4_1,poly,8,8) [] = { 0xf8, 0xf9, 0xfa, 0xfb,
 					       0xfc, 0xfd, 0xfe, 0xff };
 VECT_VAR_DECL(expected_vld4_1,poly,16,4) [] = { 0xfff4, 0xfff5, 0xfff6, 0xfff7 };
+VECT_VAR_DECL(expected_vld4_1,hfloat,16,4) [] = { 0xca00, 0xc980, 0xc900, 0xc880 };
 VECT_VAR_DECL(expected_vld4_1,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 };
 VECT_VAR_DECL(expected_vld4_1,int,8,16) [] = { 0x0, 0x1, 0x2, 0x3,
 					       0x4, 0x5, 0x6, 0x7,
@@ -314,6 +333,8 @@ VECT_VAR_DECL(expected_vld4_1,poly,8,16) [] = { 0x0, 0x1, 0x2, 0x3,
 						0xc, 0xd, 0xe, 0xf };
 VECT_VAR_DECL(expected_vld4_1,poly,16,8) [] = { 0xfff8, 0xfff9, 0xfffa, 0xfffb,
 						0xfffc, 0xfffd, 0xfffe, 0xffff };
+VECT_VAR_DECL(expected_vld4_1,hfloat,16,8) [] = { 0xc800, 0xc700, 0xc600, 0xc500,
+						  0xc400, 0xc200, 0xc000, 0xbc00 };
 VECT_VAR_DECL(expected_vld4_1,hfloat,32,4) [] = { 0xc1400000, 0xc1300000,
 						  0xc1200000, 0xc1100000 };
 
@@ -331,6 +352,7 @@ VECT_VAR_DECL(expected_vld4_2,uint,64,1) [] = { 0xfffffffffffffff2 };
 VECT_VAR_DECL(expected_vld4_2,poly,8,8) [] = { 0x0, 0x1, 0x2, 0x3,
 					       0x4, 0x5, 0x6, 0x7 };
 VECT_VAR_DECL(expected_vld4_2,poly,16,4) [] = { 0xfff8, 0xfff9, 0xfffa, 0xfffb };
+VECT_VAR_DECL(expected_vld4_2,hfloat,16,4) [] = { 0xc800, 0xc700, 0xc600, 0xc500 };
 VECT_VAR_DECL(expected_vld4_2,hfloat,32,2) [] = { 0xc1400000, 0xc1300000 };
 VECT_VAR_DECL(expected_vld4_2,int,8,16) [] = { 0x10, 0x11, 0x12, 0x13,
 					       0x14, 0x15, 0x16, 0x17,
@@ -358,6 +380,8 @@ VECT_VAR_DECL(expected_vld4_2,poly,8,16) [] = { 0x10, 0x11, 0x12, 0x13,
 						0x1c, 0x1d, 0x1e, 0x1f };
 VECT_VAR_DECL(expected_vld4_2,poly,16,8) [] = { 0x0, 0x1, 0x2, 0x3,
 						0x4, 0x5, 0x6, 0x7 };
+VECT_VAR_DECL(expected_vld4_2,hfloat,16,8) [] = { 0x0000, 0x3c00, 0x4000, 0x4200,
+						  0x4400, 0x4500, 0x4600, 0x4700 };
 VECT_VAR_DECL(expected_vld4_2,hfloat,32,4) [] = { 0xc1000000, 0xc0e00000,
 						  0xc0c00000, 0xc0a00000 };
 
@@ -375,6 +399,7 @@ VECT_VAR_DECL(expected_vld4_3,uint,64,1) [] = { 0xfffffffffffffff3 };
 VECT_VAR_DECL(expected_vld4_3,poly,8,8) [] = { 0x8, 0x9, 0xa, 0xb,
 					       0xc, 0xd, 0xe, 0xf };
 VECT_VAR_DECL(expected_vld4_3,poly,16,4) [] = { 0xfffc, 0xfffd, 0xfffe, 0xffff };
+VECT_VAR_DECL(expected_vld4_3,hfloat,16,4) [] = { 0xc400, 0xc200, 0xc000, 0xbc00 };
 VECT_VAR_DECL(expected_vld4_3,hfloat,32,2) [] = { 0xc1200000, 0xc1100000 };
 VECT_VAR_DECL(expected_vld4_3,int,8,16) [] = { 0x20, 0x21, 0x22, 0x23,
 					       0x24, 0x25, 0x26, 0x27,
@@ -402,6 +427,8 @@ VECT_VAR_DECL(expected_vld4_3,poly,8,16) [] = { 0x20, 0x21, 0x22, 0x23,
 						0x2c, 0x2d, 0x2e, 0x2f };
 VECT_VAR_DECL(expected_vld4_3,poly,16,8) [] = { 0x8, 0x9, 0xa, 0xb,
 						0xc, 0xd, 0xe, 0xf };
+VECT_VAR_DECL(expected_vld4_3,hfloat,16,8) [] = { 0x4800, 0x4880, 0x4900, 0x4980,
+						  0x4a00, 0x4a80, 0x4b00, 0x4b80 };
 VECT_VAR_DECL(expected_vld4_3,hfloat,32,4) [] = { 0xc0800000, 0xc0400000,
 						  0xc0000000, 0xbf800000 };
 
@@ -445,6 +472,7 @@ void exec_vldX (void)
   DECL_VLDX(uint, 64, 1, X);			\
   DECL_VLDX(poly, 8, 8, X);			\
   DECL_VLDX(poly, 16, 4, X);			\
+  DECL_VLDX(float, 16, 4, X);			\
   DECL_VLDX(float, 32, 2, X);			\
   DECL_VLDX(int, 8, 16, X);			\
   DECL_VLDX(int, 16, 8, X);			\
@@ -454,6 +482,7 @@ void exec_vldX (void)
   DECL_VLDX(uint, 32, 4, X);			\
   DECL_VLDX(poly, 8, 16, X);			\
   DECL_VLDX(poly, 16, 8, X);			\
+  DECL_VLDX(float, 16, 8, X);			\
   DECL_VLDX(float, 32, 4, X)
 
 #define TEST_ALL_VLDX(X)			\
@@ -467,6 +496,7 @@ void exec_vldX (void)
   TEST_VLDX(, uint, u, 64, 1, X);		\
   TEST_VLDX(, poly, p, 8, 8, X);		\
   TEST_VLDX(, poly, p, 16, 4, X);		\
+  TEST_VLDX(, float, f, 16, 4, X);		\
   TEST_VLDX(, float, f, 32, 2, X);		\
   TEST_VLDX(q, int, s, 8, 16, X);		\
   TEST_VLDX(q, int, s, 16, 8, X);		\
@@ -476,6 +506,7 @@ void exec_vldX (void)
   TEST_VLDX(q, uint, u, 32, 4, X);		\
   TEST_VLDX(q, poly, p, 8, 16, X);		\
   TEST_VLDX(q, poly, p, 16, 8, X);		\
+  TEST_VLDX(q, float, f, 16, 8, X);		\
   TEST_VLDX(q, float, f, 32, 4, X)
 
 #define TEST_ALL_EXTRA_CHUNKS(X, Y)		\
@@ -489,6 +520,7 @@ void exec_vldX (void)
   TEST_EXTRA_CHUNK(uint, 64, 1, X, Y);		\
   TEST_EXTRA_CHUNK(poly, 8, 8, X, Y);		\
   TEST_EXTRA_CHUNK(poly, 16, 4, X, Y);		\
+  TEST_EXTRA_CHUNK(float, 16, 4, X, Y);		\
   TEST_EXTRA_CHUNK(float, 32, 2, X, Y);		\
   TEST_EXTRA_CHUNK(int, 8, 16, X, Y);		\
   TEST_EXTRA_CHUNK(int, 16, 8, X, Y);		\
@@ -498,6 +530,7 @@ void exec_vldX (void)
   TEST_EXTRA_CHUNK(uint, 32, 4, X, Y);		\
   TEST_EXTRA_CHUNK(poly, 8, 16, X, Y);		\
   TEST_EXTRA_CHUNK(poly, 16, 8, X, Y);		\
+  TEST_EXTRA_CHUNK(float, 16, 8, X, Y);		\
   TEST_EXTRA_CHUNK(float, 32, 4, X, Y)
 
   DECL_ALL_VLDX(2);
@@ -526,6 +559,8 @@ void exec_vldX (void)
   PAD(buffer_vld2_pad, poly, 8, 8);
   VECT_ARRAY_INIT2(buffer_vld2, poly, 16, 4);
   PAD(buffer_vld2_pad, poly, 16, 4);
+  VECT_ARRAY_INIT2(buffer_vld2, float, 16, 4);
+  PAD(buffer_vld2_pad, float, 16, 4);
   VECT_ARRAY_INIT2(buffer_vld2, float, 32, 2);
   PAD(buffer_vld2_pad, float, 32, 2);
 
@@ -549,6 +584,8 @@ void exec_vldX (void)
   PAD(buffer_vld2_pad, poly, 8, 16);
   VECT_ARRAY_INIT2(buffer_vld2, poly, 16, 8);
   PAD(buffer_vld2_pad, poly, 16, 8);
+  VECT_ARRAY_INIT2(buffer_vld2, float, 16, 8);
+  PAD(buffer_vld2_pad, float, 16, 8);
   VECT_ARRAY_INIT2(buffer_vld2, float, 32, 4);
   PAD(buffer_vld2_pad, float, 32, 4);
 
@@ -573,6 +610,8 @@ void exec_vldX (void)
   PAD(buffer_vld3_pad, poly, 8, 8);
   VECT_ARRAY_INIT3(buffer_vld3, poly, 16, 4);
   PAD(buffer_vld3_pad, poly, 16, 4);
+  VECT_ARRAY_INIT3(buffer_vld3, float, 16, 4);
+  PAD(buffer_vld3_pad, float, 16, 4);
   VECT_ARRAY_INIT3(buffer_vld3, float, 32, 2);
   PAD(buffer_vld3_pad, float, 32, 2);
 
@@ -596,6 +635,8 @@ void exec_vldX (void)
   PAD(buffer_vld3_pad, poly, 8, 16);
   VECT_ARRAY_INIT3(buffer_vld3, poly, 16, 8);
   PAD(buffer_vld3_pad, poly, 16, 8);
+  VECT_ARRAY_INIT3(buffer_vld3, float, 16, 8);
+  PAD(buffer_vld3_pad, float, 16, 8);
   VECT_ARRAY_INIT3(buffer_vld3, float, 32, 4);
   PAD(buffer_vld3_pad, float, 32, 4);
 
@@ -620,6 +661,8 @@ void exec_vldX (void)
   PAD(buffer_vld4_pad, poly, 8, 8);
   VECT_ARRAY_INIT4(buffer_vld4, poly, 16, 4);
   PAD(buffer_vld4_pad, poly, 16, 4);
+  VECT_ARRAY_INIT4(buffer_vld4, float, 16, 4);
+  PAD(buffer_vld4_pad, float, 16, 4);
   VECT_ARRAY_INIT4(buffer_vld4, float, 32, 2);
   PAD(buffer_vld4_pad, float, 32, 2);
 
@@ -643,6 +686,8 @@ void exec_vldX (void)
   PAD(buffer_vld4_pad, poly, 8, 16);
   VECT_ARRAY_INIT4(buffer_vld4, poly, 16, 8);
   PAD(buffer_vld4_pad, poly, 16, 8);
+  VECT_ARRAY_INIT4(buffer_vld4, float, 16, 8);
+  PAD(buffer_vld4_pad, float, 16, 8);
   VECT_ARRAY_INIT4(buffer_vld4, float, 32, 4);
   PAD(buffer_vld4_pad, float, 32, 4);
 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_dup.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_dup.c
index 53cd8f39a248f372d7b3a95edef5c9a91766f08a..63d3f4889aa5b14c6942e1c3ad348899267ec988 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_dup.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_dup.c
@@ -18,6 +18,7 @@ VECT_VAR_DECL(expected_vld2_0,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected_vld2_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf0, 0xf1,
 					0xf0, 0xf1, 0xf0, 0xf1 };
 VECT_VAR_DECL(expected_vld2_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff0, 0xfff1 };
+VECT_VAR_DECL(expected_vld2_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcc00, 0xcb80 };
 VECT_VAR_DECL(expected_vld2_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected_vld2_0,int,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33,
@@ -45,6 +46,8 @@ VECT_VAR_DECL(expected_vld2_0,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					 0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected_vld2_0,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 					 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected_vld2_0,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected_vld2_0,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 					   0x33333333, 0x33333333 };
 
@@ -63,6 +66,7 @@ VECT_VAR_DECL(expected_vld2_1,poly,8,8) [] = { 0xf0, 0xf1, 0xf0, 0xf1,
 					       0xf0, 0xf1, 0xf0, 0xf1 };
 VECT_VAR_DECL(expected_vld2_1,poly,16,4) [] = { 0xfff0, 0xfff1,
 						0xfff0, 0xfff1 };
+VECT_VAR_DECL(expected_vld2_1,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcc00, 0xcb80 };
 VECT_VAR_DECL(expected_vld2_1,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected_vld2_1,int,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					       0x33, 0x33, 0x33, 0x33,
@@ -90,6 +94,8 @@ VECT_VAR_DECL(expected_vld2_1,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 						0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected_vld2_1,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 						0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected_vld2_1,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+						  0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected_vld2_1,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 						  0x33333333, 0x33333333 };
 
@@ -110,6 +116,7 @@ VECT_VAR_DECL(expected_vld3_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf0,
 					       0xf1, 0xf2, 0xf0, 0xf1 };
 VECT_VAR_DECL(expected_vld3_0,poly,16,4) [] = { 0xfff0, 0xfff1,
 						0xfff2, 0xfff0 };
+VECT_VAR_DECL(expected_vld3_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xcc00 };
 VECT_VAR_DECL(expected_vld3_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected_vld3_0,int,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					       0x33, 0x33, 0x33, 0x33,
@@ -137,6 +144,8 @@ VECT_VAR_DECL(expected_vld3_0,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 						0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected_vld3_0,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 						0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected_vld3_0,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+						  0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected_vld3_0,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 						  0x33333333, 0x33333333 };
 
@@ -157,6 +166,7 @@ VECT_VAR_DECL(expected_vld3_1,poly,8,8) [] = { 0xf2, 0xf0, 0xf1, 0xf2,
 					       0xf0, 0xf1, 0xf2, 0xf0 };
 VECT_VAR_DECL(expected_vld3_1,poly,16,4) [] = { 0xfff1, 0xfff2,
 						0xfff0, 0xfff1 };
+VECT_VAR_DECL(expected_vld3_1,hfloat,16,4) [] = { 0xcb80, 0xcb00, 0xcc00, 0xcb80 };
 VECT_VAR_DECL(expected_vld3_1,hfloat,32,2) [] = { 0xc1600000, 0xc1800000 };
 VECT_VAR_DECL(expected_vld3_1,int,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					       0x33, 0x33, 0x33, 0x33,
@@ -184,6 +194,8 @@ VECT_VAR_DECL(expected_vld3_1,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 						0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected_vld3_1,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 						0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected_vld3_1,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+						  0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected_vld3_1,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 						  0x33333333, 0x33333333 };
 
@@ -204,6 +216,7 @@ VECT_VAR_DECL(expected_vld3_2,poly,8,8) [] = { 0xf1, 0xf2, 0xf0, 0xf1,
 					       0xf2, 0xf0, 0xf1, 0xf2 };
 VECT_VAR_DECL(expected_vld3_2,poly,16,4) [] = { 0xfff2, 0xfff0,
 						0xfff1, 0xfff2 };
+VECT_VAR_DECL(expected_vld3_2,hfloat,16,4) [] = { 0xcb00, 0xcc00, 0xcb80, 0xcb00 };
 VECT_VAR_DECL(expected_vld3_2,hfloat,32,2) [] = { 0xc1700000, 0xc1600000 };
 VECT_VAR_DECL(expected_vld3_2,int,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					       0x33, 0x33, 0x33, 0x33,
@@ -231,6 +244,8 @@ VECT_VAR_DECL(expected_vld3_2,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 						0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected_vld3_2,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 						0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected_vld3_2,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+						  0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected_vld3_2,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 						  0x33333333, 0x33333333 };
 
@@ -249,6 +264,7 @@ VECT_VAR_DECL(expected_vld4_0,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected_vld4_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					       0xf0, 0xf1, 0xf2, 0xf3 };
 VECT_VAR_DECL(expected_vld4_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+VECT_VAR_DECL(expected_vld4_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 VECT_VAR_DECL(expected_vld4_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected_vld4_0,int,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					       0x33, 0x33, 0x33, 0x33,
@@ -276,6 +292,8 @@ VECT_VAR_DECL(expected_vld4_0,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 						0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected_vld4_0,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 						0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected_vld4_0,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+						  0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected_vld4_0,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 						  0x33333333, 0x33333333 };
 
@@ -293,6 +311,7 @@ VECT_VAR_DECL(expected_vld4_1,uint,64,1) [] = { 0xfffffffffffffff1 };
 VECT_VAR_DECL(expected_vld4_1,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					       0xf0, 0xf1, 0xf2, 0xf3 };
 VECT_VAR_DECL(expected_vld4_1,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+VECT_VAR_DECL(expected_vld4_1,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 VECT_VAR_DECL(expected_vld4_1,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 };
 VECT_VAR_DECL(expected_vld4_1,int,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					       0x33, 0x33, 0x33, 0x33,
@@ -320,6 +339,8 @@ VECT_VAR_DECL(expected_vld4_1,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 						0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected_vld4_1,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 						0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected_vld4_1,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+						  0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected_vld4_1,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 						  0x33333333, 0x33333333 };
 
@@ -337,6 +358,7 @@ VECT_VAR_DECL(expected_vld4_2,uint,64,1) [] = { 0xfffffffffffffff2 };
 VECT_VAR_DECL(expected_vld4_2,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					       0xf0, 0xf1, 0xf2, 0xf3 };
 VECT_VAR_DECL(expected_vld4_2,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+VECT_VAR_DECL(expected_vld4_2,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 VECT_VAR_DECL(expected_vld4_2,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected_vld4_2,int,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					       0x33, 0x33, 0x33, 0x33,
@@ -364,6 +386,8 @@ VECT_VAR_DECL(expected_vld4_2,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 						0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected_vld4_2,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 						0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected_vld4_2,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+						  0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected_vld4_2,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 						  0x33333333, 0x33333333 };
 
@@ -381,6 +405,7 @@ VECT_VAR_DECL(expected_vld4_3,uint,64,1) [] = { 0xfffffffffffffff3 };
 VECT_VAR_DECL(expected_vld4_3,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					       0xf0, 0xf1, 0xf2, 0xf3 };
 VECT_VAR_DECL(expected_vld4_3,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+VECT_VAR_DECL(expected_vld4_3,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 VECT_VAR_DECL(expected_vld4_3,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 };
 VECT_VAR_DECL(expected_vld4_3,int,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					       0x33, 0x33, 0x33, 0x33,
@@ -407,6 +432,8 @@ VECT_VAR_DECL(expected_vld4_3,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 						0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected_vld4_3,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 						0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected_vld4_3,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+						  0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected_vld4_3,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 						  0x33333333, 0x33333333 };
 
@@ -450,6 +477,7 @@ void exec_vldX_dup (void)
   DECL_VLDX_DUP(uint, 64, 1, X);		\
   DECL_VLDX_DUP(poly, 8, 8, X);			\
   DECL_VLDX_DUP(poly, 16, 4, X);		\
+  DECL_VLDX_DUP(float, 16, 4, X);		\
   DECL_VLDX_DUP(float, 32, 2, X)
 
 #define TEST_ALL_VLDX_DUP(X)			\
@@ -463,6 +491,7 @@ void exec_vldX_dup (void)
   TEST_VLDX_DUP(, uint, u, 64, 1, X);		\
   TEST_VLDX_DUP(, poly, p, 8, 8, X);		\
   TEST_VLDX_DUP(, poly, p, 16, 4, X);		\
+  TEST_VLDX_DUP(, float, f, 16, 4, X);		\
   TEST_VLDX_DUP(, float, f, 32, 2, X)
 
 #define TEST_ALL_EXTRA_CHUNKS(X, Y)		\
@@ -476,6 +505,7 @@ void exec_vldX_dup (void)
   TEST_EXTRA_CHUNK(uint, 64, 1, X, Y);		\
   TEST_EXTRA_CHUNK(poly, 8, 8, X, Y);		\
   TEST_EXTRA_CHUNK(poly, 16, 4, X, Y);		\
+  TEST_EXTRA_CHUNK(float, 16, 4, X, Y);		\
   TEST_EXTRA_CHUNK(float, 32, 2, X, Y)
 
 
@@ -505,6 +535,8 @@ void exec_vldX_dup (void)
   PAD(buffer_vld2_pad, poly, 8, 8);
   VECT_ARRAY_INIT2(buffer_vld2, poly, 16, 4);
   PAD(buffer_vld2_pad, poly, 16, 4);
+  VECT_ARRAY_INIT2(buffer_vld2, float, 16, 4);
+  PAD(buffer_vld2_pad, float, 16, 4);
   VECT_ARRAY_INIT2(buffer_vld2, float, 32, 2);
   PAD(buffer_vld2_pad, float, 32, 2);
 
@@ -528,6 +560,8 @@ void exec_vldX_dup (void)
   PAD(buffer_vld2_pad, poly, 8, 16);
   VECT_ARRAY_INIT2(buffer_vld2, poly, 16, 8);
   PAD(buffer_vld2_pad, poly, 16, 8);
+  VECT_ARRAY_INIT2(buffer_vld2, float, 16, 8);
+  PAD(buffer_vld2_pad, float, 16, 8);
   VECT_ARRAY_INIT2(buffer_vld2, float, 32, 4);
   PAD(buffer_vld2_pad, float, 32, 4);
 
@@ -552,6 +586,8 @@ void exec_vldX_dup (void)
   PAD(buffer_vld3_pad, poly, 8, 8);
   VECT_ARRAY_INIT3(buffer_vld3, poly, 16, 4);
   PAD(buffer_vld3_pad, poly, 16, 4);
+  VECT_ARRAY_INIT3(buffer_vld3, float, 16, 4);
+  PAD(buffer_vld3_pad, float, 16, 4);
   VECT_ARRAY_INIT3(buffer_vld3, float, 32, 2);
   PAD(buffer_vld3_pad, float, 32, 2);
 
@@ -575,6 +611,8 @@ void exec_vldX_dup (void)
   PAD(buffer_vld3_pad, poly, 8, 16);
   VECT_ARRAY_INIT3(buffer_vld3, poly, 16, 8);
   PAD(buffer_vld3_pad, poly, 16, 8);
+  VECT_ARRAY_INIT3(buffer_vld3, float, 16, 8);
+  PAD(buffer_vld3_pad, float, 16, 8);
   VECT_ARRAY_INIT3(buffer_vld3, float, 32, 4);
   PAD(buffer_vld3_pad, float, 32, 4);
 
@@ -599,6 +637,8 @@ void exec_vldX_dup (void)
   PAD(buffer_vld4_pad, poly, 8, 8);
   VECT_ARRAY_INIT4(buffer_vld4, poly, 16, 4);
   PAD(buffer_vld4_pad, poly, 16, 4);
+  VECT_ARRAY_INIT4(buffer_vld4, float, 16, 4);
+  PAD(buffer_vld4_pad, float, 16, 4);
   VECT_ARRAY_INIT4(buffer_vld4, float, 32, 2);
   PAD(buffer_vld4_pad, float, 32, 2);
 
@@ -622,6 +662,8 @@ void exec_vldX_dup (void)
   PAD(buffer_vld4_pad, poly, 8, 16);
   VECT_ARRAY_INIT4(buffer_vld4, poly, 16, 8);
   PAD(buffer_vld4_pad, poly, 16, 8);
+  VECT_ARRAY_INIT4(buffer_vld4, float, 16, 8);
+  PAD(buffer_vld4_pad, float, 16, 8);
   VECT_ARRAY_INIT4(buffer_vld4, float, 32, 4);
   PAD(buffer_vld4_pad, float, 32, 4);
 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_lane.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_lane.c
index 19910330e01371633fbf41c3bf2f457f6f354784..1d31c3ad5dfc7a2046e43f04e103be9c079eb0ef 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_lane.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_lane.c
@@ -20,6 +20,7 @@ VECT_VAR_DECL(expected_vld2_0,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
 					       0xaa, 0xaa, 0xaa, 0xaa };
 VECT_VAR_DECL(expected_vld2_0,poly,16,4) [] = { 0xaaaa, 0xaaaa,
 						0xaaaa, 0xaaaa };
+VECT_VAR_DECL(expected_vld2_0,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld2_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected_vld2_0,int,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					       0x33, 0x33, 0x33, 0x33,
@@ -47,6 +48,8 @@ VECT_VAR_DECL(expected_vld2_0,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 						0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected_vld2_0,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
 						0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
+VECT_VAR_DECL(expected_vld2_0,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
+						  0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld2_0,hfloat,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
 						  0xaaaaaaaa, 0xaaaaaaaa };
 
@@ -64,6 +67,7 @@ VECT_VAR_DECL(expected_vld2_1,uint,64,1) [] = { 0x3333333333333333 };
 VECT_VAR_DECL(expected_vld2_1,poly,8,8) [] = { 0xf0, 0xf1, 0xaa, 0xaa,
 					       0xaa, 0xaa, 0xaa, 0xaa };
 VECT_VAR_DECL(expected_vld2_1,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xfff0, 0xfff1 };
+VECT_VAR_DECL(expected_vld2_1,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld2_1,hfloat,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
 VECT_VAR_DECL(expected_vld2_1,int,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					       0x33, 0x33, 0x33, 0x33,
@@ -91,9 +95,12 @@ VECT_VAR_DECL(expected_vld2_1,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 						0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected_vld2_1,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xfff0, 0xfff1,
 						0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
+VECT_VAR_DECL(expected_vld2_1,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
+						  0xcc00, 0xcb80, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld2_1,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
 						  0xaaaaaaaa, 0xaaaaaaaa };
 
+
 /* vld3/chunk 0.  */
 VECT_VAR_DECL(expected_vld3_0,int,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
 					      0xaa, 0xaa, 0xaa, 0xaa };
@@ -108,6 +115,7 @@ VECT_VAR_DECL(expected_vld3_0,uint,64,1) [] = { 0x3333333333333333 };
 VECT_VAR_DECL(expected_vld3_0,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
 					       0xaa, 0xaa, 0xaa, 0xaa };
 VECT_VAR_DECL(expected_vld3_0,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
+VECT_VAR_DECL(expected_vld3_0,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld3_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected_vld3_0,int,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					       0x33, 0x33, 0x33, 0x33,
@@ -135,6 +143,8 @@ VECT_VAR_DECL(expected_vld3_0,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 						0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected_vld3_0,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
 						0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
+VECT_VAR_DECL(expected_vld3_0,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
+						  0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld3_0,hfloat,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
 						  0xaaaaaaaa, 0xaaaaaaaa };
 
@@ -152,6 +162,7 @@ VECT_VAR_DECL(expected_vld3_1,uint,64,1) [] = { 0x3333333333333333 };
 VECT_VAR_DECL(expected_vld3_1,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
 					       0xf0, 0xf1, 0xf2, 0xaa };
 VECT_VAR_DECL(expected_vld3_1,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
+VECT_VAR_DECL(expected_vld3_1,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xcc00, 0xcb80 };
 VECT_VAR_DECL(expected_vld3_1,hfloat,32,2) [] = { 0xc1600000, 0xaaaaaaaa };
 VECT_VAR_DECL(expected_vld3_1,int,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					       0x33, 0x33, 0x33, 0x33,
@@ -179,6 +190,8 @@ VECT_VAR_DECL(expected_vld3_1,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 						0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected_vld3_1,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
 						0xaaaa, 0xaaaa, 0xaaaa, 0xfff0 };
+VECT_VAR_DECL(expected_vld3_1,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
+						  0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld3_1,hfloat,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
 						  0xc1800000, 0xc1700000 };
 
@@ -196,6 +209,7 @@ VECT_VAR_DECL(expected_vld3_2,uint,64,1) [] = { 0x3333333333333333 };
 VECT_VAR_DECL(expected_vld3_2,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
 					       0xaa, 0xaa, 0xaa, 0xaa };
 VECT_VAR_DECL(expected_vld3_2,poly,16,4) [] = { 0xaaaa, 0xfff0, 0xfff1, 0xfff2 };
+VECT_VAR_DECL(expected_vld3_2,hfloat,16,4) [] = { 0xcb00, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld3_2,hfloat,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
 VECT_VAR_DECL(expected_vld3_2,int,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					       0x33, 0x33, 0x33, 0x33,
@@ -223,6 +237,8 @@ VECT_VAR_DECL(expected_vld3_2,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 						0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected_vld3_2,poly,16,8) [] = { 0xfff1, 0xfff2, 0xaaaa, 0xaaaa,
 						0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
+VECT_VAR_DECL(expected_vld3_2,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xcc00, 0xcb80,
+						  0xcb00, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld3_2,hfloat,32,4) [] = { 0xc1600000, 0xaaaaaaaa,
 						  0xaaaaaaaa, 0xaaaaaaaa };
 
@@ -240,6 +256,7 @@ VECT_VAR_DECL(expected_vld4_0,uint,64,1) [] = { 0x3333333333333333 };
 VECT_VAR_DECL(expected_vld4_0,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
 					       0xaa, 0xaa, 0xaa, 0xaa };
 VECT_VAR_DECL(expected_vld4_0,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
+VECT_VAR_DECL(expected_vld4_0,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld4_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected_vld4_0,int,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					       0x33, 0x33, 0x33, 0x33,
@@ -267,9 +284,10 @@ VECT_VAR_DECL(expected_vld4_0,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 						0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected_vld4_0,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
 						0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
+VECT_VAR_DECL(expected_vld4_0,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
+						  0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld4_0,hfloat,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
 						  0xaaaaaaaa, 0xaaaaaaaa };
-
 /* vld4/chunk 1.  */
 VECT_VAR_DECL(expected_vld4_1,int,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
 					      0xaa, 0xaa, 0xaa, 0xaa };
@@ -284,6 +302,7 @@ VECT_VAR_DECL(expected_vld4_1,uint,64,1) [] = { 0x3333333333333333 };
 VECT_VAR_DECL(expected_vld4_1,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
 					       0xaa, 0xaa, 0xaa, 0xaa };
 VECT_VAR_DECL(expected_vld4_1,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
+VECT_VAR_DECL(expected_vld4_1,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld4_1,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 };
 VECT_VAR_DECL(expected_vld4_1,int,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					       0x33, 0x33, 0x33, 0x33,
@@ -311,6 +330,8 @@ VECT_VAR_DECL(expected_vld4_1,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 						0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected_vld4_1,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
 						0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
+VECT_VAR_DECL(expected_vld4_1,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
+						  0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld4_1,hfloat,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
 						  0xaaaaaaaa, 0xaaaaaaaa };
 
@@ -328,7 +349,9 @@ VECT_VAR_DECL(expected_vld4_2,uint,64,1) [] = { 0x3333333333333333 };
 VECT_VAR_DECL(expected_vld4_2,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					       0xaa, 0xaa, 0xaa, 0xaa };
 VECT_VAR_DECL(expected_vld4_2,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
+VECT_VAR_DECL(expected_vld4_2,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 VECT_VAR_DECL(expected_vld4_2,hfloat,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
+
 VECT_VAR_DECL(expected_vld4_2,int,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					       0x33, 0x33, 0x33, 0x33,
 					       0x33, 0x33, 0x33, 0x33,
@@ -355,6 +378,8 @@ VECT_VAR_DECL(expected_vld4_2,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 						0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected_vld4_2,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
 						0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+VECT_VAR_DECL(expected_vld4_2,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
+						  0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld4_2,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
 						  0xc1600000, 0xc1500000 };
 
@@ -372,6 +397,7 @@ VECT_VAR_DECL(expected_vld4_3,uint,64,1) [] = { 0x3333333333333333 };
 VECT_VAR_DECL(expected_vld4_3,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
 					       0xaa, 0xaa, 0xaa, 0xaa };
 VECT_VAR_DECL(expected_vld4_3,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+VECT_VAR_DECL(expected_vld4_3,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld4_3,hfloat,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
 VECT_VAR_DECL(expected_vld4_3,int,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					       0x33, 0x33, 0x33, 0x33,
@@ -399,6 +425,8 @@ VECT_VAR_DECL(expected_vld4_3,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 						0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected_vld4_3,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
 						0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
+VECT_VAR_DECL(expected_vld4_3,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80,
+						  0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld4_3,hfloat,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
 						  0xaaaaaaaa, 0xaaaaaaaa };
 
@@ -414,6 +442,7 @@ VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 32, 2);
 VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 64, 2);
 VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 8, 2);
 VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 16, 2);
+VECT_VAR_DECL_INIT(buffer_vld2_lane, float, 16, 2);
 VECT_VAR_DECL_INIT(buffer_vld2_lane, float, 32, 2);
 
 /* Input buffers for vld3_lane */
@@ -427,6 +456,7 @@ VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 32, 3);
 VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 64, 3);
 VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 8, 3);
 VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 16, 3);
+VECT_VAR_DECL_INIT(buffer_vld3_lane, float, 16, 3);
 VECT_VAR_DECL_INIT(buffer_vld3_lane, float, 32, 3);
 
 /* Input buffers for vld4_lane */
@@ -440,6 +470,7 @@ VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 32, 4);
 VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 64, 4);
 VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 8, 4);
 VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 16, 4);
+VECT_VAR_DECL_INIT(buffer_vld4_lane, float, 16, 4);
 VECT_VAR_DECL_INIT(buffer_vld4_lane, float, 32, 4);
 
 void exec_vldX_lane (void)
@@ -497,7 +528,9 @@ void exec_vldX_lane (void)
   DECL_VLDX_LANE(uint, 16, 8, X);		\
   DECL_VLDX_LANE(uint, 32, 4, X);		\
   DECL_VLDX_LANE(poly, 16, 8, X);		\
+  DECL_VLDX_LANE(float, 16, 4, X);		\
   DECL_VLDX_LANE(float, 32, 2, X);		\
+  DECL_VLDX_LANE(float, 16, 8, X);		\
   DECL_VLDX_LANE(float, 32, 4, X)
 
   /* Add some padding to try to catch out of bound accesses.  */
@@ -522,7 +555,9 @@ void exec_vldX_lane (void)
   TEST_VLDX_LANE(q, uint, u, 16, 8, X, 5);	\
   TEST_VLDX_LANE(q, uint, u, 32, 4, X, 0);	\
   TEST_VLDX_LANE(q, poly, p, 16, 8, X, 5);	\
+  TEST_VLDX_LANE(, float, f, 16, 4, X, 2);	\
   TEST_VLDX_LANE(, float, f, 32, 2, X, 0);	\
+  TEST_VLDX_LANE(q, float, f, 16, 8, X, 6);	\
   TEST_VLDX_LANE(q, float, f, 32, 4, X, 2)
 
 #define TEST_ALL_EXTRA_CHUNKS(X, Y)		\
@@ -539,7 +574,9 @@ void exec_vldX_lane (void)
   TEST_EXTRA_CHUNK(uint, 16, 8, X, Y);		\
   TEST_EXTRA_CHUNK(uint, 32, 4, X, Y);		\
   TEST_EXTRA_CHUNK(poly, 16, 8, X, Y);		\
+  TEST_EXTRA_CHUNK(float, 16, 4, X, Y);		\
   TEST_EXTRA_CHUNK(float, 32, 2, X, Y);		\
+  TEST_EXTRA_CHUNK(float, 16, 8, X, Y);		\
   TEST_EXTRA_CHUNK(float, 32, 4, X, Y)
 
   /* Declare the temporary buffers / variables.  */
@@ -561,7 +598,9 @@ void exec_vldX_lane (void)
   DUMMY_ARRAY(buffer_src, uint, 16, 8, 4);
   DUMMY_ARRAY(buffer_src, uint, 32, 4, 4);
   DUMMY_ARRAY(buffer_src, poly, 16, 8, 4);
+  DUMMY_ARRAY(buffer_src, float, 16, 4, 4);
   DUMMY_ARRAY(buffer_src, float, 32, 2, 4);
+  DUMMY_ARRAY(buffer_src, float, 16, 8, 4);
   DUMMY_ARRAY(buffer_src, float, 32, 4, 4);
 
   /* Check vld2_lane/vld2q_lane.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmul.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmul.c
index 7527861ffbf10fb86ef23db67510dd2cb0df88b9..cc88f2d6013896c4cc5b9ca4b0f1fb9f40d28da0 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmul.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmul.c
@@ -16,6 +16,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0x3333333333333333 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0xc0, 0x84, 0x48, 0xc,
 					0xd0, 0x94, 0x58, 0x1c };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc4053333, 0xc3f9c000 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0x90, 0x7, 0x7e, 0xf5,
 					0x6c, 0xe3, 0x5a, 0xd1,
@@ -43,6 +44,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x60, 0xca, 0x34, 0x9e,
 					 0x98, 0x32, 0xcc, 0x66 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 					 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc4c73333, 0xc4bac000,
 					   0xc4ae4ccd, 0xc4a1d999 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vneg.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vneg.c
index c45492ddb0068a12b9fc607175bf8d20034fea5c..9260090ed4ccbd9aa20940b65ab0ebb5053a0ba9 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vneg.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vneg.c
@@ -21,6 +21,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0x3333333333333333 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x33333333, 0x33333333 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0x10, 0xf, 0xe, 0xd, 0xc, 0xb, 0xa, 0x9,
 					0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1 };
@@ -45,6 +46,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					 0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 					 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 					   0x33333333, 0x33333333 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vorn.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vorn.c
index 6905cb663f424c9e102a25e751b0b64c3c7652fa..68bbb9d66dc993738d275347f8ea5e687849ff4e 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vorn.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vorn.c
@@ -17,6 +17,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffffd };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x33333333, 0x33333333 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0xf9, 0xf9, 0xfb, 0xfb,
 					0xfd, 0xfd, 0xff, 0xff,
@@ -44,5 +45,7 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					 0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 					 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 					   0x33333333, 0x33333333 };
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vorr.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vorr.c
index b2a7dff4d3fe5431fea28c19135674adea91edea..ba973f92e62d351b4dddcede814238ff77a1b5ff 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vorr.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vorr.c
@@ -17,6 +17,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff2 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x33333333, 0x33333333 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0xf6, 0xf7, 0xf6, 0xf7,
 					0xf6, 0xf7, 0xf6, 0xf7,
@@ -44,5 +45,7 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					 0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 					 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 					   0x33333333, 0x33333333 };
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqadd.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqadd.c
index c07f5ff00466118f91bd91b5b37c6bd98af44760..6bb0a3f5da40ab83bcb7838fea943a8bf2d62893 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqadd.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqadd.c
@@ -42,6 +42,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xffffffffffffffff };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x33333333, 0x33333333 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0x1, 0x2, 0x3, 0x4,
 					0x5, 0x6, 0x7, 0x8,
@@ -67,6 +68,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					 0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 					 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 					   0x33333333, 0x33333333 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqsub.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqsub.c
index 04df5feac27f960e31bd3fd5a45b9e69830f39f8..8d061ae227d5d398fdacbab8c7601c727302188d 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqsub.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqsub.c
@@ -28,6 +28,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xffffffffffffff68 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x33333333, 0x33333333 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0xdf, 0xe0, 0xe1, 0xe2,
 					0xe3, 0xe4, 0xe5, 0xe6,
@@ -55,6 +56,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					 0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 					 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 					   0x33333333, 0x33333333 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshl.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshl.c
index e64d6e37c6e7ffa81f9a8fcdf74410dcaac4751b..9e256f79d4eb76cc9cd19093c0048d7eeb90f0b7 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshl.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshl.c
@@ -17,6 +17,7 @@ VECT_VAR_DECL(expected,poly,8,8) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0x3333, 0x3333,
 					 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x33333333, 0x33333333 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0x0, 0x20, 0x40, 0x60,
 					0x80, 0xa0, 0xc0, 0xe0,
@@ -42,6 +43,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					 0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 					 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 					   0x33333333, 0x33333333 };
 
@@ -60,6 +63,8 @@ VECT_VAR_DECL(expected_large_shift,poly,8,8) [] = { 0x33, 0x33, 0x33, 0x33,
 						    0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected_large_shift,poly,16,4) [] = { 0x3333, 0x3333,
 						     0x3333, 0x3333 };
+VECT_VAR_DECL(expected_large_shift,hfloat,16,4) [] = { 0x3333, 0x3333,
+						       0x3333, 0x3333 };
 VECT_VAR_DECL(expected_large_shift,hfloat,32,2) [] = { 0x33333333, 0x33333333 };
 VECT_VAR_DECL(expected_large_shift,int,8,16) [] = { 0x0, 0x0, 0x0, 0x0,
 						    0x0, 0x0, 0x0, 0x0,
@@ -85,6 +90,10 @@ VECT_VAR_DECL(expected_large_shift,poly,16,8) [] = { 0x3333, 0x3333,
 						     0x3333, 0x3333,
 						     0x3333, 0x3333,
 						     0x3333, 0x3333 };
+VECT_VAR_DECL(expected_large_shift,hfloat,16,8) [] = { 0x3333, 0x3333,
+						       0x3333, 0x3333,
+						       0x3333, 0x3333,
+						       0x3333, 0x3333 };
 VECT_VAR_DECL(expected_large_shift,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 						       0x33333333, 0x33333333 };
 
@@ -107,6 +116,8 @@ VECT_VAR_DECL(expected_negative_shift,poly,8,8) [] = { 0x33, 0x33, 0x33, 0x33,
 						       0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected_negative_shift,poly,16,4) [] = { 0x3333, 0x3333,
 							0x3333, 0x3333 };
+VECT_VAR_DECL(expected_negative_shift,hfloat,16,4) [] = { 0x3333, 0x3333,
+							  0x3333, 0x3333 };
 VECT_VAR_DECL(expected_negative_shift,hfloat,32,2) [] = { 0x33333333,
 							  0x33333333 };
 VECT_VAR_DECL(expected_negative_shift,int,8,16) [] = { 0xfc, 0xfc, 0xfc, 0xfc,
@@ -141,6 +152,10 @@ VECT_VAR_DECL(expected_negative_shift,poly,16,8) [] = { 0x3333, 0x3333,
 							0x3333, 0x3333,
 							0x3333, 0x3333,
 							0x3333, 0x3333 };
+VECT_VAR_DECL(expected_negative_shift,hfloat,16,8) [] = { 0x3333, 0x3333,
+							  0x3333, 0x3333,
+							  0x3333, 0x3333,
+							  0x3333, 0x3333 };
 VECT_VAR_DECL(expected_negative_shift,hfloat,32,4) [] = { 0x33333333,
 							  0x33333333,
 							  0x33333333,
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsli_n.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsli_n.c
index eb06ce0a125ae5fd14122d36c5312ebf5b7d0b68..e92ae9c6151d146510a13be25d2498c1b9563ac0 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsli_n.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsli_n.c
@@ -23,6 +23,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0x10 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0x50, 0x51, 0x52, 0x53,
 					0x50, 0x51, 0x52, 0x53 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0x7bf0, 0x7bf1, 0x7bf2, 0x7bf3 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x33333333, 0x33333333 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0xd0, 0xd1, 0xd2, 0xd3,
 					0xd4, 0xd5, 0xd6, 0xd7,
@@ -48,6 +49,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x60, 0x61, 0x62, 0x63,
 					 0x64, 0x65, 0x66, 0x67 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0x3ff0, 0x3ff1, 0x3ff2, 0x3ff3,
 					 0x3ff4, 0x3ff5, 0x3ff6, 0x3ff7 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 					   0x33333333, 0x33333333 };
 
@@ -68,6 +71,8 @@ VECT_VAR_DECL(expected_max_shift,poly,8,8) [] = { 0x70, 0x71, 0x72, 0x73,
 						  0x74, 0x75, 0x76, 0x77 };
 VECT_VAR_DECL(expected_max_shift,poly,16,4) [] = { 0x7ff0, 0x7ff1,
 						   0x7ff2, 0x7ff3 };
+VECT_VAR_DECL(expected_max_shift,hfloat,16,4) [] = { 0x3333, 0x3333,
+						     0x3333, 0x3333 };
 VECT_VAR_DECL(expected_max_shift,hfloat,32,2) [] = { 0x33333333, 0x33333333 };
 VECT_VAR_DECL(expected_max_shift,int,8,16) [] = { 0x70, 0x71, 0x72, 0x73,
 						  0x74, 0x75, 0x76, 0x77,
@@ -95,6 +100,8 @@ VECT_VAR_DECL(expected_max_shift,poly,8,16) [] = { 0x70, 0x71, 0x72, 0x73,
 						   0x7c, 0x7d, 0x7e, 0x7f };
 VECT_VAR_DECL(expected_max_shift,poly,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
 						   0xfff4, 0xfff5, 0xfff6, 0xfff7 };
+VECT_VAR_DECL(expected_max_shift,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+						     0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected_max_shift,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 						     0x33333333, 0x33333333 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsri_n.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsri_n.c
index 046b79f29407142996cdae8751079fd99d90107b..ebaefb7ec16510f5dac6d8f332eb542bf432fce4 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsri_n.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsri_n.c
@@ -23,6 +23,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xe000000000000000 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0xc5, 0xc5, 0xc5, 0xc5,
 					0xc5, 0xc5, 0xc5, 0xc5 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0xffc0, 0xffc0, 0xffc0, 0xffc0 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x33333333, 0x33333333 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0xf7, 0xf7, 0xf7, 0xf7,
 					0xf7, 0xf7, 0xf7, 0xf7,
@@ -50,6 +51,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0xe1, 0xe1, 0xe1, 0xe1,
 					 0xe1, 0xe1, 0xe1, 0xe1 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff0, 0xfff0, 0xfff0, 0xfff0,
 					 0xfff0, 0xfff0, 0xfff0, 0xfff0 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 					   0x33333333, 0x33333333 };
 
@@ -70,6 +73,8 @@ VECT_VAR_DECL(expected_max_shift,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 						  0xf4, 0xf5, 0xf6, 0xf7 };
 VECT_VAR_DECL(expected_max_shift,poly,16,4) [] = { 0xfff0, 0xfff1,
 						   0xfff2, 0xfff3 };
+VECT_VAR_DECL(expected_max_shift,hfloat,16,4) [] = { 0x3333, 0x3333,
+						     0x3333, 0x3333 };
 VECT_VAR_DECL(expected_max_shift,hfloat,32,2) [] = { 0x33333333, 0x33333333 };
 VECT_VAR_DECL(expected_max_shift,int,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 						  0xf4, 0xf5, 0xf6, 0xf7,
@@ -97,6 +102,8 @@ VECT_VAR_DECL(expected_max_shift,poly,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 						   0xfc, 0xfd, 0xfe, 0xff };
 VECT_VAR_DECL(expected_max_shift,poly,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
 						   0xfff4, 0xfff5, 0xfff6, 0xfff7 };
+VECT_VAR_DECL(expected_max_shift,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+						     0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected_max_shift,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 						     0x33333333, 0x33333333 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsub.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsub.c
index 7620479d3d1836ae8bd6b7e05a2f6c4642df4301..f5a2010f293e87f6187bfbdc0237e1a05811a480 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsub.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsub.c
@@ -21,6 +21,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xffffffffffffffee };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x33333333, 0x33333333 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0xfa, 0xfb, 0xfc, 0xfd,
 					0xfe, 0xff, 0x0, 0x1,
@@ -47,6 +48,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					 0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 					 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 					  0x33333333, 0x33333333 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsubl.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsubl.c
index b765b2befb5a2e62fbcfae5bfa11a584886f7871..ccc250dcb6b8f9f75e6e80996bd55e61273c1680 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsubl.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsubl.c
@@ -19,6 +19,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0x3333333333333333 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x33333333, 0x33333333 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33,
@@ -42,6 +43,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					 0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 					 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 					   0x33333333, 0x33333333 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsubw.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsubw.c
index 3e8bc98b121889d91f8b2a0f9cf97d0f5c4b63cb..c9f55def129d2f82113349292b36629f0e8caec1 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsubw.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsubw.c
@@ -19,6 +19,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0x3333333333333333 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x33333333, 0x33333333 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33,
@@ -44,6 +45,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x33, 0x33, 0x33, 0x33,
 					 0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
 					 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x33333333, 0x33333333,
 					   0x33333333, 0x33333333 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtrn.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtrn.c
index be18c0f4b3aa2bfda0dd97623be58f52a3637c89..fc26469d950347454ab316e4a6dafb4067a2aebe 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtrn.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtrn.c
@@ -17,6 +17,7 @@ VECT_VAR_DECL(expected0,uint,64,1) [] = { 0x3333333333333333 };
 VECT_VAR_DECL(expected0,poly,8,8) [] = { 0xf0, 0xf1, 0x55, 0x55,
 					 0xf2, 0xf3, 0x55, 0x55 };
 VECT_VAR_DECL(expected0,poly,16,4) [] = { 0xfff0, 0xfff1, 0x66, 0x66 };
+VECT_VAR_DECL(expected0,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected0,int,8,16) [] = { 0xf0, 0xf1, 0x11, 0x11,
 					 0xf2, 0xf3, 0x11, 0x11,
@@ -42,6 +43,8 @@ VECT_VAR_DECL(expected0,poly,8,16) [] = { 0xf0, 0xf1, 0x55, 0x55,
 					  0xf6, 0xf7, 0x55, 0x55 };
 VECT_VAR_DECL(expected0,poly,16,8) [] = { 0xfff0, 0xfff1, 0x66, 0x66,
 					  0xfff2, 0xfff3, 0x66, 0x66 };
+VECT_VAR_DECL(expected0,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					    0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected0,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
 					    0x42073333, 0x42073333 };
 
@@ -59,6 +62,7 @@ VECT_VAR_DECL(expected1,uint,64,1) [] = { 0x3333333333333333 };
 VECT_VAR_DECL(expected1,poly,8,8) [] = { 0xf4, 0xf5, 0x55, 0x55,
 					 0xf6, 0xf7, 0x55, 0x55 };
 VECT_VAR_DECL(expected1,poly,16,4) [] = { 0xfff2, 0xfff3, 0x66, 0x66 };
+VECT_VAR_DECL(expected1,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected1,hfloat,32,2) [] = { 0x42066666, 0x42066666 };
 VECT_VAR_DECL(expected1,int,8,16) [] = { 0xf8, 0xf9, 0x11, 0x11,
 					 0xfa, 0xfb, 0x11, 0x11,
@@ -84,6 +88,8 @@ VECT_VAR_DECL(expected1,poly,8,16) [] = { 0xf8, 0xf9, 0x55, 0x55,
 					  0xfe, 0xff, 0x55, 0x55 };
 VECT_VAR_DECL(expected1,poly,16,8) [] = { 0xfff4, 0xfff5, 0x66, 0x66,
 					  0xfff6, 0xfff7, 0x66, 0x66 };
+VECT_VAR_DECL(expected1,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					    0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected1,hfloat,32,4) [] = { 0xc1600000, 0xc1500000,
 					    0x42073333, 0x42073333 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vuzp.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vuzp.c
index ec9ded361584acc05f5aece9aa9fc1df257b2b7e..e72e19d78dc8d9418d09e219cfd7ebe7187ae12a 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vuzp.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vuzp.c
@@ -21,6 +21,7 @@ VECT_VAR_DECL(expected0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					 0xf4, 0xf5, 0xf6, 0xf7 };
 VECT_VAR_DECL(expected0,poly,16,4) [] = { 0xfff0, 0xfff1,
 					  0xfff2, 0xfff3 };
+VECT_VAR_DECL(expected0,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected0,int,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					 0xf4, 0xf5, 0xf6, 0xf7,
@@ -54,6 +55,8 @@ VECT_VAR_DECL(expected0,poly,16,8) [] = { 0xfff0, 0xfff1,
 					  0xfff2, 0xfff3,
 					  0xfff4, 0xfff5,
 					  0xfff6, 0xfff7 };
+VECT_VAR_DECL(expected0,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					    0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected0,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
 					    0xc1600000, 0xc1500000 };
 
@@ -71,6 +74,7 @@ VECT_VAR_DECL(expected1,uint,64,1) [] = { 0x3333333333333333 };
 VECT_VAR_DECL(expected1,poly,8,8) [] = { 0x55, 0x55, 0x55, 0x55,
 					 0x55, 0x55, 0x55, 0x55 };
 VECT_VAR_DECL(expected1,poly,16,4) [] = { 0x66, 0x66, 0x66, 0x66 };
+VECT_VAR_DECL(expected1,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected1,hfloat,32,2) [] = { 0x42066666, 0x42066666 };
 VECT_VAR_DECL(expected1,int,8,16) [] = { 0x11, 0x11, 0x11, 0x11,
 					 0x11, 0x11, 0x11, 0x11,
@@ -96,6 +100,8 @@ VECT_VAR_DECL(expected1,poly,8,16) [] = { 0x55, 0x55, 0x55, 0x55,
 					  0x55, 0x55, 0x55, 0x55 };
 VECT_VAR_DECL(expected1,poly,16,8) [] = { 0x66, 0x66, 0x66, 0x66,
 					  0x66, 0x66, 0x66, 0x66 };
+VECT_VAR_DECL(expected1,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					    0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected1,hfloat,32,4) [] = { 0x42073333, 0x42073333,
 					    0x42073333, 0x42073333 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vzip.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vzip.c
index 05faa8a8c6c73c9a30eba273edee82bae58367cc..0369a8d42bb89e93f6e9aac1c34a2bdba266af78 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vzip.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vzip.c
@@ -20,6 +20,7 @@ VECT_VAR_DECL(expected0,poly,8,8) [] = { 0xf0, 0xf4, 0x55, 0x55,
 					 0xf1, 0xf5, 0x55, 0x55 };
 VECT_VAR_DECL(expected0,poly,16,4) [] = { 0xfff0, 0xfff2,
 					  0x66, 0x66 };
+VECT_VAR_DECL(expected0,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected0,int,8,16) [] = { 0xf0, 0xf8, 0x11, 0x11,
 					 0xf1, 0xf9, 0x11, 0x11,
@@ -47,6 +48,8 @@ VECT_VAR_DECL(expected0,poly,8,16) [] = { 0xf0, 0xf8, 0x55, 0x55,
 					  0xf3, 0xfb, 0x55, 0x55 };
 VECT_VAR_DECL(expected0,poly,16,8) [] = { 0xfff0, 0xfff4, 0x66, 0x66,
 					  0xfff1, 0xfff5, 0x66, 0x66 };
+VECT_VAR_DECL(expected0,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					    0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected0,hfloat,32,4) [] = { 0xc1800000, 0xc1600000,
 					    0x42073333, 0x42073333 };
 
@@ -67,6 +70,7 @@ VECT_VAR_DECL(expected1,poly,8,8) [] = { 0xf2, 0xf6, 0x55, 0x55,
 					 0xf3, 0xf7, 0x55, 0x55 };
 VECT_VAR_DECL(expected1,poly,16,4) [] = { 0xfff1, 0xfff3,
 					  0x66, 0x66 };
+VECT_VAR_DECL(expected1,hfloat,16,4) [] = { 0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected1,hfloat,32,2) [] = { 0x42066666, 0x42066666 };
 VECT_VAR_DECL(expected1,int,8,16) [] = { 0xf4, 0xfc, 0x11, 0x11,
 					 0xf5, 0xfd, 0x11, 0x11,
@@ -94,6 +98,8 @@ VECT_VAR_DECL(expected1,poly,8,16) [] = { 0xf4, 0xfc, 0x55, 0x55,
 					  0xf7, 0xff, 0x55, 0x55 };
 VECT_VAR_DECL(expected1,poly,16,8) [] = { 0xfff2, 0xfff6, 0x66, 0x66,
 					  0xfff3, 0xfff7, 0x66, 0x66 };
+VECT_VAR_DECL(expected1,hfloat,16,8) [] = { 0x3333, 0x3333, 0x3333, 0x3333,
+					    0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected1,hfloat,32,4) [] = { 0xc1700000, 0xc1500000,
 					    0x42073333, 0x42073333 };
 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 11/14][fold-const.c] Fix bigendian HFmode in native_interpret_real
  2015-04-22 16:54 [PATCH 0/14][ARM/AArch64] __FP16 support, vectors, intrinsics, testsuite Alan Lawrence
                   ` (9 preceding siblings ...)
  2015-04-22 17:21 ` [PATCH 10/14][AArch64] Add vcvt(_high)?_f32_f16 intrinsics Alan Lawrence
@ 2015-04-22 17:31 ` Alan Lawrence
  2015-04-22 17:31 ` [PATCH 12/14][ARM/AArch64 Testsuite] Update advsimd-intrinsics tests to add float16 vectors Alan Lawrence
                   ` (3 subsequent siblings)
  14 siblings, 0 replies; 28+ messages in thread
From: Alan Lawrence @ 2015-04-22 17:31 UTC (permalink / raw)
  To: gcc-patches; +Cc: Jakub Jelinek

[-- Attachment #1: Type: text/plain, Size: 864 bytes --]

native_interpret_real in fold-const.c assumes that floats are at least 32 bits 
wide (on bigendian targets with UNITS_PER_WORD >= 4). This patch relaxes that 
assumption (allowing e.g. 16-bit HFmode values).
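
To see the effect concretely, here is a minimal standalone sketch (not part of 
the patch; the loop is simplified, and the bitpos-based word indexing that the 
real function folds into the array access is omitted) comparing the old and new 
offset computations for a hypothetical 2-byte value on a bigendian target:

#include <stdio.h>

#define MIN(X, Y) ((X) < (Y) ? (X) : (Y))

int
main (void)
{
  int total_bytes = 2;	/* e.g. HFmode, GET_MODE_SIZE == 2.  */
  for (int byte = 0; byte < total_bytes; byte++)
    {
      int old_offset = 3 - byte;	/* Assumed a value of >= 4 bytes.  */
      int new_offset = MIN (3, total_bytes - 1) - byte; /* Clamped to size.  */
      printf ("byte %d: old reads ptr[%d], new reads ptr[%d]\n",
	      byte, old_offset, new_offset);
    }
  return 0;
}

This prints "byte 0: old reads ptr[3], new reads ptr[1]" and "byte 1: old reads 
ptr[2], new reads ptr[0]": the old computation picked up bytes beyond the 
2-byte value, whereas the new one byte-swaps within it.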

On aarch64_be-none-elf, this fixes the float16x4_t variant of 
gcc.target/aarch64/advsimd-intrinsics/vcreate.c added in the next patch in series.

Bootstrapped + check-gcc on:
x86_64-unknown-linux-gnu
powerpc64-unknown-linux-gnu (gcc110 on the compile farm, which I believe is 
bigendian, as opposed to powerpc64le-unknown-linux-gnu on gcc112)
arm-none-linux-gnueabihf
aarch64-none-elf

It's not clear that any of those actually test this code, but the 
aarch64_be-none-elf tests in the next patch definitely do ;).

gcc/ChangeLog:

	* fold-const.c (native_interpret_real): Correctly read floats of less
	than 32 bits on bigendian targets.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 11_fold_const_bigendian_hfmode.patch --]
[-- Type: text/x-patch; name=11_fold_const_bigendian_hfmode.patch, Size: 787 bytes --]

commit f8ad02fecdb7b6f91bab77cc154a246bd719ac20
Author: Alan Lawrence <alan.lawrence@arm.com>
Date:   Thu Apr 9 10:54:40 2015 +0100

    Fix native_interpret_real for HFmode floats on Bigendian with UNITS_PER_WORD>=4
    
    (with missing space)

diff --git a/gcc/fold-const.c b/gcc/fold-const.c
index 6d085b1..52bc8e9 100644
--- a/gcc/fold-const.c
+++ b/gcc/fold-const.c
@@ -7625,7 +7625,7 @@ native_interpret_real (tree type, const unsigned char *ptr, int len)
 	    offset += byte % UNITS_PER_WORD;
 	}
       else
-	offset = BYTES_BIG_ENDIAN ? 3 - byte : byte;
+	offset = BYTES_BIG_ENDIAN ? MIN (3, total_bytes - 1) - byte : byte;
       value = ptr[offset + ((bitpos / BITS_PER_UNIT) & ~3)];
 
       tmp[bitpos / 32] |= (unsigned long)value << (bitpos & 31);

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 13/14][ARM/AArch64 testsuite] Use gcc-dg-runtest in advsimd-intrinsics.exp
  2015-04-22 16:54 [PATCH 0/14][ARM/AArch64] __FP16 support, vectors, intrinsics, testsuite Alan Lawrence
                   ` (11 preceding siblings ...)
  2015-04-22 17:31 ` [PATCH 12/14][ARM/AArch64 Testsuite] Update advsimd-intrinsics tests to add float16 vectors Alan Lawrence
@ 2015-04-22 17:36 ` Alan Lawrence
  2015-05-25 12:43   ` Christophe Lyon
  2015-04-22 17:38 ` [PATCH 14/14][ARM/AArch64 testsuite] Test float16_t vcvt_* intrinsics Alan Lawrence
  2015-05-08 12:56 ` [PATCH 0/14][ARM/AArch64] __FP16 support, vectors, intrinsics, testsuite Alan Lawrence
  14 siblings, 1 reply; 28+ messages in thread
From: Alan Lawrence @ 2015-04-22 17:36 UTC (permalink / raw)
  To: gcc-patches; +Cc: Christophe Lyon

[-- Attachment #1: Type: text/plain, Size: 1883 bytes --]

In the first revision of Christophe Lyon's advsimd-intrinsics tests, 
https://gcc.gnu.org/ml/gcc-patches/2014-06/msg00532.html , both gcc-dg-runtest 
(to assemble only) and c-torture-execute were used. During review the 
gcc-dg-runtest part was dropped, and execution tests continued using 
c-torture-execute. However, c-torture-execute ignores e.g. dg-options 
directives in the individual test files, whereas gcc-dg-runtest honours them.
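
For example, a per-file directive such as the one patch 14 adds (reproduced 
here for illustration) takes effect under gcc-dg-runtest, but is ignored when 
the file is run through c-torture-execute:

/* { dg-additional-options "-mfpu=neon-fp16" { target { arm*-*-* } } } */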

This patch switches to gcc-dg-runtest (with dg-do-what-default = "run") for all 
tests, allowing use of e.g. dg-options (in testsuite patch 3/3). This generally 
seems to work OK - indeed I also dropped the parallelization-disabling code - 
and the new advsimd-intrinsics.exp now follows gcc.c-torture/compile/compile.exp 
and gcc.c-torture/execute/execute.exp very closely. However, there are side 
effects we should be aware of, though I think we can live with them. 
Specifically:

(1) the lines in the test log change from...

PASS: gcc.target/aarch64/advsimd-intrinsics/vcombine.c compilation,  -O1
PASS: gcc.target/aarch64/advsimd-intrinsics/vcombine.c execution,  -O1

...to...

PASS: gcc.target/aarch64/advsimd-intrinsics/vcombine.c   -O1  execution test

(that is, the compilation line disappears, but the (test for excess errors) 
remains unchanged)

(2) The "-Og -g" variant is no longer tested; all of -O0, -O1, -O2, -O2 -flto 
-fno-use-linker-plugin -flto-partition=none, -O2 -flto -fuse-linker-plugin 
-fno-fat-lto-objects, -O3 -fomit-frame-pointer, -O3 -g, -Os still are. My 
feeling is that this set of options is exhaustive enough.

Cross-tested arm-none-eabi, aarch64-none-elf, aarch64_be-none-elf; natively 
tested arm-none-linux-gnueabihf and aarch64-none-linux-gnu.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/advsimd-intrinsics/advsimd-intrinsics.exp:
	Use gcc-dg-runtest.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 13_advsimd_gcc-dg-runtest.patch --]
[-- Type: text/x-patch; name=13_advsimd_gcc-dg-runtest.patch, Size: 1878 bytes --]

diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/advsimd-intrinsics.exp b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/advsimd-intrinsics.exp
index 551299ef5634b7fea68d2e2f813ab61270b59e35..aa5d4d5ec2f45fa745fac152125e1badc2f2df43 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/advsimd-intrinsics.exp
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/advsimd-intrinsics.exp
@@ -26,10 +26,6 @@ if {![istarget arm*-*-*]
 load_lib gcc-dg.exp
 
 # Initialize `dg'.
-load_lib c-torture.exp
-load_lib target-supports.exp
-load_lib torture-options.exp
-
 dg-init
 
 if {[istarget arm*-*-*]
@@ -37,29 +33,14 @@ if {[istarget arm*-*-*]
   return
 }
 
-torture-init
-set-torture-options $C_TORTURE_OPTIONS {{}} $LTO_TORTURE_OPTIONS
-
 # Make sure Neon flags are provided, if necessary.
-set additional_flags [add_options_for_arm_neon ""]
+set additional_flags [add_options_for_arm_neon "-w"]
 
 # Main loop.
-foreach src [lsort [glob -nocomplain $srcdir/$subdir/*.c]] {
-    # If we're only testing specific files and this isn't one of them, skip it.
-    if ![runtest_file_p $runtests $src] then {
-	continue
-    }
-
-    # runtest_file_p is already run above, and the code below can run
-    # runtest_file_p again, make sure everything for this test is
-    # performed if the above runtest_file_p decided this runtest
-    # instance should execute the test
-    gcc_parallel_test_enable 0
-    c-torture-execute $src $additional_flags
-    gcc-dg-runtest $src "" $additional_flags
-    gcc_parallel_test_enable 1
-}
+set saved-dg-do-what-default ${dg-do-what-default}
+set dg-do-what-default "run"
+gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.\[cS\]]] "" ${additional_flags}
+set dg-do-what-default ${saved-dg-do-what-default}
 
 # All done.
-torture-finish
 dg-finish

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 14/14][ARM/AArch64 testsuite] Test float16_t vcvt_* intrinsics
  2015-04-22 16:54 [PATCH 0/14][ARM/AArch64] __FP16 support, vectors, intrinsics, testsuite Alan Lawrence
                   ` (12 preceding siblings ...)
  2015-04-22 17:36 ` [PATCH 13/14][ARM/AArch64 testsuite] Use gcc-dg-runtest in advsimd-intrinsics.exp Alan Lawrence
@ 2015-04-22 17:38 ` Alan Lawrence
  2015-05-25 13:31   ` Christophe Lyon
  2015-05-08 12:56 ` [PATCH 0/14][ARM/AArch64] __FP16 support, vectors, intrinsics, testsuite Alan Lawrence
  14 siblings, 1 reply; 28+ messages in thread
From: Alan Lawrence @ 2015-04-22 17:38 UTC (permalink / raw)
  To: gcc-patches; +Cc: Christophe Lyon

[-- Attachment #1: Type: text/plain, Size: 790 bytes --]

This adds a test of vcvt_f32_f16 and vcvt_f16_f32, and also of 
vcvt_high_f32_f16 and vcvt_high_f16_f32.
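
For reference, the four conversion intrinsics exercised here have the following 
shapes (per my reading of the ACLE spec; the vcvt_high forms consume or produce 
the high half of a 128-bit vector):

float32x4_t vcvt_f32_f16 (float16x4_t __a);
float16x4_t vcvt_f16_f32 (float32x4_t __a);
float32x4_t vcvt_high_f32_f16 (float16x8_t __a);                    /* AArch64 only.  */
float16x8_t vcvt_high_f16_f32 (float16x4_t __low, float32x4_t __a); /* AArch64 only.  */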

On ARM, we pass the additional option -mfpu=neon-fp16 to the compiler (which 
becomes possible following patch 2/3). The compiler is already receiving an 
option such as -mfpu=neon or -mfpu=crypto-neon-fp-armv8, but passing neon-fp16 
as well as either of those appears to do no harm, and turns on the superset of 
all the -mfpu options, as desired.

On AArch64, we additionally test vcvt_high_f32_f16 and vcvt_high_f16_f32; these 
are not tested on ARM as the relevant intrinsics do not exist in 32-bit state.

Passing on aarch64_be-none-elf, aarch64-none-elf, arm-none-linux-gnueabi, 
aarch64-none-linux-gnu.

gcc/testsuite/ChangeLog:
	* gcc.target/aarch64/advsimd-intrinsics/vcvt_f16.c: New.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 14_advsimd_vcvt.patch --]
[-- Type: text/x-patch; name=14_advsimd_vcvt.patch, Size: 3432 bytes --]

diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvt_f16.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvt_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..a346b3d72e13d5b2028de5ae7b88f910dcb3f862
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvt_f16.c
@@ -0,0 +1,96 @@
+/* { dg-additional-options "-mfpu=neon-fp16" { target { arm*-*-* } } } */
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+#include <math.h>
+
+/* Expected results for vcvt.  */
+VECT_VAR_DECL (expected,hfloat,32,4) [] = { 0x41800000, 0x41700000,
+					    0x41600000, 0x41500000 };
+VECT_VAR_DECL (expected,hfloat,16,4) [] = { 0x3e00, 0x4100, 0x4300, 0x4480 };
+
+/* Expected results for vcvt_high_f32_f16.  */
+VECT_VAR_DECL (expected_high,hfloat,32,4) [] = { 0xc1400000, 0xc1300000,
+						 0xc1200000, 0xc1100000 };
+/* Expected results for vcvt_high_f16_f32.  */
+VECT_VAR_DECL (expected_high,hfloat,16,8) [] = { 0x4000, 0x4000, 0x4000, 0x4000,
+						 0xcc00, 0xcb80, 0xcb00, 0xca80 };
+
+void
+exec_vcvt (void)
+{
+#define TEST_MSG vcvt_f32_f16
+  {
+    VECT_VAR_DECL (buffer_src, float, 16, 4) [] = { 16.0, 15.0, 14.0, 13.0 };
+
+    DECL_VARIABLE (vector_src, float, 16, 4);
+
+    VLOAD (vector_src, buffer_src, , float, f, 16, 4);
+    DECL_VARIABLE (vector_res, float, 32, 4) =
+	vcvt_f32_f16 (VECT_VAR (vector_src, float, 16, 4));
+    vst1q_f32 (VECT_VAR (result, float, 32, 4),
+	       VECT_VAR (vector_res, float, 32, 4));
+
+    CHECK_FP (TEST_MSG, float, 32, 4, PRIx32, expected, "");
+  }
+#undef TEST_MSG
+
+  clean_results ();
+
+#define TEST_MSG vcvt_f16_f32
+  {
+    VECT_VAR_DECL (buffer_src, float, 32, 4) [] = { 1.5, 2.5, 3.5, 4.5 };
+    DECL_VARIABLE (vector_src, float, 32, 4);
+
+    VLOAD (vector_src, buffer_src, q, float, f, 32, 4);
+    DECL_VARIABLE (vector_res, float, 16, 4) =
+      vcvt_f16_f32 (VECT_VAR (vector_src, float, 32, 4));
+    vst1_f16 (VECT_VAR (result, float, 16, 4),
+	      VECT_VAR (vector_res, float, 16, 4));
+
+    CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected, "");
+  }
+#undef TEST_MSG
+
+#ifdef __ARM_64BIT_STATE
+  clean_results ();
+
+#define TEST_MSG "vcvt_high_f32_f16"
+  {
+    DECL_VARIABLE (vector_src, float, 16, 8);
+    VLOAD (vector_src, buffer, q, float, f, 16, 8);
+    DECL_VARIABLE (vector_res, float, 32, 4);
+    VECT_VAR (vector_res, float, 32, 4) =
+      vcvt_high_f32_f16 (VECT_VAR (vector_src, float, 16, 8));
+    vst1q_f32 (VECT_VAR (result, float, 32, 4),
+	       VECT_VAR (vector_res, float, 32, 4));
+    CHECK_FP (TEST_MSG, float, 32, 4, PRIx32, expected_high, "");
+  }
+#undef TEST_MSG
+  clean_results ();
+
+#define TEST_MSG "vcvt_high_f16_f32"
+  {
+    DECL_VARIABLE (vector_low, float, 16, 4);
+    VDUP (vector_low, , float, f, 16, 4, 2.0);
+
+    DECL_VARIABLE (vector_src, float, 32, 4);
+    VLOAD (vector_src, buffer, q, float, f, 32, 4);
+
+    DECL_VARIABLE (vector_res, float, 16, 8) =
+      vcvt_high_f16_f32 (VECT_VAR (vector_low, float, 16, 4),
+			 VECT_VAR (vector_src, float, 32, 4));
+    vst1q_f16 (VECT_VAR (result, float, 16, 8),
+	       VECT_VAR (vector_res, float, 16, 8));
+
+    CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_high, "");
+  }
+#endif
+}
+
+int
+main (void)
+{
+  exec_vcvt ();
+  return 0;
+}

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 7/14][AArch64] vld{2,3,4}{,_lane,_dup},vcombine,vcreate
  2015-04-22 17:14 ` [PATCH 7/14][AArch64] vld{2,3,4}{,_lane,_dup},vcombine,vcreate Alan Lawrence
@ 2015-04-22 17:40   ` Alan Lawrence
  0 siblings, 0 replies; 28+ messages in thread
From: Alan Lawrence @ 2015-04-22 17:40 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1257 bytes --]

Alan Lawrence wrote:
> gcc/ChangeLog:
> 
> 	* config/aarch64/aarch64.c (aarch64_split_simd_combine): Add V4HFmode.
> 	* config/aarch64/aarch64-builtins.c (VAR13, VAR14): New.
> 	(aarch64_scalar_builtin_types, aarch64_init_simd_builtin_scalar_types):
> 	Add __builtin_aarch64_simd_hf.
> 	* config/aarch64/arm_neon.h (float16x4x2_t, float16x8x2_t,
> 	float16x4x3_t, float16x8x3_t, float16x4x4_t, float16x8x4_t,
> 	vcombine_f16, vst2_lane_f16, vst2q_lane_f16, vst3_lane_f16,
> 	vst3q_lane_f16, vst4_lane_f16, vst4q_lane_f16, vld2_f16, vld2q_f16,
> 	vld3_f16, vld3q_f16, vld4_f16, vld4q_f16, vld2_dup_f16, vld2q_dup_f16,
> 	vld3_dup_f16, vld3q_dup_f16, vld4_dup_f16, vld4q_dup_f16,
> 	vld2_lane_f16, vld2q_lane_f16, vld3_lane_f16, vld3q_lane_f16,
> 	vld4_lane_f16, vld4q_lane_f16, vst2_f16, vst2q_f16, vst3_f16,
> 	vst3q_f16, vst4_f16, vst4q_f16, vcreate_f16): New.
> 
> 	* config/aarch64/iterators.md (VALLDIF, Vtype, Vetype, Vbtype,
> 	V_cmp_result, v_cmp_result): Add cases for V4HF and V8HF.
> 	(VDC, Vdbl): Add V4HF.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.target/aarch64/vldN_1.c: Add float16x4_t and float16x8_t cases.
> 	* gcc.target/aarch64/vldN_dup_1.c: Likewise.
> 	* gcc.target/aarch64/vldN_lane_1.c: Likewise.
> 
> 


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 07_aarch64_vldst_combinecreate.patch --]
[-- Type: text/x-patch; name=07_aarch64_vldst_combinecreate.patch, Size: 30321 bytes --]

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index ac05c43faf47e2de6b20a976defe2b269f7f1633..e791533f4c9e26a0c0c5ddf78f015f334f1ca2ed 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -314,6 +314,12 @@ aarch64_types_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 #define VAR12(T, N, MAP, A, B, C, D, E, F, G, H, I, J, K, L) \
   VAR11 (T, N, MAP, A, B, C, D, E, F, G, H, I, J, K) \
   VAR1 (T, N, MAP, L)
+#define VAR13(T, N, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M) \
+  VAR12 (T, N, MAP, A, B, C, D, E, F, G, H, I, J, K, L) \
+  VAR1 (T, N, MAP, M)
+#define VAR14(T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N) \
+  VAR13 (T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M) \
+  VAR1 (T, X, MAP, N)
 
 #include "aarch64-builtin-iterators.h"
 
@@ -391,6 +397,7 @@ const char *aarch64_scalar_builtin_types[] = {
   "__builtin_aarch64_simd_qi",
   "__builtin_aarch64_simd_hi",
   "__builtin_aarch64_simd_si",
+  "__builtin_aarch64_simd_hf",
   "__builtin_aarch64_simd_sf",
   "__builtin_aarch64_simd_di",
   "__builtin_aarch64_simd_df",
@@ -678,6 +685,8 @@ aarch64_init_simd_builtin_scalar_types (void)
 					     "__builtin_aarch64_simd_qi");
   (*lang_hooks.types.register_builtin_type) (intHI_type_node,
 					     "__builtin_aarch64_simd_hi");
+  (*lang_hooks.types.register_builtin_type) (aarch64_fp16_type_node,
+					     "__builtin_aarch64_simd_hf");
   (*lang_hooks.types.register_builtin_type) (intSI_type_node,
 					     "__builtin_aarch64_simd_si");
   (*lang_hooks.types.register_builtin_type) (float_type_node,
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 13debc8f94b54873bbe8cfda0a8e00921489d9c3..425bcbdc47e3b08d456b02e823a4b370e0fc6312 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1052,6 +1052,9 @@ aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
 	case V2SImode:
 	  gen = gen_aarch64_simd_combinev2si;
 	  break;
+	case V4HFmode:
+	  gen = gen_aarch64_simd_combinev4hf;
+	  break;
 	case V2SFmode:
 	  gen = gen_aarch64_simd_combinev2sf;
 	  break;
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index dca74025c80eb82f6f56e43ec009ceb0803de6e9..935041297ac6878292c81d5db0d70674def21f03 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -153,6 +153,16 @@ typedef struct uint64x2x2_t
   uint64x2_t val[2];
 } uint64x2x2_t;
 
+typedef struct float16x4x2_t
+{
+  float16x4_t val[2];
+} float16x4x2_t;
+
+typedef struct float16x8x2_t
+{
+  float16x8_t val[2];
+} float16x8x2_t;
+
 typedef struct float32x2x2_t
 {
   float32x2_t val[2];
@@ -273,6 +283,16 @@ typedef struct uint64x2x3_t
   uint64x2_t val[3];
 } uint64x2x3_t;
 
+typedef struct float16x4x3_t
+{
+  float16x4_t val[3];
+} float16x4x3_t;
+
+typedef struct float16x8x3_t
+{
+  float16x8_t val[3];
+} float16x8x3_t;
+
 typedef struct float32x2x3_t
 {
   float32x2_t val[3];
@@ -393,6 +413,16 @@ typedef struct uint64x2x4_t
   uint64x2_t val[4];
 } uint64x2x4_t;
 
+typedef struct float16x4x4_t
+{
+  float16x4_t val[4];
+} float16x4x4_t;
+
+typedef struct float16x8x4_t
+{
+  float16x8_t val[4];
+} float16x8x4_t;
+
 typedef struct float32x2x4_t
 {
   float32x2_t val[4];
@@ -2644,6 +2674,12 @@ vcreate_s64 (uint64_t __a)
   return (int64x1_t) {__a};
 }
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vcreate_f16 (uint64_t __a)
+{
+  return (float16x4_t) __a;
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vcreate_f32 (uint64_t __a)
 {
@@ -4780,6 +4816,12 @@ vcombine_s64 (int64x1_t __a, int64x1_t __b)
   return __builtin_aarch64_combinedi (__a[0], __b[0]);
 }
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vcombine_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_aarch64_combinev4hf (__a, __b);
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vcombine_f32 (float32x2_t __a, float32x2_t __b)
 {
@@ -9910,7 +9952,7 @@ vtstq_p16 (poly16x8_t a, poly16x8_t b)
    +------+----+----+----+----+
    |uint  | Y  | Y  | N  | N  |
    +------+----+----+----+----+
-   |float | -  | -  | N  | N  |
+   |float | -  | Y  | N  | N  |
    +------+----+----+----+----+
    |poly  | Y  | Y  | -  | -  |
    +------+----+----+----+----+
@@ -9924,7 +9966,7 @@ vtstq_p16 (poly16x8_t a, poly16x8_t b)
    +------+----+----+----+----+
    |uint  | Y  | Y  | Y  | Y  |
    +------+----+----+----+----+
-   |float | -  | -  | Y  | Y  |
+   |float | -  | Y  | Y  | Y  |
    +------+----+----+----+----+
    |poly  | Y  | Y  | -  | -  |
    +------+----+----+----+----+
@@ -9938,7 +9980,7 @@ vtstq_p16 (poly16x8_t a, poly16x8_t b)
    +------+----+----+----+----+
    |uint  | Y  | N  | N  | Y  |
    +------+----+----+----+----+
-   |float | -  | -  | N  | Y  |
+   |float | -  | N  | N  | Y  |
    +------+----+----+----+----+
    |poly  | Y  | N  | -  | -  |
    +------+----+----+----+----+
@@ -9954,6 +9996,7 @@ __STRUCTN (int, 8, 2)
 __STRUCTN (int, 16, 2)
 __STRUCTN (uint, 8, 2)
 __STRUCTN (uint, 16, 2)
+__STRUCTN (float, 16, 2)
 __STRUCTN (poly, 8, 2)
 __STRUCTN (poly, 16, 2)
 /* 3-element structs.  */
@@ -9965,6 +10008,7 @@ __STRUCTN (uint, 8, 3)
 __STRUCTN (uint, 16, 3)
 __STRUCTN (uint, 32, 3)
 __STRUCTN (uint, 64, 3)
+__STRUCTN (float, 16, 3)
 __STRUCTN (float, 32, 3)
 __STRUCTN (float, 64, 3)
 __STRUCTN (poly, 8, 3)
@@ -10002,6 +10046,8 @@ vst2_lane_ ## funcsuffix (ptrtype *__ptr,				     \
 				     __ptr, __o, __c);			     \
 }
 
+__ST2_LANE_FUNC (float16x4x2_t, float16x8x2_t, float16_t, v8hf, hf, f16,
+		 float16x8_t)
 __ST2_LANE_FUNC (float32x2x2_t, float32x4x2_t, float32_t, v4sf, sf, f32,
 		 float32x4_t)
 __ST2_LANE_FUNC (float64x1x2_t, float64x2x2_t, float64_t, v2df, df, f64,
@@ -10034,6 +10080,7 @@ vst2q_lane_ ## funcsuffix (ptrtype *__ptr,				    \
 				    __ptr, __temp.__o, __c);		    \
 }
 
+__ST2_LANE_FUNC (float16x8x2_t, float16_t, v8hf, hf, f16)
 __ST2_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32)
 __ST2_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64)
 __ST2_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8)
@@ -10075,6 +10122,8 @@ vst3_lane_ ## funcsuffix (ptrtype *__ptr,				     \
 				     __ptr, __o, __c);			     \
 }
 
+__ST3_LANE_FUNC (float16x4x3_t, float16x8x3_t, float16_t, v8hf, hf, f16,
+		 float16x8_t)
 __ST3_LANE_FUNC (float32x2x3_t, float32x4x3_t, float32_t, v4sf, sf, f32,
 		 float32x4_t)
 __ST3_LANE_FUNC (float64x1x3_t, float64x2x3_t, float64_t, v2df, df, f64,
@@ -10107,6 +10156,7 @@ vst3q_lane_ ## funcsuffix (ptrtype *__ptr,				    \
 				    __ptr, __temp.__o, __c);		    \
 }
 
+__ST3_LANE_FUNC (float16x8x3_t, float16_t, v8hf, hf, f16)
 __ST3_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32)
 __ST3_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64)
 __ST3_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8)
@@ -10153,6 +10203,8 @@ vst4_lane_ ## funcsuffix (ptrtype *__ptr,				     \
 				     __ptr, __o, __c);			     \
 }
 
+__ST4_LANE_FUNC (float16x4x4_t, float16x8x4_t, float16_t, v8hf, hf, f16,
+		 float16x8_t)
 __ST4_LANE_FUNC (float32x2x4_t, float32x4x4_t, float32_t, v4sf, sf, f32,
 		 float32x4_t)
 __ST4_LANE_FUNC (float64x1x4_t, float64x2x4_t, float64_t, v2df, df, f64,
@@ -10185,6 +10237,7 @@ vst4q_lane_ ## funcsuffix (ptrtype *__ptr,				    \
 				    __ptr, __temp.__o, __c);		    \
 }
 
+__ST4_LANE_FUNC (float16x8x4_t, float16_t, v8hf, hf, f16)
 __ST4_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32)
 __ST4_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64)
 __ST4_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8)
@@ -15241,6 +15294,17 @@ vld2_u32 (const uint32_t * __a)
   return ret;
 }
 
+__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
+vld2_f16 (const float16_t * __a)
+{
+  float16x4x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2v4hf (__a);
+  ret.val[0] = __builtin_aarch64_get_dregoiv4hf (__o, 0);
+  ret.val[1] = __builtin_aarch64_get_dregoiv4hf (__o, 1);
+  return ret;
+}
+
 __extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
 vld2_f32 (const float32_t * __a)
 {
@@ -15362,6 +15426,17 @@ vld2q_u64 (const uint64_t * __a)
   return ret;
 }
 
+__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__))
+vld2q_f16 (const float16_t * __a)
+{
+  float16x8x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2v8hf (__a);
+  ret.val[0] = __builtin_aarch64_get_qregoiv8hf (__o, 0);
+  ret.val[1] = __builtin_aarch64_get_qregoiv8hf (__o, 1);
+  return ret;
+}
+
 __extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
 vld2q_f32 (const float32_t * __a)
 {
@@ -15516,6 +15591,18 @@ vld3_u32 (const uint32_t * __a)
   return ret;
 }
 
+__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__))
+vld3_f16 (const float16_t * __a)
+{
+  float16x4x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3v4hf (__a);
+  ret.val[0] = __builtin_aarch64_get_dregciv4hf (__o, 0);
+  ret.val[1] = __builtin_aarch64_get_dregciv4hf (__o, 1);
+  ret.val[2] = __builtin_aarch64_get_dregciv4hf (__o, 2);
+  return ret;
+}
+
 __extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
 vld3_f32 (const float32_t * __a)
 {
@@ -15648,6 +15735,18 @@ vld3q_u64 (const uint64_t * __a)
   return ret;
 }
 
+__extension__ static __inline float16x8x3_t __attribute__ ((__always_inline__))
+vld3q_f16 (const float16_t * __a)
+{
+  float16x8x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3v8hf (__a);
+  ret.val[0] = __builtin_aarch64_get_qregciv8hf (__o, 0);
+  ret.val[1] = __builtin_aarch64_get_qregciv8hf (__o, 1);
+  ret.val[2] = __builtin_aarch64_get_qregciv8hf (__o, 2);
+  return ret;
+}
+
 __extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
 vld3q_f32 (const float32_t * __a)
 {
@@ -15815,6 +15914,19 @@ vld4_u32 (const uint32_t * __a)
   return ret;
 }
 
+__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__))
+vld4_f16 (const float16_t * __a)
+{
+  float16x4x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4v4hf (__a);
+  ret.val[0] = __builtin_aarch64_get_dregxiv4hf (__o, 0);
+  ret.val[1] = __builtin_aarch64_get_dregxiv4hf (__o, 1);
+  ret.val[2] = __builtin_aarch64_get_dregxiv4hf (__o, 2);
+  ret.val[3] = __builtin_aarch64_get_dregxiv4hf (__o, 3);
+  return ret;
+}
+
 __extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
 vld4_f32 (const float32_t * __a)
 {
@@ -15958,6 +16070,19 @@ vld4q_u64 (const uint64_t * __a)
   return ret;
 }
 
+__extension__ static __inline float16x8x4_t __attribute__ ((__always_inline__))
+vld4q_f16 (const float16_t * __a)
+{
+  float16x8x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4v8hf (__a);
+  ret.val[0] = __builtin_aarch64_get_qregxiv8hf (__o, 0);
+  ret.val[1] = __builtin_aarch64_get_qregxiv8hf (__o, 1);
+  ret.val[2] = __builtin_aarch64_get_qregxiv8hf (__o, 2);
+  ret.val[3] = __builtin_aarch64_get_qregxiv8hf (__o, 3);
+  return ret;
+}
+
 __extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
 vld4q_f32 (const float32_t * __a)
 {
@@ -16019,6 +16144,18 @@ vld2_dup_s32 (const int32_t * __a)
   return ret;
 }
 
+
+__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
+vld2_dup_f16 (const float16_t * __a)
+{
+  float16x4x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rv4hf ((const __builtin_aarch64_simd_hf *) __a);
+  ret.val[0] = __builtin_aarch64_get_dregoiv4hf (__o, 0);
+  ret.val[1] = __builtin_aarch64_get_dregoiv4hf (__o, 1);
+  return ret;
+}
+
 __extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
 vld2_dup_f32 (const float32_t * __a)
 {
@@ -16228,6 +16365,17 @@ vld2q_dup_u64 (const uint64_t * __a)
   return ret;
 }
 
+__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__))
+vld2q_dup_f16 (const float16_t * __a)
+{
+  float16x8x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rv8hf ((const __builtin_aarch64_simd_hf *) __a);
+  ret.val[0] = __builtin_aarch64_get_qregoiv8hf (__o, 0);
+  ret.val[1] = __builtin_aarch64_get_qregoiv8hf (__o, 1);
+  return ret;
+}
+
 __extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
 vld2q_dup_f32 (const float32_t * __a)
 {
@@ -16382,6 +16530,18 @@ vld3_dup_u32 (const uint32_t * __a)
   return ret;
 }
 
+__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__))
+vld3_dup_f16 (const float16_t * __a)
+{
+  float16x4x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rv4hf ((const __builtin_aarch64_simd_hf *) __a);
+  ret.val[0] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 0);
+  ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 1);
+  ret.val[2] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 2);
+  return ret;
+}
+
 __extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
 vld3_dup_f32 (const float32_t * __a)
 {
@@ -16514,6 +16674,18 @@ vld3q_dup_u64 (const uint64_t * __a)
   return ret;
 }
 
+__extension__ static __inline float16x8x3_t __attribute__ ((__always_inline__))
+vld3q_dup_f16 (const float16_t * __a)
+{
+  float16x8x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rv8hf ((const __builtin_aarch64_simd_hf *) __a);
+  ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 0);
+  ret.val[1] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 1);
+  ret.val[2] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 2);
+  return ret;
+}
+
 __extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
 vld3q_dup_f32 (const float32_t * __a)
 {
@@ -16681,6 +16853,19 @@ vld4_dup_u32 (const uint32_t * __a)
   return ret;
 }
 
+__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__))
+vld4_dup_f16 (const float16_t * __a)
+{
+  float16x4x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rv4hf ((const __builtin_aarch64_simd_hf *) __a);
+  ret.val[0] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 0);
+  ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 1);
+  ret.val[2] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 2);
+  ret.val[3] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 3);
+  return ret;
+}
+
 __extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
 vld4_dup_f32 (const float32_t * __a)
 {
@@ -16824,6 +17009,19 @@ vld4q_dup_u64 (const uint64_t * __a)
   return ret;
 }
 
+__extension__ static __inline float16x8x4_t __attribute__ ((__always_inline__))
+vld4q_dup_f16 (const float16_t * __a)
+{
+  float16x8x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rv8hf ((const __builtin_aarch64_simd_hf *) __a);
+  ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 0);
+  ret.val[1] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 1);
+  ret.val[2] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 2);
+  ret.val[3] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 3);
+  return ret;
+}
+
 __extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
 vld4q_dup_f32 (const float32_t * __a)
 {
@@ -16876,6 +17074,8 @@ vld2_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c)  \
   return __b;								   \
 }
 
+__LD2_LANE_FUNC (float16x4x2_t, float16x4_t, float16x8x2_t, float16_t, v8hf,
+		 hf, f16, float16x8_t)
 __LD2_LANE_FUNC (float32x2x2_t, float32x2_t, float32x4x2_t, float32_t, v4sf,
 		 sf, f32, float32x4_t)
 __LD2_LANE_FUNC (float64x1x2_t, float64x1_t, float64x2x2_t, float64_t, v2df,
@@ -16920,6 +17120,7 @@ vld2q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
   return ret;								   \
 }
 
+__LD2_LANE_FUNC (float16x8x2_t, float16x8_t, float16_t, v8hf, hf, f16)
 __LD2_LANE_FUNC (float32x4x2_t, float32x4_t, float32_t, v4sf, sf, f32)
 __LD2_LANE_FUNC (float64x2x2_t, float64x2_t, float64_t, v2df, df, f64)
 __LD2_LANE_FUNC (poly8x16x2_t, poly8x16_t, poly8_t, v16qi, qi, p8)
@@ -16967,6 +17168,8 @@ vld3_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c)  \
   return __b;								   \
 }
 
+__LD3_LANE_FUNC (float16x4x3_t, float16x4_t, float16x8x3_t, float16_t, v8hf,
+		 hf, f16, float16x8_t)
 __LD3_LANE_FUNC (float32x2x3_t, float32x2_t, float32x4x3_t, float32_t, v4sf,
 		 sf, f32, float32x4_t)
 __LD3_LANE_FUNC (float64x1x3_t, float64x1_t, float64x2x3_t, float64_t, v2df,
@@ -17013,6 +17216,7 @@ vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
   return ret;								   \
 }
 
+__LD3_LANE_FUNC (float16x8x3_t, float16x8_t, float16_t, v8hf, hf, f16)
 __LD3_LANE_FUNC (float32x4x3_t, float32x4_t, float32_t, v4sf, sf, f32)
 __LD3_LANE_FUNC (float64x2x3_t, float64x2_t, float64_t, v2df, df, f64)
 __LD3_LANE_FUNC (poly8x16x3_t, poly8x16_t, poly8_t, v16qi, qi, p8)
@@ -17068,6 +17272,8 @@ vld4_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c)  \
 
 /* vld4q_lane */
 
+__LD4_LANE_FUNC (float16x4x4_t, float16x4_t, float16x8x4_t, float16_t, v8hf,
+		 hf, f16, float16x8_t)
 __LD4_LANE_FUNC (float32x2x4_t, float32x2_t, float32x4x4_t, float32_t, v4sf,
 		 sf, f32, float32x4_t)
 __LD4_LANE_FUNC (float64x1x4_t, float64x1_t, float64x2x4_t, float64_t, v2df,
@@ -17116,6 +17322,7 @@ vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
   return ret;								   \
 }
 
+__LD4_LANE_FUNC (float16x8x4_t, float16x8_t, float16_t, v8hf, hf, f16)
 __LD4_LANE_FUNC (float32x4x4_t, float32x4_t, float32_t, v4sf, sf, f32)
 __LD4_LANE_FUNC (float64x2x4_t, float64x2_t, float64_t, v2df, df, f64)
 __LD4_LANE_FUNC (poly8x16x4_t, poly8x16_t, poly8_t, v16qi, qi, p8)
@@ -22474,6 +22681,18 @@ vst2_u32 (uint32_t * __a, uint32x2x2_t val)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_f16 (float16_t * __a, float16x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  float16x8x2_t temp;
+  temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[1], 1);
+  __builtin_aarch64_st2v4hf (__a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst2_f32 (float32_t * __a, float32x2x2_t val)
 {
   __builtin_aarch64_simd_oi __o;
@@ -22576,6 +22795,15 @@ vst2q_u64 (uint64_t * __a, uint64x2x2_t val)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_f16 (float16_t * __a, float16x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[1], 1);
+  __builtin_aarch64_st2v8hf (__a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst2q_f32 (float32_t * __a, float32x4x2_t val)
 {
   __builtin_aarch64_simd_oi __o;
@@ -22748,6 +22976,20 @@ vst3_u32 (uint32_t * __a, uint32x2x3_t val)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_f16 (float16_t * __a, float16x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  float16x8x3_t temp;
+  temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[2], 2);
+  __builtin_aarch64_st3v4hf ((__builtin_aarch64_simd_hf *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst3_f32 (float32_t * __a, float32x2x3_t val)
 {
   __builtin_aarch64_simd_ci __o;
@@ -22862,6 +23104,16 @@ vst3q_u64 (uint64_t * __a, uint64x2x3_t val)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_f16 (float16_t * __a, float16x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[2], 2);
+  __builtin_aarch64_st3v8hf ((__builtin_aarch64_simd_hf *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst3q_f32 (float32_t * __a, float32x4x3_t val)
 {
   __builtin_aarch64_simd_ci __o;
@@ -23058,6 +23310,22 @@ vst4_u32 (uint32_t * __a, uint32x2x4_t val)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_f16 (float16_t * __a, float16x4x4_t val)
+{
+  __builtin_aarch64_simd_xi __o;
+  float16x8x4_t temp;
+  temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[3] = vcombine_f16 (val.val[3], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[3], 3);
+  __builtin_aarch64_st4v4hf ((__builtin_aarch64_simd_hf *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst4_f32 (float32_t * __a, float32x2x4_t val)
 {
   __builtin_aarch64_simd_xi __o;
@@ -23184,6 +23452,17 @@ vst4q_u64 (uint64_t * __a, uint64x2x4_t val)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_f16 (float16_t * __a, float16x8x4_t val)
+{
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[3], 3);
+  __builtin_aarch64_st4v8hf ((__builtin_aarch64_simd_hf *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst4q_f32 (float32_t * __a, float32x4x4_t val)
 {
   __builtin_aarch64_simd_xi __o;
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 3858e9558f4a6a1f52d49d074da6203b73de26b4..deb67f6f16994ef54d04565e92248f6bc1896822 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -113,7 +113,7 @@
 
 ;; All vector modes and DI and DF.
 (define_mode_iterator VALLDIF [V8QI V16QI V4HI V8HI V2SI V4SI
-			       V2DI V2SF V4SF V2DF DI DF])
+			       V2DI V4HF V8HF V2SF V4SF V2DF DI DF])
 
 ;; Vector modes for Integer reduction across lanes.
 (define_mode_iterator VDQV [V8QI V16QI V4HI V8HI V4SI V2DI])
@@ -134,7 +134,7 @@
 (define_mode_iterator VQW [V16QI V8HI V4SI])
 
 ;; Double vector modes for combines.
-(define_mode_iterator VDC [V8QI V4HI V2SI V2SF DI DF])
+(define_mode_iterator VDC [V8QI V4HI V4HF V2SI V2SF DI DF])
 
 ;; Vector modes except double int.
 (define_mode_iterator VDQIF [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF V2DF])
@@ -363,7 +363,8 @@
                          (V2SI "2s") (V4SI  "4s")
                          (DI   "1d") (DF    "1d")
                          (V2DI "2d") (V2SF "2s")
-			 (V4SF "4s") (V2DF "2d")])
+			 (V4SF "4s") (V2DF "2d")
+			 (V4HF "4h") (V8HF "8h")])
 
 (define_mode_attr Vrevsuff [(V4HI "16") (V8HI "16") (V2SI "32")
                             (V4SI "32") (V2DI "64")])
@@ -389,7 +390,8 @@
 (define_mode_attr Vetype [(V8QI "b") (V16QI "b")
 			  (V4HI "h") (V8HI  "h")
                           (V2SI "s") (V4SI  "s")
-			  (V2DI "d") (V2SF  "s")
+			  (V2DI "d") (V4HF "h")
+			  (V8HF "h") (V2SF  "s")
 			  (V4SF "s") (V2DF  "d")
 			  (SF   "s") (DF  "d")
 			  (QI "b")   (HI "h")
@@ -399,7 +401,8 @@
 (define_mode_attr Vbtype [(V8QI "8b")  (V16QI "16b")
 			  (V4HI "8b") (V8HI  "16b")
 			  (V2SI "8b") (V4SI  "16b")
-			  (V2DI "16b") (V2SF  "8b")
+			  (V2DI "16b") (V4HF "8b")
+			  (V8HF "16b") (V2SF  "8b")
 			  (V4SF "16b") (V2DF  "16b")
 			  (DI   "8b")  (DF    "8b")
 			  (SI   "8b")])
@@ -450,6 +453,7 @@
 
 ;; Double modes of vector modes (lower case).
 (define_mode_attr Vdbl [(V8QI "v16qi") (V4HI "v8hi")
+			(V4HF "v8hf")
 			(V2SI "v4si")  (V2SF "v4sf")
 			(SI   "v2si")  (DI   "v2di")
 			(DF   "v2df")])
@@ -524,6 +528,7 @@
 				(V4HI "V4HI") (V8HI  "V8HI")
 				(V2SI "V2SI") (V4SI  "V4SI")
 				(DI   "DI")   (V2DI  "V2DI")
+				(V4HF "V4HI") (V8HF  "V8HI")
 				(V2SF "V2SI") (V4SF  "V4SI")
 				(V2DF "V2DI") (DF    "DI")
 				(SF   "SI")])
@@ -533,6 +538,7 @@
 				(V4HI "v4hi") (V8HI  "v8hi")
 				(V2SI "v2si") (V4SI  "v4si")
 				(DI   "di")   (V2DI  "v2di")
+				(V4HF "v4hi") (V8HF  "v8hi")
 				(V2SF "v2si") (V4SF  "v4si")
 				(V2DF "v2di") (DF    "di")
 				(SF   "si")])
diff --git a/gcc/testsuite/gcc.target/aarch64/vldN_1.c b/gcc/testsuite/gcc.target/aarch64/vldN_1.c
index b64de16a1658da166175288bc1378a57d220801f..caac94f86cef77eccd0cdfd6ace846058bf2e25b 100644
--- a/gcc/testsuite/gcc.target/aarch64/vldN_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/vldN_1.c
@@ -39,6 +39,7 @@ VARIANT (int32, 2, STRUCT, _s32)	\
 VARIANT (int64, 1, STRUCT, _s64)	\
 VARIANT (poly8, 8, STRUCT, _p8)		\
 VARIANT (poly16, 4, STRUCT, _p16)	\
+VARIANT (float16, 4, STRUCT, _f16)	\
 VARIANT (float32, 2, STRUCT, _f32)	\
 VARIANT (float64, 1, STRUCT, _f64)	\
 VARIANT (uint8, 16, STRUCT, q_u8)	\
@@ -51,6 +52,7 @@ VARIANT (int32, 4, STRUCT, q_s32)	\
 VARIANT (int64, 2, STRUCT, q_s64)	\
 VARIANT (poly8, 16, STRUCT, q_p8)	\
 VARIANT (poly16, 8, STRUCT, q_p16)	\
+VARIANT (float16, 8, STRUCT, q_f16)	\
 VARIANT (float32, 4, STRUCT, q_f32)	\
 VARIANT (float64, 2, STRUCT, q_f64)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/vldN_dup_1.c b/gcc/testsuite/gcc.target/aarch64/vldN_dup_1.c
index 9af0565d617b027f7b2551bb27b6d75fe4c81d5d..68c3fc34f5a037255be95334126a63feb875196b 100644
--- a/gcc/testsuite/gcc.target/aarch64/vldN_dup_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/vldN_dup_1.c
@@ -16,6 +16,7 @@ VARIANT (int32, , 2, _s32, STRUCT)	\
 VARIANT (int64, , 1, _s64, STRUCT)	\
 VARIANT (poly8, , 8, _p8, STRUCT)	\
 VARIANT (poly16, , 4, _p16, STRUCT)	\
+VARIANT (float16, , 4, _f16, STRUCT)	\
 VARIANT (float32, , 2, _f32, STRUCT)	\
 VARIANT (float64, , 1, _f64, STRUCT)	\
 VARIANT (uint8, q, 16, _u8, STRUCT)	\
@@ -28,6 +29,7 @@ VARIANT (int32, q, 4, _s32, STRUCT)	\
 VARIANT (int64, q, 2, _s64, STRUCT)	\
 VARIANT (poly8, q, 16, _p8, STRUCT)	\
 VARIANT (poly16, q, 8, _p16, STRUCT)	\
+VARIANT (float16, q, 8, _f16, STRUCT)	\
 VARIANT (float32, q, 4, _f32, STRUCT)	\
 VARIANT (float64, q, 2, _f64, STRUCT)
 
@@ -74,6 +76,7 @@ main (int argc, char **argv)
   int64_t *int64_data = (int64_t *)uint64_data;
   poly8_t poly8_data[4] = { 0, 7, 13, 18, };
   poly16_t poly16_data[4] = { 11111, 2222, 333, 44 };
+  float16_t float16_data[4] = { 1.0625, 3.125, 0.03125, 7.75 };
   float32_t float32_data[4] = { 3.14159, 2.718, 1.414, 100.0 };
   float64_t float64_data[4] = { 1.010010001, 12345.6789, -9876.54321, 1.618 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/vldN_lane_1.c b/gcc/testsuite/gcc.target/aarch64/vldN_lane_1.c
index e450b7b2b961db56acf5ef5b88e0dc185e81e754..7adc172eede0c5ff3554e7452aa302761c60f700 100644
--- a/gcc/testsuite/gcc.target/aarch64/vldN_lane_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/vldN_lane_1.c
@@ -16,6 +16,7 @@ VARIANT (int32, , 2, _s32, 0, STRUCT)	\
 VARIANT (int64, , 1, _s64, 0, STRUCT)	\
 VARIANT (poly8, , 8, _p8, 7, STRUCT)	\
 VARIANT (poly16, , 4, _p16, 1, STRUCT)	\
+VARIANT (float16, , 4, _f16, 3, STRUCT)	\
 VARIANT (float32, , 2, _f32, 1, STRUCT)	\
 VARIANT (float64, , 1, _f64, 0, STRUCT)	\
 VARIANT (uint8, q, 16, _u8, 14, STRUCT)	\
@@ -28,6 +29,7 @@ VARIANT (int32, q, 4, _s32, 2, STRUCT)	\
 VARIANT (int64, q, 2, _s64, 1, STRUCT)	\
 VARIANT (poly8, q, 16, _p8, 12, STRUCT)	\
 VARIANT (poly16, q, 8, _p16, 5, STRUCT)	\
+VARIANT (float16, q, 8, _f16, 7, STRUCT)\
 VARIANT (float32, q, 4, _f32, 1, STRUCT)\
 VARIANT (float64, q, 2, _f64, 0, STRUCT)
 
@@ -71,7 +73,7 @@ main (int argc, char **argv)
 {
   /* Original data for all vector formats.  */
   uint64_t orig_data[8] = {0x1234567890abcdefULL, 0x13579bdf02468aceULL,
-			   0x012389ab4567cdefULL, 0xfeeddadacafe0431ULL,
+			   0x012389ab4567cdefULL, 0xdeeddadacafe0431ULL,
 			   0x1032547698badcfeULL, 0xbadbadbadbad0badULL,
 			   0x0102030405060708ULL, 0x0f0e0d0c0b0a0908ULL};
 
@@ -87,6 +89,7 @@ main (int argc, char **argv)
   int64_t *int64_data = (int64_t *)uint64_data;
   poly8_t poly8_data[4] = { 0, 7, 13, 18, };
   poly16_t poly16_data[4] = { 11111, 2222, 333, 44 };
+  float16_t float16_data[4] = { 0.8125, 7.5, 19, 0.046875 };
   float32_t float32_data[4] = { 3.14159, 2.718, 1.414, 100.0 };
   float64_t float64_data[4] = { 1.010010001, 12345.6789, -9876.54321, 1.618 };
 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 5/14][AArch64] Add basic fp16 support
  2015-04-22 17:06 ` [PATCH 5/14][AArch64] Add basic fp16 support Alan Lawrence
@ 2015-04-29 21:22   ` Joseph Myers
  2015-05-08 12:55     ` Alan Lawrence
  0 siblings, 1 reply; 28+ messages in thread
From: Joseph Myers @ 2015-04-29 21:22 UTC (permalink / raw)
  To: Alan Lawrence; +Cc: gcc-patches

On Wed, 22 Apr 2015, Alan Lawrence wrote:

> [Resending with correct in-reply-to header]
> 
> This adds basic support for moving __fp16 values around, passing and
> returning, and operating on them by promoting to 32-bit floats. Also a few
> scalar testcases.

I'd think it would be desirable to share tests between ARM and AArch64 as 
far as possible (where applicable to both - so not the tests for the 
alternative format, and some of the gcc.target/arm/fp16-* tests using 
scan-assembler might need adapting to work for AArch64).
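
For instance, a shared test using scan-assembler might need target-specific
patterns along these lines (illustrative directives, not lines taken from
the series):

  /* { dg-final { scan-assembler "vcvtb\\.f32\\.f16" { target arm*-*-* } } } */
  /* { dg-final { scan-assembler "fcvt" { target aarch64*-*-* } } } */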

To the extent that the ARM implementation follows an old specification and 
AArch64 is following newer ACLE, of course, this might require ARM to be 
updated to follow newer ACLE before tests can be shared.  (For example, 
the older specification implemented for ARM includes double rounding when 
converting from double to fp16, but ACLE specifies single rounding.)
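
To make the difference concrete, here is a sketch (illustrative, not code
from the series; it assumes round-to-nearest-even) of a value on which the
two behaviours diverge:

  /* 1 + 2^-11 + 2^-25 lies just above the tie point between the
     adjacent fp16 values 1.0 and 1.0 + 2^-10.  */
  volatile double d = 1.0 + 0x1p-11 + 0x1p-25;
  __fp16 once  = d;          /* single rounding (ACLE): 1.0 + 2^-10 */
  __fp16 twice = (float) d;  /* double rounding (old ARM spec): the float
                                step lands exactly on the tie, and
                                ties-to-even then rounds down to 1.0 */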

Longer term, I'd hope the aim would be for semantics to follow TS 18661-3 
(DTS ballot recently passed), using the standard name _Float16 (and 
FLT_EVAL_METHOD == 32 as the nearest equivalent of the promotions to 
float).
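
That is, roughly the following sketch of those semantics (illustrative
only; nothing in this series implements it):

  #include <float.h>
  #if FLT_EVAL_METHOD == 32
  /* TS 18661-3: operations on _Float16 are evaluated to the range and
     precision of _Float32, mirroring the promote-to-float behaviour of
     __fp16 arithmetic.  */
  _Float16 a = 1.5f16, b = 2.5f16;
  _Float16 c = a + b;   /* computed in _Float32, converted back on store */
  #endif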

-- 
Joseph S. Myers
joseph@codesourcery.com

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 5/14][AArch64] Add basic fp16 support
  2015-04-29 21:22   ` Joseph Myers
@ 2015-05-08 12:55     ` Alan Lawrence
  0 siblings, 0 replies; 28+ messages in thread
From: Alan Lawrence @ 2015-05-08 12:55 UTC (permalink / raw)
  To: Joseph Myers; +Cc: gcc-patches, Marcus Shawcroft

[-- Attachment #1: Type: text/plain, Size: 951 bytes --]


Joseph Myers wrote:
 >
> I'd think it would be desirable to share tests between ARM and AArch64 as 
> far as possible (where applicable to both - so not the tests for the 
> alternative format, and some of the gcc.target/arm/fp16-* tests using 
> scan-assembler might need adapting to work for AArch64).

I agree the most desirable outcome is for the ACLE spec to be normalized between 
the two architectures! In the meantime this implements the specification that we 
have...

I attach a new patch that adds common ARM / AArch64 tests in 
gcc.target/aarch64/fp16 (i.e. beside the shared 
gcc.target/aarch64/advsimd-intrinsics). I've adapted two of the previous tests 
such that they pass on both ARM and AArch64. I'd like to propose this as a patch 
5a, and to drop those tests from the original patch 5.

I'll follow up with a sort-through of the ARM tests, moving only those that can 
be shared, in due course.

Cheers, Alan

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 05a_common_tests.patch --]
[-- Type: text/x-patch; name=05a_common_tests.patch, Size: 4267 bytes --]

diff --git a/gcc/testsuite/gcc.target/aarch64/fp16/f16_convs_1.c b/gcc/testsuite/gcc.target/aarch64/fp16/f16_convs_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..a1c95fd28d14668c5cfa9cfb419c945878d7ac2b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fp16/f16_convs_1.c
@@ -0,0 +1,34 @@
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+/* { dg-additional-options "-mfp16-format=ieee" {target "arm*-*-*"} } */
+
+extern void abort (void);
+
+#define EPSILON 0.0001
+
+int
+main (int argc, char **argv)
+{
+  float f1 = 3.14159f;
+  float f2 = 2.718f;
+  /* This 'assembler' statement should be portable between ARM and AArch64.  */
+  asm volatile ("" : : : "memory");
+  __fp16 in1 = f1;
+  __fp16 in2 = f2;
+
+  /* Do the addition on __fp16's (implicitly converts both operands to
+     float32, adds, converts back to f16, then we convert back to f32).  */
+  __fp16 res1 = in1 + in2;
+  asm volatile ("" : : : "memory");
+  float f_res_1 = res1;
+
+  /* Do the addition on float32's (we convert both operands to f32, and add,
+     as above, but skip the final conversion f32 -> f16 -> f32).  */
+  float f1a = in1;
+  float f2a = in2;
+  float f_res_2 = f1a + f2a;
+
+  if (__builtin_fabs (f_res_2 - f_res_1) > EPSILON)
+    abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fp16/f16_convs_2.c b/gcc/testsuite/gcc.target/aarch64/fp16/f16_convs_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..6aa3e59c15e0eb85595871b47e8d8aa937cca47e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fp16/f16_convs_2.c
@@ -0,0 +1,33 @@
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+/* { dg-additional-options "-mfp16-format=ieee" {target "arm*-*-*"} } */
+
+extern void abort (void);
+
+#define EPSILON 0.0001
+
+int
+main (int argc, char **argv)
+{
+  int i1 = 3;
+  int i2 = 2;
+  /* This 'assembler' should be portable across ARM and AArch64.  */
+  asm volatile ("" : : : "memory");
+
+  __fp16 in1 = i1;
+  __fp16 in2 = i2;
+
+  /* Do the addition on __fp16's (implicitly converts both operands to
+     float32, adds, converts back to f16, then we convert to int).  */
+  __fp16 res1 = in1 + in2;
+  asm volatile ("" : : : "memory");
+  int result1 = res1;
+
+  /* Do the addition on int's (we convert both operands directly to int, add,
+     and we're done).  */
+  int result2 = ((int) in1) + ((int) in2);
+
+  if (__builtin_abs (result2 - result1) > EPSILON)
+    abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fp16/fp16.exp b/gcc/testsuite/gcc.target/aarch64/fp16/fp16.exp
new file mode 100644
index 0000000000000000000000000000000000000000..7dc8d654a34004d280a1e9f6b9f39d868a60464a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fp16/fp16.exp
@@ -0,0 +1,43 @@
+# Tests of 16-bit floating point (__fp16), for both ARM and AArch64.
+# Copyright (C) 2015 Free Software Foundation, Inc.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GCC; see the file COPYING3.  If not see
+# <http://www.gnu.org/licenses/>.
+
+# GCC testsuite that uses the `dg.exp' driver.
+
+# Exit immediately if this isn't an ARM or AArch64 target.
+if {![istarget arm*-*-*]
+    && ![istarget aarch64*-*-*]} then {
+  return
+}
+
+# Load support procs.
+load_lib gcc-dg.exp
+
+# If a testcase doesn't have special options, use these.
+global DEFAULT_CFLAGS
+if ![info exists DEFAULT_CFLAGS] then {
+    set DEFAULT_CFLAGS " -ansi -pedantic-errors"
+}
+
+# Initialize `dg'.
+dg-init
+
+# Main loop.
+dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.\[cC\]]] \
+	"" $DEFAULT_CFLAGS
+
+# All done.
+dg-finish

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 0/14][ARM/AArch64] __FP16 support, vectors, intrinsics, testsuite
  2015-04-22 16:54 [PATCH 0/14][ARM/AArch64] __FP16 support, vectors, intrinsics, testsuite Alan Lawrence
                   ` (13 preceding siblings ...)
  2015-04-22 17:38 ` [PATCH 14/14][ARM/AArch64 testsuite] Test float16_t vcvt_* intrinsics Alan Lawrence
@ 2015-05-08 12:56 ` Alan Lawrence
  14 siblings, 0 replies; 28+ messages in thread
From: Alan Lawrence @ 2015-05-08 12:56 UTC (permalink / raw)
  To: gcc-patches, Marcus Shawcroft, Kyrylo Tkachov

Alan Lawrence wrote:
> This patch series adds support for ARM Neon float16x4_t and float16x8_t vector 
> types and intrinsics, and the __fp16 type, on both ARM and AArch64, and extends 
> the tests in Christophe Lyon's advsimd-intrinsics testsuite to cover these. (I 
> chose to extend the existing tests rather than add new ones, as the majority of 
> f16 intrinsics are just moving blocks of 16-bits around and do not depend on HW 
> support; I added new files for the conversion intrinsics.)
> 
> The ARM parts were previously posted at 
> https://gcc.gnu.org/ml/gcc-patches/2015-01/msg01434.html but have had some fixes 
> following the testsuite additions. Also The ARM patches depend upon my ARM 
> lane-checking improvements at 
> https://gcc.gnu.org/ml/gcc-patches/2015-01/msg01422.html , which I have just pinged.
> 
> I've cross-tested baremetal arm-none-eabi, aarch64-none-elf and 
> aarch64_be-none-elf most patches individually, and bootstrapped each patch in 
> series on (the relevant one of) arm-none-linux-gnueabihf and aarch64-none-linux-gnu.
> 
> OK for trunk?
> 
> Cheers, Alan
> 
> 

Ping (ARM, AArch64, Testsuite).

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 12/14][ARM/AArch64 Testsuite] Update advsimd-intrinsics tests to add float16 vectors
  2015-04-22 17:31 ` [PATCH 12/14][ARM/AArch64 Testsuite] Update advsimd-intrinsics tests to add float16 vectors Alan Lawrence
@ 2015-05-25 12:37   ` Christophe Lyon
  0 siblings, 0 replies; 28+ messages in thread
From: Christophe Lyon @ 2015-05-25 12:37 UTC (permalink / raw)
  To: Alan Lawrence; +Cc: gcc-patches

On 22 April 2015 at 19:31, Alan Lawrence <alan.lawrence@arm.com> wrote:
> This is a fairly straightforward addition of a new type: I've added it in on
> equal status to the other types, because the various
> vector-load/store/element-manipulating intrinsics are *not* conditional on
> HW support. (They just involve moving 16-bit chunks around, just like
> s16/u16/p16).
>
> Thus, for many tests, this just involves adding default "expected" values of
> { 0x3333 ...}. While there are indeed more such "default" values for
> float16x4/8 than for any other type (because there are fewer intrinsics), there
> are plenty of others, so this seems consistent. However, there is no vdup_n_f16
> intrinsic, so I worked around this using a macro (yes, a bit ugh).
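
For concreteness, a stand-in along these lines would do the job. This is a
sketch using GCC's vector extensions; the macro name is hypothetical, not
necessarily the one used in the patch:

  /* Hypothetical splat macro standing in for the missing vdup_n_f16.  */
  #define my_vdup_n_f16(v)  ((float16x4_t) { (v), (v), (v), (v) })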

I have recently checked in a cleanup patch to remove all these default
"expected values" of 0x3333, so you'll have to rework your patch.
Sorry for not catching this earlier.

Christophe.

>
> There are many check_GNU_style.sh violations here but I tried to be
> consistent with the existing code.
>
> Passing on arm-none-linux-gnueabihf and aarch64-none-linux-gnu,
> aarch64_be-none-elf (following previous patch)
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h (hfloat16_t,
>         vdup_n_f16): New.
>         (result, expected, CHECK_RESULTS, CHECK_RESULTS_NAMED,
> clean_results):
>         Add float16x4 and float16x8 cases.
>
>         DECL_VARIABLE_64BITS_VARIANTS: Add float16x4 case.
>         DECL_VARIABLE_128BITS_VARIANTS: Add float16x8 case.
>
>         * gcc.target/aarch64/advsimd-intrinsics/compute-data-ref.h (buffer,
>         buffer_pad, buffer_dup, buffer_dup_pad): Add float16x4 and
> float16x8.
>
>         * gcc.target/aarch64/advsimd-intrinsics/vaba.c: Add expected results
>         for float16x4 and float16x8.
>         * gcc.target/aarch64/advsimd-intrinsics/vabal.c: Likewise.
>         * gcc.target/aarch64/advsimd-intrinsics/vabd.c: Likewise.
>         * gcc.target/aarch64/advsimd-intrinsics/vabdl.c: Likewise.
>         * gcc.target/aarch64/advsimd-intrinsics/vabs.c: Likewise.
>         * gcc.target/aarch64/advsimd-intrinsics/vadd.c: Likewise.
>         * gcc.target/aarch64/advsimd-intrinsics/vaddl.c: Likewise.
>         * gcc.target/aarch64/advsimd-intrinsics/vaddw.c: Likewise.
>         * gcc.target/aarch64/advsimd-intrinsics/vand.c: Likewise.
>         * gcc.target/aarch64/advsimd-intrinsics/vbic.c: Likewise.
>         * gcc.target/aarch64/advsimd-intrinsics/vbsl.c: Likewise.
>         * gcc.target/aarch64/advsimd-intrinsics/vcls.c: Likewise.
>         * gcc.target/aarch64/advsimd-intrinsics/vclz.c: Likewise.
>         * gcc.target/aarch64/advsimd-intrinsics/vcnt.c: Likewise.
>         * gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c: Likewise.
>
>         * gcc.target/aarch64/advsimd-intrinsics/vcombine.c: Add expected
>         results for float16x4 and float16x8.
>         (main): add test of float16x4 -> float16x8 case.
>         * gcc.target/aarch64/advsimd-intrinsics/vcreate.c: Likewise.
>         * gcc.target/aarch64/advsimd-intrinsics/vdup_lane.c: Likewise.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 13/14][ARM/AArch64 testsuite] Use gcc-dg-runtest in advsimd-intrinsics.exp
  2015-04-22 17:36 ` [PATCH 13/14][ARM/AArch64 testsuite] Use gcc-dg-runtest in advsimd-intrinsics.exp Alan Lawrence
@ 2015-05-25 12:43   ` Christophe Lyon
  2015-05-26 16:48     ` Alan Lawrence
  0 siblings, 1 reply; 28+ messages in thread
From: Christophe Lyon @ 2015-05-25 12:43 UTC (permalink / raw)
  To: Alan Lawrence; +Cc: gcc-patches

On 22 April 2015 at 19:36, Alan Lawrence <alan.lawrence@arm.com> wrote:
> In the first revision of Christophe Lyon's advsimd-intrinsics tests,
> https://gcc.gnu.org/ml/gcc-patches/2014-06/msg00532.html , both
> gcc-dg-runtest (to assemble only) and c-torture-execute were used. In review
> the gcc-dg-runtest part was then dropped, and execution tests continued
> using c-torture-execute. However, c-torture-execute ignores e.g. dg-options
> directives in the individual test files, whereas gcc-dg-runtest does not.
>
> This patch switches to gcc-dg-runtest (with dg-do-what-default = "run") for
> all tests, allowing use of e.g. dg-options (in testsuite patch 3/3). This
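
For concreteness, the kind of per-file directives this enables
(illustrative lines, not ones taken from the series):

  /* { dg-options "-O2" } */
  /* { dg-additional-options "-mfpu=neon-fp16" { target arm*-*-* } } */

c-torture-execute silently ignores such directives; gcc-dg-runtest honours
them.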

Sandra has recently committed a slightly different patch.

If you want to update yours, here are a few comments/questions:
- why do you add "-w" to additional_flags?
- you changed the way we iterate over the tests, but this removes the
possibility to actually execute only a subset of the available tests,
such as RUNTESTFLAGS=advsimd-intrinsics.exp=vadd.c

Christophe.

> generally seems to work OK - indeed I also dropped the
> parallelization-disabling code - and the new advsimd-intrinsics.exp now
> follows gcc.c-torture/compile/compile.exp and
> gcc.c-torture/execute/execute.exp very closely. However, there are side
> effects, of which we should be aware, but with which I think we can live,
> specifically:
>
> (1) the lines in the test log change from...
>
> PASS: gcc.target/aarch64/advsimd-intrinsics/vcombine.c compilation,  -O1
> PASS: gcc.target/aarch64/advsimd-intrinsics/vcombine.c execution,  -O1
>
> ...to...
>
> PASS: gcc.target/aarch64/advsimd-intrinsics/vcombine.c   -O1  execution test
>
> (that is, the compilation line disappears, but the (test for excess errors)
> remains unchanged)
>
> (2) The "-Og -g" variant is no longer tested;  all of -O0, -O1, -O2, -O2
> -flto -fno-use-linker-plugin -flto-partition=none, -O2 -flto
> -fuse-linker-plugin -fno-fat-lto-objects, -O3 -fomit-frame-pointer, -O3 -g,
> -Os still are. My feeling is that this set of options is exhaustive enough.
>
> Cross-tested arm-none-eabi, aarch64-none-elf, aarch64_be-none-elf; natively
> tested arm-none-linux-gnueabihf and aarch64-none-linux-gnu.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/advsimd-intrinsics/advsimd-intrinsics.exp:
>         Use gcc-dg-runtest.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 14/14][ARM/AArch64 testsuite] Test float16_t vcvt_* intrinsics
  2015-04-22 17:38 ` [PATCH 14/14][ARM/AArch64 testsuite] Test float16_t vcvt_* intrinsics Alan Lawrence
@ 2015-05-25 13:31   ` Christophe Lyon
  0 siblings, 0 replies; 28+ messages in thread
From: Christophe Lyon @ 2015-05-25 13:31 UTC (permalink / raw)
  To: Alan Lawrence; +Cc: gcc-patches

On 22 April 2015 at 19:38, Alan Lawrence <alan.lawrence@arm.com> wrote:
> This adds a test of vcvt_f32_f16 and vcvt_f16_f32, also vcvt_high_f32_f16
> and vcvt_high_f16_f32.
>
> On ARM, we pass additional option -mfpu=neon-fp16 to the compiler (possible
> following patch 2/3). The compiler is already receiving an option such as
> -mfpu=neon or -mfpu=crypto-neon-fp-armv8, but passing neon-fp16 as well as
> either of those appears to do no harm, and turns on the superset of all
> -mfpu options, as desired.
>
> On AArch64, we additionally test vcvt_high_f32_f16 and vcvt_high_f16_f32;
> these are not tested on ARM as the relevant intrinsics do not exist in
> 32-bit state.
>
> Passing on aarch64_be-none-elf, aarch64-none-elf, arm-none-linux-gnueabi,
> aarch64-none-linux-gnu.
>
> gcc/testsuite/ChangeLog:
>          * gcc.target/aarch64/advsimd-intrinsics/vcvt_f16.c: New.

You should call clean_results() once more, before the first tests, to
make sure the 'results*' arrays are initialized as expected.
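
Something like this at the top of exec_vcvt, say (a sketch of the
suggested placement, not final code):

  void
  exec_vcvt (void)
  {
    clean_results ();   /* make the result buffers well-defined up front */

  #define TEST_MSG "vcvt_f32_f16"
    ...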

Other AArch64-specific tests in the same dir use #if
defined(__aarch64__) while you use #ifdef __ARM_64BIT_STATE.
Any reason to prefer the latter?

Christophe.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 13/14][ARM/AArch64 testsuite] Use gcc-dg-runtest in advsimd-intrinsics.exp
  2015-05-25 12:43   ` Christophe Lyon
@ 2015-05-26 16:48     ` Alan Lawrence
  2015-05-26 19:12       ` Christophe Lyon
  0 siblings, 1 reply; 28+ messages in thread
From: Alan Lawrence @ 2015-05-26 16:48 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: gcc-patches

Christophe Lyon wrote:
> On 22 April 2015 at 19:36, Alan Lawrence <alan.lawrence@arm.com> wrote:
>> In the first revision of Christophe Lyon's advsimd-intrinsics tests,
>> https://gcc.gnu.org/ml/gcc-patches/2014-06/msg00532.html , both
>> gcc-dg-runtest (to assemble only) and c-torture-execute were used. In review
>> the gcc-dg-runtest part was then dropped, and execution tests continued
>> using c-torture-execute. However, c-torture-execute ignores e.g. dg-options
>> directives in the individual test files, whereas gcc-dg-runtest does not.
>>
>> This patch switches to gcc-dg-runtest (with dg-do-what-default = "run") for
>> all tests, allowing use of e.g. dg-options (in testsuite patch 3/3). This
> 
> Sandra has recently committed a slightly different patch.
> 
> If you want to update yours, here are a few comments/questions:
> - why do you add "-w" to additional_flags?

Hmmm. Not sure now. I agree, it appears to work without, so will drop that.

> - you changed the way we iterate over the tests, but this removes the
> possibility to actually execute only a subset of the available tests,
> such as RUNTESTFLAGS=advsimd-intrinsics.exp=vadd.c

I don't see this symptom - I am able to execute such subsets with either my, or 
Sandra's, advsimd-intrinsics.exp.

Is it that you have to check runtest_file_p because you are setting 
gcc_parallel_test_enable to 0?

I'm doing more testing now, but I think I can drop my advsimd-intrinsics.exp 
changes altogether; I'll post an updated patch series shortly.

In the meantime I'm curious as to why you found the gcc_parallel_test_enable 
necessary? (And is it safe to reset it to 1 afterwards, rather than to a saved 
value?)

TYVM for your other comments and review - I will incorporate all into my next 
revision.

Thanks, Alan

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 13/14][ARM/AArch64 testsuite] Use gcc-dg-runtest in advsimd-intrinsics.exp
  2015-05-26 16:48     ` Alan Lawrence
@ 2015-05-26 19:12       ` Christophe Lyon
  2015-05-28 10:50         ` Alan Lawrence
  0 siblings, 1 reply; 28+ messages in thread
From: Christophe Lyon @ 2015-05-26 19:12 UTC (permalink / raw)
  To: Alan Lawrence; +Cc: gcc-patches

On 26 May 2015 at 18:25, Alan Lawrence <alan.lawrence@arm.com> wrote:
> Christophe Lyon wrote:
>>
>> On 22 April 2015 at 19:36, Alan Lawrence <alan.lawrence@arm.com> wrote:
>>>
>>> In the first revision of Christophe Lyon's advsimd-intrinsics tests,
>>> https://gcc.gnu.org/ml/gcc-patches/2014-06/msg00532.html , both
>>> gcc-dg-runtest (to assemble only) and c-torture-execute were used. In
>>> review
>>> the gcc-dg-runtest part was then dropped, and execution tests continued
>>> using c-torture-execute. However, c-torture-execute ignores e.g.
>>> dg-options
>>> directives in the individual test files, whereas gcc-dg-runtest does not.
>>>
>>> This patch switches to gcc-dg-runtest (with dg-do-what-default = "run")
>>> for
>>> all tests, allowing use of e.g. dg-options (in testsuite patch 3/3). This
>>
>>
>> Sandra has recently committed a slightly different patch.
>>
>> If you want to update yours, here are a few comments/questions:
>> - why do you add "-w" to additional_flags?
>
>
> Hmmm. Not sure now. I agree, it appears to work without, so will drop that.
>
>> - you changed the way we iterate over the tests, but this removes the
>> possibility to actually execute only a subset of the available tests,
>> such as RUNTESTFLAGS=advsimd-intrinsics.exp=vadd.c
>
>
> I don't see this symptom - I am able to execute such subsets with either my,
> or Sandra's, advsimd-intrinsics.exp.

I didn't try to run with your patch; I thought it was an oversight of yours.

Sorry, indeed I've just checked that gcc-dg-runtest includes the filter.

>
> Is it that you have to check runtest_file_p because you are setting
> gcc_parallel_test_enable to 0?
>
> I'm doing more testing now, but I think I can drop my advsimd-intrinsics.exp
> changes altogether; I'll post an updated patch series shortly.
>
> In the meantime I'm curious as to why you found the gcc_parallel_test_enable
> necessary? (And is it safe to reset it to 1 afterwards, rather than to a
> saved value?)
See https://gcc.gnu.org/ml/gcc/2014-10/msg00081.html

>
> TYVM for your other comments and review - I will incorporate all into my
> next revision.

Thanks.

>
> Thanks, Alan
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 13/14][ARM/AArch64 testsuite] Use gcc-dg-runtest in advsimd-intrinsics.exp
  2015-05-26 19:12       ` Christophe Lyon
@ 2015-05-28 10:50         ` Alan Lawrence
  2015-05-28 11:53           ` Christophe Lyon
  0 siblings, 1 reply; 28+ messages in thread
From: Alan Lawrence @ 2015-05-28 10:50 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: gcc-patches

Christophe Lyon wrote:
> On 26 May 2015 at 18:25, Alan Lawrence <alan.lawrence@arm.com> wrote:
>> I don't see this symptom - I am able to execute such subsets with either my,
>> or Sandra's, advsimd-intrinsics.exp.
> 
> I didn't try to run with your patch; I thought it was an oversight of yours.
> 
> Sorry, indeed I've just checked that gcc-dg-runtest includes the filter.
> 
>> Is it that you have to check runtest_file_p because you are setting
>> gcc_parallel_test_enable to 0?
>>
>> I'm doing more testing now, but I think I can drop my advsimd-intrinsics.exp
>> changes altogether; I'll post an updated patch series shortly.
>>
>> In the meantime I'm curious as to why you found the gcc_parallel_test_enable
>> necessary? (And is it safe to reset it to 1 afterwards, rather than to a
>> saved value?)
> See https://gcc.gnu.org/ml/gcc/2014-10/msg00081.html

So after working through the differences between Sandra's and my patch, I find 
the existing advsimd-intrinsics.exp achieves pretty much the same thing, and 
preserves the same list of test variants (e.g. the -Og -g from 
set-torture-options which I had removed).

However, I've tried testing advsimd-intrinsics.exp (both the whole thing, and 
individual tests using RUNTESTFLAGS) with and without this hunk:

@@ -57,20 +57,7 @@ set-torture-options $C_TORTURE_OPTIONS {{}} $LTO_TORTURE_OPTIONS
  set additional_flags [add_options_for_arm_neon ""]

  # Main loop.
-foreach src [lsort [glob -nocomplain $srcdir/$subdir/*.c]] {
-    # If we're only testing specific files and this isn't one of them, skip it.
-    if ![runtest_file_p $runtests $src] then {
-       continue
-    }
-
-    # runtest_file_p is already run above, and the code below can run
-    # runtest_file_p again, make sure everything for this test is
-    # performed if the above runtest_file_p decided this runtest
-    # instance should execute the test
-    gcc_parallel_test_enable 0
-    gcc-dg-runtest $src "" $additional_flags
-    gcc_parallel_test_enable 1
-}
+gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.\[cS\]]] "" ${additional_flags}

and find exactly the same tests are run and pass. My hypothesis is thus that you 
only need the explicit loop, manual checking of runtest_file_p, and 
gcc_parallel_test_enable, in order to do *both* c-torture-execute *and* 
gcc-dg-runtest; since we are now only doing the latter, this is unnecessary. 
Does that make sense? (If you agree, I'll propose that as a standalone cleanup 
patch.)

Cheers, Alan

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 13/14][ARM/AArch64 testsuite] Use gcc-dg-runtest in advsimd-intrinsics.exp
  2015-05-28 10:50         ` Alan Lawrence
@ 2015-05-28 11:53           ` Christophe Lyon
  2015-05-28 12:33             ` Christophe Lyon
  0 siblings, 1 reply; 28+ messages in thread
From: Christophe Lyon @ 2015-05-28 11:53 UTC (permalink / raw)
  To: Alan Lawrence; +Cc: gcc-patches

On 28 May 2015 at 12:22, Alan Lawrence <alan.lawrence@arm.com> wrote:
> Christophe Lyon wrote:
>>
>> On 26 May 2015 at 18:25, Alan Lawrence <alan.lawrence@arm.com> wrote:
>>>
>>> I don't see this symptom - I am able to execute such subsets with either
>>> my,
>>> or Sandra's, advsimd-intrinsics.exp.
>>
>>
>> I didn't try to run with your patch; I thought it was an oversight of
>> yours.
>>
>> Sorry, indeed I've just checked that gcc-dg-runtest includes the filter.
>>
>>> Is it that you have to check runtest_file_p because you are setting
>>> gcc_parallel_test_enable to 0?
>>>
>>> I'm doing more testing now, but I think I can drop my
>>> advsimd-intrinsics.exp
>>> changes altogether; I'll post an updated patch series shortly.
>>>
>>> In the meantime I'm curious as to why you found the
>>> gcc_parallel_test_enable
>>> necessary? (And is it safe to reset it to 1 afterwards, rather than to a
>>> saved value?)
>>
>> See https://gcc.gnu.org/ml/gcc/2014-10/msg00081.html
>
>
> So after working through the differences between Sandra's and my patch, I
> find the existing advsimd-intrinsics.exp achieves pretty much the same
> thing, and preserves the same list of test variants (e.g. the -Og -g from
> set-torture-options which I had removed).
>
> However, I've tried testing advsimd-intrinsics.exp (both the whole thing,
> and individual tests using RUNTESTFLAGS) with and without this hunk:
>
> @@ -57,20 +57,7 @@ set-torture-options $C_TORTURE_OPTIONS {{}}
> $LTO_TORTURE_OPTI
>  set additional_flags [add_options_for_arm_neon ""]
>
>  # Main loop.
> -foreach src [lsort [glob -nocomplain $srcdir/$subdir/*.c]] {
>> -    # If we're only testing specific files and this isn't one of them, skip it.
> -    if ![runtest_file_p $runtests $src] then {
> -       continue
> -    }
> -
> -    # runtest_file_p is already run above, and the code below can run
> -    # runtest_file_p again, make sure everything for this test is
> -    # performed if the above runtest_file_p decided this runtest
> -    # instance should execute the test
> -    gcc_parallel_test_enable 0
> -    gcc-dg-runtest $src "" $additional_flags
> -    gcc_parallel_test_enable 1
> -}
> +gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.\[cS\]]] "" ${additional_flags}
>
> and find exactly the same tests are run and pass. My hypothesis is thus that
> you only need the explicit loop, manual checking of runtest_file_p, and
> gcc_parallel_test_enable, in order to do *both* c-torture-execute *and*
> gcc-dg-runtest; since we are now only doing the latter, this is unnecessary.
> Does that make sense? (If you agree, I'll propose that as a standalone
> cleanup patch.)
>

Indeed I think you are right. Since we no longer call
c-torture-execute, we no longer need to call runtest_file_p here.
Having only one remaining call to runtest_file_p in gcc-dg-runtest is
parallel-safe. Thanks for the cleanup.
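
To make the parallel-safety argument concrete, here is a toy model of the
partitioning (illustrative only: the real GCC wrapper in lib/gcc-defs.exp
uses shared lock files rather than a simple modulo, and the TOY_* variables
are invented for this sketch):

    set parallel_count 0
    proc toy_runtest_file_p { testcase } {
        global parallel_count env
        # The yes/no decision for each testcase consumes one tick of
        # global state, so every participant must make it exactly once
        # per test for the slices to stay disjoint.
        incr parallel_count
        set njobs    $env(TOY_NJOBS)     ;# total parallel instances (invented)
        set instance $env(TOY_INSTANCE)  ;# this instance's index (invented)
        return [expr { $parallel_count % $njobs == $instance }]
    }

If two harnesses each made that check for the same file, the counter would
advance twice per test and the instances would no longer claim disjoint
slices, which is exactly what the gcc_parallel_test_enable 0/1 dance in the
old loop guarded against.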

Christophe.

> Cheers, Alan
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 13/14][ARM/AArch64 testsuite] Use gcc-dg-runtest in advsimd-intrinsics.exp
  2015-05-28 11:53           ` Christophe Lyon
@ 2015-05-28 12:33             ` Christophe Lyon
  2015-05-28 13:57               ` Alan Lawrence
  0 siblings, 1 reply; 28+ messages in thread
From: Christophe Lyon @ 2015-05-28 12:33 UTC (permalink / raw)
  To: Alan Lawrence; +Cc: gcc-patches

On 28 May 2015 at 13:32, Christophe Lyon <christophe.lyon@linaro.org> wrote:
> On 28 May 2015 at 12:22, Alan Lawrence <alan.lawrence@arm.com> wrote:
>> Christophe Lyon wrote:
>>>
>>> On 26 May 2015 at 18:25, Alan Lawrence <alan.lawrence@arm.com> wrote:
>>>>
>>>> I don't see this symptom - I am able to execute such subsets with either
>>>> my,
>>>> or Sandra's, advsimd-intrinsics.exp.
>>>
>>>
>>> I didn't try to run with your patch; I thought it was an oversight of
>>> yours.
>>>
>>> Sorry, indeed I've just checked that gcc-dg-runtest includes the filter.
>>>
>>>> Is it that you have to check runtest_file_p because you are setting
>>>> gcc_parallel_test_enable to 0?
>>>>
>>>> I'm doing more testing now, but I think I can drop my
>>>> advsimd-intrinsics.exp
>>>> changes altogether; I'll post an updated patch series shortly.
>>>>
>>>> In the meantime I'm curious as to why you found the
>>>> gcc_parallel_test_enable
>>>> necessary? (And is it safe to reset it to 1 afterwards, rather than to a
>>>> saved value?)
>>>
>>> See https://gcc.gnu.org/ml/gcc/2014-10/msg00081.html
>>
>>
>> So after working through the differences between Sandra's and my patch, I
>> find the existing advsimd-intrinsics.exp achieves pretty much the same
>> thing, and preserves the same list of test variants (e.g. the -Og -g from
>> set-torture-options which I had removed).
>>
>> However, I've tried testing advsimd-intrinsics.exp (both the whole thing,
>> and individual tests using RUNTESTFLAGS) with and without this hunk:
>>
>> @@ -57,20 +57,7 @@ set-torture-options $C_TORTURE_OPTIONS {{}} $LTO_TORTURE_OPTI
>>  set additional_flags [add_options_for_arm_neon ""]
>>
>>  # Main loop.
>> -foreach src [lsort [glob -nocomplain $srcdir/$subdir/*.c]] {
>> -    # If we're only testing specific files and this isn't one of them, skip it.
>> -    if ![runtest_file_p $runtests $src] then {
>> -       continue
>> -    }
>> -
>> -    # runtest_file_p is already run above, and the code below can run
>> -    # runtest_file_p again, make sure everything for this test is
>> -    # performed if the above runtest_file_p decided this runtest
>> -    # instance should execute the test
>> -    gcc_parallel_test_enable 0
>> -    gcc-dg-runtest $src "" $additional_flags
>> -    gcc_parallel_test_enable 1
>> -}
>> +gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.\[cS\]]] "" ${additional_flags}
>>
>> and find exactly the same tests are run and pass. My hypothesis is thus that
>> you only need the explicit loop, manual checking of runtest_file_p, and
>> gcc_parallel_test_enable, in order to do *both* c-torture-execute *and*
>> gcc-dg-runtest; since we are now only doing the latter, this is unnecessary.
>> Does that make sense? (If you agree, I'll propose that as a standalone
>> cleanup patch.)
>>
>
> Indeed I think you are right. Since we no longer call
> c-torture-execute, we no longer need to call runtest_file_p here.
> Having only one remaining call to runtest_file_p in gcc-dg-runtest is
> parallel-safe. Thanks for the cleanup.
>

So in fact, except for the comment about '-w', it seems your initial
patch was mostly OK, right?

> Christophe.
>
>> Cheers, Alan
>>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 13/14][ARM/AArch64 testsuite] Use gcc-dg-runtest in advsimd-intrinsics.exp
  2015-05-28 12:33             ` Christophe Lyon
@ 2015-05-28 13:57               ` Alan Lawrence
  0 siblings, 0 replies; 28+ messages in thread
From: Alan Lawrence @ 2015-05-28 13:57 UTC (permalink / raw)
  Cc: gcc-patches

Christophe Lyon wrote:
> 
> So in fact, except for the comment about '-w', it seems your initial
> patch was mostly OK, right?
> 


Well, my removing a bunch of that c-torture-init stuff was what was causing
the "-Og -g" variant to go missing, but apart from that, yes.

--Alan

^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2015-05-28 13:48 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-04-22 16:54 [PATCH 0/14][ARM/AArch64] __FP16 support, vectors, intrinsics, testsuite Alan Lawrence
2015-04-22 16:58 ` [PATCH 1/14][ARM] Add float16x4_t intrinsics Alan Lawrence
2015-04-22 16:59 ` [PATCH 2/14][ARM]Add float16x8_t type Alan Lawrence
2015-04-22 16:59 ` [PATCH 3/14][ARM] Add float16x8_t intrinsics Alan Lawrence
2015-04-22 16:59 ` [PATCH 4/14][ARM] Remaining float16 intrinsics: vld..., vst..., vget_low|high, vcombine Alan Lawrence
2015-04-22 17:06 ` [PATCH 5/14][AArch64] Add basic fp16 support Alan Lawrence
2015-04-29 21:22   ` Joseph Myers
2015-05-08 12:55     ` Alan Lawrence
2015-04-22 17:11 ` [PATCH 6/14][AArch64] Add support for float16x{4,8}_t vectors/builtins Alan Lawrence
2015-04-22 17:14 ` [PATCH 7/14][AArch64] vld{2,3,4}{,_lane,_dup},vcombine,vcreate Alan Lawrence
2015-04-22 17:40   ` Alan Lawrence
2015-04-22 17:16 ` [PATCH 8/14][AArch64]Add vreinterpret, float_truncate_lo/hi, vget_low/high Alan Lawrence
2015-04-22 17:19 ` [PATCH 9/14][AArch64] vld1(q?)_dup, missing vreinterpretq intrinsics Alan Lawrence
2015-04-22 17:21 ` [PATCH 10/14][AArch64] Add vcvt(_high)?_f32_f16 intrinsics Alan Lawrence
2015-04-22 17:31 ` [PATCH 11/14][fold-const.c] Fix bigendian HFmode in native_interpret_real Alan Lawrence
2015-04-22 17:31 ` [PATCH 12/14][ARM/AArch64 Testsuite] Update advsimd-intrinsics tests to add float16 vectors Alan Lawrence
2015-05-25 12:37   ` Christophe Lyon
2015-04-22 17:36 ` [PATCH 13/14][ARM/AArch64 testsuite] Use gcc-dg-runtest in advsimd-intrinsics.exp Alan Lawrence
2015-05-25 12:43   ` Christophe Lyon
2015-05-26 16:48     ` Alan Lawrence
2015-05-26 19:12       ` Christophe Lyon
2015-05-28 10:50         ` Alan Lawrence
2015-05-28 11:53           ` Christophe Lyon
2015-05-28 12:33             ` Christophe Lyon
2015-05-28 13:57               ` Alan Lawrence
2015-04-22 17:38 ` [PATCH 14/14][ARM/AArch64 testsuite] Test float16_t vcvt_* intrinsics Alan Lawrence
2015-05-25 13:31   ` Christophe Lyon
2015-05-08 12:56 ` [PATCH 0/14][ARM/AArch64] __FP16 support, vectors, intrinsics, testsuite Alan Lawrence

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).