public inbox for gcc-patches@gcc.gnu.org
* [PATCH 0/16][ARM/AArch64] Float16_t support, v2
From: Alan Lawrence @ 2015-07-07 12:32 UTC (permalink / raw)
  To: gcc-patches; +Cc: Charles Baylis

This is a respin of the series at 
https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01332.html, plus the two ARM 
patches on which these depend 
(https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01333.html). These two somewhat 
duplicate Charles Baylis' lane-bounds-checking patch at 
https://gcc.gnu.org/ml/gcc-patches/2015-07/msg00140.html, in that both port 
some of the same AArch64 infrastructure onto ARM. While each has some parts the 
other does not, there do not appear to be any serious conflicts; if Charles' 
patches were to go in first, I would not expect any major problems in rebasing 
mine over his.

Changes since the first version of the float16 series are:

* separated out the (non-vector) tests from gcc.target/aarch64 into a 
.../fp16 subdirectory, as per 
https://gcc.gnu.org/ml/gcc-patches/2015-05/msg00656.html;

* dropped the patch rewriting advsimd-intrinsics.exp, following other changes 
along similar lines by Sandra Loosemore and Christophe Lyon;

* rebased over other testsuite changes, including dropping expected values from 
many tests of intrinsics with no fp16 variant, and introducing a 
CHECK_RESULTS_NO_FP16 macro;

* changed the mechanism on ARM by which we pass in -mfpu=neon-fp16: we now try 
to pass this to all tests, but the vcvt_f16.c test fails if float16 is still 
not supported (e.g. a conflicting -mfpu=neon was passed to the compiler, or we 
are running on hardware that does not support the instructions); a sketch of 
the intended guard is below.
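
For illustration only (not part of these patches), the guard has roughly this
shape. The effective-target name used here, arm_neon_fp16_hw, is an
assumption: the harness may spell its check differently.

/* { dg-do run } */
/* { dg-require-effective-target arm_neon_fp16_hw } */

#include <arm_neon.h>

int
main (void)
{
  /* Converting between f32 and f16 needs neon-fp16; if the multilib or
     hardware cannot do this, the guard above keeps the test from running
     and failing spuriously.  */
  float32x4_t f32 = vdupq_n_f32 (1.0f);
  float16x4_t f16 = vcvt_f16_f32 (f32);
  float32x4_t back = vcvt_f32_f16 (f16);
  return vgetq_lane_f32 (back, 0) == 1.0f ? 0 : 1;
}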

Are these OK for trunk?

Thanks, Alan

* [PATCH 3/16][ARM] Add float16x4_t intrinsics
From: Alan Lawrence @ 2015-07-07 12:34 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 65 bytes --]

As per https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01335.html

[-- Attachment #2: 03_arm_float16x4.patch --]
[-- Type: text/x-patch; name=03_arm_float16x4.patch, Size: 10842 bytes --]

commit 54a89a084fbd00e4de036f549ca893b74b8f58fb
Author: Alan Lawrence <alan.lawrence@arm.com>
Date:   Mon Dec 8 18:40:03 2014 +0000

    ARM: float16x4_t intrinsics (v2 - fix v[sg]et_lane_f16 at -O0, no vdup_n/vmov_n)

diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index c923e29..b4100c8 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -41,6 +41,7 @@ typedef __simd64_int8_t int8x8_t;
 typedef __simd64_int16_t int16x4_t;
 typedef __simd64_int32_t int32x2_t;
 typedef __builtin_neon_di int64x1_t;
+typedef __builtin_neon_hf float16_t;
 typedef __simd64_float16_t float16x4_t;
 typedef __simd64_float32_t float32x2_t;
 typedef __simd64_poly8_t poly8x8_t;
@@ -5201,6 +5202,19 @@ vget_lane_s32 (int32x2_t __a, const int __b)
   return (int32_t)__builtin_neon_vget_lanev2si (__a, __b);
 }
 
+/* Functions cannot accept or return __fp16 types.  Even if the function
+   were marked always-inline so there were no call sites, the declaration
+   would nonetheless raise an error.  Hence, we must use a macro instead.  */
+
+#define vget_lane_f16(__v, __idx)		\
+  __extension__					\
+    ({						\
+      float16x4_t __vec = (__v);		\
+      __builtin_arm_lane_check (4, __idx);	\
+      float16_t __res = __vec[__idx];		\
+      __res;					\
+    })
+
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vget_lane_f32 (float32x2_t __a, const int __b)
 {
@@ -5333,6 +5347,16 @@ vset_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
   return (int32x2_t)__builtin_neon_vset_lanev2si ((__builtin_neon_si) __a, __b, __c);
 }
 
+#define vset_lane_f16(__e, __v, __idx)		\
+  __extension__					\
+    ({						\
+      float16_t __elem = (__e);			\
+      float16x4_t __vec = (__v);		\
+      __builtin_arm_lane_check (4, __idx);	\
+      __vec[__idx] = __elem;			\
+      __vec;					\
+    })
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vset_lane_f32 (float32_t __a, float32x2_t __b, const int __c)
 {
@@ -5479,6 +5503,12 @@ vcreate_s64 (uint64_t __a)
   return (int64x1_t)__builtin_neon_vcreatedi ((__builtin_neon_di) __a);
 }
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vcreate_f16 (uint64_t __a)
+{
+  return (float16x4_t) __a;
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vcreate_f32 (uint64_t __a)
 {
@@ -8796,6 +8826,12 @@ vld1_lane_s32 (const int32_t * __a, int32x2_t __b, const int __c)
   return (int32x2_t)__builtin_neon_vld1_lanev2si ((const __builtin_neon_si *) __a, __b, __c);
 }
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vld1_lane_f16 (const float16_t * __a, float16x4_t __b, const int __c)
+{
+  return vset_lane_f16 (*__a, __b, __c);
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vld1_lane_f32 (const float32_t * __a, float32x2_t __b, const int __c)
 {
@@ -8944,6 +8980,13 @@ vld1_dup_s32 (const int32_t * __a)
   return (int32x2_t)__builtin_neon_vld1_dupv2si ((const __builtin_neon_si *) __a);
 }
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vld1_dup_f16 (const float16_t * __a)
+{
+  float16_t __f = *__a;
+  return (float16x4_t) { __f, __f, __f, __f };
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vld1_dup_f32 (const float32_t * __a)
 {
@@ -11828,6 +11871,12 @@ vreinterpret_p8_p16 (poly16x4_t __a)
 }
 
 __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vreinterpret_p8_f16 (float16x4_t __a)
+{
+  return (poly8x8_t) __a;
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
 vreinterpret_p8_f32 (float32x2_t __a)
 {
   return (poly8x8_t)__builtin_neon_vreinterpretv8qiv2sf (__a);
@@ -11896,6 +11945,12 @@ vreinterpret_p16_p8 (poly8x8_t __a)
 }
 
 __extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vreinterpret_p16_f16 (float16x4_t __a)
+{
+  return (poly16x4_t) __a;
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
 vreinterpret_p16_f32 (float32x2_t __a)
 {
   return (poly16x4_t)__builtin_neon_vreinterpretv4hiv2sf (__a);
@@ -11957,6 +12012,80 @@ vreinterpret_p16_u32 (uint32x2_t __a)
   return (poly16x4_t)__builtin_neon_vreinterpretv4hiv2si ((int32x2_t) __a);
 }
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_p8 (poly8x8_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_p16 (poly16x4_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_f32 (float32x2_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+#ifdef __ARM_FEATURE_CRYPTO
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_p64 (poly64x1_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+#endif
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_s64 (int64x1_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_u64 (uint64x1_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_s8 (int8x8_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_s16 (int16x4_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_s32 (int32x2_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_u8 (uint8x8_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_u16 (uint16x4_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_u32 (uint32x2_t __a)
+{
+  return (float16x4_t) __a;
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vreinterpret_f32_p8 (poly8x8_t __a)
 {
@@ -11969,6 +12098,12 @@ vreinterpret_f32_p16 (poly16x4_t __a)
   return (float32x2_t)__builtin_neon_vreinterpretv2sfv4hi ((int16x4_t) __a);
 }
 
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vreinterpret_f32_f16 (float16x4_t __a)
+{
+  return (float32x2_t) __a;
+}
+
 #ifdef __ARM_FEATURE_CRYPTO
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vreinterpret_f32_p64 (poly64x1_t __a)
@@ -12043,6 +12178,14 @@ vreinterpret_p64_p16 (poly16x4_t __a)
 #endif
 #ifdef __ARM_FEATURE_CRYPTO
 __extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+vreinterpret_p64_f16 (float16x4_t __a)
+{
+  return (poly64x1_t) __a;
+}
+
+#endif
+#ifdef __ARM_FEATURE_CRYPTO
+__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
 vreinterpret_p64_f32 (float32x2_t __a)
 {
   return (poly64x1_t)__builtin_neon_vreinterpretdiv2sf (__a);
@@ -12126,6 +12269,12 @@ vreinterpret_s64_p16 (poly16x4_t __a)
 }
 
 __extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vreinterpret_s64_f16 (float16x4_t __a)
+{
+  return (int64x1_t) __a;
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
 vreinterpret_s64_f32 (float32x2_t __a)
 {
   return (int64x1_t)__builtin_neon_vreinterpretdiv2sf (__a);
@@ -12194,6 +12343,12 @@ vreinterpret_u64_p16 (poly16x4_t __a)
 }
 
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vreinterpret_u64_f16 (float16x4_t __a)
+{
+  return (uint64x1_t) __a;
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vreinterpret_u64_f32 (float32x2_t __a)
 {
   return (uint64x1_t)__builtin_neon_vreinterpretdiv2sf (__a);
@@ -12262,6 +12417,12 @@ vreinterpret_s8_p16 (poly16x4_t __a)
 }
 
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vreinterpret_s8_f16 (float16x4_t __a)
+{
+  return (int8x8_t) __a;
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vreinterpret_s8_f32 (float32x2_t __a)
 {
   return (int8x8_t)__builtin_neon_vreinterpretv8qiv2sf (__a);
@@ -12330,6 +12491,12 @@ vreinterpret_s16_p16 (poly16x4_t __a)
 }
 
 __extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vreinterpret_s16_f16 (float16x4_t __a)
+{
+  return (int16x4_t) __a;
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
 vreinterpret_s16_f32 (float32x2_t __a)
 {
   return (int16x4_t)__builtin_neon_vreinterpretv4hiv2sf (__a);
@@ -12398,6 +12565,12 @@ vreinterpret_s32_p16 (poly16x4_t __a)
 }
 
 __extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vreinterpret_s32_f16 (float16x4_t __a)
+{
+  return (int32x2_t) __a;
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
 vreinterpret_s32_f32 (float32x2_t __a)
 {
   return (int32x2_t)__builtin_neon_vreinterpretv2siv2sf (__a);
@@ -12466,6 +12639,12 @@ vreinterpret_u8_p16 (poly16x4_t __a)
 }
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vreinterpret_u8_f16 (float16x4_t __a)
+{
+  return (uint8x8_t) __a;
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
 vreinterpret_u8_f32 (float32x2_t __a)
 {
   return (uint8x8_t)__builtin_neon_vreinterpretv8qiv2sf (__a);
@@ -12534,6 +12713,12 @@ vreinterpret_u16_p16 (poly16x4_t __a)
 }
 
 __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vreinterpret_u16_f16 (float16x4_t __a)
+{
+  return (uint16x4_t) __a;
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
 vreinterpret_u16_f32 (float32x2_t __a)
 {
   return (uint16x4_t)__builtin_neon_vreinterpretv4hiv2sf (__a);
@@ -12602,6 +12787,12 @@ vreinterpret_u32_p16 (poly16x4_t __a)
 }
 
 __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vreinterpret_u32_f16 (float16x4_t __a)
+{
+  return (uint32x2_t) __a;
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
 vreinterpret_u32_f32 (float32x2_t __a)
 {
   return (uint32x2_t)__builtin_neon_vreinterpretv2siv2sf (__a);
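
For illustration (not part of the patch), a minimal consumer of the new lane
intrinsics, assuming -mfpu=neon-fp16 and an -mfp16-format setting under which
__fp16 is available:

#include <arm_neon.h>

/* vget_lane_f16/vset_lane_f16 are macros because __fp16 values cannot
   cross a function boundary, but they are used like any other lane
   intrinsic.  */
float16x4_t
double_lane0 (float16x4_t v)
{
  float16_t x = vget_lane_f16 (v, 0);   /* read lane 0 */
  return vset_lane_f16 (x + x, v, 0);   /* write back the doubled value */
}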

* [PATCH 1/16][ARM] PR/63870 Add qualifier to check lane bounds in expand
From: Alan Lawrence @ 2015-07-07 12:34 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 195 bytes --]

As per https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01333.html

(While this falls under PR/63870, and I will link to that in the ChangeLog, it 
is only a small step towards fixing that PR.)

[-- Attachment #2: 01_arm_qualifier_lane_index.patch --]
[-- Type: text/x-patch; name=01_arm_qualifier_lane_index.patch, Size: 19177 bytes --]

commit 9812db88cff20a505365f68f4065d2fbab998c9c
Author: Alan Lawrence <alan.lawrence@arm.com>
Date:   Mon Dec 8 11:04:49 2014 +0000

    ARM: Add qualifier_lane_index

diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c
index f960e0a..7f5bf87 100644
--- a/gcc/config/arm/arm-builtins.c
+++ b/gcc/config/arm/arm-builtins.c
@@ -77,7 +77,9 @@ enum arm_type_qualifiers
   /* qualifier_const_pointer | qualifier_map_mode  */
   qualifier_const_pointer_map_mode = 0x86,
   /* Polynomial types.  */
-  qualifier_poly = 0x100
+  qualifier_poly = 0x100,
+  /* Lane indices - must be within range of previous argument = a vector.  */
+  qualifier_lane_index = 0x200
 };
 
 /*  The qualifier_internal allows generation of a unary builtin from
@@ -108,21 +110,40 @@ arm_ternop_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 
 /* T (T, immediate).  */
 static enum arm_type_qualifiers
-arm_getlane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+arm_binop_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_none, qualifier_none, qualifier_immediate };
+#define BINOP_IMM_QUALIFIERS (arm_binop_imm_qualifiers)
+
+/* T (T, lane index).  */
+static enum arm_type_qualifiers
+arm_getlane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+  = { qualifier_none, qualifier_none, qualifier_lane_index };
 #define GETLANE_QUALIFIERS (arm_getlane_qualifiers)
 
 /* T (T, T, T, immediate).  */
 static enum arm_type_qualifiers
-arm_lanemac_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+arm_mac_n_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_none, qualifier_none, qualifier_none,
       qualifier_none, qualifier_immediate };
-#define LANEMAC_QUALIFIERS (arm_lanemac_qualifiers)
+#define MAC_N_QUALIFIERS (arm_mac_n_qualifiers)
+
+/* T (T, T, T, lane index).  */
+static enum arm_type_qualifiers
+arm_mac_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+  = { qualifier_none, qualifier_none, qualifier_none,
+      qualifier_none, qualifier_lane_index };
+#define MAC_LANE_QUALIFIERS (arm_mac_lane_qualifiers)
 
 /* T (T, T, immediate).  */
 static enum arm_type_qualifiers
-arm_setlane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+arm_ternop_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_none, qualifier_none, qualifier_none, qualifier_immediate };
+#define TERNOP_IMM_QUALIFIERS (arm_ternop_imm_qualifiers)
+
+/* T (T, T, lane index).  */
+static enum arm_type_qualifiers
+arm_setlane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+  = { qualifier_none, qualifier_none, qualifier_none, qualifier_lane_index };
 #define SETLANE_QUALIFIERS (arm_setlane_qualifiers)
 
 /* T (T, T).  */
@@ -1927,6 +1948,7 @@ arm_expand_unop_builtin (enum insn_code icode,
 typedef enum {
   NEON_ARG_COPY_TO_REG,
   NEON_ARG_CONSTANT,
+  NEON_ARG_LANE_INDEX,
   NEON_ARG_MEMORY,
   NEON_ARG_STOP
 } builtin_arg;
@@ -2043,6 +2065,16 @@ arm_expand_neon_args (rtx target, machine_mode map_mode, int fcode,
 		op[argc] = copy_to_mode_reg (mode[argc], op[argc]);
 	      break;
 
+	    case NEON_ARG_LANE_INDEX:
+	      /* Previous argument must be a vector, which this indexes.  */
+	      gcc_assert (argc > 0);
+	      if (CONST_INT_P (op[argc]))
+		{
+		  enum machine_mode vmode = mode[argc - 1];
+		  neon_lane_bounds (op[argc], 0, GET_MODE_NUNITS (vmode), exp);
+		}
+	      /* Fall through - if the lane index isn't a constant then
+		 the next case will error.  */
 	    case NEON_ARG_CONSTANT:
 	      if (!(*insn_data[icode].operand[opno].predicate)
 		  (op[argc], mode[argc]))
@@ -2170,7 +2202,9 @@ arm_expand_neon_builtin (int fcode, tree exp, rtx target)
       int operands_k = k - is_void;
       int expr_args_k = k - 1;
 
-      if (d->qualifiers[qualifiers_k] & qualifier_immediate)
+      if (d->qualifiers[qualifiers_k] & qualifier_lane_index)
+	args[k] = NEON_ARG_LANE_INDEX;
+      else if (d->qualifiers[qualifiers_k] & qualifier_immediate)
 	args[k] = NEON_ARG_CONSTANT;
       else if (d->qualifiers[qualifiers_k] & qualifier_maybe_immediate)
 	{
diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 62f91ef..25bdebd 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -86,7 +86,7 @@ extern void neon_pairwise_reduce (rtx, rtx, machine_mode,
 extern rtx neon_make_constant (rtx);
 extern tree arm_builtin_vectorized_function (tree, tree, tree);
 extern void neon_expand_vector_init (rtx, rtx);
-extern void neon_lane_bounds (rtx, HOST_WIDE_INT, HOST_WIDE_INT);
+extern void neon_lane_bounds (rtx, HOST_WIDE_INT, HOST_WIDE_INT, const_tree);
 extern void neon_const_bounds (rtx, HOST_WIDE_INT, HOST_WIDE_INT);
 extern HOST_WIDE_INT neon_element_bits (machine_mode);
 extern void neon_reinterpret (rtx, rtx);
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index e79a369..6e074ea 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -12788,12 +12788,12 @@ neon_expand_vector_init (rtx target, rtx vals)
 }
 
 /* Ensure OPERAND lies between LOW (inclusive) and HIGH (exclusive).  Raise
-   ERR if it doesn't.  FIXME: NEON bounds checks occur late in compilation, so
-   reported source locations are bogus.  */
+   ERR if it doesn't.  EXP indicates the source location, which includes the
+   inlining history for intrinsics.  */
 
 static void
 bounds_check (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
-	      const char *err)
+	      const_tree exp, const char *desc)
 {
   HOST_WIDE_INT lane;
 
@@ -12802,15 +12802,22 @@ bounds_check (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
   lane = INTVAL (operand);
 
   if (lane < low || lane >= high)
-    error (err);
+    {
+      if (exp)
+	error ("%K%s %lld out of range %lld - %lld",
+	       exp, desc, lane, low, high - 1);
+      else
+	error ("%s %lld out of range %lld - %lld", desc, lane, low, high - 1);
+    }
 }
 
 /* Bounds-check lanes.  */
 
 void
-neon_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high)
+neon_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
+		  const_tree exp)
 {
-  bounds_check (operand, low, high, "lane out of range");
+  bounds_check (operand, low, high, exp, "lane");
 }
 
 /* Bounds-check constants.  */
@@ -12818,7 +12825,7 @@ neon_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high)
 void
 neon_const_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high)
 {
-  bounds_check (operand, low, high, "constant out of range");
+  bounds_check (operand, low, high, NULL_TREE, "constant");
 }
 
 HOST_WIDE_INT
diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def
index f55591d..f150b98 100644
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -67,28 +67,28 @@ VAR8 (BINOP, vqshls, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
 VAR8 (BINOP, vqshlu, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
 VAR8 (BINOP, vqrshls, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
 VAR8 (BINOP, vqrshlu, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
-VAR8 (GETLANE, vshrs_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
-VAR8 (GETLANE, vshru_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
-VAR8 (GETLANE, vrshrs_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
-VAR8 (GETLANE, vrshru_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
-VAR3 (GETLANE, vshrn_n, v8hi, v4si, v2di)
-VAR3 (GETLANE, vrshrn_n, v8hi, v4si, v2di)
-VAR3 (GETLANE, vqshrns_n, v8hi, v4si, v2di)
-VAR3 (GETLANE, vqshrnu_n, v8hi, v4si, v2di)
-VAR3 (GETLANE, vqrshrns_n, v8hi, v4si, v2di)
-VAR3 (GETLANE, vqrshrnu_n, v8hi, v4si, v2di)
-VAR3 (GETLANE, vqshrun_n, v8hi, v4si, v2di)
-VAR3 (GETLANE, vqrshrun_n, v8hi, v4si, v2di)
-VAR8 (GETLANE, vshl_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
-VAR8 (GETLANE, vqshl_s_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
-VAR8 (GETLANE, vqshl_u_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
-VAR8 (GETLANE, vqshlu_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
-VAR3 (GETLANE, vshlls_n, v8qi, v4hi, v2si)
-VAR3 (GETLANE, vshllu_n, v8qi, v4hi, v2si)
-VAR8 (SETLANE, vsras_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
-VAR8 (SETLANE, vsrau_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
-VAR8 (SETLANE, vrsras_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
-VAR8 (SETLANE, vrsrau_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
+VAR8 (BINOP_IMM, vshrs_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
+VAR8 (BINOP_IMM, vshru_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
+VAR8 (BINOP_IMM, vrshrs_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
+VAR8 (BINOP_IMM, vrshru_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
+VAR3 (BINOP_IMM, vshrn_n, v8hi, v4si, v2di)
+VAR3 (BINOP_IMM, vrshrn_n, v8hi, v4si, v2di)
+VAR3 (BINOP_IMM, vqshrns_n, v8hi, v4si, v2di)
+VAR3 (BINOP_IMM, vqshrnu_n, v8hi, v4si, v2di)
+VAR3 (BINOP_IMM, vqrshrns_n, v8hi, v4si, v2di)
+VAR3 (BINOP_IMM, vqrshrnu_n, v8hi, v4si, v2di)
+VAR3 (BINOP_IMM, vqshrun_n, v8hi, v4si, v2di)
+VAR3 (BINOP_IMM, vqrshrun_n, v8hi, v4si, v2di)
+VAR8 (BINOP_IMM, vshl_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
+VAR8 (BINOP_IMM, vqshl_s_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
+VAR8 (BINOP_IMM, vqshl_u_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
+VAR8 (BINOP_IMM, vqshlu_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
+VAR3 (BINOP_IMM, vshlls_n, v8qi, v4hi, v2si)
+VAR3 (BINOP_IMM, vshllu_n, v8qi, v4hi, v2si)
+VAR8 (TERNOP_IMM, vsras_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
+VAR8 (TERNOP_IMM, vsrau_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
+VAR8 (TERNOP_IMM, vrsras_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
+VAR8 (TERNOP_IMM, vrsrau_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
 VAR2 (BINOP, vsub, v2sf, v4sf)
 VAR3 (BINOP, vsubls, v8qi, v4hi, v2si)
 VAR3 (BINOP, vsublu, v8qi, v4hi, v2si)
@@ -140,8 +140,8 @@ VAR6 (BINOP, vpadals, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
 VAR6 (BINOP, vpadalu, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
 VAR2 (BINOP, vrecps, v2sf, v4sf)
 VAR2 (BINOP, vrsqrts, v2sf, v4sf)
-VAR8 (SETLANE, vsri_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
-VAR8 (SETLANE, vsli_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
+VAR8 (TERNOP_IMM, vsri_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
+VAR8 (TERNOP_IMM, vsli_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
 VAR8 (UNOP, vabs, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf)
 VAR6 (UNOP, vqabs, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
 VAR8 (UNOP, vneg, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf)
@@ -162,7 +162,7 @@ VAR10 (SETLANE, vset_lane,
 VAR5 (UNOP, vcreate, v8qi, v4hi, v2si, v2sf, di)
 VAR10 (UNOP, vdup_n,
 	 v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
-VAR10 (BINOP, vdup_lane,
+VAR10 (GETLANE, vdup_lane,
 	 v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
 VAR5 (COMBINE, vcombine, v8qi, v4hi, v2si, v2sf, di)
 VAR5 (UNOP, vget_high, v16qi, v8hi, v4si, v4sf, v2di)
@@ -174,23 +174,23 @@ VAR3 (UNOP, vqmovun, v8hi, v4si, v2di)
 VAR3 (UNOP, vmovls, v8qi, v4hi, v2si)
 VAR3 (UNOP, vmovlu, v8qi, v4hi, v2si)
 VAR6 (SETLANE, vmul_lane, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
-VAR6 (LANEMAC, vmla_lane, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
-VAR2 (LANEMAC, vmlals_lane, v4hi, v2si)
-VAR2 (LANEMAC, vmlalu_lane, v4hi, v2si)
-VAR2 (LANEMAC, vqdmlal_lane, v4hi, v2si)
-VAR6 (LANEMAC, vmls_lane, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
-VAR2 (LANEMAC, vmlsls_lane, v4hi, v2si)
-VAR2 (LANEMAC, vmlslu_lane, v4hi, v2si)
-VAR2 (LANEMAC, vqdmlsl_lane, v4hi, v2si)
+VAR6 (MAC_LANE, vmla_lane, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
+VAR2 (MAC_LANE, vmlals_lane, v4hi, v2si)
+VAR2 (MAC_LANE, vmlalu_lane, v4hi, v2si)
+VAR2 (MAC_LANE, vqdmlal_lane, v4hi, v2si)
+VAR6 (MAC_LANE, vmls_lane, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
+VAR2 (MAC_LANE, vmlsls_lane, v4hi, v2si)
+VAR2 (MAC_LANE, vmlslu_lane, v4hi, v2si)
+VAR2 (MAC_LANE, vqdmlsl_lane, v4hi, v2si)
 VAR6 (BINOP, vmul_n, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
-VAR6 (LANEMAC, vmla_n, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
-VAR2 (LANEMAC, vmlals_n, v4hi, v2si)
-VAR2 (LANEMAC, vmlalu_n, v4hi, v2si)
-VAR2 (LANEMAC, vqdmlal_n, v4hi, v2si)
-VAR6 (LANEMAC, vmls_n, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
-VAR2 (LANEMAC, vmlsls_n, v4hi, v2si)
-VAR2 (LANEMAC, vmlslu_n, v4hi, v2si)
-VAR2 (LANEMAC, vqdmlsl_n, v4hi, v2si)
+VAR6 (MAC_N, vmla_n, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
+VAR2 (MAC_N, vmlals_n, v4hi, v2si)
+VAR2 (MAC_N, vmlalu_n, v4hi, v2si)
+VAR2 (MAC_N, vqdmlal_n, v4hi, v2si)
+VAR6 (MAC_N, vmls_n, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
+VAR2 (MAC_N, vmlsls_n, v4hi, v2si)
+VAR2 (MAC_N, vmlslu_n, v4hi, v2si)
+VAR2 (MAC_N, vqdmlsl_n, v4hi, v2si)
 VAR10 (SETLANE, vext,
 	 v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
 VAR8 (UNOP, vrev64, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf)
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 654d9d5..4af74ce 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -2663,8 +2663,6 @@
    (match_operand:SI 2 "immediate_operand" "")]
   "TARGET_NEON"
 {
-  neon_lane_bounds (operands[2], 0, GET_MODE_NUNITS (<MODE>mode));
-
   if (BYTES_BIG_ENDIAN)
     {
       /* The intrinsics are defined in terms of a model where the
@@ -2694,8 +2692,6 @@
    (match_operand:SI 2 "immediate_operand" "")]
   "TARGET_NEON"
 {
-  neon_lane_bounds (operands[2], 0, GET_MODE_NUNITS (<MODE>mode));
-
   if (BYTES_BIG_ENDIAN)
     {
       /* The intrinsics are defined in terms of a model where the
@@ -2725,7 +2721,6 @@
    (match_operand:SI 2 "immediate_operand" "")]
   "TARGET_NEON"
 {
-  neon_lane_bounds (operands[2], 0, 1);
   emit_move_insn (operands[0], operands[1]);
   DONE;
 })
@@ -2736,18 +2731,11 @@
    (match_operand:SI 2 "immediate_operand" "")]
   "TARGET_NEON"
 {
-  switch (INTVAL (operands[2]))
-    {
-    case 0:
-      emit_move_insn (operands[0], gen_lowpart (DImode, operands[1]));
-      break;
-    case 1:
-      emit_move_insn (operands[0], gen_highpart (DImode, operands[1]));
-      break;
-    default:
-      neon_lane_bounds (operands[2], 0, 1);
-      FAIL;
-    }
+  int lane = INTVAL (operands[2]);
+  gcc_assert ((lane == 0) || (lane == 1));
+  emit_move_insn (operands[0], lane == 0
+				? gen_lowpart (DImode, operands[1])
+				: gen_highpart (DImode, operands[1]));
   DONE;
 })
 
@@ -2759,7 +2747,6 @@
   "TARGET_NEON"
 {
   unsigned int elt = INTVAL (operands[3]);
-  neon_lane_bounds (operands[3], 0, GET_MODE_NUNITS (<MODE>mode));
 
   if (BYTES_BIG_ENDIAN)
     {
@@ -2782,7 +2769,6 @@
    (match_operand:SI 3 "immediate_operand" "i")]
   "TARGET_NEON"
 {
-  neon_lane_bounds (operands[3], 0, 1);
   emit_move_insn (operands[0], operands[1]);
   DONE;
 })
@@ -2864,7 +2850,6 @@
    (match_operand:SI 2 "immediate_operand" "i")]
   "TARGET_NEON"
 {
-  neon_lane_bounds (operands[2], 0, GET_MODE_NUNITS (<V_double_vector_mode>mode));
   if (BYTES_BIG_ENDIAN)
     {
       unsigned int elt = INTVAL (operands[2]);
@@ -2885,7 +2870,6 @@
    (match_operand:SI 2 "immediate_operand" "i")]
   "TARGET_NEON"
 {
-  neon_lane_bounds (operands[2], 0, 1);
   emit_move_insn (operands[0], operands[1]);
   DONE;
 })
@@ -2897,7 +2881,6 @@
    (match_operand:SI 2 "immediate_operand" "i")]
   "TARGET_NEON"
 {
-  neon_lane_bounds (operands[2], 0, 1);
   emit_insn (gen_neon_vdup_nv2di (operands[0], operands[1]));
   DONE;
 })
@@ -3097,7 +3080,6 @@
                     UNSPEC_VMUL_LANE))]
   "TARGET_NEON"
 {
-  neon_lane_bounds (operands[3], 0, GET_MODE_NUNITS (<MODE>mode));
   return "vmul.<V_if_elem>\t%P0, %P1, %P2[%c3]";
 }
   [(set (attr "type")
@@ -3115,7 +3097,6 @@
                     UNSPEC_VMUL_LANE))]
   "TARGET_NEON"
 {
-  neon_lane_bounds (operands[3], 0, GET_MODE_NUNITS (<V_HALF>mode));
   return "vmul.<V_if_elem>\t%q0, %q1, %P2[%c3]";
 }
   [(set (attr "type")
@@ -3133,7 +3114,6 @@
                           VMULL_LANE))]
   "TARGET_NEON"
 {
-  neon_lane_bounds (operands[3], 0, GET_MODE_NUNITS (<MODE>mode));
   return "vmull.<sup>%#<V_sz_elem>\t%q0, %P1, %P2[%c3]";
 }
   [(set_attr "type" "neon_mul_<V_elem_ch>_scalar_long")]
@@ -3148,7 +3128,6 @@
                           UNSPEC_VQDMULL_LANE))]
   "TARGET_NEON"
 {
-  neon_lane_bounds (operands[3], 0, GET_MODE_NUNITS (<MODE>mode));
   return "vqdmull.<V_s_elem>\t%q0, %P1, %P2[%c3]";
 }
   [(set_attr "type" "neon_sat_mul_<V_elem_ch>_scalar_long")]
@@ -3163,7 +3142,6 @@
                       VQDMULH_LANE))]
   "TARGET_NEON"
 {
-  neon_lane_bounds (operands[3], 0, GET_MODE_NUNITS (<MODE>mode));
   return "vq<r>dmulh.<V_s_elem>\t%q0, %q1, %P2[%c3]";
 }
   [(set_attr "type" "neon_sat_mul_<V_elem_ch>_scalar_q")]
@@ -3178,7 +3156,6 @@
                       VQDMULH_LANE))]
   "TARGET_NEON"
 {
-  neon_lane_bounds (operands[3], 0, GET_MODE_NUNITS (<MODE>mode));
   return "vq<r>dmulh.<V_s_elem>\t%P0, %P1, %P2[%c3]";
 }
   [(set_attr "type" "neon_sat_mul_<V_elem_ch>_scalar_q")]
@@ -3194,7 +3171,6 @@
                      UNSPEC_VMLA_LANE))]
   "TARGET_NEON"
 {
-  neon_lane_bounds (operands[4], 0, GET_MODE_NUNITS (<MODE>mode));
   return "vmla.<V_if_elem>\t%P0, %P2, %P3[%c4]";
 }
   [(set (attr "type")
@@ -3213,7 +3189,6 @@
                      UNSPEC_VMLA_LANE))]
   "TARGET_NEON"
 {
-  neon_lane_bounds (operands[4], 0, GET_MODE_NUNITS (<MODE>mode));
   return "vmla.<V_if_elem>\t%q0, %q2, %P3[%c4]";
 }
   [(set (attr "type")
@@ -3232,7 +3207,6 @@
                           VMLAL_LANE))]
   "TARGET_NEON"
 {
-  neon_lane_bounds (operands[4], 0, GET_MODE_NUNITS (<MODE>mode));
   return "vmlal.<sup>%#<V_sz_elem>\t%q0, %P2, %P3[%c4]";
 }
   [(set_attr "type" "neon_mla_<V_elem_ch>_scalar_long")]
@@ -3248,7 +3222,6 @@
                           UNSPEC_VQDMLAL_LANE))]
   "TARGET_NEON"
 {
-  neon_lane_bounds (operands[4], 0, GET_MODE_NUNITS (<MODE>mode));
   return "vqdmlal.<V_s_elem>\t%q0, %P2, %P3[%c4]";
 }
   [(set_attr "type" "neon_sat_mla_<V_elem_ch>_scalar_long")]
@@ -3264,7 +3237,6 @@
                     UNSPEC_VMLS_LANE))]
   "TARGET_NEON"
 {
-  neon_lane_bounds (operands[4], 0, GET_MODE_NUNITS (<MODE>mode));
   return "vmls.<V_if_elem>\t%P0, %P2, %P3[%c4]";
 }
   [(set (attr "type")
@@ -3283,7 +3255,6 @@
                     UNSPEC_VMLS_LANE))]
   "TARGET_NEON"
 {
-  neon_lane_bounds (operands[4], 0, GET_MODE_NUNITS (<MODE>mode));
   return "vmls.<V_if_elem>\t%q0, %q2, %P3[%c4]";
 }
   [(set (attr "type")
@@ -3302,7 +3273,6 @@
                           VMLSL_LANE))]
   "TARGET_NEON"
 {
-  neon_lane_bounds (operands[4], 0, GET_MODE_NUNITS (<MODE>mode));
   return "vmlsl.<sup>%#<V_sz_elem>\t%q0, %P2, %P3[%c4]";
 }
   [(set_attr "type" "neon_mla_<V_elem_ch>_scalar_long")]
@@ -3318,7 +3288,6 @@
                           UNSPEC_VQDMLSL_LANE))]
   "TARGET_NEON"
 {
-  neon_lane_bounds (operands[4], 0, GET_MODE_NUNITS (<MODE>mode));
   return "vqdmlsl.<V_s_elem>\t%q0, %P2, %P3[%c4]";
 }
   [(set_attr "type" "neon_sat_mla_<V_elem_ch>_scalar_long")]
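
For illustration (not part of the patch), the user-visible effect: a constant
lane index that is out of range is now diagnosed at the intrinsic call site,
instead of with a bogus location late in compilation. A hypothetical misuse:

#include <arm_neon.h>

int32_t
get_bad_lane (int32x2_t v)
{
  /* int32x2_t has only lanes 0 and 1, so with this patch the compiler
     reports something like "lane 2 out of range 0 - 1" here.  */
  return vget_lane_s32 (v, 2);
}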

* [PATCH 2/16][ARM] PR/63870 Add __builtin_arm_lane_check.
From: Alan Lawrence @ 2015-07-07 12:34 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 65 bytes --]

As per https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01334.html

[-- Attachment #2: 02_arm_builtin_arm_lane_check.patch --]
[-- Type: text/x-patch; name=02_arm_builtin_arm_lane_check.patch, Size: 2965 bytes --]

commit 1bb1b208a2c8c8b1ee1186c6128a498583fd64fe
Author: Alan Lawrence <alan.lawrence@arm.com>
Date:   Mon Dec 8 18:36:30 2014 +0000

    Add __builtin_arm_lane_check

diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c
index 7f5bf87..89b1b0c 100644
--- a/gcc/config/arm/arm-builtins.c
+++ b/gcc/config/arm/arm-builtins.c
@@ -534,12 +534,16 @@ enum arm_builtins
 #undef CRYPTO2
 #undef CRYPTO3
 
+  ARM_BUILTIN_NEON_BASE,
+  ARM_BUILTIN_NEON_LANE_CHECK = ARM_BUILTIN_NEON_BASE,
+
 #include "arm_neon_builtins.def"
 
   ARM_BUILTIN_MAX
 };
 
-#define ARM_BUILTIN_NEON_BASE (ARM_BUILTIN_MAX - ARRAY_SIZE (neon_builtin_data))
+#define ARM_BUILTIN_NEON_PATTERN_START \
+    (ARM_BUILTIN_MAX - ARRAY_SIZE (neon_builtin_data))
 
 #undef CF
 #undef VAR1
@@ -898,7 +902,7 @@ arm_init_simd_builtin_scalar_types (void)
 static void
 arm_init_neon_builtins (void)
 {
-  unsigned int i, fcode = ARM_BUILTIN_NEON_BASE;
+  unsigned int i, fcode = ARM_BUILTIN_NEON_PATTERN_START;
 
   arm_init_simd_builtin_types ();
 
@@ -908,6 +912,15 @@ arm_init_neon_builtins (void)
      system.  */
   arm_init_simd_builtin_scalar_types ();
 
+  tree lane_check_fpr = build_function_type_list (void_type_node,
+						  intSI_type_node,
+						  intSI_type_node,
+						  NULL);
+  arm_builtin_decls[ARM_BUILTIN_NEON_LANE_CHECK] =
+      add_builtin_function ("__builtin_arm_lane_check", lane_check_fpr,
+			    ARM_BUILTIN_NEON_LANE_CHECK, BUILT_IN_MD,
+			    NULL, NULL_TREE);
+
   for (i = 0; i < ARRAY_SIZE (neon_builtin_data); i++, fcode++)
     {
       bool print_type_signature_p = false;
@@ -2171,14 +2184,28 @@ arm_expand_neon_args (rtx target, machine_mode map_mode, int fcode,
   return target;
 }
 
-/* Expand a Neon builtin. These are "special" because they don't have symbolic
+/* Expand a Neon builtin, i.e. those registered only if TARGET_NEON holds.
+   Most of these are "special" because they don't have symbolic
    constants defined per-instruction or per instruction-variant. Instead, the
    required info is looked up in the table neon_builtin_data.  */
 static rtx
 arm_expand_neon_builtin (int fcode, tree exp, rtx target)
 {
+  if (fcode == ARM_BUILTIN_NEON_LANE_CHECK)
+    {
+      tree nlanes = CALL_EXPR_ARG (exp, 0);
+      gcc_assert (TREE_CODE (nlanes) == INTEGER_CST);
+      rtx lane_idx = expand_normal (CALL_EXPR_ARG (exp, 1));
+      if (CONST_INT_P (lane_idx))
+	neon_lane_bounds (lane_idx, 0, TREE_INT_CST_LOW (nlanes), exp);
+      else
+	error ("%Klane index must be a constant immediate", exp);
+      /* Don't generate any RTL.  */
+      return const0_rtx;
+    }
+
   neon_builtin_datum *d =
-		&neon_builtin_data[fcode - ARM_BUILTIN_NEON_BASE];
+		&neon_builtin_data[fcode - ARM_BUILTIN_NEON_PATTERN_START];
   enum insn_code icode = d->code;
   builtin_arg args[SIMD_MAX_BUILTIN_ARGS];
   int num_args = insn_data[d->code].n_operands;
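
For illustration (not part of the patch), the three behaviours of the new
builtin in a hypothetical translation unit compiled with NEON enabled:

void
lane_check_examples (int runtime_idx)
{
  __builtin_arm_lane_check (4, 3);           /* in range: accepted, no RTL emitted */
  __builtin_arm_lane_check (4, 4);           /* error: lane 4 out of range 0 - 3 */
  __builtin_arm_lane_check (4, runtime_idx); /* error: lane index must be a constant immediate */
}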

* [PATCH 4/16][ARM] Add float16x8_t type
From: Alan Lawrence @ 2015-07-07 12:34 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 74 bytes --]

Unchanged since https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01336.html

[-- Attachment #2: 04_arm_v8hf.patch --]
[-- Type: text/x-patch; name=04_arm_v8hf.patch, Size: 3229 bytes --]

commit b9ccac6243415b304024443b74bdc97b3a5954f2
Author: Alan Lawrence <alan.lawrence@arm.com>
Date:   Mon Dec 8 18:40:24 2014 +0000

    Add float16x8_t + V8HFmode support (regardless of -mfp16-format)

diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c
index 89b1b0c..17e39d8 100644
--- a/gcc/config/arm/arm-builtins.c
+++ b/gcc/config/arm/arm-builtins.c
@@ -192,6 +192,7 @@ arm_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 #define di_UP    DImode
 #define v16qi_UP V16QImode
 #define v8hi_UP  V8HImode
+#define v8hf_UP  V8HFmode
 #define v4si_UP  V4SImode
 #define v4sf_UP  V4SFmode
 #define v2di_UP  V2DImode
@@ -827,6 +828,7 @@ arm_init_simd_builtin_types (void)
   /* Continue with standard types.  */
   arm_simd_types[Float16x4_t].eltype = arm_simd_floatHF_type_node;
   arm_simd_types[Float32x2_t].eltype = float_type_node;
+  arm_simd_types[Float16x8_t].eltype = arm_simd_floatHF_type_node;
   arm_simd_types[Float32x4_t].eltype = float_type_node;
 
   for (i = 0; i < nelts; i++)
diff --git a/gcc/config/arm/arm-simd-builtin-types.def b/gcc/config/arm/arm-simd-builtin-types.def
index bcbd20b..b178ae6 100644
--- a/gcc/config/arm/arm-simd-builtin-types.def
+++ b/gcc/config/arm/arm-simd-builtin-types.def
@@ -44,5 +44,7 @@
 
   ENTRY (Float16x4_t, V4HF, none, 64, float16, 18)
   ENTRY (Float32x2_t, V2SF, none, 64, float32, 18)
+
+  ENTRY (Float16x8_t, V8HF, none, 128, float16, 19)
   ENTRY (Float32x4_t, V4SF, none, 128, float32, 19)
 
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 6e074ea..0faa46c 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -26251,7 +26251,8 @@ arm_vector_mode_supported_p (machine_mode mode)
 {
   /* Neon also supports V2SImode, etc. listed in the clause below.  */
   if (TARGET_NEON && (mode == V2SFmode || mode == V4SImode || mode == V8HImode
-      || mode == V4HFmode || mode == V16QImode || mode == V4SFmode || mode == V2DImode))
+      || mode == V4HFmode || mode == V16QImode || mode == V4SFmode
+      || mode == V2DImode || mode == V8HFmode))
     return true;
 
   if ((TARGET_NEON || TARGET_IWMMXT)
diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
index 373dc85..c0a83b2 100644
--- a/gcc/config/arm/arm.h
+++ b/gcc/config/arm/arm.h
@@ -999,7 +999,7 @@ extern int arm_arch_crc;
 /* Modes valid for Neon Q registers.  */
 #define VALID_NEON_QREG_MODE(MODE) \
   ((MODE) == V4SImode || (MODE) == V8HImode || (MODE) == V16QImode \
-   || (MODE) == V4SFmode || (MODE) == V2DImode)
+   || (MODE) == V8HFmode || (MODE) == V4SFmode || (MODE) == V2DImode)
 
 /* Structure modes valid for Neon registers.  */
 #define VALID_NEON_STRUCT_MODE(MODE) \
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index b4100c8..a958f63 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -58,6 +58,7 @@ typedef __simd128_int8_t int8x16_t;
 typedef __simd128_int16_t int16x8_t;
 typedef __simd128_int32_t int32x4_t;
 typedef __simd128_int64_t int64x2_t;
+typedef __simd128_float16_t float16x8_t;
 typedef __simd128_float32_t float32x4_t;
 typedef __simd128_poly8_t poly8x16_t;
 typedef __simd128_poly16_t poly16x8_t;
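
For illustration (not part of the patch): with V8HFmode supported, the vector
type itself becomes usable wherever NEON is enabled, independent of
-mfp16-format, since no __fp16 scalars are involved:

#include <arm_neon.h>

/* float16x8_t values can be declared, passed and returned as whole
   vectors; intrinsics that operate on them arrive in the next patch.  */
float16x8_t
pass_through (float16x8_t x)
{
  return x;
}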

* [PATCH 5/16][ARM] Add float16x8_t intrinsics
From: Alan Lawrence @ 2015-07-07 12:35 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 65 bytes --]

As per https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01337.html

[-- Attachment #2: 05_arm_float16x8.pat --]
[-- Type: text/plain, Size: 10760 bytes --]

commit 336eb16d3061131fe8d28fad4a473d00768bfe5c
Author: Alan Lawrence <alan.lawrence@arm.com>
Date:   Tue Dec 9 15:06:38 2014 +0000

    ARM float16x8_t intrinsics (v2 - fix v[sg]etq_lane_f16, add vreinterpretq_p16_f16, no vdup_n/lane/vmov_n)

diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index a958f63..db73c70 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -5282,6 +5282,15 @@ vgetq_lane_s32 (int32x4_t __a, const int __b)
   return (int32_t)__builtin_neon_vget_lanev4si (__a, __b);
 }
 
+#define vgetq_lane_f16(__v, __idx)		\
+  __extension__					\
+    ({						\
+      float16x8_t __vec = (__v);		\
+      __builtin_arm_lane_check (8, __idx);	\
+      float16_t __res = __vec[__idx];		\
+      __res;					\
+    })
+
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vgetq_lane_f32 (float32x4_t __a, const int __b)
 {
@@ -5424,6 +5433,16 @@ vsetq_lane_s32 (int32_t __a, int32x4_t __b, const int __c)
   return (int32x4_t)__builtin_neon_vset_lanev4si ((__builtin_neon_si) __a, __b, __c);
 }
 
+#define vsetq_lane_f16(__e, __v, __idx)		\
+  __extension__					\
+    ({						\
+      float16_t __elem = (__e);			\
+      float16x8_t __vec = (__v);		\
+      __builtin_arm_lane_check (8, __idx);	\
+      __vec[__idx] = __elem;			\
+      __vec;					\
+    })
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vsetq_lane_f32 (float32_t __a, float32x4_t __b, const int __c)
 {
@@ -8907,6 +8926,12 @@ vld1q_lane_s32 (const int32_t * __a, int32x4_t __b, const int __c)
   return (int32x4_t)__builtin_neon_vld1_lanev4si ((const __builtin_neon_si *) __a, __b, __c);
 }
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vld1q_lane_f16 (const float16_t * __a, float16x8_t __b, const int __c)
+{
+  return vsetq_lane_f16 (*__a, __b, __c);
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vld1q_lane_f32 (const float32_t * __a, float32x4_t __b, const int __c)
 {
@@ -9062,6 +9087,13 @@ vld1q_dup_s32 (const int32_t * __a)
   return (int32x4_t)__builtin_neon_vld1_dupv4si ((const __builtin_neon_si *) __a);
 }
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vld1q_dup_f16 (const float16_t * __a)
+{
+  float16_t __f = *__a;
+  return (float16x8_t) { __f, __f, __f, __f, __f, __f, __f, __f };
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vld1q_dup_f32 (const float32_t * __a)
 {
@@ -12856,6 +12888,12 @@ vreinterpretq_p8_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_p8_f16 (float16x8_t __a)
+{
+  return (poly8x16_t) __a;
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
 vreinterpretq_p8_f32 (float32x4_t __a)
 {
   return (poly8x16_t)__builtin_neon_vreinterpretv16qiv4sf (__a);
@@ -12932,6 +12970,12 @@ vreinterpretq_p16_p8 (poly8x16_t __a)
 }
 
 __extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_p16_f16 (float16x8_t __a)
+{
+  return (poly16x8_t) __a;
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
 vreinterpretq_p16_f32 (float32x4_t __a)
 {
   return (poly16x8_t)__builtin_neon_vreinterpretv8hiv4sf (__a);
@@ -13001,6 +13045,88 @@ vreinterpretq_p16_u32 (uint32x4_t __a)
   return (poly16x8_t)__builtin_neon_vreinterpretv8hiv4si ((int32x4_t) __a);
 }
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_p8 (poly8x16_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_p16 (poly16x8_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_f32 (float32x4_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+#ifdef __ARM_FEATURE_CRYPTO
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_p64 (poly64x2_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+#endif
+#ifdef __ARM_FEATURE_CRYPTO
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_p128 (poly128_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+#endif
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_s64 (int64x2_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_u64 (uint64x2_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_s8 (int8x16_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_s16 (int16x8_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_s32 (int32x4_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_u8 (uint8x16_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_u16 (uint16x8_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_u32 (uint32x4_t __a)
+{
+  return (float16x8_t) __a;
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vreinterpretq_f32_p8 (poly8x16_t __a)
 {
@@ -13013,6 +13139,12 @@ vreinterpretq_f32_p16 (poly16x8_t __a)
   return (float32x4_t)__builtin_neon_vreinterpretv4sfv8hi ((int16x8_t) __a);
 }
 
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_f32_f16 (float16x8_t __a)
+{
+  return (float32x4_t) __a;
+}
+
 #ifdef __ARM_FEATURE_CRYPTO
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vreinterpretq_f32_p64 (poly64x2_t __a)
@@ -13095,6 +13227,14 @@ vreinterpretq_p64_p16 (poly16x8_t __a)
 #endif
 #ifdef __ARM_FEATURE_CRYPTO
 __extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_p64_f16 (float16x8_t __a)
+{
+  return (poly64x2_t) __a;
+}
+
+#endif
+#ifdef __ARM_FEATURE_CRYPTO
+__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
 vreinterpretq_p64_f32 (float32x4_t __a)
 {
   return (poly64x2_t)__builtin_neon_vreinterpretv2div4sf (__a);
@@ -13191,6 +13331,14 @@ vreinterpretq_p128_p16 (poly16x8_t __a)
 #endif
 #ifdef __ARM_FEATURE_CRYPTO
 __extension__ static __inline poly128_t __attribute__ ((__always_inline__))
+vreinterpretq_p128_f16 (float16x8_t __a)
+{
+  return (poly128_t) __a;
+}
+
+#endif
+#ifdef __ARM_FEATURE_CRYPTO
+__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
 vreinterpretq_p128_f32 (float32x4_t __a)
 {
   return (poly128_t)__builtin_neon_vreinterprettiv4sf (__a);
@@ -13282,6 +13430,12 @@ vreinterpretq_s64_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_s64_f16 (float16x8_t __a)
+{
+  return (int64x2_t) __a;
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
 vreinterpretq_s64_f32 (float32x4_t __a)
 {
   return (int64x2_t)__builtin_neon_vreinterpretv2div4sf (__a);
@@ -13358,6 +13512,12 @@ vreinterpretq_u64_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_u64_f16 (float16x8_t __a)
+{
+  return (uint64x2_t) __a;
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
 vreinterpretq_u64_f32 (float32x4_t __a)
 {
   return (uint64x2_t)__builtin_neon_vreinterpretv2div4sf (__a);
@@ -13434,6 +13594,12 @@ vreinterpretq_s8_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_s8_f16 (float16x8_t __a)
+{
+  return (int8x16_t) __a;
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vreinterpretq_s8_f32 (float32x4_t __a)
 {
   return (int8x16_t)__builtin_neon_vreinterpretv16qiv4sf (__a);
@@ -13510,6 +13676,12 @@ vreinterpretq_s16_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_s16_f16 (float16x8_t __a)
+{
+  return (int16x8_t) __a;
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
 vreinterpretq_s16_f32 (float32x4_t __a)
 {
   return (int16x8_t)__builtin_neon_vreinterpretv8hiv4sf (__a);
@@ -13586,6 +13758,12 @@ vreinterpretq_s32_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_s32_f16 (float16x8_t __a)
+{
+  return (int32x4_t)__builtin_neon_vreinterpretv4siv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
 vreinterpretq_s32_f32 (float32x4_t __a)
 {
   return (int32x4_t)__builtin_neon_vreinterpretv4siv4sf (__a);
@@ -13662,6 +13840,12 @@ vreinterpretq_u8_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_u8_f16 (float16x8_t __a)
+{
+  return (uint8x16_t) __a;
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
 vreinterpretq_u8_f32 (float32x4_t __a)
 {
   return (uint8x16_t)__builtin_neon_vreinterpretv16qiv4sf (__a);
@@ -13738,6 +13922,12 @@ vreinterpretq_u16_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_u16_f16 (float16x8_t __a)
+{
+  return (uint16x8_t) __a;
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
 vreinterpretq_u16_f32 (float32x4_t __a)
 {
   return (uint16x8_t)__builtin_neon_vreinterpretv8hiv4sf (__a);
@@ -13814,6 +14004,12 @@ vreinterpretq_u32_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_u32_f16 (float16x8_t __a)
+{
+  return (uint32x4_t) __a;
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
 vreinterpretq_u32_f32 (float32x4_t __a)
 {
   return (uint32x4_t)__builtin_neon_vreinterpretv4siv4sf (__a);

* [PATCH 6/16][ARM] Remaining float16 intrinsics: vld..., vst..., vget_low/high, vcombine
From: Alan Lawrence @ 2015-07-07 12:35 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 65 bytes --]

As per https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01341.html

[-- Attachment #2: 06_arm_iterators.patch --]
[-- Type: text/x-patch; name=06_arm_iterators.patch, Size: 48196 bytes --]

commit ae6264b144d25fadcbf219e68ddf3d8c5f40be34
Author: Alan Lawrence <alan.lawrence@arm.com>
Date:   Thu Dec 11 11:53:59 2014 +0000

    ARM 4/4 v2: v(ld|st)[234](q?|_lane|_dup), vcombine, vget_(low|high) (v2 w/ V_uf_sclr)
    
    All are tied together with so many iterators!
    
    Also vec_extract
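
For illustration (not part of the patch), splitting and re-joining a
q-register float16 vector with the new intrinsics, assuming -mfpu=neon-fp16:

#include <arm_neon.h>

/* vget_low_f16/vget_high_f16 extract the two d-register halves of a
   q-register vector; vcombine_f16 joins two halves back together.  */
float16x8_t
swap_halves (float16x8_t v)
{
  float16x4_t lo = vget_low_f16 (v);
  float16x4_t hi = vget_high_f16 (v);
  return vcombine_f16 (hi, lo);
}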

diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c
index 17e39d8..1ee0a3d 100644
--- a/gcc/config/arm/arm-builtins.c
+++ b/gcc/config/arm/arm-builtins.c
@@ -241,6 +241,12 @@ typedef struct {
 #define VAR10(T, N, A, B, C, D, E, F, G, H, I, J) \
   VAR9 (T, N, A, B, C, D, E, F, G, H, I) \
   VAR1 (T, N, J)
+#define VAR11(T, N, A, B, C, D, E, F, G, H, I, J, K) \
+  VAR10 (T, N, A, B, C, D, E, F, G, H, I, J) \
+  VAR1 (T, N, K)
+#define VAR12(T, N, A, B, C, D, E, F, G, H, I, J, K, L) \
+  VAR11 (T, N, A, B, C, D, E, F, G, H, I, J, K) \
+  VAR1 (T, N, L)
 
 /* The NEON builtin data can be found in arm_neon_builtins.def.
    The mode entries in the following table correspond to the "key" type of the
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index db73c70..93fb44f 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -162,6 +162,16 @@ typedef struct uint64x2x2_t
   uint64x2_t val[2];
 } uint64x2x2_t;
 
+typedef struct float16x4x2_t
+{
+  float16x4_t val[2];
+} float16x4x2_t;
+
+typedef struct float16x8x2_t
+{
+  float16x8_t val[2];
+} float16x8x2_t;
+
 typedef struct float32x2x2_t
 {
   float32x2_t val[2];
@@ -288,6 +298,16 @@ typedef struct uint64x2x3_t
   uint64x2_t val[3];
 } uint64x2x3_t;
 
+typedef struct float16x4x3_t
+{
+  float16x4_t val[3];
+} float16x4x3_t;
+
+typedef struct float16x8x3_t
+{
+  float16x8_t val[3];
+} float16x8x3_t;
+
 typedef struct float32x2x3_t
 {
   float32x2_t val[3];
@@ -414,6 +434,16 @@ typedef struct uint64x2x4_t
   uint64x2_t val[4];
 } uint64x2x4_t;
 
+typedef struct float16x4x4_t
+{
+  float16x4_t val[4];
+} float16x4x4_t;
+
+typedef struct float16x8x4_t
+{
+  float16x8_t val[4];
+} float16x8x4_t;
+
 typedef struct float32x2x4_t
 {
   float32x2_t val[4];
@@ -6031,6 +6061,12 @@ vcombine_s64 (int64x1_t __a, int64x1_t __b)
   return (int64x2_t)__builtin_neon_vcombinedi (__a, __b);
 }
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vcombine_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_neon_vcombinev4hf (__a, __b);
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vcombine_f32 (float32x2_t __a, float32x2_t __b)
 {
@@ -6105,6 +6141,12 @@ vget_high_s64 (int64x2_t __a)
   return (int64x1_t)__builtin_neon_vget_highv2di (__a);
 }
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vget_high_f16 (float16x8_t __a)
+{
+  return __builtin_neon_vget_highv8hf (__a);
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vget_high_f32 (float32x4_t __a)
 {
@@ -6165,6 +6207,12 @@ vget_low_s32 (int32x4_t __a)
   return (int32x2_t)__builtin_neon_vget_lowv4si (__a);
 }
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vget_low_f16 (float16x8_t __a)
+{
+  return __builtin_neon_vget_lowv8hf (__a);
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vget_low_f32 (float32x4_t __a)
 {
@@ -8712,6 +8760,12 @@ vld1_s64 (const int64_t * __a)
   return (int64x1_t)__builtin_neon_vld1di ((const __builtin_neon_di *) __a);
 }
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vld1_f16 (const float16_t * __a)
+{
+  return __builtin_neon_vld1v4hf ((const __builtin_neon_hf *) __a);
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vld1_f32 (const float32_t * __a)
 {
@@ -8786,6 +8840,12 @@ vld1q_s64 (const int64_t * __a)
   return (int64x2_t)__builtin_neon_vld1v2di ((const __builtin_neon_di *) __a);
 }
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vld1q_f16 (const float16_t * __a)
+{
+  return __builtin_neon_vld1v8hf ((const __builtin_neon_hf *) __a);
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vld1q_f32 (const float32_t * __a)
 {
@@ -9183,6 +9243,12 @@ vst1_s64 (int64_t * __a, int64x1_t __b)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_f16 (float16_t * __a, float16x4_t __b)
+{
+  __builtin_neon_vst1v4hf ((__builtin_neon_hf *) __a, __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst1_f32 (float32_t * __a, float32x2_t __b)
 {
   __builtin_neon_vst1v2sf ((__builtin_neon_sf *) __a, __b);
@@ -9257,6 +9323,12 @@ vst1q_s64 (int64_t * __a, int64x2_t __b)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_f16 (float16_t * __a, float16x8_t __b)
+{
+  __builtin_neon_vst1v8hf ((__builtin_neon_hf *) __a, __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst1q_f32 (float32_t * __a, float32x4_t __b)
 {
   __builtin_neon_vst1v4sf ((__builtin_neon_sf *) __a, __b);
@@ -9317,6 +9389,12 @@ vst1_lane_s32 (int32_t * __a, int32x2_t __b, const int __c)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_lane_f16 (float16_t * __a, float16x4_t __b, const int __c)
+{
+  __builtin_neon_vst1_lanev4hf ((__builtin_neon_hf *) __a, __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst1_lane_f32 (float32_t * __a, float32x2_t __b, const int __c)
 {
   __builtin_neon_vst1_lanev2sf ((__builtin_neon_sf *) __a, __b, __c);
@@ -9391,6 +9469,12 @@ vst1q_lane_s32 (int32_t * __a, int32x4_t __b, const int __c)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_lane_f16 (float16_t * __a, float16x8_t __b, const int __c)
+{
+  __builtin_neon_vst1_lanev8hf ((__builtin_neon_hf *) __a, __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst1q_lane_f32 (float32_t * __a, float32x4_t __b, const int __c)
 {
   __builtin_neon_vst1_lanev4sf ((__builtin_neon_sf *) __a, __b, __c);
@@ -9470,6 +9554,14 @@ vld2_s32 (const int32_t * __a)
   return __rv.__i;
 }
 
+__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
+vld2_f16 (const float16_t * __a)
+{
+  union { float16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2v4hf ((const __builtin_neon_hf *) __a);
+  return __rv.__i;
+}
+
 __extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
 vld2_f32 (const float32_t * __a)
 {
@@ -9568,6 +9660,14 @@ vld2q_s32 (const int32_t * __a)
   return __rv.__i;
 }
 
+__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__))
+vld2q_f16 (const float16_t * __a)
+{
+  union { float16x8x2_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld2v8hf ((const __builtin_neon_hf *) __a);
+  return __rv.__i;
+}
+
 __extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
 vld2q_f32 (const float32_t * __a)
 {
@@ -9643,6 +9743,16 @@ vld2_lane_s32 (const int32_t * __a, int32x2x2_t __b, const int __c)
   return __rv.__i;
 }
 
+__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
+vld2_lane_f16 (const float16_t * __a, float16x4x2_t __b, const int __c)
+{
+  union { float16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  union { float16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_lanev4hf ((const __builtin_neon_hf *) __a,
+					   __bu.__o, __c);
+  return __rv.__i;
+}
+
 __extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
 vld2_lane_f32 (const float32_t * __a, float32x2x2_t __b, const int __c)
 {
@@ -9715,6 +9825,16 @@ vld2q_lane_s32 (const int32_t * __a, int32x4x2_t __b, const int __c)
   return __rv.__i;
 }
 
+__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__))
+vld2q_lane_f16 (const float16_t * __a, float16x8x2_t __b, const int __c)
+{
+  union { float16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  union { float16x8x2_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_lanev8hf ((const __builtin_neon_hf *) __a,
+					   __bu.__o, __c);
+  return __rv.__i;
+}
+
 __extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
 vld2q_lane_f32 (const float32_t * __a, float32x4x2_t __b, const int __c)
 {
@@ -9775,6 +9895,14 @@ vld2_dup_s32 (const int32_t * __a)
   return __rv.__i;
 }
 
+__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
+vld2_dup_f16 (const float16_t * __a)
+{
+  union { float16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_dupv4hf ((const __builtin_neon_hf *) __a);
+  return __rv.__i;
+}
+
 __extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
 vld2_dup_f32 (const float32_t * __a)
 {
@@ -9871,6 +9998,13 @@ vst2_s32 (int32_t * __a, int32x2x2_t __b)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_f16 (float16_t * __a, float16x4x2_t __b)
+{
+  union { float16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  __builtin_neon_vst2v4hf ((__builtin_neon_hf *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst2_f32 (float32_t * __a, float32x2x2_t __b)
 {
   union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
@@ -9957,6 +10091,13 @@ vst2q_s32 (int32_t * __a, int32x4x2_t __b)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_f16 (float16_t * __a, float16x8x2_t __b)
+{
+  union { float16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst2v8hf ((__builtin_neon_hf *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst2q_f32 (float32_t * __a, float32x4x2_t __b)
 {
   union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
@@ -10020,6 +10161,13 @@ vst2_lane_s32 (int32_t * __a, int32x2x2_t __b, const int __c)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_lane_f16 (float16_t * __a, float16x4x2_t __b, const int __c)
+{
+  union { float16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  __builtin_neon_vst2_lanev4hf ((__builtin_neon_hf *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst2_lane_f32 (float32_t * __a, float32x2x2_t __b, const int __c)
 {
   union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
@@ -10076,6 +10224,13 @@ vst2q_lane_s32 (int32_t * __a, int32x4x2_t __b, const int __c)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_lane_f16 (float16_t * __a, float16x8x2_t __b, const int __c)
+{
+  union { float16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst2_lanev8hf ((__builtin_neon_hf *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst2q_lane_f32 (float32_t * __a, float32x4x2_t __b, const int __c)
 {
   union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
@@ -10127,6 +10282,14 @@ vld3_s32 (const int32_t * __a)
   return __rv.__i;
 }
 
+__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__))
+vld3_f16 (const float16_t * __a)
+{
+  union { float16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3v4hf ((const __builtin_neon_hf *) __a);
+  return __rv.__i;
+}
+
 __extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
 vld3_f32 (const float32_t * __a)
 {
@@ -10225,6 +10388,14 @@ vld3q_s32 (const int32_t * __a)
   return __rv.__i;
 }
 
+__extension__ static __inline float16x8x3_t __attribute__ ((__always_inline__))
+vld3q_f16 (const float16_t * __a)
+{
+  union { float16x8x3_t __i; __builtin_neon_ci __o; } __rv;
+  __rv.__o = __builtin_neon_vld3v8hf ((const __builtin_neon_hf *) __a);
+  return __rv.__i;
+}
+
 __extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
 vld3q_f32 (const float32_t * __a)
 {
@@ -10300,6 +10471,16 @@ vld3_lane_s32 (const int32_t * __a, int32x2x3_t __b, const int __c)
   return __rv.__i;
 }
 
+__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__))
+vld3_lane_f16 (const float16_t * __a, float16x4x3_t __b, const int __c)
+{
+  union { float16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  union { float16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_lanev4hf ((const __builtin_neon_hf *) __a,
+					   __bu.__o, __c);
+  return __rv.__i;
+}
+
 __extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
 vld3_lane_f32 (const float32_t * __a, float32x2x3_t __b, const int __c)
 {
@@ -10372,6 +10553,16 @@ vld3q_lane_s32 (const int32_t * __a, int32x4x3_t __b, const int __c)
   return __rv.__i;
 }
 
+__extension__ static __inline float16x8x3_t __attribute__ ((__always_inline__))
+vld3q_lane_f16 (const float16_t * __a, float16x8x3_t __b, const int __c)
+{
+  union { float16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  union { float16x8x3_t __i; __builtin_neon_ci __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_lanev8hf ((const __builtin_neon_hf *) __a,
+					   __bu.__o, __c);
+  return __rv.__i;
+}
+
 __extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
 vld3q_lane_f32 (const float32_t * __a, float32x4x3_t __b, const int __c)
 {
@@ -10432,6 +10623,14 @@ vld3_dup_s32 (const int32_t * __a)
   return __rv.__i;
 }
 
+__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__))
+vld3_dup_f16 (const float16_t * __a)
+{
+  union { float16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_dupv4hf ((const __builtin_neon_hf *) __a);
+  return __rv.__i;
+}
+
 __extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
 vld3_dup_f32 (const float32_t * __a)
 {
@@ -10528,6 +10727,13 @@ vst3_s32 (int32_t * __a, int32x2x3_t __b)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_f16 (float16_t * __a, float16x4x3_t __b)
+{
+  union { float16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  __builtin_neon_vst3v4hf ((__builtin_neon_hf *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst3_f32 (float32_t * __a, float32x2x3_t __b)
 {
   union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
@@ -10614,6 +10820,13 @@ vst3q_s32 (int32_t * __a, int32x4x3_t __b)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_f16 (float16_t * __a, float16x8x3_t __b)
+{
+  union { float16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  __builtin_neon_vst3v8hf ((__builtin_neon_hf *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst3q_f32 (float32_t * __a, float32x4x3_t __b)
 {
   union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
@@ -10677,6 +10890,13 @@ vst3_lane_s32 (int32_t * __a, int32x2x3_t __b, const int __c)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_lane_f16 (float16_t * __a, float16x4x3_t __b, const int __c)
+{
+  union { float16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  __builtin_neon_vst3_lanev4hf ((__builtin_neon_hf *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst3_lane_f32 (float32_t * __a, float32x2x3_t __b, const int __c)
 {
   union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
@@ -10733,6 +10953,13 @@ vst3q_lane_s32 (int32_t * __a, int32x4x3_t __b, const int __c)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_lane_f16 (float16_t * __a, float16x8x3_t __b, const int __c)
+{
+  union { float16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  __builtin_neon_vst3_lanev8hf ((__builtin_neon_hf *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst3q_lane_f32 (float32_t * __a, float32x4x3_t __b, const int __c)
 {
   union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
@@ -10784,6 +11011,14 @@ vld4_s32 (const int32_t * __a)
   return __rv.__i;
 }
 
+__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__))
+vld4_f16 (const float16_t * __a)
+{
+  union { float16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4v4hf ((const __builtin_neon_hf *) __a);
+  return __rv.__i;
+}
+
 __extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
 vld4_f32 (const float32_t * __a)
 {
@@ -10882,6 +11117,14 @@ vld4q_s32 (const int32_t * __a)
   return __rv.__i;
 }
 
+__extension__ static __inline float16x8x4_t __attribute__ ((__always_inline__))
+vld4q_f16 (const float16_t * __a)
+{
+  union { float16x8x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4v8hf ((const __builtin_neon_hf *) __a);
+  return __rv.__i;
+}
+
 __extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
 vld4q_f32 (const float32_t * __a)
 {
@@ -10957,6 +11200,16 @@ vld4_lane_s32 (const int32_t * __a, int32x2x4_t __b, const int __c)
   return __rv.__i;
 }
 
+__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__))
+vld4_lane_f16 (const float16_t * __a, float16x4x4_t __b, const int __c)
+{
+  union { float16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  union { float16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_lanev4hf ((const __builtin_neon_hf *) __a,
+					   __bu.__o, __c);
+  return __rv.__i;
+}
+
 __extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
 vld4_lane_f32 (const float32_t * __a, float32x2x4_t __b, const int __c)
 {
@@ -11029,6 +11282,16 @@ vld4q_lane_s32 (const int32_t * __a, int32x4x4_t __b, const int __c)
   return __rv.__i;
 }
 
+__extension__ static __inline float16x8x4_t __attribute__ ((__always_inline__))
+vld4q_lane_f16 (const float16_t * __a, float16x8x4_t __b, const int __c)
+{
+  union { float16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  union { float16x8x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_lanev8hf ((const __builtin_neon_hf *) __a,
+					   __bu.__o, __c);
+  return __rv.__i;
+}
+
 __extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
 vld4q_lane_f32 (const float32_t * __a, float32x4x4_t __b, const int __c)
 {
@@ -11089,6 +11352,14 @@ vld4_dup_s32 (const int32_t * __a)
   return __rv.__i;
 }
 
+__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__))
+vld4_dup_f16 (const float16_t * __a)
+{
+  union { float16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_dupv4hf ((const __builtin_neon_hf *) __a);
+  return __rv.__i;
+}
+
 __extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
 vld4_dup_f32 (const float32_t * __a)
 {
@@ -11185,6 +11456,13 @@ vst4_s32 (int32_t * __a, int32x2x4_t __b)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_f16 (float16_t * __a, float16x4x4_t __b)
+{
+  union { float16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst4v4hf ((__builtin_neon_hf *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst4_f32 (float32_t * __a, float32x2x4_t __b)
 {
   union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
@@ -11271,6 +11549,13 @@ vst4q_s32 (int32_t * __a, int32x4x4_t __b)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_f16 (float16_t * __a, float16x8x4_t __b)
+{
+  union { float16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  __builtin_neon_vst4v8hf ((__builtin_neon_hf *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst4q_f32 (float32_t * __a, float32x4x4_t __b)
 {
   union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
@@ -11334,6 +11619,13 @@ vst4_lane_s32 (int32_t * __a, int32x2x4_t __b, const int __c)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_lane_f16 (float16_t * __a, float16x4x4_t __b, const int __c)
+{
+  union { float16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst4_lanev4hf ((__builtin_neon_hf *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst4_lane_f32 (float32_t * __a, float32x2x4_t __b, const int __c)
 {
   union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
@@ -11390,6 +11682,13 @@ vst4q_lane_s32 (int32_t * __a, int32x4x4_t __b, const int __c)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_lane_f16 (float16_t * __a, float16x8x4_t __b, const int __c)
+{
+  union { float16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  __builtin_neon_vst4_lanev8hf ((__builtin_neon_hf *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst4q_lane_f32 (float32_t * __a, float32x4x4_t __b, const int __c)
 {
   union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def
index f150b98..0b719df 100644
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -164,9 +164,9 @@ VAR10 (UNOP, vdup_n,
 	 v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
 VAR10 (GETLANE, vdup_lane,
 	 v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
-VAR5 (COMBINE, vcombine, v8qi, v4hi, v2si, v2sf, di)
-VAR5 (UNOP, vget_high, v16qi, v8hi, v4si, v4sf, v2di)
-VAR5 (UNOP, vget_low, v16qi, v8hi, v4si, v4sf, v2di)
+VAR6 (COMBINE, vcombine, v8qi, v4hi, v4hf, v2si, v2sf, di)
+VAR6 (UNOP, vget_high, v16qi, v8hi, v8hf, v4si, v4sf, v2di)
+VAR6 (UNOP, vget_low, v16qi, v8hi, v8hf, v4si, v4sf, v2di)
 VAR3 (UNOP, vmovn, v8hi, v4si, v2di)
 VAR3 (UNOP, vqmovns, v8hi, v4si, v2di)
 VAR3 (UNOP, vqmovnu, v8hi, v4si, v2di)
@@ -242,40 +242,40 @@ VAR6 (UNOP, vreinterpretv4si, v16qi, v8hi, v4si, v4sf, v2di, ti)
 VAR6 (UNOP, vreinterpretv4sf, v16qi, v8hi, v4si, v4sf, v2di, ti)
 VAR6 (UNOP, vreinterpretv2di, v16qi, v8hi, v4si, v4sf, v2di, ti)
 VAR6 (UNOP, vreinterpretti, v16qi, v8hi, v4si, v4sf, v2di, ti)
-VAR10 (LOAD1, vld1,
-        v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
+VAR12 (LOAD1, vld1,
+        v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v2di)
 VAR10 (LOAD1LANE, vld1_lane,
 	v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
 VAR10 (LOAD1, vld1_dup,
 	v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
-VAR10 (STORE1, vst1,
-	v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
-VAR10 (STORE1LANE, vst1_lane,
-	v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
-VAR9 (LOAD1, vld2,
-	v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf)
-VAR7 (LOAD1LANE, vld2_lane,
-	v8qi, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
-VAR5 (LOAD1, vld2_dup, v8qi, v4hi, v2si, v2sf, di)
-VAR9 (STORE1, vst2,
-	v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf)
-VAR7 (STORE1LANE, vst2_lane,
-	v8qi, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
-VAR9 (LOAD1, vld3,
-	v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf)
-VAR7 (LOAD1LANE, vld3_lane,
-	v8qi, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
-VAR5 (LOAD1, vld3_dup, v8qi, v4hi, v2si, v2sf, di)
-VAR9 (STORE1, vst3,
-	v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf)
-VAR7 (STORE1LANE, vst3_lane,
-	v8qi, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
-VAR9 (LOAD1, vld4,
-	v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf)
-VAR7 (LOAD1LANE, vld4_lane,
-	v8qi, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
-VAR5 (LOAD1, vld4_dup, v8qi, v4hi, v2si, v2sf, di)
-VAR9 (STORE1, vst4,
-	v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf)
-VAR7 (STORE1LANE, vst4_lane,
-	v8qi, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
+VAR12 (STORE1, vst1,
+	v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v2di)
+VAR12 (STORE1LANE, vst1_lane,
+	v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v2di)
+VAR11 (LOAD1, vld2,
+	v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf)
+VAR9 (LOAD1LANE, vld2_lane,
+	v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
+VAR6 (LOAD1, vld2_dup, v8qi, v4hi, v4hf, v2si, v2sf, di)
+VAR11 (STORE1, vst2,
+	v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf)
+VAR9 (STORE1LANE, vst2_lane,
+	v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
+VAR11 (LOAD1, vld3,
+	v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf)
+VAR9 (LOAD1LANE, vld3_lane,
+	v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
+VAR6 (LOAD1, vld3_dup, v8qi, v4hi, v4hf, v2si, v2sf, di)
+VAR11 (STORE1, vst3,
+	v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf)
+VAR9 (STORE1LANE, vst3_lane,
+	v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
+VAR11 (LOAD1, vld4,
+	v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf)
+VAR9 (LOAD1LANE, vld4_lane,
+	v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
+VAR6 (LOAD1, vld4_dup, v8qi, v4hi, v4hf, v2si, v2sf, di)
+VAR11 (STORE1, vst4,
+	v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf)
+VAR9 (STORE1LANE, vst4_lane,
+	v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 1e7f3f1..47cc1ee 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -65,20 +65,32 @@
 ;; Integer modes supported by Neon and IWMMXT, except V2DI
 (define_mode_iterator VINTW [V2SI V4HI V8QI V4SI V8HI V16QI])
 
-;; Double-width vector modes.
+;; Double-width vector modes, on which we support arithmetic (no HF!)
 (define_mode_iterator VD [V8QI V4HI V2SI V2SF])
 
+;; Double-width vector modes plus 64-bit elements for vreinterpret + vcreate.
+(define_mode_iterator VD_RE [V8QI V4HI V2SI V2SF DI])
+
 ;; Double-width vector modes plus 64-bit elements.
-(define_mode_iterator VDX [V8QI V4HI V2SI V2SF DI])
+(define_mode_iterator VDX [V8QI V4HI V4HF V2SI V2SF DI])
+
+;; Double-width vector modes, with V4HF - for vldN_lane and vstN_lane.
+(define_mode_iterator VD_LANE [V8QI V4HI V4HF V2SI V2SF])
 
 ;; Double-width vector modes without floating-point elements.
 (define_mode_iterator VDI [V8QI V4HI V2SI])
 
-;; Quad-width vector modes.
+;; Quad-width vector modes supporting arithmetic (no HF!).
 (define_mode_iterator VQ [V16QI V8HI V4SI V4SF])
 
+;; Quad-width vector modes, including V8HF.
+(define_mode_iterator VQ2 [V16QI V8HI V8HF V4SI V4SF])
+
+;; Quad-width vector modes with 16- or 32-bit elements
+(define_mode_iterator VQ_HS [V8HI V8HF V4SI V4SF])
+
 ;; Quad-width vector modes plus 64-bit elements.
-(define_mode_iterator VQX [V16QI V8HI V4SI V4SF V2DI])
+(define_mode_iterator VQX [V16QI V8HI V8HF V4SI V4SF V2DI])
 
 ;; Quad-width vector modes without floating-point elements.
 (define_mode_iterator VQI [V16QI V8HI V4SI])
@@ -111,7 +123,8 @@
 (define_mode_iterator VDQI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI])
 
 ;; Vector modes, including 64-bit integer elements.
-(define_mode_iterator VDQX [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF DI V2DI])
+(define_mode_iterator VDQX [V8QI V16QI V4HI V8HI V2SI V4SI
+			    V4HF V8HF V2SF V4SF DI V2DI])
 
 ;; Vector modes including 64-bit integer elements, but no floats.
 (define_mode_iterator VDQIX [V8QI V16QI V4HI V8HI V2SI V4SI DI V2DI])
@@ -366,7 +379,8 @@
 
 ;; Define element mode for each vector mode.
 (define_mode_attr V_elem [(V8QI "QI") (V16QI "QI")
-              (V4HI "HI") (V8HI "HI")
+			  (V4HI "HI") (V8HI "HI")
+			  (V4HF "HF") (V8HF "HF")
                           (V2SI "SI") (V4SI "SI")
                           (V2SF "SF") (V4SF "SF")
                           (DI "DI")   (V2DI "DI")])
@@ -383,6 +397,7 @@
 ;; size for structure lane/dup loads and stores.
 (define_mode_attr V_two_elem [(V8QI "HI")   (V16QI "HI")
                               (V4HI "SI")   (V8HI "SI")
+                              (V4HF "SF")   (V8HF "SF")
                               (V2SI "V2SI") (V4SI "V2SI")
                               (V2SF "V2SF") (V4SF "V2SF")
                               (DI "V2DI")   (V2DI "V2DI")])
@@ -390,6 +405,7 @@
 ;; Similar, for three elements.
 (define_mode_attr V_three_elem [(V8QI "BLK") (V16QI "BLK")
                                 (V4HI "BLK") (V8HI "BLK")
+                                (V4HF "BLK") (V8HF "BLK")
                                 (V2SI "BLK") (V4SI "BLK")
                                 (V2SF "BLK") (V4SF "BLK")
                                 (DI "EI")    (V2DI "EI")])
@@ -397,6 +413,7 @@
 ;; Similar, for four elements.
 (define_mode_attr V_four_elem [(V8QI "SI")   (V16QI "SI")
                                (V4HI "V4HI") (V8HI "V4HI")
+                               (V4HF "V4HF") (V8HF "V4HF")
                                (V2SI "V4SI") (V4SI "V4SI")
                                (V2SF "V4SF") (V4SF "V4SF")
                                (DI "OI")     (V2DI "OI")])
@@ -421,7 +438,8 @@
 
 ;; Modes with half the number of equal-sized elements.
 (define_mode_attr V_HALF [(V16QI "V8QI") (V8HI "V4HI")
-              (V4SI  "V2SI") (V4SF "V2SF") (V2DF "DF")
+			  (V8HF "V4HF") (V4SI  "V2SI")
+			  (V4SF "V2SF") (V2DF "DF")
                           (V2DI "DI")])
 
 ;; Same, but lower-case.
@@ -431,8 +449,9 @@
 
 ;; Modes with twice the number of equal-sized elements.
 (define_mode_attr V_DOUBLE [(V8QI "V16QI") (V4HI "V8HI")
-                (V2SI "V4SI") (V2SF "V4SF") (DF "V2DF")
-                            (DI "V2DI")])
+			    (V2SI "V4SI") (V4HF "V8HF")
+			    (V2SF "V4SF") (DF "V2DF")
+			    (DI "V2DI")])
 
 ;; Same, but lower-case.
 (define_mode_attr V_double [(V8QI "v16qi") (V4HI "v8hi")
@@ -454,8 +473,9 @@
 
 ;; Mode of result of comparison operations (and bit-select operand 1).
 (define_mode_attr V_cmp_result [(V8QI "V8QI") (V16QI "V16QI")
-                    (V4HI "V4HI") (V8HI  "V8HI")
+				(V4HI "V4HI") (V8HI  "V8HI")
                                 (V2SI "V2SI") (V4SI  "V4SI")
+				(V4HF "V4HI") (V8HF  "V8HI")
                                 (V2SF "V2SI") (V4SF  "V4SI")
                                 (DI   "DI")   (V2DI  "V2DI")])
 
@@ -492,12 +512,14 @@
 (define_mode_attr V_uf_sclr [(V8QI "u8")  (V16QI "u8")
                  (V4HI "u16") (V8HI "u16")
                              (V2SI "32") (V4SI "32")
+                             (V4HF "u16") (V8HF "u16")
                              (V2SF "32") (V4SF "32")])
 
 (define_mode_attr V_sz_elem [(V8QI "8")  (V16QI "8")
                  (V4HI "16") (V8HI  "16")
                              (V2SI "32") (V4SI  "32")
                              (DI   "64") (V2DI  "64")
+			     (V4HF "16") (V8HF "16")
                  (V2SF "32") (V4SF  "32")])
 
 (define_mode_attr V_elem_ch [(V8QI "b")  (V16QI "b")
@@ -564,6 +586,7 @@
                             (DI   "true") (V2DI  "false")])
 
 (define_mode_attr V_mode_nunits [(V8QI "8") (V16QI "16")
+				 (V4HF "4") (V8HF "8")
                                  (V4HI "4") (V8HI "8")
                                  (V2SI "2") (V4SI "4")
                                  (V2SF "2") (V4SF "4")
@@ -607,6 +630,7 @@
 (define_mode_attr q [(V8QI "") (V16QI "_q")
                      (V4HI "") (V8HI "_q")
                      (V2SI "") (V4SI "_q")
+		     (V4HF "") (V8HF "_q")
                      (V2SF "") (V4SF "_q")
                      (DI "")   (V2DI "_q")
                      (DF "")   (V2DF "_q")])
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 4af74ce..f8d6e74 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -320,11 +320,11 @@
   [(set_attr "type" "neon_load1_all_lanes<q>,neon_from_gp<q>")])
 
 (define_insn "vec_set<mode>_internal"
-  [(set (match_operand:VQ 0 "s_register_operand" "=w,w")
-        (vec_merge:VQ
-          (vec_duplicate:VQ
+  [(set (match_operand:VQ2 0 "s_register_operand" "=w,w")
+        (vec_merge:VQ2
+          (vec_duplicate:VQ2
             (match_operand:<V_elem> 1 "nonimmediate_operand" "Um,r"))
-          (match_operand:VQ 3 "s_register_operand" "0,0")
+          (match_operand:VQ2 3 "s_register_operand" "0,0")
           (match_operand:SI 2 "immediate_operand" "i,i")))]
   "TARGET_NEON"
 {
@@ -407,7 +407,7 @@
 (define_insn "vec_extract<mode>"
   [(set (match_operand:<V_elem> 0 "nonimmediate_operand" "=Um,r")
 	(vec_select:<V_elem>
-          (match_operand:VQ 1 "s_register_operand" "w,w")
+          (match_operand:VQ2 1 "s_register_operand" "w,w")
           (parallel [(match_operand:SI 2 "immediate_operand" "i,i")])))]
   "TARGET_NEON"
 {
@@ -2607,7 +2607,7 @@
   [(set (match_operand:SI 0 "s_register_operand" "=r")
 	(sign_extend:SI
 	  (vec_select:<V_elem>
-	    (match_operand:VQ 1 "s_register_operand" "w")
+	    (match_operand:VQ2 1 "s_register_operand" "w")
 	    (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))]
   "TARGET_NEON"
 {
@@ -2634,7 +2634,7 @@
   [(set (match_operand:SI 0 "s_register_operand" "=r")
 	(zero_extend:SI
 	  (vec_select:<V_elem>
-	    (match_operand:VQ 1 "s_register_operand" "w")
+	    (match_operand:VQ2 1 "s_register_operand" "w")
 	    (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))]
   "TARGET_NEON"
 {
@@ -2774,7 +2774,7 @@
 })
 
 (define_expand "neon_vcreate<mode>"
-  [(match_operand:VDX 0 "s_register_operand" "")
+  [(match_operand:VD_RE 0 "s_register_operand" "")
    (match_operand:DI 1 "general_operand" "")]
   "TARGET_NEON"
 {
@@ -4125,7 +4125,7 @@
 
 (define_expand "neon_vreinterpretv8qi<mode>"
   [(match_operand:V8QI 0 "s_register_operand" "")
-   (match_operand:VDX 1 "s_register_operand" "")]
+   (match_operand:VD_RE 1 "s_register_operand" "")]
   "TARGET_NEON"
 {
   neon_reinterpret (operands[0], operands[1]);
@@ -4134,7 +4134,7 @@
 
 (define_expand "neon_vreinterpretv4hi<mode>"
   [(match_operand:V4HI 0 "s_register_operand" "")
-   (match_operand:VDX 1 "s_register_operand" "")]
+   (match_operand:VD_RE 1 "s_register_operand" "")]
   "TARGET_NEON"
 {
   neon_reinterpret (operands[0], operands[1]);
@@ -4143,7 +4143,7 @@
 
 (define_expand "neon_vreinterpretv2si<mode>"
   [(match_operand:V2SI 0 "s_register_operand" "")
-   (match_operand:VDX 1 "s_register_operand" "")]
+   (match_operand:VD_RE 1 "s_register_operand" "")]
   "TARGET_NEON"
 {
   neon_reinterpret (operands[0], operands[1]);
@@ -4152,7 +4152,7 @@
 
 (define_expand "neon_vreinterpretv2sf<mode>"
   [(match_operand:V2SF 0 "s_register_operand" "")
-   (match_operand:VDX 1 "s_register_operand" "")]
+   (match_operand:VD_RE 1 "s_register_operand" "")]
   "TARGET_NEON"
 {
   neon_reinterpret (operands[0], operands[1]);
@@ -4161,7 +4161,7 @@
 
 (define_expand "neon_vreinterpretdi<mode>"
   [(match_operand:DI 0 "s_register_operand" "")
-   (match_operand:VDX 1 "s_register_operand" "")]
+   (match_operand:VD_RE 1 "s_register_operand" "")]
   "TARGET_NEON"
 {
   neon_reinterpret (operands[0], operands[1]);
@@ -4420,14 +4420,14 @@
 (define_expand "vec_load_lanesoi<mode>"
   [(set (match_operand:OI 0 "s_register_operand")
         (unspec:OI [(match_operand:OI 1 "neon_struct_operand")
-                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
 		   UNSPEC_VLD2))]
   "TARGET_NEON")
 
 (define_insn "neon_vld2<mode>"
   [(set (match_operand:OI 0 "s_register_operand" "=w")
         (unspec:OI [(match_operand:OI 1 "neon_struct_operand" "Um")
-                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VLD2))]
   "TARGET_NEON"
   "vld2.<V_sz_elem>\t%h0, %A1"
@@ -4438,7 +4438,7 @@
         (unspec:TI [(match_operand:<V_two_elem> 1 "neon_struct_operand" "Um")
                     (match_operand:TI 2 "s_register_operand" "0")
                     (match_operand:SI 3 "immediate_operand" "i")
-                    (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VD_LANE [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VLD2_LANE))]
   "TARGET_NEON"
 {
@@ -4463,7 +4463,7 @@
         (unspec:OI [(match_operand:<V_two_elem> 1 "neon_struct_operand" "Um")
                     (match_operand:OI 2 "s_register_operand" "0")
                     (match_operand:SI 3 "immediate_operand" "i")
-                    (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ_HS [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VLD2_LANE))]
   "TARGET_NEON"
 {
@@ -4534,14 +4534,14 @@
 (define_expand "vec_store_lanesoi<mode>"
   [(set (match_operand:OI 0 "neon_struct_operand")
 	(unspec:OI [(match_operand:OI 1 "s_register_operand")
-                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VST2))]
   "TARGET_NEON")
 
 (define_insn "neon_vst2<mode>"
   [(set (match_operand:OI 0 "neon_struct_operand" "=Um")
 	(unspec:OI [(match_operand:OI 1 "s_register_operand" "w")
-		    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+		    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
 		   UNSPEC_VST2))]
   "TARGET_NEON"
   "vst2.<V_sz_elem>\t%h1, %A0"
@@ -4553,7 +4553,7 @@
 	(unspec:<V_two_elem>
 	  [(match_operand:TI 1 "s_register_operand" "w")
 	   (match_operand:SI 2 "immediate_operand" "i")
-	   (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+	   (unspec:VD_LANE [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
 	  UNSPEC_VST2_LANE))]
   "TARGET_NEON"
 {
@@ -4578,7 +4578,7 @@
         (unspec:<V_two_elem>
            [(match_operand:OI 1 "s_register_operand" "w")
             (match_operand:SI 2 "immediate_operand" "i")
-            (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+            (unspec:VQ_HS [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
            UNSPEC_VST2_LANE))]
   "TARGET_NEON"
 {
@@ -4631,7 +4631,7 @@
 (define_expand "vec_load_lanesci<mode>"
   [(match_operand:CI 0 "s_register_operand")
    (match_operand:CI 1 "neon_struct_operand")
-   (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
   "TARGET_NEON"
 {
   emit_insn (gen_neon_vld3<mode> (operands[0], operands[1]));
@@ -4641,7 +4641,7 @@
 (define_expand "neon_vld3<mode>"
   [(match_operand:CI 0 "s_register_operand")
    (match_operand:CI 1 "neon_struct_operand")
-   (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
   "TARGET_NEON"
 {
   rtx mem;
@@ -4656,7 +4656,7 @@
 (define_insn "neon_vld3qa<mode>"
   [(set (match_operand:CI 0 "s_register_operand" "=w")
         (unspec:CI [(match_operand:EI 1 "neon_struct_operand" "Um")
-                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VLD3A))]
   "TARGET_NEON"
 {
@@ -4676,7 +4676,7 @@
   [(set (match_operand:CI 0 "s_register_operand" "=w")
         (unspec:CI [(match_operand:EI 1 "neon_struct_operand" "Um")
                     (match_operand:CI 2 "s_register_operand" "0")
-                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VLD3B))]
   "TARGET_NEON"
 {
@@ -4697,7 +4697,7 @@
         (unspec:EI [(match_operand:<V_three_elem> 1 "neon_struct_operand" "Um")
                     (match_operand:EI 2 "s_register_operand" "0")
                     (match_operand:SI 3 "immediate_operand" "i")
-                    (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VD_LANE [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VLD3_LANE))]
   "TARGET_NEON"
 {
@@ -4724,7 +4724,7 @@
         (unspec:CI [(match_operand:<V_three_elem> 1 "neon_struct_operand" "Um")
                     (match_operand:CI 2 "s_register_operand" "0")
                     (match_operand:SI 3 "immediate_operand" "i")
-                    (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ_HS [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VLD3_LANE))]
   "TARGET_NEON"
 {
@@ -4804,7 +4804,7 @@
 (define_expand "vec_store_lanesci<mode>"
   [(match_operand:CI 0 "neon_struct_operand")
    (match_operand:CI 1 "s_register_operand")
-   (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
   "TARGET_NEON"
 {
   emit_insn (gen_neon_vst3<mode> (operands[0], operands[1]));
@@ -4814,7 +4814,7 @@
 (define_expand "neon_vst3<mode>"
   [(match_operand:CI 0 "neon_struct_operand")
    (match_operand:CI 1 "s_register_operand")
-   (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
   "TARGET_NEON"
 {
   rtx mem;
@@ -4829,7 +4829,7 @@
 (define_insn "neon_vst3qa<mode>"
   [(set (match_operand:EI 0 "neon_struct_operand" "=Um")
         (unspec:EI [(match_operand:CI 1 "s_register_operand" "w")
-                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VST3A))]
   "TARGET_NEON"
 {
@@ -4848,7 +4848,7 @@
 (define_insn "neon_vst3qb<mode>"
   [(set (match_operand:EI 0 "neon_struct_operand" "=Um")
         (unspec:EI [(match_operand:CI 1 "s_register_operand" "w")
-                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VST3B))]
   "TARGET_NEON"
 {
@@ -4869,7 +4869,7 @@
         (unspec:<V_three_elem>
            [(match_operand:EI 1 "s_register_operand" "w")
             (match_operand:SI 2 "immediate_operand" "i")
-            (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+            (unspec:VD_LANE [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
            UNSPEC_VST3_LANE))]
   "TARGET_NEON"
 {
@@ -4896,7 +4896,7 @@
         (unspec:<V_three_elem>
            [(match_operand:CI 1 "s_register_operand" "w")
             (match_operand:SI 2 "immediate_operand" "i")
-            (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+            (unspec:VQ_HS [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
            UNSPEC_VST3_LANE))]
   "TARGET_NEON"
 {
@@ -4951,7 +4951,7 @@
 (define_expand "vec_load_lanesxi<mode>"
   [(match_operand:XI 0 "s_register_operand")
    (match_operand:XI 1 "neon_struct_operand")
-   (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
   "TARGET_NEON"
 {
   emit_insn (gen_neon_vld4<mode> (operands[0], operands[1]));
@@ -4961,7 +4961,7 @@
 (define_expand "neon_vld4<mode>"
   [(match_operand:XI 0 "s_register_operand")
    (match_operand:XI 1 "neon_struct_operand")
-   (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
   "TARGET_NEON"
 {
   rtx mem;
@@ -4976,7 +4976,7 @@
 (define_insn "neon_vld4qa<mode>"
   [(set (match_operand:XI 0 "s_register_operand" "=w")
         (unspec:XI [(match_operand:OI 1 "neon_struct_operand" "Um")
-                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VLD4A))]
   "TARGET_NEON"
 {
@@ -4997,7 +4997,7 @@
   [(set (match_operand:XI 0 "s_register_operand" "=w")
         (unspec:XI [(match_operand:OI 1 "neon_struct_operand" "Um")
                     (match_operand:XI 2 "s_register_operand" "0")
-                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VLD4B))]
   "TARGET_NEON"
 {
@@ -5019,7 +5019,7 @@
         (unspec:OI [(match_operand:<V_four_elem> 1 "neon_struct_operand" "Um")
                     (match_operand:OI 2 "s_register_operand" "0")
                     (match_operand:SI 3 "immediate_operand" "i")
-                    (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VD_LANE [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VLD4_LANE))]
   "TARGET_NEON"
 {
@@ -5047,7 +5047,7 @@
         (unspec:XI [(match_operand:<V_four_elem> 1 "neon_struct_operand" "Um")
                     (match_operand:XI 2 "s_register_operand" "0")
                     (match_operand:SI 3 "immediate_operand" "i")
-                    (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ_HS [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VLD4_LANE))]
   "TARGET_NEON"
 {
@@ -5132,7 +5132,7 @@
 (define_expand "vec_store_lanesxi<mode>"
   [(match_operand:XI 0 "neon_struct_operand")
    (match_operand:XI 1 "s_register_operand")
-   (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
   "TARGET_NEON"
 {
   emit_insn (gen_neon_vst4<mode> (operands[0], operands[1]));
@@ -5142,7 +5142,7 @@
 (define_expand "neon_vst4<mode>"
   [(match_operand:XI 0 "neon_struct_operand")
    (match_operand:XI 1 "s_register_operand")
-   (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
   "TARGET_NEON"
 {
   rtx mem;
@@ -5157,7 +5157,7 @@
 (define_insn "neon_vst4qa<mode>"
   [(set (match_operand:OI 0 "neon_struct_operand" "=Um")
         (unspec:OI [(match_operand:XI 1 "s_register_operand" "w")
-                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VST4A))]
   "TARGET_NEON"
 {
@@ -5177,7 +5177,7 @@
 (define_insn "neon_vst4qb<mode>"
   [(set (match_operand:OI 0 "neon_struct_operand" "=Um")
         (unspec:OI [(match_operand:XI 1 "s_register_operand" "w")
-                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VST4B))]
   "TARGET_NEON"
 {
@@ -5199,7 +5199,7 @@
         (unspec:<V_four_elem>
            [(match_operand:OI 1 "s_register_operand" "w")
             (match_operand:SI 2 "immediate_operand" "i")
-            (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+            (unspec:VD_LANE [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
            UNSPEC_VST4_LANE))]
   "TARGET_NEON"
 {
@@ -5227,7 +5227,7 @@
         (unspec:<V_four_elem>
            [(match_operand:XI 1 "s_register_operand" "w")
             (match_operand:SI 2 "immediate_operand" "i")
-            (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+            (unspec:VQ_HS [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
            UNSPEC_VST4_LANE))]
   "TARGET_NEON"
 {

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 7/16][AArch64] Add basic fp16 support
  2015-07-07 12:32 [PATCH 0/16][ARM/AArch64] Float16_t support, v2 Alan Lawrence
                   ` (3 preceding siblings ...)
  2015-07-07 12:34 ` [PATCH 1/16][ARM] PR/63870 Add qualifier to check lane bounds in expand Alan Lawrence
@ 2015-07-07 12:35 ` Alan Lawrence
  2015-07-07 12:35 ` [PATCH 5/16][ARM] Add float16x8_t intrinsics Alan Lawrence
                   ` (10 subsequent siblings)
  15 siblings, 0 replies; 35+ messages in thread
From: Alan Lawrence @ 2015-07-07 12:35 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1127 bytes --]

Same as https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01340.html except that two 
of the tests have been moved into the next patch. (The remaining test is AArch64 
only.)
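
For context, a minimal sketch of what this enables at the source level
(hypothetical user code, not from the patch): __fp16 is storage-only, so
arithmetic promotes to float via TARGET_PROMOTED_TYPE, with the new fcvt
patterns doing the widening and narrowing; the type itself mangles to "Dh".

__fp16
scale (__fp16 x)
{
  /* x is widened to SFmode, the multiply happens in float, and the
     result is truncated back to HFmode for the return value.  */
  return x * 2.0f;
}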

gcc/ChangeLog:

	* config/aarch64/aarch64-builtins.c (aarch64_fp16_type_node): New.
	(aarch64_init_builtins): Make aarch64_fp16_type_node, use for __fp16.

	* config/aarch64/aarch64-modes.def: Add HFmode.

	* config/aarch64/aarch64.h (TARGET_CPU_CPP_BUILTINS): Define
	__ARM_FP16_FORMAT_IEEE and __ARM_FP16_ARGS. Set bit 1 of __ARM_FP.

	* config/aarch64/aarch64.c (aarch64_init_libfuncs,
	aarch64_promoted_type): New.

	(aarch64_float_const_representable_p): Disable HFmode.
	(aarch64_mangle_type): Mangle half-precision floats to "Dh".
	(TARGET_PROMOTED_TYPE): Define to aarch64_promoted_type.
	(TARGET_INIT_LIBFUNCS): Define to aarch64_init_libfuncs.

	* config/aarch64/aarch64.md (mov<mode>): Include HFmode using GPF_F16.
	(movhf_aarch64, extendhfsf2, extendhfdf2, truncsfhf2, truncdfhf2): New.

	* config/aarch64/iterators.md (GPF_F16): New.


gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/f16_movs_1.c: New test.


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 07_aarch64_basic_fp16.patch --]
[-- Type: text/x-patch; name=07_aarch64_basic_fp16.patch, Size: 10205 bytes --]

commit 989af1492bbf268be1ecfae06f3303b90ae514c8
Author: Alan Lawrence <alan.lawrence@arm.com>
Date:   Tue Dec 2 12:57:39 2014 +0000

    AArch64 1/6: Basic HFmode support (fewer tests), aarch64_fp16_type_node, patterns, mangling, predefines.
    
    No --fp16-format option.
    
    Disable constants as NYI.

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index ec60955..cfb2dc1 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -439,6 +439,9 @@ static struct aarch64_simd_type_info aarch64_simd_types [] = {
 };
 #undef ENTRY
 
+/* This type is not SIMD-specific; it is the user-visible __fp16.  */
+static tree aarch64_fp16_type_node = NULL_TREE;
+
 static tree aarch64_simd_intOI_type_node = NULL_TREE;
 static tree aarch64_simd_intEI_type_node = NULL_TREE;
 static tree aarch64_simd_intCI_type_node = NULL_TREE;
@@ -849,6 +852,12 @@ aarch64_init_builtins (void)
     = add_builtin_function ("__builtin_aarch64_set_fpsr", ftype_set_fpr,
 			    AARCH64_BUILTIN_SET_FPSR, BUILT_IN_MD, NULL, NULL_TREE);
 
+  aarch64_fp16_type_node = make_node (REAL_TYPE);
+  TYPE_PRECISION (aarch64_fp16_type_node) = 16;
+  layout_type (aarch64_fp16_type_node);
+
+  (*lang_hooks.types.register_builtin_type) (aarch64_fp16_type_node, "__fp16");
+
   if (TARGET_SIMD)
     aarch64_init_simd_builtins ();
   if (TARGET_CRC32)
diff --git a/gcc/config/aarch64/aarch64-modes.def b/gcc/config/aarch64/aarch64-modes.def
index b17b90d..c30059b 100644
--- a/gcc/config/aarch64/aarch64-modes.def
+++ b/gcc/config/aarch64/aarch64-modes.def
@@ -36,6 +36,10 @@ CC_MODE (CC_DLTU);
 CC_MODE (CC_DGEU);
 CC_MODE (CC_DGTU);
 
+/* Half-precision floating point for arm_neon.h float16_t.  */
+FLOAT_MODE (HF, 2, 0);
+ADJUST_FLOAT_FORMAT (HF, &ieee_half_format);
+
 /* Vector modes.  */
 VECTOR_MODES (INT, 8);        /*       V8QI V4HI V2SI.  */
 VECTOR_MODES (INT, 16);       /* V16QI V8HI V4SI V2DI.  */
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 17bae08..f338033 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -8339,6 +8339,10 @@ aarch64_mangle_type (const_tree type)
   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
     return "St9__va_list";
 
+  /* Half-precision float.  */
+  if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
+    return "Dh";
+
   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
      builtin types.  */
   if (TYPE_NAME (type) != NULL)
@@ -9578,6 +9582,33 @@ aarch64_start_file (void)
   default_file_start();
 }
 
+static void
+aarch64_init_libfuncs (void)
+{
+  /* Half-precision float operations.  The compiler handles all operations
+     with NULL libfuncs by converting to SFmode.  */
+
+  /* Conversions.  */
+  set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
+  set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
+
+  /* Arithmetic.  */
+  set_optab_libfunc (add_optab, HFmode, NULL);
+  set_optab_libfunc (sdiv_optab, HFmode, NULL);
+  set_optab_libfunc (smul_optab, HFmode, NULL);
+  set_optab_libfunc (neg_optab, HFmode, NULL);
+  set_optab_libfunc (sub_optab, HFmode, NULL);
+
+  /* Comparisons.  */
+  set_optab_libfunc (eq_optab, HFmode, NULL);
+  set_optab_libfunc (ne_optab, HFmode, NULL);
+  set_optab_libfunc (lt_optab, HFmode, NULL);
+  set_optab_libfunc (le_optab, HFmode, NULL);
+  set_optab_libfunc (ge_optab, HFmode, NULL);
+  set_optab_libfunc (gt_optab, HFmode, NULL);
+  set_optab_libfunc (unord_optab, HFmode, NULL);
+}
+
 /* Target hook for c_mode_for_suffix.  */
 static machine_mode
 aarch64_c_mode_for_suffix (char suffix)
@@ -9616,7 +9647,8 @@ aarch64_float_const_representable_p (rtx x)
   if (!CONST_DOUBLE_P (x))
     return false;
 
-  if (GET_MODE (x) == VOIDmode)
+  /* We don't support HFmode constants yet.  */
+  if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
     return false;
 
   REAL_VALUE_FROM_CONST_DOUBLE (r, x);
@@ -11551,6 +11583,14 @@ aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
   return true;
 }
 
+/* Implement TARGET_PROMOTED_TYPE to promote float16 to 32 bits.  */
+static tree
+aarch64_promoted_type (const_tree t)
+{
+  if (SCALAR_FLOAT_TYPE_P (t) && TYPE_PRECISION (t) == 16)
+    return float_type_node;
+  return NULL_TREE;
+}
 #undef TARGET_ADDRESS_COST
 #define TARGET_ADDRESS_COST aarch64_address_cost
 
@@ -11705,6 +11745,9 @@ aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
 #undef TARGET_SCHED_REASSOCIATION_WIDTH
 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
 
+#undef TARGET_PROMOTED_TYPE
+#define TARGET_PROMOTED_TYPE aarch64_promoted_type
+
 #undef TARGET_SECONDARY_RELOAD
 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
 
@@ -11797,6 +11840,8 @@ aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
   aarch64_vectorize_vec_perm_const_ok
 
+#undef TARGET_INIT_LIBFUNCS
+#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
 
 #undef TARGET_FIXED_CONDITION_CODE_REGS
 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index a22c6e4..44fe4f9 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -57,7 +57,9 @@
       if (TARGET_FLOAT)                                         \
         {                                                       \
           builtin_define ("__ARM_FEATURE_FMA");                 \
-          builtin_define_with_int_value ("__ARM_FP", 0x0C);     \
+	  builtin_define_with_int_value ("__ARM_FP", 0x0E);     \
+	  builtin_define ("__ARM_FP16_FORMAT_IEEE");		\
+	  builtin_define ("__ARM_FP16_ARGS");			\
         }                                                       \
       if (TARGET_SIMD)                                          \
         {                                                       \
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 1efe57c..6eafa2c 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -976,8 +976,8 @@
 })
 
 (define_expand "mov<mode>"
-  [(set (match_operand:GPF 0 "nonimmediate_operand" "")
-	(match_operand:GPF 1 "general_operand" ""))]
+  [(set (match_operand:GPF_F16 0 "nonimmediate_operand" "")
+	(match_operand:GPF_F16 1 "general_operand" ""))]
   ""
   "
     if (!TARGET_FLOAT)
@@ -991,6 +991,26 @@
   "
 )
 
+(define_insn "*movhf_aarch64"
+  [(set (match_operand:HF 0 "nonimmediate_operand" "=w, ?r,w,w,m,r,m ,r")
+	(match_operand:HF 1 "general_operand"      "?rY, w,w,m,w,m,rY,r"))]
+  "TARGET_FLOAT && (register_operand (operands[0], HFmode)
+    || register_operand (operands[1], HFmode))"
+  "@
+   mov\\t%0.h[0], %w1
+   umov\\t%w0, %1.h[0]
+   mov\\t%0.h[0], %1.h[0]
+   ldr\\t%h0, %1
+   str\\t%h1, %0
+   ldrh\\t%w0, %1
+   strh\\t%w1, %0
+   mov\\t%w0, %w1"
+  [(set_attr "type" "neon_from_gp,neon_to_gp,fmov,\
+                     f_loads,f_stores,load1,store1,mov_reg")
+   (set_attr "simd" "yes,yes,yes,*,*,*,*,*")
+   (set_attr "fp"   "*,*,*,yes,yes,*,*,*")]
+)
+
 (define_insn "*movsf_aarch64"
   [(set (match_operand:SF 0 "nonimmediate_operand" "=w, ?r,w,w  ,w,m,r,m ,r")
 	(match_operand:SF 1 "general_operand"      "?rY, w,w,Ufc,m,w,m,rY,r"))]
@@ -4088,6 +4108,22 @@
   [(set_attr "type" "f_cvt")]
 )
 
+(define_insn "extendhfsf2"
+  [(set (match_operand:SF 0 "register_operand" "=w")
+        (float_extend:SF (match_operand:HF 1 "register_operand" "w")))]
+  "TARGET_FLOAT"
+  "fcvt\\t%s0, %h1"
+  [(set_attr "type" "f_cvt")]
+)
+
+(define_insn "extendhfdf2"
+  [(set (match_operand:DF 0 "register_operand" "=w")
+        (float_extend:DF (match_operand:HF 1 "register_operand" "w")))]
+  "TARGET_FLOAT"
+  "fcvt\\t%d0, %h1"
+  [(set_attr "type" "f_cvt")]
+)
+
 (define_insn "truncdfsf2"
   [(set (match_operand:SF 0 "register_operand" "=w")
         (float_truncate:SF (match_operand:DF 1 "register_operand" "w")))]
@@ -4096,6 +4132,22 @@
   [(set_attr "type" "f_cvt")]
 )
 
+(define_insn "truncsfhf2"
+  [(set (match_operand:HF 0 "register_operand" "=w")
+        (float_truncate:HF (match_operand:SF 1 "register_operand" "w")))]
+  "TARGET_FLOAT"
+  "fcvt\\t%h0, %s1"
+  [(set_attr "type" "f_cvt")]
+)
+
+(define_insn "truncdfhf2"
+  [(set (match_operand:HF 0 "register_operand" "=w")
+        (float_truncate:HF (match_operand:DF 1 "register_operand" "w")))]
+  "TARGET_FLOAT"
+  "fcvt\\t%h0, %d1"
+  [(set_attr "type" "f_cvt")]
+)
+
 (define_insn "fix_trunc<GPF:mode><GPI:mode>2"
   [(set (match_operand:GPI 0 "register_operand" "=r")
         (fix:GPI (match_operand:GPF 1 "register_operand" "w")))]
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 498358a..a6b351b 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -38,6 +38,9 @@
 ;; Iterator for General Purpose Floating-point registers (32- and 64-bit modes)
 (define_mode_iterator GPF [SF DF])
 
+;; Iterator for General Purpose Float regs, inc float16_t.
+(define_mode_iterator GPF_F16 [HF SF DF])
+
 ;; Integer vector modes.
 (define_mode_iterator VDQ_I [V8QI V16QI V4HI V8HI V2SI V4SI V2DI])
 
diff --git a/gcc/testsuite/gcc.target/aarch64/f16_movs_1.c b/gcc/testsuite/gcc.target/aarch64/f16_movs_1.c
new file mode 100644
index 0000000..6cb8086
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/f16_movs_1.c
@@ -0,0 +1,26 @@
+/* { dg-do run } */
+/* { dg-options "-fno-inline -O2" } */
+
+#include <arm_neon.h>
+
+__fp16
+func2 (__fp16 a, __fp16 b)
+{
+  return b;
+}
+
+int
+main (int argc, char **argv)
+{
+  __fp16 array[16];
+  int i;
+
+  for (i = 0; i < sizeof (array) / sizeof (array[0]); i++)
+    array[i] = i;
+
+  array[0] = func2 (array[1], array[2]);
+
+  __builtin_printf ("%f\n", array[0]); /* { dg-output "2.0" } */
+
+  return 0;
+}

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 11/16][AArch64] Implement vcvt_{,high_}f16_f32
  2015-07-07 12:32 [PATCH 0/16][ARM/AArch64] Float16_t support, v2 Alan Lawrence
                   ` (9 preceding siblings ...)
  2015-07-07 12:36 ` [PATCH 8/16][ARM/AArch64 Testsuite] Add basic fp16 tests Alan Lawrence
@ 2015-07-07 12:36 ` Alan Lawrence
  2015-07-07 12:37 ` [PATCH 12/16][AArch64] vreinterpret(q?), vget_(low|high), vld1(q?)_dup Alan Lawrence
                   ` (4 subsequent siblings)
  15 siblings, 0 replies; 35+ messages in thread
From: Alan Lawrence @ 2015-07-07 12:36 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 768 bytes --]

This comes from https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01343.html, but the
other, unrelated intrinsics have been moved into the next patch.

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (aarch64_float_truncate_lo_v2sf):
	Reparameterize to...
	(aarch64_float_truncate_lo_<mode>): ...this, for both V2SF and V4HF.
	(aarch64_float_truncate_hi_v4sf): Reparameterize to...
	(aarch64_float_truncate_hi_<Vdbl>): ...this, for both V4SF and V8HF.

	* config/aarch64/aarch64-simd-builtins.def (float_truncate_hi_): Add
	v8hf variant.
	(float_truncate_lo_): Use BUILTIN_VDF iterator.

	* config/aarch64/arm_neon.h (vcvt_f16_f32, vcvt_high_f16_f32): New.

	* config/aarch64/iterators.md (VDF, Vdtype): New.
	(VWIDE, Vmwtype): Add cases for V4HF and V2SF.
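
An illustrative use of the two new intrinsics (hypothetical user code,
not part of the patch), narrowing eight single-precision lanes into one
float16x8_t via the fcvtn/fcvtn2 pair:

#include <arm_neon.h>

float16x8_t
narrow8 (float32x4_t lo, float32x4_t hi)
{
  float16x4_t lo_h = vcvt_f16_f32 (lo);   /* fcvtn: low half.  */
  return vcvt_high_f16_f32 (lo_h, hi);    /* fcvtn2: high half.  */
}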

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 11_aarch64_vcvt_f16_f32.patch --]
[-- Type: text/x-patch; name=11_aarch64_vcvt_f16_f32.patch, Size: 4728 bytes --]

commit 5007fafedc8469ab645edfe65fbf41f75fc74750
Author: Alan Lawrence <alan.lawrence@arm.com>
Date:   Tue Dec 2 18:30:05 2014 +0000

    AArch64 4/N v2: float_truncate_lo/hi

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 4dd2bc7..8bcab72 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -363,9 +363,10 @@
 
   VAR1 (UNOP, vec_unpacks_hi_, 10, v4sf)
   VAR1 (BINOP, float_truncate_hi_, 0, v4sf)
+  VAR1 (BINOP, float_truncate_hi_, 0, v8hf)
 
   VAR1 (UNOP, float_extend_lo_, 0, v2df)
-  VAR1 (UNOP, float_truncate_lo_, 0, v2sf)
+  BUILTIN_VDF (UNOP, float_truncate_lo_, 0)
 
   /* Implemented by aarch64_ld1<VALL_F16:mode>.  */
   BUILTIN_VALL_F16 (LOAD1, ld1, 0)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 5cc45ed..2dc54e1 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1726,23 +1726,23 @@
 
 ;; Float narrowing operations.
 
-(define_insn "aarch64_float_truncate_lo_v2sf"
-  [(set (match_operand:V2SF 0 "register_operand" "=w")
-      (float_truncate:V2SF
-	(match_operand:V2DF 1 "register_operand" "w")))]
+(define_insn "aarch64_float_truncate_lo_<mode>"
+  [(set (match_operand:VDF 0 "register_operand" "=w")
+      (float_truncate:VDF
+	(match_operand:<VWIDE> 1 "register_operand" "w")))]
   "TARGET_SIMD"
-  "fcvtn\\t%0.2s, %1.2d"
+  "fcvtn\\t%0.<Vtype>, %1<Vmwtype>"
   [(set_attr "type" "neon_fp_cvt_narrow_d_q")]
 )
 
-(define_insn "aarch64_float_truncate_hi_v4sf"
-  [(set (match_operand:V4SF 0 "register_operand" "=w")
-    (vec_concat:V4SF
-      (match_operand:V2SF 1 "register_operand" "0")
-      (float_truncate:V2SF
-	(match_operand:V2DF 2 "register_operand" "w"))))]
+(define_insn "aarch64_float_truncate_hi_<Vdbl>"
+  [(set (match_operand:<VDBL> 0 "register_operand" "=w")
+    (vec_concat:<VDBL>
+      (match_operand:VDF 1 "register_operand" "0")
+      (float_truncate:VDF
+	(match_operand:<VWIDE> 2 "register_operand" "w"))))]
   "TARGET_SIMD"
-  "fcvtn2\\t%0.4s, %2.2d"
+  "fcvtn2\\t%0.<Vdtype>, %2<Vmwtype>"
   [(set_attr "type" "neon_fp_cvt_narrow_d_q")]
 )
 
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index d61e619..b915754 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -5726,12 +5726,8 @@ vaddlvq_u32 (uint32x4_t a)
        result;                                                          \
      })
 
-/* vcvt_f16_f32 not supported */
-
 /* vcvt_f32_f16 not supported */
 
-/* vcvt_high_f16_f32 not supported */
-
 /* vcvt_high_f32_f16 not supported */
 
 #define vcvt_n_f32_s32(a, b)                                            \
@@ -13098,6 +13094,18 @@ vcntq_u8 (uint8x16_t __a)
 
 /* vcvt (double -> float).  */
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vcvt_f16_f32 (float32x4_t __a)
+{
+  return __builtin_aarch64_float_truncate_lo_v4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vcvt_high_f16_f32 (float16x4_t __a, float32x4_t __b)
+{
+  return __builtin_aarch64_float_truncate_hi_v8hf (__a, __b);
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vcvt_f32_f64 (float64x2_t __a)
 {
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 96920cf..f6094b1 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -41,6 +41,9 @@
 ;; Iterator for General Purpose Float regs, inc float16_t.
 (define_mode_iterator GPF_F16 [HF SF DF])
 
+;; Double vector modes.
+(define_mode_iterator VDF [V2SF V4HF])
+
 ;; Integer vector modes.
 (define_mode_iterator VDQ_I [V8QI V16QI V4HI V8HI V2SI V4SI V2DI])
 
@@ -452,6 +455,9 @@
 			(SI   "V2SI")  (DI   "V2DI")
 			(DF   "V2DF")])
 
+;; Register suffix for double-length mode.
+(define_mode_attr Vdtype [(V4HF "8h") (V2SF "4s")])
+
 ;; Double modes of vector modes (lower case).
 (define_mode_attr Vdbl [(V8QI "v16qi") (V4HI "v8hi")
 			(V4HF "v8hf")
@@ -485,7 +491,8 @@
 (define_mode_attr VWIDE [(V8QI "V8HI") (V4HI "V4SI")
 			 (V2SI "V2DI") (V16QI "V8HI") 
 			 (V8HI "V4SI") (V4SI "V2DI")
-			 (HI "SI")     (SI "DI")]
+			 (HI "SI")     (SI "DI")
+			 (V4HF "V4SF") (V2SF "V2DF")]
 
 )
 
@@ -498,6 +505,7 @@
 (define_mode_attr Vmwtype [(V8QI ".8h") (V4HI ".4s")
 			   (V2SI ".2d") (V16QI ".8h") 
 			   (V8HI ".4s") (V4SI ".2d")
+			   (V4HF ".4s") (V2SF ".2d")
 			   (SI   "")    (HI   "")])
 
 ;; Lower part register suffixes for VQW.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 8/16][ARM/AArch64 Testsuite] Add basic fp16 tests
  2015-07-07 12:32 [PATCH 0/16][ARM/AArch64] Float16_t support, v2 Alan Lawrence
                   ` (8 preceding siblings ...)
  2015-07-07 12:36 ` [PATCH 10/16][AArch64] vld{2,3,4}{,_lane,_dup},vcombine,vcreate Alan Lawrence
@ 2015-07-07 12:36 ` Alan Lawrence
  2015-07-07 12:36 ` [PATCH 11/16][AArch64] Implement vcvt_{,high_}f16_f32 Alan Lawrence
                   ` (5 subsequent siblings)
  15 siblings, 0 replies; 35+ messages in thread
From: Alan Lawrence @ 2015-07-07 12:36 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 414 bytes --]

These were originally part of 
https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01340.html, but I have moved them 
into their own subdirectory and adapted them to run on ARM as well (as per 
https://gcc.gnu.org/ml/gcc-patches/2015-05/msg00656.html).

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/fp16/fp16.exp: New.
	* gcc.target/aarch64/fp16/f16_convs_1.c: New.
	* gcc.target/aarch64/fp16/f16_convs_2.c: New.

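The property both tests rely on is that __fp16 is a storage-only format:
arithmetic on __fp16 operands is performed in float, so the only difference
between the two computed sums is one extra float -> __fp16 -> float rounding
step. A standalone sketch of that semantics (editor's illustration, not part
of the patch):

/* Build with -mfp16-format=ieee on ARM, as the tests do.  */
extern void abort (void);

int
main (void)
{
  __fp16 a = 3.14159f, b = 2.718f;
  float rounded = (__fp16) ((float) a + (float) b); /* f32 add, f16 round */
  float exact = (float) a + (float) b;              /* f32 add only */
  if (__builtin_fabs (rounded - exact) > 0.0001)
    abort ();
  return 0;
}
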
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 08_fp16_tests.patch --]
[-- Type: text/x-patch; name=08_fp16_tests.patch, Size: 4278 bytes --]

commit bc5045c0d3dd34b8cb94910281384f9ab9880325
Author: Alan Lawrence <alan.lawrence@arm.com>
Date:   Thu May 7 10:08:12 2015 +0100

    (ARM+AArch64) Add gcc.target/aarch64/fp16, f16_convs_[12].c tests

diff --git a/gcc/testsuite/gcc.target/aarch64/fp16/f16_convs_1.c b/gcc/testsuite/gcc.target/aarch64/fp16/f16_convs_1.c
new file mode 100644
index 0000000..a1c95fd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fp16/f16_convs_1.c
@@ -0,0 +1,34 @@
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+/* { dg-additional-options "-mfp16-format=ieee" {target "arm*-*-*"} } */
+
+extern void abort (void);
+
+#define EPSILON 0.0001
+
+int
+main (int argc, char **argv)
+{
+  float f1 = 3.14159f;
+  float f2 = 2.718f;
+  /* This 'assembler' statement should be portable between ARM and AArch64.  */
+  asm volatile ("" : : : "memory");
+  __fp16 in1 = f1;
+  __fp16 in2 = f2;
+
+  /* Do the addition on __fp16's (implicitly converts both operands to
+     float32, adds, converts back to f16, then we convert back to f32).  */
+  __fp16 res1 = in1 + in2;
+  asm volatile ("" : : : "memory");
+  float f_res_1 = res1;
+
+  /* Do the addition on float32's (we convert both operands to f32, and add,
+     as above, but skip the final conversion f32 -> f16 -> f32).  */
+  float f1a = in1;
+  float f2a = in2;
+  float f_res_2 = f1a + f2a;
+
+  if (__builtin_fabs (f_res_2 - f_res_1) > EPSILON)
+    abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fp16/f16_convs_2.c b/gcc/testsuite/gcc.target/aarch64/fp16/f16_convs_2.c
new file mode 100644
index 0000000..6aa3e59
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fp16/f16_convs_2.c
@@ -0,0 +1,33 @@
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+/* { dg-additional-options "-mfp16-format=ieee" {target "arm*-*-*"} } */
+
+extern void abort (void);
+
+#define EPSILON 0.0001
+
+int
+main (int argc, char **argv)
+{
+  int i1 = 3;
+  int i2 = 2;
+  /*  This 'assembler' should be portable across ARM and AArch64.  */
+  asm volatile ("" : : : "memory");
+
+  __fp16 in1 = i1;
+  __fp16 in2 = i2;
+
+  /* Do the addition on __fp16's (implicitly converts both operands to
+     float32, adds, converts back to f16, then we convert to int).  */
+  __fp16 res1 = in1 + in2;
+  asm volatile ("" : : : "memory");
+  int result1 = res1;
+
+  /* Do the addition on int's (we convert both operands directly to int, add,
+     and we're done).  */
+  int result2 = ((int) in1) + ((int) in2);
+
+  if (__builtin_abs (result2 - result1) > EPSILON)
+    abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fp16/fp16.exp b/gcc/testsuite/gcc.target/aarch64/fp16/fp16.exp
new file mode 100644
index 0000000..7dc8d65
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fp16/fp16.exp
@@ -0,0 +1,43 @@
+# Tests of 16-bit floating point (__fp16), for both ARM and AArch64.
+# Copyright (C) 2015 Free Software Foundation, Inc.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GCC; see the file COPYING3.  If not see
+# <http://www.gnu.org/licenses/>.
+
+# GCC testsuite that uses the `dg.exp' driver.
+
+# Exit immediately if this isn't an ARM or AArch64 target.
+if {![istarget arm*-*-*]
+    && ![istarget aarch64*-*-*]} then {
+  return
+}
+
+# Load support procs.
+load_lib gcc-dg.exp
+
+# If a testcase doesn't have special options, use these.
+global DEFAULT_CFLAGS
+if ![info exists DEFAULT_CFLAGS] then {
+    set DEFAULT_CFLAGS " -ansi -pedantic-errors"
+}
+
+# Initialize `dg'.
+dg-init
+
+# Main loop.
+dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.\[cC\]]] \
+	"" $DEFAULT_CFLAGS
+
+# All done.
+dg-finish

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 10/16][AArch64] vld{2,3,4}{,_lane,_dup},vcombine,vcreate
  2015-07-07 12:32 [PATCH 0/16][ARM/AArch64] Float16_t support, v2 Alan Lawrence
                   ` (7 preceding siblings ...)
  2015-07-07 12:36 ` [PATCH 9/16][AArch64] Add support for float16x{4,8}_t vectors/builtins Alan Lawrence
@ 2015-07-07 12:36 ` Alan Lawrence
  2015-07-07 12:36 ` [PATCH 8/16][ARM/AArch64 Testsuite] Add basic fp16 tests Alan Lawrence
                   ` (6 subsequent siblings)
  15 siblings, 0 replies; 35+ messages in thread
From: Alan Lawrence @ 2015-07-07 12:36 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 65 bytes --]

As per https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01342.html

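For illustration (an editor's sketch, not part of the patch), the new
structured load/store intrinsics follow the same shape as the existing
f32 variants:

#include <arm_neon.h>

/* Hypothetical sketch: de-interleave eight float16 values into two
   float16x4_t vectors and store them back, using vld2_f16/vst2_f16
   as added by this patch.  */
void
round_trip (float16_t *buf)
{
  float16x4x2_t pair = vld2_f16 (buf);  /* ld2 {v0.4h, v1.4h}, [x0] */
  vst2_f16 (buf, pair);                 /* st2 {v0.4h, v1.4h}, [x0] */
}
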
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 10_aarch64_vcreate_et_al.patch --]
[-- Type: text/x-patch; name=10_aarch64_vcreate_et_al.patch, Size: 30201 bytes --]

commit ef719e5d3d6eccc5cf621851283b7c0ba1a9ee6c
Author: Alan Lawrence <alan.lawrence@arm.com>
Date:   Tue Aug 5 17:52:28 2014 +0100

    AArch64 3/N: v(create|combine|v(ld|st|ld...dup/lane|st...lane)[234](q?))_f16; tests vldN{,_lane,_dup} inc bigendian. Add __builtin_aarch64_simd_hf.
    
    Fix some casts, to ..._hf not ..._sf !

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index a6c3377..5367ba6 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -300,6 +300,12 @@ aarch64_types_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 #define VAR12(T, N, MAP, A, B, C, D, E, F, G, H, I, J, K, L) \
   VAR11 (T, N, MAP, A, B, C, D, E, F, G, H, I, J, K) \
   VAR1 (T, N, MAP, L)
+#define VAR13(T, N, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M) \
+  VAR12 (T, N, MAP, A, B, C, D, E, F, G, H, I, J, K, L) \
+  VAR1 (T, N, MAP, M)
+#define VAR14(T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N) \
+  VAR13 (T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M) \
+  VAR1 (T, X, MAP, N)
 
 #include "aarch64-builtin-iterators.h"
 
@@ -377,6 +383,7 @@ const char *aarch64_scalar_builtin_types[] = {
   "__builtin_aarch64_simd_qi",
   "__builtin_aarch64_simd_hi",
   "__builtin_aarch64_simd_si",
+  "__builtin_aarch64_simd_hf",
   "__builtin_aarch64_simd_sf",
   "__builtin_aarch64_simd_di",
   "__builtin_aarch64_simd_df",
@@ -664,6 +671,8 @@ aarch64_init_simd_builtin_scalar_types (void)
 					     "__builtin_aarch64_simd_qi");
   (*lang_hooks.types.register_builtin_type) (intHI_type_node,
 					     "__builtin_aarch64_simd_hi");
+  (*lang_hooks.types.register_builtin_type) (aarch64_fp16_type_node,
+					     "__builtin_aarch64_simd_hf");
   (*lang_hooks.types.register_builtin_type) (intSI_type_node,
 					     "__builtin_aarch64_simd_si");
   (*lang_hooks.types.register_builtin_type) (float_type_node,
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index ccf063a..bbf5230 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1063,6 +1063,9 @@ aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
 	case V2SImode:
 	  gen = gen_aarch64_simd_combinev2si;
 	  break;
+	case V4HFmode:
+	  gen = gen_aarch64_simd_combinev4hf;
+	  break;
 	case V2SFmode:
 	  gen = gen_aarch64_simd_combinev2sf;
 	  break;
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 7425485..d61e619 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -153,6 +153,16 @@ typedef struct uint64x2x2_t
   uint64x2_t val[2];
 } uint64x2x2_t;
 
+typedef struct float16x4x2_t
+{
+  float16x4_t val[2];
+} float16x4x2_t;
+
+typedef struct float16x8x2_t
+{
+  float16x8_t val[2];
+} float16x8x2_t;
+
 typedef struct float32x2x2_t
 {
   float32x2_t val[2];
@@ -273,6 +283,16 @@ typedef struct uint64x2x3_t
   uint64x2_t val[3];
 } uint64x2x3_t;
 
+typedef struct float16x4x3_t
+{
+  float16x4_t val[3];
+} float16x4x3_t;
+
+typedef struct float16x8x3_t
+{
+  float16x8_t val[3];
+} float16x8x3_t;
+
 typedef struct float32x2x3_t
 {
   float32x2_t val[3];
@@ -393,6 +413,16 @@ typedef struct uint64x2x4_t
   uint64x2_t val[4];
 } uint64x2x4_t;
 
+typedef struct float16x4x4_t
+{
+  float16x4_t val[4];
+} float16x4x4_t;
+
+typedef struct float16x8x4_t
+{
+  float16x8_t val[4];
+} float16x8x4_t;
+
 typedef struct float32x2x4_t
 {
   float32x2_t val[4];
@@ -2644,6 +2674,12 @@ vcreate_s64 (uint64_t __a)
   return (int64x1_t) {__a};
 }
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vcreate_f16 (uint64_t __a)
+{
+  return (float16x4_t) __a;
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vcreate_f32 (uint64_t __a)
 {
@@ -4780,6 +4816,12 @@ vcombine_s64 (int64x1_t __a, int64x1_t __b)
   return __builtin_aarch64_combinedi (__a[0], __b[0]);
 }
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vcombine_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_aarch64_combinev4hf (__a, __b);
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vcombine_f32 (float32x2_t __a, float32x2_t __b)
 {
@@ -9908,7 +9950,7 @@ vtstq_p16 (poly16x8_t a, poly16x8_t b)
    +------+----+----+----+----+
    |uint  | Y  | Y  | N  | N  |
    +------+----+----+----+----+
-   |float | -  | -  | N  | N  |
+   |float | -  | Y  | N  | N  |
    +------+----+----+----+----+
    |poly  | Y  | Y  | -  | -  |
    +------+----+----+----+----+
@@ -9922,7 +9964,7 @@ vtstq_p16 (poly16x8_t a, poly16x8_t b)
    +------+----+----+----+----+
    |uint  | Y  | Y  | Y  | Y  |
    +------+----+----+----+----+
-   |float | -  | -  | Y  | Y  |
+   |float | -  | Y  | Y  | Y  |
    +------+----+----+----+----+
    |poly  | Y  | Y  | -  | -  |
    +------+----+----+----+----+
@@ -9936,7 +9978,7 @@ vtstq_p16 (poly16x8_t a, poly16x8_t b)
    +------+----+----+----+----+
    |uint  | Y  | N  | N  | Y  |
    +------+----+----+----+----+
-   |float | -  | -  | N  | Y  |
+   |float | -  | N  | N  | Y  |
    +------+----+----+----+----+
    |poly  | Y  | N  | -  | -  |
    +------+----+----+----+----+
@@ -9952,6 +9994,7 @@ __STRUCTN (int, 8, 2)
 __STRUCTN (int, 16, 2)
 __STRUCTN (uint, 8, 2)
 __STRUCTN (uint, 16, 2)
+__STRUCTN (float, 16, 2)
 __STRUCTN (poly, 8, 2)
 __STRUCTN (poly, 16, 2)
 /* 3-element structs.  */
@@ -9963,6 +10006,7 @@ __STRUCTN (uint, 8, 3)
 __STRUCTN (uint, 16, 3)
 __STRUCTN (uint, 32, 3)
 __STRUCTN (uint, 64, 3)
+__STRUCTN (float, 16, 3)
 __STRUCTN (float, 32, 3)
 __STRUCTN (float, 64, 3)
 __STRUCTN (poly, 8, 3)
@@ -10000,6 +10044,8 @@ vst2_lane_ ## funcsuffix (ptrtype *__ptr,				     \
 				     __ptr, __o, __c);			     \
 }
 
+__ST2_LANE_FUNC (float16x4x2_t, float16x8x2_t, float16_t, v8hf, hf, f16,
+		 float16x8_t)
 __ST2_LANE_FUNC (float32x2x2_t, float32x4x2_t, float32_t, v4sf, sf, f32,
 		 float32x4_t)
 __ST2_LANE_FUNC (float64x1x2_t, float64x2x2_t, float64_t, v2df, df, f64,
@@ -10032,6 +10078,7 @@ vst2q_lane_ ## funcsuffix (ptrtype *__ptr,				    \
 				    __ptr, __temp.__o, __c);		    \
 }
 
+__ST2_LANE_FUNC (float16x8x2_t, float16_t, v8hf, hf, f16)
 __ST2_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32)
 __ST2_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64)
 __ST2_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8)
@@ -10073,6 +10120,8 @@ vst3_lane_ ## funcsuffix (ptrtype *__ptr,				     \
 				     __ptr, __o, __c);			     \
 }
 
+__ST3_LANE_FUNC (float16x4x3_t, float16x8x3_t, float16_t, v8hf, hf, f16,
+		 float16x8_t)
 __ST3_LANE_FUNC (float32x2x3_t, float32x4x3_t, float32_t, v4sf, sf, f32,
 		 float32x4_t)
 __ST3_LANE_FUNC (float64x1x3_t, float64x2x3_t, float64_t, v2df, df, f64,
@@ -10105,6 +10154,7 @@ vst3q_lane_ ## funcsuffix (ptrtype *__ptr,				    \
 				    __ptr, __temp.__o, __c);		    \
 }
 
+__ST3_LANE_FUNC (float16x8x3_t, float16_t, v8hf, hf, f16)
 __ST3_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32)
 __ST3_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64)
 __ST3_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8)
@@ -10151,6 +10201,8 @@ vst4_lane_ ## funcsuffix (ptrtype *__ptr,				     \
 				     __ptr, __o, __c);			     \
 }
 
+__ST4_LANE_FUNC (float16x4x4_t, float16x8x4_t, float16_t, v8hf, hf, f16,
+		 float16x8_t)
 __ST4_LANE_FUNC (float32x2x4_t, float32x4x4_t, float32_t, v4sf, sf, f32,
 		 float32x4_t)
 __ST4_LANE_FUNC (float64x1x4_t, float64x2x4_t, float64_t, v2df, df, f64,
@@ -10183,6 +10235,7 @@ vst4q_lane_ ## funcsuffix (ptrtype *__ptr,				    \
 				    __ptr, __temp.__o, __c);		    \
 }
 
+__ST4_LANE_FUNC (float16x8x4_t, float16_t, v8hf, hf, f16)
 __ST4_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32)
 __ST4_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64)
 __ST4_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8)
@@ -15239,6 +15292,17 @@ vld2_u32 (const uint32_t * __a)
   return ret;
 }
 
+__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
+vld2_f16 (const float16_t * __a)
+{
+  float16x4x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2v4hf (__a);
+  ret.val[0] = __builtin_aarch64_get_dregoiv4hf (__o, 0);
+  ret.val[1] = __builtin_aarch64_get_dregoiv4hf (__o, 1);
+  return ret;
+}
+
 __extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
 vld2_f32 (const float32_t * __a)
 {
@@ -15360,6 +15424,17 @@ vld2q_u64 (const uint64_t * __a)
   return ret;
 }
 
+__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__))
+vld2q_f16 (const float16_t * __a)
+{
+  float16x8x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2v8hf (__a);
+  ret.val[0] = __builtin_aarch64_get_qregoiv8hf (__o, 0);
+  ret.val[1] = __builtin_aarch64_get_qregoiv8hf (__o, 1);
+  return ret;
+}
+
 __extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
 vld2q_f32 (const float32_t * __a)
 {
@@ -15514,6 +15589,18 @@ vld3_u32 (const uint32_t * __a)
   return ret;
 }
 
+__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__))
+vld3_f16 (const float16_t * __a)
+{
+  float16x4x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3v4hf (__a);
+  ret.val[0] = __builtin_aarch64_get_dregciv4hf (__o, 0);
+  ret.val[1] = __builtin_aarch64_get_dregciv4hf (__o, 1);
+  ret.val[2] = __builtin_aarch64_get_dregciv4hf (__o, 2);
+  return ret;
+}
+
 __extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
 vld3_f32 (const float32_t * __a)
 {
@@ -15646,6 +15733,18 @@ vld3q_u64 (const uint64_t * __a)
   return ret;
 }
 
+__extension__ static __inline float16x8x3_t __attribute__ ((__always_inline__))
+vld3q_f16 (const float16_t * __a)
+{
+  float16x8x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3v8hf (__a);
+  ret.val[0] = __builtin_aarch64_get_qregciv8hf (__o, 0);
+  ret.val[1] = __builtin_aarch64_get_qregciv8hf (__o, 1);
+  ret.val[2] = __builtin_aarch64_get_qregciv8hf (__o, 2);
+  return ret;
+}
+
 __extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
 vld3q_f32 (const float32_t * __a)
 {
@@ -15813,6 +15912,19 @@ vld4_u32 (const uint32_t * __a)
   return ret;
 }
 
+__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__))
+vld4_f16 (const float16_t * __a)
+{
+  float16x4x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4v4hf (__a);
+  ret.val[0] = __builtin_aarch64_get_dregxiv4hf (__o, 0);
+  ret.val[1] = __builtin_aarch64_get_dregxiv4hf (__o, 1);
+  ret.val[2] = __builtin_aarch64_get_dregxiv4hf (__o, 2);
+  ret.val[3] = __builtin_aarch64_get_dregxiv4hf (__o, 3);
+  return ret;
+}
+
 __extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
 vld4_f32 (const float32_t * __a)
 {
@@ -15956,6 +16068,19 @@ vld4q_u64 (const uint64_t * __a)
   return ret;
 }
 
+__extension__ static __inline float16x8x4_t __attribute__ ((__always_inline__))
+vld4q_f16 (const float16_t * __a)
+{
+  float16x8x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4v8hf (__a);
+  ret.val[0] = __builtin_aarch64_get_qregxiv8hf (__o, 0);
+  ret.val[1] = __builtin_aarch64_get_qregxiv8hf (__o, 1);
+  ret.val[2] = __builtin_aarch64_get_qregxiv8hf (__o, 2);
+  ret.val[3] = __builtin_aarch64_get_qregxiv8hf (__o, 3);
+  return ret;
+}
+
 __extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
 vld4q_f32 (const float32_t * __a)
 {
@@ -16017,6 +16142,18 @@ vld2_dup_s32 (const int32_t * __a)
   return ret;
 }
 
+
+__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
+vld2_dup_f16 (const float16_t * __a)
+{
+  float16x4x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rv4hf ((const __builtin_aarch64_simd_hf *) __a);
+  ret.val[0] = __builtin_aarch64_get_dregoiv4hf (__o, 0);
+  ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregoiv4hf (__o, 1);
+  return ret;
+}
+
 __extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
 vld2_dup_f32 (const float32_t * __a)
 {
@@ -16226,6 +16363,17 @@ vld2q_dup_u64 (const uint64_t * __a)
   return ret;
 }
 
+__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__))
+vld2q_dup_f16 (const float16_t * __a)
+{
+  float16x8x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rv8hf ((const __builtin_aarch64_simd_hf *) __a);
+  ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregoiv8hf (__o, 0);
+  ret.val[1] = __builtin_aarch64_get_qregoiv8hf (__o, 1);
+  return ret;
+}
+
 __extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
 vld2q_dup_f32 (const float32_t * __a)
 {
@@ -16380,6 +16528,18 @@ vld3_dup_u32 (const uint32_t * __a)
   return ret;
 }
 
+__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__))
+vld3_dup_f16 (const float16_t * __a)
+{
+  float16x4x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rv4hf ((const __builtin_aarch64_simd_hf *) __a);
+  ret.val[0] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 0);
+  ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 1);
+  ret.val[2] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 2);
+  return ret;
+}
+
 __extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
 vld3_dup_f32 (const float32_t * __a)
 {
@@ -16512,6 +16672,18 @@ vld3q_dup_u64 (const uint64_t * __a)
   return ret;
 }
 
+__extension__ static __inline float16x8x3_t __attribute__ ((__always_inline__))
+vld3q_dup_f16 (const float16_t * __a)
+{
+  float16x8x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rv8hf ((const __builtin_aarch64_simd_hf *) __a);
+  ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 0);
+  ret.val[1] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 1);
+  ret.val[2] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 2);
+  return ret;
+}
+
 __extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
 vld3q_dup_f32 (const float32_t * __a)
 {
@@ -16679,6 +16851,19 @@ vld4_dup_u32 (const uint32_t * __a)
   return ret;
 }
 
+__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__))
+vld4_dup_f16 (const float16_t * __a)
+{
+  float16x4x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rv4hf ((const __builtin_aarch64_simd_hf *) __a);
+  ret.val[0] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 0);
+  ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 1);
+  ret.val[2] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 2);
+  ret.val[3] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 3);
+  return ret;
+}
+
 __extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
 vld4_dup_f32 (const float32_t * __a)
 {
@@ -16822,6 +17007,19 @@ vld4q_dup_u64 (const uint64_t * __a)
   return ret;
 }
 
+__extension__ static __inline float16x8x4_t __attribute__ ((__always_inline__))
+vld4q_dup_f16 (const float16_t * __a)
+{
+  float16x8x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rv8hf ((const __builtin_aarch64_simd_hf *) __a);
+  ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 0);
+  ret.val[1] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 1);
+  ret.val[2] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 2);
+  ret.val[3] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 3);
+  return ret;
+}
+
 __extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
 vld4q_dup_f32 (const float32_t * __a)
 {
@@ -16874,6 +17072,8 @@ vld2_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c)  \
   return __b;								   \
 }
 
+__LD2_LANE_FUNC (float16x4x2_t, float16x4_t, float16x8x2_t, float16_t, v8hf,
+		 hf, f16, float16x8_t)
 __LD2_LANE_FUNC (float32x2x2_t, float32x2_t, float32x4x2_t, float32_t, v4sf,
 		 sf, f32, float32x4_t)
 __LD2_LANE_FUNC (float64x1x2_t, float64x1_t, float64x2x2_t, float64_t, v2df,
@@ -16918,6 +17118,7 @@ vld2q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
   return ret;								   \
 }
 
+__LD2_LANE_FUNC (float16x8x2_t, float16x8_t, float16_t, v8hf, hf, f16)
 __LD2_LANE_FUNC (float32x4x2_t, float32x4_t, float32_t, v4sf, sf, f32)
 __LD2_LANE_FUNC (float64x2x2_t, float64x2_t, float64_t, v2df, df, f64)
 __LD2_LANE_FUNC (poly8x16x2_t, poly8x16_t, poly8_t, v16qi, qi, p8)
@@ -16965,6 +17166,8 @@ vld3_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c)  \
   return __b;								   \
 }
 
+__LD3_LANE_FUNC (float16x4x3_t, float16x4_t, float16x8x3_t, float16_t, v8hf,
+		 hf, f16, float16x8_t)
 __LD3_LANE_FUNC (float32x2x3_t, float32x2_t, float32x4x3_t, float32_t, v4sf,
 		 sf, f32, float32x4_t)
 __LD3_LANE_FUNC (float64x1x3_t, float64x1_t, float64x2x3_t, float64_t, v2df,
@@ -17011,6 +17214,7 @@ vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
   return ret;								   \
 }
 
+__LD3_LANE_FUNC (float16x8x3_t, float16x8_t, float16_t, v8hf, hf, f16)
 __LD3_LANE_FUNC (float32x4x3_t, float32x4_t, float32_t, v4sf, sf, f32)
 __LD3_LANE_FUNC (float64x2x3_t, float64x2_t, float64_t, v2df, df, f64)
 __LD3_LANE_FUNC (poly8x16x3_t, poly8x16_t, poly8_t, v16qi, qi, p8)
@@ -17066,6 +17270,8 @@ vld4_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c)  \
 
 /* vld4q_lane */
 
+__LD4_LANE_FUNC (float16x4x4_t, float16x4_t, float16x8x4_t, float16_t, v8hf,
+		 hf, f16, float16x8_t)
 __LD4_LANE_FUNC (float32x2x4_t, float32x2_t, float32x4x4_t, float32_t, v4sf,
 		 sf, f32, float32x4_t)
 __LD4_LANE_FUNC (float64x1x4_t, float64x1_t, float64x2x4_t, float64_t, v2df,
@@ -17114,6 +17320,7 @@ vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
   return ret;								   \
 }
 
+__LD4_LANE_FUNC (float16x8x4_t, float16x8_t, float16_t, v8hf, hf, f16)
 __LD4_LANE_FUNC (float32x4x4_t, float32x4_t, float32_t, v4sf, sf, f32)
 __LD4_LANE_FUNC (float64x2x4_t, float64x2_t, float64_t, v2df, df, f64)
 __LD4_LANE_FUNC (poly8x16x4_t, poly8x16_t, poly8_t, v16qi, qi, p8)
@@ -22474,6 +22681,18 @@ vst2_u32 (uint32_t * __a, uint32x2x2_t val)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_f16 (float16_t * __a, float16x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  float16x8x2_t temp;
+  temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[1], 1);
+  __builtin_aarch64_st2v4hf (__a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst2_f32 (float32_t * __a, float32x2x2_t val)
 {
   __builtin_aarch64_simd_oi __o;
@@ -22576,6 +22795,15 @@ vst2q_u64 (uint64_t * __a, uint64x2x2_t val)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_f16 (float16_t * __a, float16x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[1], 1);
+  __builtin_aarch64_st2v8hf (__a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst2q_f32 (float32_t * __a, float32x4x2_t val)
 {
   __builtin_aarch64_simd_oi __o;
@@ -22748,6 +22976,20 @@ vst3_u32 (uint32_t * __a, uint32x2x3_t val)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_f16 (float16_t * __a, float16x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  float16x8x3_t temp;
+  temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[2], 2);
+  __builtin_aarch64_st3v4hf ((__builtin_aarch64_simd_hf *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst3_f32 (float32_t * __a, float32x2x3_t val)
 {
   __builtin_aarch64_simd_ci __o;
@@ -22862,6 +23104,16 @@ vst3q_u64 (uint64_t * __a, uint64x2x3_t val)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_f16 (float16_t * __a, float16x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[2], 2);
+  __builtin_aarch64_st3v8hf ((__builtin_aarch64_simd_hf *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst3q_f32 (float32_t * __a, float32x4x3_t val)
 {
   __builtin_aarch64_simd_ci __o;
@@ -23058,6 +23310,22 @@ vst4_u32 (uint32_t * __a, uint32x2x4_t val)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_f16 (float16_t * __a, float16x4x4_t val)
+{
+  __builtin_aarch64_simd_xi __o;
+  float16x8x4_t temp;
+  temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[3] = vcombine_f16 (val.val[3], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[3], 3);
+  __builtin_aarch64_st4v4hf ((__builtin_aarch64_simd_hf *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst4_f32 (float32_t * __a, float32x2x4_t val)
 {
   __builtin_aarch64_simd_xi __o;
@@ -23184,6 +23452,17 @@ vst4q_u64 (uint64_t * __a, uint64x2x4_t val)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_f16 (float16_t * __a, float16x8x4_t val)
+{
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[3], 3);
+  __builtin_aarch64_st4v8hf ((__builtin_aarch64_simd_hf *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst4q_f32 (float32_t * __a, float32x4x4_t val)
 {
   __builtin_aarch64_simd_xi __o;
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index a7aaa52..96920cf 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -113,7 +113,7 @@
 
 ;; All vector modes and DI and DF.
 (define_mode_iterator VALLDIF [V8QI V16QI V4HI V8HI V2SI V4SI
-			       V2DI V2SF V4SF V2DF DI DF])
+			       V2DI V4HF V8HF V2SF V4SF V2DF DI DF])
 
 ;; Vector modes for Integer reduction across lanes.
 (define_mode_iterator VDQV [V8QI V16QI V4HI V8HI V4SI V2DI])
@@ -134,7 +134,7 @@
 (define_mode_iterator VQW [V16QI V8HI V4SI])
 
 ;; Double vector modes for combines.
-(define_mode_iterator VDC [V8QI V4HI V2SI V2SF DI DF])
+(define_mode_iterator VDC [V8QI V4HI V4HF V2SI V2SF DI DF])
 
 ;; Vector modes except double int.
 (define_mode_iterator VDQIF [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF V2DF])
@@ -364,7 +364,8 @@
                          (V2SI "2s") (V4SI  "4s")
                          (DI   "1d") (DF    "1d")
                          (V2DI "2d") (V2SF "2s")
-			 (V4SF "4s") (V2DF "2d")])
+			 (V4SF "4s") (V2DF "2d")
+			 (V4HF "4h") (V8HF "8h")])
 
 (define_mode_attr Vrevsuff [(V4HI "16") (V8HI "16") (V2SI "32")
                             (V4SI "32") (V2DI "64")])
@@ -390,7 +391,8 @@
 (define_mode_attr Vetype [(V8QI "b") (V16QI "b")
 			  (V4HI "h") (V8HI  "h")
                           (V2SI "s") (V4SI  "s")
-			  (V2DI "d") (V2SF  "s")
+			  (V2DI "d") (V4HF "h")
+			  (V8HF "h") (V2SF  "s")
 			  (V4SF "s") (V2DF  "d")
 			  (SF   "s") (DF  "d")
 			  (QI "b")   (HI "h")
@@ -400,7 +402,8 @@
 (define_mode_attr Vbtype [(V8QI "8b")  (V16QI "16b")
 			  (V4HI "8b") (V8HI  "16b")
 			  (V2SI "8b") (V4SI  "16b")
-			  (V2DI "16b") (V2SF  "8b")
+			  (V2DI "16b") (V4HF "8b")
+			  (V8HF "16b") (V2SF  "8b")
 			  (V4SF "16b") (V2DF  "16b")
 			  (DI   "8b")  (DF    "8b")
 			  (SI   "8b")])
@@ -451,6 +454,7 @@
 
 ;; Double modes of vector modes (lower case).
 (define_mode_attr Vdbl [(V8QI "v16qi") (V4HI "v8hi")
+			(V4HF "v8hf")
 			(V2SI "v4si")  (V2SF "v4sf")
 			(SI   "v2si")  (DI   "v2di")
 			(DF   "v2df")])
@@ -525,6 +529,7 @@
 				(V4HI "V4HI") (V8HI  "V8HI")
 				(V2SI "V2SI") (V4SI  "V4SI")
 				(DI   "DI")   (V2DI  "V2DI")
+				(V4HF "V4HI") (V8HF  "V8HI")
 				(V2SF "V2SI") (V4SF  "V4SI")
 				(V2DF "V2DI") (DF    "DI")
 				(SF   "SI")])
@@ -534,6 +539,7 @@
 				(V4HI "v4hi") (V8HI  "v8hi")
 				(V2SI "v2si") (V4SI  "v4si")
 				(DI   "di")   (V2DI  "v2di")
+				(V4HF "v4hi") (V8HF  "v8hi")
 				(V2SF "v2si") (V4SF  "v4si")
 				(V2DF "v2di") (DF    "di")
 				(SF   "si")])
diff --git a/gcc/testsuite/gcc.target/aarch64/vldN_1.c b/gcc/testsuite/gcc.target/aarch64/vldN_1.c
index b64de16..caac94f 100644
--- a/gcc/testsuite/gcc.target/aarch64/vldN_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/vldN_1.c
@@ -39,6 +39,7 @@ VARIANT (int32, 2, STRUCT, _s32)	\
 VARIANT (int64, 1, STRUCT, _s64)	\
 VARIANT (poly8, 8, STRUCT, _p8)		\
 VARIANT (poly16, 4, STRUCT, _p16)	\
+VARIANT (float16, 4, STRUCT, _f16)	\
 VARIANT (float32, 2, STRUCT, _f32)	\
 VARIANT (float64, 1, STRUCT, _f64)	\
 VARIANT (uint8, 16, STRUCT, q_u8)	\
@@ -51,6 +52,7 @@ VARIANT (int32, 4, STRUCT, q_s32)	\
 VARIANT (int64, 2, STRUCT, q_s64)	\
 VARIANT (poly8, 16, STRUCT, q_p8)	\
 VARIANT (poly16, 8, STRUCT, q_p16)	\
+VARIANT (float16, 8, STRUCT, q_f16)	\
 VARIANT (float32, 4, STRUCT, q_f32)	\
 VARIANT (float64, 2, STRUCT, q_f64)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/vldN_dup_1.c b/gcc/testsuite/gcc.target/aarch64/vldN_dup_1.c
index 9af0565..68c3fc3 100644
--- a/gcc/testsuite/gcc.target/aarch64/vldN_dup_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/vldN_dup_1.c
@@ -16,6 +16,7 @@ VARIANT (int32, , 2, _s32, STRUCT)	\
 VARIANT (int64, , 1, _s64, STRUCT)	\
 VARIANT (poly8, , 8, _p8, STRUCT)	\
 VARIANT (poly16, , 4, _p16, STRUCT)	\
+VARIANT (float16, , 4, _f16, STRUCT)	\
 VARIANT (float32, , 2, _f32, STRUCT)	\
 VARIANT (float64, , 1, _f64, STRUCT)	\
 VARIANT (uint8, q, 16, _u8, STRUCT)	\
@@ -28,6 +29,7 @@ VARIANT (int32, q, 4, _s32, STRUCT)	\
 VARIANT (int64, q, 2, _s64, STRUCT)	\
 VARIANT (poly8, q, 16, _p8, STRUCT)	\
 VARIANT (poly16, q, 8, _p16, STRUCT)	\
+VARIANT (float16, q, 8, _f16, STRUCT)	\
 VARIANT (float32, q, 4, _f32, STRUCT)	\
 VARIANT (float64, q, 2, _f64, STRUCT)
 
@@ -74,6 +76,7 @@ main (int argc, char **argv)
   int64_t *int64_data = (int64_t *)uint64_data;
   poly8_t poly8_data[4] = { 0, 7, 13, 18, };
   poly16_t poly16_data[4] = { 11111, 2222, 333, 44 };
+  float16_t float16_data[4] = { 1.0625, 3.125, 0.03125, 7.75 };
   float32_t float32_data[4] = { 3.14159, 2.718, 1.414, 100.0 };
   float64_t float64_data[4] = { 1.010010001, 12345.6789, -9876.54321, 1.618 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/vldN_lane_1.c b/gcc/testsuite/gcc.target/aarch64/vldN_lane_1.c
index 13ab454..6837a11 100644
--- a/gcc/testsuite/gcc.target/aarch64/vldN_lane_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/vldN_lane_1.c
@@ -16,6 +16,7 @@ VARIANT (int32, , 2, _s32, 0, STRUCT)	\
 VARIANT (int64, , 1, _s64, 0, STRUCT)	\
 VARIANT (poly8, , 8, _p8, 7, STRUCT)	\
 VARIANT (poly16, , 4, _p16, 1, STRUCT)	\
+VARIANT (float16, , 4, _f16, 3, STRUCT)	\
 VARIANT (float32, , 2, _f32, 1, STRUCT)	\
 VARIANT (float64, , 1, _f64, 0, STRUCT)	\
 VARIANT (uint8, q, 16, _u8, 14, STRUCT)	\
@@ -28,6 +29,7 @@ VARIANT (int32, q, 4, _s32, 2, STRUCT)	\
 VARIANT (int64, q, 2, _s64, 1, STRUCT)	\
 VARIANT (poly8, q, 16, _p8, 12, STRUCT)	\
 VARIANT (poly16, q, 8, _p16, 5, STRUCT)	\
+VARIANT (float16, q, 8, _f16, 7, STRUCT)\
 VARIANT (float32, q, 4, _f32, 1, STRUCT)\
 VARIANT (float64, q, 2, _f64, 0, STRUCT)
 
@@ -71,7 +73,7 @@ main (int argc, char **argv)
 {
   /* Original data for all vector formats.  */
   uint64_t orig_data[8] = {0x1234567890abcdefULL, 0x13579bdf02468aceULL,
-			   0x012389ab4567cdefULL, 0xfeeddadacafe0431ULL,
+			   0x012389ab4567cdefULL, 0xdeeddadacafe0431ULL,
 			   0x1032547698badcfeULL, 0xbadbadbadbad0badULL,
 			   0x0102030405060708ULL, 0x0f0e0d0c0b0a0908ULL};
 
@@ -87,6 +89,7 @@ main (int argc, char **argv)
   int64_t *int64_data = (int64_t *)uint64_data;
   poly8_t poly8_data[4] = { 0, 7, 13, 18, };
   poly16_t poly16_data[4] = { 11111, 2222, 333, 44 };
+  float16_t float16_data[4] = { 0.8125, 7.5, 19, 0.046875 };
   float32_t float32_data[4] = { 3.14159, 2.718, 1.414, 100.0 };
   float64_t float64_data[4] = { 1.010010001, 12345.6789, -9876.54321, 1.618 };
 

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 9/16][AArch64] Add support for float16x{4,8}_t vectors/builtins
  2015-07-07 12:32 [PATCH 0/16][ARM/AArch64] Float16_t support, v2 Alan Lawrence
                   ` (6 preceding siblings ...)
  2015-07-07 12:35 ` [PATCH 6/16][ARM] Remaining float16 intrinsics: vld..., vst..., vget_low/high, vcombine Alan Lawrence
@ 2015-07-07 12:36 ` Alan Lawrence
  2015-07-07 12:36 ` [PATCH 10/16][AArch64] vld{2,3,4}{,_lane,_dup},vcombine,vcreate Alan Lawrence
                   ` (7 subsequent siblings)
  15 siblings, 0 replies; 35+ messages in thread
From: Alan Lawrence @ 2015-07-07 12:36 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 61 bytes --]

As per https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01341.html

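For illustration (an editor's sketch, not part of the patch), the new
vector types plus the ld1/st1/lane intrinsics support the usual
load-modify-store idiom:

#include <arm_neon.h>

/* Hypothetical sketch: load eight float16 lanes, overwrite lane 3,
   and store the vector back, using intrinsics added by this patch.  */
void
set_lane_3 (float16_t *p, float16_t x)
{
  float16x8_t v = vld1q_f16 (p);  /* ld1 {v0.8h}, [x0] */
  v = vsetq_lane_f16 (x, v, 3);   /* replace element 3 */
  vst1q_f16 (p, v);               /* st1 {v0.8h}, [x0] */
}
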
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 09_aarch64_float16_vectors.patch --]
[-- Type: text/x-patch; name=09_aarch64_float16_vectors.patch, Size: 26493 bytes --]

commit 49cb53a94a44fcda845c3f6ef11e88f9be458aad
Author: Alan Lawrence <alan.lawrence@arm.com>
Date:   Tue Dec 2 13:08:15 2014 +0000

    AArch64 2/N: Vector/__builtin basics: define+support types, movs, test ABI.
    
    Patterns, builtins, intrinsics for {ld1,st1}{,_lane},v{g,s}et_lane. Tests: vld1-vst1_1, vset_lane_1, vld1_lane.c

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index cfb2dc1..a6c3377 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -66,6 +66,7 @@
 
 #define v8qi_UP  V8QImode
 #define v4hi_UP  V4HImode
+#define v4hf_UP  V4HFmode
 #define v2si_UP  V2SImode
 #define v2sf_UP  V2SFmode
 #define v1df_UP  V1DFmode
@@ -73,6 +74,7 @@
 #define df_UP    DFmode
 #define v16qi_UP V16QImode
 #define v8hi_UP  V8HImode
+#define v8hf_UP  V8HFmode
 #define v4si_UP  V4SImode
 #define v4sf_UP  V4SFmode
 #define v2di_UP  V2DImode
@@ -523,6 +525,8 @@ aarch64_simd_builtin_std_type (enum machine_mode mode,
       return aarch64_simd_intCI_type_node;
     case XImode:
       return aarch64_simd_intXI_type_node;
+    case HFmode:
+      return aarch64_fp16_type_node;
     case SFmode:
       return float_type_node;
     case DFmode:
@@ -607,6 +611,8 @@ aarch64_init_simd_builtin_types (void)
   aarch64_simd_types[Poly64x2_t].eltype = aarch64_simd_types[Poly64_t].itype;
 
   /* Continue with standard types.  */
+  aarch64_simd_types[Float16x4_t].eltype = aarch64_fp16_type_node;
+  aarch64_simd_types[Float16x8_t].eltype = aarch64_fp16_type_node;
   aarch64_simd_types[Float32x2_t].eltype = float_type_node;
   aarch64_simd_types[Float32x4_t].eltype = float_type_node;
   aarch64_simd_types[Float64x1_t].eltype = double_type_node;
diff --git a/gcc/config/aarch64/aarch64-simd-builtin-types.def b/gcc/config/aarch64/aarch64-simd-builtin-types.def
index bb54e56..ea219b7 100644
--- a/gcc/config/aarch64/aarch64-simd-builtin-types.def
+++ b/gcc/config/aarch64/aarch64-simd-builtin-types.def
@@ -44,6 +44,8 @@
   ENTRY (Poly16x8_t, V8HI, poly, 12)
   ENTRY (Poly64x1_t, DI, poly, 12)
   ENTRY (Poly64x2_t, V2DI, poly, 12)
+  ENTRY (Float16x4_t, V4HF, none, 13)
+  ENTRY (Float16x8_t, V8HF, none, 13)
   ENTRY (Float32x2_t, V2SF, none, 13)
   ENTRY (Float32x4_t, V4SF, none, 13)
   ENTRY (Float64x1_t, V1DF, none, 13)
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index dd2bc47..4dd2bc7 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -367,11 +367,11 @@
   VAR1 (UNOP, float_extend_lo_, 0, v2df)
   VAR1 (UNOP, float_truncate_lo_, 0, v2sf)
 
-  /* Implemented by aarch64_ld1<VALL:mode>.  */
-  BUILTIN_VALL (LOAD1, ld1, 0)
+  /* Implemented by aarch64_ld1<VALL_F16:mode>.  */
+  BUILTIN_VALL_F16 (LOAD1, ld1, 0)
 
-  /* Implemented by aarch64_st1<VALL:mode>.  */
-  BUILTIN_VALL (STORE1, st1, 0)
+  /* Implemented by aarch64_st1<VALL_F16:mode>.  */
+  BUILTIN_VALL_F16 (STORE1, st1, 0)
 
   /* Implemented by fma<mode>4.  */
   BUILTIN_VDQF (TERNOP, fma, 4)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index b90f938..5cc45ed 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -19,8 +19,8 @@
 ;; <http://www.gnu.org/licenses/>.
 
 (define_expand "mov<mode>"
-  [(set (match_operand:VALL 0 "nonimmediate_operand" "")
-	(match_operand:VALL 1 "general_operand" ""))]
+  [(set (match_operand:VALL_F16 0 "nonimmediate_operand" "")
+	(match_operand:VALL_F16 1 "general_operand" ""))]
   "TARGET_SIMD"
   "
     if (GET_CODE (operands[0]) == MEM)
@@ -2450,7 +2450,7 @@
 (define_insn "aarch64_get_lane<mode>"
   [(set (match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand" "=r, w, Utv")
 	(vec_select:<VEL>
-	  (match_operand:VALL 1 "register_operand" "w, w, w")
+	  (match_operand:VALL_F16 1 "register_operand" "w, w, w")
 	  (parallel [(match_operand:SI 2 "immediate_operand" "i, i, i")])))]
   "TARGET_SIMD"
   {
@@ -4234,8 +4234,9 @@
 )
 
 (define_insn "aarch64_be_ld1<mode>"
-  [(set (match_operand:VALLDI 0	"register_operand" "=w")
-	(unspec:VALLDI [(match_operand:VALLDI 1 "aarch64_simd_struct_operand" "Utv")]
+  [(set (match_operand:VALLDI_F16 0	"register_operand" "=w")
+	(unspec:VALLDI_F16 [(match_operand:VALLDI_F16 1
+			     "aarch64_simd_struct_operand" "Utv")]
 	UNSPEC_LD1))]
   "TARGET_SIMD"
   "ld1\\t{%0<Vmtype>}, %1"
@@ -4243,8 +4244,8 @@
 )
 
 (define_insn "aarch64_be_st1<mode>"
-  [(set (match_operand:VALLDI 0 "aarch64_simd_struct_operand" "=Utv")
-	(unspec:VALLDI [(match_operand:VALLDI 1 "register_operand" "w")]
+  [(set (match_operand:VALLDI_F16 0 "aarch64_simd_struct_operand" "=Utv")
+	(unspec:VALLDI_F16 [(match_operand:VALLDI_F16 1 "register_operand" "w")]
 	UNSPEC_ST1))]
   "TARGET_SIMD"
   "st1\\t{%1<Vmtype>}, %0"
@@ -4533,16 +4534,16 @@
   DONE;
 })
 
-(define_expand "aarch64_ld1<VALL:mode>"
- [(match_operand:VALL 0 "register_operand")
+(define_expand "aarch64_ld1<VALL_F16:mode>"
+ [(match_operand:VALL_F16 0 "register_operand")
   (match_operand:DI 1 "register_operand")]
   "TARGET_SIMD"
 {
-  machine_mode mode = <VALL:MODE>mode;
+  machine_mode mode = <VALL_F16:MODE>mode;
   rtx mem = gen_rtx_MEM (mode, operands[1]);
 
   if (BYTES_BIG_ENDIAN)
-    emit_insn (gen_aarch64_be_ld1<VALL:mode> (operands[0], mem));
+    emit_insn (gen_aarch64_be_ld1<VALL_F16:mode> (operands[0], mem));
   else
     emit_move_insn (operands[0], mem);
   DONE;
@@ -4901,16 +4902,16 @@
   DONE;
 })
 
-(define_expand "aarch64_st1<VALL:mode>"
+(define_expand "aarch64_st1<VALL_F16:mode>"
  [(match_operand:DI 0 "register_operand")
-  (match_operand:VALL 1 "register_operand")]
+  (match_operand:VALL_F16 1 "register_operand")]
   "TARGET_SIMD"
 {
-  machine_mode mode = <VALL:MODE>mode;
+  machine_mode mode = <VALL_F16:MODE>mode;
   rtx mem = gen_rtx_MEM (mode, operands[0]);
 
   if (BYTES_BIG_ENDIAN)
-    emit_insn (gen_aarch64_be_st1<VALL:mode> (mem, operands[1]));
+    emit_insn (gen_aarch64_be_st1<VALL_F16:mode> (mem, operands[1]));
   else
     emit_move_insn (mem, operands[1]);
   DONE;
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index f338033..ccf063a 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1111,6 +1111,9 @@ aarch64_split_simd_move (rtx dst, rtx src)
 	case V2DImode:
 	  gen = gen_aarch64_split_simd_movv2di;
 	  break;
+	case V8HFmode:
+	  gen = gen_aarch64_split_simd_movv8hf;
+	  break;
 	case V4SFmode:
 	  gen = gen_aarch64_split_simd_movv4sf;
 	  break;
@@ -8264,6 +8267,7 @@ aarch64_vector_mode_supported_p (machine_mode mode)
 	  || mode == V2SImode  || mode == V4HImode
 	  || mode == V8QImode || mode == V2SFmode
 	  || mode == V4SFmode || mode == V2DFmode
+	  || mode == V4HFmode || mode == V8HFmode
 	  || mode == V1DFmode))
     return true;
 
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 44fe4f9..1d09930 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -923,7 +923,8 @@ extern enum aarch64_code_model aarch64_cmodel;
 /* Modes valid for AdvSIMD Q registers.  */
 #define AARCH64_VALID_SIMD_QREG_MODE(MODE) \
   ((MODE) == V4SImode || (MODE) == V8HImode || (MODE) == V16QImode \
-   || (MODE) == V4SFmode || (MODE) == V2DImode || mode == V2DFmode)
+   || (MODE) == V4SFmode || (MODE) == V8HFmode || (MODE) == V2DImode \
+   || (MODE) == V2DFmode)
 
 #define ENDIAN_LANE_N(mode, n)  \
   (BYTES_BIG_ENDIAN ? GET_MODE_NUNITS (mode) - 1 - n : n)
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 114994e..7425485 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -40,6 +40,7 @@ typedef __Int8x8_t int8x8_t;
 typedef __Int16x4_t int16x4_t;
 typedef __Int32x2_t int32x2_t;
 typedef __Int64x1_t int64x1_t;
+typedef __Float16x4_t float16x4_t;
 typedef __Float32x2_t float32x2_t;
 typedef __Poly8x8_t poly8x8_t;
 typedef __Poly16x4_t poly16x4_t;
@@ -52,6 +53,7 @@ typedef __Int8x16_t int8x16_t;
 typedef __Int16x8_t int16x8_t;
 typedef __Int32x4_t int32x4_t;
 typedef __Int64x2_t int64x2_t;
+typedef __Float16x8_t float16x8_t;
 typedef __Float32x4_t float32x4_t;
 typedef __Float64x2_t float64x2_t;
 typedef __Poly8x16_t poly8x16_t;
@@ -67,6 +69,7 @@ typedef __Poly16_t poly16_t;
 typedef __Poly64_t poly64_t;
 typedef __Poly128_t poly128_t;
 
+typedef __fp16 float16_t;
 typedef float float32_t;
 typedef double float64_t;
 
@@ -2691,6 +2694,12 @@ vcreate_p16 (uint64_t __a)
 
 /* vget_lane  */
 
+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
+vget_lane_f16 (float16x4_t __a, const int __b)
+{
+  return __aarch64_vget_lane_any (__a, __b);
+}
+
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vget_lane_f32 (float32x2_t __a, const int __b)
 {
@@ -2765,6 +2774,12 @@ vget_lane_u64 (uint64x1_t __a, const int __b)
 
 /* vgetq_lane  */
 
+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
+vgetq_lane_f16 (float16x8_t __a, const int __b)
+{
+  return __aarch64_vget_lane_any (__a, __b);
+}
+
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vgetq_lane_f32 (float32x4_t __a, const int __b)
 {
@@ -4425,6 +4440,12 @@ vreinterpretq_u32_p16 (poly16x8_t __a)
 
 /* vset_lane  */
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vset_lane_f16 (float16_t __elem, float16x4_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vset_lane_f32 (float32_t __elem, float32x2_t __vec, const int __index)
 {
@@ -4499,6 +4520,12 @@ vset_lane_u64 (uint64_t __elem, uint64x1_t __vec, const int __index)
 
 /* vsetq_lane  */
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vsetq_lane_f16 (float16_t __elem, float16x8_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vsetq_lane_f32 (float32_t __elem, float32x4_t __vec, const int __index)
 {
@@ -14612,6 +14639,12 @@ vfmsq_laneq_f64 (float64x2_t __a, float64x2_t __b,
 
 /* vld1 */
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vld1_f16 (const float16_t *__a)
+{
+  return __builtin_aarch64_ld1v4hf (__a);
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vld1_f32 (const float32_t *a)
 {
@@ -14691,6 +14724,12 @@ vld1_u64 (const uint64_t *a)
 
 /* vld1q */
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vld1q_f16 (const float16_t *__a)
+{
+  return __builtin_aarch64_ld1v8hf (__a);
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vld1q_f32 (const float32_t *a)
 {
@@ -14919,6 +14958,12 @@ vld1q_dup_u64 (const uint64_t* __a)
 
 /* vld1_lane  */
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vld1_lane_f16 (const float16_t *__src, float16x4_t __vec, const int __lane)
+{
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vld1_lane_f32 (const float32_t *__src, float32x2_t __vec, const int __lane)
 {
@@ -14993,6 +15038,12 @@ vld1_lane_u64 (const uint64_t *__src, uint64x1_t __vec, const int __lane)
 
 /* vld1q_lane  */
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vld1q_lane_f16 (const float16_t *__src, float16x8_t __vec, const int __lane)
+{
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vld1q_lane_f32 (const float32_t *__src, float32x4_t __vec, const int __lane)
 {
@@ -21960,6 +22011,12 @@ vsrid_n_u64 (uint64_t __a, uint64_t __b, const int __c)
 /* vst1 */
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_f16 (float16_t *__a, float16x4_t __b)
+{
+  __builtin_aarch64_st1v4hf (__a, __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst1_f32 (float32_t *a, float32x2_t b)
 {
   __builtin_aarch64_st1v2sf ((__builtin_aarch64_simd_sf *) a, b);
@@ -22039,6 +22096,12 @@ vst1_u64 (uint64_t *a, uint64x1_t b)
 /* vst1q */
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_f16 (float16_t *__a, float16x8_t __b)
+{
+  __builtin_aarch64_st1v8hf (__a, __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst1q_f32 (float32_t *a, float32x4_t b)
 {
   __builtin_aarch64_st1v4sf ((__builtin_aarch64_simd_sf *) a, b);
@@ -22119,6 +22182,12 @@ vst1q_u64 (uint64_t *a, uint64x2_t b)
 /* vst1_lane */
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_lane_f16 (float16_t *__a, float16x4_t __b, const int __lane)
+{
+  *__a = __aarch64_vget_lane_any (__b, __lane);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst1_lane_f32 (float32_t *__a, float32x2_t __b, const int __lane)
 {
   *__a = __aarch64_vget_lane_any (__b, __lane);
@@ -22193,6 +22262,12 @@ vst1_lane_u64 (uint64_t *__a, uint64x1_t __b, const int __lane)
 /* vst1q_lane */
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_lane_f16 (float16_t *__a, float16x8_t __b, const int __lane)
+{
+  *__a = __aarch64_vget_lane_any (__b, __lane);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst1q_lane_f32 (float32_t *__a, float32x4_t __b, const int __lane)
 {
   *__a = __aarch64_vget_lane_any (__b, __lane);
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index a6b351b..a7aaa52 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -52,7 +52,7 @@
 (define_mode_iterator VSDQ_I_DI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI DI])
 
 ;; Double vector modes.
-(define_mode_iterator VD [V8QI V4HI V2SI V2SF])
+(define_mode_iterator VD [V8QI V4HI V4HF V2SI V2SF])
 
 ;; vector, 64-bit container, all integer modes
 (define_mode_iterator VD_BHSI [V8QI V4HI V2SI])
@@ -61,10 +61,10 @@
 (define_mode_iterator VDQ_BHSI [V8QI V16QI V4HI V8HI V2SI V4SI])
 
 ;; Quad vector modes.
-(define_mode_iterator VQ [V16QI V8HI V4SI V2DI V4SF V2DF])
+(define_mode_iterator VQ [V16QI V8HI V4SI V2DI V8HF V4SF V2DF])
 
 ;; VQ without 2 element modes.
-(define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V4SF])
+(define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V8HF V4SF])
 
 ;; Quad vector with only 2 element modes.
 (define_mode_iterator VQ_2E [V2DI V2DF])
@@ -97,12 +97,20 @@
 ;; Vector Float modes with 2 elements.
 (define_mode_iterator V2F [V2SF V2DF])
 
-;; All modes.
+;; All vector modes on which we support any arithmetic operations.
 (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
 
-;; All vector modes and DI.
+;; All vector modes, including HF modes on which we cannot operate
+(define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
+				V4HF V8HF V2SF V4SF V2DF])
+
+;; All vector modes barring F16, plus DI.
 (define_mode_iterator VALLDI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF DI])
 
+;; All vector modes and DI.
+(define_mode_iterator VALLDI_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
+				  V4HF V8HF V2SF V4SF V2DF DI])
+
 ;; All vector modes and DI and DF.
 (define_mode_iterator VALLDIF [V8QI V16QI V4HI V8HI V2SI V4SI
 			       V2DI V2SF V4SF V2DF DI DF])
@@ -364,7 +372,8 @@
 (define_mode_attr Vmtype [(V8QI ".8b") (V16QI ".16b")
 			 (V4HI ".4h") (V8HI  ".8h")
 			 (V2SI ".2s") (V4SI  ".4s")
-			 (V2DI ".2d") (V2SF ".2s")
+			 (V2DI ".2d") (V4HF ".4h")
+			 (V8HF ".8h") (V2SF ".2s")
 			 (V4SF ".4s") (V2DF ".2d")
 			 (DI   "")    (SI   "")
 			 (HI   "")    (QI   "")
@@ -401,6 +410,7 @@
 			(V4HI "HI") (V8HI "HI")
                         (V2SI "SI") (V4SI "SI")
                         (DI "DI")   (V2DI "DI")
+                        (V4HF "HF") (V8HF "HF")
                         (V2SF "SF") (V4SF "SF")
                         (V2DF "DF") (DF "DF")
 			(SI   "SI") (HI   "HI")
@@ -419,6 +429,7 @@
 			 (V4HI "V8HI") (V8HI "V8HI")
 			 (V2SI "V4SI") (V4SI "V4SI")
 			 (DI   "V2DI") (V2DI "V2DI")
+			 (V4HF "V8HF") (V8HF "V8HF")
 			 (V2SF "V2SF") (V4SF "V4SF")
 			 (V2DF "V2DF") (SI   "V4SI")
 			 (HI   "V8HI") (QI   "V16QI")])
@@ -428,10 +439,12 @@
 			 (V4HI "V2HI")  (V8HI  "V4HI")
 			 (V2SI "SI")    (V4SI  "V2SI")
 			 (V2DI "DI")    (V2SF  "SF")
-			 (V4SF "V2SF")  (V2DF  "DF")])
+			 (V4SF "V2SF")  (V4HF "V2HF")
+			 (V8HF "V4HF")  (V2DF  "DF")])
 
 ;; Double modes of vector modes.
 (define_mode_attr VDBL [(V8QI "V16QI") (V4HI "V8HI")
+			(V4HF "V8HF")
 			(V2SI "V4SI")  (V2SF "V4SF")
 			(SI   "V2SI")  (DI   "V2DI")
 			(DF   "V2DF")])
@@ -542,6 +555,7 @@
 (define_mode_attr nregs [(OI "2") (CI "3") (XI "4")])
 
 (define_mode_attr VRL2 [(V8QI "V32QI") (V4HI "V16HI")
+			(V4HF "V16HF")
 			(V2SI "V8SI")  (V2SF "V8SF")
 			(DI   "V4DI")  (DF   "V4DF")
 			(V16QI "V32QI") (V8HI "V16HI")
@@ -549,16 +563,20 @@
 			(V2DI "V4DI")  (V2DF "V4DF")])
 
 (define_mode_attr VRL3 [(V8QI "V48QI") (V4HI "V24HI")
+			(V4HF "V24HF")
 			(V2SI "V12SI")  (V2SF "V12SF")
 			(DI   "V6DI")  (DF   "V6DF")
 			(V16QI "V48QI") (V8HI "V24HI")
+			(V8HF "V48HF")
 			(V4SI "V12SI")  (V4SF "V12SF")
 			(V2DI "V6DI")  (V2DF "V6DF")])
 
 (define_mode_attr VRL4 [(V8QI "V64QI") (V4HI "V32HI")
+			(V4HF "V32HF")
 			(V2SI "V16SI")  (V2SF "V16SF")
 			(DI   "V8DI")  (DF   "V8DF")
 			(V16QI "V64QI") (V8HI "V32HI")
+			(V8HF "V32HF")
 			(V4SI "V16SI")  (V4SF "V16SF")
 			(V2DI "V8DI")  (V2DF "V8DF")])
 
@@ -571,6 +589,7 @@
                               (V2SI "V2SI") (V4SI "V2SI")
                               (DI "V2DI")   (V2DI "V2DI")
                               (V2SF "V2SF") (V4SF "V2SF")
+                              (V4HF "SF") (V8HF "SF")
                               (DF "V2DI")   (V2DF "V2DI")])
 
 ;; Similar, for three elements.
@@ -579,6 +598,7 @@
                                 (V2SI "BLK") (V4SI "BLK")
                                 (DI "EI")    (V2DI "EI")
                                 (V2SF "BLK") (V4SF "BLK")
+                                (V4HF "BLK") (V8HF "BLK")
                                 (DF "EI")    (V2DF "EI")])
 
 ;; Similar, for four elements.
@@ -587,6 +607,7 @@
                                (V2SI "V4SI") (V4SI "V4SI")
                                (DI "OI")     (V2DI "OI")
                                (V2SF "V4SF") (V4SF "V4SF")
+                               (V4HF "V4HF") (V8HF "V4HF")
                                (DF "OI")     (V2DF "OI")])
 
 
@@ -645,6 +666,7 @@
 		     (V4HI "") (V8HI  "_q")
 		     (V2SI "") (V4SI  "_q")
 		     (DI   "") (V2DI  "_q")
+		     (V4HF "") (V8HF "_q")
 		     (V2SF "") (V4SF  "_q")
 			       (V2DF  "_q")
 		     (QI "") (HI "") (SI "") (DI "") (SF "") (DF "")])
diff --git a/gcc/testsuite/g++.dg/abi/mangle-neon-aarch64.C b/gcc/testsuite/g++.dg/abi/mangle-neon-aarch64.C
index 09a20dc..5740c02 100644
--- a/gcc/testsuite/g++.dg/abi/mangle-neon-aarch64.C
+++ b/gcc/testsuite/g++.dg/abi/mangle-neon-aarch64.C
@@ -13,6 +13,7 @@ void f3 (uint8x8_t a) {}
 void f4 (uint16x4_t a) {}
 void f5 (uint32x2_t a) {}
 void f23 (uint64x1_t a) {}
+void f61 (float16x4_t a) {}
 void f6 (float32x2_t a) {}
 void f7 (poly8x8_t a) {}
 void f8 (poly16x4_t a) {}
@@ -25,6 +26,7 @@ void f13 (uint8x16_t a) {}
 void f14 (uint16x8_t a) {}
 void f15 (uint32x4_t a) {}
 void f16 (uint64x2_t a) {}
+void f171 (float16x8_t a) {}
 void f17 (float32x4_t a) {}
 void f18 (float64x2_t a) {}
 void f19 (poly8x16_t a) {}
@@ -42,6 +44,7 @@ void g1 (int8x16_t, int8x16_t) {}
 // { dg-final { scan-assembler "_Z2f412__Uint16x4_t:" } }
 // { dg-final { scan-assembler "_Z2f512__Uint32x2_t:" } }
 // { dg-final { scan-assembler "_Z3f2312__Uint64x1_t:" } }
+// { dg-final { scan-assembler "_Z3f6113__Float16x4_t:" } }
 // { dg-final { scan-assembler "_Z2f613__Float32x2_t:" } }
 // { dg-final { scan-assembler "_Z2f711__Poly8x8_t:" } }
 // { dg-final { scan-assembler "_Z2f812__Poly16x4_t:" } }
@@ -53,6 +56,7 @@ void g1 (int8x16_t, int8x16_t) {}
 // { dg-final { scan-assembler "_Z3f1412__Uint16x8_t:" } }
 // { dg-final { scan-assembler "_Z3f1512__Uint32x4_t:" } }
 // { dg-final { scan-assembler "_Z3f1612__Uint64x2_t:" } }
+// { dg-final { scan-assembler "_Z4f17113__Float16x8_t:" } }
 // { dg-final { scan-assembler "_Z3f1713__Float32x4_t:" } }
 // { dg-final { scan-assembler "_Z3f1813__Float64x2_t:" } }
 // { dg-final { scan-assembler "_Z3f1912__Poly8x16_t:" } }
diff --git a/gcc/testsuite/gcc.target/aarch64/vld1-vst1_1.c b/gcc/testsuite/gcc.target/aarch64/vld1-vst1_1.c
index 290444e..fa9ef0f 100644
--- a/gcc/testsuite/gcc.target/aarch64/vld1-vst1_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/vld1-vst1_1.c
@@ -31,6 +31,7 @@ THING (int8x8_t, 8, int8_t, _s8)	\
 THING (uint8x8_t, 8, uint8_t, _u8)	\
 THING (int16x4_t, 4, int16_t, _s16)	\
 THING (uint16x4_t, 4, uint16_t, _u16)	\
+THING (float16x4_t, 4, float16_t, _f16)	\
 THING (int32x2_t, 2, int32_t, _s32)	\
 THING (uint32x2_t, 2, uint32_t, _u32)	\
 THING (float32x2_t, 2, float32_t, _f32) \
@@ -38,8 +39,10 @@ THING (int8x16_t, 16, int8_t, q_s8)	\
 THING (uint8x16_t, 16, uint8_t, q_u8)	\
 THING (int16x8_t, 8, int16_t, q_s16)	\
 THING (uint16x8_t, 8, uint16_t, q_u16)	\
+THING (float16x8_t, 8, float16_t, q_f16)\
 THING (int32x4_t, 4, int32_t, q_s32)	\
 THING (uint32x4_t, 4, uint32_t, q_u32)	\
+THING (float32x4_t, 4, float32_t, q_f32)\
 THING (int64x2_t, 2, int64_t, q_s64)	\
 THING (uint64x2_t, 2, uint64_t, q_u64)	\
 THING (float64x2_t, 2, float64_t, q_f64)
diff --git a/gcc/testsuite/gcc.target/aarch64/vld1_lane.c b/gcc/testsuite/gcc.target/aarch64/vld1_lane.c
index c2445f8..c70df71 100644
--- a/gcc/testsuite/gcc.target/aarch64/vld1_lane.c
+++ b/gcc/testsuite/gcc.target/aarch64/vld1_lane.c
@@ -16,6 +16,7 @@ VARIANT (int32, , 2, _s32, 0)	\
 VARIANT (int64, , 1, _s64, 0)	\
 VARIANT (poly8, , 8, _p8, 7)	\
 VARIANT (poly16, , 4, _p16, 2)	\
+VARIANT (float16, , 4, _f16, 3)	\
 VARIANT (float32, , 2, _f32, 1)	\
 VARIANT (float64, , 1, _f64, 0)	\
 VARIANT (uint8, q, 16, _u8, 13)	\
@@ -28,6 +29,7 @@ VARIANT (int32, q, 4, _s32, 1)	\
 VARIANT (int64, q, 2, _s64, 1)	\
 VARIANT (poly8, q, 16, _p8, 7)	\
 VARIANT (poly16, q, 8, _p16, 4)	\
+VARIANT (float16, q, 8, _f16, 3)\
 VARIANT (float32, q, 4, _f32, 2)\
 VARIANT (float64, q, 2, _f64, 1)
 
@@ -56,7 +58,7 @@ VARIANTS (TESTMETH)
 
 #define CHECK(BASE, Q, ELTS, SUFFIX, LANE)			\
   if (test_vld1##Q##_lane##SUFFIX ((const BASE##_t *)orig_data,	\
-				   BASE##_data) != 0)	\
+				   & BASE##_data) != 0)	\
     abort ();
 
 int
@@ -65,20 +67,20 @@ main (int argc, char **argv)
   /* Original data for all vector formats.  */
   uint64_t orig_data[2] = {0x1234567890abcdefULL, 0x13579bdf02468aceULL};
 
-  /* Data with which vldN_lane will overwrite some of previous.  */
-  uint8_t uint8_data[4] = { 7, 11, 13, 17 };
-  uint16_t uint16_data[4] = { 257, 263, 269, 271 };
-  uint32_t uint32_data[4] = { 65537, 65539, 65543, 65551 };
-  uint64_t uint64_data[4] = { 0xdeadbeefcafebabeULL, 0x0123456789abcdefULL,
-			      0xfedcba9876543210LL, 0xdeadbabecafebeefLL };
-  int8_t int8_data[4] = { -1, 3, -5, 7 };
-  int16_t int16_data[4] = { 257, -259, 261, -263 };
-  int32_t int32_data[4] = { 123456789, -987654321, -135792468, 975318642 };
-  int64_t *int64_data = (int64_t *)uint64_data;
-  poly8_t poly8_data[4] = { 0, 7, 13, 18, };
-  poly16_t poly16_data[4] = { 11111, 2222, 333, 44 };
-  float32_t float32_data[4] = { 3.14159, 2.718, 1.414, 100.0 };
-  float64_t float64_data[4] = { 1.010010001, 12345.6789, -9876.54321, 1.618 };
+  /* Data with which vld1_lane will overwrite one element of previous.  */
+  uint8_t uint8_data = 7;
+  uint16_t uint16_data = 257;
+  uint32_t uint32_data = 65537;
+  uint64_t uint64_data = 0xdeadbeefcafebabeULL;
+  int8_t int8_data = -1;
+  int16_t int16_data = -259;
+  int32_t int32_data = -987654321;
+  int64_t int64_data = 0x1234567890abcdefLL;
+  poly8_t poly8_data = 13;
+  poly16_t poly16_data = 11111;
+  float16_t float16_data = 8.75;
+  float32_t float32_data = 3.14159;
+  float64_t float64_data = 1.010010001;
 
   VARIANTS (CHECK);
   return 0;
diff --git a/gcc/testsuite/gcc.target/aarch64/vset_lane_1.c b/gcc/testsuite/gcc.target/aarch64/vset_lane_1.c
index 5fb1139..bc0132c 100644
--- a/gcc/testsuite/gcc.target/aarch64/vset_lane_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/vset_lane_1.c
@@ -16,6 +16,7 @@ VARIANT (int32_t, , 2, int32x2_t, _s32, 0)	\
 VARIANT (int64_t, , 1, int64x1_t, _s64, 0)	\
 VARIANT (poly8_t, , 8, poly8x8_t, _p8, 6)	\
 VARIANT (poly16_t, , 4, poly16x4_t, _p16, 2)	\
+VARIANT (float16_t, , 4, float16x4_t, _f16, 3)	\
 VARIANT (float32_t, , 2, float32x2_t, _f32, 1)	\
 VARIANT (float64_t, , 1, float64x1_t, _f64, 0)	\
 VARIANT (uint8_t, q, 16, uint8x16_t, _u8, 11)	\
@@ -28,6 +29,7 @@ VARIANT (int32_t, q, 4, int32x4_t, _s32, 3)	\
 VARIANT (int64_t, q, 2, int64x2_t, _s64, 0)	\
 VARIANT (poly8_t, q, 16, poly8x16_t, _p8, 14)	\
 VARIANT (poly16_t, q, 8, poly16x8_t, _p16, 6)	\
+VARIANT (float16_t, q, 8, float16x8_t, _f16, 6)	\
 VARIANT (float32_t, q, 4, float32x4_t, _f32, 2) \
 VARIANT (float64_t, q, 2, float64x2_t, _f64, 1)
 
@@ -76,6 +78,9 @@ main (int argc, char **argv)
   poly8_t poly8_t_data[16] =
       { 0, 7, 13, 18, 22, 25, 27, 28, 29, 31, 34, 38, 43, 49, 56, 64 };
   poly16_t poly16_t_data[8] = { 11111, 2222, 333, 44, 5, 65432, 54321, 43210 };
+  float16_t float16_t_data[8] = { 1.25, 4.5, 7.875, 2.3125, 5.675, 8.875,
+      3.6875, 6.75};
+
   float32_t float32_t_data[4] = { 3.14159, 2.718, 1.414, 100.0 };
   float64_t float64_t_data[2] = { 1.01001000100001, 12345.6789 };
 

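A minimal usage sketch of the new store intrinsics, assuming an AArch64
toolchain with this series applied (sketch only, not part of the patch):

  #include <arm_neon.h>

  /* Store all four f16 lanes, then lane 2 alone.
     Assumes vst1_f16 and vst1_lane_f16 as declared above.  */
  void
  store_halves (float16_t *out, float16x4_t v)
  {
    vst1_f16 (out, v);               /* out[0..3] = lanes 0..3 of v  */
    vst1_lane_f16 (out + 4, v, 2);   /* out[4] = lane 2 of v  */
  }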
^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 15/16][fold-const.c] Fix bigendian HFmode in native_interpret_real
  2015-07-07 12:32 [PATCH 0/16][ARM/AArch64] Float16_t support, v2 Alan Lawrence
                   ` (11 preceding siblings ...)
  2015-07-07 12:37 ` [PATCH 12/16][AArch64] vreinterpret(q?), vget_(low|high), vld1(q?)_dup Alan Lawrence
@ 2015-07-07 12:37 ` Alan Lawrence
  2015-07-07 22:06   ` Jeff Law
  2015-07-07 12:37 ` [PATCH 13/16][AArch64] Add vcvt(_high)?_f32_f16 intrinsics, with BE RTL fix Alan Lawrence
                   ` (2 subsequent siblings)
  15 siblings, 1 reply; 35+ messages in thread
From: Alan Lawrence @ 2015-07-07 12:37 UTC (permalink / raw)
  To: gcc-patches; +Cc: Jakub Jelinek

[-- Attachment #1: Type: text/plain, Size: 155 bytes --]

As per https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01346.html. Fixes the FAIL of 
advsimd-intrinsics vcreate.c on aarch64_be-none-elf introduced by the previous patch.
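
To see why, consider a 2-byte HFmode value on a big-endian target with
UNITS_PER_WORD >= 4: the old expression 3 - byte indexes bytes 3 and 2, past
the end of the value, while MIN (3, total_bytes - 1) - byte yields 1 and 0.
A standalone sketch of the two computations:

  #include <stdio.h>

  /* Sketch of the offset computation, assuming BYTES_BIG_ENDIAN and
     UNITS_PER_WORD >= 4 as in the case the patch fixes.  */
  int
  main (void)
  {
    int total_bytes = 2;   /* HFmode: a 16-bit float.  */
    for (int byte = 0; byte < total_bytes; byte++)
      {
        int old_offset = 3 - byte;                            /* 3, 2 */
        int min = total_bytes - 1 < 3 ? total_bytes - 1 : 3;
        int new_offset = min - byte;                          /* 1, 0 */
        printf ("byte %d: old %d, new %d\n", byte, old_offset, new_offset);
      }
    return 0;
  }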

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 15_native_interpret_real.patch --]
[-- Type: text/x-patch; name=15_native_interpret_real.patch, Size: 787 bytes --]

commit e2e7ca148960a82fc88128820f17e7cbd14173cb
Author: Alan Lawrence <alan.lawrence@arm.com>
Date:   Thu Apr 9 10:54:40 2015 +0100

    Fix native_interpret_real for HFmode floats on Bigendian with UNITS_PER_WORD>=4
    
    (with missing space)

diff --git a/gcc/fold-const.c b/gcc/fold-const.c
index e61d946..15a10f0 100644
--- a/gcc/fold-const.c
+++ b/gcc/fold-const.c
@@ -7622,7 +7622,7 @@ native_interpret_real (tree type, const unsigned char *ptr, int len)
 	    offset += byte % UNITS_PER_WORD;
 	}
       else
-	offset = BYTES_BIG_ENDIAN ? 3 - byte : byte;
+	offset = BYTES_BIG_ENDIAN ? MIN (3, total_bytes - 1) - byte : byte;
       value = ptr[offset + ((bitpos / BITS_PER_UNIT) & ~3)];
 
       tmp[bitpos / 32] |= (unsigned long)value << (bitpos & 31);

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 12/16][AArch64] vreinterpret(q?), vget_(low|high), vld1(q?)_dup
  2015-07-07 12:32 [PATCH 0/16][ARM/AArch64] Float16_t support, v2 Alan Lawrence
                   ` (10 preceding siblings ...)
  2015-07-07 12:36 ` [PATCH 11/16][AArch64] Implement vcvt_{,high_}f16_f32 Alan Lawrence
@ 2015-07-07 12:37 ` Alan Lawrence
  2015-07-07 12:37 ` [PATCH 15/16][fold-const.c] Fix bigendian HFmode in native_interpret_real Alan Lawrence
                   ` (3 subsequent siblings)
  15 siblings, 0 replies; 35+ messages in thread
From: Alan Lawrence @ 2015-07-07 12:37 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1655 bytes --]

This is the remainder of 
https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01343.html combined with 
https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01344.html, putting together all 
the intrinsics that didn't require anything outside arm_neon.h. It also updates 
the existing tests in aarch64/; a brief usage sketch follows the ChangeLog 
entries below.

gcc/ChangeLog:

	* config/aarch64/arm_neon.h (vreinterpret_p8_f16, vreinterpret_p16_f16,
	vreinterpret_f16_f64, vreinterpret_f16_s8, vreinterpret_f16_s16,
	vreinterpret_f16_s32, vreinterpret_f16_s64, vreinterpret_f16_f32,
	vreinterpret_f16_u8, vreinterpret_f16_u16, vreinterpret_f16_u32,
	vreinterpret_f16_u64, vreinterpret_f16_p8, vreinterpret_f16_p16,
	vreinterpretq_f16_f64, vreinterpretq_f16_s8, vreinterpretq_f16_s16,
	vreinterpretq_f16_s32, vreinterpretq_f16_s64, vreinterpretq_f16_f32,
	vreinterpretq_f16_u8, vreinterpretq_f16_u16, vreinterpretq_f16_u32,
	vreinterpretq_f16_u64, vreinterpretq_f16_p8, vreinterpretq_f16_p16,
	vreinterpret_f32_f16, vreinterpret_f64_f16, vreinterpret_s64_f16,
	vreinterpret_u64_f16, vreinterpretq_u64_f16, vreinterpret_s8_f16,
	vreinterpret_s16_f16, vreinterpret_s32_f16, vreinterpret_u8_f16,
	vreinterpret_u16_f16, vreinterpret_u32_f16, vreinterpretq_p8_f16,
	vreinterpretq_p16_f16, vreinterpretq_f32_f16, vreinterpretq_f64_f16,
	vreinterpretq_s64_f16, vreinterpretq_s8_f16, vreinterpretq_s16_f16,
	vreinterpretq_s32_f16, vreinterpretq_u8_f16, vreinterpretq_u16_f16,
	vreinterpretq_u32_f16, vget_low_f16, vget_high_f16, vld1_dup_f16,
	vld1q_dup_f16): New.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/vget_high_1.c: Add float16x8->float16x4 case.
	* gcc.target/aarch64/vget_low_1.c: Likewise.
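
A brief usage sketch of some of the new intrinsics, assuming an AArch64
compiler with this patch applied (sketch only):

  #include <arm_neon.h>

  /* Broadcast a half-precision scalar, split the result, and view the
     low half as raw bits.  */
  float16x8_t
  broadcast (const float16_t *p)
  {
    return vld1q_dup_f16 (p);          /* replicate *p into all 8 lanes */
  }

  uint16x4_t
  low_bits (float16x8_t v)
  {
    float16x4_t lo = vget_low_f16 (v); /* lanes 0..3 */
    return vreinterpret_u16_f16 (lo);  /* bit-pattern cast, no conversion */
  }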

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 12_aarch64_reinterpret.patch --]
[-- Type: text/x-patch; name=12_aarch64_reinterpret.patch, Size: 17527 bytes --]

commit beb21a6bce76d4fbedb13fcf25796563b27f6bae
Author: Alan Lawrence <alan.lawrence@arm.com>
Date:   Mon Jun 29 18:46:49 2015 +0100

    [AArch64 5/N v2] vreinterpret, vget_(low|high), vld1(q?)_dup. update tests for vget_low/high

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index b915754..ff1a45c 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -2891,6 +2891,12 @@ vgetq_lane_u64 (uint64x2_t __a, const int __b)
 /* vreinterpret  */
 
 __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vreinterpret_p8_f16 (float16x4_t __a)
+{
+  return (poly8x8_t) __a;
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
 vreinterpret_p8_f64 (float64x1_t __a)
 {
   return (poly8x8_t) __a;
@@ -2987,6 +2993,12 @@ vreinterpretq_p8_s64 (int64x2_t __a)
 }
 
 __extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_p8_f16 (float16x8_t __a)
+{
+  return (poly8x16_t) __a;
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
 vreinterpretq_p8_f32 (float32x4_t __a)
 {
   return (poly8x16_t) __a;
@@ -3023,6 +3035,12 @@ vreinterpretq_p8_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vreinterpret_p16_f16 (float16x4_t __a)
+{
+  return (poly16x4_t) __a;
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
 vreinterpret_p16_f64 (float64x1_t __a)
 {
   return (poly16x4_t) __a;
@@ -3119,6 +3137,12 @@ vreinterpretq_p16_s64 (int64x2_t __a)
 }
 
 __extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_p16_f16 (float16x8_t __a)
+{
+  return (poly16x8_t) __a;
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
 vreinterpretq_p16_f32 (float32x4_t __a)
 {
   return (poly16x8_t) __a;
@@ -3154,6 +3178,156 @@ vreinterpretq_p16_p8 (poly8x16_t __a)
   return (poly16x8_t) __a;
 }
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_f64 (float64x1_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_s8 (int8x8_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_s16 (int16x4_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_s32 (int32x2_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_s64 (int64x1_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_f32 (float32x2_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_u8 (uint8x8_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_u16 (uint16x4_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_u32 (uint32x2_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_u64 (uint64x1_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_p8 (poly8x8_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_p16 (poly16x4_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_f64 (float64x2_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_s8 (int8x16_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_s16 (int16x8_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_s32 (int32x4_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_s64 (int64x2_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_f32 (float32x4_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_u8 (uint8x16_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_u16 (uint16x8_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_u32 (uint32x4_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_u64 (uint64x2_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_p8 (poly8x16_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_p16 (poly16x8_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vreinterpret_f32_f16 (float16x4_t __a)
+{
+  return (float32x2_t) __a;
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vreinterpret_f32_f64 (float64x1_t __a)
 {
@@ -3221,6 +3395,12 @@ vreinterpret_f32_p16 (poly16x4_t __a)
 }
 
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_f32_f16 (float16x8_t __a)
+{
+  return (float32x4_t) __a;
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vreinterpretq_f32_f64 (float64x2_t __a)
 {
   return (float32x4_t) __a;
@@ -3287,6 +3467,12 @@ vreinterpretq_f32_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline float64x1_t __attribute__((__always_inline__))
+vreinterpret_f64_f16 (float16x4_t __a)
+{
+  return (float64x1_t) __a;
+}
+
+__extension__ static __inline float64x1_t __attribute__((__always_inline__))
 vreinterpret_f64_f32 (float32x2_t __a)
 {
   return (float64x1_t) __a;
@@ -3353,6 +3539,12 @@ vreinterpret_f64_u64 (uint64x1_t __a)
 }
 
 __extension__ static __inline float64x2_t __attribute__((__always_inline__))
+vreinterpretq_f64_f16 (float16x8_t __a)
+{
+  return (float64x2_t) __a;
+}
+
+__extension__ static __inline float64x2_t __attribute__((__always_inline__))
 vreinterpretq_f64_f32 (float32x4_t __a)
 {
   return (float64x2_t) __a;
@@ -3419,6 +3611,12 @@ vreinterpretq_f64_u64 (uint64x2_t __a)
 }
 
 __extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vreinterpret_s64_f16 (float16x4_t __a)
+{
+  return (int64x1_t) __a;
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
 vreinterpret_s64_f64 (float64x1_t __a)
 {
   return (int64x1_t) __a;
@@ -3509,6 +3707,12 @@ vreinterpretq_s64_s32 (int32x4_t __a)
 }
 
 __extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_s64_f16 (float16x8_t __a)
+{
+  return (int64x2_t) __a;
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
 vreinterpretq_s64_f32 (float32x4_t __a)
 {
   return (int64x2_t) __a;
@@ -3551,6 +3755,12 @@ vreinterpretq_s64_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vreinterpret_u64_f16 (float16x4_t __a)
+{
+  return (uint64x1_t) __a;
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vreinterpret_u64_f64 (float64x1_t __a)
 {
   return (uint64x1_t) __a;
@@ -3647,6 +3857,12 @@ vreinterpretq_u64_s64 (int64x2_t __a)
 }
 
 __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_u64_f16 (float16x8_t __a)
+{
+  return (uint64x2_t) __a;
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
 vreinterpretq_u64_f32 (float32x4_t __a)
 {
   return (uint64x2_t) __a;
@@ -3683,6 +3899,12 @@ vreinterpretq_u64_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vreinterpret_s8_f16 (float16x4_t __a)
+{
+  return (int8x8_t) __a;
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vreinterpret_s8_f64 (float64x1_t __a)
 {
   return (int8x8_t) __a;
@@ -3773,6 +3995,12 @@ vreinterpretq_s8_s64 (int64x2_t __a)
 }
 
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_s8_f16 (float16x8_t __a)
+{
+  return (int8x16_t) __a;
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vreinterpretq_s8_f32 (float32x4_t __a)
 {
   return (int8x16_t) __a;
@@ -3815,6 +4043,12 @@ vreinterpretq_s8_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vreinterpret_s16_f16 (float16x4_t __a)
+{
+  return (int16x4_t) __a;
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
 vreinterpret_s16_f64 (float64x1_t __a)
 {
   return (int16x4_t) __a;
@@ -3905,6 +4139,12 @@ vreinterpretq_s16_s64 (int64x2_t __a)
 }
 
 __extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_s16_f16 (float16x8_t __a)
+{
+  return (int16x8_t) __a;
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
 vreinterpretq_s16_f32 (float32x4_t __a)
 {
   return (int16x8_t) __a;
@@ -3947,6 +4187,12 @@ vreinterpretq_s16_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vreinterpret_s32_f16 (float16x4_t __a)
+{
+  return (int32x2_t) __a;
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
 vreinterpret_s32_f64 (float64x1_t __a)
 {
   return (int32x2_t) __a;
@@ -4037,6 +4283,12 @@ vreinterpretq_s32_s64 (int64x2_t __a)
 }
 
 __extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_s32_f16 (float16x8_t __a)
+{
+  return (int32x4_t) __a;
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
 vreinterpretq_s32_f32 (float32x4_t __a)
 {
   return (int32x4_t) __a;
@@ -4079,6 +4331,12 @@ vreinterpretq_s32_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vreinterpret_u8_f16 (float16x4_t __a)
+{
+  return (uint8x8_t) __a;
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
 vreinterpret_u8_f64 (float64x1_t __a)
 {
   return (uint8x8_t) __a;
@@ -4175,6 +4433,12 @@ vreinterpretq_u8_s64 (int64x2_t __a)
 }
 
 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_u8_f16 (float16x8_t __a)
+{
+  return (uint8x16_t) __a;
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
 vreinterpretq_u8_f32 (float32x4_t __a)
 {
   return (uint8x16_t) __a;
@@ -4211,6 +4475,12 @@ vreinterpretq_u8_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vreinterpret_u16_f16 (float16x4_t __a)
+{
+  return (uint16x4_t) __a;
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
 vreinterpret_u16_f64 (float64x1_t __a)
 {
   return (uint16x4_t) __a;
@@ -4307,6 +4577,12 @@ vreinterpretq_u16_s64 (int64x2_t __a)
 }
 
 __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_u16_f16 (float16x8_t __a)
+{
+  return (uint16x8_t) __a;
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
 vreinterpretq_u16_f32 (float32x4_t __a)
 {
   return (uint16x8_t) __a;
@@ -4343,6 +4619,12 @@ vreinterpretq_u16_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vreinterpret_u32_f16 (float16x4_t __a)
+{
+  return (uint32x2_t) __a;
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
 vreinterpret_u32_f64 (float64x1_t __a)
 {
   return (uint32x2_t) __a;
@@ -4439,6 +4721,12 @@ vreinterpretq_u32_s64 (int64x2_t __a)
 }
 
 __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_u32_f16 (float16x8_t __a)
+{
+  return (uint32x4_t) __a;
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
 vreinterpretq_u32_f32 (float32x4_t __a)
 {
   return (uint32x4_t) __a;
@@ -4639,6 +4927,12 @@ vsetq_lane_u64 (uint64_t __elem, uint64x2_t __vec, const int __index)
   uint64x1_t lo = vcreate_u64 (vgetq_lane_u64 (tmp, 0));  \
   return vreinterpret_##__TYPE##_u64 (lo);
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vget_low_f16 (float16x8_t __a)
+{
+  __GET_LOW (f16);
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vget_low_f32 (float32x4_t __a)
 {
@@ -4718,6 +5012,12 @@ vget_low_u64 (uint64x2_t __a)
   uint64x1_t hi = vcreate_u64 (vgetq_lane_u64 (tmp, 1));	\
   return vreinterpret_##__TYPE##_u64 (hi);
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vget_high_f16 (float16x8_t __a)
+{
+  __GET_HIGH (f16);
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vget_high_f32 (float32x4_t __a)
 {
@@ -14871,6 +15171,13 @@ vld1q_u64 (const uint64_t *a)
 
 /* vld1_dup  */
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vld1_dup_f16 (const float16_t* __a)
+{
+  float16_t __f = *__a;
+  return (float16x4_t) { __f, __f, __f, __f };
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vld1_dup_f32 (const float32_t* __a)
 {
@@ -14945,6 +15252,13 @@ vld1_dup_u64 (const uint64_t* __a)
 
 /* vld1q_dup  */
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vld1q_dup_f16 (const float16_t* __a)
+{
+  float16_t __f = *__a;
+  return (float16x8_t) { __f, __f, __f, __f, __f, __f, __f, __f };
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vld1q_dup_f32 (const float32_t* __a)
 {
diff --git a/gcc/testsuite/gcc.target/aarch64/vget_high_1.c b/gcc/testsuite/gcc.target/aarch64/vget_high_1.c
index 4cb872d..b6b57e0 100644
--- a/gcc/testsuite/gcc.target/aarch64/vget_high_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/vget_high_1.c
@@ -14,6 +14,7 @@ VARIANT (int8_t, 8, int8x8_t, int8x16_t, s8)		\
 VARIANT (int16_t, 4, int16x4_t, int16x8_t, s16)		\
 VARIANT (int32_t, 2, int32x2_t, int32x4_t, s32)		\
 VARIANT (int64_t, 1, int64x1_t, int64x2_t, s64)		\
+VARIANT (float16_t, 4, float16x4_t, float16x8_t, f16)	\
 VARIANT (float32_t, 2, float32x2_t, float32x4_t, f32)	\
 VARIANT (float64_t, 1, float64x1_t, float64x2_t, f64)
 
@@ -51,6 +52,8 @@ main (int argc, char **argv)
   int16_t int16_t_data[8] = { -17, 19, 3, -999, 44048, 505, 9999, 1000};
   int32_t int32_t_data[4] = { 123456789, -987654321, -135792468, 975318642 };
   int64_t int64_t_data[2] = {0xfedcba9876543210LL, 0xdeadbabecafebeefLL };
+  float16_t float16_t_data[8] = { 1.25, 4.5, 7.875, 2.3125, 5.675, 8.875,
+      3.6875, 6.75};
   float32_t float32_t_data[4] = { 3.14159, 2.718, 1.414, 100.0 };
   float64_t float64_t_data[2] = { 1.01001000100001, 12345.6789 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/vget_low_1.c b/gcc/testsuite/gcc.target/aarch64/vget_low_1.c
index f8016ef..2223676 100644
--- a/gcc/testsuite/gcc.target/aarch64/vget_low_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/vget_low_1.c
@@ -14,6 +14,7 @@ VARIANT (int8_t, 8, int8x8_t, int8x16_t, s8)		\
 VARIANT (int16_t, 4, int16x4_t, int16x8_t, s16)		\
 VARIANT (int32_t, 2, int32x2_t, int32x4_t, s32)		\
 VARIANT (int64_t, 1, int64x1_t, int64x2_t, s64)		\
+VARIANT (float16_t, 4, float16x4_t, float16x8_t, f16)	\
 VARIANT (float32_t, 2, float32x2_t, float32x4_t, f32)	\
 VARIANT (float64_t, 1, float64x1_t, float64x2_t, f64)
 
@@ -51,6 +52,8 @@ main (int argc, char **argv)
   int16_t int16_t_data[8] = { -17, 19, 3, -999, 44048, 505, 9999, 1000};
   int32_t int32_t_data[4] = { 123456789, -987654321, -135792468, 975318642 };
   int64_t int64_t_data[2] = {0xfedcba9876543210LL, 0xdeadbabecafebeefLL };
+  float16_t float16_t_data[8] = { 1.25, 4.5, 7.875, 2.3125, 5.675, 8.875,
+      3.6875, 6.75};
   float32_t float32_t_data[4] = { 3.14159, 2.718, 1.414, 100.0 };
   float64_t float64_t_data[2] = { 1.01001000100001, 12345.6789 };
 

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 13/16][AArch64] Add vcvt(_high)?_f32_f16 intrinsics, with BE RTL fix
  2015-07-07 12:32 [PATCH 0/16][ARM/AArch64] Float16_t support, v2 Alan Lawrence
                   ` (12 preceding siblings ...)
  2015-07-07 12:37 ` [PATCH 15/16][fold-const.c] Fix bigendian HFmode in native_interpret_real Alan Lawrence
@ 2015-07-07 12:37 ` Alan Lawrence
  2015-07-07 12:38 ` [PATCH 14/16][ARM/AArch64 testsuite] Update advsimd-intrinsics tests to add float16 vectors Alan Lawrence
  2015-07-07 12:39 ` [PATCH 16/16][ARM/AArch64 Testsuite] Add test of vcvt{,_high}_{f16_f32,f32_f16} Alan Lawrence
  15 siblings, 0 replies; 35+ messages in thread
From: Alan Lawrence @ 2015-07-07 12:37 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 74 bytes --]

Unchanged since https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01345.html
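
A short usage sketch, assuming the intrinsics added below:

  #include <arm_neon.h>

  /* Widen half-precision lanes to single precision.  Sketch only.  */
  float32x4_t
  widen_low (float16x4_t v)
  {
    return vcvt_f32_f16 (v);        /* fcvtl: converts lanes 0..3 */
  }

  float32x4_t
  widen_high (float16x8_t v)
  {
    return vcvt_high_f32_f16 (v);   /* fcvtl2: converts lanes 4..7 */
  }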

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 13_aarch64_vcvt_f32_f16.patch --]
[-- Type: text/x-patch; name=13_aarch64_vcvt_f32_f16.patch, Size: 6983 bytes --]

commit 214fcc00475a543a79ed444f9a64061215397cc8
Author: Alan Lawrence <alan.lawrence@arm.com>
Date:   Wed Jan 28 13:01:31 2015 +0000

    AArch64 6/N: vcvt{,_high}_f32_f16 (using vect_par_cnst_hi_half, fixing bigendian indices)

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 8bcab72..9869b73 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -361,11 +361,11 @@
   BUILTIN_VSDQ_I_DI (UNOP, abs, 0)
   BUILTIN_VDQF (UNOP, abs, 2)
 
-  VAR1 (UNOP, vec_unpacks_hi_, 10, v4sf)
+  VAR2 (UNOP, vec_unpacks_hi_, 10, v4sf, v8hf)
   VAR1 (BINOP, float_truncate_hi_, 0, v4sf)
   VAR1 (BINOP, float_truncate_hi_, 0, v8hf)
 
-  VAR1 (UNOP, float_extend_lo_, 0, v2df)
+  VAR2 (UNOP, float_extend_lo_, 0, v2df, v4sf)
   BUILTIN_VDF (UNOP, float_truncate_lo_, 0)
 
   /* Implemented by aarch64_ld1<VALL_F16:mode>.  */
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 2dc54e1..1a7d858 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1691,36 +1691,57 @@
 
 ;; Float widening operations.
 
-(define_insn "vec_unpacks_lo_v4sf"
-  [(set (match_operand:V2DF 0 "register_operand" "=w")
-	(float_extend:V2DF
-	  (vec_select:V2SF
-	    (match_operand:V4SF 1 "register_operand" "w")
-	    (parallel [(const_int 0) (const_int 1)])
-	  )))]
+(define_insn "aarch64_simd_vec_unpacks_lo_<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (float_extend:<VWIDE> (vec_select:<VHALF>
+			       (match_operand:VQ_HSF 1 "register_operand" "w")
+			       (match_operand:VQ_HSF 2 "vect_par_cnst_lo_half" "")
+			    )))]
   "TARGET_SIMD"
-  "fcvtl\\t%0.2d, %1.2s"
+  "fcvtl\\t%0.<Vwtype>, %1.<Vhalftype>"
   [(set_attr "type" "neon_fp_cvt_widen_s")]
 )
 
-(define_insn "aarch64_float_extend_lo_v2df"
-  [(set (match_operand:V2DF 0 "register_operand" "=w")
-	(float_extend:V2DF
-	  (match_operand:V2SF 1 "register_operand" "w")))]
+(define_expand "vec_unpacks_lo_<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "")
+   (match_operand:VQ_HSF 1 "register_operand" "")]
   "TARGET_SIMD"
-  "fcvtl\\t%0.2d, %1.2s"
+  {
+    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, false);
+    emit_insn (gen_aarch64_simd_vec_unpacks_lo_<mode> (operands[0],
+						       operands[1], p));
+    DONE;
+  }
+)
+
+(define_insn "aarch64_simd_vec_unpacks_hi_<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (float_extend:<VWIDE> (vec_select:<VHALF>
+			       (match_operand:VQ_HSF 1 "register_operand" "w")
+			       (match_operand:VQ_HSF 2 "vect_par_cnst_hi_half" "")
+			    )))]
+  "TARGET_SIMD"
+  "fcvtl2\\t%0.<Vwtype>, %1.<Vtype>"
   [(set_attr "type" "neon_fp_cvt_widen_s")]
 )
 
-(define_insn "vec_unpacks_hi_v4sf"
-  [(set (match_operand:V2DF 0 "register_operand" "=w")
-	(float_extend:V2DF
-	  (vec_select:V2SF
-	    (match_operand:V4SF 1 "register_operand" "w")
-	    (parallel [(const_int 2) (const_int 3)])
-	  )))]
+(define_expand "vec_unpacks_hi_<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "")
+   (match_operand:VQ_HSF 1 "register_operand" "")]
+  "TARGET_SIMD"
+  {
+    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+    emit_insn (gen_aarch64_simd_vec_unpacks_hi_<mode> (operands[0],
+						       operands[1], p));
+    DONE;
+  }
+)
+(define_insn "aarch64_float_extend_lo_<Vwide>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+	(float_extend:<VWIDE>
+	  (match_operand:VDF 1 "register_operand" "w")))]
   "TARGET_SIMD"
-  "fcvtl2\\t%0.2d, %1.4s"
+  "fcvtl\\t%0<Vmwtype>, %1<Vmtype>"
   [(set_attr "type" "neon_fp_cvt_widen_s")]
 )
 
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index ff1a45c..4f0636f 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -6026,10 +6026,6 @@ vaddlvq_u32 (uint32x4_t a)
        result;                                                          \
      })
 
-/* vcvt_f32_f16 not supported */
-
-/* vcvt_high_f32_f16 not supported */
-
 #define vcvt_n_f32_s32(a, b)                                            \
   __extension__                                                         \
     ({                                                                  \
@@ -13420,6 +13416,12 @@ vcvt_high_f32_f64 (float32x2_t __a, float64x2_t __b)
 
 /* vcvt (float -> double).  */
 
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vcvt_f32_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_float_extend_lo_v4sf (__a);
+}
+
 __extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
 vcvt_f64_f32 (float32x2_t __a)
 {
@@ -13427,6 +13429,12 @@ vcvt_f64_f32 (float32x2_t __a)
   return __builtin_aarch64_float_extend_lo_v2df (__a);
 }
 
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vcvt_high_f32_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_vec_unpacks_hi_v8hf (__a);
+}
+
 __extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
 vcvt_high_f64_f32 (float32x4_t __a)
 {
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index f6094b1..32658ab 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -91,6 +91,9 @@
 ;; Vector single Float modes.
 (define_mode_iterator VDQSF [V2SF V4SF])
 
+;; Quad vector Float modes with half/single elements.
+(define_mode_iterator VQ_HSF [V8HF V4SF])
+
 ;; Modes suitable to use as the return type of a vcond expression.
 (define_mode_iterator VDQF_COND [V2SF V2SI V4SF V4SI V2DF V2DI])
 
@@ -492,14 +495,18 @@
 			 (V2SI "V2DI") (V16QI "V8HI") 
 			 (V8HI "V4SI") (V4SI "V2DI")
 			 (HI "SI")     (SI "DI")
+			 (V8HF "V4SF") (V4SF "V2DF")
 			 (V4HF "V4SF") (V2SF "V2DF")]
-
 )
 
-;; Widened mode register suffixes for VD_BHSI/VQW.
+;; Widened modes of vector modes, lowercase
+(define_mode_attr Vwide [(V2SF "v2df") (V4HF "v4sf")])
+
+;; Widened mode register suffixes for VD_BHSI/VQW/VQ_HSF.
 (define_mode_attr Vwtype [(V8QI "8h") (V4HI "4s")
 			  (V2SI "2d") (V16QI "8h") 
-			  (V8HI "4s") (V4SI "2d")])
+			  (V8HI "4s") (V4SI "2d")
+			  (V8HF "4s") (V4SF "2d")])
 
 ;; Widened mode register suffixes for VDW/VQW.
 (define_mode_attr Vmwtype [(V8QI ".8h") (V4HI ".4s")
@@ -508,9 +515,10 @@
 			   (V4HF ".4s") (V2SF ".2d")
 			   (SI   "")    (HI   "")])
 
-;; Lower part register suffixes for VQW.
+;; Lower part register suffixes for VQW/VQ_HSF.
 (define_mode_attr Vhalftype [(V16QI "8b") (V8HI "4h")
-			     (V4SI "2s")])
+			     (V4SI "2s") (V8HF "4h")
+			     (V4SF "2s")])
 
 ;; Define corresponding core/FP element mode for each vector mode.
 (define_mode_attr vw   [(V8QI "w") (V16QI "w")

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 14/16][ARM/AArch64 testsuite] Update advsimd-intrinsics tests to add float16 vectors
  2015-07-07 12:32 [PATCH 0/16][ARM/AArch64] Float16_t support, v2 Alan Lawrence
                   ` (13 preceding siblings ...)
  2015-07-07 12:37 ` [PATCH 13/16][AArch64] Add vcvt(_high)?_f32_f16 intrinsics, with BE RTL fix Alan Lawrence
@ 2015-07-07 12:38 ` Alan Lawrence
  2015-07-07 12:39 ` [PATCH 16/16][ARM/AArch64 Testsuite] Add test of vcvt{,_high}_{f16_f32,f32_f16} Alan Lawrence
  15 siblings, 0 replies; 35+ messages in thread
From: Alan Lawrence @ 2015-07-07 12:38 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2032 bytes --]

This is a respin of https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01347.html, 
removing many default values of 0x333. To complete that removal, I introduced 
new macros CHECK_RESULTS{,_NAMED}_NO_FP16, as writing the same list of vector 
types in four places seemed too many.
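
The hfloat16_t expected values in the updated tests are the IEEE-754 binary16
bit patterns of the test data (e.g. 0xcc00 is -16.0, 0x4080 is 2.25). A
standalone sketch of the encoding, handling only normal values, which is all
the test data needs:

  #include <stdint.h>
  #include <stdio.h>
  #include <math.h>

  /* Encode a float as IEEE-754 binary16 bits (normal values only).  */
  static uint16_t
  to_binary16 (float f)
  {
    int sign = f < 0.0f;
    int exp;
    float frac = frexpf (fabsf (f), &exp);  /* |f| = frac * 2^exp, frac in [0.5, 1) */
    exp -= 1;                               /* rescale to 1.m * 2^exp form */
    uint16_t mant = (uint16_t) ((frac * 2.0f - 1.0f) * 1024.0f + 0.5f);
    return (uint16_t) (sign << 15 | (exp + 15) << 10 | mant);
  }

  int
  main (void)
  {
    printf ("%#06x\n", to_binary16 (-16.0f));  /* 0xcc00 */
    printf ("%#06x\n", to_binary16 (2.25f));   /* 0x4080 */
    return 0;
  }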

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h (hfloat16_t,
	vdup_n_f16, CHECK_RESULTS_NO_FP16, CHECK_RESULTS_NAMED_NO_FP16): New.
	(result, expected, clean_results): Add float16x4 and float16x8 cases.
	(CHECK_RESULTS_NAMED): Likewise, using CHECK_RESULTS_NAMED_NO_FP16.
	(CHECK_RESULTS): Redefine using CHECK_RESULTS_NAMED.

	(DECL_VARIABLE_64BITS_VARIANTS): Add float16x4 case.
	(DECL_VARIABLE_128BITS_VARIANTS): Add float16x8 case.

	* gcc.target/aarch64/advsimd-intrinsics/compute-ref-data.h (buffer,
	buffer_pad, buffer_dup, buffer_dup_pad): Add float16x4 and float16x8.

	* gcc.target/aarch64/advsimd-intrinsics/vbsl.c (exec_vbsl): Change
	CHECK_RESULTS to CHECK_RESULTS_NO_FP16.
	* gcc.target/aarch64/advsimd-intrinsics/vdup_lane.c (exec_vdup_lane):
	Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vext.c (exec_vext): Likewise.

	* gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c (exec_vdup_vmov):
	Change CHECK_RESULTS_NAMED to CHECK_RESULTS_NAMED_NO_FP16.

	* gcc.target/aarch64/advsimd-intrinsics/vcombine.c: Add expected
	results for float16x4 and float16x8.
	(exec_vcombine): Add test of float16x4 -> float16x8 case.
	* gcc.target/aarch64/advsimd-intrinsics/vcreate.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vget_high.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vget_low.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vld1.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vld1_dup.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vld1_lane.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vldX.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vldX_dup.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vldX_lane.c: Likewise.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 14_advsimd_intrinsics.patch --]
[-- Type: text/x-patch; name=14_advsimd_intrinsics.patch, Size: 62781 bytes --]

diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
index 4e728d5..cf9c358 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
@@ -7,6 +7,7 @@
 #include <inttypes.h>
 
 /* helper type, to help write floating point results in integer form.  */
+typedef uint16_t hfloat16_t;
 typedef uint32_t hfloat32_t;
 typedef uint64_t hfloat64_t;
 
@@ -132,6 +133,7 @@ static ARRAY(result, uint, 32, 2);
 static ARRAY(result, uint, 64, 1);
 static ARRAY(result, poly, 8, 8);
 static ARRAY(result, poly, 16, 4);
+static ARRAY(result, float, 16, 4);
 static ARRAY(result, float, 32, 2);
 static ARRAY(result, int, 8, 16);
 static ARRAY(result, int, 16, 8);
@@ -143,6 +145,7 @@ static ARRAY(result, uint, 32, 4);
 static ARRAY(result, uint, 64, 2);
 static ARRAY(result, poly, 8, 16);
 static ARRAY(result, poly, 16, 8);
+static ARRAY(result, float, 16, 8);
 static ARRAY(result, float, 32, 4);
 #ifdef __aarch64__
 static ARRAY(result, float, 64, 2);
@@ -160,6 +163,7 @@ extern ARRAY(expected, uint, 32, 2);
 extern ARRAY(expected, uint, 64, 1);
 extern ARRAY(expected, poly, 8, 8);
 extern ARRAY(expected, poly, 16, 4);
+extern ARRAY(expected, hfloat, 16, 4);
 extern ARRAY(expected, hfloat, 32, 2);
 extern ARRAY(expected, int, 8, 16);
 extern ARRAY(expected, int, 16, 8);
@@ -171,38 +175,11 @@ extern ARRAY(expected, uint, 32, 4);
 extern ARRAY(expected, uint, 64, 2);
 extern ARRAY(expected, poly, 8, 16);
 extern ARRAY(expected, poly, 16, 8);
+extern ARRAY(expected, hfloat, 16, 8);
 extern ARRAY(expected, hfloat, 32, 4);
 extern ARRAY(expected, hfloat, 64, 2);
 
-/* Check results. Operates on all possible vector types.  */
-#define CHECK_RESULTS(test_name,comment)				\
-  {									\
-    CHECK(test_name, int, 8, 8, PRIx8, expected, comment);		\
-    CHECK(test_name, int, 16, 4, PRIx16, expected, comment);		\
-    CHECK(test_name, int, 32, 2, PRIx32, expected, comment);		\
-    CHECK(test_name, int, 64, 1, PRIx64, expected, comment);		\
-    CHECK(test_name, uint, 8, 8, PRIx8, expected, comment);		\
-    CHECK(test_name, uint, 16, 4, PRIx16, expected, comment);		\
-    CHECK(test_name, uint, 32, 2, PRIx32, expected, comment);		\
-    CHECK(test_name, uint, 64, 1, PRIx64, expected, comment);		\
-    CHECK(test_name, poly, 8, 8, PRIx8, expected, comment);		\
-    CHECK(test_name, poly, 16, 4, PRIx16, expected, comment);		\
-    CHECK_FP(test_name, float, 32, 2, PRIx32, expected, comment);	\
-									\
-    CHECK(test_name, int, 8, 16, PRIx8, expected, comment);		\
-    CHECK(test_name, int, 16, 8, PRIx16, expected, comment);		\
-    CHECK(test_name, int, 32, 4, PRIx32, expected, comment);		\
-    CHECK(test_name, int, 64, 2, PRIx64, expected, comment);		\
-    CHECK(test_name, uint, 8, 16, PRIx8, expected, comment);		\
-    CHECK(test_name, uint, 16, 8, PRIx16, expected, comment);		\
-    CHECK(test_name, uint, 32, 4, PRIx32, expected, comment);		\
-    CHECK(test_name, uint, 64, 2, PRIx64, expected, comment);		\
-    CHECK(test_name, poly, 8, 16, PRIx8, expected, comment);		\
-    CHECK(test_name, poly, 16, 8, PRIx16, expected, comment);		\
-    CHECK_FP(test_name, float, 32, 4, PRIx32, expected, comment);	\
-  }									\
-
-#define CHECK_RESULTS_NAMED(test_name,EXPECTED,comment)			\
+#define CHECK_RESULTS_NAMED_NO_FP16(test_name,EXPECTED,comment)		\
   {									\
     CHECK(test_name, int, 8, 8, PRIx8, EXPECTED, comment);		\
     CHECK(test_name, int, 16, 4, PRIx16, EXPECTED, comment);		\
@@ -229,6 +206,19 @@ extern ARRAY(expected, hfloat, 64, 2);
     CHECK_FP(test_name, float, 32, 4, PRIx32, EXPECTED, comment);	\
   }									\
 
+/* Check results against EXPECTED. Operates on all possible vector types.  */
+#define CHECK_RESULTS_NAMED(test_name,EXPECTED,comment)			\
+  {									\
+    CHECK_RESULTS_NAMED_NO_FP16(test_name, EXPECTED, comment)		\
+    CHECK_FP(test_name, float, 16, 4, PRIx16, EXPECTED, comment);	\
+    CHECK_FP(test_name, float, 16, 8, PRIx16, EXPECTED, comment);	\
+  }									\
+
+#define CHECK_RESULTS_NO_FP16(test_name,comment)	\
+  CHECK_RESULTS_NAMED_NO_FP16(test_name, expected, comment)
+
+#define CHECK_RESULTS(test_name,comment)		\
+  CHECK_RESULTS_NAMED(test_name, expected, comment)
 
 
 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
@@ -380,6 +370,7 @@ static void clean_results (void)
   CLEAN(result, uint, 64, 1);
   CLEAN(result, poly, 8, 8);
   CLEAN(result, poly, 16, 4);
+  CLEAN(result, float, 16, 4);
   CLEAN(result, float, 32, 2);
 
   CLEAN(result, int, 8, 16);
@@ -392,6 +383,7 @@ static void clean_results (void)
   CLEAN(result, uint, 64, 2);
   CLEAN(result, poly, 8, 16);
   CLEAN(result, poly, 16, 8);
+  CLEAN(result, float, 16, 8);
   CLEAN(result, float, 32, 4);
 
 #if defined(__aarch64__)
@@ -448,6 +440,7 @@ static void clean_results (void)
   DECL_VARIABLE_64BITS_UNSIGNED_VARIANTS(VAR);	\
   DECL_VARIABLE(VAR, poly, 8, 8);		\
   DECL_VARIABLE(VAR, poly, 16, 4);		\
+  DECL_VARIABLE(VAR, float, 16, 4);		\
   DECL_VARIABLE(VAR, float, 32, 2)
 
 /* Declare all 128 bits variants.  */
@@ -456,6 +449,7 @@ static void clean_results (void)
   DECL_VARIABLE_128BITS_UNSIGNED_VARIANTS(VAR);	\
   DECL_VARIABLE(VAR, poly, 8, 16);		\
   DECL_VARIABLE(VAR, poly, 16, 8);		\
+  DECL_VARIABLE(VAR, float, 16, 8);		\
   DECL_VARIABLE(VAR, float, 32, 4)
 
 /* Declare all variants.  */
@@ -476,6 +470,13 @@ static void clean_results (void)
 /* Helpers to initialize vectors.  */
 #define VDUP(VAR, Q, T1, T2, W, N, V)			\
   VECT_VAR(VAR, T1, W, N) = vdup##Q##_n_##T2##W(V)
+/* Work around that there is no vdup_n_f16 intrinsic.  */
+#define vdup_n_f16(VAL)		\
+  __extension__			\
+    ({				\
+      float16_t f = VAL;	\
+      vld1_dup_f16(&f);		\
+    })
 
 #define VSET_LANE(VAR, Q, T1, T2, W, N, L, V)				\
   VECT_VAR(VAR, T1, W, N) = vset##Q##_lane_##T2##W(V,			\
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/compute-ref-data.h b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/compute-ref-data.h
index 26203cc..6365579 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/compute-ref-data.h
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/compute-ref-data.h
@@ -118,6 +118,8 @@ VECT_VAR_DECL_INIT(buffer, uint, 32, 2);
 PAD(buffer_pad, uint, 32, 2);
 VECT_VAR_DECL_INIT(buffer, uint, 64, 1);
 PAD(buffer_pad, uint, 64, 1);
+VECT_VAR_DECL_INIT(buffer, float, 16, 4);
+PAD(buffer_pad, float, 16, 4);
 VECT_VAR_DECL_INIT(buffer, float, 32, 2);
 PAD(buffer_pad, float, 32, 2);
 VECT_VAR_DECL_INIT(buffer, int, 8, 16);
@@ -140,6 +142,8 @@ VECT_VAR_DECL_INIT(buffer, poly, 8, 16);
 PAD(buffer_pad, poly, 8, 16);
 VECT_VAR_DECL_INIT(buffer, poly, 16, 8);
 PAD(buffer_pad, poly, 16, 8);
+VECT_VAR_DECL_INIT(buffer, float, 16, 8);
+PAD(buffer_pad, float, 16, 8);
 VECT_VAR_DECL_INIT(buffer, float, 32, 4);
 PAD(buffer_pad, float, 32, 4);
 #ifdef __aarch64__
@@ -170,6 +174,8 @@ VECT_VAR_DECL_INIT(buffer_dup, poly, 8, 8);
 VECT_VAR_DECL(buffer_dup_pad, poly, 8, 8);
 VECT_VAR_DECL_INIT(buffer_dup, poly, 16, 4);
 VECT_VAR_DECL(buffer_dup_pad, poly, 16, 4);
+VECT_VAR_DECL_INIT4(buffer_dup, float, 16, 4);
+VECT_VAR_DECL(buffer_dup_pad, float, 16, 4);
 VECT_VAR_DECL_INIT4(buffer_dup, float, 32, 2);
 VECT_VAR_DECL(buffer_dup_pad, float, 32, 2);
 
@@ -193,5 +199,7 @@ VECT_VAR_DECL_INIT(buffer_dup, poly, 8, 16);
 VECT_VAR_DECL(buffer_dup_pad, poly, 8, 16);
 VECT_VAR_DECL_INIT(buffer_dup, poly, 16, 8);
 VECT_VAR_DECL(buffer_dup_pad, poly, 16, 8);
+VECT_VAR_DECL_INIT(buffer_dup, float, 16, 8);
+VECT_VAR_DECL(buffer_dup_pad, float, 16, 8);
 VECT_VAR_DECL_INIT(buffer_dup, float, 32, 4);
 VECT_VAR_DECL(buffer_dup_pad, float, 32, 4);
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbsl.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbsl.c
index bb17f0a..c4fdbb4 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbsl.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbsl.c
@@ -114,7 +114,7 @@ void exec_vbsl (void)
   TEST_VBSL(uint, , float, f, 32, 2);
   TEST_VBSL(uint, q, float, f, 32, 4);
 
-  CHECK_RESULTS (TEST_MSG, "");
+  CHECK_RESULTS_NO_FP16 (TEST_MSG, "");
 }
 
 int main (void)
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcombine.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcombine.c
index 295768a..aee406d 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcombine.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcombine.c
@@ -27,6 +27,8 @@ VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
 					 0x66, 0x66, 0x66, 0x66 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
 					   0x40533333, 0x40533333 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80,
+					   0x4080, 0x4080, 0x4080, 0x4080 };
 
 #define TEST_MSG "VCOMBINE"
 void exec_vcombine (void)
@@ -44,6 +46,7 @@ void exec_vcombine (void)
 
   /* Initialize input "vector64_a" from "buffer".  */
   TEST_MACRO_64BITS_VARIANTS_2_5(VLOAD, vector64_a, buffer);
+  VLOAD(vector64_a, buffer, , float, f, 16, 4);
   VLOAD(vector64_a, buffer, , float, f, 32, 2);
 
   /* Choose init value arbitrarily.  */
@@ -57,6 +60,7 @@ void exec_vcombine (void)
   VDUP(vector64_b, , uint, u, 64, 1, 0x88);
   VDUP(vector64_b, , poly, p, 8, 8, 0x55);
   VDUP(vector64_b, , poly, p, 16, 4, 0x66);
+  VDUP(vector64_b, , float, f, 16, 4, 2.25);
   VDUP(vector64_b, , float, f, 32, 2, 3.3f);
 
   clean_results ();
@@ -72,6 +76,7 @@ void exec_vcombine (void)
   TEST_VCOMBINE(uint, u, 64, 1, 2);
   TEST_VCOMBINE(poly, p, 8, 8, 16);
   TEST_VCOMBINE(poly, p, 16, 4, 8);
+  TEST_VCOMBINE(float, f, 16, 4, 8);
   TEST_VCOMBINE(float, f, 32, 2, 4);
 
   CHECK(TEST_MSG, int, 8, 16, PRIx8, expected, "");
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcreate.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcreate.c
index b2289d3..9aafbc3 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcreate.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcreate.c
@@ -16,6 +16,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0x123456789abcdef0 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf0, 0xde, 0xbc, 0x9a,
 					0x78, 0x56, 0x34, 0x12 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0xdef0, 0x9abc, 0x5678, 0x1234 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0xdef0, 0x9abc, 0x5678, 0x1234 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x9abcdef0, 0x12345678 };
 
 #define INSN_NAME vcreate
@@ -38,6 +39,7 @@ FNNAME (INSN_NAME)
   DECL_VAL(val, int, 16, 4);
   DECL_VAL(val, int, 32, 2);
   DECL_VAL(val, int, 64, 1);
+  DECL_VAL(val, float, 16, 4);
   DECL_VAL(val, float, 32, 2);
   DECL_VAL(val, uint, 8, 8);
   DECL_VAL(val, uint, 16, 4);
@@ -50,6 +52,7 @@ FNNAME (INSN_NAME)
   DECL_VARIABLE(vector_res, int, 16, 4);
   DECL_VARIABLE(vector_res, int, 32, 2);
   DECL_VARIABLE(vector_res, int, 64, 1);
+  DECL_VARIABLE(vector_res, float, 16, 4);
   DECL_VARIABLE(vector_res, float, 32, 2);
   DECL_VARIABLE(vector_res, uint, 8, 8);
   DECL_VARIABLE(vector_res, uint, 16, 4);
@@ -65,6 +68,7 @@ FNNAME (INSN_NAME)
   VECT_VAR(val, int, 16, 4) = 0x123456789abcdef0LL;
   VECT_VAR(val, int, 32, 2) = 0x123456789abcdef0LL;
   VECT_VAR(val, int, 64, 1) = 0x123456789abcdef0LL;
+  VECT_VAR(val, float, 16, 4) = 0x123456789abcdef0LL;
   VECT_VAR(val, float, 32, 2) = 0x123456789abcdef0LL;
   VECT_VAR(val, uint, 8, 8) = 0x123456789abcdef0ULL;
   VECT_VAR(val, uint, 16, 4) = 0x123456789abcdef0ULL;
@@ -76,6 +80,7 @@ FNNAME (INSN_NAME)
   TEST_VCREATE(int, s, 8, 8);
   TEST_VCREATE(int, s, 16, 4);
   TEST_VCREATE(int, s, 32, 2);
+  TEST_VCREATE(float, f, 16, 4);
   TEST_VCREATE(float, f, 32, 2);
   TEST_VCREATE(int, s, 64, 1);
   TEST_VCREATE(uint, u, 8, 8);
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c
index b5132f4..22d45d5 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c
@@ -187,13 +187,13 @@ void exec_vdup_vmov (void)
 
     switch (i) {
     case 0:
-      CHECK_RESULTS_NAMED (TEST_MSG, expected0, "");
+      CHECK_RESULTS_NAMED_NO_FP16 (TEST_MSG, expected0, "");
       break;
     case 1:
-      CHECK_RESULTS_NAMED (TEST_MSG, expected1, "");
+      CHECK_RESULTS_NAMED_NO_FP16 (TEST_MSG, expected1, "");
       break;
     case 2:
-      CHECK_RESULTS_NAMED (TEST_MSG, expected2, "");
+      CHECK_RESULTS_NAMED_NO_FP16 (TEST_MSG, expected2, "");
       break;
     default:
       abort();
@@ -232,13 +232,13 @@ void exec_vdup_vmov (void)
 
     switch (i) {
     case 0:
-      CHECK_RESULTS_NAMED (TEST_MSG, expected0, "");
+      CHECK_RESULTS_NAMED_NO_FP16 (TEST_MSG, expected0, "");
       break;
     case 1:
-      CHECK_RESULTS_NAMED (TEST_MSG, expected1, "");
+      CHECK_RESULTS_NAMED_NO_FP16 (TEST_MSG, expected1, "");
       break;
     case 2:
-      CHECK_RESULTS_NAMED (TEST_MSG, expected2, "");
+      CHECK_RESULTS_NAMED_NO_FP16 (TEST_MSG, expected2, "");
       break;
     default:
       abort();
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup_lane.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup_lane.c
index c1ff6dd..ef708dc 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup_lane.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup_lane.c
@@ -90,7 +90,7 @@ void exec_vdup_lane (void)
   TEST_VDUP_LANE(q, poly, p, 16, 8, 4, 1);
   TEST_VDUP_LANE(q, float, f, 32, 4, 2, 1);
 
-  CHECK_RESULTS (TEST_MSG, "");
+  CHECK_RESULTS_NO_FP16 (TEST_MSG, "");
 }
 
 int main (void)
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vext.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vext.c
index 0b014eb..98f88a6 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vext.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vext.c
@@ -113,7 +113,7 @@ void exec_vext (void)
   TEST_VEXT(q, poly, p, 16, 8, 6);
   TEST_VEXT(q, float, f, 32, 4, 3);
 
-  CHECK_RESULTS (TEST_MSG, "");
+  CHECK_RESULTS_NO_FP16 (TEST_MSG, "");
 }
 
 int main (void)
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_high.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_high.c
index d758112..eeef870 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_high.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_high.c
@@ -16,6 +16,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff1 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf8, 0xf9, 0xfa, 0xfb,
 					0xfc, 0xfd, 0xfe, 0xff };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff4, 0xfff5, 0xfff6, 0xfff7 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0xca00, 0xc980, 0xc900, 0xc880 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 };
 
 #define TEST_MSG "VGET_HIGH"
@@ -31,6 +32,7 @@ void exec_vget_high (void)
   DECL_VARIABLE_128BITS_VARIANTS(vector128);
 
   TEST_MACRO_128BITS_VARIANTS_2_5(VLOAD, vector128, buffer);
+  VLOAD(vector128, buffer, q, float, f, 16, 8);
   VLOAD(vector128, buffer, q, float, f, 32, 4);
 
   clean_results ();
@@ -46,6 +48,7 @@ void exec_vget_high (void)
   TEST_VGET_HIGH(uint, u, 64, 1, 2);
   TEST_VGET_HIGH(poly, p, 8, 8, 16);
   TEST_VGET_HIGH(poly, p, 16, 4, 8);
+  TEST_VGET_HIGH(float, f, 16, 4, 8);
   TEST_VGET_HIGH(float, f, 32, 2, 4);
 
   CHECK(TEST_MSG, int, 8, 8, PRIx8, expected, "");
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_low.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_low.c
index 12ecfc2..0a81c5b 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_low.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_low.c
@@ -16,6 +16,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					0xf4, 0xf5, 0xf6, 0xf7 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 
 #define TEST_MSG "VGET_LOW"
@@ -31,6 +32,7 @@ void exec_vget_low (void)
   DECL_VARIABLE_128BITS_VARIANTS(vector128);
 
   TEST_MACRO_128BITS_VARIANTS_2_5(VLOAD, vector128, buffer);
+  VLOAD(vector128, buffer, q, float, f, 16, 8);
   VLOAD(vector128, buffer, q, float, f, 32, 4);
 
   clean_results ();
@@ -46,6 +48,7 @@ void exec_vget_low (void)
   TEST_VGET_LOW(uint, u, 64, 1, 2);
   TEST_VGET_LOW(poly, p, 8, 8, 16);
   TEST_VGET_LOW(poly, p, 16, 4, 8);
+  TEST_VGET_LOW(float, f, 16, 4, 8);
   TEST_VGET_LOW(float, f, 32, 2, 4);
 
   CHECK(TEST_MSG, int, 8, 8, PRIx8, expected, "");
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1.c
index ced9d73..68641b0 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1.c
@@ -16,6 +16,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					0xf4, 0xf5, 0xf6, 0xf7 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					0xf4, 0xf5, 0xf6, 0xf7,
@@ -44,6 +45,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					 0xfc, 0xfd, 0xfe, 0xff };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
 					 0xfff4, 0xfff5, 0xfff6, 0xfff7 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80,
+					   0xca00, 0xc980, 0xc900, 0xc880 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
 					   0xc1600000, 0xc1500000 };
 
@@ -62,7 +65,9 @@ void exec_vld1 (void)
 
   TEST_MACRO_ALL_VARIANTS_2_5(TEST_VLD1, vector, buffer);
 
+  TEST_VLD1(vector, buffer, , float, f, 16, 4);
   TEST_VLD1(vector, buffer, , float, f, 32, 2);
+  TEST_VLD1(vector, buffer, q, float, f, 16, 8);
   TEST_VLD1(vector, buffer, q, float, f, 32, 4);
 
   CHECK_RESULTS (TEST_MSG, "");
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1_dup.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1_dup.c
index 0e05274..bf878cc 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1_dup.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1_dup.c
@@ -17,6 +17,7 @@ VECT_VAR_DECL(expected0,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected0,poly,8,8) [] = { 0xf0, 0xf0, 0xf0, 0xf0,
 					 0xf0, 0xf0, 0xf0, 0xf0 };
 VECT_VAR_DECL(expected0,poly,16,4) [] = { 0xfff0, 0xfff0, 0xfff0, 0xfff0 };
+VECT_VAR_DECL(expected0,hfloat,16,4) [] = { 0xcc00, 0xcc00, 0xcc00, 0xcc00 };
 VECT_VAR_DECL(expected0,hfloat,32,2) [] = { 0xc1800000, 0xc1800000 };
 VECT_VAR_DECL(expected0,int,8,16) [] = { 0xf0, 0xf0, 0xf0, 0xf0,
 					 0xf0, 0xf0, 0xf0, 0xf0,
@@ -44,6 +45,8 @@ VECT_VAR_DECL(expected0,poly,8,16) [] = { 0xf0, 0xf0, 0xf0, 0xf0,
 					  0xf0, 0xf0, 0xf0, 0xf0 };
 VECT_VAR_DECL(expected0,poly,16,8) [] = { 0xfff0, 0xfff0, 0xfff0, 0xfff0,
 					  0xfff0, 0xfff0, 0xfff0, 0xfff0 };
+VECT_VAR_DECL(expected0,hfloat,16,8) [] = { 0xcc00, 0xcc00, 0xcc00, 0xcc00,
+					    0xcc00, 0xcc00, 0xcc00, 0xcc00 };
 VECT_VAR_DECL(expected0,hfloat,32,4) [] = { 0xc1800000, 0xc1800000,
 					    0xc1800000, 0xc1800000 };
 
@@ -61,6 +64,7 @@ VECT_VAR_DECL(expected1,uint,64,1) [] = { 0xfffffffffffffff1 };
 VECT_VAR_DECL(expected1,poly,8,8) [] = { 0xf1, 0xf1, 0xf1, 0xf1,
 					 0xf1, 0xf1, 0xf1, 0xf1 };
 VECT_VAR_DECL(expected1,poly,16,4) [] = { 0xfff1, 0xfff1, 0xfff1, 0xfff1 };
+VECT_VAR_DECL(expected1,hfloat,16,4) [] = { 0xcb80, 0xcb80, 0xcb80, 0xcb80 };
 VECT_VAR_DECL(expected1,hfloat,32,2) [] = { 0xc1700000, 0xc1700000 };
 VECT_VAR_DECL(expected1,int,8,16) [] = { 0xf1, 0xf1, 0xf1, 0xf1,
 					 0xf1, 0xf1, 0xf1, 0xf1,
@@ -88,6 +92,8 @@ VECT_VAR_DECL(expected1,poly,8,16) [] = { 0xf1, 0xf1, 0xf1, 0xf1,
 					  0xf1, 0xf1, 0xf1, 0xf1 };
 VECT_VAR_DECL(expected1,poly,16,8) [] = { 0xfff1, 0xfff1, 0xfff1, 0xfff1,
 					  0xfff1, 0xfff1, 0xfff1, 0xfff1 };
+VECT_VAR_DECL(expected1,hfloat,16,8) [] = { 0xcb80, 0xcb80, 0xcb80, 0xcb80,
+					    0xcb80, 0xcb80, 0xcb80, 0xcb80 };
 VECT_VAR_DECL(expected1,hfloat,32,4) [] = { 0xc1700000, 0xc1700000,
 					    0xc1700000, 0xc1700000 };
 
@@ -105,6 +111,7 @@ VECT_VAR_DECL(expected2,uint,64,1) [] = { 0xfffffffffffffff2 };
 VECT_VAR_DECL(expected2,poly,8,8) [] = { 0xf2, 0xf2, 0xf2, 0xf2,
 					 0xf2, 0xf2, 0xf2, 0xf2 };
 VECT_VAR_DECL(expected2,poly,16,4) [] = { 0xfff2, 0xfff2, 0xfff2, 0xfff2 };
+VECT_VAR_DECL(expected2,hfloat,16,4) [] = { 0xcb00, 0xcb00, 0xcb00, 0xcb00 };
 VECT_VAR_DECL(expected2,hfloat,32,2) [] = { 0xc1600000, 0xc1600000 };
 VECT_VAR_DECL(expected2,int,8,16) [] = { 0xf2, 0xf2, 0xf2, 0xf2,
 					 0xf2, 0xf2, 0xf2, 0xf2,
@@ -132,6 +139,8 @@ VECT_VAR_DECL(expected2,poly,8,16) [] = { 0xf2, 0xf2, 0xf2, 0xf2,
 					  0xf2, 0xf2, 0xf2, 0xf2 };
 VECT_VAR_DECL(expected2,poly,16,8) [] = { 0xfff2, 0xfff2, 0xfff2, 0xfff2,
 					  0xfff2, 0xfff2, 0xfff2, 0xfff2 };
+VECT_VAR_DECL(expected2,hfloat,16,8) [] = { 0xcb00, 0xcb00, 0xcb00, 0xcb00,
+					    0xcb00, 0xcb00, 0xcb00, 0xcb00 };
 VECT_VAR_DECL(expected2,hfloat,32,4) [] = { 0xc1600000, 0xc1600000,
 					    0xc1600000, 0xc1600000 };
 
@@ -154,7 +163,9 @@ void exec_vld1_dup (void)
 
     TEST_MACRO_ALL_VARIANTS_2_5(TEST_VLD1_DUP, vector, buffer_dup);
 
+    TEST_VLD1_DUP(vector, buffer_dup, , float, f, 16, 4);
     TEST_VLD1_DUP(vector, buffer_dup, , float, f, 32, 2);
+    TEST_VLD1_DUP(vector, buffer_dup, q, float, f, 16, 8);
     TEST_VLD1_DUP(vector, buffer_dup, q, float, f, 32, 4);
 
     switch (i) {
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1_lane.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1_lane.c
index d5c5d22..3b521f7 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1_lane.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1_lane.c
@@ -16,6 +16,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
 					0xaa, 0xaa, 0xaa, 0xf0 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xfff0 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xcc00, 0xaaaa };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xaaaaaaaa, 0xc1800000 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
 					0xaa, 0xaa, 0xaa, 0xaa,
@@ -43,6 +44,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
 					 0xf0, 0xaa, 0xaa, 0xaa };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
 					 0xaaaa, 0xaaaa, 0xfff0, 0xaaaa };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
+					   0xaaaa, 0xcc00, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
 					   0xc1800000, 0xaaaaaaaa };
 
@@ -72,6 +75,7 @@ void exec_vld1_lane (void)
   ARRAY(buffer_src, uint, 64, 1);
   ARRAY(buffer_src, poly, 8, 8);
   ARRAY(buffer_src, poly, 16, 4);
+  ARRAY(buffer_src, float, 16, 4);
   ARRAY(buffer_src, float, 32, 2);
 
   ARRAY(buffer_src, int, 8, 16);
@@ -84,6 +88,7 @@ void exec_vld1_lane (void)
   ARRAY(buffer_src, uint, 64, 2);
   ARRAY(buffer_src, poly, 8, 16);
   ARRAY(buffer_src, poly, 16, 8);
+  ARRAY(buffer_src, float, 16, 8);
   ARRAY(buffer_src, float, 32, 4);
 
   clean_results ();
@@ -99,6 +104,7 @@ void exec_vld1_lane (void)
   TEST_VLD1_LANE(, uint, u, 64, 1, 0);
   TEST_VLD1_LANE(, poly, p, 8, 8, 7);
   TEST_VLD1_LANE(, poly, p, 16, 4, 3);
+  TEST_VLD1_LANE(, float, f, 16, 4, 2);
   TEST_VLD1_LANE(, float, f, 32, 2, 1);
 
   TEST_VLD1_LANE(q, int, s, 8, 16, 15);
@@ -111,6 +117,7 @@ void exec_vld1_lane (void)
   TEST_VLD1_LANE(q, uint, u, 64, 2, 0);
   TEST_VLD1_LANE(q, poly, p, 8, 16, 12);
   TEST_VLD1_LANE(q, poly, p, 16, 8, 6);
+  TEST_VLD1_LANE(q, float, f, 16, 8, 5);
   TEST_VLD1_LANE(q, float, f, 32, 4, 2);
 
   CHECK_RESULTS (TEST_MSG, "");
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX.c
index f20aa03..937b51d 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX.c
@@ -18,6 +18,7 @@ VECT_VAR_DECL(expected_vld2_0,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected_vld2_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					       0xf4, 0xf5, 0xf6, 0xf7 };
 VECT_VAR_DECL(expected_vld2_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+VECT_VAR_DECL(expected_vld2_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 VECT_VAR_DECL(expected_vld2_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected_vld2_0,int,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					       0xf4, 0xf5, 0xf6, 0xf7,
@@ -41,6 +42,8 @@ VECT_VAR_DECL(expected_vld2_0,poly,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 						0xfc, 0xfd, 0xfe, 0xff };
 VECT_VAR_DECL(expected_vld2_0,poly,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
 						0xfff4, 0xfff5, 0xfff6, 0xfff7 };
+VECT_VAR_DECL(expected_vld2_0,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80,
+						  0xca00, 0xc980, 0xc900, 0xc880 };
 VECT_VAR_DECL(expected_vld2_0,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
 						  0xc1600000, 0xc1500000 };
 
@@ -58,6 +61,7 @@ VECT_VAR_DECL(expected_vld2_1,uint,64,1) [] = { 0xfffffffffffffff1 };
 VECT_VAR_DECL(expected_vld2_1,poly,8,8) [] = { 0xf8, 0xf9, 0xfa, 0xfb,
 					       0xfc, 0xfd, 0xfe, 0xff };
 VECT_VAR_DECL(expected_vld2_1,poly,16,4) [] = { 0xfff4, 0xfff5, 0xfff6, 0xfff7 };
+VECT_VAR_DECL(expected_vld2_1,hfloat,16,4) [] = { 0xca00, 0xc980, 0xc900, 0xc880 };
 VECT_VAR_DECL(expected_vld2_1,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 };
 VECT_VAR_DECL(expected_vld2_1,int,8,16) [] = { 0x0, 0x1, 0x2, 0x3,
 					       0x4, 0x5, 0x6, 0x7,
@@ -81,6 +85,8 @@ VECT_VAR_DECL(expected_vld2_1,poly,8,16) [] = { 0x0, 0x1, 0x2, 0x3,
 						0xc, 0xd, 0xe, 0xf };
 VECT_VAR_DECL(expected_vld2_1,poly,16,8) [] = { 0xfff8, 0xfff9, 0xfffa, 0xfffb,
 						0xfffc, 0xfffd, 0xfffe, 0xffff };
+VECT_VAR_DECL(expected_vld2_1,hfloat,16,8) [] = { 0xc800, 0xc700, 0xc600, 0xc500,
+						  0xc400, 0xc200, 0xc000, 0xbc00 };
 VECT_VAR_DECL(expected_vld2_1,hfloat,32,4) [] = { 0xc1400000, 0xc1300000,
 						  0xc1200000, 0xc1100000 };
 
@@ -98,6 +104,7 @@ VECT_VAR_DECL(expected_vld3_0,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected_vld3_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					       0xf4, 0xf5, 0xf6, 0xf7 };
 VECT_VAR_DECL(expected_vld3_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+VECT_VAR_DECL(expected_vld3_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 VECT_VAR_DECL(expected_vld3_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected_vld3_0,int,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					       0xf4, 0xf5, 0xf6, 0xf7,
@@ -121,6 +128,8 @@ VECT_VAR_DECL(expected_vld3_0,poly,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 						0xfc, 0xfd, 0xfe, 0xff };
 VECT_VAR_DECL(expected_vld3_0,poly,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
 						0xfff4, 0xfff5, 0xfff6, 0xfff7 };
+VECT_VAR_DECL(expected_vld3_0,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80,
+						  0xca00, 0xc980, 0xc900, 0xc880 };
 VECT_VAR_DECL(expected_vld3_0,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
 						  0xc1600000, 0xc1500000 };
 
@@ -138,6 +147,7 @@ VECT_VAR_DECL(expected_vld3_1,uint,64,1) [] = { 0xfffffffffffffff1 };
 VECT_VAR_DECL(expected_vld3_1,poly,8,8) [] = { 0xf8, 0xf9, 0xfa, 0xfb,
 					       0xfc, 0xfd, 0xfe, 0xff };
 VECT_VAR_DECL(expected_vld3_1,poly,16,4) [] = { 0xfff4, 0xfff5, 0xfff6, 0xfff7 };
+VECT_VAR_DECL(expected_vld3_1,hfloat,16,4) [] = { 0xca00, 0xc980, 0xc900, 0xc880 };
 VECT_VAR_DECL(expected_vld3_1,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 };
 VECT_VAR_DECL(expected_vld3_1,int,8,16) [] = { 0x0, 0x1, 0x2, 0x3,
 					       0x4, 0x5, 0x6, 0x7,
@@ -161,6 +171,8 @@ VECT_VAR_DECL(expected_vld3_1,poly,8,16) [] = { 0x0, 0x1, 0x2, 0x3,
 						0xc, 0xd, 0xe, 0xf };
 VECT_VAR_DECL(expected_vld3_1,poly,16,8) [] = { 0xfff8, 0xfff9, 0xfffa, 0xfffb,
 						0xfffc, 0xfffd, 0xfffe, 0xffff };
+VECT_VAR_DECL(expected_vld3_1,hfloat,16,8) [] = { 0xc800, 0xc700, 0xc600, 0xc500,
+						  0xc400, 0xc200, 0xc000, 0xbc00 };
 VECT_VAR_DECL(expected_vld3_1,hfloat,32,4) [] = { 0xc1400000, 0xc1300000,
 						  0xc1200000, 0xc1100000 };
 
@@ -181,6 +193,7 @@ VECT_VAR_DECL(expected_vld3_2,poly,8,8) [] = { 0x0, 0x1, 0x2, 0x3,
 					       0x4, 0x5, 0x6, 0x7 };
 VECT_VAR_DECL(expected_vld3_2,poly,16,4) [] = { 0xfff8, 0xfff9,
 						0xfffa, 0xfffb };
+VECT_VAR_DECL(expected_vld3_2,hfloat,16,4) [] = { 0xc800, 0xc700, 0xc600, 0xc500 };
 VECT_VAR_DECL(expected_vld3_2,hfloat,32,2) [] = { 0xc1400000, 0xc1300000 };
 VECT_VAR_DECL(expected_vld3_2,int,8,16) [] = { 0x10, 0x11, 0x12, 0x13,
 					       0x14, 0x15, 0x16, 0x17,
@@ -204,6 +217,8 @@ VECT_VAR_DECL(expected_vld3_2,poly,8,16) [] = { 0x10, 0x11, 0x12, 0x13,
 						0x1c, 0x1d, 0x1e, 0x1f };
 VECT_VAR_DECL(expected_vld3_2,poly,16,8) [] = { 0x0, 0x1, 0x2, 0x3,
 						0x4, 0x5, 0x6, 0x7 };
+VECT_VAR_DECL(expected_vld3_2,hfloat,16,8) [] = { 0x0000, 0x3c00, 0x4000, 0x4200,
+						  0x4400, 0x4500, 0x4600, 0x4700 };
 VECT_VAR_DECL(expected_vld3_2,hfloat,32,4) [] = { 0xc1000000, 0xc0e00000,
 						  0xc0c00000, 0xc0a00000 };
 
@@ -223,6 +238,7 @@ VECT_VAR_DECL(expected_vld4_0,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected_vld4_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					       0xf4, 0xf5, 0xf6, 0xf7 };
 VECT_VAR_DECL(expected_vld4_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+VECT_VAR_DECL(expected_vld4_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 VECT_VAR_DECL(expected_vld4_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected_vld4_0,int,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					       0xf4, 0xf5, 0xf6, 0xf7,
@@ -246,6 +262,8 @@ VECT_VAR_DECL(expected_vld4_0,poly,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 						0xfc, 0xfd, 0xfe, 0xff };
 VECT_VAR_DECL(expected_vld4_0,poly,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
 						0xfff4, 0xfff5, 0xfff6, 0xfff7 };
+VECT_VAR_DECL(expected_vld4_0,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80,
+						  0xca00, 0xc980, 0xc900, 0xc880 };
 VECT_VAR_DECL(expected_vld4_0,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
 						  0xc1600000, 0xc1500000 };
 
@@ -263,6 +281,7 @@ VECT_VAR_DECL(expected_vld4_1,uint,64,1) [] = { 0xfffffffffffffff1 };
 VECT_VAR_DECL(expected_vld4_1,poly,8,8) [] = { 0xf8, 0xf9, 0xfa, 0xfb,
 					       0xfc, 0xfd, 0xfe, 0xff };
 VECT_VAR_DECL(expected_vld4_1,poly,16,4) [] = { 0xfff4, 0xfff5, 0xfff6, 0xfff7 };
+VECT_VAR_DECL(expected_vld4_1,hfloat,16,4) [] = { 0xca00, 0xc980, 0xc900, 0xc880 };
 VECT_VAR_DECL(expected_vld4_1,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 };
 VECT_VAR_DECL(expected_vld4_1,int,8,16) [] = { 0x0, 0x1, 0x2, 0x3,
 					       0x4, 0x5, 0x6, 0x7,
@@ -286,6 +305,8 @@ VECT_VAR_DECL(expected_vld4_1,poly,8,16) [] = { 0x0, 0x1, 0x2, 0x3,
 						0xc, 0xd, 0xe, 0xf };
 VECT_VAR_DECL(expected_vld4_1,poly,16,8) [] = { 0xfff8, 0xfff9, 0xfffa, 0xfffb,
 						0xfffc, 0xfffd, 0xfffe, 0xffff };
+VECT_VAR_DECL(expected_vld4_1,hfloat,16,8) [] = { 0xc800, 0xc700, 0xc600, 0xc500,
+						  0xc400, 0xc200, 0xc000, 0xbc00 };
 VECT_VAR_DECL(expected_vld4_1,hfloat,32,4) [] = { 0xc1400000, 0xc1300000,
 						  0xc1200000, 0xc1100000 };
 
@@ -303,6 +324,7 @@ VECT_VAR_DECL(expected_vld4_2,uint,64,1) [] = { 0xfffffffffffffff2 };
 VECT_VAR_DECL(expected_vld4_2,poly,8,8) [] = { 0x0, 0x1, 0x2, 0x3,
 					       0x4, 0x5, 0x6, 0x7 };
 VECT_VAR_DECL(expected_vld4_2,poly,16,4) [] = { 0xfff8, 0xfff9, 0xfffa, 0xfffb };
+VECT_VAR_DECL(expected_vld4_2,hfloat,16,4) [] = { 0xc800, 0xc700, 0xc600, 0xc500 };
 VECT_VAR_DECL(expected_vld4_2,hfloat,32,2) [] = { 0xc1400000, 0xc1300000 };
 VECT_VAR_DECL(expected_vld4_2,int,8,16) [] = { 0x10, 0x11, 0x12, 0x13,
 					       0x14, 0x15, 0x16, 0x17,
@@ -326,6 +348,8 @@ VECT_VAR_DECL(expected_vld4_2,poly,8,16) [] = { 0x10, 0x11, 0x12, 0x13,
 						0x1c, 0x1d, 0x1e, 0x1f };
 VECT_VAR_DECL(expected_vld4_2,poly,16,8) [] = { 0x0, 0x1, 0x2, 0x3,
 						0x4, 0x5, 0x6, 0x7 };
+VECT_VAR_DECL(expected_vld4_2,hfloat,16,8) [] = { 0x0000, 0x3c00, 0x4000, 0x4200,
+						  0x4400, 0x4500, 0x4600, 0x4700 };
 VECT_VAR_DECL(expected_vld4_2,hfloat,32,4) [] = { 0xc1000000, 0xc0e00000,
 						  0xc0c00000, 0xc0a00000 };
 
@@ -343,6 +367,7 @@ VECT_VAR_DECL(expected_vld4_3,uint,64,1) [] = { 0xfffffffffffffff3 };
 VECT_VAR_DECL(expected_vld4_3,poly,8,8) [] = { 0x8, 0x9, 0xa, 0xb,
 					       0xc, 0xd, 0xe, 0xf };
 VECT_VAR_DECL(expected_vld4_3,poly,16,4) [] = { 0xfffc, 0xfffd, 0xfffe, 0xffff };
+VECT_VAR_DECL(expected_vld4_3,hfloat,16,4) [] = { 0xc400, 0xc200, 0xc000, 0xbc00 };
 VECT_VAR_DECL(expected_vld4_3,hfloat,32,2) [] = { 0xc1200000, 0xc1100000 };
 VECT_VAR_DECL(expected_vld4_3,int,8,16) [] = { 0x20, 0x21, 0x22, 0x23,
 					       0x24, 0x25, 0x26, 0x27,
@@ -366,6 +391,8 @@ VECT_VAR_DECL(expected_vld4_3,poly,8,16) [] = { 0x20, 0x21, 0x22, 0x23,
 						0x2c, 0x2d, 0x2e, 0x2f };
 VECT_VAR_DECL(expected_vld4_3,poly,16,8) [] = { 0x8, 0x9, 0xa, 0xb,
 						0xc, 0xd, 0xe, 0xf };
+VECT_VAR_DECL(expected_vld4_3,hfloat,16,8) [] = { 0x4800, 0x4880, 0x4900, 0x4980,
+						  0x4a00, 0x4a80, 0x4b00, 0x4b80 };
 VECT_VAR_DECL(expected_vld4_3,hfloat,32,4) [] = { 0xc0800000, 0xc0400000,
 						  0xc0000000, 0xbf800000 };
 
@@ -409,6 +436,7 @@ void exec_vldX (void)
   DECL_VLDX(uint, 64, 1, X);			\
   DECL_VLDX(poly, 8, 8, X);			\
   DECL_VLDX(poly, 16, 4, X);			\
+  DECL_VLDX(float, 16, 4, X);			\
   DECL_VLDX(float, 32, 2, X);			\
   DECL_VLDX(int, 8, 16, X);			\
   DECL_VLDX(int, 16, 8, X);			\
@@ -418,6 +446,7 @@ void exec_vldX (void)
   DECL_VLDX(uint, 32, 4, X);			\
   DECL_VLDX(poly, 8, 16, X);			\
   DECL_VLDX(poly, 16, 8, X);			\
+  DECL_VLDX(float, 16, 8, X);			\
   DECL_VLDX(float, 32, 4, X)
 
 #define TEST_ALL_VLDX(X)			\
@@ -431,6 +460,7 @@ void exec_vldX (void)
   TEST_VLDX(, uint, u, 64, 1, X);		\
   TEST_VLDX(, poly, p, 8, 8, X);		\
   TEST_VLDX(, poly, p, 16, 4, X);		\
+  TEST_VLDX(, float, f, 16, 4, X);		\
   TEST_VLDX(, float, f, 32, 2, X);		\
   TEST_VLDX(q, int, s, 8, 16, X);		\
   TEST_VLDX(q, int, s, 16, 8, X);		\
@@ -440,6 +470,7 @@ void exec_vldX (void)
   TEST_VLDX(q, uint, u, 32, 4, X);		\
   TEST_VLDX(q, poly, p, 8, 16, X);		\
   TEST_VLDX(q, poly, p, 16, 8, X);		\
+  TEST_VLDX(q, float, f, 16, 8, X);		\
   TEST_VLDX(q, float, f, 32, 4, X)
 
 #define TEST_ALL_EXTRA_CHUNKS(X, Y)		\
@@ -453,6 +484,7 @@ void exec_vldX (void)
   TEST_EXTRA_CHUNK(uint, 64, 1, X, Y);		\
   TEST_EXTRA_CHUNK(poly, 8, 8, X, Y);		\
   TEST_EXTRA_CHUNK(poly, 16, 4, X, Y);		\
+  TEST_EXTRA_CHUNK(float, 16, 4, X, Y);		\
   TEST_EXTRA_CHUNK(float, 32, 2, X, Y);		\
   TEST_EXTRA_CHUNK(int, 8, 16, X, Y);		\
   TEST_EXTRA_CHUNK(int, 16, 8, X, Y);		\
@@ -462,6 +494,7 @@ void exec_vldX (void)
   TEST_EXTRA_CHUNK(uint, 32, 4, X, Y);		\
   TEST_EXTRA_CHUNK(poly, 8, 16, X, Y);		\
   TEST_EXTRA_CHUNK(poly, 16, 8, X, Y);		\
+  TEST_EXTRA_CHUNK(float, 16, 8, X, Y);		\
   TEST_EXTRA_CHUNK(float, 32, 4, X, Y)
 
   /* vldX supports all vector types except [u]int64x2.  */
@@ -516,6 +549,8 @@ void exec_vldX (void)
   PAD(buffer_vld2_pad, poly, 8, 8);
   VECT_ARRAY_INIT2(buffer_vld2, poly, 16, 4);
   PAD(buffer_vld2_pad, poly, 16, 4);
+  VECT_ARRAY_INIT2(buffer_vld2, float, 16, 4);
+  PAD(buffer_vld2_pad, float, 16, 4);
   VECT_ARRAY_INIT2(buffer_vld2, float, 32, 2);
   PAD(buffer_vld2_pad, float, 32, 2);
 
@@ -539,6 +574,8 @@ void exec_vldX (void)
   PAD(buffer_vld2_pad, poly, 8, 16);
   VECT_ARRAY_INIT2(buffer_vld2, poly, 16, 8);
   PAD(buffer_vld2_pad, poly, 16, 8);
+  VECT_ARRAY_INIT2(buffer_vld2, float, 16, 8);
+  PAD(buffer_vld2_pad, float, 16, 8);
   VECT_ARRAY_INIT2(buffer_vld2, float, 32, 4);
   PAD(buffer_vld2_pad, float, 32, 4);
 
@@ -563,6 +600,8 @@ void exec_vldX (void)
   PAD(buffer_vld3_pad, poly, 8, 8);
   VECT_ARRAY_INIT3(buffer_vld3, poly, 16, 4);
   PAD(buffer_vld3_pad, poly, 16, 4);
+  VECT_ARRAY_INIT3(buffer_vld3, float, 16, 4);
+  PAD(buffer_vld3_pad, float, 16, 4);
   VECT_ARRAY_INIT3(buffer_vld3, float, 32, 2);
   PAD(buffer_vld3_pad, float, 32, 2);
 
@@ -586,6 +625,8 @@ void exec_vldX (void)
   PAD(buffer_vld3_pad, poly, 8, 16);
   VECT_ARRAY_INIT3(buffer_vld3, poly, 16, 8);
   PAD(buffer_vld3_pad, poly, 16, 8);
+  VECT_ARRAY_INIT3(buffer_vld3, float, 16, 8);
+  PAD(buffer_vld3_pad, float, 16, 8);
   VECT_ARRAY_INIT3(buffer_vld3, float, 32, 4);
   PAD(buffer_vld3_pad, float, 32, 4);
 
@@ -610,6 +651,8 @@ void exec_vldX (void)
   PAD(buffer_vld4_pad, poly, 8, 8);
   VECT_ARRAY_INIT4(buffer_vld4, poly, 16, 4);
   PAD(buffer_vld4_pad, poly, 16, 4);
+  VECT_ARRAY_INIT4(buffer_vld4, float, 16, 4);
+  PAD(buffer_vld4_pad, float, 16, 4);
   VECT_ARRAY_INIT4(buffer_vld4, float, 32, 2);
   PAD(buffer_vld4_pad, float, 32, 2);
 
@@ -633,6 +676,8 @@ void exec_vldX (void)
   PAD(buffer_vld4_pad, poly, 8, 16);
   VECT_ARRAY_INIT4(buffer_vld4, poly, 16, 8);
   PAD(buffer_vld4_pad, poly, 16, 8);
+  VECT_ARRAY_INIT4(buffer_vld4, float, 16, 8);
+  PAD(buffer_vld4_pad, float, 16, 8);
   VECT_ARRAY_INIT4(buffer_vld4, float, 32, 4);
   PAD(buffer_vld4_pad, float, 32, 4);
 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_dup.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_dup.c
index c66dade..1496717 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_dup.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_dup.c
@@ -18,6 +18,7 @@ VECT_VAR_DECL(expected_vld2_0,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected_vld2_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf0, 0xf1,
 					0xf0, 0xf1, 0xf0, 0xf1 };
 VECT_VAR_DECL(expected_vld2_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff0, 0xfff1 };
+VECT_VAR_DECL(expected_vld2_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcc00, 0xcb80 };
 VECT_VAR_DECL(expected_vld2_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 
 /* vld2_dup/chunk 1.  */
@@ -35,6 +36,7 @@ VECT_VAR_DECL(expected_vld2_1,poly,8,8) [] = { 0xf0, 0xf1, 0xf0, 0xf1,
 					       0xf0, 0xf1, 0xf0, 0xf1 };
 VECT_VAR_DECL(expected_vld2_1,poly,16,4) [] = { 0xfff0, 0xfff1,
 						0xfff0, 0xfff1 };
+VECT_VAR_DECL(expected_vld2_1,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcc00, 0xcb80 };
 VECT_VAR_DECL(expected_vld2_1,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 
 /* vld3_dup/chunk 0.  */
@@ -54,6 +56,7 @@ VECT_VAR_DECL(expected_vld3_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf0,
 					       0xf1, 0xf2, 0xf0, 0xf1 };
 VECT_VAR_DECL(expected_vld3_0,poly,16,4) [] = { 0xfff0, 0xfff1,
 						0xfff2, 0xfff0 };
+VECT_VAR_DECL(expected_vld3_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xcc00 };
 VECT_VAR_DECL(expected_vld3_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 
 /* vld3_dup/chunk 1.  */
@@ -73,6 +76,7 @@ VECT_VAR_DECL(expected_vld3_1,poly,8,8) [] = { 0xf2, 0xf0, 0xf1, 0xf2,
 					       0xf0, 0xf1, 0xf2, 0xf0 };
 VECT_VAR_DECL(expected_vld3_1,poly,16,4) [] = { 0xfff1, 0xfff2,
 						0xfff0, 0xfff1 };
+VECT_VAR_DECL(expected_vld3_1,hfloat,16,4) [] = { 0xcb80, 0xcb00, 0xcc00, 0xcb80 };
 VECT_VAR_DECL(expected_vld3_1,hfloat,32,2) [] = { 0xc1600000, 0xc1800000 };
 
 /* vld3_dup/chunk 2.  */
@@ -92,6 +96,7 @@ VECT_VAR_DECL(expected_vld3_2,poly,8,8) [] = { 0xf1, 0xf2, 0xf0, 0xf1,
 					       0xf2, 0xf0, 0xf1, 0xf2 };
 VECT_VAR_DECL(expected_vld3_2,poly,16,4) [] = { 0xfff2, 0xfff0,
 						0xfff1, 0xfff2 };
+VECT_VAR_DECL(expected_vld3_2,hfloat,16,4) [] = { 0xcb00, 0xcc00, 0xcb80, 0xcb00 };
 VECT_VAR_DECL(expected_vld3_2,hfloat,32,2) [] = { 0xc1700000, 0xc1600000 };
 
 /* vld4_dup/chunk 0.  */
@@ -109,6 +114,7 @@ VECT_VAR_DECL(expected_vld4_0,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected_vld4_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					       0xf0, 0xf1, 0xf2, 0xf3 };
 VECT_VAR_DECL(expected_vld4_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+VECT_VAR_DECL(expected_vld4_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 VECT_VAR_DECL(expected_vld4_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 
 /* vld4_dup/chunk 1.  */
@@ -125,6 +131,7 @@ VECT_VAR_DECL(expected_vld4_1,uint,64,1) [] = { 0xfffffffffffffff1 };
 VECT_VAR_DECL(expected_vld4_1,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					       0xf0, 0xf1, 0xf2, 0xf3 };
 VECT_VAR_DECL(expected_vld4_1,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+VECT_VAR_DECL(expected_vld4_1,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 VECT_VAR_DECL(expected_vld4_1,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 };
 
 /* vld4_dup/chunk 2.  */
@@ -141,6 +148,7 @@ VECT_VAR_DECL(expected_vld4_2,uint,64,1) [] = { 0xfffffffffffffff2 };
 VECT_VAR_DECL(expected_vld4_2,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					       0xf0, 0xf1, 0xf2, 0xf3 };
 VECT_VAR_DECL(expected_vld4_2,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+VECT_VAR_DECL(expected_vld4_2,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 VECT_VAR_DECL(expected_vld4_2,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 
 /* vld4_dup/chunk3.  */
@@ -157,6 +165,7 @@ VECT_VAR_DECL(expected_vld4_3,uint,64,1) [] = { 0xfffffffffffffff3 };
 VECT_VAR_DECL(expected_vld4_3,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					       0xf0, 0xf1, 0xf2, 0xf3 };
 VECT_VAR_DECL(expected_vld4_3,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+VECT_VAR_DECL(expected_vld4_3,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 VECT_VAR_DECL(expected_vld4_3,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 };
 
 void exec_vldX_dup (void)
@@ -199,6 +208,7 @@ void exec_vldX_dup (void)
   DECL_VLDX_DUP(uint, 64, 1, X);		\
   DECL_VLDX_DUP(poly, 8, 8, X);			\
   DECL_VLDX_DUP(poly, 16, 4, X);		\
+  DECL_VLDX_DUP(float, 16, 4, X);		\
   DECL_VLDX_DUP(float, 32, 2, X)
 
 #define TEST_ALL_VLDX_DUP(X)			\
@@ -212,6 +222,7 @@ void exec_vldX_dup (void)
   TEST_VLDX_DUP(, uint, u, 64, 1, X);		\
   TEST_VLDX_DUP(, poly, p, 8, 8, X);		\
   TEST_VLDX_DUP(, poly, p, 16, 4, X);		\
+  TEST_VLDX_DUP(, float, f, 16, 4, X);		\
   TEST_VLDX_DUP(, float, f, 32, 2, X)
 
 #define TEST_ALL_EXTRA_CHUNKS(X, Y)		\
@@ -225,6 +236,7 @@ void exec_vldX_dup (void)
   TEST_EXTRA_CHUNK(uint, 64, 1, X, Y);		\
   TEST_EXTRA_CHUNK(poly, 8, 8, X, Y);		\
   TEST_EXTRA_CHUNK(poly, 16, 4, X, Y);		\
+  TEST_EXTRA_CHUNK(float, 16, 4, X, Y);		\
   TEST_EXTRA_CHUNK(float, 32, 2, X, Y)
 
   /* vldX_dup supports only 64-bit inputs.  */
@@ -269,6 +281,8 @@ void exec_vldX_dup (void)
   PAD(buffer_vld2_pad, poly, 8, 8);
   VECT_ARRAY_INIT2(buffer_vld2, poly, 16, 4);
   PAD(buffer_vld2_pad, poly, 16, 4);
+  VECT_ARRAY_INIT2(buffer_vld2, float, 16, 4);
+  PAD(buffer_vld2_pad, float, 16, 4);
   VECT_ARRAY_INIT2(buffer_vld2, float, 32, 2);
   PAD(buffer_vld2_pad, float, 32, 2);
 
@@ -292,6 +306,8 @@ void exec_vldX_dup (void)
   PAD(buffer_vld2_pad, poly, 8, 16);
   VECT_ARRAY_INIT2(buffer_vld2, poly, 16, 8);
   PAD(buffer_vld2_pad, poly, 16, 8);
+  VECT_ARRAY_INIT2(buffer_vld2, float, 16, 8);
+  PAD(buffer_vld2_pad, float, 16, 8);
   VECT_ARRAY_INIT2(buffer_vld2, float, 32, 4);
   PAD(buffer_vld2_pad, float, 32, 4);
 
@@ -316,6 +332,8 @@ void exec_vldX_dup (void)
   PAD(buffer_vld3_pad, poly, 8, 8);
   VECT_ARRAY_INIT3(buffer_vld3, poly, 16, 4);
   PAD(buffer_vld3_pad, poly, 16, 4);
+  VECT_ARRAY_INIT3(buffer_vld3, float, 16, 4);
+  PAD(buffer_vld3_pad, float, 16, 4);
   VECT_ARRAY_INIT3(buffer_vld3, float, 32, 2);
   PAD(buffer_vld3_pad, float, 32, 2);
 
@@ -339,6 +357,8 @@ void exec_vldX_dup (void)
   PAD(buffer_vld3_pad, poly, 8, 16);
   VECT_ARRAY_INIT3(buffer_vld3, poly, 16, 8);
   PAD(buffer_vld3_pad, poly, 16, 8);
+  VECT_ARRAY_INIT3(buffer_vld3, float, 16, 8);
+  PAD(buffer_vld3_pad, float, 16, 8);
   VECT_ARRAY_INIT3(buffer_vld3, float, 32, 4);
   PAD(buffer_vld3_pad, float, 32, 4);
 
@@ -363,6 +383,8 @@ void exec_vldX_dup (void)
   PAD(buffer_vld4_pad, poly, 8, 8);
   VECT_ARRAY_INIT4(buffer_vld4, poly, 16, 4);
   PAD(buffer_vld4_pad, poly, 16, 4);
+  VECT_ARRAY_INIT4(buffer_vld4, float, 16, 4);
+  PAD(buffer_vld4_pad, float, 16, 4);
   VECT_ARRAY_INIT4(buffer_vld4, float, 32, 2);
   PAD(buffer_vld4_pad, float, 32, 2);
 
@@ -386,6 +408,8 @@ void exec_vldX_dup (void)
   PAD(buffer_vld4_pad, poly, 8, 16);
   VECT_ARRAY_INIT4(buffer_vld4, poly, 16, 8);
   PAD(buffer_vld4_pad, poly, 16, 8);
+  VECT_ARRAY_INIT4(buffer_vld4, float, 16, 8);
+  PAD(buffer_vld4_pad, float, 16, 8);
   VECT_ARRAY_INIT4(buffer_vld4, float, 32, 4);
   PAD(buffer_vld4_pad, float, 32, 4);
 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_lane.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_lane.c
index 2f2e62f..5e8eb44 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_lane.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_lane.c
@@ -18,6 +18,7 @@ VECT_VAR_DECL(expected_vld2_0,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
 					       0xaa, 0xaa, 0xaa, 0xaa };
 VECT_VAR_DECL(expected_vld2_0,poly,16,4) [] = { 0xaaaa, 0xaaaa,
 						0xaaaa, 0xaaaa };
+VECT_VAR_DECL(expected_vld2_0,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld2_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected_vld2_0,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
 					       0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
@@ -29,6 +30,8 @@ VECT_VAR_DECL(expected_vld2_0,uint,32,4) [] = { 0xfffffff0, 0xfffffff1,
 						0xaaaaaaaa, 0xaaaaaaaa };
 VECT_VAR_DECL(expected_vld2_0,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
 						0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
+VECT_VAR_DECL(expected_vld2_0,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
+						  0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld2_0,hfloat,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
 						  0xaaaaaaaa, 0xaaaaaaaa };
 
@@ -44,6 +47,7 @@ VECT_VAR_DECL(expected_vld2_1,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 };
 VECT_VAR_DECL(expected_vld2_1,poly,8,8) [] = { 0xf0, 0xf1, 0xaa, 0xaa,
 					       0xaa, 0xaa, 0xaa, 0xaa };
 VECT_VAR_DECL(expected_vld2_1,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xfff0, 0xfff1 };
+VECT_VAR_DECL(expected_vld2_1,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld2_1,hfloat,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
 VECT_VAR_DECL(expected_vld2_1,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
 					       0xfff0, 0xfff1, 0xaaaa, 0xaaaa };
@@ -55,6 +59,8 @@ VECT_VAR_DECL(expected_vld2_1,uint,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
 						0xaaaaaaaa, 0xaaaaaaaa };
 VECT_VAR_DECL(expected_vld2_1,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xfff0, 0xfff1,
 						0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
+VECT_VAR_DECL(expected_vld2_1,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
+						  0xcc00, 0xcb80, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld2_1,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
 						  0xaaaaaaaa, 0xaaaaaaaa };
 
@@ -70,6 +76,7 @@ VECT_VAR_DECL(expected_vld3_0,uint,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
 VECT_VAR_DECL(expected_vld3_0,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
 					       0xaa, 0xaa, 0xaa, 0xaa };
 VECT_VAR_DECL(expected_vld3_0,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
+VECT_VAR_DECL(expected_vld3_0,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld3_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected_vld3_0,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
 					       0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
@@ -81,6 +88,8 @@ VECT_VAR_DECL(expected_vld3_0,uint,32,4) [] = { 0xfffffff0, 0xfffffff1,
 						0xfffffff2, 0xaaaaaaaa };
 VECT_VAR_DECL(expected_vld3_0,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
 						0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
+VECT_VAR_DECL(expected_vld3_0,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
+						  0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld3_0,hfloat,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
 						  0xaaaaaaaa, 0xaaaaaaaa };
 
@@ -96,6 +105,7 @@ VECT_VAR_DECL(expected_vld3_1,uint,32,2) [] = { 0xaaaaaaaa, 0xfffffff0 };
 VECT_VAR_DECL(expected_vld3_1,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
 					       0xf0, 0xf1, 0xf2, 0xaa };
 VECT_VAR_DECL(expected_vld3_1,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
+VECT_VAR_DECL(expected_vld3_1,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xcc00, 0xcb80 };
 VECT_VAR_DECL(expected_vld3_1,hfloat,32,2) [] = { 0xc1600000, 0xaaaaaaaa };
 VECT_VAR_DECL(expected_vld3_1,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
 					       0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
@@ -107,6 +117,8 @@ VECT_VAR_DECL(expected_vld3_1,uint,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
 						0xaaaaaaaa, 0xaaaaaaaa };
 VECT_VAR_DECL(expected_vld3_1,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
 						0xaaaa, 0xaaaa, 0xaaaa, 0xfff0 };
+VECT_VAR_DECL(expected_vld3_1,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
+						  0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld3_1,hfloat,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
 						  0xc1800000, 0xc1700000 };
 
@@ -122,6 +134,7 @@ VECT_VAR_DECL(expected_vld3_2,uint,32,2) [] = { 0xfffffff1, 0xfffffff2 };
 VECT_VAR_DECL(expected_vld3_2,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
 					       0xaa, 0xaa, 0xaa, 0xaa };
 VECT_VAR_DECL(expected_vld3_2,poly,16,4) [] = { 0xaaaa, 0xfff0, 0xfff1, 0xfff2 };
+VECT_VAR_DECL(expected_vld3_2,hfloat,16,4) [] = { 0xcb00, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld3_2,hfloat,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
 VECT_VAR_DECL(expected_vld3_2,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xfff0, 0xfff1,
 					       0xfff2, 0xaaaa, 0xaaaa, 0xaaaa };
@@ -133,6 +146,8 @@ VECT_VAR_DECL(expected_vld3_2,uint,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
 						0xaaaaaaaa, 0xaaaaaaaa };
 VECT_VAR_DECL(expected_vld3_2,poly,16,8) [] = { 0xfff1, 0xfff2, 0xaaaa, 0xaaaa,
 						0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
+VECT_VAR_DECL(expected_vld3_2,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xcc00, 0xcb80,
+						  0xcb00, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld3_2,hfloat,32,4) [] = { 0xc1600000, 0xaaaaaaaa,
 						  0xaaaaaaaa, 0xaaaaaaaa };
 
@@ -148,6 +163,7 @@ VECT_VAR_DECL(expected_vld4_0,uint,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
 VECT_VAR_DECL(expected_vld4_0,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
 					       0xaa, 0xaa, 0xaa, 0xaa };
 VECT_VAR_DECL(expected_vld4_0,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
+VECT_VAR_DECL(expected_vld4_0,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld4_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected_vld4_0,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
 					       0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
@@ -159,6 +175,8 @@ VECT_VAR_DECL(expected_vld4_0,uint,32,4) [] = { 0xfffffff0, 0xfffffff1,
 						0xfffffff2, 0xfffffff3 };
 VECT_VAR_DECL(expected_vld4_0,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
 						0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
+VECT_VAR_DECL(expected_vld4_0,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
+						  0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld4_0,hfloat,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
 						  0xaaaaaaaa, 0xaaaaaaaa };
 
@@ -174,6 +192,7 @@ VECT_VAR_DECL(expected_vld4_1,uint,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
 VECT_VAR_DECL(expected_vld4_1,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
 					       0xaa, 0xaa, 0xaa, 0xaa };
 VECT_VAR_DECL(expected_vld4_1,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
+VECT_VAR_DECL(expected_vld4_1,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld4_1,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 };
 VECT_VAR_DECL(expected_vld4_1,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
 					       0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
@@ -185,6 +204,8 @@ VECT_VAR_DECL(expected_vld4_1,uint,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
 						0xaaaaaaaa, 0xaaaaaaaa };
 VECT_VAR_DECL(expected_vld4_1,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
 						0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
+VECT_VAR_DECL(expected_vld4_1,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
+						  0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld4_1,hfloat,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
 						  0xaaaaaaaa, 0xaaaaaaaa };
 
@@ -200,6 +221,7 @@ VECT_VAR_DECL(expected_vld4_2,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 };
 VECT_VAR_DECL(expected_vld4_2,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					       0xaa, 0xaa, 0xaa, 0xaa };
 VECT_VAR_DECL(expected_vld4_2,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
+VECT_VAR_DECL(expected_vld4_2,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 VECT_VAR_DECL(expected_vld4_2,hfloat,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
 VECT_VAR_DECL(expected_vld4_2,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
 					       0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
@@ -211,6 +233,8 @@ VECT_VAR_DECL(expected_vld4_2,uint,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
 						0xaaaaaaaa, 0xaaaaaaaa };
 VECT_VAR_DECL(expected_vld4_2,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
 						0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+VECT_VAR_DECL(expected_vld4_2,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
+						  0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld4_2,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
 						  0xc1600000, 0xc1500000 };
 
@@ -226,6 +250,7 @@ VECT_VAR_DECL(expected_vld4_3,uint,32,2) [] = { 0xfffffff2, 0xfffffff3 };
 VECT_VAR_DECL(expected_vld4_3,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
 					       0xaa, 0xaa, 0xaa, 0xaa };
 VECT_VAR_DECL(expected_vld4_3,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+VECT_VAR_DECL(expected_vld4_3,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld4_3,hfloat,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
 VECT_VAR_DECL(expected_vld4_3,int,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
 					       0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
@@ -237,6 +262,8 @@ VECT_VAR_DECL(expected_vld4_3,uint,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
 						0xaaaaaaaa, 0xaaaaaaaa };
 VECT_VAR_DECL(expected_vld4_3,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
 						0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
+VECT_VAR_DECL(expected_vld4_3,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80,
+						  0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld4_3,hfloat,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
 						  0xaaaaaaaa, 0xaaaaaaaa };
 
@@ -252,6 +279,7 @@ VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 32, 2);
 VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 64, 2);
 VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 8, 2);
 VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 16, 2);
+VECT_VAR_DECL_INIT(buffer_vld2_lane, float, 16, 2);
 VECT_VAR_DECL_INIT(buffer_vld2_lane, float, 32, 2);
 
 /* Input buffers for vld3_lane */
@@ -265,6 +293,7 @@ VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 32, 3);
 VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 64, 3);
 VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 8, 3);
 VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 16, 3);
+VECT_VAR_DECL_INIT(buffer_vld3_lane, float, 16, 3);
 VECT_VAR_DECL_INIT(buffer_vld3_lane, float, 32, 3);
 
 /* Input buffers for vld4_lane */
@@ -278,6 +307,7 @@ VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 32, 4);
 VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 64, 4);
 VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 8, 4);
 VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 16, 4);
+VECT_VAR_DECL_INIT(buffer_vld4_lane, float, 16, 4);
 VECT_VAR_DECL_INIT(buffer_vld4_lane, float, 32, 4);
 
 void exec_vldX_lane (void)
@@ -335,7 +365,9 @@ void exec_vldX_lane (void)
   DECL_VLDX_LANE(uint, 16, 8, X);		\
   DECL_VLDX_LANE(uint, 32, 4, X);		\
   DECL_VLDX_LANE(poly, 16, 8, X);		\
+  DECL_VLDX_LANE(float, 16, 4, X);		\
   DECL_VLDX_LANE(float, 32, 2, X);		\
+  DECL_VLDX_LANE(float, 16, 8, X);		\
   DECL_VLDX_LANE(float, 32, 4, X)
 
   /* Add some padding to try to catch out of bound accesses.  */
@@ -360,7 +392,9 @@ void exec_vldX_lane (void)
   TEST_VLDX_LANE(q, uint, u, 16, 8, X, 5);	\
   TEST_VLDX_LANE(q, uint, u, 32, 4, X, 0);	\
   TEST_VLDX_LANE(q, poly, p, 16, 8, X, 5);	\
+  TEST_VLDX_LANE(, float, f, 16, 4, X, 2);	\
   TEST_VLDX_LANE(, float, f, 32, 2, X, 0);	\
+  TEST_VLDX_LANE(q, float, f, 16, 8, X, 6);	\
   TEST_VLDX_LANE(q, float, f, 32, 4, X, 2)
 
 #define TEST_ALL_EXTRA_CHUNKS(X, Y)		\
@@ -377,7 +411,9 @@ void exec_vldX_lane (void)
   TEST_EXTRA_CHUNK(uint, 16, 8, X, Y);		\
   TEST_EXTRA_CHUNK(uint, 32, 4, X, Y);		\
   TEST_EXTRA_CHUNK(poly, 16, 8, X, Y);		\
+  TEST_EXTRA_CHUNK(float, 16, 4, X, Y);		\
   TEST_EXTRA_CHUNK(float, 32, 2, X, Y);		\
+  TEST_EXTRA_CHUNK(float, 16, 8, X, Y);		\
   TEST_EXTRA_CHUNK(float, 32, 4, X, Y)
 
   /* vldX_lane supports only a subset of all variants.  */
@@ -419,7 +455,9 @@ void exec_vldX_lane (void)
   DUMMY_ARRAY(buffer_src, uint, 16, 8, 4);
   DUMMY_ARRAY(buffer_src, uint, 32, 4, 4);
   DUMMY_ARRAY(buffer_src, poly, 16, 8, 4);
+  DUMMY_ARRAY(buffer_src, float, 16, 4, 4);
   DUMMY_ARRAY(buffer_src, float, 32, 2, 4);
+  DUMMY_ARRAY(buffer_src, float, 16, 8, 4);
   DUMMY_ARRAY(buffer_src, float, 32, 4, 4);
 
   /* Check vld2_lane/vld2q_lane.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vset_lane.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vset_lane.c
index 5159406..99cdf6d 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vset_lane.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vset_lane.c
@@ -16,6 +16,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0x88 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					0xf4, 0xf5, 0x55, 0xf7 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff0, 0xfff1, 0x66, 0xfff3 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0x4840, 0xca80 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1800000, 0x4204cccd };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					0xf4, 0xf5, 0xf6, 0xf7,
@@ -41,6 +42,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					 0xfc, 0xfd, 0xdd, 0xff };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
 					 0xfff4, 0xfff5, 0xee, 0xfff7 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80,
+					   0xca00, 0x4480, 0xc900, 0xc880 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
 					   0xc1600000, 0x41333333 };
 
@@ -61,7 +64,9 @@ void exec_vset_lane (void)
 
   /* Initialize input "vector" from "buffer".  */
   TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector, buffer);
+  VLOAD(vector, buffer, , float, f, 16, 4);
   VLOAD(vector, buffer, , float, f, 32, 2);
+  VLOAD(vector, buffer, q, float, f, 16, 8);
   VLOAD(vector, buffer, q, float, f, 32, 4);
 
   /* Choose value and lane arbitrarily.  */
@@ -75,6 +80,7 @@ void exec_vset_lane (void)
   TEST_VSET_LANE(, uint, u, 64, 1, 0x88, 0);
   TEST_VSET_LANE(, poly, p, 8, 8, 0x55, 6);
   TEST_VSET_LANE(, poly, p, 16, 4, 0x66, 2);
+  TEST_VSET_LANE(, float, f, 16, 4, 8.5f, 2);
   TEST_VSET_LANE(, float, f, 32, 2, 33.2f, 1);
 
   TEST_VSET_LANE(q, int, s, 8, 16, 0x99, 15);
@@ -87,6 +93,7 @@ void exec_vset_lane (void)
   TEST_VSET_LANE(q, uint, u, 64, 2, 0x11, 1);
   TEST_VSET_LANE(q, poly, p, 8, 16, 0xDD, 14);
   TEST_VSET_LANE(q, poly, p, 16, 8, 0xEE, 6);
+  TEST_VSET_LANE(q, float, f, 16, 8, 4.5f, 5);
   TEST_VSET_LANE(q, float, f, 32, 4, 11.2f, 3);
 
   CHECK_RESULTS(TEST_MSG, "");
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1_lane.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1_lane.c
index 08583b8..51d3fbe 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1_lane.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1_lane.c
@@ -16,6 +16,7 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf6, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff2, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0xcb80, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1700000, 0x33333333 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0xff, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33,
@@ -42,6 +43,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0xfa, 0x33, 0x33, 0x33,
 					 0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff4, 0x3333, 0x3333, 0x3333,
 					 0x3333, 0x3333, 0x3333, 0x3333 };
+VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0xc900, 0x3333, 0x3333, 0x3333,
+					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1700000, 0x33333333,
 					   0x33333333, 0x33333333 };
 
@@ -69,6 +72,7 @@ void exec_vst1_lane (void)
   TEST_VST1_LANE(, uint, u, 64, 1, 0);
   TEST_VST1_LANE(, poly, p, 8, 8, 6);
   TEST_VST1_LANE(, poly, p, 16, 4, 2);
+  TEST_VST1_LANE(, float, f, 16, 4, 1);
   TEST_VST1_LANE(, float, f, 32, 2, 1);
 
   TEST_VST1_LANE(q, int, s, 8, 16, 15);
@@ -81,6 +85,7 @@ void exec_vst1_lane (void)
   TEST_VST1_LANE(q, uint, u, 64, 2, 0);
   TEST_VST1_LANE(q, poly, p, 8, 16, 10);
   TEST_VST1_LANE(q, poly, p, 16, 8, 4);
+  TEST_VST1_LANE(q, float, f, 16, 8, 6);
   TEST_VST1_LANE(q, float, f, 32, 4, 1);
 
   CHECK_RESULTS(TEST_MSG, "");

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 16/16][ARM/AArch64 Testsuite] Add test of vcvt{,_high}_{f16_f32,f32_f16}
  2015-07-07 12:32 [PATCH 0/16][ARM/AArch64] Float16_t support, v2 Alan Lawrence
                   ` (14 preceding siblings ...)
  2015-07-07 12:38 ` [PATCH 14/16][ARM/AArch64 testsuite] Update advsimd-intrinsics tests to add float16 vectors Alan Lawrence
@ 2015-07-07 12:39 ` Alan Lawrence
  15 siblings, 0 replies; 35+ messages in thread
From: Alan Lawrence @ 2015-07-07 12:39 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 799 bytes --]

This is a respin of https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01349.html . 
Changes are to:

use #if defined(__aarch64__) rather than __ARM_64BIT_STATE__;
add an initial call to clean_results;
use a different mechanism for adding -mfpu=neon-fp16 on ARM. Specifically, we
now try to add that flag for all tests, as AFAICT it is valid anywhere
-mfpu=neon is valid, and we bail out of the vcvt_f16 test, the only test that
actually requires fp16 H/W, if that was unsuccessful, e.g. because a
conflicting -mfpu=neon was forced on the command line. (This matters because
the rightmost -mfpu option overrides any earlier ones; the guard is shown
below.)
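
Concretely, the bail-out takes the form of an effective-target guard, which
is the first line of the new vcvt_f16.c in the patch below:

/* { dg-require-effective-target arm_neon_fp16_hw_ok { target { arm*-*-* } } } */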

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/advsimd-intrinsics/advsimd-intrinsics.exp:
	Set additional flags for neon-fp16 support.
	* gcc.target/aarch64/advsimd-intrinsics/vcvt_f16.c: New.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 16_test_vcvt.patch --]
[-- Type: text/x-patch; name=16_test_vcvt.patch, Size: 5467 bytes --]

commit e6cc7467ddf5702d3a122b8ac4163621d0164b37
Author: Alan Lawrence <alan.lawrence@arm.com>
Date:   Wed Jan 28 13:02:22 2015 +0000

    v2 Test vcvt{,_high on aarch64}_f{32_f16,16_f32}, with neon-fp16 for ARM targets.
    
    v2a: #if defined(__aarch64__); + clean_results(); fp16 opts for ARM; fp16_hw_ok

diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/advsimd-intrinsics.exp b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/advsimd-intrinsics.exp
index ceada83..5f5e1fe 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/advsimd-intrinsics.exp
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/advsimd-intrinsics.exp
@@ -52,8 +52,10 @@ if {[istarget arm*-*-*]} then {
 torture-init
 set-torture-options $C_TORTURE_OPTIONS {{}} $LTO_TORTURE_OPTIONS
 
-# Make sure Neon flags are provided, if necessary.
-set additional_flags [add_options_for_arm_neon ""]
+# Make sure Neon flags are provided, if necessary. We try to add FP16 flags
+# for all tests; tests requiring FP16 will abort if a non-FP16 option
+# was forced.
+set additional_flags [add_options_for_arm_neon_fp16 ""]
 
 # Main loop.
 gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.c]] \
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvt_f16.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvt_f16.c
new file mode 100644
index 0000000..7a1c256
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvt_f16.c
@@ -0,0 +1,98 @@
+/* { dg-require-effective-target arm_neon_fp16_hw_ok { target { arm*-*-* } } } */
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+#include <math.h>
+
+/* Expected results for vcvt.  */
+VECT_VAR_DECL (expected,hfloat,32,4) [] = { 0x41800000, 0x41700000,
+					    0x41600000, 0x41500000 };
+VECT_VAR_DECL (expected,hfloat,16,4) [] = { 0x3e00, 0x4100, 0x4300, 0x4480 };
+
+/* Expected results for vcvt_high_f32_f16.  */
+VECT_VAR_DECL (expected_high,hfloat,32,4) [] = { 0xc1400000, 0xc1300000,
+						 0xc1200000, 0xc1100000 };
+/* Expected results for vcvt_high_f16_f32.  */
+VECT_VAR_DECL (expected_high,hfloat,16,8) [] = { 0x4000, 0x4000, 0x4000, 0x4000,
+						 0xcc00, 0xcb80, 0xcb00, 0xca80 };
+
+void
+exec_vcvt (void)
+{
+  clean_results ();
+
+#define TEST_MSG vcvt_f32_f16
+  {
+    VECT_VAR_DECL (buffer_src, float, 16, 4) [] = { 16.0, 15.0, 14.0, 13.0 };
+
+    DECL_VARIABLE (vector_src, float, 16, 4);
+
+    VLOAD (vector_src, buffer_src, , float, f, 16, 4);
+    DECL_VARIABLE (vector_res, float, 32, 4) =
+      vcvt_f32_f16 (VECT_VAR (vector_src, float, 16, 4));
+    vst1q_f32 (VECT_VAR (result, float, 32, 4),
+	       VECT_VAR (vector_res, float, 32, 4));
+
+    CHECK_FP (TEST_MSG, float, 32, 4, PRIx32, expected, "");
+  }
+#undef TEST_MSG
+
+  clean_results ();
+
+#define TEST_MSG vcvt_f16_f32
+  {
+    VECT_VAR_DECL (buffer_src, float, 32, 4) [] = { 1.5, 2.5, 3.5, 4.5 };
+    DECL_VARIABLE (vector_src, float, 32, 4);
+
+    VLOAD (vector_src, buffer_src, q, float, f, 32, 4);
+    DECL_VARIABLE (vector_res, float, 16, 4) =
+      vcvt_f16_f32 (VECT_VAR (vector_src, float, 32, 4));
+    vst1_f16 (VECT_VAR (result, float, 16, 4),
+	      VECT_VAR (vector_res, float, 16, 4));
+
+    CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected, "");
+  }
+#undef TEST_MSG
+
+#if defined(__aarch64__)
+  clean_results ();
+
+#define TEST_MSG "vcvt_high_f32_f16"
+  {
+    DECL_VARIABLE (vector_src, float, 16, 8);
+    VLOAD (vector_src, buffer, q, float, f, 16, 8);
+    DECL_VARIABLE (vector_res, float, 32, 4);
+    VECT_VAR (vector_res, float, 32, 4) =
+      vcvt_high_f32_f16 (VECT_VAR (vector_src, float, 16, 8));
+    vst1q_f32 (VECT_VAR (result, float, 32, 4),
+	       VECT_VAR (vector_res, float, 32, 4));
+    CHECK_FP (TEST_MSG, float, 32, 4, PRIx32, expected_high, "");
+  }
+#undef TEST_MSG
+  clean_results ();
+
+#define TEST_MSG "vcvt_high_f16_f32"
+  {
+    DECL_VARIABLE (vector_low, float, 16, 4);
+    VDUP (vector_low, , float, f, 16, 4, 2.0);
+
+    DECL_VARIABLE (vector_src, float, 32, 4);
+    VLOAD (vector_src, buffer, q, float, f, 32, 4);
+
+    DECL_VARIABLE (vector_res, float, 16, 8) =
+      vcvt_high_f16_f32 (VECT_VAR (vector_low, float, 16, 4),
+			 VECT_VAR (vector_src, float, 32, 4));
+    vst1q_f16 (VECT_VAR (result, float, 16, 8),
+	       VECT_VAR (vector_res, float, 16, 8));
+
+    CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_high, "");
+  }
+#endif
+}
+
+int
+main (void)
+{
+  exec_vcvt ();
+  return 0;
+}
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index f0c209f..591e022 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -2705,6 +2705,21 @@ proc check_effective_target_arm_neon_fp16_ok { } {
 		check_effective_target_arm_neon_fp16_ok_nocache]
 }
 
+proc check_effective_target_arm_neon_fp16_hw_ok { } {
+    if {! [check_effective_target_arm_neon_fp16_ok] } {
+        return 0
+    }
+    global et_arm_neon_fp16_flags
+    check_runtime_nocache arm_neon_fp16_hw_ok {
+	int
+	main (int argc, char **argv)
+	{
+	  asm ("vcvt.f32.f16 q1, d0");
+	  return 0;
+	}
+    } $et_arm_neon_fp16_flags
+}
+
 proc add_options_for_arm_neon_fp16 { flags } {
     if { ! [check_effective_target_arm_neon_fp16_ok] } {
 	return "$flags"

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 3/16][ARM] Add float16x4_t intrinsics
  2015-07-07 12:34 ` [PATCH 3/16][ARM] Add float16x4_t intrinsics Alan Lawrence
@ 2015-07-07 13:09   ` Kyrill Tkachov
  2015-07-07 16:22     ` Kyrill Tkachov
  0 siblings, 1 reply; 35+ messages in thread
From: Kyrill Tkachov @ 2015-07-07 13:09 UTC (permalink / raw)
  To: Alan Lawrence, gcc-patches

Hi Alan,

On 07/07/15 13:34, Alan Lawrence wrote:
> As per https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01335.html

For some context, the reference for these is at:
http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf

This patch is ok once you and Charles decide on how to proceed with the two prerequisites.

Thanks,
Kyrill

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 3/16][ARM] Add float16x4_t intrinsics
  2015-07-07 13:09   ` Kyrill Tkachov
@ 2015-07-07 16:22     ` Kyrill Tkachov
  2015-07-07 16:34       ` Alan Lawrence
  0 siblings, 1 reply; 35+ messages in thread
From: Kyrill Tkachov @ 2015-07-07 16:22 UTC (permalink / raw)
  To: Alan Lawrence, gcc-patches


On 07/07/15 14:09, Kyrill Tkachov wrote:
> Hi Alan,
>
> On 07/07/15 13:34, Alan Lawrence wrote:
>> As per https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01335.html
> For some context, the reference for these is at:
> http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf
>
> This patch is ok once you and Charles decide on how to proceed with the two prerequisites.

On second thought, the ACLE document at http://infocenter.arm.com/help/topic/com.arm.doc.ihi0053c/IHI0053C_acle_2_0.pdf

says in 12.2.1:
"float16 types are only available when the __fp16 type is defined, i.e. when supported by the hardware"

This indicates that float16 type and intrinsic availability should be gated on the availability of fp16
in the specified -mfpu. Look at some existing intrinsics like vcvt_f16_f32 for a way to gate these.
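
A rough sketch of what such gating could look like (this assumes the ACLE
feature-test macros __ARM_FP16_FORMAT_IEEE / __ARM_FP16_FORMAT_ALTERNATIVE
are the right condition; the exact gate is part of the question here):

#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
/* The float16 scalar and vector types, and their intrinsics, would only
   be visible inside this guard.  */
#endif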

I notice that the float16x4_t type is unconditionally defined in our arm_neon.h, however.
I think this is a bug and its definition should be #ifdef'd properly as well.

Thanks,

Kyrill

>
> Thanks,
> Kyrill
>

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 3/16][ARM] Add float16x4_t intrinsics
  2015-07-07 16:22     ` Kyrill Tkachov
@ 2015-07-07 16:34       ` Alan Lawrence
  2015-07-07 16:52         ` Kyrill Tkachov
  0 siblings, 1 reply; 35+ messages in thread
From: Alan Lawrence @ 2015-07-07 16:34 UTC (permalink / raw)
  To: Kyrill Tkachov; +Cc: gcc-patches

Kyrill Tkachov wrote:
> On 07/07/15 14:09, Kyrill Tkachov wrote:
>> Hi Alan,
>>
>> On 07/07/15 13:34, Alan Lawrence wrote:
>>> As per https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01335.html
>> For some context, the reference for these is at:
>> http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf
>>
>> This patch is ok once you and Charles decide on how to proceed with the two prerequisites.
> 
> On second thought, the ACLE document at http://infocenter.arm.com/help/topic/com.arm.doc.ihi0053c/IHI0053C_acle_2_0.pdf
> 
> says in 12.2.1:
> "float16 types are only available when the __fp16 type is defined, i.e. when supported by the hardware"

However, we support __fp16 whenever the user specifies -mfp16-format=ieee or 
-mfp16-format=alternative, regardless of whether we have hardware support or not.

(Without hardware support, gcc generates calls to  __gnu_f2h_ieee or 
__gnu_f2h_alternative instead of vcvtb.f16.f32, and  __gnu_h2f_ieee or 
__gnu_h2f_alternative instead of vcvtb.f32.f16. However, there is no way to 
support __fp16 just using those hardware instructions without caring about which 
format is in use.)

Thus we cannot be consistent with both sides of that 'i.e.', unless we also 
change when __fp16 is available.

> I notice that the float16x4_t is unconditionally defined in our arm_neon.h, however.
> I think this is a bug and its definition should be #ifdef'd properly as well.

Hmmm. Is this becoming a question of, which potentially-existing code do we want 
to break???

Cheers, Alan

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 3/16][ARM] Add float16x4_t intrinsics
  2015-07-07 16:34       ` Alan Lawrence
@ 2015-07-07 16:52         ` Kyrill Tkachov
  2015-07-07 17:17           ` Alan Lawrence
  0 siblings, 1 reply; 35+ messages in thread
From: Kyrill Tkachov @ 2015-07-07 16:52 UTC (permalink / raw)
  To: Alan Lawrence
  Cc: gcc-patches, Ramana Radhakrishnan, Tejas Belagod, Richard Earnshaw


On 07/07/15 17:34, Alan Lawrence wrote:
> Kyrill Tkachov wrote:
>> On 07/07/15 14:09, Kyrill Tkachov wrote:
>>> Hi Alan,
>>>
>>> On 07/07/15 13:34, Alan Lawrence wrote:
>>>> As per https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01335.html
>>> For some context, the reference for these is at:
>>> http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf
>>>
>>> This patch is ok once you and Charles decide on how to proceed with the two prerequisites.
>> On second thought, the ACLE document at http://infocenter.arm.com/help/topic/com.arm.doc.ihi0053c/IHI0053C_acle_2_0.pdf
>>
>> says in 12.2.1:
>> "float16 types are only available when the __fp16 type is defined, i.e. when supported by the hardware"
> However, we support __fp16 whenever the user specifies -mfp16-format=ieee or
> -mfp16-format=alternative, regardless of whether we have hardware support or not.
>
> (Without hardware support, gcc generates calls to  __gnu_f2h_ieee or
> __gnu_f2h_alternative instead of vcvtb.f16.f32, and  __gnu_h2f_ieee or
> __gnu_h2f_alternative instead of vcvtb.f32.f16. However, there is no way to
> support __fp16 just using those hardware instructions without caring about which
> format is in use.)

Hmmm... In my opinion intrinsics should aim to map to instructions rather than go away and
call library functions, but this is the existing functionality
that current users might depend on :(

>
> Thus we cannot be consistent with both sides of that 'i.e.', unless we also
> change when __fp16 is available.
>
>> I notice that the float16x4_t is unconditionally defined in our arm_neon.h, however.
>> I think this is a bug and its definition should be #ifdef'd properly as well.
> Hmmm. Is this becoming a question of, which potentially-existing code do we want
> to break???

CC'ing the ARM maintainers and Tejas for an ACLE perspective.
I think that we'd want to gate the definition of __fp16 on hardware availability as well
(the -mfpu option) rather than just arm_fp16_format but I'm not sure of the impact this will have
on existing users.

Kyrill

>
> Cheers, Alan

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 3/16][ARM] Add float16x4_t intrinsics
  2015-07-07 16:52         ` Kyrill Tkachov
@ 2015-07-07 17:17           ` Alan Lawrence
  2015-07-08  8:35             ` Ramana Radhakrishnan
  0 siblings, 1 reply; 35+ messages in thread
From: Alan Lawrence @ 2015-07-07 17:17 UTC (permalink / raw)
  To: Kyrill Tkachov
  Cc: gcc-patches, Ramana Radhakrishnan, Tejas Belagod, Richard Earnshaw

Kyrill Tkachov wrote:
> On 07/07/15 17:34, Alan Lawrence wrote:
>> Kyrill Tkachov wrote:
>>> On 07/07/15 14:09, Kyrill Tkachov wrote:
>>>> Hi Alan,
>>>>
>>>> On 07/07/15 13:34, Alan Lawrence wrote:
>>>>> As per https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01335.html
>>>> For some context, the reference for these is at:
>>>> http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf
>>>>
>>>> This patch is ok once you and Charles decide on how to proceed with the two prerequisites.
>>> On second thought, the ACLE document at http://infocenter.arm.com/help/topic/com.arm.doc.ihi0053c/IHI0053C_acle_2_0.pdf
>>>
>>> says in 12.2.1:
>>> "float16 types are only available when the __fp16 type is defined, i.e. when supported by the hardware"
>> However, we support __fp16 whenever the user specifies -mfp16-format=ieee or
>> -mfp16-format=alternative, regardless of whether we have hardware support or not.
>>
>> (Without hardware support, gcc generates calls to  __gnu_f2h_ieee or
>> __gnu_f2h_alternative instead of vcvtb.f16.f32, and  __gnu_h2f_ieee or
>> __gnu_h2f_alternative instead of vcvtb.f32.f16. However, there is no way to
>> support __fp16 just using those hardware instructions without caring about which
>> format is in use.)
> 
> Hmmm... In my opinion intrinsics should aim to map to instructions rather than go away and
> call library functions, but this is the existing functionality
> that current users might depend on :(

Sorry - to clarify: currently we generate __gnu_f2h_ieee / __gnu_h2f_ieee, to 
convert between single __fp16 and 'float' values, when there is no HW. General 
operations on scalar __fp16 values are performed by converting to float, 
performing operations on float, and converting back. The __fp16 type is 
available and "usable" without HW support, but only when -mfp16-format is specified.

(The existing) intrinsics operating on float16x[48] vectors (converting to/from 
float32x4) are *not* available without hardware support; these intrinsics *are* 
available without specifying -mfp16-format.

ACLE (4.1.2) allows toolchains to provide __fp16 when not implemented in HW, 
even if this is not required.

> CC'ing the ARM maintainers and Tejas for an ACLE perspective.
> I think that we'd want to gate the definition of __fp16 on hardware availability as well
> (the -mfpu option) rather than just arm_fp16_format but I'm not sure of the impact this will have
> on existing users.

Sure....but do we require -mfpu *and* -mfp16-format? s/and/or/ ?   Do we require 
-mfp16-format for float16x[48] intrinsics, or allow format-agnostic code (as HW 
support allows us to!)?

I don't have very strong opinions as to which way we should go; I merely tried 
to be consistent with the existing codebase and to support as much code as 
possible, although I agree I ignored cases where defining functions unexpectedly 
might cause problems.

Cheers, Alan

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 15/16][fold-const.c] Fix bigendian HFmode in native_interpret_real
  2015-07-07 12:37 ` [PATCH 15/16][fold-const.c] Fix bigendian HFmode in native_interpret_real Alan Lawrence
@ 2015-07-07 22:06   ` Jeff Law
  2015-07-08  9:43     ` Richard Biener
  0 siblings, 1 reply; 35+ messages in thread
From: Jeff Law @ 2015-07-07 22:06 UTC (permalink / raw)
  To: Alan Lawrence, gcc-patches; +Cc: Jakub Jelinek

On 07/07/2015 06:37 AM, Alan Lawrence wrote:
> As per https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01346.html. Fixes
> FAIL of advsimd-intrinsics vcreate.c on aarch64_be-none-elf from
> previous patch.
>
> 15_native_interpret_real.patch
>
>
> commit e2e7ca148960a82fc88128820f17e7cbd14173cb
> Author: Alan Lawrence<alan.lawrence@arm.com>
> Date:   Thu Apr 9 10:54:40 2015 +0100
>
>      Fix native_interpret_real for HFmode floats on Bigendian with UNITS_PER_WORD>=4
>
>      (with missing space)
OK with ChangeLog in proper form.

jeff

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 3/16][ARM] Add float16x4_t intrinsics
  2015-07-07 17:17           ` Alan Lawrence
@ 2015-07-08  8:35             ` Ramana Radhakrishnan
  2015-07-27 13:22               ` Alan Lawrence
  0 siblings, 1 reply; 35+ messages in thread
From: Ramana Radhakrishnan @ 2015-07-08  8:35 UTC (permalink / raw)
  To: Alan Lawrence, Kyrill Tkachov
  Cc: gcc-patches, Ramana Radhakrishnan, Tejas Belagod, Richard Earnshaw

I haven't seen the patch yet but here are my thoughts on where this should be going.

On 07/07/15 18:17, Alan Lawrence wrote:
> Kyrill Tkachov wrote:
>> On 07/07/15 17:34, Alan Lawrence wrote:
>>> Kyrill Tkachov wrote:
>>>> On 07/07/15 14:09, Kyrill Tkachov wrote:
>>>>> Hi Alan,
>>>>>
>>>>> On 07/07/15 13:34, Alan Lawrence wrote:
>>>>>> As per https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01335.html
>>>>> For some context, the reference for these is at:
>>>>> http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf
>>>>>
>>>>> This patch is ok once you and Charles decide on how to proceed with the two prerequisites.
>>>> On second thought, the ACLE document at http://infocenter.arm.com/help/topic/com.arm.doc.ihi0053c/IHI0053C_acle_2_0.pdf
>>>>
>>>> says in 12.2.1:
>>>> "float16 types are only available when the __fp16 type is defined, i.e. when supported by the hardware"
>>> However, we support __fp16 whenever the user specifies -mfp16-format=ieee or
>>> -mfp16-format=alternative, regardless of whether we have hardware support or not.
>>>
>>> (Without hardware support, gcc generates calls to  __gnu_f2h_ieee or
>>> __gnu_f2h_alternative instead of vcvtb.f16.f32, and  __gnu_h2f_ieee or
>>> __gnu_h2f_alternative instead of vcvtb.f32.f16. However, there is no way to
>>> support __fp16 just using those hardware instructions without caring about which
>>> format is in use.)
>>
>> Hmmm... In my opinion intrinsics should aim to map to instructions rather than go away and
>> call library functions, but this is the existing functionality
>> that current users might depend on :(
> 
> Sorry - to clarify: currently we generate __gnu_f2h_ieee / __gnu_h2f_ieee, to convert between single __fp16 and 'float' values, when there is no HW. General operations on scalar __fp16 values are performed by converting to float, performing operations on float, and converting back. The __fp16 type is available and "usable" without HW support, but only when -mfp16-format is specified.
> 
> (The existing) intrinsics operating on float16x[48] vectors (converting to/from float32x4) are *not* available without hardware support; these intrinsics *are* available without specifying -mfp16-format.
> 
> ACLE (4.1.2) allows toolchains to provide __fp16 when not implemented in HW, even if this is not required.

The type should exist with the presence of the SIMD unit and all the intrinsics that treat this as a bag of bits should just work (TM). The only intrinsics to be guarded by mfpu=neon-fp16 should really be the intrinsics for the instructions that interpret the 16 bits as float16 types.

> 
>> CC'ing the ARM maintainers and Tejas for an ACLE perspective.
>> I think that we'd want to gate the definition of __fp16 on hardware availability as well
>> (the -mfpu option) rather than just arm_fp16_format but I'm not sure of the impact this will have
>> on existing users.

This is just a storage format in the scalar world and the ACLE allows folks to have fp16 support without hardware. There are helper routines for that which were put in in the first place for this purpose.

> 
> Sure....but do we require -mfpu *and* -mfp16-format? s/and/or/ ?   Do we require -mfp16-format for float16x[48] intrinsics, or allow format-agnostic code (as HW support allows us to!)?
> 

I'd say we require the -mfpu option for the intrinsics that interpret the float16 type, but the float16 format has no bearing here: the actual instruction emitted does the right thing as per the format specified by the AHP bit in the FPSCR. This is unlike the scalar case, where the compiler *needs* to know the fp16 format that the user intended to use in order to call the correct emulation function.

Thus in summary - 

1. -mfpu=neon implies the presence of the float16x(4/8) types and all the intrinsics that treat these values as bags of bits.
2. -mfpu=neon-fp16 implies the presence of the vcvt* intrinsics that are needed for the float16 types.
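
As a concrete illustration of that split (a sketch only; intrinsic names as used elsewhere in this series, not committed behaviour):

#include <arm_neon.h>

/* Under the proposal, vcombine_f16 would need only -mfpu=neon (lanes
   treated as bags of bits), while vcvt_f32_f16 interprets the lanes as
   float16 and so would need -mfpu=neon-fp16.  */
float32x4_t
use_fp16 (float16x4_t lo, float16x4_t hi)
{
  float16x8_t v = vcombine_f16 (lo, hi);
  (void) v;
  return vcvt_f32_f16 (lo);
}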

Thoughts ?

regards
Ramana





> 
> Cheers, Alan
> 

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 15/16][fold-const.c] Fix bigendian HFmode in native_interpret_real
  2015-07-07 22:06   ` Jeff Law
@ 2015-07-08  9:43     ` Richard Biener
  2015-07-08 10:51       ` Alan Lawrence
  2015-07-09  2:20       ` Jeff Law
  0 siblings, 2 replies; 35+ messages in thread
From: Richard Biener @ 2015-07-08  9:43 UTC (permalink / raw)
  To: Jeff Law; +Cc: Alan Lawrence, gcc-patches, Jakub Jelinek

On Wed, Jul 8, 2015 at 12:07 AM, Jeff Law <law@redhat.com> wrote:
> On 07/07/2015 06:37 AM, Alan Lawrence wrote:
>>
>> As per https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01346.html. Fixes
>> FAIL of advsimd-intrinsics vcreate.c on aarch64_be-none-elf from
>> previous patch.
>>
>> 15_native_interpret_real.patch
>>
>>
>> commit e2e7ca148960a82fc88128820f17e7cbd14173cb
>> Author: Alan Lawrence<alan.lawrence@arm.com>
>> Date:   Thu Apr 9 10:54:40 2015 +0100
>>
>>      Fix native_interpret_real for HFmode floats on Bigendian with
>> UNITS_PER_WORD>=4
>>
>>      (with missing space)
>
> OK with ChangeLog in proper form.

Err - but now offset can become negative?  Shouldn't it rather error out
before as it requires at least 4 bytes for big-endian?

That said - the whole thing looks like it doesn't expect GET_MODE_SIZE < 4
and your "fix" is just very obfuscated (if it really is a fix).

So, please cleanup the thing properly instead or at least add a big fat
comment.  There is the magic '3' in the line following yours as well.

Thanks,
Richard.

> jeff
>

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 15/16][fold-const.c] Fix bigendian HFmode in native_interpret_real
  2015-07-08  9:43     ` Richard Biener
@ 2015-07-08 10:51       ` Alan Lawrence
  2015-07-08 12:44         ` Richard Biener
  2015-07-09  2:20       ` Jeff Law
  1 sibling, 1 reply; 35+ messages in thread
From: Alan Lawrence @ 2015-07-08 10:51 UTC (permalink / raw)
  To: Richard Biener; +Cc: Jeff Law, gcc-patches, Jakub Jelinek

Richard Biener wrote:
> On Wed, Jul 8, 2015 at 12:07 AM, Jeff Law <law@redhat.com> wrote:
>> On 07/07/2015 06:37 AM, Alan Lawrence wrote:
[snip]
>>>      Fix native_interpret_real for HFmode floats on Bigendian with
>>> UNITS_PER_WORD>=4
>>>
>>>      (with missing space)
>> OK with ChangeLog in proper form.
> 
> Err - but now offset can become negative?  Shouldn't it rather error out
> before as it requires at least 4 bytes for big-endian?

I don't think the offset can ever be negative; my reasoning is:

total_bytes = GET_MODE_SIZE (TYPE_MODE (type)) [set just before loop]
bitpos < total_bytes * BITS_PER_UNIT [condition of for loop]
byte = (bitpos / BITS_PER_UNIT) & 3 [first statement inside for loop]

==> byte <= 3 && byte <= total_bytes - 1
==> byte <= MIN (3, total_bytes - 1)

so the new offset, MIN (3, total_bytes - 1) - byte, is never negative.
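
A standalone way to sanity-check that (a sketch, not part of GCC; it assumes 
BITS_PER_UNIT == 8 and re-runs the loop's index arithmetic for every mode size 
the function accepts):

#include <assert.h>

int
main (void)
{
  for (int total_bytes = 1; total_bytes <= 24; total_bytes++)
    for (int bitpos = 0; bitpos < total_bytes * 8; bitpos += 8)
      {
        int byte = (bitpos / 8) & 3;
        int top = total_bytes - 1 < 3 ? total_bytes - 1 : 3;
        /* The big-endian offset is MIN (3, total_bytes - 1) - byte.  */
        assert (top - byte >= 0);
      }
  return 0;
}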

> That said - the whole thing looks like it doesn't expect GET_MODE_SIZE < 4
> and your "fix" is just very obfuscated (if it really is a fix).
> 
> So, please cleanup the thing properly instead or at least add a big fat
> comment.  There is the magic '3' in the line following yours as well.

Ok, I'll try to clean up; I admit I'm not sure what all that code does 
(particularly if UNITS_PER_WORD < 4!)...

--Alan

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 15/16][fold-const.c] Fix bigendian HFmode in native_interpret_real
  2015-07-08 10:51       ` Alan Lawrence
@ 2015-07-08 12:44         ` Richard Biener
  0 siblings, 0 replies; 35+ messages in thread
From: Richard Biener @ 2015-07-08 12:44 UTC (permalink / raw)
  To: Alan Lawrence; +Cc: Jeff Law, gcc-patches, Jakub Jelinek

On Wed, Jul 8, 2015 at 12:51 PM, Alan Lawrence <alan.lawrence@arm.com> wrote:
> Richard Biener wrote:
>>
>> On Wed, Jul 8, 2015 at 12:07 AM, Jeff Law <law@redhat.com> wrote:
>>>
>>> On 07/07/2015 06:37 AM, Alan Lawrence wrote:
>
> [snip]
>>>>
>>>>      Fix native_interpret_real for HFmode floats on Bigendian with
>>>> UNITS_PER_WORD>=4
>>>>
>>>>      (with missing space)
>>>
>>> OK with ChangeLog in proper form.
>>
>>
>> Err - but now offset can become negative?  Shouldn't it rather error out
>> before as it requires at least 4 bytes for big-endian?
>
>
> I don't think the offset can ever be negative; my reasoning is:
>
> total_bytes = GET_MODE_SIZE (TYPE_MODE (type)) [set just before loop]
> bitpos < total_bytes * BITS_PER_UNIT [condition of for loop]
> byte = (bitpos / BITS_PER_UNIT) & 3 [first statement inside for loop]
>
> ==> byte <= 3 && byte <= total_bytes - 1
> ==> byte <= MIN (3, total_bytes - 1)
>
>> That said - the whole thing looks like it doesn't expect GET_MODE_SIZE < 4
>> and your "fix" is just very obfuscated (if it really is a fix).
>>
>> So, please cleanup the thing properly instead or at least add a big fat
>> comment.  There is the magic '3' in the line following yours as well.
>
>
> Ok, I'll try to cleanup, I admit I'm not sure what all that code does
> (particularly if UNITS_PER_WORD < 4 !)...

Yeah, me neither - but I'm trying to at least make sure it doesn't get worse ;)

Richard.

> --Alan
>

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 15/16][fold-const.c] Fix bigendian HFmode in native_interpret_real
  2015-07-08  9:43     ` Richard Biener
  2015-07-08 10:51       ` Alan Lawrence
@ 2015-07-09  2:20       ` Jeff Law
  2015-07-09  9:34         ` Alan Lawrence
  1 sibling, 1 reply; 35+ messages in thread
From: Jeff Law @ 2015-07-09  2:20 UTC (permalink / raw)
  To: Richard Biener; +Cc: Alan Lawrence, gcc-patches, Jakub Jelinek

On 07/08/2015 03:43 AM, Richard Biener wrote:
> On Wed, Jul 8, 2015 at 12:07 AM, Jeff Law <law@redhat.com> wrote:
>> On 07/07/2015 06:37 AM, Alan Lawrence wrote:
>>>
>>> As per https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01346.html. Fixes
>>> FAIL of advsimd-intrinsics vcreate.c on aarch64_be-none-elf from
>>> previous patch.
>>>
>>> 15_native_interpret_real.patch
>>>
>>>
>>> commit e2e7ca148960a82fc88128820f17e7cbd14173cb
>>> Author: Alan Lawrence<alan.lawrence@arm.com>
>>> Date:   Thu Apr 9 10:54:40 2015 +0100
>>>
>>>       Fix native_interpret_real for HFmode floats on Bigendian with
>>> UNITS_PER_WORD>=4
>>>
>>>       (with missing space)
>>
>> OK with ChangeLog in proper form.
>
> Err - but now offset can become negative?  Shouldn't it rather error out
> before as it requires at least 4 bytes for big-endian?
I managed to convince myself the value wouldn't be negative when reviewing.

>
> That said - the whole thing looks like it doesn't expect GET_MODE_SIZE < 4
> and your "fix" is just very obfuscated (if it really is a fix).
While I couldn't convince myself the function as a whole was prepared 
for smaller objects, I don't think Alan's patch made things worse.  One 
could argue the whole bloody thing ought to be rewritten though.

I'd also managed to convince myself the other instances of "3" weren't 
problematical.

jeff

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 15/16][fold-const.c] Fix bigendian HFmode in native_interpret_real
  2015-07-09  2:20       ` Jeff Law
@ 2015-07-09  9:34         ` Alan Lawrence
  2015-07-09  9:48           ` Richard Biener
  0 siblings, 1 reply; 35+ messages in thread
From: Alan Lawrence @ 2015-07-09  9:34 UTC (permalink / raw)
  To: Jeff Law; +Cc: Richard Biener, gcc-patches, Jakub Jelinek

[-- Attachment #1: Type: text/plain, Size: 2207 bytes --]

Jeff Law wrote:
> On 07/08/2015 03:43 AM, Richard Biener wrote:
>> On Wed, Jul 8, 2015 at 12:07 AM, Jeff Law <law@redhat.com> wrote:
>>> On 07/07/2015 06:37 AM, Alan Lawrence wrote:
>>>> As per https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01346.html. Fixes
>>>> FAIL of advsimd-intrinsics vcreate.c on aarch64_be-none-elf from
>>>> previous patch.
>>>>
>>>> 15_native_interpret_real.patch
>>>>
>>>>
>>>> commit e2e7ca148960a82fc88128820f17e7cbd14173cb
>>>> Author: Alan Lawrence<alan.lawrence@arm.com>
>>>> Date:   Thu Apr 9 10:54:40 2015 +0100
>>>>
>>>>       Fix native_interpret_real for HFmode floats on Bigendian with
>>>> UNITS_PER_WORD>=4
>>>>
>>>>       (with missing space)
>>> OK with ChangeLog in proper form.
>> Err - but now offset can become negative?  Shouldn't it rather error out
>> before as it requires at least 4 bytes for big-endian?
> I managed to convince myself the value wouldn't be negative when reviewing.
> 
>> That said - the whole thing looks like it doesn't expect GET_MODE_SIZE < 4
>> and your "fix" is just very obfuscated (if it really is a fix).
> While I couldn't convince myself the function as a whole was prepared 
> for smaller objects, I don't think Alan's patch made things worse.  One 
> could argue the whole bloody thing ought to be rewritten though.
> 
> I'd also managed to convince myself the other instances of "3" weren't 
> problematical.
> 
> jeff
> 

In response to Richard's comments, may I propose the attached patch instead?

I used the term "long" because of the earlier comment:
   /* There are always 32 bits in each long, no matter the size of
      the hosts long.  We handle floating point representations with
      up to 192 bits.  */
with which I really don't think I want to mess at this point ;). However, I'd be 
happy to change my use of "long" to "32 bits", "4 bytes", or "group of 4" 
instead if one of those was preferable!

Bootstrapped + check-gcc on x86_64 (no change); cross-tested on 
aarch64_be-none-elf (where it fixes the advsimd-intrinsics float16 test)

gcc/ChangeLog (as before):

	* fold-const.c (native_interpret_real): Fix HFmode for bigendian where
	UNITS_PER_WORD >= 4.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 15_native_interpret_real_v2.patch --]
[-- Type: text/x-patch; name=15_native_interpret_real_v2.patch, Size: 2129 bytes --]

diff --git a/gcc/fold-const.c b/gcc/fold-const.c
index e61d9463462746f45c96a0e7a154fb45ed026f43..518780e6cd2724002f0c7a805aa7742b6374600c 100644
--- a/gcc/fold-const.c
+++ b/gcc/fold-const.c
@@ -7592,7 +7592,6 @@ native_interpret_real (tree type, const unsigned char *ptr, int len)
 {
   machine_mode mode = TYPE_MODE (type);
   int total_bytes = GET_MODE_SIZE (mode);
-  int byte, offset, word, words, bitpos;
   unsigned char value;
   /* There are always 32 bits in each long, no matter the size of
      the hosts long.  We handle floating point representations with
@@ -7603,16 +7602,18 @@ native_interpret_real (tree type, const unsigned char *ptr, int len)
   total_bytes = GET_MODE_SIZE (TYPE_MODE (type));
   if (total_bytes > len || total_bytes > 24)
     return NULL_TREE;
-  words = (32 / BITS_PER_UNIT) / UNITS_PER_WORD;
+  int words = (32 / BITS_PER_UNIT) / UNITS_PER_WORD;
 
   memset (tmp, 0, sizeof (tmp));
-  for (bitpos = 0; bitpos < total_bytes * BITS_PER_UNIT;
+  for (int bitpos = 0; bitpos < total_bytes * BITS_PER_UNIT;
        bitpos += BITS_PER_UNIT)
     {
-      byte = (bitpos / BITS_PER_UNIT) & 3;
+      /* Both OFFSET and BYTE index within a long;
+	 bitpos indexes the whole float.  */
+      int offset, byte = (bitpos / BITS_PER_UNIT) & 3;
       if (UNITS_PER_WORD < 4)
 	{
-	  word = byte / UNITS_PER_WORD;
+	  int word = byte / UNITS_PER_WORD;
 	  if (WORDS_BIG_ENDIAN)
 	    word = (words - 1) - word;
 	  offset = word * UNITS_PER_WORD;
@@ -7622,7 +7623,16 @@ native_interpret_real (tree type, const unsigned char *ptr, int len)
 	    offset += byte % UNITS_PER_WORD;
 	}
       else
-	offset = BYTES_BIG_ENDIAN ? 3 - byte : byte;
+	{
+	  offset = byte;
+	  if (BYTES_BIG_ENDIAN)
+	    {
+	      /* Reverse bytes within each long, or within the entire float
+		 if it's smaller than a long (for HFmode).  */
+	      offset = MIN (3, total_bytes - 1) - offset;
+	      gcc_assert (offset >= 0);
+	    }
+	}
       value = ptr[offset + ((bitpos / BITS_PER_UNIT) & ~3)];
 
       tmp[bitpos / 32] |= (unsigned long)value << (bitpos & 31);
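
To spell out what the fix changes (a worked example, not part of the patch): 
for HFmode on a big-endian target with UNITS_PER_WORD >= 4, total_bytes == 2, 
so the loop visits only two byte positions:

   bitpos   byte   old offset (3 - byte)   new offset (MIN (3, 1) - byte)
      0       0        3  (past the end)               1
      8       1        2  (past the end)               0

i.e. the old code read beyond the 2-byte value, while the new code simply 
swaps the two bytes.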

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 15/16][fold-const.c] Fix bigendian HFmode in native_interpret_real
  2015-07-09  9:34         ` Alan Lawrence
@ 2015-07-09  9:48           ` Richard Biener
  2015-07-09  9:56             ` Alan Lawrence
  0 siblings, 1 reply; 35+ messages in thread
From: Richard Biener @ 2015-07-09  9:48 UTC (permalink / raw)
  To: Alan Lawrence; +Cc: Jeff Law, gcc-patches, Jakub Jelinek

On Thu, Jul 9, 2015 at 11:34 AM, Alan Lawrence <alan.lawrence@arm.com> wrote:
> Jeff Law wrote:
>>
>> On 07/08/2015 03:43 AM, Richard Biener wrote:
>>>
>>> On Wed, Jul 8, 2015 at 12:07 AM, Jeff Law <law@redhat.com> wrote:
>>>>
>>>> On 07/07/2015 06:37 AM, Alan Lawrence wrote:
>>>>>
>>>>> As per https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01346.html. Fixes
>>>>> FAIL of advsimd-intrinsics vcreate.c on aarch64_be-none-elf from
>>>>> previous patch.
>>>>>
>>>>> 15_native_interpret_real.patch
>>>>>
>>>>>
>>>>> commit e2e7ca148960a82fc88128820f17e7cbd14173cb
>>>>> Author: Alan Lawrence<alan.lawrence@arm.com>
>>>>> Date:   Thu Apr 9 10:54:40 2015 +0100
>>>>>
>>>>>       Fix native_interpret_real for HFmode floats on Bigendian with
>>>>> UNITS_PER_WORD>=4
>>>>>
>>>>>       (with missing space)
>>>>
>>>> OK with ChangeLog in proper form.
>>>
>>> Err - but now offset can become negative?  Shouldn't it rather error out
>>> before as it requires at least 4 bytes for big-endian?
>>
>> I managed to convince myself the value wouldn't be negative when
>> reviewing.
>>
>>> That said - the whole thing looks like it doesn't expect GET_MODE_SIZE < 4
>>> and your "fix" is just very obfuscated (if it really is a fix).
>>
>> While I couldn't convince myself the function as a whole was prepared for
>> smaller objects, I don't think Alan's patch made things worse.  One could
>> argue the whole bloody thing ought to be rewritten though.
>>
>> I'd also managed to convince myself the other instances of "3" weren't
>> problematical.
>>
>> jeff
>>
>
> In response to Richard's comments, may I propose the attached patch instead?
>
> I used the term "long" because of the earlier comment:
>   /* There are always 32 bits in each long, no matter the size of
>      the hosts long.  We handle floating point representations with
>      up to 192 bits.  */
> with which I really don't think I want to mess at this point ;). However,
> I'd be happy to change my use of "long" to "32 bits", "4 bytes", or "group
> of 4" instead if one of those was preferable!
>
> Bootstrapped + check-gcc on x86_64 (no change); cross-tested on
> aarch64_be-none-elf (where it fixes the advsimd-intrinsics float16 test)
>
> gcc/ChangeLog (as before):
>
>         * fold-const.c (native_interpret_real): Fix HFmode for bigendian
> where
>         UNITS_PER_WORD >= 4.

Looks good to me.  I think you can remove the assert; indeed, offset
cannot become negative (but the compiler can't see that).

I wonder why wi::from_buffer doesn't have the same issue though
for HImode ints.  It's structured differently, without magic '4's as well.

Thanks,
Richard.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 15/16][fold-const.c] Fix bigendian HFmode in native_interpret_real
  2015-07-09  9:48           ` Richard Biener
@ 2015-07-09  9:56             ` Alan Lawrence
  0 siblings, 0 replies; 35+ messages in thread
From: Alan Lawrence @ 2015-07-09  9:56 UTC (permalink / raw)
  To: Richard Biener; +Cc: Jeff Law, gcc-patches, Jakub Jelinek

Richard Biener wrote:
>
> I wonder why wi::from_buffer doesn't have the same issue though
> for HImode ints.  It's structured differently, without magic '4's as well.

I don't claim to understand the rest of wi::from_buffer and why it is different. 
However, wrt. HImode, I think the key line is:

  offset = BYTES_BIG_ENDIAN ? (buffer_len - 1) - byte : byte;

HTH,
Alan

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 3/16][ARM] Add float16x4_t intrinsics
  2015-07-08  8:35             ` Ramana Radhakrishnan
@ 2015-07-27 13:22               ` Alan Lawrence
  0 siblings, 0 replies; 35+ messages in thread
From: Alan Lawrence @ 2015-07-27 13:22 UTC (permalink / raw)
  To: Ramana Radhakrishnan
  Cc: Kyrylo Tkachov, gcc-patches, Ramana Radhakrishnan, Tejas Belagod,
	Richard Earnshaw

Ramana Radhakrishnan wrote:
> I haven't seen the patch yet but here are my thoughts on where this should be going.
> 
> Thus in summary - 
> 
> 1. -mfpu=neon implies the presence of the float16x(4/8) types and all the intrinsics that treat these values as bags of bits.
> 2. -mfpu=neon-fp16 implies the presence of the vcvt* intrinsics that are needed for the float16 types.

So I think the "problems" are statements in ACLE that

(a) we should only have float16x(4/8)_t types, when we have scalar types as well;

(b) whenever we have a scalar __fp16 type, we should have one or other of the 
__ARM_FP16_FORMAT_(IEEE/ALTERNATIVE) macros defined to indicate the format 
that's in use.

Sadly these seem to forbid the current situation whereby we expose hardware 
conversion instructions (that work with either fp16 format, according to the 
status of the FPSCR bit) and allow compiling a binary that will work with either 
format :(.

The situation is further complicated by GCC's support for the alternative format 
(not mandated by ACLE), by the fact that we can support either format in the 
absence of any hardware (as we have software emulation routines for scalar 
conversions in either format, as long as we know which at compile time), and by 
the fact that object files compiled with different -mfp16-format cannot be 
linked together (the ABI attributes conflict).

However, I think we can still go with Ramana's point 1, albeit _only_ when a 
-mfp16-format is specified; point 2 similarly (i.e. -mfpu=neon-fp16 will not provide 
any additional intrinsics unless an -mfp16-format is specified).
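
In concrete terms the gating might look roughly like this in arm_neon.h (a 
sketch using the ACLE macro names above; the respun series may spell the 
conditions differently):

/* Vector fp16 types only when a scalar fp16 format has been chosen.  */
#if defined (__ARM_FP16_FORMAT_IEEE) \
    || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
typedef __simd64_float16_t float16x4_t;
#endif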

I'll repost the patch series shortly with those changes implemented. In the 
meantime: are patches 1 & 2 ( ARM __builtin_arm_neon_lane_bounds and 
qualifier_lane_index) OK to commit? These contain nothing float16-specific, and 
would unblock Charles Baylis' work on PR63870 
(https://gcc.gnu.org/ml/gcc-patches/2015-07/msg00545.html). I'll ping the 
AArch64 changes separately.

Cheers, Alan

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/16][ARM] PR/63870 Add __builtin_arm_lane_check.
  2015-07-07 12:34 ` [PATCH 2/16][ARM] PR/63870 Add __builtin_arm_lane_check Alan Lawrence
@ 2015-07-27 14:30   ` Kyrill Tkachov
  2015-07-27 16:01     ` Alan Lawrence
  0 siblings, 1 reply; 35+ messages in thread
From: Kyrill Tkachov @ 2015-07-27 14:30 UTC (permalink / raw)
  To: Alan Lawrence, gcc-patches

Hi Alan,

On 07/07/15 13:34, Alan Lawrence wrote:
> As per https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01334.html

+  if (fcode == ARM_BUILTIN_NEON_LANE_CHECK)
+    {
+      tree nlanes = CALL_EXPR_ARG (exp, 0);
+      gcc_assert (TREE_CODE (nlanes) == INTEGER_CST);
+      rtx lane_idx = expand_normal (CALL_EXPR_ARG (exp, 1));
+      if (CONST_INT_P (lane_idx))
+	neon_lane_bounds (lane_idx, 0, TREE_INT_CST_LOW (nlanes), exp);
+      else
+	error ("%Klane index must be a constant immediate", exp);
+      /* Don't generate any RTL.  */
+      return const0_rtx;
+    }
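
(For context, the new builtin is intended to be called from the arm_neon.h 
intrinsic wrappers; an illustrative sketch of its behaviour, not part of the 
patch:)

void
check_lanes (void)
{
  __builtin_arm_lane_check (4, 3);   /* OK: lanes of a 4-lane vector run 0..3 */
  __builtin_arm_lane_check (4, 4);   /* rejected at compile time */
}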

Can you please add a comment on top of this saying that this builtin only exists to perform the
lane check, just to make it explicit for the future.

Ok with a comment added.
Thanks,
Kyrill

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/16][ARM] PR/63870 Add qualifier to check lane bounds in expand
  2015-07-07 12:34 ` [PATCH 1/16][ARM] PR/63870 Add qualifier to check lane bounds in expand Alan Lawrence
@ 2015-07-27 14:33   ` Kyrill Tkachov
  0 siblings, 0 replies; 35+ messages in thread
From: Kyrill Tkachov @ 2015-07-27 14:33 UTC (permalink / raw)
  To: Alan Lawrence, gcc-patches

Hi Alan,

On 07/07/15 13:34, Alan Lawrence wrote:
> As per https://gcc.gnu.org/ml/gcc-patches/2015-04/msg01333.html
>
> (While this falls under PR/63870, and I will link to that in the ChangeLog, it
> is only a small step towards fixing that PR.)

This is ok for trunk.
Thanks,
Kyrill

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/16][ARM] PR/63870 Add __builtin_arm_lane_check.
  2015-07-27 14:30   ` Kyrill Tkachov
@ 2015-07-27 16:01     ` Alan Lawrence
  0 siblings, 0 replies; 35+ messages in thread
From: Alan Lawrence @ 2015-07-27 16:01 UTC (permalink / raw)
  To: Kyrill Tkachov; +Cc: gcc-patches, Charles Baylis

Kyrill Tkachov wrote:
> Hi Alan,
> 
> Can you please add a comment on top of this saying that this builtin only exists to perform the
> lane check, just to make it explicit for the future.

Done, and pushed as r226252.

Charles, thanks for your patience, and I hope this lets you move forwards. I 
realize this leaves you slightly "picking up the pieces" in terms of what's 
there and what isn't, but in theory it corresponds to what we've done previously 
in AArch64 - give me a shout if you are unclear how to proceed!

Thanks, Alan

^ permalink raw reply	[flat|nested] 35+ messages in thread

end of thread, other threads:[~2015-07-27 15:57 UTC | newest]

Thread overview: 35+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-07-07 12:32 [PATCH 0/16][ARM/AArch64] Float16_t support, v2 Alan Lawrence
2015-07-07 12:34 ` [PATCH 3/16][ARM] Add float16x4_t intrinsics Alan Lawrence
2015-07-07 13:09   ` Kyrill Tkachov
2015-07-07 16:22     ` Kyrill Tkachov
2015-07-07 16:34       ` Alan Lawrence
2015-07-07 16:52         ` Kyrill Tkachov
2015-07-07 17:17           ` Alan Lawrence
2015-07-08  8:35             ` Ramana Radhakrishnan
2015-07-27 13:22               ` Alan Lawrence
2015-07-07 12:34 ` [PATCH 4/16][ARM] Add float16x8_t type Alan Lawrence
2015-07-07 12:34 ` [PATCH 2/16][ARM] PR/63870 Add __builtin_arm_lane_check Alan Lawrence
2015-07-27 14:30   ` Kyrill Tkachov
2015-07-27 16:01     ` Alan Lawrence
2015-07-07 12:34 ` [PATCH 1/16][ARM] PR/63870 Add qualifier to check lane bounds in expand Alan Lawrence
2015-07-27 14:33   ` Kyrill Tkachov
2015-07-07 12:35 ` [PATCH 7/16][AArch64] Add basic fp16 support Alan Lawrence
2015-07-07 12:35 ` [PATCH 5/16][ARM] Add float16x8_t intrinsics Alan Lawrence
2015-07-07 12:35 ` [PATCH 6/16][ARM] Remaining float16 intrinsics: vld..., vst..., vget_low/high, vcombine Alan Lawrence
2015-07-07 12:36 ` [PATCH 9/16][AArch64] Add support for float16x{4,8}_t vectors/builtins Alan Lawrence
2015-07-07 12:36 ` [PATCH 10/16][AArch64] vld{2,3,4}{,_lane,_dup},vcombine,vcreate Alan Lawrence
2015-07-07 12:36 ` [PATCH 8/16][ARM/AArch64 Testsuite] Add basic fp16 tests Alan Lawrence
2015-07-07 12:36 ` [PATCH 11/16][AArch64] Implement vcvt_{,high_}f16_f32 Alan Lawrence
2015-07-07 12:37 ` [PATCH 12/16][AArch64] vreinterpret(q?), vget_(low|high), vld1(q?)_dup Alan Lawrence
2015-07-07 12:37 ` [PATCH 15/16][fold-const.c] Fix bigendian HFmode in native_interpret_real Alan Lawrence
2015-07-07 22:06   ` Jeff Law
2015-07-08  9:43     ` Richard Biener
2015-07-08 10:51       ` Alan Lawrence
2015-07-08 12:44         ` Richard Biener
2015-07-09  2:20       ` Jeff Law
2015-07-09  9:34         ` Alan Lawrence
2015-07-09  9:48           ` Richard Biener
2015-07-09  9:56             ` Alan Lawrence
2015-07-07 12:37 ` [PATCH 13/16][AArch64] Add vcvt(_high)?_f32_f16 intrinsics, with BE RTL fix Alan Lawrence
2015-07-07 12:38 ` [PATCH 14/16][ARM/AArch64 testsuite] Update advsimd-intrinsics tests to add float16 vectors Alan Lawrence
2015-07-07 12:39 ` [PATCH 16/16][ARM/AArch64 Testsuite] Add test of vcvt{,_high}_{f16_f32,f32_f16} Alan Lawrence
