public inbox for gcc-patches@gcc.gnu.org
* [PATCH] ARM PR68620 (ICE with FP16 on armeb)
@ 2016-01-15 10:39 Christophe Lyon
  2016-01-18 19:02 ` Alan Lawrence
  0 siblings, 1 reply; 9+ messages in thread
From: Christophe Lyon @ 2016-01-15 10:39 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1388 bytes --]

Hi,

The attached patch fixes PR68620.

It wasn't sufficient to add the movv4hf pattern, because this also
enabled other transformations, and I had to update the existing support
so that the tests continue to pass with the new code paths.

I added V4HF/V8HF to the VQXMOV and VDQ iterators to enable the use of
these modes in the relevant patterns.

For the vec_set<mode>_internal and neon_vld1_dup<mode> patterns, I
switched to an existing iterator which already had the needed
V4HF/V8HF (so I switched to VD_LANE and VQ2).

For neon_vdupn, I chose to implement neon_vdup_nv4hf and
neon_vdup_nv8hf instead of updating the VX iterator because I thought
it was not desirable to impact neon_vrev32<mode>.

I had to update neon_valid_immediate to return -1 when handling FP16
immediates (they cannot be represented in neon mov instructions).
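
For instance (a made-up example, not part of the patch): with
-mfp16-format=ieee, a constant FP16 vector like the one below makes the
backend ask neon_valid_immediate whether a vmov immediate can encode
it. Unlike e.g. 1.0 for 32-bit float elements (accepted via
vfp3_const_double_rtx), 16-bit floats have no Neon immediate encoding,
so we now return -1 and the constant is materialized some other way.
The type name is invented for the example:

typedef __fp16 v4hf_t __attribute__ ((vector_size (8)));

v4hf_t
fp16_ones (void)
{
  /* Constant vector of FP16 1.0; cannot be a vmov immediate.  */
  return (v4hf_t) { 1.0, 1.0, 1.0, 1.0 };
}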

Finally, I had to adjust the vget_lane_f16/vset_lane_f16
implementations in arm_neon.h to account for the different lane
numbering in big-endian. This has the benefit of making the following
tests now pass on armeb:
vldX_lane_f16_indices_1.c
vstX_lane_f16_indices_1.c
vcvt_f16.c
vcvtf16_f32.c
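
To illustrate the mapping (a standalone sketch, not part of the patch):
for 16-bit elements, lanes are reversed within each 64-bit half, so
architectural lane __idx corresponds to GCC vector element (__idx ^ 3)
for both 4-lane and 8-lane vectors:

#include <stdio.h>

int
main (void)
{
  /* Big-endian lane mapping for 16-bit elements; the first four rows
     cover float16x4_t, all eight cover float16x8_t.  */
  for (int idx = 0; idx < 8; idx++)
    printf ("intrinsic lane %d -> GCC element %d\n", idx, idx ^ 3);
  return 0;
}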

Regarding the testsuite, I've added the testcase that would otherwise
ICE, as well as the arm_fp effective target, which I've also proposed
in my other testsuite patch related to target attributes.

I've tested this patch using QEMU on arm-linux and armeb-linux targets.

OK?

Christophe.

[-- Attachment #2: pr68620-v8.log.txt --]
[-- Type: text/plain, Size: 1018 bytes --]

gcc/ChangeLog:

2016-01-15  Christophe Lyon  <christophe.lyon@linaro.org>

	PR target/68620
	* config/arm/arm.c (neon_valid_immediate): Handle FP16 vectors.
	* config/arm/arm_neon.h (__arm_lane): New helper macro.
	(vget_lane_f16): Handle big-endian.
	(vgetq_lane_f16): Likewise.
	(vset_lane_f16): Likewise.
	(vsetq_lane_f16): Likewise.
	* config/arm/iterators.md (VQXMOV): Add V8HF.
	(VDQ): Add V4HF and V8HF.
	(V_reg): Handle V4HF and V8HF.
	(Is_float_mode): Likewise.
	* config/arm/neon.md (movv4hf, movv8hf, neon_vdup_nv4hf,
	neon_vdup_nv8hf): New patterns.
	(vec_set<mode>_internal, neon_vld1_dup<mode>): Use VD_LANE iterator.
	(neon_vld1_dup<mode>): Use VQ2 iterator.
	* doc/sourcebuild.texi (arm_fp_ok, arm_fp): Add documentation.

gcc/testsuite/ChangeLog:

2016-01-15  Christophe Lyon  <christophe.lyon@linaro.org>

	PR target/68620
	* gcc.target/arm/pr68620.c: New test.
	* lib/target-supports.exp
	(check_effective_target_arm_fp_ok_nocache)
	(check_effective_target_arm_fp_ok, add_options_for_arm_fp): New.


[-- Attachment #3: pr68620-v8.patch.txt --]
[-- Type: text/plain, Size: 10499 bytes --]

diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 3588b83..b1f408c 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -12370,6 +12370,10 @@ neon_valid_immediate (rtx op, machine_mode mode, int inverse,
       if (!vfp3_const_double_rtx (el0) && el0 != CONST0_RTX (GET_MODE (el0)))
         return -1;
 
+      /* FP16 vectors cannot be represented.  */
+      if (innersize == 2)
+	return -1;
+
       r0 = CONST_DOUBLE_REAL_VALUE (el0);
 
       for (i = 1; i < n_elts; i++)
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 0a33d21..b4aabd9 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -5252,12 +5252,22 @@ vget_lane_s32 (int32x2_t __a, const int __b)
    were marked always-inline so there were no call sites, the declaration
    would nonetheless raise an error.  Hence, we must use a macro instead.  */
 
+  /* For big-endian, GCC's vector indices are the opposite way around
+     to the architectural lane indices used by Neon intrinsics.  */
+#ifdef __ARM_BIG_ENDIAN
+  /* Here, 3 is (4-1) where 4 is the number of lanes. This is also the
+     right value for vectors with 8 lanes.  */
+#define __arm_lane(__vec, __idx) (__idx ^ 3)
+#else
+#define __arm_lane(__vec, __idx) __idx
+#endif
+
 #define vget_lane_f16(__v, __idx)		\
   __extension__					\
     ({						\
       float16x4_t __vec = (__v);		\
       __builtin_arm_lane_check (4, __idx);	\
-      float16_t __res = __vec[__idx];		\
+      float16_t __res = __vec[__arm_lane(__vec, __idx)];	\
       __res;					\
     })
 #endif
@@ -5334,7 +5344,7 @@ vgetq_lane_s32 (int32x4_t __a, const int __b)
     ({						\
       float16x8_t __vec = (__v);		\
       __builtin_arm_lane_check (8, __idx);	\
-      float16_t __res = __vec[__idx];		\
+      float16_t __res = __vec[__arm_lane(__vec, __idx)];	\
       __res;					\
     })
 #endif
@@ -5412,7 +5422,7 @@ vset_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
       float16_t __elem = (__e);			\
       float16x4_t __vec = (__v);		\
       __builtin_arm_lane_check (4, __idx);	\
-      __vec[__idx] = __elem;			\
+      __vec[__arm_lane (__vec, __idx)] = __elem;		       \
       __vec;					\
     })
 #endif
@@ -5490,7 +5500,7 @@ vsetq_lane_s32 (int32_t __a, int32x4_t __b, const int __c)
       float16_t __elem = (__e);			\
       float16x8_t __vec = (__v);		\
       __builtin_arm_lane_check (8, __idx);	\
-      __vec[__idx] = __elem;			\
+      __vec[__arm_lane (__vec, __idx)] = __elem;	       \
       __vec;					\
     })
 #endif
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 6a54125..88e1c3d 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -99,7 +99,7 @@
 (define_mode_iterator VQI [V16QI V8HI V4SI])
 
 ;; Quad-width vector modes, with TImode added, for moves.
-(define_mode_iterator VQXMOV [V16QI V8HI V4SI V4SF V2DI TI])
+(define_mode_iterator VQXMOV [V16QI V8HI V8HF V4SI V4SF V2DI TI])
 
 ;; Opaque structure types wider than TImode.
 (define_mode_iterator VSTRUCT [EI OI CI XI])
@@ -114,7 +114,7 @@
 (define_mode_iterator VN [V8HI V4SI V2DI])
 
 ;; All supported vector modes (except singleton DImode).
-(define_mode_iterator VDQ [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF V2DI])
+(define_mode_iterator VDQ [V8QI V16QI V4HI V8HI V2SI V4SI V4HF V8HF V2SF V4SF V2DI])
 
 ;; All supported vector modes (except those with 64-bit integer elements).
 (define_mode_iterator VDQW [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF])
@@ -424,6 +424,7 @@
 ;; Register width from element mode
 (define_mode_attr V_reg [(V8QI "P") (V16QI "q")
                          (V4HI "P") (V8HI  "q")
+                         (V4HF "P") (V8HF  "q")
                          (V2SI "P") (V4SI  "q")
                          (V2SF "P") (V4SF  "q")
                          (DI   "P") (V2DI  "q")
@@ -572,6 +573,7 @@
 (define_mode_attr Is_float_mode [(V8QI "false") (V16QI "false")
                  (V4HI "false") (V8HI "false")
                  (V2SI "false") (V4SI "false")
+                 (V4HF "true") (V8HF "true")
                  (V2SF "true") (V4SF "true")
                  (DI "false") (V2DI "false")])
 
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 62fb6da..9e04e5c 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -137,6 +137,30 @@
     }
 })
 
+(define_expand "movv4hf"
+  [(set (match_operand:V4HF 0 "s_register_operand")
+	(match_operand:V4HF 1 "s_register_operand"))]
+  "TARGET_NEON && TARGET_FP16"
+{
+  if (can_create_pseudo_p ())
+    {
+      if (!REG_P (operands[0]))
+	operands[1] = force_reg (V4HFmode, operands[1]);
+    }
+})
+
+(define_expand "movv8hf"
+  [(set (match_operand:V8HF 0 "")
+	(match_operand:V8HF 1 ""))]
+  "TARGET_NEON && TARGET_FP16"
+{
+  if (can_create_pseudo_p ())
+    {
+      if (!REG_P (operands[0]))
+	operands[1] = force_reg (V8HFmode, operands[1]);
+    }
+})
+
 (define_insn "*neon_mov<mode>"
   [(set (match_operand:VSTRUCT 0 "nonimmediate_operand"	"=w,Ut,w")
 	(match_operand:VSTRUCT 1 "general_operand"	" w,w, Ut"))]
@@ -299,11 +323,11 @@
   [(set_attr "type" "neon_load1_1reg<q>")])
 
 (define_insn "vec_set<mode>_internal"
-  [(set (match_operand:VD 0 "s_register_operand" "=w,w")
-        (vec_merge:VD
-          (vec_duplicate:VD
+  [(set (match_operand:VD_LANE 0 "s_register_operand" "=w,w")
+        (vec_merge:VD_LANE
+          (vec_duplicate:VD_LANE
             (match_operand:<V_elem> 1 "nonimmediate_operand" "Um,r"))
-          (match_operand:VD 3 "s_register_operand" "0,0")
+          (match_operand:VD_LANE 3 "s_register_operand" "0,0")
           (match_operand:SI 2 "immediate_operand" "i,i")))]
   "TARGET_NEON"
 {
@@ -2806,6 +2830,22 @@ if (BYTES_BIG_ENDIAN)
   [(set_attr "type" "neon_from_gp<q>")]
 )
 
+(define_insn "neon_vdup_nv4hf"
+  [(set (match_operand:V4HF 0 "s_register_operand" "=w")
+        (vec_duplicate:V4HF (match_operand:HF 1 "s_register_operand" "r")))]
+  "TARGET_NEON"
+  "vdup.16\t%P0, %1"
+  [(set_attr "type" "neon_from_gp")]
+)
+
+(define_insn "neon_vdup_nv8hf"
+  [(set (match_operand:V8HF 0 "s_register_operand" "=w")
+        (vec_duplicate:V8HF (match_operand:HF 1 "s_register_operand" "r")))]
+  "TARGET_NEON"
+  "vdup.16\t%q0, %1"
+  [(set_attr "type" "neon_from_gp_q")]
+)
+
 (define_insn "neon_vdup_n<mode>"
   [(set (match_operand:V32 0 "s_register_operand" "=w,w")
         (vec_duplicate:V32 (match_operand:<V_elem> 1 "s_register_operand" "r,t")))]
@@ -4305,8 +4345,8 @@ if (BYTES_BIG_ENDIAN)
 )
 
 (define_insn "neon_vld1_dup<mode>"
-  [(set (match_operand:VD 0 "s_register_operand" "=w")
-        (vec_duplicate:VD (match_operand:<V_elem> 1 "neon_struct_operand" "Um")))]
+  [(set (match_operand:VD_LANE 0 "s_register_operand" "=w")
+        (vec_duplicate:VD_LANE (match_operand:<V_elem> 1 "neon_struct_operand" "Um")))]
   "TARGET_NEON"
   "vld1.<V_sz_elem>\t{%P0[]}, %A1"
   [(set_attr "type" "neon_load1_all_lanes<q>")]
@@ -4322,8 +4362,8 @@ if (BYTES_BIG_ENDIAN)
 )
 
 (define_insn "neon_vld1_dup<mode>"
-  [(set (match_operand:VQ 0 "s_register_operand" "=w")
-        (vec_duplicate:VQ (match_operand:<V_elem> 1 "neon_struct_operand" "Um")))]
+  [(set (match_operand:VQ2 0 "s_register_operand" "=w")
+        (vec_duplicate:VQ2 (match_operand:<V_elem> 1 "neon_struct_operand" "Um")))]
   "TARGET_NEON"
 {
   return "vld1.<V_sz_elem>\t{%e0[], %f0[]}, %A1";
diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi
index 61de4a5..3f2e0e3 100644
--- a/gcc/doc/sourcebuild.texi
+++ b/gcc/doc/sourcebuild.texi
@@ -1514,6 +1514,12 @@ ARM target generates 32-bit code.
 @item arm_eabi
 ARM target adheres to the ABI for the ARM Architecture.
 
+@item arm_fp_ok
+@anchor{arm_fp_ok}
+ARM target defines @code{__ARM_FP} using @code{-mfloat-abi=softfp} or
+equivalent options.  Some multilibs may be incompatible with these
+options.
+
 @item arm_hf_eabi
 ARM target adheres to the VFP and Advanced SIMD Register Arguments
 variant of the ABI for the ARM Architecture (as selected with
@@ -1527,6 +1533,11 @@ Some multilibs may be incompatible with these options.
 ARM target supports @code{-mcpu=iwmmxt}.
 Some multilibs may be incompatible with this option.
 
+@item arm_fp
+@code{__ARM_FP} definition.  Only ARM targets support this feature, and only then
+in certain modes; see the @ref{arm_fp_ok,,arm_fp_ok effective target
+keyword}.
+
 @item arm_neon
 ARM target supports generating NEON instructions.
 
diff --git a/gcc/testsuite/gcc.target/arm/pr68620.c b/gcc/testsuite/gcc.target/arm/pr68620.c
new file mode 100644
index 0000000..984992f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/pr68620.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_fp_ok } */
+/* { dg-options "-mfp16-format=ieee" } */
+/* { dg-add-options arm_fp } */
+
+#include "arm_neon.h"
+
+float16x4_t __attribute__((target("fpu=neon-fp16")))
+foo (float32x4_t arg)
+{
+    return vcvt_f16_f32 (arg);
+}
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 4e349e9..228e68d 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -2721,6 +2721,46 @@ proc check_effective_target_arm_hard_vfp_ok { } {
     }
 }
 
+# Return 1 if this is an ARM target defining __ARM_FP. We may need
+# -mfloat-abi=softfp or equivalent options.  Some multilibs may be
+# incompatible with these options.  Also set et_arm_fp_flags to the
+# best options to add.
+
+proc check_effective_target_arm_fp_ok_nocache { } {
+    global et_arm_fp_flags
+    set et_arm_fp_flags ""
+    if { [check_effective_target_arm32] } {
+	foreach flags {"" "-mfloat-abi=softfp" "-mfloat-abi=hard"} {
+	    if { [check_no_compiler_messages_nocache arm_fp_ok object {
+		#ifndef __ARM_FP
+		#error __ARM_FP not defined
+		#endif
+	    } "$flags"] } {
+		set et_arm_fp_flags $flags
+		return 1
+	    }
+	}
+    }
+    return 0
+}
+
+proc check_effective_target_arm_fp_ok { } {
+    return [check_cached_effective_target arm_fp_ok \
+		check_effective_target_arm_fp_ok_nocache]
+}
+
+# Add the options needed to define __ARM_FP.  We need either
+# -mfloat-abi=softfp or -mfloat-abi=hard, but if one is already
+# specified by the multilib, use it.
+
+proc add_options_for_arm_fp { flags } {
+    if { ! [check_effective_target_arm_fp_ok] } {
+	return "$flags"
+    }
+    global et_arm_fp_flags
+    return "$flags $et_arm_fp_flags"
+}
+
 # Return 1 if this is an ARM target that supports DSP multiply with
 # current multilib flags.
 


* Re: [PATCH] ARM PR68620 (ICE with FP16 on armeb)
  2016-01-15 10:39 [PATCH] ARM PR68620 (ICE with FP16 on armeb) Christophe Lyon
@ 2016-01-18 19:02 ` Alan Lawrence
  2016-01-19 11:15   ` Christophe Lyon
  0 siblings, 1 reply; 9+ messages in thread
From: Alan Lawrence @ 2016-01-18 19:02 UTC (permalink / raw)
  To: christophe.lyon; +Cc: gcc-patches

Thanks for working on this, Christophe, and sorry I missed the PR. You got
further in fixing more things than I did though :). A couple of comments:

> For the vec_set<mode>_internal and neon_vld1_dup<mode> patterns, I
> switched to an existing iterator which already had the needed
> V4HF/V8HF (so I switched to VD_LANE and VQ2).

It's a separate issue, and I hadn't done this either, but looking again - I
don't see any reason why we shouldn't apply VD->VD_LANE to the vec_extract
standard name pattern too. (At present looks like we have vec_extractv8hf but no
vec_extractv4hf ?)

> For neon_vdupn, I chose to implement neon_vdup_nv4hf and
> neon_vdup_nv8hf instead of updating the VX iterator because I thought
> it was not desirable to impact neon_vrev32<mode>.

Well, the same instruction will suffice for vrev32'ing vectors of HF just as
well as vectors of HI, so I think I'd argue that's harmless enough. To gain the
benefit, we'd need to update arm_evpc_neon_vrev with a few new cases, though.
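
(To make that concrete - a made-up, standalone example, not from either
of our patches: the permutation behind vrev32.16 is the same whatever
the 16-bit lanes contain, so the HF cases would simply reuse the
existing instruction:)

typedef short v4hi_t __attribute__ ((vector_size (8)));

v4hi_t
rev32_16bit (v4hi_t x)
{
  /* Swap adjacent 16-bit lanes within each 32 bits - the permutation
     arm_evpc_neon_vrev matches to vrev32.16.  It depends only on the
     element size, not the element type.  */
  v4hi_t mask = { 1, 0, 3, 2 };
  return __builtin_shuffle (x, mask);
}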

> @@ -5252,12 +5252,22 @@ vget_lane_s32 (int32x2_t __a, const int __b)
>     were marked always-inline so there were no call sites, the declaration
>     would nonetheless raise an error.  Hence, we must use a macro instead.  */
>
> +  /* For big-endian, GCC's vector indices are the opposite way around
> +     to the architectural lane indices used by Neon intrinsics.  */

Not quite the opposite way around, as you take into account yourself! 'Reversed
within each 64 bits', perhaps?

> +#ifdef __ARM_BIG_ENDIAN
> +  /* Here, 3 is (4-1) where 4 is the number of lanes. This is also the
> +     right value for vectors with 8 lanes.  */
> +#define __arm_lane(__vec, __idx) (__idx ^ 3)
> +#else
> +#define __arm_lane(__vec, __idx) __idx
> +#endif
> +

Looks right, but sounds... my concern here is that I'm hoping at some point we
will move the *other* vget/set_lane intrinsics to use GCC vector extensions
too. At which time (unlike __aarch64_lane which can be used everywhere) this
will be the wrong formula. Can we name (and/or comment) it to avoid misleading
anyone? The key characteristic seems to be that it is for vectors of 16-bit
elements only.
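
(Spelling out the element-size dependence with a made-up sketch, not
part of the patch:)

/* The XOR mask for "reverse within each 64 bits" depends only on the
   element size, so __idx ^ 3 fits 16-bit elements only.  */
#define LANE_XOR_MASK(elem_bits) (64 / (elem_bits) - 1)

_Static_assert (LANE_XOR_MASK (8) == 7, "8-bit lanes need idx ^ 7");
_Static_assert (LANE_XOR_MASK (16) == 3, "16-bit lanes use idx ^ 3");
_Static_assert (LANE_XOR_MASK (32) == 1, "32-bit lanes need idx ^ 1");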

> @@ -5334,7 +5344,7 @@ vgetq_lane_s32 (int32x4_t __a, const int __b)
>      ({						\
>        float16x8_t __vec = (__v);		\
>        __builtin_arm_lane_check (8, __idx);	\
> -      float16_t __res = __vec[__idx];		\
> +      float16_t __res = __vec[__arm_lane(__vec, __idx)];	\

In passing - the function name in the @@ header is of course misleading; this is #define vgetq_lane_f16 (likewise for the later hunks).

Thanks, Alan


* Re: [PATCH] ARM PR68620 (ICE with FP16 on armeb)
  2016-01-18 19:02 ` Alan Lawrence
@ 2016-01-19 11:15   ` Christophe Lyon
  2016-01-19 14:51     ` Alan Lawrence
  0 siblings, 1 reply; 9+ messages in thread
From: Christophe Lyon @ 2016-01-19 11:15 UTC (permalink / raw)
  To: Alan Lawrence; +Cc: gcc-patches

On 18 January 2016 at 20:01, Alan Lawrence <alan.lawrence@arm.com> wrote:
> Thanks for working on this, Christophe, and sorry I missed the PR. You got
> further in fixing more things than I did though :). A couple of comments:
>
>> For the vec_set<mode>_internal and neon_vld1_dup<mode> patterns, I
>> switched to an existing iterator which already had the needed
>> V4HF/V8HF (so I switched to VD_LANE and VQ2).
>
> It's a separate issue, and I hadn't done this either, but looking again - I
> don't see any reason why we shouldn't apply VD->VD_LANE to the vec_extract
> standard name pattern too. (At present looks like we have vec_extractv8hf but no
> vec_extractv4hf ?)
>
OK, I'll add that to my patch.

>> For neon_vdupn, I chose to implement neon_vdup_nv4hf and
>> neon_vdup_nv8hf instead of updating the VX iterator because I thought
>> it was not desirable to impact neon_vrev32<mode>.
>
> Well, the same instruction will suffice for vrev32'ing vectors of HF just as
> well as vectors of HI, so I think I'd argue that's harmless enough. To gain the
> benefit, we'd need to update arm_evpc_neon_vrev with a few new cases, though.
>
Since this is more intrusive, I'd rather leave that part for later. OK?

>> @@ -5252,12 +5252,22 @@ vget_lane_s32 (int32x2_t __a, const int __b)
>>     were marked always-inline so there were no call sites, the declaration
>>     would nonetheless raise an error.  Hence, we must use a macro instead.  */
>>
>> +  /* For big-endian, GCC's vector indices are the opposite way around
>> +     to the architectural lane indices used by Neon intrinsics.  */
>
> Not quite the opposite way around, as you take into account yourself! 'Reversed
> within each 64 bits', perhaps?
>
OK, I'll try to rephrase that.

>> +#ifdef __ARM_BIG_ENDIAN
>> +  /* Here, 3 is (4-1) where 4 is the number of lanes. This is also the
>> +     right value for vectors with 8 lanes.  */
>> +#define __arm_lane(__vec, __idx) (__idx ^ 3)
>> +#else
>> +#define __arm_lane(__vec, __idx) __idx
>> +#endif
>> +
>
> Looks right, but sounds... my concern here is that I'm hoping at some point we
> will move the *other* vget/set_lane intrinsics to use GCC vector extensions
> too. At which time (unlike __aarch64_lane which can be used everywhere) this
> will be the wrong formula. Can we name (and/or comment) it to avoid misleading
> anyone? The key characteristic seems to be that it is for vectors of 16-bit
> elements only.
>
I'm not sure I follow here. Looking at the patterns for
neon_vget_lane<mode>_*internal in neon.md,
I can see 2 flavours: one for VD, one for VQ2. The latter uses "halfelts".

Do you prefer that I create 2 macros (say __arm_lane and __arm_laneq),
that would be similar to the aarch64 ones (by computing the number of
lanes of the input vector), but the "q" one would use half the total
number of lanes instead?

>> @@ -5334,7 +5344,7 @@ vgetq_lane_s32 (int32x4_t __a, const int __b)
>>      ({                                               \
>>        float16x8_t __vec = (__v);             \
>>        __builtin_arm_lane_check (8, __idx);   \
>> -      float16_t __res = __vec[__idx];                \
>> +      float16_t __res = __vec[__arm_lane(__vec, __idx)];     \
>
> In passing - the function name in the @@ header is of course misleading; this is #define vgetq_lane_f16 (likewise for the later hunks).
>
> Thanks, Alan


* Re: [PATCH] ARM PR68620 (ICE with FP16 on armeb)
  2016-01-19 11:15   ` Christophe Lyon
@ 2016-01-19 14:51     ` Alan Lawrence
  2016-01-20 21:10       ` Christophe Lyon
  0 siblings, 1 reply; 9+ messages in thread
From: Alan Lawrence @ 2016-01-19 14:51 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: gcc-patches

On 19/01/16 11:15, Christophe Lyon wrote:

>>> For neon_vdupn, I chose to implement neon_vdup_nv4hf and
>>> neon_vdup_nv8hf instead of updating the VX iterator because I thought
>>> it was not desirable to impact neon_vrev32<mode>.
>>
>> Well, the same instruction will suffice for vrev32'ing vectors of HF just as
>> well as vectors of HI, so I think I'd argue that's harmless enough. To gain the
>> benefit, we'd need to update arm_evpc_neon_vrev with a few new cases, though.
>>
> Since this is more intrusive, I'd rather leave that part for later. OK?

Sure.

>>> +#ifdef __ARM_BIG_ENDIAN
>>> +  /* Here, 3 is (4-1) where 4 is the number of lanes. This is also the
>>> +     right value for vectors with 8 lanes.  */
>>> +#define __arm_lane(__vec, __idx) (__idx ^ 3)
>>> +#else
>>> +#define __arm_lane(__vec, __idx) __idx
>>> +#endif
>>> +
>>
>> Looks right, but sounds... my concern here is that I'm hoping at some point we
>> will move the *other* vget/set_lane intrinsics to use GCC vector extensions
>> too. At which time (unlike __aarch64_lane which can be used everywhere) this
>> will be the wrong formula. Can we name (and/or comment) it to avoid misleading
>> anyone? The key characteristic seems to be that it is for vectors of 16-bit
>> elements only.
>>
> I'm not sure I follow here. Looking at the patterns for
> neon_vget_lane<mode>_*internal in neon.md,
> I can see 2 flavours: one for VD, one for VQ2. The latter uses "halfelts".
>
> Do you prefer that I create 2 macros (say __arm_lane and __arm_laneq),
> that would be similar to the aarch64 ones (by computing the number of
> lanes of the input vector), but the "q" one would use half the total
> number of lanes instead?

That works for me! Sthg like:

#define __arm_lane(__vec, __idx) NUM_LANES(__vec) - __idx
#define __arm_laneq(__vec, __idx) (__idx & (NUM_LANES(__vec)/2)) + (NUM_LANES(__vec)/2 - __idx)
//or similarly
#define __arm_laneq(__vec, __idx) (__idx ^ (NUM_LANES(__vec)/2 - 1))

Alternatively I'd been thinking

#define __arm_lane_32xN(__idx) __idx ^ 1
#define __arm_lane_16xN(__idx) __idx ^ 3
#define __arm_lane_8xN(__idx) __idx ^ 7

Bear in mind PR64893 that we had on AArch64 :-(

Cheers, Alan


* Re: [PATCH] ARM PR68620 (ICE with FP16 on armeb)
  2016-01-19 14:51     ` Alan Lawrence
@ 2016-01-20 21:10       ` Christophe Lyon
  2016-01-22 17:07         ` Alan Lawrence
  2016-01-26 13:20         ` Kyrill Tkachov
  0 siblings, 2 replies; 9+ messages in thread
From: Christophe Lyon @ 2016-01-20 21:10 UTC (permalink / raw)
  To: Alan Lawrence; +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2567 bytes --]

On 19 January 2016 at 15:51, Alan Lawrence <alan.lawrence@foss.arm.com> wrote:
> On 19/01/16 11:15, Christophe Lyon wrote:
>
>>>> For neon_vdupn, I chose to implement neon_vdup_nv4hf and
>>>> neon_vdup_nv8hf instead of updating the VX iterator because I thought
>>>> it was not desirable to impact neon_vrev32<mode>.
>>>
>>>
>>> Well, the same instruction will suffice for vrev32'ing vectors of HF just
>>> as
>>> well as vectors of HI, so I think I'd argue that's harmless enough. To
>>> gain the
>>> benefit, we'd need to update arm_evpc_neon_vrev with a few new cases,
>>> though.
>>>
>> Since this is more intrusive, I'd rather leave that part for later. OK?
>
>
> Sure.
>
>>>> +#ifdef __ARM_BIG_ENDIAN
>>>> +  /* Here, 3 is (4-1) where 4 is the number of lanes. This is also the
>>>> +     right value for vectors with 8 lanes.  */
>>>> +#define __arm_lane(__vec, __idx) (__idx ^ 3)
>>>> +#else
>>>> +#define __arm_lane(__vec, __idx) __idx
>>>> +#endif
>>>> +
>>>
>>>
>>> Looks right, but sounds... my concern here is that I'm hoping at some
>>> point we
>>> will move the *other* vget/set_lane intrinsics to use GCC vector
>>> extensions
>>> too. At which time (unlike __aarch64_lane which can be used everywhere)
>>> this
>>> will be the wrong formula. Can we name (and/or comment) it to avoid
>>> misleading
>>> anyone? The key characteristic seems to be that it is for vectors of
>>> 16-bit
>>> elements only.
>>>
>> I'm not sure I follow here. Looking at the patterns for
>> neon_vget_lane<mode>_*internal in neon.md,
>> I can see 2 flavours: one for VD, one for VQ2. The latter uses "halfelts".
>>
>> Do you prefer that I create 2 macros (say __arm_lane and __arm_laneq),
>> that would be similar to the aarch64 ones (by computing the number of
>> lanes of the input vector), but the "q" one would use half the total
>> number of lanes instead?
>
>
> That works for me! Sthg like:
>
> #define __arm_lane(__vec, __idx) NUM_LANES(__vec) - __idx
>> #define __arm_laneq(__vec, __idx) (__idx & (NUM_LANES(__vec)/2)) + (NUM_LANES(__vec)/2 - __idx)
> //or similarly
> #define __arm_laneq(__vec, __idx) (__idx ^ (NUM_LANES(__vec)/2 - 1))
>
> Alternatively I'd been thinking
>
> #define __arm_lane_32xN(__idx) __idx ^ 1
> #define __arm_lane_16xN(__idx) __idx ^ 3
> #define __arm_lane_8xN(__idx) __idx ^ 7
>
> Bear in mind PR64893 that we had on AArch64 :-(
>

Here is a new version, based on the comments above.
I've also removed the addition of arm_fp_ok effective target since I
added that in my other testsuite patch.

OK now?

Thanks,

Christophe

> Cheers, Alan

[-- Attachment #2: pr68620.log.txt --]
[-- Type: text/plain, Size: 866 bytes --]

gcc/ChangeLog:

2016-01-20  Christophe Lyon  <christophe.lyon@linaro.org>

	PR target/68620
	* config/arm/arm.c (neon_valid_immediate): Handle FP16 vectors.
	* config/arm/arm_neon.h (__ARM_NUM_LANES, __arm_lane, __arm_laneq):
	New helper macros.
	(vget_lane_f16): Handle big-endian.
	(vgetq_lane_f16): Likewise.
	(vset_lane_f16): Likewise.
	(vsetq_lane_f16): Likewise.
	* config/arm/iterators.md (VQXMOV): Add V8HF.
	(VDQ): Add V4HF and V8HF.
	(V_reg): Handle V4HF and V8HF.
	(Is_float_mode): Likewise.
	* config/arm/neon.md (movv4hf, movv8hf, neon_vdup_nv4hf,
	neon_vdup_nv8hf): New patterns.
	(vec_set<mode>_internal, vec_extract<mode>, neon_vld1_dup<mode>):
	Use VD_LANE iterator.
	(neon_vld1_dup<mode>): Use VQ2 iterator.

gcc/testsuite/ChangeLog:

2016-01-20  Christophe Lyon  <christophe.lyon@linaro.org>

	PR target/68620
	* gcc.target/arm/pr68620.c: New test.


[-- Attachment #3: pr68620.txt --]
[-- Type: text/plain, Size: 9799 bytes --]

diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 3588b83..b1f408c 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -12370,6 +12370,10 @@ neon_valid_immediate (rtx op, machine_mode mode, int inverse,
       if (!vfp3_const_double_rtx (el0) && el0 != CONST0_RTX (GET_MODE (el0)))
         return -1;
 
+      /* FP16 vectors cannot be represented.  */
+      if (innersize == 2)
+	return -1;
+
       r0 = CONST_DOUBLE_REAL_VALUE (el0);
 
       for (i = 1; i < n_elts; i++)
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 0a33d21..69b28c8 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -5252,14 +5252,26 @@ vget_lane_s32 (int32x2_t __a, const int __b)
    were marked always-inline so there were no call sites, the declaration
    would nonetheless raise an error.  Hence, we must use a macro instead.  */
 
-#define vget_lane_f16(__v, __idx)		\
-  __extension__					\
-    ({						\
-      float16x4_t __vec = (__v);		\
-      __builtin_arm_lane_check (4, __idx);	\
-      float16_t __res = __vec[__idx];		\
-      __res;					\
-    })
+  /* For big-endian, GCC's vector indices are reversed within each 64
+     bits compared to the architectural lane indices used by Neon
+     intrinsics.  */
+#ifdef __ARM_BIG_ENDIAN
+#define __ARM_NUM_LANES(__v) (sizeof (__v) / sizeof (__v[0]))
+#define __arm_lane(__vec, __idx) (__idx ^ (__ARM_NUM_LANES(__vec) - 1))
+#define __arm_laneq(__vec, __idx) (__idx ^ (__ARM_NUM_LANES(__vec)/2 - 1))
+#else
+#define __arm_lane(__vec, __idx) __idx
+#define __arm_laneq(__vec, __idx) __idx
+#endif
+
+#define vget_lane_f16(__v, __idx)			\
+  __extension__						\
+  ({							\
+    float16x4_t __vec = (__v);				\
+    __builtin_arm_lane_check (4, __idx);		\
+    float16_t __res = __vec[__arm_lane(__vec, __idx)];	\
+    __res;						\
+  })
 #endif
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
@@ -5329,14 +5341,14 @@ vgetq_lane_s32 (int32x4_t __a, const int __b)
 }
 
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-#define vgetq_lane_f16(__v, __idx)		\
-  __extension__					\
-    ({						\
-      float16x8_t __vec = (__v);		\
-      __builtin_arm_lane_check (8, __idx);	\
-      float16_t __res = __vec[__idx];		\
-      __res;					\
-    })
+#define vgetq_lane_f16(__v, __idx)			\
+  __extension__						\
+  ({							\
+    float16x8_t __vec = (__v);				\
+    __builtin_arm_lane_check (8, __idx);		\
+    float16_t __res = __vec[__arm_laneq(__vec, __idx)];	\
+    __res;						\
+  })
 #endif
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
@@ -5408,13 +5420,13 @@ vset_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
 #define vset_lane_f16(__e, __v, __idx)		\
   __extension__					\
-    ({						\
-      float16_t __elem = (__e);			\
-      float16x4_t __vec = (__v);		\
-      __builtin_arm_lane_check (4, __idx);	\
-      __vec[__idx] = __elem;			\
-      __vec;					\
-    })
+  ({						\
+    float16_t __elem = (__e);			\
+    float16x4_t __vec = (__v);			\
+    __builtin_arm_lane_check (4, __idx);	\
+    __vec[__arm_lane (__vec, __idx)] = __elem;	\
+    __vec;					\
+  })
 #endif
 
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
@@ -5486,13 +5498,13 @@ vsetq_lane_s32 (int32_t __a, int32x4_t __b, const int __c)
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
 #define vsetq_lane_f16(__e, __v, __idx)		\
   __extension__					\
-    ({						\
-      float16_t __elem = (__e);			\
-      float16x8_t __vec = (__v);		\
-      __builtin_arm_lane_check (8, __idx);	\
-      __vec[__idx] = __elem;			\
-      __vec;					\
-    })
+  ({						\
+    float16_t __elem = (__e);			\
+    float16x8_t __vec = (__v);			\
+    __builtin_arm_lane_check (8, __idx);	\
+    __vec[__arm_laneq (__vec, __idx)] = __elem;	\
+    __vec;					\
+  })
 #endif
 
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 6a54125..88e1c3d 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -99,7 +99,7 @@
 (define_mode_iterator VQI [V16QI V8HI V4SI])
 
 ;; Quad-width vector modes, with TImode added, for moves.
-(define_mode_iterator VQXMOV [V16QI V8HI V4SI V4SF V2DI TI])
+(define_mode_iterator VQXMOV [V16QI V8HI V8HF V4SI V4SF V2DI TI])
 
 ;; Opaque structure types wider than TImode.
 (define_mode_iterator VSTRUCT [EI OI CI XI])
@@ -114,7 +114,7 @@
 (define_mode_iterator VN [V8HI V4SI V2DI])
 
 ;; All supported vector modes (except singleton DImode).
-(define_mode_iterator VDQ [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF V2DI])
+(define_mode_iterator VDQ [V8QI V16QI V4HI V8HI V2SI V4SI V4HF V8HF V2SF V4SF V2DI])
 
 ;; All supported vector modes (except those with 64-bit integer elements).
 (define_mode_iterator VDQW [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF])
@@ -424,6 +424,7 @@
 ;; Register width from element mode
 (define_mode_attr V_reg [(V8QI "P") (V16QI "q")
                          (V4HI "P") (V8HI  "q")
+                         (V4HF "P") (V8HF  "q")
                          (V2SI "P") (V4SI  "q")
                          (V2SF "P") (V4SF  "q")
                          (DI   "P") (V2DI  "q")
@@ -572,6 +573,7 @@
 (define_mode_attr Is_float_mode [(V8QI "false") (V16QI "false")
                  (V4HI "false") (V8HI "false")
                  (V2SI "false") (V4SI "false")
+                 (V4HF "true") (V8HF "true")
                  (V2SF "true") (V4SF "true")
                  (DI "false") (V2DI "false")])
 
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 62fb6da..5e0aed2 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -137,6 +137,30 @@
     }
 })
 
+(define_expand "movv4hf"
+  [(set (match_operand:V4HF 0 "s_register_operand")
+	(match_operand:V4HF 1 "s_register_operand"))]
+  "TARGET_NEON && TARGET_FP16"
+{
+  if (can_create_pseudo_p ())
+    {
+      if (!REG_P (operands[0]))
+	operands[1] = force_reg (V4HFmode, operands[1]);
+    }
+})
+
+(define_expand "movv8hf"
+  [(set (match_operand:V8HF 0 "")
+	(match_operand:V8HF 1 ""))]
+  "TARGET_NEON && TARGET_FP16"
+{
+  if (can_create_pseudo_p ())
+    {
+      if (!REG_P (operands[0]))
+	operands[1] = force_reg (V8HFmode, operands[1]);
+    }
+})
+
 (define_insn "*neon_mov<mode>"
   [(set (match_operand:VSTRUCT 0 "nonimmediate_operand"	"=w,Ut,w")
 	(match_operand:VSTRUCT 1 "general_operand"	" w,w, Ut"))]
@@ -299,11 +323,11 @@
   [(set_attr "type" "neon_load1_1reg<q>")])
 
 (define_insn "vec_set<mode>_internal"
-  [(set (match_operand:VD 0 "s_register_operand" "=w,w")
-        (vec_merge:VD
-          (vec_duplicate:VD
+  [(set (match_operand:VD_LANE 0 "s_register_operand" "=w,w")
+        (vec_merge:VD_LANE
+          (vec_duplicate:VD_LANE
             (match_operand:<V_elem> 1 "nonimmediate_operand" "Um,r"))
-          (match_operand:VD 3 "s_register_operand" "0,0")
+          (match_operand:VD_LANE 3 "s_register_operand" "0,0")
           (match_operand:SI 2 "immediate_operand" "i,i")))]
   "TARGET_NEON"
 {
@@ -385,7 +409,7 @@
 (define_insn "vec_extract<mode>"
   [(set (match_operand:<V_elem> 0 "nonimmediate_operand" "=Um,r")
         (vec_select:<V_elem>
-          (match_operand:VD 1 "s_register_operand" "w,w")
+          (match_operand:VD_LANE 1 "s_register_operand" "w,w")
           (parallel [(match_operand:SI 2 "immediate_operand" "i,i")])))]
   "TARGET_NEON"
 {
@@ -2806,6 +2830,22 @@ if (BYTES_BIG_ENDIAN)
   [(set_attr "type" "neon_from_gp<q>")]
 )
 
+(define_insn "neon_vdup_nv4hf"
+  [(set (match_operand:V4HF 0 "s_register_operand" "=w")
+        (vec_duplicate:V4HF (match_operand:HF 1 "s_register_operand" "r")))]
+  "TARGET_NEON"
+  "vdup.16\t%P0, %1"
+  [(set_attr "type" "neon_from_gp")]
+)
+
+(define_insn "neon_vdup_nv8hf"
+  [(set (match_operand:V8HF 0 "s_register_operand" "=w")
+        (vec_duplicate:V8HF (match_operand:HF 1 "s_register_operand" "r")))]
+  "TARGET_NEON"
+  "vdup.16\t%q0, %1"
+  [(set_attr "type" "neon_from_gp_q")]
+)
+
 (define_insn "neon_vdup_n<mode>"
   [(set (match_operand:V32 0 "s_register_operand" "=w,w")
         (vec_duplicate:V32 (match_operand:<V_elem> 1 "s_register_operand" "r,t")))]
@@ -4305,8 +4345,8 @@ if (BYTES_BIG_ENDIAN)
 )
 
 (define_insn "neon_vld1_dup<mode>"
-  [(set (match_operand:VD 0 "s_register_operand" "=w")
-        (vec_duplicate:VD (match_operand:<V_elem> 1 "neon_struct_operand" "Um")))]
+  [(set (match_operand:VD_LANE 0 "s_register_operand" "=w")
+        (vec_duplicate:VD_LANE (match_operand:<V_elem> 1 "neon_struct_operand" "Um")))]
   "TARGET_NEON"
   "vld1.<V_sz_elem>\t{%P0[]}, %A1"
   [(set_attr "type" "neon_load1_all_lanes<q>")]
@@ -4322,8 +4362,8 @@ if (BYTES_BIG_ENDIAN)
 )
 
 (define_insn "neon_vld1_dup<mode>"
-  [(set (match_operand:VQ 0 "s_register_operand" "=w")
-        (vec_duplicate:VQ (match_operand:<V_elem> 1 "neon_struct_operand" "Um")))]
+  [(set (match_operand:VQ2 0 "s_register_operand" "=w")
+        (vec_duplicate:VQ2 (match_operand:<V_elem> 1 "neon_struct_operand" "Um")))]
   "TARGET_NEON"
 {
   return "vld1.<V_sz_elem>\t{%e0[], %f0[]}, %A1";
diff --git a/gcc/testsuite/gcc.target/arm/pr68620.c b/gcc/testsuite/gcc.target/arm/pr68620.c
new file mode 100644
index 0000000..984992f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/pr68620.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_fp_ok } */
+/* { dg-options "-mfp16-format=ieee" } */
+/* { dg-add-options arm_fp } */
+
+#include "arm_neon.h"
+
+float16x4_t __attribute__((target("fpu=neon-fp16")))
+foo (float32x4_t arg)
+{
+    return vcvt_f16_f32 (arg);
+}


* Re: [PATCH] ARM PR68620 (ICE with FP16 on armeb)
  2016-01-20 21:10       ` Christophe Lyon
@ 2016-01-22 17:07         ` Alan Lawrence
  2016-01-25 14:31           ` Christophe Lyon
  2016-01-26 13:20         ` Kyrill Tkachov
  1 sibling, 1 reply; 9+ messages in thread
From: Alan Lawrence @ 2016-01-22 17:07 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: gcc-patches

On 20/01/16 21:10, Christophe Lyon wrote:
> On 19 January 2016 at 15:51, Alan Lawrence <alan.lawrence@foss.arm.com> wrote:
>> On 19/01/16 11:15, Christophe Lyon wrote:
>>
>>>>> For neon_vdupn, I chose to implement neon_vdup_nv4hf and
>>>>> neon_vdup_nv8hf instead of updating the VX iterator because I thought
>>>>> it was not desirable to impact neon_vrev32<mode>.
>>>>
>>>>
>>>> Well, the same instruction will suffice for vrev32'ing vectors of HF just
>>>> as
>>>> well as vectors of HI, so I think I'd argue that's harmless enough. To
>>>> gain the
>>>> benefit, we'd need to update arm_evpc_neon_vrev with a few new cases,
>>>> though.
>>>>
>>> Since this is more intrusive, I'd rather leave that part for later. OK?
>>
>>
>> Sure.
>>
>>>>> +#ifdef __ARM_BIG_ENDIAN
>>>>> +  /* Here, 3 is (4-1) where 4 is the number of lanes. This is also the
>>>>> +     right value for vectors with 8 lanes.  */
>>>>> +#define __arm_lane(__vec, __idx) (__idx ^ 3)
>>>>> +#else
>>>>> +#define __arm_lane(__vec, __idx) __idx
>>>>> +#endif
>>>>> +
>>>>
>>>>
>>>> Looks right, but sounds... my concern here is that I'm hoping at some
>>>> point we
>>>> will move the *other* vget/set_lane intrinsics to use GCC vector
>>>> extensions
>>>> too. At which time (unlike __aarch64_lane which can be used everywhere)
>>>> this
>>>> will be the wrong formula. Can we name (and/or comment) it to avoid
>>>> misleading
>>>> anyone? The key characteristic seems to be that it is for vectors of
>>>> 16-bit
>>>> elements only.
>>>>
>>> I'm not sure I follow here. Looking at the patterns for
>>> neon_vget_lane<mode>_*internal in neon.md,
>>> I can see 2 flavours: one for VD, one for VQ2. The latter uses "halfelts".
>>>
>>> Do you prefer that I create 2 macros (say __arm_lane and __arm_laneq),
>>> that would be similar to the aarch64 ones (by computing the number of
>>> lanes of the input vector), but the "q" one would use half the total
>>> number of lanes instead?
>>
>>
>> That works for me! Sthg like:
>>
>> #define __arm_lane(__vec, __idx) NUM_LANES(__vec) - __idx
>> #define __arm_laneq(__vec, __idx) (__idx & (NUM_LANES(__vec)/2)) + (NUM_LANES(__vec)/2 - __idx)
>> //or similarly
>> #define __arm_laneq(__vec, __idx) (__idx ^ (NUM_LANES(__vec)/2 - 1))
>>
>> Alternatively I'd been thinking
>>
>> #define __arm_lane_32xN(__idx) __idx ^ 1
>> #define __arm_lane_16xN(__idx) __idx ^ 3
>> #define __arm_lane_8xN(__idx) __idx ^ 7
>>
>> Bear in mind PR64893 that we had on AArch64 :-(
>>
>
> Here is a new version, based on the comments above.
> I've also removed the addition of arm_fp_ok effective target since I
> added that in my other testsuite patch.
>
> OK now?

Yes. I realize my worry about PR64893 doesn't apply here since we pass constant 
lane numbers / vector bounds into __builtin_arm_lane_check. Thanks!
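
(For instance - a made-up example, not from the patch - the lane index
reaching __builtin_arm_lane_check is a literal constant, so an
out-of-range lane gets a compile-time error rather than a bad address
at run time; assumes -mfp16-format=ieee:)

#include <arm_neon.h>

float16_t
bad_lane (float16x4_t v)
{
  /* 7 is out of range for a 4-lane vector: expanded to
     __builtin_arm_lane_check (4, 7), which should be rejected at
     compile time.  */
  return vget_lane_f16 (v, 7);
}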

--Alan

>
> Thanks,
>
> Christophe
>
>> Cheers, Alan


* Re: [PATCH] ARM PR68620 (ICE with FP16 on armeb)
  2016-01-22 17:07         ` Alan Lawrence
@ 2016-01-25 14:31           ` Christophe Lyon
  0 siblings, 0 replies; 9+ messages in thread
From: Christophe Lyon @ 2016-01-25 14:31 UTC (permalink / raw)
  To: Alan Lawrence; +Cc: gcc-patches

On 22 January 2016 at 18:06, Alan Lawrence <alan.lawrence@foss.arm.com> wrote:
> On 20/01/16 21:10, Christophe Lyon wrote:
>>
>> On 19 January 2016 at 15:51, Alan Lawrence <alan.lawrence@foss.arm.com>
>> wrote:
>>>
>>> On 19/01/16 11:15, Christophe Lyon wrote:
>>>
>>>>>> For neon_vdupn, I chose to implement neon_vdup_nv4hf and
>>>>>> neon_vdup_nv8hf instead of updating the VX iterator because I thought
>>>>>> it was not desirable to impact neon_vrev32<mode>.
>>>>>
>>>>>
>>>>>
>>>>> Well, the same instruction will suffice for vrev32'ing vectors of HF
>>>>> just
>>>>> as
>>>>> well as vectors of HI, so I think I'd argue that's harmless enough. To
>>>>> gain the
>>>>> benefit, we'd need to update arm_evpc_neon_vrev with a few new cases,
>>>>> though.
>>>>>
>>>> Since this is more intrusive, I'd rather leave that part for later. OK?
>>>
>>>
>>>
>>> Sure.
>>>
>>>>>> +#ifdef __ARM_BIG_ENDIAN
>>>>>> +  /* Here, 3 is (4-1) where 4 is the number of lanes. This is also
>>>>>> the
>>>>>> +     right value for vectors with 8 lanes.  */
>>>>>> +#define __arm_lane(__vec, __idx) (__idx ^ 3)
>>>>>> +#else
>>>>>> +#define __arm_lane(__vec, __idx) __idx
>>>>>> +#endif
>>>>>> +
>>>>>
>>>>>
>>>>>
>>>>> Looks right, but sounds... my concern here is that I'm hoping at some
>>>>> point we
>>>>> will move the *other* vget/set_lane intrinsics to use GCC vector
>>>>> extensions
>>>>> too. At which time (unlike __aarch64_lane which can be used everywhere)
>>>>> this
>>>>> will be the wrong formula. Can we name (and/or comment) it to avoid
>>>>> misleading
>>>>> anyone? The key characteristic seems to be that it is for vectors of
>>>>> 16-bit
>>>>> elements only.
>>>>>
>>>> I'm not sure I follow here. Looking at the patterns for
>>>> neon_vget_lane<mode>_*internal in neon.md,
>>>> I can see 2 flavours: one for VD, one for VQ2. The latter uses
>>>> "halfelts".
>>>>
>>>> Do you prefer that I create 2 macros (say __arm_lane and __arm_laneq),
>>>> that would be similar to the aarch64 ones (by computing the number of
>>>> lanes of the input vector), but the "q" one would use half the total
>>>> number of lanes instead?
>>>
>>>
>>>
>>> That works for me! Sthg like:
>>>
>>> #define __arm_lane(__vec, __idx) NUM_LANES(__vec) - __idx
>>> #define __arm_laneq(__vec, __idx) (__idx & (NUM_LANES(__vec)/2)) + (NUM_LANES(__vec)/2 - __idx)
>>> //or similarly
>>> #define __arm_laneq(__vec, __idx) (__idx ^ (NUM_LANES(__vec)/2 - 1))
>>>
>>> Alternatively I'd been thinking
>>>
>>> #define __arm_lane_32xN(__idx) __idx ^ 1
>>> #define __arm_lane_16xN(__idx) __idx ^ 3
>>> #define __arm_lane_8xN(__idx) __idx ^ 7
>>>
>>> Bear in mind PR64893 that we had on AArch64 :-(
>>>
>>
>> Here is a new version, based on the comments above.
>> I've also removed the addition of arm_fp_ok effective target since I
>> added that in my other testsuite patch.
>>
>> OK now?
>
>
> Yes. I realize my worry about PR64893 doesn't apply here since we pass
> constant lane numbers / vector bounds into __builtin_arm_lane_check. Thanks!
>

Thanks, I guess I still have to wait for Kyrill/Ramana's approval.

> --Alan
>
>>
>> Thanks,
>>
>> Christophe
>>
>>> Cheers, Alan
>
>


* Re: [PATCH] ARM PR68620 (ICE with FP16 on armeb)
  2016-01-20 21:10       ` Christophe Lyon
  2016-01-22 17:07         ` Alan Lawrence
@ 2016-01-26 13:20         ` Kyrill Tkachov
  2016-01-26 15:18           ` Christophe Lyon
  1 sibling, 1 reply; 9+ messages in thread
From: Kyrill Tkachov @ 2016-01-26 13:20 UTC (permalink / raw)
  To: Christophe Lyon, Alan Lawrence; +Cc: gcc-patches

Hi Christophe,

On 20/01/16 21:10, Christophe Lyon wrote:
> On 19 January 2016 at 15:51, Alan Lawrence <alan.lawrence@foss.arm.com> wrote:
>> On 19/01/16 11:15, Christophe Lyon wrote:
>>
>>>>> For neon_vdupn, I chose to implement neon_vdup_nv4hf and
>>>>> neon_vdup_nv8hf instead of updating the VX iterator because I thought
>>>>> it was not desirable to impact neon_vrev32<mode>.
>>>>
>>>> Well, the same instruction will suffice for vrev32'ing vectors of HF just
>>>> as
>>>> well as vectors of HI, so I think I'd argue that's harmless enough. To
>>>> gain the
>>>> benefit, we'd need to update arm_evpc_neon_vrev with a few new cases,
>>>> though.
>>>>
>>> Since this is more intrusive, I'd rather leave that part for later. OK?
>>
>> Sure.
>>
>>>>> +#ifdef __ARM_BIG_ENDIAN
>>>>> +  /* Here, 3 is (4-1) where 4 is the number of lanes. This is also the
>>>>> +     right value for vectors with 8 lanes.  */
>>>>> +#define __arm_lane(__vec, __idx) (__idx ^ 3)
>>>>> +#else
>>>>> +#define __arm_lane(__vec, __idx) __idx
>>>>> +#endif
>>>>> +
>>>>
>>>> Looks right, but sounds... my concern here is that I'm hoping at some
>>>> point we
>>>> will move the *other* vget/set_lane intrinsics to use GCC vector
>>>> extensions
>>>> too. At which time (unlike __aarch64_lane which can be used everywhere)
>>>> this
>>>> will be the wrong formula. Can we name (and/or comment) it to avoid
>>>> misleading
>>>> anyone? The key characteristic seems to be that it is for vectors of
>>>> 16-bit
>>>> elements only.
>>>>
>>> I'm not sure I follow here. Looking at the patterns for
>>> neon_vget_lane<mode>_*internal in neon.md,
>>> I can see 2 flavours: one for VD, one for VQ2. The latter uses "halfelts".
>>>
>>> Do you prefer that I create 2 macros (say __arm_lane and __arm_laneq),
>>> that would be similar to the aarch64 ones (by computing the number of
>>> lanes of the input vector), but the "q" one would use half the total
>>> number of lanes instead?
>>
>> That works for me! Sthg like:
>>
>> #define __arm_lane(__vec, __idx) NUM_LANES(__vec) - __idx
>> #define __arm_laneq(__vec, __idx) (__idx & (NUM_LANES(__vec)/2)) + (NUM_LANES(__vec)/2 - __idx)
>> //or similarly
>> #define __arm_laneq(__vec, __idx) (__idx ^ (NUM_LANES(__vec)/2 - 1))
>>
>> Alternatively I'd been thinking
>>
>> #define __arm_lane_32xN(__idx) __idx ^ 1
>> #define __arm_lane_16xN(__idx) __idx ^ 3
>> #define __arm_lane_8xN(__idx) __idx ^ 7
>>
>> Bear in mind PR64893 that we had on AArch64 :-(
>>
> Here is a new version, based on the comments above.
> I've also removed the addition of arm_fp_ok effective target since I
> added that in my other testsuite patch.
>
> OK now?
>
> Thanks,
>
> Christophe
>

diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 3588b83..b1f408c 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -12370,6 +12370,10 @@ neon_valid_immediate (rtx op, machine_mode mode, int inverse,
        if (!vfp3_const_double_rtx (el0) && el0 != CONST0_RTX (GET_MODE (el0)))
          return -1;
  
+      /* FP16 vectors cannot be represented.  */
+      if (innersize == 2)
+	return -1;
+
        r0 = CONST_DOUBLE_REAL_VALUE (el0);


I think it'd be clearer to write "if (GET_MODE_INNER (mode) == HFmode)"

+(define_expand "movv4hf"
+  [(set (match_operand:V4HF 0 "s_register_operand")
+	(match_operand:V4HF 1 "s_register_operand"))]
+  "TARGET_NEON && TARGET_FP16"
+{
+  if (can_create_pseudo_p ())
+    {
+      if (!REG_P (operands[0]))
+	operands[1] = force_reg (V4HFmode, operands[1]);
+    }
+})

Can you please add a comment saying why you need the force_reg here?
IIRC it's because of CANNOT_CHANGE_MODE_CLASS on big-endian that causes an
ICE during expand with subregs.

I've tried this patch out and it does indeed fix the ICE on armeb.
So ok for trunk with the changes above.
Thanks,
Kyrill



* Re: [PATCH] ARM PR68620 (ICE with FP16 on armeb)
  2016-01-26 13:20         ` Kyrill Tkachov
@ 2016-01-26 15:18           ` Christophe Lyon
  0 siblings, 0 replies; 9+ messages in thread
From: Christophe Lyon @ 2016-01-26 15:18 UTC (permalink / raw)
  To: Kyrill Tkachov; +Cc: Alan Lawrence, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 4181 bytes --]

On 26 January 2016 at 14:20, Kyrill Tkachov <kyrylo.tkachov@foss.arm.com> wrote:
> Hi Christophe,
>
> On 20/01/16 21:10, Christophe Lyon wrote:
>>
>> On 19 January 2016 at 15:51, Alan Lawrence <alan.lawrence@foss.arm.com>
>> wrote:
>>>
>>> On 19/01/16 11:15, Christophe Lyon wrote:
>>>
>>>>>> For neon_vdupn, I chose to implement neon_vdup_nv4hf and
>>>>>> neon_vdup_nv8hf instead of updating the VX iterator because I thought
>>>>>> it was not desirable to impact neon_vrev32<mode>.
>>>>>
>>>>>
>>>>> Well, the same instruction will suffice for vrev32'ing vectors of HF
>>>>> just
>>>>> as
>>>>> well as vectors of HI, so I think I'd argue that's harmless enough. To
>>>>> gain the
>>>>> benefit, we'd need to update arm_evpc_neon_vrev with a few new cases,
>>>>> though.
>>>>>
>>>> Since this is more intrusive, I'd rather leave that part for later. OK?
>>>
>>>
>>> Sure.
>>>
>>>>>> +#ifdef __ARM_BIG_ENDIAN
>>>>>> +  /* Here, 3 is (4-1) where 4 is the number of lanes. This is also
>>>>>> the
>>>>>> +     right value for vectors with 8 lanes.  */
>>>>>> +#define __arm_lane(__vec, __idx) (__idx ^ 3)
>>>>>> +#else
>>>>>> +#define __arm_lane(__vec, __idx) __idx
>>>>>> +#endif
>>>>>> +
>>>>>
>>>>>
>>>>> Looks right, but sounds... my concern here is that I'm hoping at some
>>>>> point we
>>>>> will move the *other* vget/set_lane intrinsics to use GCC vector
>>>>> extensions
>>>>> too. At which time (unlike __aarch64_lane which can be used everywhere)
>>>>> this
>>>>> will be the wrong formula. Can we name (and/or comment) it to avoid
>>>>> misleading
>>>>> anyone? The key characteristic seems to be that it is for vectors of
>>>>> 16-bit
>>>>> elements only.
>>>>>
>>>> I'm not sure I follow here. Looking at the patterns for
>>>> neon_vget_lane<mode>_*internal in neon.md,
>>>> I can see 2 flavours: one for VD, one for VQ2. The latter uses
>>>> "halfelts".
>>>>
>>>> Do you prefer that I create 2 macros (say __arm_lane and __arm_laneq),
>>>> that would be similar to the aarch64 ones (by computing the number of
>>>> lanes of the input vector), but the "q" one would use half the total
>>>> number of lanes instead?
>>>
>>>
>>> That works for me! Sthg like:
>>>
>>> #define __arm_lane(__vec, __idx) NUM_LANES(__vec) - __idx
>>> #define __arm_laneq(__vec, __idx) (__idx & (NUM_LANES(__vec)/2)) + (NUM_LANES(__vec)/2 - __idx)
>>> //or similarly
>>> #define __arm_laneq(__vec, __idx) (__idx ^ (NUM_LANES(__vec)/2 - 1))
>>>
>>> Alternatively I'd been thinking
>>>
>>> #define __arm_lane_32xN(__idx) __idx ^ 1
>>> #define __arm_lane_16xN(__idx) __idx ^ 3
>>> #define __arm_lane_8xN(__idx) __idx ^ 7
>>>
>>> Bear in mind PR64893 that we had on AArch64 :-(
>>>
>> Here is a new version, based on the comments above.
>> I've also removed the addition of arm_fp_ok effective target since I
>> added that in my other testsuite patch.
>>
>> OK now?
>>
>> Thanks,
>>
>> Christophe
>>
>
> diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
> index 3588b83..b1f408c 100644
> --- a/gcc/config/arm/arm.c
> +++ b/gcc/config/arm/arm.c
> @@ -12370,6 +12370,10 @@ neon_valid_immediate (rtx op, machine_mode mode,
> int inverse,
>        if (!vfp3_const_double_rtx (el0) && el0 != CONST0_RTX (GET_MODE
> (el0)))
>          return -1;
>  +      /* FP16 vectors cannot be represented.  */
> +      if (innersize == 2)
> +       return -1;
> +
>        r0 = CONST_DOUBLE_REAL_VALUE (el0);
>
>
> I think it'd be clearer to write "if (GET_MODE_INNER (mode) == HFmode)"
>
> +(define_expand "movv4hf"
> +  [(set (match_operand:V4HF 0 "s_register_operand")
> +       (match_operand:V4HF 1 "s_register_operand"))]
> +  "TARGET_NEON && TARGET_FP16"
> +{
> +  if (can_create_pseudo_p ())
> +    {
> +      if (!REG_P (operands[0]))
> +       operands[1] = force_reg (V4HFmode, operands[1]);
> +    }
> +})
>
> Can you please add a comment saying why you need the force_reg here?
> IIRC it's because of CANNOT_CHANGE_MODE_CLASS on big-endian that causes an
> ICE during expand with subregs.
>
> I've tried this patch out and it does indeed fix the ICE on armeb.
> So ok for trunk with the changes above.
> Thanks,
> Kyrill
>
>

OK thanks, here is what I have committed (r232832).


Christophe.

[-- Attachment #2: pr68620.final.txt --]
[-- Type: text/plain, Size: 10018 bytes --]

Index: gcc/config/arm/arm.c
===================================================================
--- gcc/config/arm/arm.c	(revision 232831)
+++ gcc/config/arm/arm.c	(working copy)
@@ -12381,6 +12381,10 @@
       if (!vfp3_const_double_rtx (el0) && el0 != CONST0_RTX (GET_MODE (el0)))
         return -1;
 
+      /* FP16 vectors cannot be represented.  */
+      if (GET_MODE_INNER (mode) == HFmode)
+	return -1;
+
       r0 = CONST_DOUBLE_REAL_VALUE (el0);
 
       for (i = 1; i < n_elts; i++)
Index: gcc/config/arm/arm_neon.h
===================================================================
--- gcc/config/arm/arm_neon.h	(revision 232831)
+++ gcc/config/arm/arm_neon.h	(working copy)
@@ -5302,16 +5302,28 @@
    were marked always-inline so there were no call sites, the declaration
    would nonetheless raise an error.  Hence, we must use a macro instead.  */
 
-#define vget_lane_f16(__v, __idx)		\
-  __extension__					\
-    ({						\
-      float16x4_t __vec = (__v);		\
-      __builtin_arm_lane_check (4, __idx);	\
-      float16_t __res = __vec[__idx];		\
-      __res;					\
-    })
+  /* For big-endian, GCC's vector indices are reversed within each 64
+     bits compared to the architectural lane indices used by Neon
+     intrinsics.  */
+#ifdef __ARM_BIG_ENDIAN
+#define __ARM_NUM_LANES(__v) (sizeof (__v) / sizeof (__v[0]))
+#define __arm_lane(__vec, __idx) (__idx ^ (__ARM_NUM_LANES(__vec) - 1))
+#define __arm_laneq(__vec, __idx) (__idx ^ (__ARM_NUM_LANES(__vec)/2 - 1))
+#else
+#define __arm_lane(__vec, __idx) __idx
+#define __arm_laneq(__vec, __idx) __idx
 #endif
 
+#define vget_lane_f16(__v, __idx)			\
+  __extension__						\
+  ({							\
+    float16x4_t __vec = (__v);				\
+    __builtin_arm_lane_check (4, __idx);		\
+    float16_t __res = __vec[__arm_lane(__vec, __idx)];	\
+    __res;						\
+  })
+#endif
+
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vget_lane_f32 (float32x2_t __a, const int __b)
 {
@@ -5379,14 +5391,14 @@
 }
 
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-#define vgetq_lane_f16(__v, __idx)		\
-  __extension__					\
-    ({						\
-      float16x8_t __vec = (__v);		\
-      __builtin_arm_lane_check (8, __idx);	\
-      float16_t __res = __vec[__idx];		\
-      __res;					\
-    })
+#define vgetq_lane_f16(__v, __idx)			\
+  __extension__						\
+  ({							\
+    float16x8_t __vec = (__v);				\
+    __builtin_arm_lane_check (8, __idx);		\
+    float16_t __res = __vec[__arm_laneq(__vec, __idx)];	\
+    __res;						\
+  })
 #endif
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
@@ -5458,13 +5470,13 @@
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
 #define vset_lane_f16(__e, __v, __idx)		\
   __extension__					\
-    ({						\
-      float16_t __elem = (__e);			\
-      float16x4_t __vec = (__v);		\
-      __builtin_arm_lane_check (4, __idx);	\
-      __vec[__idx] = __elem;			\
-      __vec;					\
-    })
+  ({						\
+    float16_t __elem = (__e);			\
+    float16x4_t __vec = (__v);			\
+    __builtin_arm_lane_check (4, __idx);	\
+    __vec[__arm_lane (__vec, __idx)] = __elem;	\
+    __vec;					\
+  })
 #endif
 
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
@@ -5536,13 +5548,13 @@
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
 #define vsetq_lane_f16(__e, __v, __idx)		\
   __extension__					\
-    ({						\
-      float16_t __elem = (__e);			\
-      float16x8_t __vec = (__v);		\
-      __builtin_arm_lane_check (8, __idx);	\
-      __vec[__idx] = __elem;			\
-      __vec;					\
-    })
+  ({						\
+    float16_t __elem = (__e);			\
+    float16x8_t __vec = (__v);			\
+    __builtin_arm_lane_check (8, __idx);	\
+    __vec[__arm_laneq (__vec, __idx)] = __elem;	\
+    __vec;					\
+  })
 #endif
 
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
Index: gcc/config/arm/iterators.md
===================================================================
--- gcc/config/arm/iterators.md	(revision 232831)
+++ gcc/config/arm/iterators.md	(working copy)
@@ -99,7 +99,7 @@
 (define_mode_iterator VQI [V16QI V8HI V4SI])
 
 ;; Quad-width vector modes, with TImode added, for moves.
-(define_mode_iterator VQXMOV [V16QI V8HI V4SI V4SF V2DI TI])
+(define_mode_iterator VQXMOV [V16QI V8HI V8HF V4SI V4SF V2DI TI])
 
 ;; Opaque structure types wider than TImode.
 (define_mode_iterator VSTRUCT [EI OI CI XI])
@@ -114,7 +114,7 @@
 (define_mode_iterator VN [V8HI V4SI V2DI])
 
 ;; All supported vector modes (except singleton DImode).
-(define_mode_iterator VDQ [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF V2DI])
+(define_mode_iterator VDQ [V8QI V16QI V4HI V8HI V2SI V4SI V4HF V8HF V2SF V4SF V2DI])
 
 ;; All supported vector modes (except those with 64-bit integer elements).
 (define_mode_iterator VDQW [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF])
@@ -428,6 +428,7 @@
 ;; Register width from element mode
 (define_mode_attr V_reg [(V8QI "P") (V16QI "q")
                          (V4HI "P") (V8HI  "q")
+                         (V4HF "P") (V8HF  "q")
                          (V2SI "P") (V4SI  "q")
                          (V2SF "P") (V4SF  "q")
                          (DI   "P") (V2DI  "q")
@@ -576,6 +577,7 @@
 (define_mode_attr Is_float_mode [(V8QI "false") (V16QI "false")
                  (V4HI "false") (V8HI "false")
                  (V2SI "false") (V4SI "false")
+                 (V4HF "true") (V8HF "true")
                  (V2SF "true") (V4SF "true")
                  (DI "false") (V2DI "false")])
 
Index: gcc/config/arm/neon.md
===================================================================
--- gcc/config/arm/neon.md	(revision 232831)
+++ gcc/config/arm/neon.md	(working copy)
@@ -137,6 +137,36 @@
     }
 })
 
+(define_expand "movv4hf"
+  [(set (match_operand:V4HF 0 "s_register_operand")
+	(match_operand:V4HF 1 "s_register_operand"))]
+  "TARGET_NEON && TARGET_FP16"
+{
+  /* We need to use force_reg to avoid CANNOT_CHANGE_MODE_CLASS
+     causing an ICE on big-endian because it cannot extract subregs in
+     this case.  */
+  if (can_create_pseudo_p ())
+    {
+      if (!REG_P (operands[0]))
+	operands[1] = force_reg (V4HFmode, operands[1]);
+    }
+})
+
+(define_expand "movv8hf"
+  [(set (match_operand:V8HF 0 "")
+	(match_operand:V8HF 1 ""))]
+  "TARGET_NEON && TARGET_FP16"
+{ 
+  /* We need to use force_reg to avoid CANNOT_CHANGE_MODE_CLASS
+     causing an ICE on big-endian because it cannot extract subregs in
+     this case.  */
+  if (can_create_pseudo_p ())
+    {
+      if (!REG_P (operands[0]))
+	operands[1] = force_reg (V8HFmode, operands[1]);
+    }
+})
+
 (define_insn "*neon_mov<mode>"
   [(set (match_operand:VSTRUCT 0 "nonimmediate_operand"	"=w,Ut,w")
 	(match_operand:VSTRUCT 1 "general_operand"	" w,w, Ut"))]
@@ -299,11 +329,11 @@
   [(set_attr "type" "neon_load1_1reg<q>")])
 
 (define_insn "vec_set<mode>_internal"
-  [(set (match_operand:VD 0 "s_register_operand" "=w,w")
-        (vec_merge:VD
-          (vec_duplicate:VD
+  [(set (match_operand:VD_LANE 0 "s_register_operand" "=w,w")
+        (vec_merge:VD_LANE
+          (vec_duplicate:VD_LANE
             (match_operand:<V_elem> 1 "nonimmediate_operand" "Um,r"))
-          (match_operand:VD 3 "s_register_operand" "0,0")
+          (match_operand:VD_LANE 3 "s_register_operand" "0,0")
           (match_operand:SI 2 "immediate_operand" "i,i")))]
   "TARGET_NEON"
 {
@@ -385,7 +415,7 @@
 (define_insn "vec_extract<mode>"
   [(set (match_operand:<V_elem> 0 "nonimmediate_operand" "=Um,r")
         (vec_select:<V_elem>
-          (match_operand:VD 1 "s_register_operand" "w,w")
+          (match_operand:VD_LANE 1 "s_register_operand" "w,w")
           (parallel [(match_operand:SI 2 "immediate_operand" "i,i")])))]
   "TARGET_NEON"
 {
@@ -2829,6 +2859,22 @@
   [(set_attr "type" "neon_from_gp<q>")]
 )
 
+(define_insn "neon_vdup_nv4hf"
+  [(set (match_operand:V4HF 0 "s_register_operand" "=w")
+        (vec_duplicate:V4HF (match_operand:HF 1 "s_register_operand" "r")))]
+  "TARGET_NEON"
+  "vdup.16\t%P0, %1"
+  [(set_attr "type" "neon_from_gp")]
+)
+
+(define_insn "neon_vdup_nv8hf"
+  [(set (match_operand:V8HF 0 "s_register_operand" "=w")
+        (vec_duplicate:V8HF (match_operand:HF 1 "s_register_operand" "r")))]
+  "TARGET_NEON"
+  "vdup.16\t%q0, %1"
+  [(set_attr "type" "neon_from_gp_q")]
+)
+
 (define_insn "neon_vdup_n<mode>"
   [(set (match_operand:V32 0 "s_register_operand" "=w,w")
         (vec_duplicate:V32 (match_operand:<V_elem> 1 "s_register_operand" "r,t")))]
@@ -4361,8 +4407,8 @@
 )
 
 (define_insn "neon_vld1_dup<mode>"
-  [(set (match_operand:VD 0 "s_register_operand" "=w")
-        (vec_duplicate:VD (match_operand:<V_elem> 1 "neon_struct_operand" "Um")))]
+  [(set (match_operand:VD_LANE 0 "s_register_operand" "=w")
+        (vec_duplicate:VD_LANE (match_operand:<V_elem> 1 "neon_struct_operand" "Um")))]
   "TARGET_NEON"
   "vld1.<V_sz_elem>\t{%P0[]}, %A1"
   [(set_attr "type" "neon_load1_all_lanes<q>")]
@@ -4378,8 +4424,8 @@
 )
 
 (define_insn "neon_vld1_dup<mode>"
-  [(set (match_operand:VQ 0 "s_register_operand" "=w")
-        (vec_duplicate:VQ (match_operand:<V_elem> 1 "neon_struct_operand" "Um")))]
+  [(set (match_operand:VQ2 0 "s_register_operand" "=w")
+        (vec_duplicate:VQ2 (match_operand:<V_elem> 1 "neon_struct_operand" "Um")))]
   "TARGET_NEON"
 {
   return "vld1.<V_sz_elem>\t{%e0[], %f0[]}, %A1";
Index: gcc/testsuite/gcc.target/arm/pr68620.c
===================================================================
--- gcc/testsuite/gcc.target/arm/pr68620.c	(revision 0)
+++ gcc/testsuite/gcc.target/arm/pr68620.c	(working copy)
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_fp_ok } */
+/* { dg-options "-mfp16-format=ieee" } */
+/* { dg-add-options arm_fp } */
+
+#include "arm_neon.h"
+
+float16x4_t __attribute__((target("fpu=neon-fp16")))
+foo (float32x4_t arg)
+{
+    return vcvt_f16_f32 (arg);
+}
