public inbox for gcc-patches@gcc.gnu.org
* [AArch64_be] Fix vtbl[34] and vtbx4
@ 2015-09-15 16:25 Christophe Lyon
  2015-09-29 21:26 ` Christophe Lyon
  2015-10-07 15:09 ` James Greenhalgh
  0 siblings, 2 replies; 10+ messages in thread
From: Christophe Lyon @ 2015-09-15 16:25 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 225 bytes --]

This patch re-implements vtbl[34] and vtbx4 AdvSIMD intrinsics using
existing builtins, and fixes the behaviour on aarch64_be.
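
For context, here is a scalar sketch of the documented NEON semantics these
intrinsics implement (illustration only, not part of the patch; the helper
names are made up):

  #include <stdint.h>

  /* vtbl3: byte-wise lookup into 3 x 8 table bytes; an out-of-range
     index (>= 24) selects zero.  */
  static void ref_vtbl3 (uint8_t res[8], const uint8_t tab[24],
                         const uint8_t idx[8])
  {
    for (int i = 0; i < 8; i++)
      res[i] = (idx[i] < 24) ? tab[idx[i]] : 0;
  }

  /* vtbx4: byte-wise lookup into 4 x 8 table bytes; an out-of-range
     index (>= 32) keeps the corresponding byte of the 'r' operand.  */
  static void ref_vtbx4 (uint8_t res[8], const uint8_t r[8],
                         const uint8_t tab[32], const uint8_t idx[8])
  {
    for (int i = 0; i < 8; i++)
      res[i] = (idx[i] < 32) ? tab[idx[i]] : r[i];
  }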

Tested on aarch64_be-none-elf and aarch64-none-elf using the Foundation Model.

OK?

Christophe.

[-- Attachment #2: vtbX.txt --]
[-- Type: text/plain, Size: 542 bytes --]

2015-09-15  Christophe Lyon  <christophe.lyon@linaro.org>

	* config/aarch64/aarch64-builtins.c
	(aarch64_types_tbl_qualifiers): New static data.
	(TYPES_TBL): Define.
	* config/aarch64/aarch64-simd-builtins.def: Update builtins
	tables.
	* config/aarch64/aarch64-simd.md (aarch64_tbl3v8qi): New.
	* config/aarch64/arm_neon.h (vtbl3_s8, vtbl3_u8, vtbl3_p8)
	(vtbl4_s8, vtbl4_u8, vtbl4_p8): Rewrite using builtin functions.
	(vtbx4_s8, vtbx4_u8, vtbx4_p8): Emulate behaviour using other
	intrinsics.
	* config/aarch64/iterators.md (V8Q): New.

[-- Attachment #3: vtbX.patch --]
[-- Type: text/x-patch, Size: 9898 bytes --]

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 0f4f2b9..7ca3917 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -253,6 +253,11 @@ aarch64_types_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
       qualifier_none, qualifier_struct_load_store_lane_index };
 #define TYPES_STORESTRUCT_LANE (aarch64_types_storestruct_lane_qualifiers)
 
+static enum aarch64_type_qualifiers
+aarch64_types_tbl_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+  = { qualifier_none, qualifier_none, qualifier_none };
+#define TYPES_TBL (aarch64_types_tbl_qualifiers)
+
 #define CF0(N, X) CODE_FOR_aarch64_##N##X
 #define CF1(N, X) CODE_FOR_##N##X##1
 #define CF2(N, X) CODE_FOR_##N##X##2
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index d0f298a..62f1b13 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -405,3 +405,5 @@
   VAR1 (BINOPP, crypto_pmull, 0, di)
   VAR1 (BINOPP, crypto_pmull, 0, v2di)
 
+  /* Implemented by aarch64_tbl3v8qi.  */
+  BUILTIN_V8Q (TBL, tbl3, 0)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 9777418..84a61d5 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4716,6 +4714,16 @@
   [(set_attr "type" "neon_tbl2_q")]
 )
 
+(define_insn "aarch64_tbl3v8qi"
+  [(set (match_operand:V8QI 0 "register_operand" "=w")
+	(unspec:V8QI [(match_operand:OI 1 "register_operand" "w")
+		      (match_operand:V8QI 2 "register_operand" "w")]
+		      UNSPEC_TBL))]
+  "TARGET_SIMD"
+  "tbl\\t%S0.8b, {%S1.16b - %T1.16b}, %S2.8b"
+  [(set_attr "type" "neon_tbl3")]
+)
+
 (define_insn_and_split "aarch64_combinev16qi"
   [(set (match_operand:OI 0 "register_operand" "=w")
 	(unspec:OI [(match_operand:V16QI 1 "register_operand" "w")
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 87bbf6e..91704de 100644
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 6dfebe7..e8ee318 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -10902,13 +10902,14 @@ vtbl3_s8 (int8x8x3_t tab, int8x8_t idx)
 {
   int8x8_t result;
   int8x16x2_t temp;
+  __builtin_aarch64_simd_oi __o;
   temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
   temp.val[1] = vcombine_s8 (tab.val[2], vcreate_s8 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-	   "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[1], 1);
+  result = __builtin_aarch64_tbl3v8qi (__o, idx);
   return result;
 }
 
@@ -10917,13 +10918,14 @@ vtbl3_u8 (uint8x8x3_t tab, uint8x8_t idx)
 {
   uint8x8_t result;
   uint8x16x2_t temp;
+  __builtin_aarch64_simd_oi __o;
   temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
   temp.val[1] = vcombine_u8 (tab.val[2], vcreate_u8 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-	   "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[1], 1);
+  result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
   return result;
 }
 
@@ -10932,13 +10934,14 @@ vtbl3_p8 (poly8x8x3_t tab, uint8x8_t idx)
 {
   poly8x8_t result;
   poly8x16x2_t temp;
+  __builtin_aarch64_simd_oi __o;
   temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
   temp.val[1] = vcombine_p8 (tab.val[2], vcreate_p8 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-	   "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[1], 1);
+  result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
   return result;
 }
 
@@ -10947,13 +10950,14 @@ vtbl4_s8 (int8x8x4_t tab, int8x8_t idx)
 {
   int8x8_t result;
   int8x16x2_t temp;
+  __builtin_aarch64_simd_oi __o;
   temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
   temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]);
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-	   "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[1], 1);
+  result = __builtin_aarch64_tbl3v8qi (__o, idx);
   return result;
 }
 
@@ -10962,13 +10966,14 @@ vtbl4_u8 (uint8x8x4_t tab, uint8x8_t idx)
 {
   uint8x8_t result;
   uint8x16x2_t temp;
+  __builtin_aarch64_simd_oi __o;
   temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
   temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]);
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-	   "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[1], 1);
+  result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
   return result;
 }
 
@@ -10977,13 +10982,14 @@ vtbl4_p8 (poly8x8x4_t tab, uint8x8_t idx)
 {
   poly8x8_t result;
   poly8x16x2_t temp;
+  __builtin_aarch64_simd_oi __o;
   temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
   temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]);
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-	   "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[1], 1);
+  result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
   return result;
 }
 
@@ -11023,51 +11029,6 @@ vtbx2_p8 (poly8x8_t r, poly8x8x2_t tab, uint8x8_t idx)
   return result;
 }
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbx4_s8 (int8x8_t r, int8x8x4_t tab, int8x8_t idx)
-{
-  int8x8_t result = r;
-  int8x16x2_t temp;
-  temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
-  temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]);
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-	   "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "+w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
-  return result;
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbx4_u8 (uint8x8_t r, uint8x8x4_t tab, uint8x8_t idx)
-{
-  uint8x8_t result = r;
-  uint8x16x2_t temp;
-  temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
-  temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]);
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-	   "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "+w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
-  return result;
-}
-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbx4_p8 (poly8x8_t r, poly8x8x4_t tab, uint8x8_t idx)
-{
-  poly8x8_t result = r;
-  poly8x16x2_t temp;
-  temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
-  temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]);
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-	   "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "+w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
-  return result;
-}
-
 /* End of temporary inline asm.  */
 
 /* Start of optimal implementations in approved order.  */
@@ -23221,6 +23182,36 @@ vtbx3_p8 (poly8x8_t __r, poly8x8x3_t __tab, uint8x8_t __idx)
   return vbsl_p8 (__mask, __tbl, __r);
 }
 
+/* vtbx4  */
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vtbx4_s8 (int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx)
+{
+  uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx),
+			      vmov_n_u8 (32));
+  int8x8_t __tbl = vtbl4_s8 (__tab, __idx);
+
+  return vbsl_s8 (__mask, __tbl, __r);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtbx4_u8 (uint8x8_t __r, uint8x8x4_t __tab, uint8x8_t __idx)
+{
+  uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (32));
+  uint8x8_t __tbl = vtbl4_u8 (__tab, __idx);
+
+  return vbsl_u8 (__mask, __tbl, __r);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtbx4_p8 (poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx)
+{
+  uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (32));
+  poly8x8_t __tbl = vtbl4_p8 (__tab, __idx);
+
+  return vbsl_p8 (__mask, __tbl, __r);
+}
+
 /* vtrn */
 
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index b8a45d1..dfbd9cd 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -100,6 +100,8 @@
 ;; All modes.
 (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
 
+(define_mode_iterator V8Q [V8QI])
+
 ;; All vector modes and DI.
 (define_mode_iterator VALLDI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF DI])
 


* Re: [AArch64_be] Fix vtbl[34] and vtbx4
  2015-09-15 16:25 [AArch64_be] Fix vtbl[34] and vtbx4 Christophe Lyon
@ 2015-09-29 21:26 ` Christophe Lyon
  2015-10-07  9:24   ` Christophe Lyon
  2015-10-07 15:09 ` James Greenhalgh
  1 sibling, 1 reply; 10+ messages in thread
From: Christophe Lyon @ 2015-09-29 21:26 UTC (permalink / raw)
  To: gcc-patches

Ping?


On 15 September 2015 at 18:25, Christophe Lyon
<christophe.lyon@linaro.org> wrote:
> This patch re-implements vtbl[34] and vtbx4 AdvSIMD intrinsics using
> existing builtins, and fixes the behaviour on aarch64_be.
>
> Tested on aarch64_be-none-elf and aarch64-none-elf using the Foundation Model.
>
> OK?
>
> Christophe.


* Re: [AArch64_be] Fix vtbl[34] and vtbx4
  2015-09-29 21:26 ` Christophe Lyon
@ 2015-10-07  9:24   ` Christophe Lyon
  0 siblings, 0 replies; 10+ messages in thread
From: Christophe Lyon @ 2015-10-07  9:24 UTC (permalink / raw)
  To: gcc-patches

Ping?
https://gcc.gnu.org/ml/gcc-patches/2015-09/msg01096.html

On 29 September 2015 at 22:57, Christophe Lyon
<christophe.lyon@linaro.org> wrote:
> Ping?
>
>
> On 15 September 2015 at 18:25, Christophe Lyon
> <christophe.lyon@linaro.org> wrote:
>> This patch re-implements vtbl[34] and vtbx4 AdvSIMD intrinsics using
>> existing builtins, and fixes the behaviour on aarch64_be.
>>
>> Tested on aarch64_be-none-elf and aarch64-none-elf using the Foundation Model.
>>
>> OK?
>>
>> Christophe.


* Re: [AArch64_be] Fix vtbl[34] and vtbx4
  2015-09-15 16:25 [AArch64_be] Fix vtbl[34] and vtbx4 Christophe Lyon
  2015-09-29 21:26 ` Christophe Lyon
@ 2015-10-07 15:09 ` James Greenhalgh
  2015-10-07 20:07   ` Christophe Lyon
  1 sibling, 1 reply; 10+ messages in thread
From: James Greenhalgh @ 2015-10-07 15:09 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: gcc-patches

On Tue, Sep 15, 2015 at 05:25:25PM +0100, Christophe Lyon wrote:
> This patch re-implements vtbl[34] and vtbx4 AdvSIMD intrinsics using
> existing builtins, and fixes the behaviour on aarch64_be.
> 
> Tested on aarch64_be-none-elf and aarch64-none-elf using the Foundation Model.
> 
> OK?

Hi Christophe,

Sorry for the delay getting back to you; comments below.

> 2015-09-15  Christophe Lyon  <christophe.lyon@linaro.org>
> 
> 	* config/aarch64/aarch64-builtins.c
> 	(aarch64_types_tbl_qualifiers): New static data.
> 	(TYPES_TBL): Define.
> 	* config/aarch64/aarch64-simd-builtins.def: Update builtins
> 	tables.
> 	* config/aarch64/aarch64-simd.md (aarch64_tbl3v8qi): New.
> 	* config/aarch64/arm_neon.h (vtbl3_s8, vtbl3_u8, vtbl3_p8)
> 	(vtbl4_s8, vtbl4_u8, vtbl4_p8): Rewrite using builtin functions.
> 	(vtbx4_s8, vtbx4_u8, vtbx4_p8): Emulate behaviour using other
> 	intrinsics.
> 	* config/aarch64/iterators.md (V8Q): New.

> diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
> index 0f4f2b9..7ca3917 100644
> --- a/gcc/config/aarch64/aarch64-builtins.c
> +++ b/gcc/config/aarch64/aarch64-builtins.c
> @@ -253,6 +253,11 @@ aarch64_types_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
>        qualifier_none, qualifier_struct_load_store_lane_index };
>  #define TYPES_STORESTRUCT_LANE (aarch64_types_storestruct_lane_qualifiers)
>  
> +static enum aarch64_type_qualifiers
> +aarch64_types_tbl_qualifiers[SIMD_MAX_BUILTIN_ARGS]
> +  = { qualifier_none, qualifier_none, qualifier_none };
> +#define TYPES_TBL (aarch64_types_tbl_qualifiers)
> +

Do we need these? This looks like TYPES_BINOP (the predicate on the
instruction pattern will prevent the "qualifier_maybe_immediate" from
becoming a problem).
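
For comparison, TYPES_BINOP is backed by qualifiers of this shape (a sketch
of the existing entry in aarch64-builtins.c; check the file for the exact
text):

  static enum aarch64_type_qualifiers
  aarch64_types_binop_qualifiers[SIMD_MAX_BUILTIN_ARGS]
    = { qualifier_none, qualifier_none, qualifier_maybe_immediate };
  #define TYPES_BINOP (aarch64_types_binop_qualifiers)

i.e. it differs from the new TYPES_TBL only in the final
qualifier_maybe_immediate, which the register_operand predicate on
aarch64_tbl3v8qi renders harmless.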

>  #define CF0(N, X) CODE_FOR_aarch64_##N##X
>  #define CF1(N, X) CODE_FOR_##N##X##1
>  #define CF2(N, X) CODE_FOR_##N##X##2
> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
> index d0f298a..62f1b13 100644
> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
> @@ -405,3 +405,5 @@
>    VAR1 (BINOPP, crypto_pmull, 0, di)
>    VAR1 (BINOPP, crypto_pmull, 0, v2di)
>  
> +  /* Implemented by aarch64_tbl3v8qi.  */
> +  BUILTIN_V8Q (TBL, tbl3, 0)

This can be:

  VAR1 (BINOP, tbl3, 0, v8qi)

It would be good if we could eliminate the casts in arm_neon.h by also
defining a  "BINOPU" version of this, but I imagine that gets stuck on the
types accepted by __builtin_aarch64_set_qregoiv16qi - so don't worry about
making that change.
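
(If someone did want to try that, a BINOPU qualifier set would presumably
look something like the following — purely hypothetical, mirroring the
signed variant with qualifier_unsigned throughout:

  static enum aarch64_type_qualifiers
  aarch64_types_binopu_qualifiers[SIMD_MAX_BUILTIN_ARGS]
    = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned };
  #define TYPES_BINOPU (aarch64_types_binopu_qualifiers)

but, as noted above, the types accepted by
__builtin_aarch64_set_qregoiv16qi would still force casts.)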


> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 9777418..84a61d5 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -4716,6 +4714,16 @@
>    [(set_attr "type" "neon_tbl2_q")]
>  )
>  
> +(define_insn "aarch64_tbl3v8qi"
> +  [(set (match_operand:V8QI 0 "register_operand" "=w")
> +	(unspec:V8QI [(match_operand:OI 1 "register_operand" "w")
> +		      (match_operand:V8QI 2 "register_operand" "w")]
> +		      UNSPEC_TBL))]
> +  "TARGET_SIMD"
> +  "tbl\\t%S0.8b, {%S1.16b - %T1.16b}, %S2.8b"
> +  [(set_attr "type" "neon_tbl3")]
> +)
> +
>  (define_insn_and_split "aarch64_combinev16qi"
>    [(set (match_operand:OI 0 "register_operand" "=w")
>  	(unspec:OI [(match_operand:V16QI 1 "register_operand" "w")
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 87bbf6e..91704de 100644
> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> index 6dfebe7..e8ee318 100644
> --- a/gcc/config/aarch64/arm_neon.h
> +++ b/gcc/config/aarch64/arm_neon.h
>  /* End of temporary inline asm.  */
>  
>  /* Start of optimal implementations in approved order.  */
> @@ -23221,6 +23182,36 @@ vtbx3_p8 (poly8x8_t __r, poly8x8x3_t __tab, uint8x8_t __idx)
>    return vbsl_p8 (__mask, __tbl, __r);
>  }
>  
> +/* vtbx4  */
> +
> +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
> +vtbx4_s8 (int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx)
> +{
> +  uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx),
> +			      vmov_n_u8 (32));
> +  int8x8_t __tbl = vtbl4_s8 (__tab, __idx);
> +
> +  return vbsl_s8 (__mask, __tbl, __r);
> +}
> +
> +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
> +vtbx4_u8 (uint8x8_t __r, uint8x8x4_t __tab, uint8x8_t __idx)
> +{
> +  uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (32));
> +  uint8x8_t __tbl = vtbl4_u8 (__tab, __idx);
> +
> +  return vbsl_u8 (__mask, __tbl, __r);
> +}
> +
> +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
> +vtbx4_p8 (poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx)
> +{
> +  uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (32));
> +  poly8x8_t __tbl = vtbl4_p8 (__tab, __idx);
> +
> +  return vbsl_p8 (__mask, __tbl, __r);
> +}
> +

Why do we want this for vtbx4 rather than putting out a VTBX instruction
directly (as in the inline asm versions you replace)?

This sequence does make sense for vtbx3.

>  /* vtrn */
>  
>  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index b8a45d1..dfbd9cd 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -100,6 +100,8 @@
>  ;; All modes.
>  (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
>  
> +(define_mode_iterator V8Q [V8QI])
> +

This can be dropped if you use VAR1 in aarch64-builtins.c.

Thanks for working on this, with your patch applied, the only
remaining intrinsics I see failing for aarch64_be are:

  vqtbl2_*8
  vqtbl2q_*8
  vqtbl3_*8
  vqtbl3q_*8
  vqtbl4_*8
  vqtbl4q_*8

  vqtbx2_*8
  vqtbx2q_*8
  vqtbx3_*8
  vqtbx3q_*8
  vqtbx4_*8
  vqtbx4q_*8

Thanks,
James


* Re: [AArch64_be] Fix vtbl[34] and vtbx4
  2015-10-07 15:09 ` James Greenhalgh
@ 2015-10-07 20:07   ` Christophe Lyon
  2015-10-08  9:12     ` James Greenhalgh
  0 siblings, 1 reply; 10+ messages in thread
From: Christophe Lyon @ 2015-10-07 20:07 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: gcc-patches

On 7 October 2015 at 17:09, James Greenhalgh <james.greenhalgh@arm.com> wrote:
> On Tue, Sep 15, 2015 at 05:25:25PM +0100, Christophe Lyon wrote:
>> This patch re-implements vtbl[34] and vtbx4 AdvSIMD intrinsics using
>> existing builtins, and fixes the behaviour on aarch64_be.
>>
>> Tested on aarch64_be-none-elf and aarch64-none-elf using the Foundation Model.
>>
>> OK?
>
> Hi Christophe,
>
> Sorry for the delay getting back to you; comments below.
>
>> 2015-09-15  Christophe Lyon  <christophe.lyon@linaro.org>
>>
>>       * config/aarch64/aarch64-builtins.c
>>       (aarch64_types_tbl_qualifiers): New static data.
>>       (TYPES_TBL): Define.
>>       * config/aarch64/aarch64-simd-builtins.def: Update builtins
>>       tables.
>>       * config/aarch64/aarch64-simd.md (aarch64_tbl3v8qi): New.
>>       * config/aarch64/arm_neon.h (vtbl3_s8, vtbl3_u8, vtbl3_p8)
>>       (vtbl4_s8, vtbl4_u8, vtbl4_p8): Rewrite using builtin functions.
>>       (vtbx4_s8, vtbx4_u8, vtbx4_p8): Emulate behaviour using other
>>       intrinsics.
>>       * config/aarch64/iterators.md (V8Q): New.
>
>> diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
>> index 0f4f2b9..7ca3917 100644
>> --- a/gcc/config/aarch64/aarch64-builtins.c
>> +++ b/gcc/config/aarch64/aarch64-builtins.c
>> @@ -253,6 +253,11 @@ aarch64_types_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
>>        qualifier_none, qualifier_struct_load_store_lane_index };
>>  #define TYPES_STORESTRUCT_LANE (aarch64_types_storestruct_lane_qualifiers)
>>
>> +static enum aarch64_type_qualifiers
>> +aarch64_types_tbl_qualifiers[SIMD_MAX_BUILTIN_ARGS]
>> +  = { qualifier_none, qualifier_none, qualifier_none };
>> +#define TYPES_TBL (aarch64_types_tbl_qualifiers)
>> +
>
> Do we need these? This looks like TYPES_BINOP (the predicate on the
> instruction pattern will prevent the "qualifier_maybe_immediate" from
> becoming a problem).
>
I'll give it a try; indeed, I feared "qualifier_maybe_immediate" would
cause problems.

>>  #define CF0(N, X) CODE_FOR_aarch64_##N##X
>>  #define CF1(N, X) CODE_FOR_##N##X##1
>>  #define CF2(N, X) CODE_FOR_##N##X##2
>> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
>> index d0f298a..62f1b13 100644
>> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
>> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
>> @@ -405,3 +405,5 @@
>>    VAR1 (BINOPP, crypto_pmull, 0, di)
>>    VAR1 (BINOPP, crypto_pmull, 0, v2di)
>>
>> +  /* Implemented by aarch64_tbl3v8qi.  */
>> +  BUILTIN_V8Q (TBL, tbl3, 0)
>
> This can be:
>
>   VAR1 (BINOP, tbl3, 0, v8qi)
>
> It would be good if we could eliminate the casts in arm_neon.h by also
> defining a  "BINOPU" version of this, but I imagine that gets stuck on the
> types accepted by __builtin_aarch64_set_qregoiv16qi - so don't worry about
> making that change.
OK

>
>> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
>> index 9777418..84a61d5 100644
>> --- a/gcc/config/aarch64/aarch64-simd.md
>> +++ b/gcc/config/aarch64/aarch64-simd.md
>> @@ -4716,6 +4714,16 @@
>>    [(set_attr "type" "neon_tbl2_q")]
>>  )
>>
>> +(define_insn "aarch64_tbl3v8qi"
>> +  [(set (match_operand:V8QI 0 "register_operand" "=w")
>> +     (unspec:V8QI [(match_operand:OI 1 "register_operand" "w")
>> +                   (match_operand:V8QI 2 "register_operand" "w")]
>> +                   UNSPEC_TBL))]
>> +  "TARGET_SIMD"
>> +  "tbl\\t%S0.8b, {%S1.16b - %T1.16b}, %S2.8b"
>> +  [(set_attr "type" "neon_tbl3")]
>> +)
>> +
>>  (define_insn_and_split "aarch64_combinev16qi"
>>    [(set (match_operand:OI 0 "register_operand" "=w")
>>       (unspec:OI [(match_operand:V16QI 1 "register_operand" "w")
>> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
>> index 87bbf6e..91704de 100644
>> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
>> index 6dfebe7..e8ee318 100644
>> --- a/gcc/config/aarch64/arm_neon.h
>> +++ b/gcc/config/aarch64/arm_neon.h
>>  /* End of temporary inline asm.  */
>>
>>  /* Start of optimal implementations in approved order.  */
>> @@ -23221,6 +23182,36 @@ vtbx3_p8 (poly8x8_t __r, poly8x8x3_t __tab, uint8x8_t __idx)
>>    return vbsl_p8 (__mask, __tbl, __r);
>>  }
>>
>> +/* vtbx4  */
>> +
>> +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
>> +vtbx4_s8 (int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx)
>> +{
>> +  uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx),
>> +                           vmov_n_u8 (32));
>> +  int8x8_t __tbl = vtbl4_s8 (__tab, __idx);
>> +
>> +  return vbsl_s8 (__mask, __tbl, __r);
>> +}
>> +
>> +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
>> +vtbx4_u8 (uint8x8_t __r, uint8x8x4_t __tab, uint8x8_t __idx)
>> +{
>> +  uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (32));
>> +  uint8x8_t __tbl = vtbl4_u8 (__tab, __idx);
>> +
>> +  return vbsl_u8 (__mask, __tbl, __r);
>> +}
>> +
>> +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
>> +vtbx4_p8 (poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx)
>> +{
>> +  uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (32));
>> +  poly8x8_t __tbl = vtbl4_p8 (__tab, __idx);
>> +
>> +  return vbsl_p8 (__mask, __tbl, __r);
>> +}
>> +
>
> Why do we want this for vtbx4 rather than putting out a VTBX instruction
> directly (as in the inline asm versions you replace)?
>
I just followed the pattern used for vtbx3.

> This sequence does make sense for vtbx3.
In fact, I don't see why vtbx3 and vtbx4 should be different?

>>  /* vtrn */
>>
>>  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
>> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
>> index b8a45d1..dfbd9cd 100644
>> --- a/gcc/config/aarch64/iterators.md
>> +++ b/gcc/config/aarch64/iterators.md
>> @@ -100,6 +100,8 @@
>>  ;; All modes.
>>  (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
>>
>> +(define_mode_iterator V8Q [V8QI])
>> +
>
> This can be dropped if you use VAR1 in aarch64-builtins.c.
>
> Thanks for working on this, with your patch applied, the only
> remaining intrinsics I see failing for aarch64_be are:
>
>   vqtbl2_*8
>   vqtbl2q_*8
>   vqtbl3_*8
>   vqtbl3q_*8
>   vqtbl4_*8
>   vqtbl4q_*8
>
>   vqtbx2_*8
>   vqtbx2q_*8
>   vqtbx3_*8
>   vqtbx3q_*8
>   vqtbx4_*8
>   vqtbx4q_*8
>
Quite possibly. Which tests are you looking at? Since these are
aarch64-specific, they are not part of the
tests I added (advsimd-intrinsics). Do you mean
gcc.target/aarch64/table-intrinsics.c?


> Thanks,
> James
>


* Re: [AArch64_be] Fix vtbl[34] and vtbx4
  2015-10-07 20:07   ` Christophe Lyon
@ 2015-10-08  9:12     ` James Greenhalgh
  2015-10-09 16:16       ` Christophe Lyon
  0 siblings, 1 reply; 10+ messages in thread
From: James Greenhalgh @ 2015-10-08  9:12 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: gcc-patches

On Wed, Oct 07, 2015 at 09:07:30PM +0100, Christophe Lyon wrote:
> On 7 October 2015 at 17:09, James Greenhalgh <james.greenhalgh@arm.com> wrote:
> > On Tue, Sep 15, 2015 at 05:25:25PM +0100, Christophe Lyon wrote:
> >
> > Why do we want this for vtbx4 rather than putting out a VTBX instruction
> > directly (as in the inline asm versions you replace)?
> >
> I just followed the pattern used for vtbx3.
> 
> > This sequence does make sense for vtbx3.
> In fact, I don't see why vtbx3 and vtbx4 should be different?

The difference between TBL and TBX is in their handling of a request to
select an out-of-range value. For TBL this returns zero, for TBX this
returns the value which was already in the destination register.

Because the byte-vectors used by the TBX instruction in aarch64 are 128-bit
(so two of them together allow selecting elements in the range 0-31), and
vtbx3 needs to emulate the AArch32 behaviour of picking elements from 3x64-bit
vectors (allowing elements in the range 0-23), we need to manually check for
values which would have been out-of-range on AArch32, but are not out
of range for AArch64 and handle them appropriately. For vtbx4 on the other
hand, 2x128-bit registers give the range 0..31 and 4x64-bit registers give
the range 0..31, so we don't need the special masked handling.
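
Put as a scalar sketch (illustrating the semantics only, not the emitted
code; the helper names are made up):

  #include <stdint.h>

  /* AArch64 TBX over two 128-bit table registers (32 table bytes).  */
  static uint8_t tbx_2q (uint8_t dst, const uint8_t tab[32], uint8_t idx)
  {
    return (idx < 32) ? tab[idx] : dst;
  }

  /* vtbx3: only 24 table bytes are valid, but after widening the table to
     2x128-bit an index in 24..31 would pick up padding instead of keeping
     dst, so the intrinsic must compare the index against 24 and select
     between the table-lookup result and dst.  */
  static uint8_t ref_vtbx3 (uint8_t dst, const uint8_t tab[32], uint8_t idx)
  {
    return (idx < 24) ? tbx_2q (dst, tab, idx) : dst;
  }

  /* vtbx4: all 32 table bytes are valid, the ranges coincide, and a
     single TBX gives the right answer.  */
  static uint8_t ref_vtbx4 (uint8_t dst, const uint8_t tab[32], uint8_t idx)
  {
    return tbx_2q (dst, tab, idx);
  }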

You can find the suggested instruction sequences for the Neon intrinsics
in this document:

  http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf

> >>  /* vtrn */
> >>
> >>  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
> >> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> >> index b8a45d1..dfbd9cd 100644
> >> --- a/gcc/config/aarch64/iterators.md
> >> +++ b/gcc/config/aarch64/iterators.md
> >> @@ -100,6 +100,8 @@
> >>  ;; All modes.
> >>  (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
> >>
> >> +(define_mode_iterator V8Q [V8QI])
> >> +
> >
> > This can be dropped if you use VAR1 in aarch64-builtins.c.
> >
> > Thanks for working on this, with your patch applied, the only
> > remaining intrinsics I see failing for aarch64_be are:
> >
> >   vqtbl2_*8
> >   vqtbl2q_*8
> >   vqtbl3_*8
> >   vqtbl3q_*8
> >   vqtbl4_*8
> >   vqtbl4q_*8
> >
> >   vqtbx2_*8
> >   vqtbx2q_*8
> >   vqtbx3_*8
> >   vqtbx3q_*8
> >   vqtbx4_*8
> >   vqtbx4q_*8
> >
> Quite possibly. Which tests are you looking at? Since these are
> aarch64-specific, they are not part of the
> tests I added (advsimd-intrinsics). Do you mean
> gcc.target/aarch64/table-intrinsics.c?

Sorry, yes I should have given a reference. I'm running with a variant of
a testcase from the LLVM test-suite repository:

  SingleSource/UnitTests/Vector/AArch64/aarch64_neon_intrinsics.c

This has an execute test for most of the intrinsics specified for AArch64.
It needs some modification to cover the intrinsics we don't implement yet.

Thanks,
James


* Re: [AArch64_be] Fix vtbl[34] and vtbx4
  2015-10-08  9:12     ` James Greenhalgh
@ 2015-10-09 16:16       ` Christophe Lyon
  2015-10-12 13:30         ` James Greenhalgh
  0 siblings, 1 reply; 10+ messages in thread
From: Christophe Lyon @ 2015-10-09 16:16 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 3335 bytes --]

On 8 October 2015 at 11:12, James Greenhalgh <james.greenhalgh@arm.com> wrote:
> On Wed, Oct 07, 2015 at 09:07:30PM +0100, Christophe Lyon wrote:
>> On 7 October 2015 at 17:09, James Greenhalgh <james.greenhalgh@arm.com> wrote:
>> > On Tue, Sep 15, 2015 at 05:25:25PM +0100, Christophe Lyon wrote:
>> >
>> > Why do we want this for vtbx4 rather than putting out a VTBX instruction
>> > directly (as in the inline asm versions you replace)?
>> >
>> I just followed the pattern used for vtbx3.
>>
>> > This sequence does make sense for vtbx3.
>> In fact, I don't see why vtbx3 and vtbx4 should be different?
>
> The difference between TBL and TBX is in their handling of a request to
> select an out-of-range value. For TBL this returns zero, for TBX this
> returns the value which was already in the destination register.
>
> Because the byte-vectors used by the TBX instruction in aarch64 are 128-bit
> (so two of them togather allow selecting elements in the range 0-31), and
> vtbx3 needs to emulate the AArch32 behaviour of picking elements from 3x64-bit
> vectors (allowing elements in the range 0-23), we need to manually check for
> values which would have been out-of-range on AArch32, but are not out
> of range for AArch64 and handle them appropriately. For vtbx4 on the other
> hand, 2x128-bit registers give the range 0..31 and 4x64-bit registers give
> the range 0..31, so we don't need the special masked handling.
>
> You can find the suggested instruction sequences for the Neon intrinsics
> in this document:
>
>   http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf
>

Hi James,

Please find attached an updated version which hopefully addresses your comments.
Tested on aarch64-none-elf and aarch64_be-none-elf using the Foundation Model.

OK?

Christophe.

>> >>  /* vtrn */
>> >>
>> >>  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
>> >> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
>> >> index b8a45d1..dfbd9cd 100644
>> >> --- a/gcc/config/aarch64/iterators.md
>> >> +++ b/gcc/config/aarch64/iterators.md
>> >> @@ -100,6 +100,8 @@
>> >>  ;; All modes.
>> >>  (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
>> >>
>> >> +(define_mode_iterator V8Q [V8QI])
>> >> +
>> >
>> > This can be dropped if you use VAR1 in aarch64-builtins.c.
>> >
>> > Thanks for working on this, with your patch applied, the only
>> > remaining intrinsics I see failing for aarch64_be are:
>> >
>> >   vqtbl2_*8
>> >   vqtbl2q_*8
>> >   vqtbl3_*8
>> >   vqtbl3q_*8
>> >   vqtbl4_*8
>> >   vqtbl4q_*8
>> >
>> >   vqtbx2_*8
>> >   vqtbx2q_*8
>> >   vqtbx3_*8
>> >   vqtbx3q_*8
>> >   vqtbx4_*8
>> >   vqtbx4q_*8
>> >
>> Quite possibly. Which tests are you looking at? Since these are
>> aarch64-specific, they are not part of the
>> tests I added (advsimd-intrinsics). Do you mean
>> gcc.target/aarch64/table-intrinsics.c?
>
> Sorry, yes I should have given a reference. I'm running with a variant of
> a testcase from the LLVM test-suite repository:
>
>   SingleSource/UnitTests/Vector/AArch64/aarch64_neon_intrinsics.c
>
> This has an execute test for most of the intrinsics specified for AArch64.
> It needs some modification to cover the intrinsics we don't implement yet.
>
> Thanks,
> James
>

[-- Attachment #2: vtbX.txt --]
[-- Type: text/plain, Size: 440 bytes --]

2015-10-09  Christophe Lyon  <christophe.lyon@linaro.org>

	* config/aarch64/aarch64-simd-builtins.def: Update builtins
	tables: add tbl3 and tbx4.
	* config/aarch64/aarch64-simd.md (aarch64_tbl3v8qi): New.
	(aarch64_tbx4v8qi): New.
	* config/aarch64/arm_neon.h (vtbl3_s8, vtbl3_u8, vtbl3_p8)
	(vtbl4_s8, vtbl4_u8, vtbl4_p8, vtbx4_s8, vtbx4_u8, vtbx4_p8):
	Rewrite using builtin functions.
	* config/aarch64/iterators.md (UNSPEC_TBX): New.

[-- Attachment #3: vtbX.patch --]
[-- Type: text/x-patch, Size: 10559 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index d0f298a..c16e82c9 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -405,3 +405,8 @@
   VAR1 (BINOPP, crypto_pmull, 0, di)
   VAR1 (BINOPP, crypto_pmull, 0, v2di)
 
+  /* Implemented by aarch64_tbl3v8qi.  */
+  VAR1 (BINOP, tbl3, 0, v8qi)
+
+  /* Implemented by aarch64_tbx4v8qi.  */
+  VAR1 (TERNOP, tbx4, 0, v8qi)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 9777418..6027582 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4716,6 +4714,27 @@
   [(set_attr "type" "neon_tbl2_q")]
 )
 
+(define_insn "aarch64_tbl3v8qi"
+  [(set (match_operand:V8QI 0 "register_operand" "=w")
+	(unspec:V8QI [(match_operand:OI 1 "register_operand" "w")
+		      (match_operand:V8QI 2 "register_operand" "w")]
+		      UNSPEC_TBL))]
+  "TARGET_SIMD"
+  "tbl\\t%S0.8b, {%S1.16b - %T1.16b}, %S2.8b"
+  [(set_attr "type" "neon_tbl3")]
+)
+
+(define_insn "aarch64_tbx4v8qi"
+  [(set (match_operand:V8QI 0 "register_operand" "=w")
+	(unspec:V8QI [(match_operand:V8QI 1 "register_operand" "0")
+		      (match_operand:OI 2 "register_operand" "w")
+		      (match_operand:V8QI 3 "register_operand" "w")]
+		      UNSPEC_TBX))]
+  "TARGET_SIMD"
+  "tbx\\t%S0.8b, {%S2.16b - %T2.16b}, %S3.8b"
+  [(set_attr "type" "neon_tbl4")]
+)
+
 (define_insn_and_split "aarch64_combinev16qi"
   [(set (match_operand:OI 0 "register_operand" "=w")
 	(unspec:OI [(match_operand:V16QI 1 "register_operand" "w")
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 6dfebe7..e99819e 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -10902,13 +10902,14 @@ vtbl3_s8 (int8x8x3_t tab, int8x8_t idx)
 {
   int8x8_t result;
   int8x16x2_t temp;
+  __builtin_aarch64_simd_oi __o;
   temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
   temp.val[1] = vcombine_s8 (tab.val[2], vcreate_s8 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-	   "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[1], 1);
+  result = __builtin_aarch64_tbl3v8qi (__o, idx);
   return result;
 }
 
@@ -10917,13 +10918,14 @@ vtbl3_u8 (uint8x8x3_t tab, uint8x8_t idx)
 {
   uint8x8_t result;
   uint8x16x2_t temp;
+  __builtin_aarch64_simd_oi __o;
   temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
   temp.val[1] = vcombine_u8 (tab.val[2], vcreate_u8 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-	   "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[1], 1);
+  result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
   return result;
 }
 
@@ -10932,13 +10934,14 @@ vtbl3_p8 (poly8x8x3_t tab, uint8x8_t idx)
 {
   poly8x8_t result;
   poly8x16x2_t temp;
+  __builtin_aarch64_simd_oi __o;
   temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
   temp.val[1] = vcombine_p8 (tab.val[2], vcreate_p8 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-	   "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[1], 1);
+  result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
   return result;
 }
 
@@ -10947,13 +10950,14 @@ vtbl4_s8 (int8x8x4_t tab, int8x8_t idx)
 {
   int8x8_t result;
   int8x16x2_t temp;
+  __builtin_aarch64_simd_oi __o;
   temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
   temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]);
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-	   "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[1], 1);
+  result = __builtin_aarch64_tbl3v8qi (__o, idx);
   return result;
 }
 
@@ -10962,13 +10966,14 @@ vtbl4_u8 (uint8x8x4_t tab, uint8x8_t idx)
 {
   uint8x8_t result;
   uint8x16x2_t temp;
+  __builtin_aarch64_simd_oi __o;
   temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
   temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]);
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-	   "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[1], 1);
+  result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
   return result;
 }
 
@@ -10977,13 +10982,14 @@ vtbl4_p8 (poly8x8x4_t tab, uint8x8_t idx)
 {
   poly8x8_t result;
   poly8x16x2_t temp;
+  __builtin_aarch64_simd_oi __o;
   temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
   temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]);
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-	   "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[1], 1);
+  result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
   return result;
 }
 
@@ -11023,51 +11029,6 @@ vtbx2_p8 (poly8x8_t r, poly8x8x2_t tab, uint8x8_t idx)
   return result;
 }
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbx4_s8 (int8x8_t r, int8x8x4_t tab, int8x8_t idx)
-{
-  int8x8_t result = r;
-  int8x16x2_t temp;
-  temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
-  temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]);
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-	   "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "+w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
-  return result;
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbx4_u8 (uint8x8_t r, uint8x8x4_t tab, uint8x8_t idx)
-{
-  uint8x8_t result = r;
-  uint8x16x2_t temp;
-  temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
-  temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]);
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-	   "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "+w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
-  return result;
-}
-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbx4_p8 (poly8x8_t r, poly8x8x4_t tab, uint8x8_t idx)
-{
-  poly8x8_t result = r;
-  poly8x16x2_t temp;
-  temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
-  temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]);
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-	   "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "+w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
-  return result;
-}
-
 /* End of temporary inline asm.  */
 
 /* Start of optimal implementations in approved order.  */
@@ -23221,6 +23182,58 @@ vtbx3_p8 (poly8x8_t __r, poly8x8x3_t __tab, uint8x8_t __idx)
   return vbsl_p8 (__mask, __tbl, __r);
 }
 
+/* vtbx4  */
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vtbx4_s8 (int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx)
+{
+  int8x8_t result;
+  int8x16x2_t temp;
+  __builtin_aarch64_simd_oi __o;
+  temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]);
+  temp.val[1] = vcombine_s8 (__tab.val[2], __tab.val[3]);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[1], 1);
+  result = __builtin_aarch64_tbx4v8qi (__r, __o, __idx);
+  return result;
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtbx4_u8 (uint8x8_t __r, uint8x8x4_t __tab, uint8x8_t __idx)
+{
+  uint8x8_t result;
+  uint8x16x2_t temp;
+  __builtin_aarch64_simd_oi __o;
+  temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]);
+  temp.val[1] = vcombine_u8 (__tab.val[2], __tab.val[3]);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[1], 1);
+  result = (uint8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o,
+						  (int8x8_t)__idx);
+  return result;
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtbx4_p8 (poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx)
+{
+  poly8x8_t result;
+  poly8x16x2_t temp;
+  __builtin_aarch64_simd_oi __o;
+  temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]);
+  temp.val[1] = vcombine_p8 (__tab.val[2], __tab.val[3]);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[1], 1);
+  result = (poly8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o,
+						  (int8x8_t)__idx);
+  return result;
+}
+
 /* vtrn */
 
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index b8a45d1..d856117 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -253,6 +253,7 @@
     UNSPEC_USHLL	; Used in aarch64-simd.md.
     UNSPEC_ADDP		; Used in aarch64-simd.md.
     UNSPEC_TBL		; Used in vector permute patterns.
+    UNSPEC_TBX		; Used in vector permute patterns.
     UNSPEC_CONCAT	; Used in vector permute patterns.
     UNSPEC_ZIP1		; Used in vector permute patterns.
     UNSPEC_ZIP2		; Used in vector permute patterns.


* Re: [AArch64_be] Fix vtbl[34] and vtbx4
  2015-10-09 16:16       ` Christophe Lyon
@ 2015-10-12 13:30         ` James Greenhalgh
  2015-10-13 13:05           ` Christophe Lyon
  0 siblings, 1 reply; 10+ messages in thread
From: James Greenhalgh @ 2015-10-12 13:30 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: gcc-patches

On Fri, Oct 09, 2015 at 05:16:05PM +0100, Christophe Lyon wrote:
> On 8 October 2015 at 11:12, James Greenhalgh <james.greenhalgh@arm.com> wrote:
> > On Wed, Oct 07, 2015 at 09:07:30PM +0100, Christophe Lyon wrote:
> >> On 7 October 2015 at 17:09, James Greenhalgh <james.greenhalgh@arm.com> wrote:
> >> > On Tue, Sep 15, 2015 at 05:25:25PM +0100, Christophe Lyon wrote:
> >> >
> >> > Why do we want this for vtbx4 rather than putting out a VTBX instruction
> >> > directly (as in the inline asm versions you replace)?
> >> >
> >> I just followed the pattern used for vtbx3.
> >>
> >> > This sequence does make sense for vtbx3.
> >> In fact, I don't see why vtbx3 and vtbx4 should be different?
> >
> > The difference between TBL and TBX is in their handling of a request to
> > select an out-of-range value. For TBL this returns zero, for TBX this
> > returns the value which was already in the destination register.
> >
> > Because the byte-vectors used by the TBX instruction in aarch64 are 128-bit
> > (so two of them together allow selecting elements in the range 0-31), and
> > vtbx3 needs to emulate the AArch32 behaviour of picking elements from 3x64-bit
> > vectors (allowing elements in the range 0-23), we need to manually check for
> > values which would have been out-of-range on AArch32, but are not out
> > of range for AArch64 and handle them appropriately. For vtbx4 on the other
> > hand, 2x128-bit registers give the range 0..31 and 4x64-bit registers give
> > the range 0..31, so we don't need the special masked handling.
> >
> > You can find the suggested instruction sequences for the Neon intrinsics
> > in this document:
> >
> >   http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf
> >
> 
> Hi James,
> 
> Please find attached an updated version which hopefully addresses your comments.
> Tested on aarch64-none-elf and aarch64_be-none-elf using the Foundation Model.
> 
> OK?

Looks good to me,

Thanks,
James


* Re: [AArch64_be] Fix vtbl[34] and vtbx4
  2015-10-12 13:30         ` James Greenhalgh
@ 2015-10-13 13:05           ` Christophe Lyon
  2015-10-13 13:08             ` James Greenhalgh
  0 siblings, 1 reply; 10+ messages in thread
From: Christophe Lyon @ 2015-10-13 13:05 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2411 bytes --]

On 12 October 2015 at 15:30, James Greenhalgh <james.greenhalgh@arm.com> wrote:
> On Fri, Oct 09, 2015 at 05:16:05PM +0100, Christophe Lyon wrote:
>> On 8 October 2015 at 11:12, James Greenhalgh <james.greenhalgh@arm.com> wrote:
>> > On Wed, Oct 07, 2015 at 09:07:30PM +0100, Christophe Lyon wrote:
>> >> On 7 October 2015 at 17:09, James Greenhalgh <james.greenhalgh@arm.com> wrote:
>> >> > On Tue, Sep 15, 2015 at 05:25:25PM +0100, Christophe Lyon wrote:
>> >> >
>> >> > Why do we want this for vtbx4 rather than putting out a VTBX instruction
>> >> > directly (as in the inline asm versions you replace)?
>> >> >
>> >> I just followed the pattern used for vtbx3.
>> >>
>> >> > This sequence does make sense for vtbx3.
>> >> In fact, I don't see why vtbx3 and vtbx4 should be different?
>> >
>> > The difference between TBL and TBX is in their handling of a request to
>> > select an out-of-range value. For TBL this returns zero, for TBX this
>> > returns the value which was already in the destination register.
>> >
>> > Because the byte-vectors used by the TBX instruction in aarch64 are 128-bit
>> > (so two of them together allow selecting elements in the range 0-31), and
>> > vtbx3 needs to emulate the AArch32 behaviour of picking elements from 3x64-bit
>> > vectors (allowing elements in the range 0-23), we need to manually check for
>> > values which would have been out-of-range on AArch32, but are not out
>> > of range for AArch64 and handle them appropriately. For vtbx4 on the other
>> > hand, 2x128-bit registers give the range 0..31 and 4x64-bit registers give
>> > the range 0..31, so we don't need the special masked handling.
>> >
>> > You can find the suggested instruction sequences for the Neon intrinsics
>> > in this document:
>> >
>> >   http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf
>> >
>>
>> Hi James,
>>
>> Please find attached an updated version which hopefully addresses your comments.
>> Tested on aarch64-none-elf and aarch64_be-none-elf using the Foundation Model.
>>
>> OK?
>
> Looks good to me,
>
> Thanks,
> James
>

I committed this as r228716, and noticed later that
gcc.target/aarch64/table-intrinsics.c failed because of this patch.

This is because that testcase scans the assembly for 'tbl v' or 'tbx
v', but since I replaced some asm statements,
the space is now a tab.

I plan to commit this (probably obvious?):

[-- Attachment #2: table-intr.txt --]
[-- Type: text/plain, Size: 168 bytes --]

2015-10-13  Christophe Lyon  <christophe.lyon@linaro.org>

	* gcc/testsuite/gcc.target/aarch64/table-intrinsics.c: Fix regexp
	after r228716 (Fix vtbl[34] and vtbx4).


[-- Attachment #3: table-intr.patch --]
[-- Type: text/x-patch, Size: 573 bytes --]

Index: gcc/testsuite/gcc.target/aarch64/table-intrinsics.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/table-intrinsics.c	(revision 228759)
+++ gcc/testsuite/gcc.target/aarch64/table-intrinsics.c	(working copy)
@@ -435,5 +435,5 @@
   return vqtbx4q_p8 (r, tab, idx);
 }
 
-/* { dg-final { scan-assembler-times "tbl v" 42} }  */
-/* { dg-final { scan-assembler-times "tbx v" 30} }  */
+/* { dg-final { scan-assembler-times "tbl\[ |\t\]*v" 42} }  */
+/* { dg-final { scan-assembler-times "tbx\[ |\t\]*v" 30} }  */


* Re: [AArch64_be] Fix vtbl[34] and vtbx4
  2015-10-13 13:05           ` Christophe Lyon
@ 2015-10-13 13:08             ` James Greenhalgh
  0 siblings, 0 replies; 10+ messages in thread
From: James Greenhalgh @ 2015-10-13 13:08 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: gcc-patches

On Tue, Oct 13, 2015 at 02:05:01PM +0100, Christophe Lyon wrote:
> I committed this as r228716, and noticed later that
> gcc.target/aarch64/table-intrinsics.c failed because of this patch.
> 
> This is because that testcase scans the assembly for 'tbl v' or 'tbx
> v', but since I replaced some asm statements,
> the space is now a tab.
> 
> I plan to commit this (probably obvious?):

> 2015-10-13  Christophe Lyon  <christophe.lyon@linaro.org>
> 
> 	* gcc/testsuite/gcc.target/aarch64/table-intrinsics.c: Fix regexp
> 	after r228716 (Fix vtbl[34] and vtbx4).

Bad luck. This is fine (and yes, obvious).

Thanks,
James

> Index: gcc/testsuite/gcc.target/aarch64/table-intrinsics.c
> ===================================================================
> --- gcc/testsuite/gcc.target/aarch64/table-intrinsics.c	(revision 228759)
> +++ gcc/testsuite/gcc.target/aarch64/table-intrinsics.c	(working copy)
> @@ -435,5 +435,5 @@
>    return vqtbx4q_p8 (r, tab, idx);
>  }
>  
> -/* { dg-final { scan-assembler-times "tbl v" 42} }  */
> -/* { dg-final { scan-assembler-times "tbx v" 30} }  */
> +/* { dg-final { scan-assembler-times "tbl\[ |\t\]*v" 42} }  */
> +/* { dg-final { scan-assembler-times "tbx\[ |\t\]*v" 30} }  */


