* [AArch64_be] Fix vtbl[34] and vtbx4
From: Christophe Lyon @ 2015-09-15 16:25 UTC
To: gcc-patches
[-- Attachment #1: Type: text/plain, Size: 225 bytes --]
This patch re-implements vtbl[34] and vtbx4 AdvSIMD intrinsics using
existing builtins, and fixes the behaviour on aarch64_be.
Tested on aarch64_be-none-elf and aarch64-none-elf using the Foundation Model.
OK?
Christophe.
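For reference, a minimal scalar model of the AArch32 semantics these
intrinsics must reproduce (illustrative only, not part of the patch):

  #include <stdint.h>

  /* vtbl3 model: three 8-byte table vectors; indices 0..23 are valid,
     anything else selects zero.  */
  void
  vtbl3_model (uint8_t res[8], const uint8_t tab[24], const uint8_t idx[8])
  {
    for (int i = 0; i < 8; i++)
      res[i] = idx[i] < 24 ? tab[idx[i]] : 0;
  }

  /* vtbx4 model: four 8-byte table vectors; indices 0..31 are valid,
     anything else leaves the destination byte unchanged.  */
  void
  vtbx4_model (uint8_t res[8], const uint8_t tab[32], const uint8_t idx[8])
  {
    for (int i = 0; i < 8; i++)
      res[i] = idx[i] < 32 ? tab[idx[i]] : res[i];
  }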
[-- Attachment #2: vtbX.txt --]
[-- Type: text/plain, Size: 542 bytes --]
2015-09-15 Christophe Lyon <christophe.lyon@linaro.org>
* config/aarch64/aarch64-builtins.c
(aarch64_types_tbl_qualifiers): New static data.
(TYPES_TBL): Define.
* config/aarch64/aarch64-simd-builtins.def: Update builtins
tables.
* config/aarch64/aarch64-simd.md (aarch64_tbl3v8qi): New.
* config/aarch64/arm_neon.h (vtbl3_s8, vtbl3_u8, vtbl3_p8)
(vtbl4_s8, vtbl4_u8, vtbl4_p8): Rewrite using builtin functions.
(vtbx4_s8, vtbx4_u8, vtbx4_p8): Emulate behaviour using other
intrinsics.
* config/aarch64/iterators.md (V8Q): New.
[-- Attachment #3: vtbX.patch --]
[-- Type: text/x-patch, Size: 9898 bytes --]
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 0f4f2b9..7ca3917 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -253,6 +253,11 @@ aarch64_types_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
qualifier_none, qualifier_struct_load_store_lane_index };
#define TYPES_STORESTRUCT_LANE (aarch64_types_storestruct_lane_qualifiers)
+static enum aarch64_type_qualifiers
+aarch64_types_tbl_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+ = { qualifier_none, qualifier_none, qualifier_none };
+#define TYPES_TBL (aarch64_types_tbl_qualifiers)
+
#define CF0(N, X) CODE_FOR_aarch64_##N##X
#define CF1(N, X) CODE_FOR_##N##X##1
#define CF2(N, X) CODE_FOR_##N##X##2
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index d0f298a..62f1b13 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -405,3 +405,5 @@
VAR1 (BINOPP, crypto_pmull, 0, di)
VAR1 (BINOPP, crypto_pmull, 0, v2di)
+ /* Implemented by aarch64_tbl3v8qi. */
+ BUILTIN_V8Q (TBL, tbl3, 0)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 9777418..84a61d5 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4716,6 +4714,16 @@
[(set_attr "type" "neon_tbl2_q")]
)
+(define_insn "aarch64_tbl3v8qi"
+ [(set (match_operand:V8QI 0 "register_operand" "=w")
+ (unspec:V8QI [(match_operand:OI 1 "register_operand" "w")
+ (match_operand:V8QI 2 "register_operand" "w")]
+ UNSPEC_TBL))]
+ "TARGET_SIMD"
+ "tbl\\t%S0.8b, {%S1.16b - %T1.16b}, %S2.8b"
+ [(set_attr "type" "neon_tbl3")]
+)
+
(define_insn_and_split "aarch64_combinev16qi"
[(set (match_operand:OI 0 "register_operand" "=w")
(unspec:OI [(match_operand:V16QI 1 "register_operand" "w")
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 87bbf6e..91704de 100644
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 6dfebe7..e8ee318 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -10902,13 +10902,14 @@ vtbl3_s8 (int8x8x3_t tab, int8x8_t idx)
{
int8x8_t result;
int8x16x2_t temp;
+ __builtin_aarch64_simd_oi __o;
temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
temp.val[1] = vcombine_s8 (tab.val[2], vcreate_s8 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
- "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
- : "=w"(result)
- : "Q"(temp), "w"(idx)
- : "v16", "v17", "memory");
+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
+ (int8x16_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
+ (int8x16_t) temp.val[1], 1);
+ result = __builtin_aarch64_tbl3v8qi (__o, idx);
return result;
}
@@ -10917,13 +10918,14 @@ vtbl3_u8 (uint8x8x3_t tab, uint8x8_t idx)
{
uint8x8_t result;
uint8x16x2_t temp;
+ __builtin_aarch64_simd_oi __o;
temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
temp.val[1] = vcombine_u8 (tab.val[2], vcreate_u8 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
- "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
- : "=w"(result)
- : "Q"(temp), "w"(idx)
- : "v16", "v17", "memory");
+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
+ (int8x16_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
+ (int8x16_t) temp.val[1], 1);
+ result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
return result;
}
@@ -10932,13 +10934,14 @@ vtbl3_p8 (poly8x8x3_t tab, uint8x8_t idx)
{
poly8x8_t result;
poly8x16x2_t temp;
+ __builtin_aarch64_simd_oi __o;
temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
temp.val[1] = vcombine_p8 (tab.val[2], vcreate_p8 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
- "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
- : "=w"(result)
- : "Q"(temp), "w"(idx)
- : "v16", "v17", "memory");
+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
+ (int8x16_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
+ (int8x16_t) temp.val[1], 1);
+ result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
return result;
}
@@ -10947,13 +10950,14 @@ vtbl4_s8 (int8x8x4_t tab, int8x8_t idx)
{
int8x8_t result;
int8x16x2_t temp;
+ __builtin_aarch64_simd_oi __o;
temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]);
- __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
- "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
- : "=w"(result)
- : "Q"(temp), "w"(idx)
- : "v16", "v17", "memory");
+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
+ (int8x16_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
+ (int8x16_t) temp.val[1], 1);
+ result = __builtin_aarch64_tbl3v8qi (__o, idx);
return result;
}
@@ -10962,13 +10966,14 @@ vtbl4_u8 (uint8x8x4_t tab, uint8x8_t idx)
{
uint8x8_t result;
uint8x16x2_t temp;
+ __builtin_aarch64_simd_oi __o;
temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]);
- __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
- "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
- : "=w"(result)
- : "Q"(temp), "w"(idx)
- : "v16", "v17", "memory");
+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
+ (int8x16_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
+ (int8x16_t) temp.val[1], 1);
+ result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
return result;
}
@@ -10977,13 +10982,14 @@ vtbl4_p8 (poly8x8x4_t tab, uint8x8_t idx)
{
poly8x8_t result;
poly8x16x2_t temp;
+ __builtin_aarch64_simd_oi __o;
temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]);
- __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
- "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
- : "=w"(result)
- : "Q"(temp), "w"(idx)
- : "v16", "v17", "memory");
+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
+ (int8x16_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
+ (int8x16_t) temp.val[1], 1);
+ result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
return result;
}
@@ -11023,51 +11029,6 @@ vtbx2_p8 (poly8x8_t r, poly8x8x2_t tab, uint8x8_t idx)
return result;
}
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbx4_s8 (int8x8_t r, int8x8x4_t tab, int8x8_t idx)
-{
- int8x8_t result = r;
- int8x16x2_t temp;
- temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
- temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]);
- __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
- "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
- : "+w"(result)
- : "Q"(temp), "w"(idx)
- : "v16", "v17", "memory");
- return result;
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbx4_u8 (uint8x8_t r, uint8x8x4_t tab, uint8x8_t idx)
-{
- uint8x8_t result = r;
- uint8x16x2_t temp;
- temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
- temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]);
- __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
- "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
- : "+w"(result)
- : "Q"(temp), "w"(idx)
- : "v16", "v17", "memory");
- return result;
-}
-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbx4_p8 (poly8x8_t r, poly8x8x4_t tab, uint8x8_t idx)
-{
- poly8x8_t result = r;
- poly8x16x2_t temp;
- temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
- temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]);
- __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
- "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
- : "+w"(result)
- : "Q"(temp), "w"(idx)
- : "v16", "v17", "memory");
- return result;
-}
-
/* End of temporary inline asm. */
/* Start of optimal implementations in approved order. */
@@ -23221,6 +23182,36 @@ vtbx3_p8 (poly8x8_t __r, poly8x8x3_t __tab, uint8x8_t __idx)
return vbsl_p8 (__mask, __tbl, __r);
}
+/* vtbx4 */
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vtbx4_s8 (int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx)
+{
+ uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx),
+ vmov_n_u8 (32));
+ int8x8_t __tbl = vtbl4_s8 (__tab, __idx);
+
+ return vbsl_s8 (__mask, __tbl, __r);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtbx4_u8 (uint8x8_t __r, uint8x8x4_t __tab, uint8x8_t __idx)
+{
+ uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (32));
+ uint8x8_t __tbl = vtbl4_u8 (__tab, __idx);
+
+ return vbsl_u8 (__mask, __tbl, __r);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtbx4_p8 (poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx)
+{
+ uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (32));
+ poly8x8_t __tbl = vtbl4_p8 (__tab, __idx);
+
+ return vbsl_p8 (__mask, __tbl, __r);
+}
+
/* vtrn */
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index b8a45d1..dfbd9cd 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -100,6 +100,8 @@
;; All modes.
(define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
+(define_mode_iterator V8Q [V8QI])
+
;; All vector modes and DI.
(define_mode_iterator VALLDI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF DI])
* Re: [AArch64_be] Fix vtbl[34] and vtbx4
From: Christophe Lyon @ 2015-09-29 21:26 UTC
To: gcc-patches
Ping?
On 15 September 2015 at 18:25, Christophe Lyon
<christophe.lyon@linaro.org> wrote:
> This patch re-implements vtbl[34] and vtbx4 AdvSIMD intrinsics using
> existing builtins, and fixes the behaviour on aarch64_be.
>
> Tested on aarch64_be-none-elf and aarch64-none-elf using the Foundation Model.
>
> OK?
>
> Christophe.
* Re: [AArch64_be] Fix vtbl[34] and vtbx4
From: Christophe Lyon @ 2015-10-07 9:24 UTC
To: gcc-patches
Ping?
https://gcc.gnu.org/ml/gcc-patches/2015-09/msg01096.html
On 29 September 2015 at 22:57, Christophe Lyon
<christophe.lyon@linaro.org> wrote:
> Ping?
>
>
> On 15 September 2015 at 18:25, Christophe Lyon
> <christophe.lyon@linaro.org> wrote:
>> This patch re-implements vtbl[34] and vtbx4 AdvSIMD intrinsics using
>> existing builtins, and fixes the behaviour on aarch64_be.
>>
>> Tested on aarch64_be-none-elf and aarch64-none-elf using the Foundation Model.
>>
>> OK?
>>
>> Christophe.
* Re: [AArch64_be] Fix vtbl[34] and vtbx4
From: James Greenhalgh @ 2015-10-07 15:09 UTC
To: Christophe Lyon; +Cc: gcc-patches
On Tue, Sep 15, 2015 at 05:25:25PM +0100, Christophe Lyon wrote:
> This patch re-implements vtbl[34] and vtbx4 AdvSIMD intrinsics using
> existing builtins, and fixes the behaviour on aarch64_be.
>
> Tested on aarch64_be-none-elf and aarch64-none-elf using the Foundation Model.
>
> OK?
Hi Christophe,
Sorry for the delay getting back to you, comments below.
> 2015-09-15 Christophe Lyon <christophe.lyon@linaro.org>
>
> * config/aarch64/aarch64-builtins.c
> (aarch64_types_tbl_qualifiers): New static data.
> (TYPES_TBL): Define.
> * config/aarch64/aarch64-simd-builtins.def: Update builtins
> tables.
> * config/aarch64/aarch64-simd.md (aarch64_tbl3v8qi): New.
> * config/aarch64/arm_neon.h (vtbl3_s8, vtbl3_u8, vtbl3_p8)
> (vtbl4_s8, vtbl4_u8, vtbl4_p8): Rewrite using builtin functions.
> (vtbx4_s8, vtbx4_u8, vtbx4_p8): Emulate behaviour using other
> intrinsics.
> * config/aarch64/iterators.md (V8Q): New.
> diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
> index 0f4f2b9..7ca3917 100644
> --- a/gcc/config/aarch64/aarch64-builtins.c
> +++ b/gcc/config/aarch64/aarch64-builtins.c
> @@ -253,6 +253,11 @@ aarch64_types_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
> qualifier_none, qualifier_struct_load_store_lane_index };
> #define TYPES_STORESTRUCT_LANE (aarch64_types_storestruct_lane_qualifiers)
>
> +static enum aarch64_type_qualifiers
> +aarch64_types_tbl_qualifiers[SIMD_MAX_BUILTIN_ARGS]
> + = { qualifier_none, qualifier_none, qualifier_none };
> +#define TYPES_TBL (aarch64_types_tbl_qualifiers)
> +
Do we need these? This looks like TYPES_BINOP (the predicate on the
instruction pattern will prevent the "qualifier_maybe_immediate" from
becoming a problem).
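For reference, the existing binop qualifiers in aarch64-builtins.c read
roughly as follows (quoted from memory, the exact text may differ):

  static enum aarch64_type_qualifiers
  aarch64_types_binop_qualifiers[SIMD_MAX_BUILTIN_ARGS]
    = { qualifier_none, qualifier_none, qualifier_maybe_immediate };
  #define TYPES_BINOP (aarch64_types_binop_qualifiers)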
> #define CF0(N, X) CODE_FOR_aarch64_##N##X
> #define CF1(N, X) CODE_FOR_##N##X##1
> #define CF2(N, X) CODE_FOR_##N##X##2
> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
> index d0f298a..62f1b13 100644
> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
> @@ -405,3 +405,5 @@
> VAR1 (BINOPP, crypto_pmull, 0, di)
> VAR1 (BINOPP, crypto_pmull, 0, v2di)
>
> + /* Implemented by aarch64_tbl3v8qi. */
> + BUILTIN_V8Q (TBL, tbl3, 0)
This can be:
VAR1 (BINOP, tbl3, 0, v8qi)
It would be good if we could eliminate the casts in arm_neon.h by also
defining a "BINOPU" version of this, but I imagine that gets stuck on the
types accepted by __builtin_aarch64_set_qregoiv16qi - so don't worry about
making that change.
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 9777418..84a61d5 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -4716,6 +4714,16 @@
> [(set_attr "type" "neon_tbl2_q")]
> )
>
> +(define_insn "aarch64_tbl3v8qi"
> + [(set (match_operand:V8QI 0 "register_operand" "=w")
> + (unspec:V8QI [(match_operand:OI 1 "register_operand" "w")
> + (match_operand:V8QI 2 "register_operand" "w")]
> + UNSPEC_TBL))]
> + "TARGET_SIMD"
> + "tbl\\t%S0.8b, {%S1.16b - %T1.16b}, %S2.8b"
> + [(set_attr "type" "neon_tbl3")]
> +)
> +
> (define_insn_and_split "aarch64_combinev16qi"
> [(set (match_operand:OI 0 "register_operand" "=w")
> (unspec:OI [(match_operand:V16QI 1 "register_operand" "w")
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 87bbf6e..91704de 100644
> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> index 6dfebe7..e8ee318 100644
> --- a/gcc/config/aarch64/arm_neon.h
> +++ b/gcc/config/aarch64/arm_neon.h
> /* End of temporary inline asm. */
>
> /* Start of optimal implementations in approved order. */
> @@ -23221,6 +23182,36 @@ vtbx3_p8 (poly8x8_t __r, poly8x8x3_t __tab, uint8x8_t __idx)
> return vbsl_p8 (__mask, __tbl, __r);
> }
>
> +/* vtbx4 */
> +
> +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
> +vtbx4_s8 (int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx)
> +{
> + uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx),
> + vmov_n_u8 (32));
> + int8x8_t __tbl = vtbl4_s8 (__tab, __idx);
> +
> + return vbsl_s8 (__mask, __tbl, __r);
> +}
> +
> +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
> +vtbx4_u8 (uint8x8_t __r, uint8x8x4_t __tab, uint8x8_t __idx)
> +{
> + uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (32));
> + uint8x8_t __tbl = vtbl4_u8 (__tab, __idx);
> +
> + return vbsl_u8 (__mask, __tbl, __r);
> +}
> +
> +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
> +vtbx4_p8 (poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx)
> +{
> + uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (32));
> + poly8x8_t __tbl = vtbl4_p8 (__tab, __idx);
> +
> + return vbsl_p8 (__mask, __tbl, __r);
> +}
> +
Why do we want this for vtbx4 rather than putting out a VTBX instruction
directly (as in the inline asm versions you replace)?
This sequence does make sense for vtbx3.
> /* vtrn */
>
> __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index b8a45d1..dfbd9cd 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -100,6 +100,8 @@
> ;; All modes.
> (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
>
> +(define_mode_iterator V8Q [V8QI])
> +
This can be dropped if you use VAR1 in aarch64-builtins.c.
Thanks for working on this, with your patch applied, the only
remaining intrinsics I see failing for aarch64_be are:
vqtbl2_*8
vqtbl2q_*8
vqtbl3_*8
vqtbl3q_*8
vqtbl4_*8
vqtbl4q_*8
vqtbx2_*8
vqtbx2q_*8
vqtbx3_*8
vqtbx3q_*8
vqtbx4_*8
vqtbx4q_*8
Thanks,
James
* Re: [AArch64_be] Fix vtbl[34] and vtbx4
From: Christophe Lyon @ 2015-10-07 20:07 UTC
To: James Greenhalgh; +Cc: gcc-patches
On 7 October 2015 at 17:09, James Greenhalgh <james.greenhalgh@arm.com> wrote:
> On Tue, Sep 15, 2015 at 05:25:25PM +0100, Christophe Lyon wrote:
>> This patch re-implements vtbl[34] and vtbx4 AdvSIMD intrinsics using
>> existing builtins, and fixes the behaviour on aarch64_be.
>>
>> Tested on aarch64_be-none-elf and aarch64-none-elf using the Foundation Model.
>>
>> OK?
>
> Hi Christophe,
>
> Sorry for the delay getting back to you, comments below.
>
>> 2015-09-15 Christophe Lyon <christophe.lyon@linaro.org>
>>
>> * config/aarch64/aarch64-builtins.c
>> (aarch64_types_tbl_qualifiers): New static data.
>> (TYPES_TBL): Define.
>> * config/aarch64/aarch64-simd-builtins.def: Update builtins
>> tables.
>> * config/aarch64/aarch64-simd.md (aarch64_tbl3v8qi): New.
>> * config/aarch64/arm_neon.h (vtbl3_s8, vtbl3_u8, vtbl3_p8)
>> (vtbl4_s8, vtbl4_u8, vtbl4_p8): Rewrite using builtin functions.
>> (vtbx4_s8, vtbx4_u8, vtbx4_p8): Emulate behaviour using other
>> intrinsics.
>> * config/aarch64/iterators.md (V8Q): New.
>
>> diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
>> index 0f4f2b9..7ca3917 100644
>> --- a/gcc/config/aarch64/aarch64-builtins.c
>> +++ b/gcc/config/aarch64/aarch64-builtins.c
>> @@ -253,6 +253,11 @@ aarch64_types_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
>> qualifier_none, qualifier_struct_load_store_lane_index };
>> #define TYPES_STORESTRUCT_LANE (aarch64_types_storestruct_lane_qualifiers)
>>
>> +static enum aarch64_type_qualifiers
>> +aarch64_types_tbl_qualifiers[SIMD_MAX_BUILTIN_ARGS]
>> + = { qualifier_none, qualifier_none, qualifier_none };
>> +#define TYPES_TBL (aarch64_types_tbl_qualifiers)
>> +
>
> Do we need these? This looks like TYPES_BINOP (the predicate on the
> instruction pattern will prevent the "qualifier_maybe_immediate" from
> becoming a problem).
>
I'll give it a try; indeed, I feared "qualifier_maybe_immediate" would
cause problems.
>> #define CF0(N, X) CODE_FOR_aarch64_##N##X
>> #define CF1(N, X) CODE_FOR_##N##X##1
>> #define CF2(N, X) CODE_FOR_##N##X##2
>> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
>> index d0f298a..62f1b13 100644
>> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
>> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
>> @@ -405,3 +405,5 @@
>> VAR1 (BINOPP, crypto_pmull, 0, di)
>> VAR1 (BINOPP, crypto_pmull, 0, v2di)
>>
>> + /* Implemented by aarch64_tbl3v8qi. */
>> + BUILTIN_V8Q (TBL, tbl3, 0)
>
> This can be:
>
> VAR1 (BINOP, tbl3, 0, v8qi)
>
> It would be good if we could eliminate the casts in arm_neon.h by also
> defining a "BINOPU" version of this, but I imagine that gets stuck on the
> types accepted by __builtin_aarch64_set_qregoiv16qi - so don't worry about
> making that change.
OK
>
>> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
>> index 9777418..84a61d5 100644
>> --- a/gcc/config/aarch64/aarch64-simd.md
>> +++ b/gcc/config/aarch64/aarch64-simd.md
>> @@ -4716,6 +4714,16 @@
>> [(set_attr "type" "neon_tbl2_q")]
>> )
>>
>> +(define_insn "aarch64_tbl3v8qi"
>> + [(set (match_operand:V8QI 0 "register_operand" "=w")
>> + (unspec:V8QI [(match_operand:OI 1 "register_operand" "w")
>> + (match_operand:V8QI 2 "register_operand" "w")]
>> + UNSPEC_TBL))]
>> + "TARGET_SIMD"
>> + "tbl\\t%S0.8b, {%S1.16b - %T1.16b}, %S2.8b"
>> + [(set_attr "type" "neon_tbl3")]
>> +)
>> +
>> (define_insn_and_split "aarch64_combinev16qi"
>> [(set (match_operand:OI 0 "register_operand" "=w")
>> (unspec:OI [(match_operand:V16QI 1 "register_operand" "w")
>> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
>> index 87bbf6e..91704de 100644
>> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
>> index 6dfebe7..e8ee318 100644
>> --- a/gcc/config/aarch64/arm_neon.h
>> +++ b/gcc/config/aarch64/arm_neon.h
>> /* End of temporary inline asm. */
>>
>> /* Start of optimal implementations in approved order. */
>> @@ -23221,6 +23182,36 @@ vtbx3_p8 (poly8x8_t __r, poly8x8x3_t __tab, uint8x8_t __idx)
>> return vbsl_p8 (__mask, __tbl, __r);
>> }
>>
>> +/* vtbx4 */
>> +
>> +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
>> +vtbx4_s8 (int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx)
>> +{
>> + uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx),
>> + vmov_n_u8 (32));
>> + int8x8_t __tbl = vtbl4_s8 (__tab, __idx);
>> +
>> + return vbsl_s8 (__mask, __tbl, __r);
>> +}
>> +
>> +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
>> +vtbx4_u8 (uint8x8_t __r, uint8x8x4_t __tab, uint8x8_t __idx)
>> +{
>> + uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (32));
>> + uint8x8_t __tbl = vtbl4_u8 (__tab, __idx);
>> +
>> + return vbsl_u8 (__mask, __tbl, __r);
>> +}
>> +
>> +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
>> +vtbx4_p8 (poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx)
>> +{
>> + uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (32));
>> + poly8x8_t __tbl = vtbl4_p8 (__tab, __idx);
>> +
>> + return vbsl_p8 (__mask, __tbl, __r);
>> +}
>> +
>
> Why do we want this for vtbx4 rather than putting out a VTBX instruction
> directly (as in the inline asm versions you replace)?
>
I just followed the pattern used for vtbx3.
> This sequence does make sense for vtbx3.
In fact, I don't see why vtbx3 and vtbx4 should be different?
>> /* vtrn */
>>
>> __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
>> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
>> index b8a45d1..dfbd9cd 100644
>> --- a/gcc/config/aarch64/iterators.md
>> +++ b/gcc/config/aarch64/iterators.md
>> @@ -100,6 +100,8 @@
>> ;; All modes.
>> (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
>>
>> +(define_mode_iterator V8Q [V8QI])
>> +
>
> This can be dropped if you use VAR1 in aarch64-builtins.c.
>
> Thanks for working on this, with your patch applied, the only
> remaining intrinsics I see failing for aarch64_be are:
>
> vqtbl2_*8
> vqtbl2q_*8
> vqtbl3_*8
> vqtbl3q_*8
> vqtbl4_*8
> vqtbl4q_*8
>
> vqtbx2_*8
> vqtbx2q_*8
> vqtbx3_*8
> vqtbx3q_*8
> vqtbx4_*8
> vqtbx4q_*8
>
Quite possibly. Which tests are you looking at? Since these are
aarch64-specific, they are not part of the
tests I added (advsimd-intrinsics). Do you mean
gcc.target/aarch64/table-intrinsics.c?
> Thanks,
> James
>
* Re: [AArch64_be] Fix vtbl[34] and vtbx4
From: James Greenhalgh @ 2015-10-08 9:12 UTC
To: Christophe Lyon; +Cc: gcc-patches
On Wed, Oct 07, 2015 at 09:07:30PM +0100, Christophe Lyon wrote:
> On 7 October 2015 at 17:09, James Greenhalgh <james.greenhalgh@arm.com> wrote:
> > On Tue, Sep 15, 2015 at 05:25:25PM +0100, Christophe Lyon wrote:
> >
> > Why do we want this for vtbx4 rather than putting out a VTBX instruction
> > directly (as in the inline asm versions you replace)?
> >
> I just followed the pattern used for vtbx3.
>
> > This sequence does make sense for vtbx3.
> In fact, I don't see why vtbx3 and vtbx4 should be different?
The difference between TBL and TBX is in their handling of a request to
select an out-of-range value. For TBL this returns zero, for TBX this
returns the value which was already in the destination register.
Because the byte-vectors used by the TBX instruction in aarch64 are 128-bit
(so two of them together allow selecting elements in the range 0-31), and
vtbx3 needs to emulate the AArch32 behaviour of picking elements from 3x64-bit
vectors (allowing elements in the range 0-23), we need to manually check for
values which would have been out-of-range on AArch32, but are not out
of range for AArch64 and handle them appropriately. For vtbx4 on the other
hand, 2x128-bit registers give the range 0..31 and 4x64-bit registers give
the range 0..31, so we don't need the special masked handling.
You can find the suggested instruction sequences for the Neon intrinsics
in this document:
http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf
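As a minimal scalar sketch of the distinction above (illustrative only,
with made-up names):

  #include <stdint.h>

  /* Emulating AArch32 vtbx3 on top of a TBL over a zero-padded
     2x16-byte table: TBL treats indices 24..31 as in-range and returns
     the zero padding, where AArch32 vtbx3 must keep the destination
     byte.  The compare-and-select (vclt + vbsl in the intrinsics)
     restores that behaviour.  */
  uint8_t
  vtbx3_byte (uint8_t r, const uint8_t padded_tab[32], uint8_t idx)
  {
    uint8_t tbl = idx < 32 ? padded_tab[idx] : 0;  /* what TBL gives */
    return idx < 24 ? tbl : r;                     /* mask against 24 */
  }

  /* For vtbx4 the 32-byte table already matches the 0..31 range of a
     two-register TBX, so TBX can be emitted directly; no mask needed.  */
  uint8_t
  vtbx4_byte (uint8_t r, const uint8_t tab[32], uint8_t idx)
  {
    return idx < 32 ? tab[idx] : r;                /* what TBX gives */
  }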
> >> /* vtrn */
> >>
> >> __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
> >> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> >> index b8a45d1..dfbd9cd 100644
> >> --- a/gcc/config/aarch64/iterators.md
> >> +++ b/gcc/config/aarch64/iterators.md
> >> @@ -100,6 +100,8 @@
> >> ;; All modes.
> >> (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
> >>
> >> +(define_mode_iterator V8Q [V8QI])
> >> +
> >
> > This can be dropped if you use VAR1 in aarch64-builtins.c.
> >
> > Thanks for working on this, with your patch applied, the only
> > remaining intrinsics I see failing for aarch64_be are:
> >
> > vqtbl2_*8
> > vqtbl2q_*8
> > vqtbl3_*8
> > vqtbl3q_*8
> > vqtbl4_*8
> > vqtbl4q_*8
> >
> > vqtbx2_*8
> > vqtbx2q_*8
> > vqtbx3_*8
> > vqtbx3q_*8
> > vqtbx4_*8
> > vqtbx4q_*8
> >
> Quite possibly. Which tests are you looking at? Since these are
> aarch64-specific, they are not part of the
> tests I added (advsimd-intrinsics). Do you mean
> gcc.target/aarch64/table-intrinsics.c?
Sorry, yes I should have given a reference. I'm running with a variant of
a testcase from the LLVM test-suite repository:
SingleSource/UnitTests/Vector/AArch64/aarch64_neon_intrinsics.c
This has an execute test for most of the intrinsics specified for AArch64.
It needs some modification to cover the intrinsics we don't implement yet.
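A self-contained execute-style check in the same spirit might look like
this (hypothetical, not taken from that suite):

  #include <arm_neon.h>
  #include <stdlib.h>

  int
  main (void)
  {
    uint8_t bytes[24], out[8];
    /* Table bytes 100..123 spread across three 64-bit vectors.  */
    for (int i = 0; i < 24; i++)
      bytes[i] = 100 + i;
    uint8x8x3_t tab;
    tab.val[0] = vld1_u8 (bytes);
    tab.val[1] = vld1_u8 (bytes + 8);
    tab.val[2] = vld1_u8 (bytes + 16);
    /* A mix of in-range and out-of-range indices.  */
    const uint8_t idx_b[8] = { 0, 7, 8, 23, 24, 31, 255, 2 };
    const uint8_t exp[8]   = { 100, 107, 108, 123, 0, 0, 0, 102 };
    vst1_u8 (out, vtbl3_u8 (tab, vld1_u8 (idx_b)));
    for (int i = 0; i < 8; i++)
      if (out[i] != exp[i])
        abort ();
    return 0;
  }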
Thanks,
James
* Re: [AArch64_be] Fix vtbl[34] and vtbx4
From: Christophe Lyon @ 2015-10-09 16:16 UTC
To: James Greenhalgh; +Cc: gcc-patches
[-- Attachment #1: Type: text/plain, Size: 3335 bytes --]
On 8 October 2015 at 11:12, James Greenhalgh <james.greenhalgh@arm.com> wrote:
> On Wed, Oct 07, 2015 at 09:07:30PM +0100, Christophe Lyon wrote:
>> On 7 October 2015 at 17:09, James Greenhalgh <james.greenhalgh@arm.com> wrote:
>> > On Tue, Sep 15, 2015 at 05:25:25PM +0100, Christophe Lyon wrote:
>> >
>> > Why do we want this for vtbx4 rather than putting out a VTBX instruction
>> > directly (as in the inline asm versions you replace)?
>> >
>> I just followed the pattern used for vtbx3.
>>
>> > This sequence does make sense for vtbx3.
>> In fact, I don't see why vtbx3 and vtbx4 should be different?
>
> The difference between TBL and TBX is in their handling of a request to
> select an out-of-range value. For TBL this returns zero, for TBX this
> returns the value which was already in the destination register.
>
> Because the byte-vectors used by the TBX instruction in aarch64 are 128-bit
> (so two of them togather allow selecting elements in the range 0-31), and
> vtbx3 needs to emulate the AArch32 behaviour of picking elements from 3x64-bit
> vectors (allowing elements in the range 0-23), we need to manually check for
> values which would have been out-of-range on AArch32, but are not out
> of range for AArch64 and handle them appropriately. For vtbx4 on the other
> hand, 2x128-bit registers give the range 0..31 and 4x64-bit registers give
> the range 0..31, so we don't need the special masked handling.
>
> You can find the suggested instruction sequences for the Neon intrinsics
> in this document:
>
> http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf
>
Hi James,
Please find attached an updated version which hopefully addresses your comments.
Tested on aarch64-none-elf and aarch64_be-none-elf using the Foundation Model.
OK?
Christophe.
>> >> /* vtrn */
>> >>
>> >> __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
>> >> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
>> >> index b8a45d1..dfbd9cd 100644
>> >> --- a/gcc/config/aarch64/iterators.md
>> >> +++ b/gcc/config/aarch64/iterators.md
>> >> @@ -100,6 +100,8 @@
>> >> ;; All modes.
>> >> (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
>> >>
>> >> +(define_mode_iterator V8Q [V8QI])
>> >> +
>> >
>> > This can be dropped if you use VAR1 in aarch64-builtins.c.
>> >
>> > Thanks for working on this, with your patch applied, the only
>> > remaining intrinsics I see failing for aarch64_be are:
>> >
>> > vqtbl2_*8
>> > vqtbl2q_*8
>> > vqtbl3_*8
>> > vqtbl3q_*8
>> > vqtbl4_*8
>> > vqtbl4q_*8
>> >
>> > vqtbx2_*8
>> > vqtbx2q_*8
>> > vqtbx3_*8
>> > vqtbx3q_*8
>> > vqtbx4_*8
>> > vqtbx4q_*8
>> >
>> Quite possibly. Which tests are you looking at? Since these are
>> aarch64-specific, they are not part of the
>> tests I added (advsimd-intrinsics). Do you mean
>> gcc.target/aarch64/table-intrinsics.c?
>
> Sorry, yes I should have given a reference. I'm running with a variant of
> a testcase from the LLVM test-suite repository:
>
> SingleSource/UnitTests/Vector/AArch64/aarch64_neon_intrinsics.c
>
> This has an execute test for most of the intrinsics specified for AArch64.
> It needs some modification to cover the intrinsics we don't implement yet.
>
> Thanks,
> James
>
[-- Attachment #2: vtbX.txt --]
[-- Type: text/plain, Size: 440 bytes --]
2015-10-09 Christophe Lyon <christophe.lyon@linaro.org>
* config/aarch64/aarch64-simd-builtins.def: Update builtins
tables: add tbl3 and tbx4.
* config/aarch64/aarch64-simd.md (aarch64_tbl3v8qi): New.
(aarch64_tbx4v8qi): New.
* config/aarch64/arm_neon.h (vtbl3_s8, vtbl3_u8, vtbl3_p8)
(vtbl4_s8, vtbl4_u8, vtbl4_p8, vtbx4_s8, vtbx4_u8, vtbx4_p8):
Rewrite using builtin functions.
* config/aarch64/iterators.md (UNSPEC_TBX): New.
[-- Attachment #3: vtbX.patch --]
[-- Type: text/x-patch, Size: 10559 bytes --]
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index d0f298a..c16e82c9 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -405,3 +405,8 @@
VAR1 (BINOPP, crypto_pmull, 0, di)
VAR1 (BINOPP, crypto_pmull, 0, v2di)
+ /* Implemented by aarch64_tbl3v8qi. */
+ VAR1 (BINOP, tbl3, 0, v8qi)
+
+ /* Implemented by aarch64_tbx4v8qi. */
+ VAR1 (TERNOP, tbx4, 0, v8qi)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 9777418..6027582 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4716,6 +4714,27 @@
[(set_attr "type" "neon_tbl2_q")]
)
+(define_insn "aarch64_tbl3v8qi"
+ [(set (match_operand:V8QI 0 "register_operand" "=w")
+ (unspec:V8QI [(match_operand:OI 1 "register_operand" "w")
+ (match_operand:V8QI 2 "register_operand" "w")]
+ UNSPEC_TBL))]
+ "TARGET_SIMD"
+ "tbl\\t%S0.8b, {%S1.16b - %T1.16b}, %S2.8b"
+ [(set_attr "type" "neon_tbl3")]
+)
+
+(define_insn "aarch64_tbx4v8qi"
+ [(set (match_operand:V8QI 0 "register_operand" "=w")
+ (unspec:V8QI [(match_operand:V8QI 1 "register_operand" "0")
+ (match_operand:OI 2 "register_operand" "w")
+ (match_operand:V8QI 3 "register_operand" "w")]
+ UNSPEC_TBX))]
+ "TARGET_SIMD"
+ "tbx\\t%S0.8b, {%S2.16b - %T2.16b}, %S3.8b"
+ [(set_attr "type" "neon_tbl4")]
+)
+
(define_insn_and_split "aarch64_combinev16qi"
[(set (match_operand:OI 0 "register_operand" "=w")
(unspec:OI [(match_operand:V16QI 1 "register_operand" "w")
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 6dfebe7..e99819e 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -10902,13 +10902,14 @@ vtbl3_s8 (int8x8x3_t tab, int8x8_t idx)
{
int8x8_t result;
int8x16x2_t temp;
+ __builtin_aarch64_simd_oi __o;
temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
temp.val[1] = vcombine_s8 (tab.val[2], vcreate_s8 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
- "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
- : "=w"(result)
- : "Q"(temp), "w"(idx)
- : "v16", "v17", "memory");
+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
+ (int8x16_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
+ (int8x16_t) temp.val[1], 1);
+ result = __builtin_aarch64_tbl3v8qi (__o, idx);
return result;
}
@@ -10917,13 +10918,14 @@ vtbl3_u8 (uint8x8x3_t tab, uint8x8_t idx)
{
uint8x8_t result;
uint8x16x2_t temp;
+ __builtin_aarch64_simd_oi __o;
temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
temp.val[1] = vcombine_u8 (tab.val[2], vcreate_u8 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
- "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
- : "=w"(result)
- : "Q"(temp), "w"(idx)
- : "v16", "v17", "memory");
+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
+ (int8x16_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
+ (int8x16_t) temp.val[1], 1);
+ result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
return result;
}
@@ -10932,13 +10934,14 @@ vtbl3_p8 (poly8x8x3_t tab, uint8x8_t idx)
{
poly8x8_t result;
poly8x16x2_t temp;
+ __builtin_aarch64_simd_oi __o;
temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
temp.val[1] = vcombine_p8 (tab.val[2], vcreate_p8 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
- "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
- : "=w"(result)
- : "Q"(temp), "w"(idx)
- : "v16", "v17", "memory");
+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
+ (int8x16_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
+ (int8x16_t) temp.val[1], 1);
+ result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
return result;
}
@@ -10947,13 +10950,14 @@ vtbl4_s8 (int8x8x4_t tab, int8x8_t idx)
{
int8x8_t result;
int8x16x2_t temp;
+ __builtin_aarch64_simd_oi __o;
temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]);
- __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
- "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
- : "=w"(result)
- : "Q"(temp), "w"(idx)
- : "v16", "v17", "memory");
+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
+ (int8x16_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
+ (int8x16_t) temp.val[1], 1);
+ result = __builtin_aarch64_tbl3v8qi (__o, idx);
return result;
}
@@ -10962,13 +10966,14 @@ vtbl4_u8 (uint8x8x4_t tab, uint8x8_t idx)
{
uint8x8_t result;
uint8x16x2_t temp;
+ __builtin_aarch64_simd_oi __o;
temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]);
- __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
- "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
- : "=w"(result)
- : "Q"(temp), "w"(idx)
- : "v16", "v17", "memory");
+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
+ (int8x16_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
+ (int8x16_t) temp.val[1], 1);
+ result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
return result;
}
@@ -10977,13 +10982,14 @@ vtbl4_p8 (poly8x8x4_t tab, uint8x8_t idx)
{
poly8x8_t result;
poly8x16x2_t temp;
+ __builtin_aarch64_simd_oi __o;
temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]);
- __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
- "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
- : "=w"(result)
- : "Q"(temp), "w"(idx)
- : "v16", "v17", "memory");
+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
+ (int8x16_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
+ (int8x16_t) temp.val[1], 1);
+ result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
return result;
}
@@ -11023,51 +11029,6 @@ vtbx2_p8 (poly8x8_t r, poly8x8x2_t tab, uint8x8_t idx)
return result;
}
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbx4_s8 (int8x8_t r, int8x8x4_t tab, int8x8_t idx)
-{
- int8x8_t result = r;
- int8x16x2_t temp;
- temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
- temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]);
- __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
- "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
- : "+w"(result)
- : "Q"(temp), "w"(idx)
- : "v16", "v17", "memory");
- return result;
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbx4_u8 (uint8x8_t r, uint8x8x4_t tab, uint8x8_t idx)
-{
- uint8x8_t result = r;
- uint8x16x2_t temp;
- temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
- temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]);
- __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
- "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
- : "+w"(result)
- : "Q"(temp), "w"(idx)
- : "v16", "v17", "memory");
- return result;
-}
-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbx4_p8 (poly8x8_t r, poly8x8x4_t tab, uint8x8_t idx)
-{
- poly8x8_t result = r;
- poly8x16x2_t temp;
- temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
- temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]);
- __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
- "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
- : "+w"(result)
- : "Q"(temp), "w"(idx)
- : "v16", "v17", "memory");
- return result;
-}
-
/* End of temporary inline asm. */
/* Start of optimal implementations in approved order. */
@@ -23221,6 +23182,58 @@ vtbx3_p8 (poly8x8_t __r, poly8x8x3_t __tab, uint8x8_t __idx)
return vbsl_p8 (__mask, __tbl, __r);
}
+/* vtbx4 */
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vtbx4_s8 (int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx)
+{
+ int8x8_t result;
+ int8x16x2_t temp;
+ __builtin_aarch64_simd_oi __o;
+ temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]);
+ temp.val[1] = vcombine_s8 (__tab.val[2], __tab.val[3]);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
+ (int8x16_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
+ (int8x16_t) temp.val[1], 1);
+ result = __builtin_aarch64_tbx4v8qi (__r, __o, __idx);
+ return result;
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtbx4_u8 (uint8x8_t __r, uint8x8x4_t __tab, uint8x8_t __idx)
+{
+ uint8x8_t result;
+ uint8x16x2_t temp;
+ __builtin_aarch64_simd_oi __o;
+ temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]);
+ temp.val[1] = vcombine_u8 (__tab.val[2], __tab.val[3]);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
+ (int8x16_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
+ (int8x16_t) temp.val[1], 1);
+ result = (uint8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o,
+ (int8x8_t)__idx);
+ return result;
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtbx4_p8 (poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx)
+{
+ poly8x8_t result;
+ poly8x16x2_t temp;
+ __builtin_aarch64_simd_oi __o;
+ temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]);
+ temp.val[1] = vcombine_p8 (__tab.val[2], __tab.val[3]);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
+ (int8x16_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
+ (int8x16_t) temp.val[1], 1);
+ result = (poly8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o,
+ (int8x8_t)__idx);
+ return result;
+}
+
/* vtrn */
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index b8a45d1..d856117 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -253,6 +253,7 @@
UNSPEC_USHLL ; Used in aarch64-simd.md.
UNSPEC_ADDP ; Used in aarch64-simd.md.
UNSPEC_TBL ; Used in vector permute patterns.
+ UNSPEC_TBX ; Used in vector permute patterns.
UNSPEC_CONCAT ; Used in vector permute patterns.
UNSPEC_ZIP1 ; Used in vector permute patterns.
UNSPEC_ZIP2 ; Used in vector permute patterns.
* Re: [AArch64_be] Fix vtbl[34] and vtbx4
From: James Greenhalgh @ 2015-10-12 13:30 UTC
To: Christophe Lyon; +Cc: gcc-patches
On Fri, Oct 09, 2015 at 05:16:05PM +0100, Christophe Lyon wrote:
> On 8 October 2015 at 11:12, James Greenhalgh <james.greenhalgh@arm.com> wrote:
> > On Wed, Oct 07, 2015 at 09:07:30PM +0100, Christophe Lyon wrote:
> >> On 7 October 2015 at 17:09, James Greenhalgh <james.greenhalgh@arm.com> wrote:
> >> > On Tue, Sep 15, 2015 at 05:25:25PM +0100, Christophe Lyon wrote:
> >> >
> >> > Why do we want this for vtbx4 rather than putting out a VTBX instruction
> >> > directly (as in the inline asm versions you replace)?
> >> >
> >> I just followed the pattern used for vtbx3.
> >>
> >> > This sequence does make sense for vtbx3.
> >> In fact, I don't see why vtbx3 and vtbx4 should be different?
> >
> > The difference between TBL and TBX is in their handling of a request to
> > select an out-of-range value. For TBL this returns zero, for TBX this
> > returns the value which was already in the destination register.
> >
> > Because the byte-vectors used by the TBX instruction in aarch64 are 128-bit
> > (so two of them together allow selecting elements in the range 0-31), and
> > vtbx3 needs to emulate the AArch32 behaviour of picking elements from 3x64-bit
> > vectors (allowing elements in the range 0-23), we need to manually check for
> > values which would have been out-of-range on AArch32, but are not out
> > of range for AArch64 and handle them appropriately. For vtbx4 on the other
> > hand, 2x128-bit registers give the range 0..31 and 4x64-bit registers give
> > the range 0..31, so we don't need the special masked handling.
> >
> > You can find the suggested instruction sequences for the Neon intrinsics
> > in this document:
> >
> > http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf
> >
>
> Hi James,
>
> Please find attached an updated version which hopefully addresses your comments.
> Tested on aarch64-none-elf and aarch64_be-none-elf using the Foundation Model.
>
> OK?
Looks good to me,
Thanks,
James
* Re: [AArch64_be] Fix vtbl[34] and vtbx4
From: Christophe Lyon @ 2015-10-13 13:05 UTC
To: James Greenhalgh; +Cc: gcc-patches
[-- Attachment #1: Type: text/plain, Size: 2411 bytes --]
On 12 October 2015 at 15:30, James Greenhalgh <james.greenhalgh@arm.com> wrote:
> On Fri, Oct 09, 2015 at 05:16:05PM +0100, Christophe Lyon wrote:
>> On 8 October 2015 at 11:12, James Greenhalgh <james.greenhalgh@arm.com> wrote:
>> > On Wed, Oct 07, 2015 at 09:07:30PM +0100, Christophe Lyon wrote:
>> >> On 7 October 2015 at 17:09, James Greenhalgh <james.greenhalgh@arm.com> wrote:
>> >> > On Tue, Sep 15, 2015 at 05:25:25PM +0100, Christophe Lyon wrote:
>> >> >
>> >> > Why do we want this for vtbx4 rather than putting out a VTBX instruction
>> >> > directly (as in the inline asm versions you replace)?
>> >> >
>> >> I just followed the pattern used for vtbx3.
>> >>
>> >> > This sequence does make sense for vtbx3.
>> >> In fact, I don't see why vtbx3 and vtbx4 should be different?
>> >
>> > The difference between TBL and TBX is in their handling of a request to
>> > select an out-of-range value. For TBL this returns zero, for TBX this
>> > returns the value which was already in the destination register.
>> >
>> > Because the byte-vectors used by the TBX instruction in aarch64 are 128-bit
>> > (so two of them together allow selecting elements in the range 0-31), and
>> > vtbx3 needs to emulate the AArch32 behaviour of picking elements from 3x64-bit
>> > vectors (allowing elements in the range 0-23), we need to manually check for
>> > values which would have been out-of-range on AArch32, but are not out
>> > of range for AArch64 and handle them appropriately. For vtbx4 on the other
>> > hand, 2x128-bit registers give the range 0..31 and 4x64-bit registers give
>> > the range 0..31, so we don't need the special masked handling.
>> >
>> > You can find the suggested instruction sequences for the Neon intrinsics
>> > in this document:
>> >
>> > http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf
>> >
>>
>> Hi James,
>>
>> Please find attached an updated version which hopefully addresses your comments.
>> Tested on aarch64-none-elf and aarch64_be-none-elf using the Foundation Model.
>>
>> OK?
>
> Looks good to me,
>
> Thanks,
> James
>
I committed this as r228716, and noticed later that
gcc.target/aarch64/table-intrinsics.c failed because of this patch.
This is because that testcase scans the assembly for 'tbl v' or
'tbx v', but since I replaced some asm statements, the space is now
a tab.
I plan to commit this (probably obvious?):
[-- Attachment #2: table-intr.txt --]
[-- Type: text/plain, Size: 168 bytes --]
2015-10-13 Christophe Lyon <christophe.lyon@linaro.org>
* gcc/testsuite/gcc.target/aarch64/table-intrinsics.c: Fix regexp
after r228716 (Fix vtbl[34] and vtbx4).
[-- Attachment #3: table-intr.patch --]
[-- Type: text/x-patch, Size: 573 bytes --]
Index: gcc/testsuite/gcc.target/aarch64/table-intrinsics.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/table-intrinsics.c (revision 228759)
+++ gcc/testsuite/gcc.target/aarch64/table-intrinsics.c (working copy)
@@ -435,5 +435,5 @@
return vqtbx4q_p8 (r, tab, idx);
}
-/* { dg-final { scan-assembler-times "tbl v" 42} } */
-/* { dg-final { scan-assembler-times "tbx v" 30} } */
+/* { dg-final { scan-assembler-times "tbl\[ |\t\]*v" 42} } */
+/* { dg-final { scan-assembler-times "tbx\[ |\t\]*v" 30} } */
* Re: [AArch64_be] Fix vtbl[34] and vtbx4
From: James Greenhalgh @ 2015-10-13 13:08 UTC
To: Christophe Lyon; +Cc: gcc-patches
On Tue, Oct 13, 2015 at 02:05:01PM +0100, Christophe Lyon wrote:
> I committed this as r228716, and noticed later that
> gcc.target/aarch64/table-intrinsics.c failed because of this patch.
>
> This is because that testcase scans the assembly for 'tbl v' or
> 'tbx v', but since I replaced some asm statements, the space is now
> a tab.
>
> I plan to commit this (probably obvious?):
> 2015-10-13 Christophe Lyon <christophe.lyon@linaro.org>
>
> * gcc/testsuite/gcc.target/aarch64/table-intrinsics.c: Fix regexp
> after r228716 (Fix vtbl[34] and vtbx4).
Bad luck. This is fine (and yes, obvious).
Thanks,
James
> Index: gcc/testsuite/gcc.target/aarch64/table-intrinsics.c
> ===================================================================
> --- gcc/testsuite/gcc.target/aarch64/table-intrinsics.c (revision 228759)
> +++ gcc/testsuite/gcc.target/aarch64/table-intrinsics.c (working copy)
> @@ -435,5 +435,5 @@
> return vqtbx4q_p8 (r, tab, idx);
> }
>
> -/* { dg-final { scan-assembler-times "tbl v" 42} } */
> -/* { dg-final { scan-assembler-times "tbx v" 30} } */
> +/* { dg-final { scan-assembler-times "tbl\[ |\t\]*v" 42} } */
> +/* { dg-final { scan-assembler-times "tbx\[ |\t\]*v" 30} } */