diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index d0f298a..c16e82c9 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -405,3 +405,8 @@ VAR1 (BINOPP, crypto_pmull, 0, di) VAR1 (BINOPP, crypto_pmull, 0, v2di) + /* Implemented by aarch64_tbl3v8qi. */ + VAR1 (BINOP, tbl3, 0, v8qi) + + /* Implemented by aarch64_tbx4v8qi. */ + VAR1 (TERNOP, tbx4, 0, v8qi) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 9777418..6027582 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -4716,6 +4714,27 @@ [(set_attr "type" "neon_tbl2_q")] ) +(define_insn "aarch64_tbl3v8qi" + [(set (match_operand:V8QI 0 "register_operand" "=w") + (unspec:V8QI [(match_operand:OI 1 "register_operand" "w") + (match_operand:V8QI 2 "register_operand" "w")] + UNSPEC_TBL))] + "TARGET_SIMD" + "tbl\\t%S0.8b, {%S1.16b - %T1.16b}, %S2.8b" + [(set_attr "type" "neon_tbl3")] +) + +(define_insn "aarch64_tbx4v8qi" + [(set (match_operand:V8QI 0 "register_operand" "=w") + (unspec:V8QI [(match_operand:V8QI 1 "register_operand" "0") + (match_operand:OI 2 "register_operand" "w") + (match_operand:V8QI 3 "register_operand" "w")] + UNSPEC_TBX))] + "TARGET_SIMD" + "tbx\\t%S0.8b, {%S2.16b - %T2.16b}, %S3.8b" + [(set_attr "type" "neon_tbl4")] +) + (define_insn_and_split "aarch64_combinev16qi" [(set (match_operand:OI 0 "register_operand" "=w") (unspec:OI [(match_operand:V16QI 1 "register_operand" "w") diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 6dfebe7..e99819e 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -10902,13 +10902,14 @@ vtbl3_s8 (int8x8x3_t tab, int8x8_t idx) { int8x8_t result; int8x16x2_t temp; + __builtin_aarch64_simd_oi __o; temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]); temp.val[1] = vcombine_s8 (tab.val[2], vcreate_s8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "=w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[1], 1); + result = __builtin_aarch64_tbl3v8qi (__o, idx); return result; } @@ -10917,13 +10918,14 @@ vtbl3_u8 (uint8x8x3_t tab, uint8x8_t idx) { uint8x8_t result; uint8x16x2_t temp; + __builtin_aarch64_simd_oi __o; temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]); temp.val[1] = vcombine_u8 (tab.val[2], vcreate_u8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "=w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[1], 1); + result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); return result; } @@ -10932,13 +10934,14 @@ vtbl3_p8 (poly8x8x3_t tab, uint8x8_t idx) { poly8x8_t result; poly8x16x2_t temp; + __builtin_aarch64_simd_oi __o; temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]); temp.val[1] = vcombine_p8 (tab.val[2], vcreate_p8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "=w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[1], 1); + result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); return result; } @@ -10947,13 +10950,14 @@ vtbl4_s8 (int8x8x4_t tab, int8x8_t idx) { int8x8_t result; int8x16x2_t temp; + __builtin_aarch64_simd_oi __o; temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]); temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "=w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[1], 1); + result = __builtin_aarch64_tbl3v8qi (__o, idx); return result; } @@ -10962,13 +10966,14 @@ vtbl4_u8 (uint8x8x4_t tab, uint8x8_t idx) { uint8x8_t result; uint8x16x2_t temp; + __builtin_aarch64_simd_oi __o; temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]); temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "=w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[1], 1); + result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); return result; } @@ -10977,13 +10982,14 @@ vtbl4_p8 (poly8x8x4_t tab, uint8x8_t idx) { poly8x8_t result; poly8x16x2_t temp; + __builtin_aarch64_simd_oi __o; temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]); temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "=w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[1], 1); + result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); return result; } @@ -11023,51 +11029,6 @@ vtbx2_p8 (poly8x8_t r, poly8x8x2_t tab, uint8x8_t idx) return result; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vtbx4_s8 (int8x8_t r, int8x8x4_t tab, int8x8_t idx) -{ - int8x8_t result = r; - int8x16x2_t temp; - temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "+w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); - return result; -} - -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtbx4_u8 (uint8x8_t r, uint8x8x4_t tab, uint8x8_t idx) -{ - uint8x8_t result = r; - uint8x16x2_t temp; - temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "+w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); - return result; -} - -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtbx4_p8 (poly8x8_t r, poly8x8x4_t tab, uint8x8_t idx) -{ - poly8x8_t result = r; - poly8x16x2_t temp; - temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "+w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); - return result; -} - /* End of temporary inline asm. */ /* Start of optimal implementations in approved order. */ @@ -23221,6 +23182,58 @@ vtbx3_p8 (poly8x8_t __r, poly8x8x3_t __tab, uint8x8_t __idx) return vbsl_p8 (__mask, __tbl, __r); } +/* vtbx4 */ + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vtbx4_s8 (int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx) +{ + int8x8_t result; + int8x16x2_t temp; + __builtin_aarch64_simd_oi __o; + temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]); + temp.val[1] = vcombine_s8 (__tab.val[2], __tab.val[3]); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[1], 1); + result = __builtin_aarch64_tbx4v8qi (__r, __o, __idx); + return result; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtbx4_u8 (uint8x8_t __r, uint8x8x4_t __tab, uint8x8_t __idx) +{ + uint8x8_t result; + uint8x16x2_t temp; + __builtin_aarch64_simd_oi __o; + temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]); + temp.val[1] = vcombine_u8 (__tab.val[2], __tab.val[3]); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[1], 1); + result = (uint8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o, + (int8x8_t)__idx); + return result; +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vtbx4_p8 (poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx) +{ + poly8x8_t result; + poly8x16x2_t temp; + __builtin_aarch64_simd_oi __o; + temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]); + temp.val[1] = vcombine_p8 (__tab.val[2], __tab.val[3]); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[1], 1); + result = (poly8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o, + (int8x8_t)__idx); + return result; +} + /* vtrn */ __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index b8a45d1..d856117 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -253,6 +253,7 @@ UNSPEC_USHLL ; Used in aarch64-simd.md. UNSPEC_ADDP ; Used in aarch64-simd.md. UNSPEC_TBL ; Used in vector permute patterns. + UNSPEC_TBX ; Used in vector permute patterns. UNSPEC_CONCAT ; Used in vector permute patterns. UNSPEC_ZIP1 ; Used in vector permute patterns. UNSPEC_ZIP2 ; Used in vector permute patterns.