diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index ab84257..7b72ead 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -236,4 +236,9 @@ rtx aarch64_expand_builtin (tree exp,
 			    int ignore ATTRIBUTE_UNUSED);
 tree aarch64_builtin_decl (unsigned, bool ATTRIBUTE_UNUSED);
 
+extern void aarch64_split_combinev16qi (rtx operands[3]);
+extern void aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel);
+extern bool
+aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel);
+
 #endif /* GCC_AARCH64_PROTOS_H */
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index b3d01c1..2b0c8d6 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3298,6 +3298,74 @@
 
 ;; Permuted-store expanders for neon intrinsics.
 
+;; Permute instructions
+
+;; vec_perm support
+
+(define_expand "vec_perm_const<mode>"
+  [(match_operand:VALL 0 "register_operand")
+   (match_operand:VALL 1 "register_operand")
+   (match_operand:VALL 2 "register_operand")
+   (match_operand:<V_cmp_result> 3)]
+  "TARGET_SIMD"
+{
+  if (aarch64_expand_vec_perm_const (operands[0], operands[1],
+				     operands[2], operands[3]))
+    DONE;
+  else
+    FAIL;
+})
+
+(define_expand "vec_perm<mode>"
+  [(match_operand:VB 0 "register_operand")
+   (match_operand:VB 1 "register_operand")
+   (match_operand:VB 2 "register_operand")
+   (match_operand:VB 3 "register_operand")]
+  "TARGET_SIMD"
+{
+  aarch64_expand_vec_perm (operands[0], operands[1],
+			   operands[2], operands[3]);
+  DONE;
+})
+
+(define_insn "aarch64_tbl1<mode>"
+  [(set (match_operand:VB 0 "register_operand" "=w")
+	(unspec:VB [(match_operand:V16QI 1 "register_operand" "w")
+		    (match_operand:VB 2 "register_operand" "w")]
+		   UNSPEC_TBL))]
+  "TARGET_SIMD"
+  "tbl\\t%0.<Vtype>, {%1.16b}, %2.<Vtype>"
+  [(set_attr "simd_type" "simd_tbl")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
+;; Two source registers.
+
+(define_insn "aarch64_tbl2v16qi"
+  [(set (match_operand:V16QI 0 "register_operand" "=w")
+	(unspec:V16QI [(match_operand:OI 1 "register_operand" "w")
+		       (match_operand:V16QI 2 "register_operand" "w")]
+		      UNSPEC_TBL))]
+  "TARGET_SIMD"
+  "tbl\\t%0.16b, {%S1.16b - %T1.16b}, %2.16b"
+  [(set_attr "simd_type" "simd_tbl")
+   (set_attr "simd_mode" "V16QI")]
+)
+
+(define_insn_and_split "aarch64_combinev16qi"
+  [(set (match_operand:OI 0 "register_operand" "=w")
+	(unspec:OI [(match_operand:V16QI 1 "register_operand" "w")
+		    (match_operand:V16QI 2 "register_operand" "w")]
+		   UNSPEC_CONCAT))]
+  "TARGET_SIMD"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  aarch64_split_combinev16qi (operands);
+  DONE;
+})
+
 (define_insn "aarch64_st2<mode>_dreg"
   [(set (match_operand:TI 0 "aarch64_simd_struct_operand" "=Utv")
 	(unspec:TI [(match_operand:OI 1 "register_operand" "w")
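
For reference (this note and the sketch below are annotation, not part of the patch): the TBL patterns above look each result byte up in a 16-byte table (aarch64_tbl1<mode>) or a 32-byte two-register table (aarch64_tbl2v16qi), and a selector byte outside the table produces zero rather than wrapping.  A minimal scalar model, with names of our own choosing, is:

#include <stdint.h>

/* Scalar model of TBL: each selector byte indexes a table of
   table_bytes bytes (16 for one register, 32 for a register pair);
   any out-of-range index yields zero instead of wrapping.  */
static void
tbl_model (uint8_t *dst, const uint8_t *table, unsigned int table_bytes,
           const uint8_t *sel, unsigned int nelt)
{
  unsigned int i;

  for (i = 0; i < nelt; i++)
    dst[i] = sel[i] < table_bytes ? table[sel[i]] : 0;
}

That zeroing behaviour is why the variable-selector expander in aarch64.c below masks the selector before handing it to TBL.
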
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index f262ef9..cebc8cb 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -110,6 +110,9 @@ static unsigned bit_count (unsigned HOST_WIDE_INT);
 static bool aarch64_const_vec_all_same_int_p (rtx,
					       HOST_WIDE_INT, HOST_WIDE_INT);
 
+static bool aarch64_vectorize_vec_perm_const_ok (enum machine_mode vmode,
+						 const unsigned char *sel);
+
 /* The processor for which instructions should be scheduled.  */
 enum aarch64_processor aarch64_tune = generic;
 
@@ -6678,6 +6681,292 @@ aarch64_c_mode_for_suffix (char suffix)
   return VOIDmode;
 }
 
+/* Split operands into moves from op[1] + op[2] into op[0].  */
+
+void
+aarch64_split_combinev16qi (rtx operands[3])
+{
+  unsigned int dest = REGNO (operands[0]);
+  unsigned int src1 = REGNO (operands[1]);
+  unsigned int src2 = REGNO (operands[2]);
+  enum machine_mode halfmode = GET_MODE (operands[1]);
+  unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
+  rtx destlo, desthi;
+
+  gcc_assert (halfmode == V16QImode);
+
+  if (src1 == dest && src2 == dest + halfregs)
+    {
+      /* No-op move.  Can't split to nothing; emit something.  */
+      emit_note (NOTE_INSN_DELETED);
+      return;
+    }
+
+  /* Preserve register attributes for variable tracking.  */
+  destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
+  desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
+			       GET_MODE_SIZE (halfmode));
+
+  /* Special case of reversed high/low parts.  */
+  if (reg_overlap_mentioned_p (operands[2], destlo)
+      && reg_overlap_mentioned_p (operands[1], desthi))
+    {
+      emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
+      emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
+      emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
+    }
+  else if (!reg_overlap_mentioned_p (operands[2], destlo))
+    {
+      /* Try to avoid unnecessary moves if part of the result
+	 is in the right place already.  */
+      if (src1 != dest)
+	emit_move_insn (destlo, operands[1]);
+      if (src2 != dest + halfregs)
+	emit_move_insn (desthi, operands[2]);
+    }
+  else
+    {
+      if (src2 != dest + halfregs)
+	emit_move_insn (desthi, operands[2]);
+      if (src1 != dest)
+	emit_move_insn (destlo, operands[1]);
+    }
+}
+
+/* vec_perm support.  */
+
+#define MAX_VECT_LEN 16
+
+struct expand_vec_perm_d
+{
+  rtx target, op0, op1;
+  unsigned char perm[MAX_VECT_LEN];
+  enum machine_mode vmode;
+  unsigned char nelt;
+  bool one_vector_p;
+  bool testing_p;
+};
+
+/* Generate a variable permutation.  */
+
+static void
+aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
+{
+  enum machine_mode vmode = GET_MODE (target);
+  bool one_vector_p = rtx_equal_p (op0, op1);
+
+  gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
+  gcc_checking_assert (GET_MODE (op0) == vmode);
+  gcc_checking_assert (GET_MODE (op1) == vmode);
+  gcc_checking_assert (GET_MODE (sel) == vmode);
+  gcc_checking_assert (TARGET_SIMD);
+
+  if (one_vector_p)
+    {
+      if (vmode == V8QImode)
+	{
+	  /* Expand the argument to a V16QI mode by duplicating it.  */
+	  rtx pair = gen_reg_rtx (V16QImode);
+	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
+	  emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
+	}
+      else
+	{
+	  emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
+	}
+    }
+  else
+    {
+      rtx pair;
+
+      if (vmode == V8QImode)
+	{
+	  pair = gen_reg_rtx (V16QImode);
+	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
+	  emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
+	}
+      else
+	{
+	  pair = gen_reg_rtx (OImode);
+	  emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
+	  emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
+	}
+    }
+}
+
+void
+aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
+{
+  enum machine_mode vmode = GET_MODE (target);
+  unsigned int i, nelt = GET_MODE_NUNITS (vmode);
+  bool one_vector_p = rtx_equal_p (op0, op1);
+  rtx rmask[MAX_VECT_LEN], mask;
+
+  gcc_checking_assert (!BYTES_BIG_ENDIAN);
+
+  /* The TBL instruction does not use a modulo index, so we must take care
+     of that ourselves.  */
+  mask = GEN_INT (one_vector_p ? nelt - 1 : 2 * nelt - 1);
+  for (i = 0; i < nelt; ++i)
+    rmask[i] = mask;
+  mask = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rmask));
+  sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
+
+  aarch64_expand_vec_perm_1 (target, op0, op1, sel);
+}
+
+static bool
+aarch64_evpc_tbl (struct expand_vec_perm_d *d)
+{
+  rtx rperm[MAX_VECT_LEN], sel;
+  enum machine_mode vmode = d->vmode;
+  unsigned int i, nelt = d->nelt;
+
+  /* TODO: ARM's TBL indexing is little-endian.  In order to handle GCC's
+     numbering of elements for big-endian, we must reverse the order.  */
+  if (BYTES_BIG_ENDIAN)
+    return false;
+
+  if (d->testing_p)
+    return true;
+
+  /* Generic code will try constant permutation twice.  Once with the
+     original mode and again with the elements lowered to QImode.
+     So wait and don't do the selector expansion ourselves.  */
+  if (vmode != V8QImode && vmode != V16QImode)
+    return false;
+
+  for (i = 0; i < nelt; ++i)
+    rperm[i] = GEN_INT (d->perm[i]);
+  sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
+  sel = force_reg (vmode, sel);
+
+  aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
+  return true;
+}
+
+static bool
+aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
+{
+  /* The pattern matching functions above are written to look for a small
+     number to begin the sequence (0, 1, N/2).  If we begin with an index
+     from the second operand, we can swap the operands.  */
+  if (d->perm[0] >= d->nelt)
+    {
+      unsigned i, nelt = d->nelt;
+      rtx x;
+
+      for (i = 0; i < nelt; ++i)
+	d->perm[i] = (d->perm[i] + nelt) & (2 * nelt - 1);
+
+      x = d->op0;
+      d->op0 = d->op1;
+      d->op1 = x;
+    }
+
+  if (TARGET_SIMD)
+    return aarch64_evpc_tbl (d);
+  return false;
+}
+
+/* Expand a vec_perm_const pattern.  */
+
+bool
+aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
+{
+  struct expand_vec_perm_d d;
+  int i, nelt, which;
+
+  d.target = target;
+  d.op0 = op0;
+  d.op1 = op1;
+
+  d.vmode = GET_MODE (target);
+  gcc_assert (VECTOR_MODE_P (d.vmode));
+  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+  d.testing_p = false;
+
+  for (i = which = 0; i < nelt; ++i)
+    {
+      rtx e = XVECEXP (sel, 0, i);
+      int ei = INTVAL (e) & (2 * nelt - 1);
+      which |= (ei < nelt ? 1 : 2);
+      d.perm[i] = ei;
+    }
+
+  switch (which)
+    {
+    default:
+      gcc_unreachable ();
+
+    case 3:
+      d.one_vector_p = false;
+      if (!rtx_equal_p (op0, op1))
+	break;
+
+      /* The elements of PERM do not suggest that only the first operand
+	 is used, but both operands are identical.  Allow easier matching
+	 of the permutation by folding the permutation into the single
+	 input vector.  */
+      /* Fall Through.  */
+    case 2:
+      for (i = 0; i < nelt; ++i)
+	d.perm[i] &= nelt - 1;
+      d.op0 = op1;
+      d.one_vector_p = true;
+      break;
+
+    case 1:
+      d.op1 = op0;
+      d.one_vector_p = true;
+      break;
+    }
+
+  return aarch64_expand_vec_perm_const_1 (&d);
+}
+
+static bool
+aarch64_vectorize_vec_perm_const_ok (enum machine_mode vmode,
+				     const unsigned char *sel)
+{
+  struct expand_vec_perm_d d;
+  unsigned int i, nelt, which;
+  bool ret;
+
+  d.vmode = vmode;
+  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+  d.testing_p = true;
+  memcpy (d.perm, sel, nelt);
+
+  /* Calculate whether all elements are in one vector.  */
+  for (i = which = 0; i < nelt; ++i)
+    {
+      unsigned char e = d.perm[i];
+      gcc_assert (e < 2 * nelt);
+      which |= (e < nelt ? 1 : 2);
+    }
+
+  /* If all elements are from the second vector, reindex as if from the
+     first vector.  */
+  if (which == 2)
+    for (i = 0; i < nelt; ++i)
+      d.perm[i] -= nelt;
+
+  /* Check whether the mask can be applied to a single vector.  */
+  d.one_vector_p = (which != 3);
+
+  d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
+  d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
+  if (!d.one_vector_p)
+    d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
+
+  start_sequence ();
+  ret = aarch64_expand_vec_perm_const_1 (&d);
+  end_sequence ();
+
+  return ret;
+}
+
 #undef TARGET_ADDRESS_COST
 #define TARGET_ADDRESS_COST aarch64_address_cost
 
@@ -6864,6 +7153,12 @@ aarch64_c_mode_for_suffix (char suffix)
 #undef TARGET_MAX_ANCHOR_OFFSET
 #define TARGET_MAX_ANCHOR_OFFSET 4095
 
+/* vec_perm support.  */
+
+#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
+#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
+  aarch64_vectorize_vec_perm_const_ok
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-aarch64.h"
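
Again as annotation rather than patch content: aarch64_expand_vec_perm has to provide the optab's modulo indexing on top of TBL's zero-on-overflow rule, which is what the AND against nelt - 1 (one distinct input) or 2 * nelt - 1 (two inputs) achieves, since nelt is a power of two.  In scalar terms, with names of our own:

#include <stdint.h>

/* What a variable byte permute is required to compute: each selector
   element picks from the concatenation of op0 and op1, with the index
   reduced modulo 2*nelt (modulo nelt when both inputs are the same
   register).  After the masking above, TBL's zero-for-out-of-range
   case can no longer trigger, so TBL produces exactly this result.  */
static void
vec_perm_model (uint8_t *dst, const uint8_t *op0, const uint8_t *op1,
                const uint8_t *sel, unsigned int nelt)
{
  unsigned int i;

  for (i = 0; i < nelt; i++)
    {
      unsigned int idx = sel[i] & (2 * nelt - 1);
      dst[i] = idx < nelt ? op0[idx] : op1[idx - nelt];
    }
}
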
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 7a1cdc8..9ea5e0c 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -228,6 +228,8 @@
     UNSPEC_FMAX		; Used in aarch64-simd.md.
     UNSPEC_FMIN		; Used in aarch64-simd.md.
     UNSPEC_BSL		; Used in aarch64-simd.md.
+    UNSPEC_TBL		; Used in vector permute patterns.
+    UNSPEC_CONCAT	; Used in vector permute patterns.
 ])
 
 ;; -------------------------------------------------------------------
@@ -415,8 +417,9 @@
 (define_mode_attr V_cmp_result [(V8QI "V8QI") (V16QI "V16QI")
				 (V4HI "V4HI") (V8HI  "V8HI")
				 (V2SI "V2SI") (V4SI  "V4SI")
+				 (DI   "DI")   (V2DI  "V2DI")
				 (V2SF "V2SI") (V4SF  "V4SI")
-				 (DI   "DI")   (V2DI  "V2DI")])
+				 (V2DF "V2DI")])
 
 ;; Vm for lane instructions is restricted to FP_LO_REGS.
 (define_mode_attr vwx [(V4HI "x") (V8HI "x") (HI "x")
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 5935346..bce98d0 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -3014,6 +3014,7 @@ proc check_effective_target_vect_perm { } {
     } else {
	 set et_vect_perm_saved 0
         if { [is-effective-target arm_neon_ok]
+	     || [istarget aarch64*-*-*]
	      || [istarget powerpc*-*-*]
              || [istarget spu-*-*]
	      || [istarget i?86-*-*]
@@ -3040,6 +3041,7 @@ proc check_effective_target_vect_perm_byte { } {
     } else {
	 set et_vect_perm_byte_saved 0
         if { [is-effective-target arm_neon_ok]
+	     || [istarget aarch64*-*-*]
	      || [istarget powerpc*-*-*]
              || [istarget spu-*-*] } {
	     set et_vect_perm_byte_saved 1
@@ -3062,6 +3064,7 @@ proc check_effective_target_vect_perm_short { } {
     } else {
	 set et_vect_perm_short_saved 0
         if { [is-effective-target arm_neon_ok]
+	     || [istarget aarch64*-*-*]
	      || [istarget powerpc*-*-*]
              || [istarget spu-*-*] } {
	     set et_vect_perm_short_saved 1
@@ -3697,7 +3700,8 @@ proc check_effective_target_vect_char_mult { } {
	 verbose "check_effective_target_vect_char_mult: using cached result" 2
     } else {
	 set et_vect_char_mult_saved 0
-	if { [istarget ia64-*-*]
+	if { [istarget aarch64*-*-*]
+	     || [istarget ia64-*-*]
	      || [istarget i?86-*-*]
	      || [istarget x86_64-*-*]
	      || [check_effective_target_arm32] } {
@@ -3768,8 +3772,9 @@ proc check_effective_target_vect_extract_even_odd { } {
	 verbose "check_effective_target_vect_extract_even_odd: using cached result" 2
     } else {
	 set et_vect_extract_even_odd_saved 0
-	if { [istarget powerpc*-*-*]
-             || [is-effective-target arm_neon_ok]
+	if { [istarget aarch64*-*-*]
+	     || [istarget powerpc*-*-*]
+             || [is-effective-target arm_neon_ok]
	      || [istarget i?86-*-*]
	      || [istarget x86_64-*-*]
	      || [istarget ia64-*-*]
@@ -3793,8 +3798,9 @@ proc check_effective_target_vect_interleave { } {
	 verbose "check_effective_target_vect_interleave: using cached result" 2
     } else {
	 set et_vect_interleave_saved 0
-	if { [istarget powerpc*-*-*]
-             || [is-effective-target arm_neon_ok]
+	if { [istarget aarch64*-*-*]
+	     || [istarget powerpc*-*-*]
+             || [is-effective-target arm_neon_ok]
	      || [istarget i?86-*-*]
	      || [istarget x86_64-*-*]
	      || [istarget ia64-*-*]
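
Finally, a sketch of the kind of loop the new effective-target keywords guard (our own example, not one of the existing tests): a loop whose vectorized form requires an element permute, such as a byte interleave, can now be vectorized on aarch64 because the permute is expandable through TBL.

#define N 64

/* Interleaving two byte arrays needs a permute once vectorized; with
   the new vec_perm/vec_perm_const expanders that permute can be
   emitted as TBL instructions instead of blocking vectorization.  */
void
interleave_bytes (unsigned char *__restrict out,
                  unsigned char *__restrict a,
                  unsigned char *__restrict b)
{
  int i;

  for (i = 0; i < N; i++)
    {
      out[2 * i] = a[i];
      out[2 * i + 1] = b[i];
    }
}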