From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 1922) id 79EEF386C5A2; Sun, 12 Jun 2022 03:25:38 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 79EEF386C5A2 MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Content-Type: text/plain; charset="utf-8" From: Prathamesh Kulkarni To: gcc-cvs@gcc.gnu.org Subject: [gcc r13-1055] PR96463: Optimise svld1rq from vectors for little endian AArch64 targets. X-Act-Checkin: gcc X-Git-Author: Prathamesh Kulkarni X-Git-Refname: refs/heads/master X-Git-Oldrev: cbd842717ec5cab989141bf1575846c2acef818d X-Git-Newrev: 494bec025002df422f2faa947138bf3643d80b54 Message-Id: <20220612032538.79EEF386C5A2@sourceware.org> Date: Sun, 12 Jun 2022 03:25:38 +0000 (GMT) X-BeenThere: gcc-cvs@gcc.gnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Gcc-cvs mailing list List-Unsubscribe: , List-Archive: List-Help: List-Subscribe: , X-List-Received-Date: Sun, 12 Jun 2022 03:25:38 -0000 https://gcc.gnu.org/g:494bec025002df422f2faa947138bf3643d80b54 commit r13-1055-g494bec025002df422f2faa947138bf3643d80b54 Author: Prathamesh Kulkarni Date: Sun Jun 12 08:50:16 2022 +0530 PR96463: Optimise svld1rq from vectors for little endian AArch64 targets. The patch folds: lhs = svld1rq({-1, -1, ...}, rhs) into: tmp = mem_ref<vectype> [(elem_type * {ref-all}) rhs] lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>. which is then expanded using aarch64_expand_sve_dupq. Example: svint32_t foo (int32x4_t x) { return svld1rq (svptrue_b8 (), &x[0]); } code-gen: foo: .LFB4350: dup z0.q, z0.q[0] ret The patch relaxes type-checking for VEC_PERM_EXPR by allowing different vector types for lhs and rhs provided: (1) rhs3 is constant and has integer type element. (2) len(lhs) == len(rhs3) and len(rhs1) == len(rhs2) (3) lhs and rhs have same element type. gcc/ChangeLog: PR target/96463 * config/aarch64/aarch64-sve-builtins-base.cc: Include ssa.h. (svld1rq_impl::fold): Define. 
* config/aarch64/aarch64.cc (expand_vec_perm_d): Define new members op_mode and op_vec_flags. (aarch64_evpc_reencode): Initialize newd.op_mode and newd.op_vec_flags. (aarch64_evpc_sve_dup): New function. (aarch64_expand_vec_perm_const_1): Gate existing calls to aarch64_evpc_* functions under d->vmode == d->op_mode, and call aarch64_evpc_sve_dup. (aarch64_vectorize_vec_perm_const): Remove assert d->vmode != d->op_mode, and initialize d.op_mode and d.op_vec_flags. * tree-cfg.cc (verify_gimple_assign_ternary): Allow different vector types for lhs and rhs in VEC_PERM_EXPR if rhs3 is constant. gcc/testsuite/ChangeLog: PR target/96463 * gcc.target/aarch64/sve/acle/general/pr96463-1.c: New test. * gcc.target/aarch64/sve/acle/general/pr96463-2.c: Likewise. Diff: --- gcc/config/aarch64/aarch64-sve-builtins-base.cc | 59 ++++++++++++++ gcc/config/aarch64/aarch64.cc | 95 ++++++++++++++++------ .../aarch64/sve/acle/general/pr96463-1.c | 29 +++++++ .../aarch64/sve/acle/general/pr96463-2.c | 29 +++++++ gcc/tree-cfg.cc | 40 ++++++--- 5 files changed, 212 insertions(+), 40 deletions(-) diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc index bee410929bd..82f9eba5c39 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc @@ -44,6 +44,7 @@ #include "aarch64-sve-builtins-shapes.h" #include "aarch64-sve-builtins-base.h" #include "aarch64-sve-builtins-functions.h" +#include "ssa.h" using namespace aarch64_sve; @@ -1207,6 +1208,64 @@ public: insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0)); return e.use_contiguous_load_insn (icode); } + + gimple * + fold (gimple_folder &f) const override + { + tree arg0 = gimple_call_arg (f.call, 0); + tree arg1 = gimple_call_arg (f.call, 1); + + /* Transform: + lhs = svld1rq ({-1, -1, ... }, arg1) + into: + tmp = mem_ref<vectype> [(elem * {ref-all}) arg1] + lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>. + on little endian target. 
+ vectype is the corresponding ADVSIMD type. */ + + if (!BYTES_BIG_ENDIAN + && integer_all_onesp (arg0)) + { + tree lhs = gimple_call_lhs (f.call); + tree lhs_type = TREE_TYPE (lhs); + poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type); + tree eltype = TREE_TYPE (lhs_type); + + scalar_mode elmode = GET_MODE_INNER (TYPE_MODE (lhs_type)); + machine_mode vq_mode = aarch64_vq_mode (elmode).require (); + tree vectype = build_vector_type_for_mode (eltype, vq_mode); + + tree elt_ptr_type + = build_pointer_type_for_mode (eltype, VOIDmode, true); + tree zero = build_zero_cst (elt_ptr_type); + + /* Use element type alignment. */ + tree access_type + = build_aligned_type (vectype, TYPE_ALIGN (eltype)); + + tree mem_ref_lhs = make_ssa_name_fn (cfun, access_type, 0); + tree mem_ref_op = fold_build2 (MEM_REF, access_type, arg1, zero); + gimple *mem_ref_stmt + = gimple_build_assign (mem_ref_lhs, mem_ref_op); + gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT); + + int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant (); + vec_perm_builder sel (lhs_len, source_nelts, 1); + for (int i = 0; i < source_nelts; i++) + sel.quick_push (i); + + vec_perm_indices indices (sel, 1, source_nelts); + gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), + TYPE_MODE (access_type), + indices)); + tree mask_type = build_vector_type (ssizetype, lhs_len); + tree mask = vec_perm_indices_to_tree (mask_type, indices); + return gimple_build_assign (lhs, VEC_PERM_EXPR, + mem_ref_lhs, mem_ref_lhs, mask); + } + + return NULL; + } }; class svld1ro_impl : public load_replicate diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 5969d1f56c2..d21e041eccb 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -23342,7 +23342,9 @@ struct expand_vec_perm_d rtx target, op0, op1; vec_perm_indices perm; machine_mode vmode; + machine_mode op_mode; unsigned int vec_flags; + unsigned int op_vec_flags; bool one_vector_p; bool 
testing_p; }; @@ -23577,6 +23579,8 @@ aarch64_evpc_reencode (struct expand_vec_perm_d *d) newd.vmode = new_mode; newd.vec_flags = VEC_ADVSIMD; + newd.op_mode = newd.vmode; + newd.op_vec_flags = newd.vec_flags; newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL; newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL; newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL; @@ -23891,6 +23895,33 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d) return true; } +/* Try to implement D using SVE dup instruction. */ + +static bool +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d) +{ + if (BYTES_BIG_ENDIAN + || !d->one_vector_p + || d->vec_flags != VEC_SVE_DATA + || d->op_vec_flags != VEC_ADVSIMD + || d->perm.encoding ().nelts_per_pattern () != 1 + || !known_eq (d->perm.encoding ().npatterns (), + GET_MODE_NUNITS (d->op_mode)) + || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128)) + return false; + + int npatterns = d->perm.encoding ().npatterns (); + for (int i = 0; i < npatterns; i++) + if (!known_eq (d->perm[i], i)) + return false; + + if (d->testing_p) + return true; + + aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0); + return true; +} + /* Try to implement D using SVE SEL instruction. */ static bool @@ -24014,6 +24045,8 @@ aarch64_evpc_ins (struct expand_vec_perm_d *d) static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) { + gcc_assert (d->op_mode != E_VOIDmode); + /* The pattern matching functions above are written to look for a small number to begin the sequence (0, 1, N/2). If we begin with an index from the second operand, we can swap the operands. 
*/ @@ -24030,30 +24063,39 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) || d->vec_flags == VEC_SVE_PRED) && known_gt (nelt, 1)) { - if (aarch64_evpc_rev_local (d)) - return true; - else if (aarch64_evpc_rev_global (d)) - return true; - else if (aarch64_evpc_ext (d)) - return true; - else if (aarch64_evpc_dup (d)) - return true; - else if (aarch64_evpc_zip (d)) - return true; - else if (aarch64_evpc_uzp (d)) - return true; - else if (aarch64_evpc_trn (d)) - return true; - else if (aarch64_evpc_sel (d)) - return true; - else if (aarch64_evpc_ins (d)) - return true; - else if (aarch64_evpc_reencode (d)) - return true; - if (d->vec_flags == VEC_SVE_DATA) - return aarch64_evpc_sve_tbl (d); - else if (d->vec_flags == VEC_ADVSIMD) - return aarch64_evpc_tbl (d); + if (d->vmode == d->op_mode) + { + if (aarch64_evpc_rev_local (d)) + return true; + else if (aarch64_evpc_rev_global (d)) + return true; + else if (aarch64_evpc_ext (d)) + return true; + else if (aarch64_evpc_dup (d)) + return true; + else if (aarch64_evpc_zip (d)) + return true; + else if (aarch64_evpc_uzp (d)) + return true; + else if (aarch64_evpc_trn (d)) + return true; + else if (aarch64_evpc_sel (d)) + return true; + else if (aarch64_evpc_ins (d)) + return true; + else if (aarch64_evpc_reencode (d)) + return true; + + if (d->vec_flags == VEC_SVE_DATA) + return aarch64_evpc_sve_tbl (d); + else if (d->vec_flags == VEC_ADVSIMD) + return aarch64_evpc_tbl (d); + } + else + { + if (aarch64_evpc_sve_dup (d)) + return true; + } } return false; } @@ -24065,9 +24107,6 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, rtx target, rtx op0, rtx op1, const vec_perm_indices &sel) { - if (vmode != op_mode) - return false; - struct expand_vec_perm_d d; /* Check whether the mask can be applied to a single vector. 
*/ @@ -24091,6 +24130,8 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, sel.nelts_per_input ()); d.vmode = vmode; d.vec_flags = aarch64_classify_vector_mode (d.vmode); + d.op_mode = op_mode; + d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode); d.target = target; d.op0 = op0 ? force_reg (vmode, op0) : NULL_RTX; if (op0 == op1) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c new file mode 100644 index 00000000000..b68f43cc3ba --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-options "-O3" } */ + +#include "arm_neon.h" +#include "arm_sve.h" + +#define TEST(ret_type, param_type, suffix) \ +ret_type test_##suffix(param_type x) \ +{ \ + return svld1rq_##suffix (svptrue_b8 (), &x[0]); \ +} + +TEST(svint8_t, int8x16_t, s8) +TEST(svint16_t, int16x8_t, s16) +TEST(svint32_t, int32x4_t, s32) +TEST(svint64_t, int64x2_t, s64) + +TEST(svuint8_t, uint8x16_t, u8) +TEST(svuint16_t, uint16x8_t, u16) +TEST(svuint32_t, uint32x4_t, u32) +TEST(svuint64_t, uint64x2_t, u64) + +TEST(svfloat16_t, float16x8_t, f16) +TEST(svfloat32_t, float32x4_t, f32) +TEST(svfloat64_t, float64x2_t, f64) + +TEST(svbfloat16_t, bfloat16x8_t, bf16) + +/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 12 { target aarch64_little_endian } } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c new file mode 100644 index 00000000000..196de3f5e0a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-options "-O3" } */ + +#include "arm_neon.h" +#include "arm_sve.h" + +#define TEST(ret_type, param_type, suffix) \ +ret_type test_##suffix(param_type *x) \ +{ \ + return svld1rq_##suffix (svptrue_b8 (), &x[0]); \ +} + 
+TEST(svint8_t, int8_t, s8) +TEST(svint16_t, int16_t, s16) +TEST(svint32_t, int32_t, s32) +TEST(svint64_t, int64_t, s64) + +TEST(svuint8_t, uint8_t, u8) +TEST(svuint16_t, uint16_t, u16) +TEST(svuint32_t, uint32_t, u32) +TEST(svuint64_t, uint64_t, u64) + +TEST(svfloat16_t, float16_t, f16) +TEST(svfloat32_t, float32_t, f32) +TEST(svfloat64_t, float64_t, f64) + +TEST(svbfloat16_t, bfloat16_t, bf16) + +/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 12 { target aarch64_little_endian } } } */ diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc index 8de1b144a42..9e5d84a9805 100644 --- a/gcc/tree-cfg.cc +++ b/gcc/tree-cfg.cc @@ -4297,18 +4297,14 @@ verify_gimple_assign_ternary (gassign *stmt) break; case VEC_PERM_EXPR: - if (!useless_type_conversion_p (lhs_type, rhs1_type) - || !useless_type_conversion_p (lhs_type, rhs2_type)) - { - error ("type mismatch in %qs", code_name); - debug_generic_expr (lhs_type); - debug_generic_expr (rhs1_type); - debug_generic_expr (rhs2_type); - debug_generic_expr (rhs3_type); - return true; - } + /* If permute is constant, then we allow for lhs and rhs + to have different vector types, provided: + (1) lhs, rhs1, rhs2 have same element type. + (2) rhs3 vector is constant and has integer element type. + (3) len(lhs) == len(rhs3) && len(rhs1) == len(rhs2). */ - if (TREE_CODE (rhs1_type) != VECTOR_TYPE + if (TREE_CODE (lhs_type) != VECTOR_TYPE + || TREE_CODE (rhs1_type) != VECTOR_TYPE || TREE_CODE (rhs2_type) != VECTOR_TYPE || TREE_CODE (rhs3_type) != VECTOR_TYPE) { @@ -4320,10 +4316,28 @@ verify_gimple_assign_ternary (gassign *stmt) return true; } + /* If rhs3 is constant, we allow lhs, rhs1 and rhs2 to be different vector types, + as long as lhs, rhs1 and rhs2 have same element type. */ + if (TREE_CONSTANT (rhs3) + ? 
(!useless_type_conversion_p (TREE_TYPE (lhs_type), TREE_TYPE (rhs1_type)) + || !useless_type_conversion_p (TREE_TYPE (lhs_type), TREE_TYPE (rhs2_type))) + : (!useless_type_conversion_p (lhs_type, rhs1_type) + || !useless_type_conversion_p (lhs_type, rhs2_type))) + { + error ("type mismatch in %qs", code_name); + debug_generic_expr (lhs_type); + debug_generic_expr (rhs1_type); + debug_generic_expr (rhs2_type); + debug_generic_expr (rhs3_type); + return true; + } + + /* If rhs3 is constant, relax the check len(rhs2) == len(rhs3). */ if (maybe_ne (TYPE_VECTOR_SUBPARTS (rhs1_type), TYPE_VECTOR_SUBPARTS (rhs2_type)) - || maybe_ne (TYPE_VECTOR_SUBPARTS (rhs2_type), - TYPE_VECTOR_SUBPARTS (rhs3_type)) + || (!TREE_CONSTANT(rhs3) + && maybe_ne (TYPE_VECTOR_SUBPARTS (rhs2_type), + TYPE_VECTOR_SUBPARTS (rhs3_type))) || maybe_ne (TYPE_VECTOR_SUBPARTS (rhs3_type), TYPE_VECTOR_SUBPARTS (lhs_type))) {