Hi All, This does what it says on the tin. In case some operations are formed in RTL by a split, combine or any other RTL pass, still try to recognize them. Bootstrapped and regtested on aarch64-none-linux-gnu with no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: * config/aarch64/aarch64-simd.md: Add new peepholes. * config/aarch64/aarch64.cc (aarch_macro_fusion_pair_p): Schedule sequential PLUS operations next to each other to increase the chance of forming pairwise operations. --- inline copy of patch -- diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 93a2888f567460ad10ec050ea7d4f701df4729d1..20e9adbf7b9b484f9a19f0c62770930dc3941eb2 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -3425,6 +3425,22 @@ (define_insn "aarch64_faddp<mode>" [(set_attr "type" "neon_fp_reduc_add_<stype><q>")] ) +(define_peephole2 + [(set (match_operand:<VEL> 0 "register_operand") + (vec_select:<VEL> + (match_operand:VHSDF 1 "register_operand") + (parallel [(match_operand 2 "const_int_operand")]))) + (set (match_operand:<VEL> 3 "register_operand") + (plus:<VEL> + (match_dup 0) + (match_operand:<VEL> 5 "register_operand")))] + "TARGET_SIMD + && ENDIAN_LANE_N (<nunits>, INTVAL (operands[2])) == 1 + && REGNO (operands[5]) == REGNO (operands[1]) + && peep2_reg_dead_p (2, operands[0])" + [(set (match_dup 3) (unspec:<VEL> [(match_dup 1)] UNSPEC_FADDV))] +) + (define_insn "reduc_plus_scal_<mode>" [(set (match_operand:<VEL> 0 "register_operand" "=w") (unspec:<VEL> [(match_operand:VDQV 1 "register_operand" "w")] diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index f3bd71c9f10868f9e6ab50d8e36ed3ee3d48ac22..4023b1729d92bf37f5a2fc8fc8cd3a5194532079 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -25372,6 +25372,29 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) } } + /* Try to schedule vec_select and add together so the peephole works. 
*/ + if (simple_sets_p && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)) + && GET_CODE (SET_SRC (prev_set)) == VEC_SELECT && GET_CODE (SET_SRC (curr_set)) == PLUS) + { + /* We're trying to match: + prev (vec_select) == (set (reg r0) + (vec_select (reg r1) n) + curr (plus) == (set (reg r2) + (plus (reg r0) (reg r1))) */ + rtx prev_src = SET_SRC (prev_set); + rtx curr_src = SET_SRC (curr_set); + rtx parallel = XEXP (prev_src, 1); + auto idx + = ENDIAN_LANE_N (GET_MODE_NUNITS (GET_MODE (XEXP (prev_src, 0))), 1); + if (GET_CODE (parallel) == PARALLEL + && XVECLEN (parallel, 0) == 1 + && known_eq (INTVAL (XVECEXP (parallel, 0, 0)), idx) + && GET_MODE (SET_DEST (prev_set)) == GET_MODE (curr_src) + && GET_MODE_INNER (GET_MODE (XEXP (prev_src, 0))) + == GET_MODE (XEXP (curr_src, 1))) + return true; + } + /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */ if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH) && prev_set && curr_set && any_condjump_p (curr) --