diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 7f212bf37cd2c120dceb7efa733c9fa76226f029..26871a56d1fdb134f0ad9d828ce68a8df0272c53 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4867,62 +4867,48 @@ (define_expand "aarch64_<sur><addsub>hn2<mode>"
 }
 )
 
-;; div optimizations using narrowings
-;; we can do the division e.g. shorts by 255 faster by calculating it as
-;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
-;; double the precision of x.
-;;
-;; If we imagine a short as being composed of two blocks of bytes then
-;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalent to
-;; adding 1 to each sub component:
-;;
-;;      short value of 16-bits
-;; ┌──────────────┬────────────────┐
-;; │              │                │
-;; └──────────────┴────────────────┘
-;;   8-bit part1 ▲  8-bit part2   ▲
-;;               │                │
-;;               │                │
-;;              +1               +1
-;;
-;; after the first addition, we have to shift right by 8, and narrow the
-;; results back to a byte.  Remember that the addition must be done in
-;; double the precision of the input.  Since 8 is half the size of a short
-;; we can use a narrowing halfing instruction in AArch64, addhn which also
-;; does the addition in a wider precision and narrows back to a byte.  The
-;; shift itself is implicit in the operation as it writes back only the top
-;; half of the result. i.e. bits 2*esize-1:esize.
-;;
-;; Since we have narrowed the result of the first part back to a byte, for
-;; the second addition we can use a widening addition, uaddw.
-;;
-;; For the final shift, since it's unsigned arithmetic we emit an ushr by 8.
-;;
-;; The shift is later optimized by combine to a uzp2 with movi #0.
-(define_expand "@aarch64_bitmask_udiv<mode>3"
-  [(match_operand:VQN 0 "register_operand")
-   (match_operand:VQN 1 "register_operand")
-   (match_operand:VQN 2 "immediate_operand")]
+;; Implement add_highpart as ADD + RSHIFT.  We have various optimizations for
+;; narrowing operations represented as shifts, so this representation allows
+;; us to further optimize the sequence should the result require narrowing.
+;; The alternative representation of ADDHN + UXTL is less efficient and
+;; harder to optimize further.
+(define_expand "add3_highpart" + [(set (match_operand:VQN 0 "register_operand") + (unspec:VQN [(match_operand:VQN 1 "register_operand") + (match_operand:VQN 2 "register_operand")] + ADD_HIGHPART))] + "TARGET_SIMD" +{ + rtx result = gen_reg_rtx (mode); + int shift_amount = GET_MODE_UNIT_BITSIZE (mode) / 2; + rtx shift_vector = aarch64_simd_gen_const_vector_dup (mode, + shift_amount); + emit_insn (gen_add3 (result, operands[1], operands[2])); + emit_insn (gen_aarch64_simd_lshr (operands[0], result, shift_vector)); + DONE; +}) + +;; Optimize ((a + b) >> n) + c where n is half the bitsize of the vector +(define_insn_and_split "*bitmask_shift_plus" + [(set (match_operand:VQN 0 "register_operand" "=w") + (plus:VQN + (lshiftrt:VQN + (plus:VQN (match_operand:VQN 1 "register_operand" "w") + (match_operand:VQN 2 "register_operand" "w")) + (match_operand:VQN 3 "aarch64_simd_shift_imm_vec_exact_top" "Dr")) + (match_operand:VQN 4 "register_operand" "w")))] "TARGET_SIMD" + "#" + "&& !reload_completed" + [(const_int 0)] { - unsigned HOST_WIDE_INT size - = (1ULL << GET_MODE_UNIT_BITSIZE (mode)) - 1; - rtx elt = unwrap_const_vec_duplicate (operands[2]); - if (!CONST_INT_P (elt) || UINTVAL (elt) != size) - FAIL; - - rtx addend = gen_reg_rtx (mode); - rtx val = aarch64_simd_gen_const_vector_dup (mode, 1); - emit_move_insn (addend, lowpart_subreg (mode, val, mode)); - rtx tmp1 = gen_reg_rtx (mode); - rtx tmp2 = gen_reg_rtx (mode); - emit_insn (gen_aarch64_addhn (tmp1, operands[1], addend)); - unsigned bitsize = GET_MODE_UNIT_BITSIZE (mode); - rtx shift_vector = aarch64_simd_gen_const_vector_dup (mode, bitsize); - emit_insn (gen_aarch64_uaddw (tmp2, operands[1], tmp1)); - emit_insn (gen_aarch64_simd_lshr (operands[0], tmp2, shift_vector)); + rtx tmp = gen_reg_rtx (mode); + emit_insn (gen_aarch64_addhn (tmp, operands[1], operands[2])); + emit_insn (gen_aarch64_uaddw (operands[0], operands[4], tmp)); DONE; -}) +} + [(set_attr "type" "neon_add_halve")] +) ;; pmul. diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md index 40c0728a7e6f00c395c360ce7625bc2e4a018809..ad01c1ddf9257cec951ed0c16558a3c4d856813b 100644 --- a/gcc/config/aarch64/aarch64-sve2.md +++ b/gcc/config/aarch64/aarch64-sve2.md @@ -2317,39 +2317,51 @@ (define_insn "@aarch64_sve_" ;; ---- [INT] Misc optab implementations ;; ------------------------------------------------------------------------- ;; Includes: -;; - aarch64_bitmask_udiv +;; - add_highpart ;; ------------------------------------------------------------------------- -;; div optimizations using narrowings -;; we can do the division e.g. shorts by 255 faster by calculating it as -;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in -;; double the precision of x. -;; -;; See aarch64-simd.md for bigger explanation. -(define_expand "@aarch64_bitmask_udiv3" - [(match_operand:SVE_FULL_HSDI 0 "register_operand") - (match_operand:SVE_FULL_HSDI 1 "register_operand") - (match_operand:SVE_FULL_HSDI 2 "immediate_operand")] +;; Implement add_highpart as ADD + RSHIFT, we have various optimization for +;; narrowing represented as shifts and so this representation will allow us to +;; further optimize this should the result require narrowing. The alternative +;; representation of ADDHN + UXTL is less efficient and harder to futher +;; optimize. 
+(define_expand "add3_highpart" + [(set (match_operand:SVE_FULL_HSDI 0 "register_operand") + (unspec:SVE_FULL_HSDI + [(match_operand:SVE_FULL_HSDI 1 "register_operand") + (match_operand:SVE_FULL_HSDI 2 "register_operand")] + ADD_HIGHPART))] "TARGET_SVE2" { - unsigned HOST_WIDE_INT size - = (1ULL << GET_MODE_UNIT_BITSIZE (mode)) - 1; - rtx elt = unwrap_const_vec_duplicate (operands[2]); - if (!CONST_INT_P (elt) || UINTVAL (elt) != size) - FAIL; + rtx result = gen_reg_rtx (mode); + int shift_amount = GET_MODE_UNIT_BITSIZE (mode) / 2; + rtx shift_vector = aarch64_simd_gen_const_vector_dup (mode, + shift_amount); + emit_insn (gen_add3 (result, operands[1], operands[2])); + emit_insn (gen_vlshr3 (operands[0], result, shift_vector)); + DONE; +}) - rtx addend = gen_reg_rtx (mode); +;; Optimize ((a + b) >> n) where n is half the bitsize of the vector +(define_insn_and_split "*bitmask_shift_plus" + [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w") + (unspec:SVE_FULL_HSDI [ + (match_operand: 1 "register_operand" "Upl") + (lshiftrt:SVE_FULL_HSDI + (plus:SVE_FULL_HSDI + (match_operand:SVE_FULL_HSDI 2 "register_operand" "w") + (match_operand:SVE_FULL_HSDI 3 "register_operand" "w")) + (match_operand:SVE_FULL_HSDI 4 "aarch64_simd_shift_imm_vec_exact_top" "Dr")) + ] UNSPEC_PRED_X))] + "TARGET_SVE2" + "#" + "&& !reload_completed" + [(const_int 0)] +{ rtx tmp1 = gen_reg_rtx (mode); - rtx tmp2 = gen_reg_rtx (mode); - rtx val = aarch64_simd_gen_const_vector_dup (mode, 1); - emit_move_insn (addend, lowpart_subreg (mode, val, mode)); - emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, mode, tmp1, operands[1], - addend)); - emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, mode, tmp2, operands[1], - lowpart_subreg (mode, tmp1, - mode))); + emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, mode, tmp1, operands[2], operands[3])); emit_move_insn (operands[0], - lowpart_subreg (mode, tmp2, mode)); + lowpart_subreg (mode, tmp1, mode)); DONE; }) diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index e6f47cbbb0d04a6f33b9a741ebb614cabd0204b9..8a04feb29e6bfb423a09dde2cd64853e69d0e1ba 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -24363,46 +24363,6 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, return ret; } - -/* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST. */ - -bool -aarch64_vectorize_can_special_div_by_constant (enum tree_code code, - tree vectype, wide_int cst, - rtx *output, rtx in0, rtx in1) -{ - if (code != TRUNC_DIV_EXPR - || !TYPE_UNSIGNED (vectype)) - return false; - - machine_mode mode = TYPE_MODE (vectype); - unsigned int flags = aarch64_classify_vector_mode (mode); - if ((flags & VEC_ANY_SVE) && !TARGET_SVE2) - return false; - - int pow = wi::exact_log2 (cst + 1); - auto insn_code = maybe_code_for_aarch64_bitmask_udiv3 (TYPE_MODE (vectype)); - /* SVE actually has a div operator, we may have gotten here through - that route. */ - if (pow != (int) (element_precision (vectype) / 2) - || insn_code == CODE_FOR_nothing) - return false; - - /* We can use the optimized pattern. */ - if (in0 == NULL_RTX && in1 == NULL_RTX) - return true; - - gcc_assert (output); - - expand_operand ops[3]; - create_output_operand (&ops[0], *output, mode); - create_input_operand (&ops[1], in0, mode); - create_fixed_operand (&ops[2], in1); - expand_insn (insn_code, 3, ops); - *output = ops[0].value; - return true; -} - /* Generate a byte permute mask for a register of mode MODE, which has NUNITS units. 
    which has NUNITS units.  */
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 6cbc97cc82c06a68259bdf4dec8a0eab230081e5..ae627ae56cbd1e8b882e596dba974e74ef396e0e 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -750,6 +750,8 @@ (define_c_enum "unspec"
     UNSPEC_REVH          ; Used in aarch64-sve.md.
     UNSPEC_REVW          ; Used in aarch64-sve.md.
     UNSPEC_REVBHW        ; Used in aarch64-sve.md.
+    UNSPEC_SADD_HIGHPART ; Used in aarch64-sve.md.
+    UNSPEC_UADD_HIGHPART ; Used in aarch64-sve.md.
     UNSPEC_SMUL_HIGHPART ; Used in aarch64-sve.md.
     UNSPEC_UMUL_HIGHPART ; Used in aarch64-sve.md.
     UNSPEC_FMLA          ; Used in aarch64-sve.md.
@@ -2704,6 +2706,7 @@ (define_int_iterator UNPACK [UNSPEC_UNPACKSHI UNSPEC_UNPACKUHI
 
 (define_int_iterator UNPACK_UNSIGNED [UNSPEC_UNPACKULO UNSPEC_UNPACKUHI])
 
+(define_int_iterator ADD_HIGHPART [UNSPEC_SADD_HIGHPART UNSPEC_UADD_HIGHPART])
 (define_int_iterator MUL_HIGHPART [UNSPEC_SMUL_HIGHPART UNSPEC_UMUL_HIGHPART])
 
 (define_int_iterator CLAST [UNSPEC_CLASTA UNSPEC_CLASTB])
@@ -3342,6 +3345,8 @@ (define_int_attr su [(UNSPEC_SADDV "s")
                     (UNSPEC_UNPACKUHI "u")
                     (UNSPEC_UNPACKSLO "s")
                     (UNSPEC_UNPACKULO "u")
+                    (UNSPEC_SADD_HIGHPART "s")
+                    (UNSPEC_UADD_HIGHPART "u")
                     (UNSPEC_SMUL_HIGHPART "s")
                     (UNSPEC_UMUL_HIGHPART "u")
                     (UNSPEC_COND_FCVTZS "s")
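
For reference, the scalar identity behind these patterns (described in the removed comments above) is that dividing a 16-bit value x by 255 can be computed as (x + ((x + 257) >> 8)) >> 8 when evaluated in wider precision.  The vector lowering maps the inner add-and-shift to ADDHN, the outer add to UADDW and the final shift to USHR; with the new representation the same step becomes a plain ADD followed by a right shift by half the element size.  The following standalone C program is an illustrative sketch only, not part of the patch, and simply checks the identity over every 16-bit value:

/* Check: x / 255 == (x + ((x + 257) >> 8)) >> 8 for all 16-bit x,
   evaluated in 32-bit arithmetic so nothing overflows.  */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  for (uint32_t x = 0; x <= UINT16_MAX; x++)
    {
      /* ADDHN step: add 0x0101 and keep the high 8 bits of the 16-bit lane,
	 i.e. an add done in wider precision followed by a narrowing shift.  */
      uint32_t high = (x + 257) >> 8;
      /* UADDW + USHR steps: widen-add the narrowed result back onto x,
	 then shift right by the narrow element size.  */
      uint32_t q = (x + high) >> 8;
      assert (q == x / 255);
    }
  printf ("x / 255 == (x + ((x + 257) >> 8)) >> 8 holds for all 16-bit x\n");
  return 0;
}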