diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md index f138f4be4bcf74c1a4a6d5847ed831435246737f..4d097f7c405cc68a1d6cda5c234a1023a6eba0d1 100644 --- a/gcc/config/aarch64/aarch64-sve2.md +++ b/gcc/config/aarch64/aarch64-sve2.md @@ -71,6 +71,7 @@ ;; ---- [INT] Reciprocal approximation ;; ---- [INT<-FP] Base-2 logarithm ;; ---- [INT] Polynomial multiplication +;; ---- [INT] Misc optab implementations ;; ;; == Permutation ;; ---- [INT,FP] General permutes @@ -2312,6 +2313,47 @@ (define_insn "@aarch64_sve_" "\t%0., %1., %2." ) +;; ------------------------------------------------------------------------- +;; ---- [INT] Misc optab implementations +;; ------------------------------------------------------------------------- +;; Includes: +;; - aarch64_bitmask_udiv +;; ------------------------------------------------------------------------- + +;; Division optimizations using narrowing operations. +;; We can divide e.g. shorts by 255 faster by calculating the result as +;; (x + ((x + 257) >> 8)) >> 8, assuming the operation is done in +;; double the precision of x. +;; +;; See aarch64-simd.md for a fuller explanation. 
+(define_expand "@aarch64_bitmask_udiv3" + [(match_operand:SVE_FULL_HSDI 0 "register_operand") + (match_operand:SVE_FULL_HSDI 1 "register_operand") + (match_operand:SVE_FULL_HSDI 2 "immediate_operand")] + "TARGET_SVE2" +{ /* Expand only when operands[2] is a constant vector with a single encoded element equal to the all-ones mask "size" computed below; otherwise FAIL.  NOTE(review): the iterator substitutions normally written around "mode" (and in the pattern name) appear to have been dropped from this copy of the patch -- confirm against the original submission.  */ + unsigned HOST_WIDE_INT size + = (1ULL << GET_MODE_UNIT_BITSIZE (mode)) - 1; + if (!CONST_VECTOR_P (operands[2]) + || const_vector_encoded_nelts (operands[2]) != 1 + || size != UINTVAL (CONST_VECTOR_ELT (operands[2], 0))) + FAIL; + + rtx addend = gen_reg_rtx (mode); + rtx tmp1 = gen_reg_rtx (mode); + rtx tmp2 = gen_reg_rtx (mode); + rtx val = aarch64_simd_gen_const_vector_dup (mode, 1); /* Presumably a vector with every element set to 1 -- confirm against the helper.  */ + emit_move_insn (addend, lowpart_subreg (mode, val, mode)); /* addend = val reinterpreted through lowpart_subreg.  */ + emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, mode, tmp1, operands[1], + addend)); /* tmp1 = UNSPEC_ADDHNB (operands[1], addend).  */ + emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, mode, tmp2, operands[1], + lowpart_subreg (mode, tmp1, + mode))); /* tmp2 = UNSPEC_ADDHNB (operands[1], tmp1 reinterpreted through lowpart_subreg).  The two steps presumably correspond to the two add-and-shift stages of the formula in the comment preceding this pattern -- confirm.  */ + emit_move_insn (operands[0], + lowpart_subreg (mode, tmp2, mode)); /* The result is tmp2 reinterpreted through lowpart_subreg.  */ + DONE; +}) + ;; ========================================================================= ;; == Permutation ;; ========================================================================= diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c new file mode 100644 index 0000000000000000000000000000000000000000..e6f5098c30f4e2eb8ed1af153c0bb0d204cda6d9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c @@ -0,0 +1,53 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O2 -std=c99" } */ +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */ + +#include /* NOTE(review): the header name is missing in this copy of the patch; the uintN_t types used by the test functions suggest stdint.h -- confirm.  */ + +/* +** draw_bitmap1: +** ... +** mul z[0-9]+.h, p[0-9]+/m, z[0-9]+.h, z[0-9]+.h +** addhnb z[0-9]+.b, z[0-9]+.h, z[0-9]+.h +** addhnb z[0-9]+.b, z[0-9]+.h, z[0-9]+.h +** ... 
+*/ +void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n) +{ /* Multiply each pixel by level and divide by 0xff, in place.  (n & -16) clears the low four bits of n, so the trip count is a multiple of 16.  The expected-body comment preceding this function lists a mul followed by two addhnb instructions.  */ + for (int i = 0; i < (n & -16); i+=1) + pixel[i] = (pixel[i] * level) / 0xff; +} + +void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n) +{ /* Same loop as draw_bitmap1 but dividing by 0xfe.  No expected-body pattern is given for this function.  */ + for (int i = 0; i < (n & -16); i+=1) + pixel[i] = (pixel[i] * level) / 0xfe; +} + +/* +** draw_bitmap3: +** ... +** mul z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s +** addhnb z[0-9]+.h, z[0-9]+.s, z[0-9]+.s +** addhnb z[0-9]+.h, z[0-9]+.s, z[0-9]+.s +** ... +*/ +void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n) +{ /* uint16_t variant of the loop: multiply by level, then divide by 0xffffU.  */ + for (int i = 0; i < (n & -16); i+=1) + pixel[i] = (pixel[i] * level) / 0xffffU; +} + +/* +** draw_bitmap4: +** ... +** mul z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d +** addhnb z[0-9]+.s, z[0-9]+.d, z[0-9]+.d +** addhnb z[0-9]+.s, z[0-9]+.d, z[0-9]+.d +** ... +*/ +void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n) +{ /* uint32_t variant: level is cast to uint64_t before the multiply, and the product is divided by 0xffffffffUL.  */ + for (int i = 0; i < (n & -16); i+=1) + pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; +}