Hi All, In plenty of image and video processing code it's common to modify pixel values by a widening operation and then scale them back into range by dividing by 255. This patch adds an named function to allow us to emit an optimized sequence when doing an unsigned division that is equivalent to: x = y / (2 ^ (bitsize (y)/2)-1) For SVE2 this means we generate for: void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n) { for (int i = 0; i < (n & -16); i+=1) pixel[i] = (pixel[i] * level) / 0xff; } the following: mov z3.b, #1 .L3: ld1b z0.h, p0/z, [x0, x3] mul z0.h, p1/m, z0.h, z2.h addhnb z1.b, z0.h, z3.h addhnb z0.b, z0.h, z1.h st1b z0.h, p0, [x0, x3] inch x3 whilelo p0.h, w3, w2 b.any .L3 instead of: .L3: ld1b z0.h, p1/z, [x0, x3] mul z0.h, p0/m, z0.h, z1.h umulh z0.h, p0/m, z0.h, z2.h lsr z0.h, z0.h, #7 st1b z0.h, p1, [x0, x3] inch x3 whilelo p1.h, w3, w2 b.any .L3 Which results in significantly faster code. Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: * config/aarch64/aarch64-sve2.md (@aarch64_bitmask_udiv3): New. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve2/div-by-bitmask_1.c: New test. --- inline copy of patch -- diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md index f138f4be4bcf74c1a4a6d5847ed831435246737f..4d097f7c405cc68a1d6cda5c234a1023a6eba0d1 100644 --- a/gcc/config/aarch64/aarch64-sve2.md +++ b/gcc/config/aarch64/aarch64-sve2.md @@ -71,6 +71,7 @@ ;; ---- [INT] Reciprocal approximation ;; ---- [INT<-FP] Base-2 logarithm ;; ---- [INT] Polynomial multiplication +;; ---- [INT] Misc optab implementations ;; ;; == Permutation ;; ---- [INT,FP] General permutes @@ -2312,6 +2313,47 @@ (define_insn "@aarch64_sve_" "\t%0., %1., %2." ) +;; ------------------------------------------------------------------------- +;; ---- [INT] Misc optab implementations +;; ------------------------------------------------------------------------- +;; Includes: +;; - aarch64_bitmask_udiv +;; ------------------------------------------------------------------------- + +;; div optimizations using narrowings +;; we can do the division e.g. shorts by 255 faster by calculating it as +;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in +;; double the precision of x. +;; +;; See aarch64-simd.md for bigger explanation. +(define_expand "@aarch64_bitmask_udiv3" + [(match_operand:SVE_FULL_HSDI 0 "register_operand") + (match_operand:SVE_FULL_HSDI 1 "register_operand") + (match_operand:SVE_FULL_HSDI 2 "immediate_operand")] + "TARGET_SVE2" +{ + unsigned HOST_WIDE_INT size + = (1ULL << GET_MODE_UNIT_BITSIZE (mode)) - 1; + if (!CONST_VECTOR_P (operands[2]) + || const_vector_encoded_nelts (operands[2]) != 1 + || size != UINTVAL (CONST_VECTOR_ELT (operands[2], 0))) + FAIL; + + rtx addend = gen_reg_rtx (mode); + rtx tmp1 = gen_reg_rtx (mode); + rtx tmp2 = gen_reg_rtx (mode); + rtx val = aarch64_simd_gen_const_vector_dup (mode, 1); + emit_move_insn (addend, lowpart_subreg (mode, val, mode)); + emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, mode, tmp1, operands[1], + addend)); + emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, mode, tmp2, operands[1], + lowpart_subreg (mode, tmp1, + mode))); + emit_move_insn (operands[0], + lowpart_subreg (mode, tmp2, mode)); + DONE; +}) + ;; ========================================================================= ;; == Permutation ;; ========================================================================= diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c new file mode 100644 index 0000000000000000000000000000000000000000..e6f5098c30f4e2eb8ed1af153c0bb0d204cda6d9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c @@ -0,0 +1,53 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O2 -std=c99" } */ +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */ + +#include + +/* +** draw_bitmap1: +** ... +** mul z[0-9]+.h, p[0-9]+/m, z[0-9]+.h, z[0-9]+.h +** addhnb z[0-9]+.b, z[0-9]+.h, z[0-9]+.h +** addhnb z[0-9]+.b, z[0-9]+.h, z[0-9]+.h +** ... +*/ +void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n) +{ + for (int i = 0; i < (n & -16); i+=1) + pixel[i] = (pixel[i] * level) / 0xff; +} + +void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n) +{ + for (int i = 0; i < (n & -16); i+=1) + pixel[i] = (pixel[i] * level) / 0xfe; +} + +/* +** draw_bitmap3: +** ... +** mul z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s +** addhnb z[0-9]+.h, z[0-9]+.s, z[0-9]+.s +** addhnb z[0-9]+.h, z[0-9]+.s, z[0-9]+.s +** ... +*/ +void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n) +{ + for (int i = 0; i < (n & -16); i+=1) + pixel[i] = (pixel[i] * level) / 0xffffU; +} + +/* +** draw_bitmap4: +** ... +** mul z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d +** addhnb z[0-9]+.s, z[0-9]+.d, z[0-9]+.d +** addhnb z[0-9]+.s, z[0-9]+.d, z[0-9]+.d +** ... +*/ +void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n) +{ + for (int i = 0; i < (n & -16); i+=1) + pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; +} --