public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r13-4027] AArch64: Add SVE2 implementation for pow2 bitmask division
@ 2022-11-14 17:46 Tamar Christina
  0 siblings, 0 replies; only message in thread
From: Tamar Christina @ 2022-11-14 17:46 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:a89ac9011e04cf8ebdf856b679bd91000ef70175

commit r13-4027-ga89ac9011e04cf8ebdf856b679bd91000ef70175
Author: Tamar Christina <tamar.christina@arm.com>
Date:   Mon Nov 14 15:53:42 2022 +0000

    AArch64: Add SVE2 implementation for pow2 bitmask division
    
    In plenty of image and video processing code it's common to modify pixel values
    by a widening operation and then scale them back into range by dividing by 255.
    
    This patch adds an named function to allow us to emit an optimized sequence
    when doing an unsigned division that is equivalent to:
    
       x = y / (2 ^ (bitsize (y)/2)-1)
    
    For SVE2 this means we generate for:
    
    void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
    {
      for (int i = 0; i < (n & -16); i+=1)
        pixel[i] = (pixel[i] * level) / 0xff;
    }
    
    the following:
    
            mov     z3.b, #1
    .L3:
            ld1b    z0.h, p0/z, [x0, x3]
            mul     z0.h, p1/m, z0.h, z2.h
            addhnb  z1.b, z0.h, z3.h
            addhnb  z0.b, z0.h, z1.h
            st1b    z0.h, p0, [x0, x3]
            inch    x3
            whilelo p0.h, w3, w2
            b.any   .L3
    
    instead of:
    
    .L3:
            ld1b    z0.h, p1/z, [x0, x3]
            mul     z0.h, p0/m, z0.h, z1.h
            umulh   z0.h, p0/m, z0.h, z2.h
            lsr     z0.h, z0.h, #7
            st1b    z0.h, p1, [x0, x3]
            inch    x3
            whilelo p1.h, w3, w2
            b.any   .L3
    
    Which results in significantly faster code.
    
    gcc/ChangeLog:
    
            * config/aarch64/aarch64-sve2.md (@aarch64_bitmask_udiv<mode>3): New.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/aarch64/sve2/div-by-bitmask_1.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-sve2.md                 | 41 +++++++++++++++++
 .../gcc.target/aarch64/sve2/div-by-bitmask_1.c     | 53 ++++++++++++++++++++++
 2 files changed, 94 insertions(+)

diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
index 5df38e3f951..dadd046b832 100644
--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -71,6 +71,7 @@
 ;; ---- [INT] Reciprocal approximation
 ;; ---- [INT<-FP] Base-2 logarithm
 ;; ---- [INT] Polynomial multiplication
+;; ---- [INT] Misc optab implementations
 ;;
 ;; == Permutation
 ;; ---- [INT,FP] General permutes
@@ -2312,6 +2313,46 @@
   "<sve_int_op>\t%0.<Vewtype>, %1.<Vetype>, %2.<Vetype>"
 )
 
+;; -------------------------------------------------------------------------
+;; ---- [INT] Misc optab implementations
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - aarch64_bitmask_udiv
+;; -------------------------------------------------------------------------
+
+;; div optimizations using narrowings
+;; we can do the division e.g. shorts by 255 faster by calculating it as
+;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
+;; double the precision of x.
+;;
+;; See aarch64-simd.md for bigger explanation.
+(define_expand "@aarch64_bitmask_udiv<mode>3"
+  [(match_operand:SVE_FULL_HSDI 0 "register_operand")
+   (match_operand:SVE_FULL_HSDI 1 "register_operand")
+   (match_operand:SVE_FULL_HSDI 2 "immediate_operand")]
+  "TARGET_SVE2"
+{
+  unsigned HOST_WIDE_INT size
+    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROW>mode)) - 1;
+  rtx elt = unwrap_const_vec_duplicate (operands[2]);
+  if (!CONST_INT_P (elt) || UINTVAL (elt) != size)
+    FAIL;
+
+  rtx addend = gen_reg_rtx (<MODE>mode);
+  rtx tmp1 = gen_reg_rtx (<VNARROW>mode);
+  rtx tmp2 = gen_reg_rtx (<VNARROW>mode);
+  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROW>mode, 1);
+  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROW>mode));
+  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp1, operands[1],
+			      addend));
+  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp2, operands[1],
+			      lowpart_subreg (<MODE>mode, tmp1,
+					      <VNARROW>mode)));
+  emit_move_insn (operands[0],
+		  lowpart_subreg (<MODE>mode, tmp2, <VNARROW>mode));
+  DONE;
+})
+
 ;; =========================================================================
 ;; == Permutation
 ;; =========================================================================
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c
new file mode 100644
index 00000000000..e6f5098c30f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <stdint.h>
+
+/*
+** draw_bitmap1:
+** ...
+**	mul	z[0-9]+.h, p[0-9]+/m, z[0-9]+.h, z[0-9]+.h
+**	addhnb	z[0-9]+.b, z[0-9]+.h, z[0-9]+.h
+**	addhnb	z[0-9]+.b, z[0-9]+.h, z[0-9]+.h
+** ...
+*/
+void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xfe;
+}
+
+/*
+** draw_bitmap3:
+** ...
+**	mul	z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s
+**	addhnb	z[0-9]+.h, z[0-9]+.s, z[0-9]+.s
+**	addhnb	z[0-9]+.h, z[0-9]+.s, z[0-9]+.s
+** ...
+*/
+void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+/*
+** draw_bitmap4:
+** ...
+**	mul	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
+**	addhnb	z[0-9]+.s, z[0-9]+.d, z[0-9]+.d
+**	addhnb	z[0-9]+.s, z[0-9]+.d, z[0-9]+.d
+** ...
+*/
+void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2022-11-14 17:46 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-11-14 17:46 [gcc r13-4027] AArch64: Add SVE2 implementation for pow2 bitmask division Tamar Christina

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).