* [gcc r13-4026] AArch64: Add implementation for pow2 bitmask division.
@ 2022-11-14 17:46 Tamar Christina
  0 siblings, 0 replies; only message in thread
From: Tamar Christina @ 2022-11-14 17:46 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:c98aabc1427a4d2a25a2176c89dc709148a04707

commit r13-4026-gc98aabc1427a4d2a25a2176c89dc709148a04707
Author: Tamar Christina <tamar.christina@arm.com>
Date:   Mon Nov 14 15:52:26 2022 +0000

    AArch64: Add implementation for pow2 bitmask division.
    
    This adds the AArch64 implementation of the new optab for unsigned pow2
    bitmask division.
    
    The implementation rewrites:
    
       x = y / (2 ^ (precision (y) / 2) - 1)
    
    into e.g. (for bytes)
    
       (x + ((x + 257) >> 8)) >> 8
    
    where it's required that the additions be done in double the precision of x
    so that we don't lose any bits to overflow.
    
    Essentially the sequence decomposes the division into two smaller divisions,
    one for the top part and one for the bottom part of the number, and adds the
    results back together.
    
    To account for the fact that a shift by 8 would be a division by 256, we add
    1 to both parts of x so that when x is 255 we still get 1 as the answer.
    
    Because the amount we shift by is half the width of the original datatype, we
    can use the halving instructions the ISA provides to do the operation instead
    of using actual shifts.
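    
    As a quick sanity check (not part of the patch), the identity can be
    verified exhaustively for every 16-bit input in plain C; the snippet below
    is purely illustrative:
    
    #include <assert.h>
    #include <stdint.h>
    
    int main (void)
    {
      /* The shift/add sequence above matches a true division by 0xff for
         every 16-bit value p.  All arithmetic here is done in 32 bits, so
         the intermediate additions cannot overflow.  */
      for (uint32_t p = 0; p <= 0xffff; p++)
        assert (((p + ((p + 257) >> 8)) >> 8) == p / 0xff);
      return 0;
    }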
    
    For AArch64 this means we generate for:
    
    void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
    {
      for (int i = 0; i < (n & -16); i+=1)
        pixel[i] = (pixel[i] * level) / 0xff;
    }
    
    the following:
    
            movi    v3.16b, 0x1
            umull2  v1.8h, v0.16b, v2.16b
            umull   v0.8h, v0.8b, v2.8b
            addhn   v5.8b, v1.8h, v3.8h
            addhn   v4.8b, v0.8h, v3.8h
            uaddw   v1.8h, v1.8h, v5.8b
            uaddw   v0.8h, v0.8h, v4.8b
            uzp2    v0.16b, v0.16b, v1.16b
    
    instead of:
    
            umull   v2.8h, v1.8b, v5.8b
            umull2  v1.8h, v1.16b, v5.16b
            umull   v0.4s, v2.4h, v3.4h
            umull2  v2.4s, v2.8h, v3.8h
            umull   v4.4s, v1.4h, v3.4h
            umull2  v1.4s, v1.8h, v3.8h
            uzp2    v0.8h, v0.8h, v2.8h
            uzp2    v1.8h, v4.8h, v1.8h
            shrn    v0.8b, v0.8h, 7
            shrn2   v0.16b, v1.8h, 7
    
    Which results in significantly faster code.
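    
    For illustration only (not part of the patch), the faster sequence above
    corresponds roughly to the following hand-written NEON intrinsics for the
    uint8_t case (the helper name mul_div255_u8 is made up here):
    
    #include <arm_neon.h>
    
    static inline uint8x16_t mul_div255_u8 (uint8x16_t x, uint8x16_t level)
    {
      /* Widening multiplies: 16-bit products of the low and high halves.  */
      uint16x8_t prod_lo = vmull_u8 (vget_low_u8 (x), vget_low_u8 (level));
      uint16x8_t prod_hi = vmull_high_u8 (x, level);
      /* movi v.16b, 0x1 viewed as 16-bit lanes is 0x0101, i.e. 257.  */
      uint16x8_t c257 = vreinterpretq_u16_u8 (vdupq_n_u8 (1));
      /* addhn: (prod + 257) >> 8, narrowed back to bytes.  */
      uint8x8_t t_lo = vaddhn_u16 (prod_lo, c257);
      uint8x8_t t_hi = vaddhn_u16 (prod_hi, c257);
      /* uaddw: prod + ((prod + 257) >> 8), still 16 bits per lane.  */
      uint16x8_t sum_lo = vaddw_u8 (prod_lo, t_lo);
      uint16x8_t sum_hi = vaddw_u8 (prod_hi, t_hi);
      /* Final >> 8: uzp2 keeps the high byte of each 16-bit lane
         (little-endian).  */
      return vuzp2q_u8 (vreinterpretq_u8_u16 (sum_lo),
                        vreinterpretq_u8_u16 (sum_hi));
    }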
    
    Thanks to Wilco for the concept.
    
    gcc/ChangeLog:
    
            * config/aarch64/aarch64-simd.md (@aarch64_bitmask_udiv<mode>3): New.
            * config/aarch64/aarch64.cc (aarch64_vectorize_can_special_div_by_constant): New.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/aarch64/div-by-bitmask.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-simd.md                | 57 +++++++++++++++++++++
 gcc/config/aarch64/aarch64.cc                     | 38 ++++++++++++++
 gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c | 61 +++++++++++++++++++++++
 3 files changed, 156 insertions(+)

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 5386043739a..104088f67d2 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4867,6 +4867,63 @@
   }
 )
 
+;; div optimizations using narrowings
+;; we can do the division of e.g. shorts by 255 faster by calculating it as
+;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
+;; double the precision of x.
+;;
+;; If we imagine a short as being composed of two blocks of bytes then
+;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalent to
+;; adding 1 to each sub component:
+;;
+;;      short value of 16-bits
+;; ┌──────────────┬────────────────┐
+;; │              │                │
+;; └──────────────┴────────────────┘
+;;   8-bit part1 ▲  8-bit part2   ▲
+;;               │                │
+;;               │                │
+;;              +1               +1
+;;
+;; After the first addition, we have to shift right by 8 and narrow the
+;; result back to a byte.  Remember that the addition must be done in
+;; double the precision of the input.  Since 8 is half the size of a short
+;; we can use the narrowing halving instruction in AArch64, addhn, which also
+;; does the addition in a wider precision and narrows back to a byte.  The
+;; shift itself is implicit in the operation as it writes back only the top
+;; half of the result, i.e. bits 2*esize-1:esize.
+;;
+;; Since we have narrowed the result of the first part back to a byte, for
+;; the second addition we can use a widening addition, uaddw.
+;;
+;; For the final shift, since it's unsigned arithmetic we emit an ushr by 8.
+;;
+;; The shift is later optimized by combine to a uzp2 with movi #0.
+(define_expand "@aarch64_bitmask_udiv<mode>3"
+  [(match_operand:VQN 0 "register_operand")
+   (match_operand:VQN 1 "register_operand")
+   (match_operand:VQN 2 "immediate_operand")]
+  "TARGET_SIMD"
+{
+  unsigned HOST_WIDE_INT size
+    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode)) - 1;
+  rtx elt = unwrap_const_vec_duplicate (operands[2]);
+  if (!CONST_INT_P (elt) || UINTVAL (elt) != size)
+    FAIL;
+
+  rtx addend = gen_reg_rtx (<MODE>mode);
+  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode, 1);
+  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROWQ2>mode));
+  rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode);
+  rtx tmp2 = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend));
+  unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode);
+  rtx shift_vector = aarch64_simd_gen_const_vector_dup (<MODE>mode, bitsize);
+  emit_insn (gen_aarch64_uaddw<Vnarrowq> (tmp2, operands[1], tmp1));
+  emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2, shift_vector));
+  DONE;
+})
+
 ;; pmul.
 
 (define_insn "aarch64_pmul<mode>"
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index a7f7c3c0121..c91df6f5006 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -24306,6 +24306,40 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
   return ret;
 }
 
+/* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST.  */
+
+bool
+aarch64_vectorize_can_special_div_by_constant (enum tree_code code,
+					       tree vectype, wide_int cst,
+					       rtx *output, rtx in0, rtx in1)
+{
+  if (code != TRUNC_DIV_EXPR
+      || !TYPE_UNSIGNED (vectype))
+    return false;
+
+  unsigned int flags = aarch64_classify_vector_mode (TYPE_MODE (vectype));
+  if ((flags & VEC_ANY_SVE) && !TARGET_SVE2)
+    return false;
+
+  if (in0 == NULL_RTX && in1 == NULL_RTX)
+    {
+      wide_int val = wi::add (cst, 1);
+      int pow = wi::exact_log2 (val);
+      return pow == (int)(element_precision (vectype) / 2);
+    }
+
+  if (!VECTOR_TYPE_P (vectype))
+   return false;
+
+  gcc_assert (output);
+
+  if (!*output)
+    *output = gen_reg_rtx (TYPE_MODE (vectype));
+
+  emit_insn (gen_aarch64_bitmask_udiv3 (TYPE_MODE (vectype), *output, in0, in1));
+  return true;
+}
+
 /* Generate a byte permute mask for a register of mode MODE,
    which has NUNITS units.  */
 
@@ -27796,6 +27830,10 @@ aarch64_libgcc_floating_mode_supported_p
 #undef TARGET_VECTOR_ALIGNMENT
 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
 
+#undef TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
+#define TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST \
+  aarch64_vectorize_can_special_div_by_constant
+
 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
   aarch64_vectorize_preferred_vector_alignment
diff --git a/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
new file mode 100644
index 00000000000..2a535791ba7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
@@ -0,0 +1,61 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <stdint.h>
+
+#pragma GCC target "+nosve"
+
+/*
+** draw_bitmap1:
+** ...
+** 	addhn	v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
+** 	addhn	v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
+** 	uaddw	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
+** 	uaddw	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
+** 	uzp2	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+** ...
+*/
+void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xfe;
+}
+
+/*
+** draw_bitmap3:
+** ...
+** 	addhn	v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
+** 	addhn	v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
+** 	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+** 	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+** 	uzp2	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+** ...
+*/
+void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+/*
+** draw_bitmap4:
+** ...
+** 	addhn	v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
+** 	addhn	v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
+** 	uaddw	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
+** 	uaddw	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
+** 	uzp2	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** ...
+*/
+void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
